# Source code for scholar_flux.data.data_extractor
# /data/data_extractor.py
"""The scholar_flux.data.data_extractor builds on the `BaseDataExtractor` to implement automated path extraction.
The `DataExtractor` implements dynamic record and metadata extraction when the paths are not known beforehand.
The extracted list of responses and metadata dictionaries are used in later steps prior to further response record
processing.
"""
from typing import Any, Optional, Union, overload, Mapping, Iterator
from typing_extensions import Self
from scholar_flux.exceptions import DataExtractionException
from scholar_flux.utils.helpers import filter_record_key_prefixes
from scholar_flux.utils.record_types import RecordList, RecordType, NormalizedRecordList, MetadataType
from scholar_flux.data.base_extractor import BaseDataExtractor
import hashlib
import json
import logging
logger = logging.getLogger(__name__)
# [docs]
class DataExtractor(BaseDataExtractor):
    """The DataExtractor allows for the streamlined extraction of records and metadata from responses retrieved from
    APIs. This proceeds as the second stage of the response processing step where metadata and records are extracted
    from parsed responses.

    The data extractor provides two ways to identify metadata paths and record paths:

    1) Manual identification: If record_path or metadata_path are specified,
       then the data extractor will attempt to retrieve the metadata and records at the
       provided paths. Note that, as metadata_paths can be associated with multiple keys,
       starting from the outside dictionary, we may have to specify a dictionary containing
       keys denoting metadata variables and their paths as a list of values indicating how to
       retrieve the value. The path can also be given by a list of lists describing how to
       retrieve the last element.

    2) Dynamic identification: Uses heuristics to determine records from metadata. Records
       will nearly always be defined by a list containing only dictionaries as its elements
       while the metadata will generally contain a variety of elements, some nested and others
       as integers, strings, etc. In some cases where it's harder to determine, we can use
       dynamic_record_identifiers to determine whether a list containing a single nested dictionary
       is a record or metadata. For scientific purposes, its keys may contain 'abstract', 'title', 'doi',
       etc. This can be defined manually by the users if the defaults are not reliable for a given API.

    Upon initializing the class, the class can be used as a callable that returns the records and metadata
    in that order.

    Example:
        >>> from scholar_flux.data import DataExtractor
        >>> data = dict(query='specification driven development', options={'record_count':5,'response_time':'50ms'})
        >>> data['records'] = [dict(id=1, record='protocol vs.code'), dict(id=2, record='Impact of Agile')]
        >>> extractor = DataExtractor(annotate_records=False)
        >>> records, metadata = extractor(data)
        >>> print(metadata)
        # OUTPUT: {'query': 'specification driven development', 'record_count': 5, 'response_time': '50ms'}
        >>> print(records)
        # OUTPUT: [{'id': 1, 'record': 'protocol vs.code'}, {'id': 2, 'record': 'Impact of Agile'}]

    Record Annotation:
        When `annotate_records=True`, each extracted record receives two fields for
        downstream linkage after processing/flattening:

        - `_extraction_index`: Zero-based position in the extracted record list
        - `_record_id`: Content-based hash in format "hash_index" (e.g., "a1b2c3d4_0")

        These fields enable resolution back to original records when order may change
        or records are deduplicated. The hash is generated from record content excluding
        internal fields (those starting with '_'), ensuring stability across runs for
        identical content.

    Example:
        >>> extractor = DataExtractor(annotate_records=True)
        >>> records, metadata = extractor(data)
        >>> records[0]['_extraction_index']
        # OUTPUT: 0
        >>> records[0]['_record_id']
        # OUTPUT: 'a9e3e93e_0'
        >>> records[0]
        # OUTPUT: {'id': 1, 'record': 'protocol vs.code', '_extraction_index': 0, '_record_id': 'a9e3e93e_0'}
    """

    DEFAULT_DYNAMIC_RECORD_IDENTIFIERS = ("title", "doi", "abstract")  # Dict keys that commonly represent records
    DEFAULT_DYNAMIC_METADATA_IDENTIFIERS = (
        "metadata",
        "facets",
        "IdList",
    )  # Dict keys that commonly represent metadata
    EXTRACTION_INDEX_KEY = "_extraction_index"  # Optional private metadata field: Zero-based extraction position
    RECORD_ID_KEY = "_record_id"  # Optional private metadata field: ID calculated from the hash of the current record.
def __init__(
    self,
    record_path: Optional[list] = None,
    metadata_path: Optional[list[list] | dict[str, list]] = None,
    dynamic_record_identifiers: Optional[list | tuple] = None,
    dynamic_metadata_identifiers: Optional[list | tuple] = None,
    annotate_records: Optional[bool] = None,
):
    """Initialize the DataExtractor with optional path overrides for metadata and records.

    Args:
        record_path (Optional[list]):
            Custom path to find records in the parsed data: a list of strings (and rarely integer
            indexes) indicating how to recursively locate the list of records.
        metadata_path (Optional[list[list] | dict[str, list]]):
            Paths in a dictionary associated with metadata rather than records. This can be a list
            of paths where each element is a list describing how to reach a terminal element.
        dynamic_record_identifiers (Optional[list | tuple]):
            Dictionary keys that only belong to records, used to classify a single element that
            would otherwise be treated as metadata.
        dynamic_metadata_identifiers (Optional[list | tuple]):
            Dictionary keys that are likely to belong only to metadata that could otherwise share
            a structure similar to a list of record dictionaries.
        annotate_records (Optional[bool]):
            When True, adds `_extraction_index` (position) and `_record_id` (content hash + index)
            linkage fields to each extracted record for resolution back to the original data after
            processing or flattening. Default is None (no annotation).
    """
    # Fall back to the class-level defaults only when a collection was not supplied.
    if dynamic_record_identifiers is None:
        dynamic_record_identifiers = self.DEFAULT_DYNAMIC_RECORD_IDENTIFIERS
    if dynamic_metadata_identifiers is None:
        dynamic_metadata_identifiers = self.DEFAULT_DYNAMIC_METADATA_IDENTIFIERS
    self.dynamic_record_identifiers = dynamic_record_identifiers
    self.dynamic_metadata_identifiers = dynamic_metadata_identifiers
    self.annotate_records = annotate_records
    # Attributes are assigned before the base initializer runs because the overridden
    # _validate_inputs (invoked during super().__init__) also validates the identifier fields.
    super().__init__(record_path, metadata_path)
@classmethod
def _validate_dynamic_identifiers(
cls,
dynamic_record_identifiers: Optional[list | tuple] = None,
dynamic_metadata_identifiers: Optional[list | tuple] = None,
) -> None:
"""Method used to validate the dynamic record identifiers provided to the DataExtractor prior to its later use
in extracting metadata and records.
Args:
dynamic_record_identifiers (Optional[List[str | None]]): Keyword identifier indicating when singular records in a dictionary
can be identified as such in contrast to metadata
dynamic_metadata_identifiers (Optional[List[str | None]]): Keyword identifier indicating when record metadata keys in a dictionary
can be identified as such in contrast to metadata
Raises:
DataExtractionException: Indicates an error in the DataExtractor and identifies where the inputs take on an invalid value
"""
try:
if dynamic_record_identifiers is not None:
if not isinstance(dynamic_record_identifiers, (list, tuple)):
raise KeyError(
f"The dynamic record identifiers provided must be a tuple or list. Received: {type(dynamic_record_identifiers)}"
)
if not all(isinstance(path, (str)) for path in dynamic_record_identifiers):
raise KeyError(
f"At least one value in the provided dynamic record identifier is not an integer or string: {dynamic_record_identifiers}"
)
if dynamic_metadata_identifiers is not None:
if not isinstance(dynamic_metadata_identifiers, (list, tuple)):
raise KeyError(
f"The dynamic metadata identifiers provided must be a tuple or list. Received: {type(dynamic_metadata_identifiers)}"
)
if not all(isinstance(path, (str)) for path in dynamic_metadata_identifiers):
raise KeyError(
f"At least one value in the provided dynamic metadata identifier is not an integer or string: {dynamic_metadata_identifiers}"
)
except (KeyError, TypeError) as e:
raise DataExtractionException(
f"Error initializing the DataExtractor: At least one of the inputs are invalid. {e}"
) from e
return None
def _validate_inputs(self) -> None:
    """Validate every attribute of the current extractor once the attributes are set.

    Overrides the base-class hook so that, when super().__init__(...) runs, the dynamic identifier
    collections are checked alongside the record and metadata paths.

    Validated Attributes:
        record_path: The path where a list of records is located.
        metadata_path: The list or dictionary of paths where metadata elements are located.
        dynamic_record_identifiers: Keys used to recognize singular records in a dictionary.
        dynamic_metadata_identifiers: Keys used to recognize metadata in a dictionary.

    Raises:
        DataExtractionException: Indicates an error in the DataExtractor and identifies where the
            inputs take on an invalid value.
    """
    self._validate_paths(self.record_path, self.metadata_path)
    self._validate_dynamic_identifiers(
        self.dynamic_record_identifiers,
        self.dynamic_metadata_identifiers,
    )
[docs]
def dynamic_identification(self, parsed_page_dict: dict) -> tuple[RecordList, MetadataType]:
"""Dynamically identify and separate metadata from records. This function recursively traverses the dictionary
and uses a heuristic to determine whether a specific record corresponds to metadata or is a list of records:
Generally, keys associated with values corresponding to metadata will contain only lists of dictionaries On the
other hand, nested structures containing metadata will be associated with a singular value other a dictionary of
keys associated with a singular value that is not a list. Using this heuristic, we're able to determine metadata
from records with a high degree of confidence.
Args:
parsed_page_dict (Dict): The dictionary containing the page data and metadata to be extracted.
Returns:
tuple[RecordList, MetadataType]: A tuple containing the list of record dictionaries and the metadata dictionary.
"""
metadata: MetadataType = {}
records: list = []
for key, value in parsed_page_dict.items():
if key in self.dynamic_metadata_identifiers:
metadata[key] = value
elif isinstance(value, dict):
sub_records, sub_metadata = self.dynamic_identification(value)
metadata.update(sub_metadata)
records.extend(sub_records)
elif isinstance(value, list):
if all(isinstance(item, dict) for item in value):
if len(value) == 0:
logger.debug(f"Element at key: {key} is empty")
elif len(value) > 1: # assuming it's records if it's a list of dicts
records.extend(value)
else:
record = value[0]
if self._identify_by_key(record, self.dynamic_record_identifiers):
records.extend(value)
continue
sub_records, sub_metadata = self.dynamic_identification(value[0])
metadata.update(sub_metadata)
records.extend(sub_records)
else:
metadata[key] = value
return records, metadata
@staticmethod
def _identify_by_key(record: Any, key_identifiers: list | tuple) -> bool:
"""Helper method for determining if a key exists in a dictionary.
If a record is not a dictionary, or a key is not contained in a record dictionary,
then this method will return False by default.
Args:
record (Any):
An element in a JSON object. If a dictionary, is checked to determine
whether any of the selected key identifiers exist within it.
key_identifiers (list | tuple):
Contains keys to check for. If any key exists in the record, returns True.
"""
return all(
[
isinstance(record, dict),
any(True for id_key in (key_identifiers or []) if any(id_key in record_key for record_key in record)),
]
)
def extract(self, parsed_page: Union[list[dict], dict]) -> tuple[Optional[RecordList], Optional[MetadataType]]:
    """Extract both records and metadata from the parsed page.

    Args:
        parsed_page (list[dict] | dict): The structure containing the page data and metadata to extract.

    Returns:
        tuple[Optional[RecordList], Optional[MetadataType]]: The list of records and the metadata dictionary.
    """
    page_dict = self._prepare_page(parsed_page)
    # Explicitly configured paths take precedence over heuristic identification.
    if self.metadata_path or self.record_path:
        records = self.extract_records(page_dict)
        metadata = self.extract_metadata(page_dict)
    else:
        records, metadata = self.dynamic_identification(page_dict)
    if records and self.annotate_records:
        records = self._annotate_records(records)
    return records, metadata
@classmethod
def _annotate_records(
    cls,
    records: RecordList,
    start: int = 0,
) -> RecordList:
    """Annotate records in place with an extraction index and a content-based ID.

    Adds `_extraction_index` and `_record_id` to each dictionary record for:
    - Resolution back to the original non-flattened data after processing
    - Idempotent identification across runs (content-based hash)

    Args:
        records (RecordList): A list of extracted records to annotate.
        start (int):
            Starting index used when assigning `_extraction_index` keys; useful for continuing
            enumeration across multiple calls. Non-dict entries still consume an index.

    Returns:
        RecordList: The same list, with annotation fields added to its dict elements.
    """
    index = start
    for record in records:
        if isinstance(record, dict):
            record[cls.EXTRACTION_INDEX_KEY] = index
            record[cls.RECORD_ID_KEY] = cls._generate_record_id(record, index)
        index += 1
    return records
@classmethod
def _generate_record_id(cls, record: RecordType, index: int) -> str:
    """Generate a stable, content-based record ID for idempotency.

    Hashes the record content (with internal '_'-prefixed annotations stripped) after serializing
    it with sorted keys, so identical content yields the same ID across runs.

    Args:
        record (RecordType): Record to generate an ID for.
        index (int): Extraction index, appended as the ID suffix.

    Returns:
        str: "hash_index" — a 16-char content hash plus the extraction index. Falls back to
        "idx_index" when the content cannot be serialized.
    """
    content = cls.strip_annotations(record)
    content_hash = "idx"  # fallback component when serialization fails
    try:
        serialized = json.dumps(content, sort_keys=True, default=str)
    except (TypeError, ValueError):
        pass
    else:
        content_hash = hashlib.md5(serialized.encode("utf-8")).hexdigest()[:16]
    return f"{content_hash}_{index}"
@classmethod
@overload
def strip_annotations(cls, records: RecordType) -> RecordType:
"""When `strip_annotations` is called on a single record, internal annotations are stripped."""
...
@classmethod
@overload
def strip_annotations(cls, records: NormalizedRecordList) -> NormalizedRecordList:
"""When `strip_annotations` is called on a normalized record, internal annotations are stripped."""
...
@classmethod
@overload
def strip_annotations(cls, records: RecordList) -> RecordList:
"""When `strip_annotations` is called on a record list, each record is stripped, returning a list."""
...
@classmethod
@overload
def strip_annotations(cls, records: None) -> None:
"""When `strip_annotations` is called and a record is None, None is returned."""
...
[docs]
@classmethod
def strip_annotations(
cls,
records: Optional[Union[RecordType, RecordList, NormalizedRecordList]],
) -> Optional[Union[RecordType, RecordList, NormalizedRecordList]]:
"""Removes metadata annotations from records by filtering out keys prefixed with underscore.
This method creates clean copies of records without internal pipeline metadata fields that may be added during
(e.g., '_extraction_index', '_record_id') processing when record annotation is enabled.
Args:
records (RecordType | RecordList):
A single dictionary record or a list of dictionary records to clean. Records should contain dictionary
elements with string keys.
Returns:
RecordType: A new dictionary with annotation fields removed if input is a single record.
RecordList: A new list of dictionaries with annotation fields removed if input is a list.
Note:
The original records are not modified. This method instead return a new dictionary or a new list of
dictionaries with only non-annotation fields preserved.
"""
if records is None:
return None
if not isinstance(records, (list, Iterator, Mapping)):
raise TypeError(
"Expected a dict or list of dicts to strip metadata annotations from, but received type "
f"{type(records)}."
)
if isinstance(records, (Iterator, list)):
return [
filter_record_key_prefixes(record if record is not None else {}, prefix="_", invert=False)
for record in records
]
return filter_record_key_prefixes(records, prefix="_", invert=False)
@classmethod
def update(cls, data_extractor: BaseDataExtractor, **data_extractor_kwargs: Any) -> Self:
    """Create a new DataExtractor from an existing extractor, replacing only the specified components.

    Args:
        data_extractor (BaseDataExtractor): A previously created extractor whose settings seed the
            new instance.
        **data_extractor_kwargs:
            Keyword arguments used to replace components of the DataExtractor. Unspecified fields
            from the previous extractor remain unchanged; unrecognized keys are ignored.

    Returns:
        DataExtractor: A new data extractor instance with the specified parameter updates.

    Raises:
        TypeError: If `data_extractor` is not a BaseDataExtractor (or subclass) instance.
    """
    if not isinstance(data_extractor, BaseDataExtractor):
        raise TypeError(
            "Expected a BaseDataExtractor or subclass to perform parameter updates. Received type "
            f"{type(data_extractor)}"
        )
    # Seed from the current configuration; attributes absent on plain BaseDataExtractor
    # instances fall back to the same defaults the constructor accepts.
    parameters = {
        "record_path": data_extractor.record_path,
        "metadata_path": data_extractor.metadata_path,
        "dynamic_record_identifiers": getattr(data_extractor, "dynamic_record_identifiers", None),
        "dynamic_metadata_identifiers": getattr(data_extractor, "dynamic_metadata_identifiers", None),
        "annotate_records": getattr(data_extractor, "annotate_records", False),
    }
    # Apply only recognized overrides so unrelated keyword arguments are ignored, as before.
    for field_name in parameters:
        if field_name in data_extractor_kwargs:
            parameters[field_name] = data_extractor_kwargs[field_name]
    return cls(**parameters)
__all__ = ["DataExtractor"]  # Public API of this module