Source code for scholar_flux.data.data_extractor

# /data/data_extractor.py
"""The scholar_flux.data.data_extractor builds on the BaseDataExtractor to implement automated extraction when the paths
are not known beforehand.

The extracted list of records and the metadata dictionary are used in later steps, prior to further record
processing.

"""
from typing import Any, Optional, Union
from scholar_flux.exceptions import DataExtractionException

from scholar_flux.data.base_extractor import BaseDataExtractor

import logging

# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)


class DataExtractor(BaseDataExtractor):
    """Extract records and metadata from parsed API responses.

    This is the second stage of the response processing step, where metadata and records are
    extracted from parsed responses. The data extractor provides two ways to identify metadata
    paths and record paths:

    1) Manual identification:
        If ``record_path`` or ``metadata_path`` is specified, the data extractor attempts to
        retrieve the metadata and records at the provided paths. Because metadata can be spread
        over multiple keys, ``metadata_path`` may be given as a dictionary mapping metadata
        variable names to the path (a list of keys) used to retrieve each value, or as a list of
        lists where each inner list describes how to reach one terminal element.

    2) Dynamic identification:
        Uses heuristics to tell records apart from metadata. Records are nearly always a list
        containing only dictionaries, while metadata generally contains a variety of elements,
        some nested and others integers, strings, etc.

        In cases where it's harder to determine, ``dynamic_record_identifiers`` is used to decide
        whether a list containing a single nested dictionary is a record or metadata. For
        scientific purposes, its keys may contain 'abstract', 'title', 'doi', etc. These can be
        defined manually by the user if the defaults are not reliable for a given API.

    Upon initializing the class, the instance can be used as a callable that returns the records
    and metadata, in that order.

    Example:
        >>> from scholar_flux.data import DataExtractor
        >>> data = dict(query='specification driven development', options={'record_count':5,'response_time':'50ms'})
        >>> data['records'] = [dict(id=1, record='protocol vs.code'), dict(id=2, record='Impact of Agile')]
        >>> extractor = DataExtractor()
        >>> records, metadata = extractor(data)
        >>> print(metadata)
        # OUTPUT: {'query': 'specification driven development', 'record_count': 5, 'response_time': '50ms'}
        >>> print(records)
        # OUTPUT: [{'id': 1, 'record': 'protocol vs.code'}, {'id': 2, 'record': 'Impact of Agile'}]

    """

    DEFAULT_DYNAMIC_RECORD_IDENTIFIERS = ("title", "doi", "abstract")
    DEFAULT_DYNAMIC_METADATA_IDENTIFIERS = ("metadata", "facets", "IdList")

    def __init__(
        self,
        record_path: Optional[list] = None,
        metadata_path: Optional[list[list] | dict[str, list]] = None,
        dynamic_record_identifiers: Optional[list | tuple] = None,
        dynamic_metadata_identifiers: Optional[list | tuple] = None,
    ):
        """Initialize the DataExtractor with optional path overrides for metadata and records.

        Args:
            record_path (Optional[List[str]]):
                Custom path to find records in the parsed data. Contains a list of strings (and
                rarely integer indexes) indicating how to recursively find the list of records.
            metadata_path (Optional[List[List[str]] | Dict[str, List[str]]]):
                Identifies the paths in a dictionary associated with metadata as opposed to
                records. This can be a list of paths where each element is a list describing how
                to get to a terminal element.
            dynamic_record_identifiers (Optional[List[str]]):
                Helps to identify dictionary keys that only belong to records when dealing with a
                single element that would otherwise be classified as metadata. Defaults to
                ``DEFAULT_DYNAMIC_RECORD_IDENTIFIERS`` when None.
            dynamic_metadata_identifiers (Optional[List[str]]):
                Helps to identify dictionary keys that are likely to only belong to metadata that
                could otherwise share a similar structure to a list of dictionaries, similar to
                what's seen with records. Defaults to ``DEFAULT_DYNAMIC_METADATA_IDENTIFIERS``
                when None.

        """
        # The identifiers are assigned before super().__init__() because _validate_inputs
        # (overridden below) reads them, and its docstring states it runs once
        # super().__init__(...) is called.
        self.dynamic_record_identifiers = (
            dynamic_record_identifiers
            if dynamic_record_identifiers is not None
            else self.DEFAULT_DYNAMIC_RECORD_IDENTIFIERS
        )
        self.dynamic_metadata_identifiers = (
            dynamic_metadata_identifiers
            if dynamic_metadata_identifiers is not None
            else self.DEFAULT_DYNAMIC_METADATA_IDENTIFIERS
        )
        super().__init__(record_path, metadata_path)

    @classmethod
    def _validate_dynamic_identifiers(
        cls,
        dynamic_record_identifiers: Optional[list | tuple] = None,
        dynamic_metadata_identifiers: Optional[list | tuple] = None,
    ):
        """Validate the dynamic identifiers provided to the DataExtractor.

        Each non-None argument must be a list or tuple whose elements are all strings.

        Args:
            dynamic_record_identifiers (Optional[List[str]]):
                Keywords indicating when a singular record in a dictionary can be identified as a
                record in contrast to metadata.
            dynamic_metadata_identifiers (Optional[List[str]]):
                Keywords indicating when keys in a dictionary can be identified as metadata in
                contrast to records.

        Raises:
            DataExtractionException:
                Indicates an error in the DataExtractor and identifies where the inputs take on
                an invalid value.

        """
        labelled_identifiers = (
            ("record", dynamic_record_identifiers),
            ("metadata", dynamic_metadata_identifiers),
        )
        try:
            for label, identifiers in labelled_identifiers:
                if identifiers is None:
                    continue
                # KeyError is kept as the internal signal so the chained cause and the wrapped
                # message format stay the same for existing callers.
                if not isinstance(identifiers, (list, tuple)):
                    raise KeyError(
                        f"The dynamic {label} identifiers provided must be a tuple or list. "
                        f"Received: {type(identifiers)}"
                    )
                if not all(isinstance(identifier, str) for identifier in identifiers):
                    # Only strings pass the check above, so the message says "string" only.
                    raise KeyError(
                        f"At least one value in the provided dynamic {label} identifier "
                        f"is not a string: {identifiers}"
                    )
        except (KeyError, TypeError) as e:
            raise DataExtractionException(
                f"Error initializing the DataExtractor: At least one of the inputs are invalid. \n{e}"
            ) from e
        return None

    def _validate_inputs(self) -> None:
        """Validate the attributes of the current DataExtractor once they are set.

        Note that this method is overridden so that all additional fields are validated once
        super().__init__(...) is called.

        Validated Attributes:
            record_path (Optional[List[str]]):
                The path where a list of records is located.
            metadata_path (Optional[List[List[str]] | Dict[str, List[str]]]):
                The list or dictionary of paths where metadata values are located.
            dynamic_record_identifiers (Optional[List[str]]):
                Keywords indicating when a singular record in a dictionary can be identified as a
                record in contrast to metadata.
            dynamic_metadata_identifiers (Optional[List[str]]):
                Keywords indicating when keys in a dictionary can be identified as metadata in
                contrast to records.

        Raises:
            DataExtractionException:
                Indicates an error in the DataExtractor and identifies where the inputs take on
                an invalid value.

        """
        self._validate_paths(self.record_path, self.metadata_path)
        self._validate_dynamic_identifiers(self.dynamic_record_identifiers, self.dynamic_metadata_identifiers)
        return None

    def dynamic_identification(self, parsed_page_dict: dict) -> tuple[list[dict[str, Any]], dict[str, Any]]:
        """Dynamically identify and separate metadata from records.

        The dictionary is traversed recursively and each key/value pair is classified as follows:

        - A key listed in ``dynamic_metadata_identifiers`` is stored as metadata as-is.
        - A nested dictionary is traversed recursively; the metadata and records found inside it
          are merged into the current results (nested metadata keys are flattened).
        - A list containing more than one element, all dictionaries, is treated as records.
        - A list containing exactly one dictionary is treated as a record when one of its keys
          contains any of the ``dynamic_record_identifiers``; otherwise that dictionary is
          traversed recursively like a nested dictionary.
        - Any value that is neither a dictionary nor a list is stored as metadata.

        Args:
            parsed_page_dict (Dict): The dictionary containing the page data and metadata to be extracted.

        Returns:
            Tuple[List[Dict[str, Any]], Dict[str, Any]]:
                A tuple containing the list of record dictionaries followed by the metadata
                dictionary.

        """
        metadata: dict[str, Any] = {}
        records: list[dict[str, Any]] = []

        for key, value in parsed_page_dict.items():
            if key in self.dynamic_metadata_identifiers:
                metadata[key] = value
            elif isinstance(value, dict):
                sub_records, sub_metadata = self.dynamic_identification(value)
                metadata.update(sub_metadata)
                records.extend(sub_records)
            elif isinstance(value, list):
                # NOTE(review): lists holding any non-dict item fall through here and are added to
                # neither records nor metadata - confirm that this is intended.
                if all(isinstance(item, dict) for item in value):
                    if len(value) == 0:
                        logger.debug("Element at key: %s is empty", key)
                    elif len(value) > 1:
                        # assuming it's records if it's a list of dicts
                        records.extend(value)
                    elif self._identify_by_key(value[0], self.dynamic_record_identifiers):
                        # A single dictionary whose keys look like record fields
                        records.extend(value)
                    else:
                        # A single dictionary that does not look like a record: search inside it
                        sub_records, sub_metadata = self.dynamic_identification(value[0])
                        metadata.update(sub_metadata)
                        records.extend(sub_records)
            else:
                metadata[key] = value

        return records, metadata

    @staticmethod
    def _identify_by_key(record: Any, key_identifiers: list | tuple) -> bool:
        """Determine whether any key of a dictionary contains one of the given identifiers.

        Matching is by substring: an identifier matches when it occurs within a key of the
        record. If the record is not a dictionary, no identifiers are given, or no key matches,
        this method returns False.

        Args:
            record (Any):
                An element in a JSON object. If it is a dictionary, its keys are checked to
                determine whether any of the key identifiers occur within them.
            key_identifiers (list | tuple):
                Contains the identifiers to check for.

        Returns:
            bool: True if the record is a dictionary and at least one identifier occurs within
            at least one of its string keys, otherwise False.

        """
        # Guard first: the previous all([...]) built both elements eagerly, so a non-dict record
        # was still iterated (raising TypeError for non-iterables) before False could be returned.
        if not isinstance(record, dict) or not key_identifiers:
            return False

        # Non-string keys cannot contain a string identifier; skipping them avoids a TypeError
        # from the `in` operator.
        return any(
            isinstance(record_key, str) and id_key in record_key
            for id_key in key_identifiers
            for record_key in record
        )

    def extract(self, parsed_page: Union[list[dict], dict]) -> tuple[Optional[list[dict]], Optional[dict[str, Any]]]:
        """Extract both records and metadata from the parsed page dictionary.

        When either ``metadata_path`` or ``record_path`` is set, records and metadata are
        retrieved with ``extract_records`` and ``extract_metadata``. Otherwise they are identified
        with :meth:`dynamic_identification`.

        Args:
            parsed_page (List[Dict] | Dict): The dictionary containing the page data and metadata to be extracted.

        Returns:
            Tuple[Optional[List[Dict]], Optional[Dict]]: A tuple containing the list of records and the metadata
            dictionary.

        """
        parsed_page_dict = self._prepare_page(parsed_page)

        if self.metadata_path or self.record_path:
            records = self.extract_records(parsed_page_dict)
            metadata = self.extract_metadata(parsed_page_dict)
        else:
            records, metadata = self.dynamic_identification(parsed_page_dict)

        return records, metadata
# Public names exported by a star-import of this module.
__all__ = ["DataExtractor"]