# Source code for scholar_flux.data.base_extractor

# /data/base_extractor
"""The scholar_flux.data.base_extractor implements the core processes used to extract data from parsed responses when
the structure and the locations of records and metadata are already known."""
from typing import Any, Optional, Union
from scholar_flux.exceptions import DataExtractionException
from scholar_flux.utils import get_nested_data, try_int, try_dict, as_list_1d, unlist_1d
from scholar_flux.utils.repr_utils import generate_repr

import logging

logger = logging.getLogger(__name__)



# [docs]
class BaseDataExtractor:
    """Base DataExtractor implementing the minimum components necessary to extract records and metadata from parsed
    responses when the location of records and metadata is known beforehand."""


[docs]
    def __init__(
        self,
        record_path: Optional[list] = None,
        metadata_path: Optional[list[list] | dict[str, list]] = None,
    ):
        """Initialize the DataExtractor with metadata and records to extract separately.

        If record path or metadata_path are specified,
           then the data extractor will attempt to retrieve the metadata and records at the
           provided paths. Note that, as metadata_paths can be associated with multiple keys,
           starting from the outside dictionary, we may have to specify a dictionary containing
           keys denoting metadata variables and their paths as a list of values indicating how to
           retrieve the value. The path can also be given by a list of lists describing how to
           retrieve the last element.

        Args:
            record_path (Optional[List[str]]): Custom path to find records in the parsed data. Contains a list of strings and
                                               rarely integers indexes indicating how to recursively find the list of records
            metadata_path (List[List[str]] | Optional[Dict[str, List[str]]]): Identifies the paths in a dictionary
                associated with metadata as opposed to records. This can be a list of paths where each element is a list
                describing how to get to a terminal
            element

        """
        self.metadata_path = metadata_path or {}
        self.record_path = record_path
        self._validate_inputs()


    def _validate_inputs(self) -> None:
        """Method used to validate the inputs provided to the DataExtractor prior to its later use In extracting
        metadata and records. This method operates by verifying the attributes associated with the current data
        extractor once the attributes are set.

        Validated Attributes:
            record_path (Optional[List[str | None]]): The path where a list of records are located
            metadata_path (Optional[List[str | None]]): The list or dictionary of paths where metadata records are located
            dynamic_record_identifiers (Optional[List[str | None]]): Keyword identifier indicating when singular records in a dictionary
                                                                       can be identified as such in contrast to metadata
            dynamic_metadata_identifiers (Optional[List[str | None]]): Keyword identifier indicating when record metadata keys in a dictionary
                                                                        can be identified as such in contrast to metadata
        Raises:
            DataExtractionException: Indicates an error in the DataExtractor and identifies where the inputs take on an invalid value

        """
        self._validate_paths(self.record_path, self.metadata_path)
        return None

    @classmethod
    def _validate_paths(
        cls,
        record_path: Optional[list] = None,
        metadata_path: Optional[list[list] | dict[str, list]] = None,
    ):
        """
        Method used to validate the path inputs provided to the DataExtractor prior to its later use
        In extracting metadata and records
        Args:
            record_path (Optional[List[str | None]]): The path where a list of records are located
            metadata_path (Optional[List[str | None]]): The list or dictionary of paths where metadata records are located
        Raises:
            DataExtractionException: Indicates an error in the DataExtractor and identifies where the inputs take on an invalid value
        """
        try:
            if record_path is not None:
                if not isinstance(record_path, list):
                    raise TypeError(f"A list is required for a record path. Received: {type(record_path)}")

                if not all(isinstance(path, (str, int)) for path in record_path):
                    raise KeyError(
                        f"At least one path in the provided record path is not an integer or string: {record_path}"
                    )
            if metadata_path is not None:
                if not isinstance(metadata_path, (list, dict)):
                    raise KeyError(
                        f"The provided metadata path override is not a list or dictionary: {type(metadata_path)}"
                    )
                if not all(isinstance(path, (str, int, list)) for path in metadata_path):
                    raise KeyError(
                        f"At least one path in the provided metadata path override is not a list, integer, or string: {metadata_path}"
                    )

        except (KeyError, TypeError) as e:
            raise DataExtractionException(
                f"Error initializing the DataExtractor: At least one of the inputs are invalid. {e}"
            ) from e
        return None


[docs]
    def extract_metadata(self, parsed_page_dict: dict) -> dict:
        """Extract metadata from the parsed page dictionary.

        Args:
            parsed_page_dict (Dict): The dictionary containing the page data to be parsed.

        Returns:
            Dict: The extracted metadata.

        """
        if not self.metadata_path:
            logger.info("Metadata paths are empty: skipping metadata extraction")
            return {}

        metadata = {}
        try:
            if isinstance(self.metadata_path, list):
                # converts a list into a dictionary to ensure compatibility with the current method
                # as_list_1d ensures that, if the current path is not in a list, it is coerced into a list
                metadata_path = {as_list_1d(path)[-1]: as_list_1d(path) for path in self.metadata_path}
            else:
                ## ensures that all paths are lists and nests the path in a list otherwise
                metadata_path = {as_list_1d(key)[-1]: as_list_1d(path) for key, path in self.metadata_path.items()}

            # attempts to retrieve the path from the dictionary of metadata paths
            metadata = {key: try_int(get_nested_data(parsed_page_dict, path)) for key, path in metadata_path.items()}

            missing_keys = [str(k) for k, v in metadata.items() if v is None]
            if missing_keys:
                logger.warning(f"The following metadata keys are missing or None: {', '.join(missing_keys)}")

        except KeyError as e:
            logger.error(f"Error extracting metadata due to missing key: {e}")

        except Exception as e:
            msg = f"An unexpected error occurred during metadata extraction due to the following exception: {e}"
            logger.error(msg)
            raise DataExtractionException(msg)

        return metadata



[docs]
    def extract_records(self, parsed_page_dict: dict) -> Optional[list[dict[str, Any]]]:
        """Extract records from parsed data as a list of dicts.

        Args:
            parsed_page_dict (Dict): The dictionary containing the page data to be parsed.

        Returns:
            Optional[List[Dict]]: A list of records as dictionaries, or None if extraction fails.

        """
        try:
            nested_data = get_nested_data(parsed_page_dict, self.record_path) if self.record_path else None

            if isinstance(nested_data, list):
                return nested_data

            if not nested_data:
                logger.debug(f"No records extracted from path {self.record_path}")
                return None

            logger.debug(f"Expected a list at path {self.record_path}. Instead received {type(nested_data)}")
            return None
        except Exception as e:
            msg = f"An unexpected error occurred during record extraction due to the following exception: {e}"
            logger.error(msg)
            raise DataExtractionException(msg)


    @classmethod
    def _prepare_page(cls, parsed_page: Union[list[dict], dict]) -> dict:
        """Prepares the JSON data for metadata and record extraction by coercing it into a dictionary if not already a
        dictionary.

        Args:
            parsed_page (List[Dict] | Dict): The list or dictionary containing the page data and metadata to be
                                             extracted.

        Returns:
            Dict]: A dictionary containing the metadata and records to extract

        """

        if isinstance(parsed_page, list):
            parsed_page = unlist_1d(parsed_page)

        if not isinstance(parsed_page, dict):
            parsed_page_dict = try_dict(parsed_page)

            if parsed_page_dict is None:
                raise DataExtractionException(
                    f"Error converting parsed_page_dict of type {parsed_page} to a dictionary"
                )
            parsed_page_dict = {str(k): v for k, v in parsed_page_dict.items()}
            parsed_page = parsed_page_dict
        return parsed_page


[docs]
    def extract(self, parsed_page: Union[list[dict], dict]) -> tuple[Optional[list[dict]], Optional[dict[str, Any]]]:
        """Extract both records and metadata from the parsed page dictionary.

        Args:
            parsed_page (List[Dict] | Dict): The dictionary containing the page data and metadata to be extracted.

        Returns:
            Tuple[Optional[List[Dict]], Optional[Dict]]: A tuple containing the list of records and the metadata dictionary.

        """

        parsed_page = self._prepare_page(parsed_page)

        records = self.extract_records(parsed_page)
        metadata = self.extract_metadata(parsed_page)

        return records, metadata


    def __call__(self, parsed_page: Union[list[dict], dict]) -> tuple[Optional[list[dict]], Optional[dict[str, Any]]]:
        """Helper method enabling users to call the extractor as a function to extract both records and metadata.

        Args:
            parsed_page (List[Dict] | Dict): The dictionary containing the page data and metadata to be extracted.

        Returns:
            Tuple[Optional[List[Dict]], Optional[Dict]]: A tuple containing the list of records and the metadata dictionary.

        """
        return self.extract(parsed_page)


[docs]
    def structure(self, flatten: bool = False, show_value_attributes: bool = True) -> str:
        """Base method for showing the structure of the current Data Extractor. This  method reveals the configuration
        settings of the extractor config that will be used to extract records and metadata.

        Returns:
            str: The current structure of the BaseDataExtractor or its subclass.

        """

        return generate_repr(self, flatten=flatten, show_value_attributes=show_value_attributes)


    def __repr__(self) -> str:
        """Base method for identifying the current implementation of the BaseDataExtractor. Subclasses can override this
        for more specific descriptions of attributes and defaults. Useful for showing the options being used for
        extracting metadata and records from the parsed json/data dictionaries from the api response.

        Returns:
            str: The representation of the current object

        """
        return self.structure()



__all__ = ["BaseDataExtractor"]