Source code for scholar_flux.data.data_parser

# /data/data_parser.py
"""The scholar_flux.data.data_parser module defines the DataParser used within the scholar_flux API to parse JSON as
well as uncommon response formats.

This module implements the DataParser which allows for custom overrides to JSON, XML, and YAML files to prepare and
parse dictionary-based nested structures prior to record extraction and processing.

"""

from typing import Optional, Callable
from scholar_flux.data.base_parser import BaseDataParser
from scholar_flux.exceptions import DataParsingException
from scholar_flux.utils.response_protocol import ResponseProtocol
import requests

import logging

logger = logging.getLogger(__name__)


[docs] class DataParser(BaseDataParser): """Extensible class that handles the identification and parsing of typical formats seen in APIs that send news and academic articles in XML, JSON, and YAML formats. The BaseDataParser contains each of the necessary class elements to parse JSON, XML, and YAML formats as class methods while this class allows for the specification of additional parsers. Args: additional_parsers (Optional[dict[str, Callable]]): Allows overrides for parsers in addition to the JSON, XML and YAML parsers that are enabled by default. """
[docs] def __init__(self, additional_parsers: Optional[dict[str, Callable]] = None): """On initialization, the data parser is set to use built-in class methods to parse json, xml, and yaml-based response content by default and the parse helper class to determine which parser to use based on the Content- Type. Args: additional_parsers (Optional[dict[str, Callable]]): Allows for the addition of new parsers and overrides to class methods to be used on content-type identification. """ self.format_parsers = self.get_default_parsers() | (additional_parsers or {})
[docs] def parse( self, response: requests.Response | ResponseProtocol, format: Optional[str] = None ) -> dict | list[dict] | None: """Parses the API response content using to core steps. 1. Detects the API response format if a format is not already specified 2. Uses the previously determined format to parse the content of the response and return a parsed dictionary (json) structure. Args: response (requests.Response | ResponseProtocol): The response or response-like object from the API request. format (str): The parser needed to format the response as a list of dicts Returns: dict: response dict containing fields including a list of metadata records as dictionaries. """ try: use_format = format.lower() if format is not None else self.detect_format(response) parser = self.format_parsers.get(use_format, None) if use_format else None if parser is not None: return parser(response.content) else: logger.error("Unsupported format: %s", format) return None except DataParsingException: raise except Exception as e: raise DataParsingException(f"An error occurred during response content parsing: {e}") from e
__all__ = ["DataParser"]