# Source code for scholar_flux.api.normalization.springer_nature_field_map

# scholar_flux.api.normalization.springer_nature_field_map.py
"""scholar_flux.api.normalization.springer_nature_field_map.py defines the normalization steps for Springer Nature."""
from scholar_flux.api.normalization.academic_field_map import AcademicFieldMap, URL_PATTERN
from scholar_flux.utils.record_types import NormalizedRecordType
from typing import Optional
from re import Pattern



class SpringerNatureFieldMap(AcademicFieldMap):
    """Springer Nature specific field mapping with custom transformations.

    The `SpringerNatureFieldMap` defines a minimal set of Springer Nature-specific post-processing steps that aid in the
    normalization of preprocessed records retrieved from the Springer Nature API.

    Post-Processed Fields:
        - Year extraction from publication date
        - Author normalization
        - Open access status conversion from string to boolean
        - ISO date extraction for the creation and publication dates
        - URL extraction from nested URL objects

    Note:
        Springer Nature records may contain arrays and nested objects for key fields (e.g., `year` and primary URL).
        The field map configuration and post-processing logic handle these for consistent normalization.

    """

    def _post_process(self, record: NormalizedRecordType) -> NormalizedRecordType:
        """Applies Springer Nature-specific transformations to an individual Springer Nature record.

        The parent class post-processing runs first; the Springer Nature-specific fields are then overwritten
        in place on the record returned by the parent.

        Args:
            record (NormalizedRecordType): The Normalized Springer Nature record dictionary to further process.

        Returns:
            NormalizedRecordType: The normalized, post-processed Springer Nature record with transformations applied.

        """
        record = super()._post_process(record)

        # Extract year from date strings (e.g., "2026-03-01" -> "2026").
        # This reads `date_published` before that field is rewritten as an ISO date further below.
        record["year"] = self.extract_year(record, "date_published")

        # Extract and format the list of authors for the current record
        record["authors"] = self.extract_authors(record)

        # Convert open_access string to boolean
        record["open_access"] = self.extract_open_access(record)

        # Extracts the article/record creation date when available
        record["date_created"] = self.extract_iso_date(record, "date_created")

        # Extracts the article/record publication date when available
        record["date_published"] = self.extract_iso_date(record, "date_published")

        # Springer Nature-specific URL extraction for the first valid URL in a nested array of URLs
        record["url"] = self.extract_primary_url(record, field="url")

        return record

    @classmethod
    def extract_primary_url(
        cls, record: NormalizedRecordType, field: str = "url", pattern_delimiter: Optional[str | Pattern] = URL_PATTERN
    ) -> Optional[str]:
        """Extracts the primary (or first valid) URL from a record field with a flat or nested JSON structure.

        The lookup is delegated to `.extract_url()` with four candidate paths: the field itself, its first
        element, its `"value"` key, and the `"value"` key of its first element.

        Args:
            record (NormalizedRecordType): The normalized record to extract the primary URL field from
            field (str): The field to extract a primary URL from
            pattern_delimiter (Optional[str | re.Pattern]):
                An optional pattern used to separate URLs when combined as a list for primary URL extraction

        Returns:
            Optional[str]: The extracted primary URL when extraction is successful, and None otherwise.

        """
        # `or None` collapses any falsy result from `extract_url` (e.g., an empty string) into None
        return (
            cls.extract_url(
                record, [field], [field, 0], [field, "value"], [field, 0, "value"], pattern_delimiter=pattern_delimiter
            )
            or None
        )

    @classmethod
    def extract_open_access(cls, record: NormalizedRecordType, field: str = "open_access") -> Optional[bool]:
        """Extracts the current record's open access status by delegating processing to `.extract_boolean_field()`.

        Args:
            record (NormalizedRecordType): The normalized record to extract the open_access status for
            field (str): The field to extract the open access status from.

        Returns:
            Optional[bool]: The open access status of the record when available, and None otherwise.

        Examples:
            >>> from scholar_flux.api.normalization import SpringerNatureFieldMap
            >>> record = {"doi": "10.1234/example", "title": "Sample Article","open_access": "true"}
            >>> SpringerNatureFieldMap.extract_open_access(record)
            # OUTPUT: True
            >>> record = {"doi": "10.5678/example", "title": "Sample Publication","open_access": "false"}
            >>> SpringerNatureFieldMap.extract_open_access(record)
            # OUTPUT: False
            >>> record = {"title": "Another Article","open_access": "N/A"}
            >>> SpringerNatureFieldMap.extract_open_access(record)
            # OUTPUT: None

        """
        return cls.extract_boolean_field(record, field)




# Module-level `SpringerNatureFieldMap` instance for the "springernature" provider.
# Each keyword names a normalized field; its value is presumably the key/path of the
# corresponding field in a Springer Nature record (None where no source field is mapped).
field_map = SpringerNatureFieldMap(
    provider_name="springernature",
    # Identifiers
    doi="doi",
    url="url",  # Array of URL objects; `_post_process` reduces it to a single primary URL
    record_id="identifier",
    # Bibliographic
    title="title",
    abstract="abstract",
    authors="creators.creator",  # Auto-traverses list of creator names
    # Publication metadata
    journal="publicationName",
    publisher="publisher",
    year="publicationDate",  # Same source as `date_published`; `_post_process` extracts the year
    date_published="publicationDate",
    date_created="onlineDate",
    # Content
    keywords="keyword",
    subjects="subjects",
    full_text=None,  # Not provided in metadata API
    # Metrics
    citation_count=None,  # Not in metadata API
    # Access
    open_access="openaccess",  # Converted to a boolean in `_post_process` via `extract_open_access`
    license="copyright",  # Or openaccess field
    # Metadata
    record_type="contentType",  # Article, Chapter, Book, etc.
    language="language",
    # API-specific fields
    api_specific_fields={
        "isbn": "isbn",
        "issn": "issn",
        "eisbn": "eisbn",
        "eissn": "eIssn",
        "journal_id": "journalId",
        "volume": "volume",
        "issue": "number",  # Issue number
        "start_page": "startingPage",
        "end_page": "endingPage",
        "article_type": "genre",
        # Maps the same `url` source as the normalized `url` field above
        # (presumably to keep the complete URL list available -- confirm)
        "url_list": "url",
    },
)


# Public names exported by this module
__all__ = ["SpringerNatureFieldMap", "field_map"]