Source code for scholar_flux.api.normalization.core_field_map

# scholar_flux.api.normalization.core_field_map.py
"""The scholar_flux.api.normalization.core_field_map.py module defines the normalization mappings used for Core API."""
from scholar_flux.api.normalization.academic_field_map import AcademicFieldMap
from scholar_flux.utils.helpers import as_tuple, try_none
from scholar_flux.utils.record_types import NormalizedRecordType
from typing import Optional



[docs]
class CoreFieldMap(AcademicFieldMap):
    """Field map for the Core API with Core-specific record post-processing.

    The Core API serves open access scholarly content aggregated from thousands of repositories worldwide.

    On top of the mappings inherited from `AcademicFieldMap`, this class provides helpers that extract cross-platform
    identifiers from a normalized record, plus a post-processing step that brings every normalized Core record into
    one consistent structure.

    Post-Processed Fields:
        - Year extraction from various date formats
        - Journal list flattening (Core can return multiple journal titles)
        - Record ID coercion to string format
        - Author, creation date and publication date normalization
        - Open access default (Core sources are generally all open access)
        - Cross-reference identifier extraction (arXiv, PubMed, MAG IDs)
        - OAI identifier normalization for entity resolution

    """

    def _post_process(self, record: NormalizedRecordType) -> NormalizedRecordType:
        """Runs the Core API-specific transformations over a single normalized Core record.

        The parent class post-processing is applied first; every Core-specific field is then recomputed and written
        back onto the record, one field at a time.

        Args:
            record (NormalizedRecordType): The normalized Core API record dictionary to further process.

        Returns:
            NormalizedRecordType: The normalized, post-processed Core record with all transformations applied.

        """
        record = super()._post_process(record)

        # Ordered (field, extractor) pairs: each extractor is handed the record as it stands after all earlier
        # fields have been written back, so the sequence below must be kept as-is.
        steps = (
            # Year taken from date strings (e.g., "2025-12-01" -> 2025)
            ("year", self.extract_year),
            # Record ID coerced into a string
            ("record_id", self.extract_id),
            # Journal list flattened into a semicolon-delimited string
            ("journal", self.extract_journal),
            # Author fields kept as lists when available
            ("authors", self.extract_authors),
            # Cleaned arXiv cross-reference identifier
            ("arxiv_id", self.extract_arxiv_id),
            # Article/record creation date, when available
            ("date_created", lambda rec: self.extract_iso_date(rec, "date_created")),
            # Article/record publication date, when available
            ("date_published", lambda rec: self.extract_iso_date(rec, "date_published")),
            # PubMed record ID resolution
            ("pmid", self.extract_pmid),
            # Microsoft Academic Graph identifier resolution
            ("mag_id", self.extract_mag_id),
            # Core articles are generally open access, although the API exposes no explicit field for it
            ("open_access", lambda _rec: True),
            # OAI IDs for the current record
            ("oai_ids", self.extract_oai_ids),
        )
        for field_name, extractor in steps:
            record[field_name] = extractor(record)

        return record


[docs]
    @classmethod
    def extract_arxiv_id(cls, record: NormalizedRecordType) -> Optional[str]:
        """Returns the arXiv identifier of a record for cross-database entity resolution.

        Args:
            record (NormalizedRecordType): Normalized Core record dictionary.

        Returns:
            Optional[str]: The arXiv ID (e.g., '1012.4340'), or None when the record does not carry one.

        """
        # Delegates to the shared ID extractor, pointed at the `arxiv_id` field
        arxiv_id = cls.extract_id(record, "arxiv_id")
        return arxiv_id



[docs]
    @classmethod
    def extract_pmid(cls, record: NormalizedRecordType) -> Optional[str]:
        """Returns the PubMed identifier of a record for cross-database entity resolution.

        The `pmid` field is consulted first; the `pubmed_id` field is only used when `pmid` yields no usable value.

        Args:
            record (NormalizedRecordType): Normalized Core record dictionary.

        Returns:
            Optional[str]: The PubMed ID, or None when neither field provides one.

        Examples:
            >>> CoreFieldMap.extract_pmid({"pmid": "12345678"})
            '12345678'
            >>> CoreFieldMap.extract_pmid({"pmid": "None"}) is None
            True

        """
        primary = cls.extract_id(record, "pmid")
        if primary:
            return primary

        # Nothing usable under `pmid`: fall back to the alternate `pubmed_id` key
        return cls.extract_id(record, "pubmed_id")



[docs]
    @classmethod
    def extract_mag_id(cls, record: NormalizedRecordType) -> Optional[str]:
        """Returns the Microsoft Academic Graph (MAG) identifier of a record for cross-database entity resolution.

        Args:
            record (NormalizedRecordType): Normalized Core record dictionary.

        Returns:
            Optional[str]: The MAG ID, or None when the record does not carry one.

        Examples:
            >>> CoreFieldMap.extract_mag_id({"mag_id": "2056403249"})
            '2056403249'
            >>> CoreFieldMap.extract_mag_id({"mag_id": "None"}) is None
            True

        """
        # Delegates to the shared ID extractor, pointed at the `mag_id` field
        mag_id = cls.extract_id(record, "mag_id")
        return mag_id



[docs]
    @classmethod
    def extract_oai_ids(cls, record: NormalizedRecordType) -> Optional[list[str]]:
        """Extracts the OAI identifiers for cross-database entity resolution.

        Args:
            record (NormalizedRecordType): Normalized Core record dictionary.

        Returns:
            Optional[list[str]]:
                None when the record has no `oai_ids` entry (or the entry is None). Otherwise a list holding the
                string form of every entry for which `try_none` returns a truthy value; the list is empty when no
                entry qualifies.

        """
        oai_ids = record.get("oai_ids")

        # A missing field stays None so that callers can tell "not provided" apart from "provided but empty"
        if oai_ids is None:
            return None

        # `as_tuple` lets a single value and a collection of values be iterated the same way;
        # entries that `try_none` resolves to a falsy value are dropped
        return [str(oai_id) for oai_id in as_tuple(oai_ids) if try_none(oai_id)]




# Module-level `CoreFieldMap` instance for the "core" provider.
# Each keyword names a normalized field and each string value names the Core API response key it is read from.
# NOTE(review): dotted values such as "authors.name" presumably address nested keys — confirm against the
# path handling implemented by the parent field map.
field_map = CoreFieldMap(
    provider_name="core",
    # ==== Core Identifiers ====
    doi="doi",
    url="downloadUrl",  # Direct PDF/full text link
    record_id="id",
    # ==== Bibliographic ====
    title="title",
    abstract="abstract",
    authors="authors.name",
    # ==== Publication Metadata ====
    journal="journals.title",
    publisher="publisher",
    year="yearPublished",
    date_published="publishedDate",
    date_created="createdDate",
    # ==== Content ====
    # NOTE(review): `keywords` and `subjects` both read "fieldOfStudy" — confirm the duplication is intended
    keywords="fieldOfStudy",
    subjects="fieldOfStudy",
    full_text="fullText",
    # ==== Metrics ====
    citation_count="citationCount",
    # ==== Access ====
    open_access=None,  # No source key is mapped; the value comes from `default_field_values` below
    license=None,  # Not provided by Core API
    # ==== Metadata ====
    record_type="documentType",
    language="language.name",
    # Core sources are generally all open access, so the field defaults to True
    default_field_values={"open_access": True},
    api_specific_fields={
        # Cross-reference identifiers for entity resolution
        "arxiv_id": "arxivId",
        "pmid": "pubmedId",
        "mag_id": "magId",
        # OAI identifiers for cross-repository deduplication
        "oai_ids": "oaiIds",
        # Reference/citation data for graph construction
        "references": "references",
    },
)

# Public API of this module: the field-map class and its ready-to-use Core instance
__all__ = ["CoreFieldMap", "field_map"]