Source code for scholar_flux.api.normalization.open_alex_field_map

# scholar_flux.api.normalization.open_alex_field_map.py
"""The scholar_flux.api.normalization.open_alex_field_map.py module defines the normalization mappings for OpenAlex."""

from scholar_flux.api.normalization.academic_field_map import AcademicFieldMap
from scholar_flux.utils.helpers import get_nested_data
from scholar_flux.utils.record_types import NormalizedRecordType
from typing import Optional
import re



[docs]
class OpenAlexFieldMap(AcademicFieldMap):
    """OpenAlex specific field mapping with custom transformations.

    The `OpenAlexFieldMap` implements a minimal set of methods for field extraction and abstract reconstruction,
    finalizing the structure of each normalized record in the post-processing step.

    Post-Processed Fields:
        - Abstract reconstruction from inverted index format
        - DOI normalization (stripping URL prefix)
        - PMID extraction from ids object
        - Author list cleanup (filter empty entries)

    """

    def _post_process(self, record: NormalizedRecordType) -> NormalizedRecordType:
        """Finalizes a normalized OpenAlex record with provider-specific clean-up.

        The parent class's post-processing runs first. Each derived field below is then recomputed from the
        resulting record, one after another, and written back onto it.

        Args:
            record (NormalizedRecordType): The normalized OpenAlex record dictionary to finish processing.

        Returns:
            NormalizedRecordType:
                The record returned by the parent post-processing step with the `abstract`, `doi`, `authors`,
                `year`, `open_access`, and `url` fields overwritten.

        """
        processed = super()._post_process(record)

        # NOTE(review): the class docstring lists PMID extraction as a post-processing step, but `extract_pmid`
        # is not invoked here -- confirm whether that omission is intentional.

        # Plain-text abstract rebuilt from the inverted index stored on the record
        processed["abstract"] = self.reconstruct_abstract(processed)

        # DOI without the https://doi.org/ prefix
        processed["doi"] = self.normalize_doi(processed)

        # Author list with empty/None entries dropped
        processed["authors"] = self.extract_authors(processed)

        # Year as an integer, or None when it cannot be coerced
        processed["year"] = self.extract_year(processed)

        # Whether the paper is open access (freely available to the public online)
        processed["open_access"] = self.extract_open_access(processed)

        # Validated `url` value, with `record_id` supplied as the fallback field to validate
        processed["url"] = self.extract_url(processed, "url", "record_id")

        return processed


[docs]
    @classmethod
    def reconstruct_abstract(
        cls, record: NormalizedRecordType, field: str = "abstract_inverted_index"
    ) -> Optional[str]:
        """Reconstructs abstract text from OpenAlex inverted index format.

        OpenAlex stores abstracts as inverted indexes where keys are words
        and values are arrays of positions where those words appear.

        Args:
            record (NormalizedRecordType): Normalized OpenAlex record dictionary.
            field (str): The field containing the inverted index.

        Returns:
            Optional[str]: Reconstructed abstract string, or None if not available.

        Examples:
            >>> record = {'abstract_inverted_index': {'Hello': [0], 'world': [1]}}
            >>> OpenAlexFieldMap.reconstruct_abstract(record)
            'Hello world'

        """
        inverted_index = record.get(field)
        if not isinstance(inverted_index, dict) or not inverted_index:
            return None

        # Build position -> word mapping
        position_word_pairs: list[tuple[int, str]] = []
        for word, positions in inverted_index.items():
            if isinstance(positions, list):
                for pos in positions:
                    if isinstance(pos, int):
                        if re.search(r"[a-zA-Z0-9]", word) and pos != 0 and not word.startswith(" "):
                            word = " " + word
                        position_word_pairs.append((pos, str(word)))

        if not position_word_pairs:
            return None

        # Sort by position and join words
        position_word_pairs.sort(key=lambda x: x[0])

        # Ensure that punctuation and special characters are not preceded by a space
        return "".join(word for _, word in position_word_pairs) if position_word_pairs else None



[docs]
    @classmethod
    def extract_pmid(cls, record: NormalizedRecordType, field: str = "pmid") -> Optional[str]:
        """Extracts PubMed ID from the ids object.

        Args:
            record (NormalizedRecordType): Normalized OpenAlex record dictionary.
            field (str): The field to extract the PMID from.

        Returns:
            Optional[str]: PMID string without URL prefix, or None if not found.

        Examples:
            >>> record = {'pmid': 'https://pubmed.ncbi.nlm.nih.gov/29241234'}
            >>> OpenAlexFieldMap.extract_pmid(record)
            '29241234'

        """
        pmid_url = record.get(field) or get_nested_data(record, "ids.pmid", verbose=False)
        return (
            (pmid_url.strip().removeprefix("https://pubmed.ncbi.nlm.nih.gov/").strip("/") or None)
            if isinstance(pmid_url, str)
            else None
        )



[docs]
    @classmethod
    def extract_open_access(cls, record: NormalizedRecordType, field: str = "open_access") -> Optional[bool]:
        """Derives a boolean open access flag from an OpenAlex record.

        The work is delegated to `extract_boolean_field`, which is given the OpenAlex status labels to treat as
        open access (`diamond`, `gold`, `green`, `hybrid`, `bronze`, plus the literal `true`), the labels to
        treat as not open access (`closed` and the literal `false`), and a default of `None`.

        Args:
            record (NormalizedRecordType): The Normalized OpenAlex record dictionary.
            field (str): The field to extract the open access status from.

        Returns:
            Optional[bool]:
                - True if the record is open access, i.e. its full text is freely accessible to the public.
                - False if the record is not open access.
                - None if the field is missing or the status cannot be determined from it.

        Note:
            How OpenAlex determines open access status is explained here:
                https://help.openalex.org/hc/en-us/articles/24347035046295-Open-Access-OA

        """
        # Status labels signalling that the work is open access
        open_statuses = ("diamond", "gold", "green", "hybrid", "bronze", "true")
        # Status labels signalling that the work is not open access
        closed_statuses = ("closed", "false")

        return cls.extract_boolean_field(
            record,
            field,
            true_values=open_statuses,
            false_values=closed_statuses,
            default=None,
        )




# Module-level `OpenAlexFieldMap` instance for the "openalex" provider. Each keyword names a normalized field and
# gives the key (or list of keys) of the raw OpenAlex record that it is read from; `None` means the field has no
# source key. NOTE(review): the dotted strings look like nested-key paths and the lists like alternative/combined
# sources -- how they are resolved is defined by the base class, confirm there.
field_map = OpenAlexFieldMap(
    provider_name="openalex",
    # Core identifiers
    doi="doi",
    url="primary_location.landing_page_url",
    record_id="id",
    # Bibliographic metadata
    title="title",
    abstract=None,  # Reconstructed from abstract_inverted_index in _post_process
    authors="authorships.author.display_name",
    # Publication metadata
    journal="primary_location.source.display_name",
    publisher="primary_location.source.host_organization_name",
    year="publication_year",
    date_published="publication_date",
    date_created="created_date",
    # Content and classification
    keywords="keywords.display_name",
    subjects=["topics.display_name", "concepts.display_name"],
    full_text=None,
    # Metrics and impact
    citation_count="cited_by_count",
    # Access Permissions (converted to a boolean by `extract_open_access` in _post_process)
    open_access=["open_access.is_oa", "open_access.oa_status"],
    license="primary_location.license",
    # Document metadata
    record_type="type",
    language="language",
    is_retracted="is_retracted",
    # API-specific fields (extensible for downstream use)
    api_specific_fields={
        # Required for abstract reconstruction (read by `reconstruct_abstract`)
        "abstract_inverted_index": "abstract_inverted_index",
        # Additional identifiers
        "openalex_id": "ids.openalex",
        "pmid": "ids.pmid",
        "mag_id": "ids.mag",
        # Open Access details
        "oa_status": "open_access.oa_status",
        # Bibliographic details
        "volume": "biblio.volume",
        "issue": "biblio.issue",
        "first_page": "biblio.first_page",
        "last_page": "biblio.last_page",
        # Journal identifiers
        "issn": "primary_location.source.issn",
        "issn_l": "primary_location.source.issn_l",
        # Author affiliations
        "affiliations": "authorships.institutions.display_name",
        # Citation data
        "references_count": "referenced_works_count",
        "fwci": "fwci",
    },
)

__all__ = ["OpenAlexFieldMap", "field_map"]