Source code for scholar_flux.api.normalization.pubmed_field_map

# scholar_flux.api.normalization.pubmed_field_map.py
"""The scholar_flux.api.normalization.pubmed_field_map.py module defines the normalization mappings used for PubMed."""
from scholar_flux.api.normalization.academic_field_map import AcademicFieldMap
from scholar_flux.utils.record_types import NormalizedRecordType
from scholar_flux.utils.helpers import (
    get_nested_data,
    unlist_1d,
    as_tuple,
)
from typing import Optional



[docs]
class PubMedFieldMap(AcademicFieldMap):
    """PubMed specific field mapping with custom transformations.

    The `PubMedFieldMap` builds on the original `AcademicFieldMap` to add a minimal PubMed-specific array of
    post-processing steps that produces final, consistent, normalized record structures across several record types.

    Post-Processed Fields:
        - `PMCID`, `PMID`, and `PII` identifiers
        - The date and year of creation or publication
        - The base URL for the record
        - Authors (After formatting nested authorship fields)
        - The DOI for the record
        - Open Access status
        - Abstract Retrieval

    Note:
        PubMed's XML structure varies between Articles and BookDocuments, which is handled via
        fallback paths in the field_map configuration (e.g., multiple paths for 'year' and 'abstract').

        Article identifiers (DOI, PMCID, PII) are extracted from the ArticleIdList by filtering on
        the '@IdType' attribute, with additional fallback logic for DOI via ELocationID.

        Open access status is determined by the presence of a PMCID, indicating the article is
        available in PubMed Central.

    """

    def _post_process(self, record: NormalizedRecordType) -> NormalizedRecordType:
        """Runs the PubMed-specific post-processing steps on a single normalized record.

        The parent class's post-processing is applied first. Each PubMed extractor is then called in a fixed
        order and its result is written back to the record under the corresponding field name.

        Args:
            record (NormalizedRecordType): The normalized PubMed record dictionary to further process.

        Returns:
            NormalizedRecordType: The normalized, post-processed PubMed record with transformations applied

        """
        processed = super()._post_process(record)

        # Ordered (field, extractor) pairs. Each extractor receives the record as already updated by the
        # steps before it, so the sequence must not be rearranged.
        steps = (
            ("year", self.extract_year),  # year taken from date strings if present
            ("url", self.reconstruct_pubmed_url),  # rebuilt from the PMID when one exists
            ("doi", self.extract_doi),
            ("pmcid", self.extract_pmcid),  # overrides generic extraction, can be None
            ("pii", self.extract_pii),  # overrides generic extraction, can be None
            ("authors", self.extract_authors),  # formatted names override the basic extraction
            ("date_published", lambda rec: self.extract_iso_date(rec, "date_published")),
            ("date_created", self.extract_date_created),
            ("open_access", self.extract_open_access),  # derived from PMC ID presence
            ("abstract", self.extract_abstract),
        )
        for field_name, extractor in steps:
            processed[field_name] = extractor(processed)

        return processed

    @classmethod
    def _extract_article_id(
        cls,
        record: NormalizedRecordType,
        id_type: str,
        strip_prefix: str = "",
    ) -> Optional[str]:
        """Extracts the article identifier from `ArticleIdList` by the `IdType` attribute.

        This is a helper for extracting DOI, PMCID, PII, and other identifiers
        from PubMed's ArticleIdList structure, which contains multiple identifier types
        distinguished by the '@IdType' attribute.

        The first entry whose '@IdType' equals `id_type` decides the result: its '#text' value is returned
        (after optional prefix removal), or None when that value is empty. Later entries of the same type
        are not inspected.

        Args:
            record (NormalizedRecordType): Normalized record with 'article_id_list' already extracted
            id_type (str): The IdType to filter for (e.g., 'doi', 'pmc', 'pii')
            strip_prefix (str): An optional leading prefix to remove from the identifier
                (e.g., 'PMC' for PMC IDs). Only a prefix at the start of the value is removed.

        Returns:
            Optional[str]: Extracted identifier string, or None if not found or results in empty string

        Examples:
            >>> from scholar_flux.api.normalization.pubmed_field_map import PubMedFieldMap
            >>> record = {'article_id_list': {'ArticleId': [{'@IdType': 'pmc', '#text': 'PMC123456'}]}}
            >>> PubMedFieldMap._extract_article_id(record, 'pmc', strip_prefix='PMC')
            # OUTPUT: '123456'
            >>> PubMedFieldMap._extract_article_id(record, 'doi')
            # OUTPUT: None

        """
        article_ids = as_tuple(get_nested_data(record, "article_id_list.ArticleId", verbose=False))

        for article_id in article_ids:
            # Skip anything that is not a mapping of the requested identifier type
            if not isinstance(article_id, dict) or article_id.get("@IdType") != id_type:
                continue

            value = unlist_1d(article_id.get("#text"))
            if isinstance(value, str) and strip_prefix:
                # Remove only a leading prefix; `str.replace` would also delete any later
                # occurrence of the same substring inside the identifier.
                value = value.removeprefix(strip_prefix)
            # Return None if the value is missing or stripping results in an empty string
            return value if value else None

        return None


[docs]
    @classmethod
    def extract_doi(cls, record: NormalizedRecordType) -> Optional[str]:
        """Looks up the record's DOI, preferring the ArticleIdList over the ELocationID.

        Lookup order:
        1. The ArticleIdList entry with IdType='doi' (primary)
        2. The first ELocationID entry with EIdType='doi' (fallback)

        Args:
            record (NormalizedRecordType): Normalized record with 'article_id_list' and 'elocation_id' already extracted

        Returns:
            Optional[str]: DOI string or None if not found

        """
        # Primary source: the 'doi' entry of the article ID list
        primary_doi = cls._extract_article_id(record, "doi")
        if primary_doi:
            return primary_doi

        # Fallback source: the first 'elocation_id' entry flagged as a DOI
        candidates = as_tuple(get_nested_data(record, "elocation_id", verbose=False))
        for candidate in candidates:
            if not isinstance(candidate, dict):
                continue
            if candidate.get("@EIdType") == "doi":
                return unlist_1d(candidate.get("#text"))

        return None



[docs]
    @classmethod
    def extract_pmcid(cls, record: NormalizedRecordType) -> Optional[str]:
        """Looks up the PubMed Central identifier (PMCID) of the normalized record.

        The lookup is delegated to `_extract_article_id` with IdType 'pmc', asking it to strip the 'PMC'
        prefix so that only the remaining part of the identifier is returned. A value that is empty after
        stripping is reported as None.

        Args:
            record (NormalizedRecordType): Normalized record with 'article_id_list' already extracted

        Returns:
            Optional[str]: PMC ID without 'PMC' prefix, or None if not found or invalid

        Examples:
            >>> from scholar_flux.api.normalization.pubmed_field_map import PubMedFieldMap
            >>> record = {'article_id_list': {'ArticleId': [{'@IdType': 'pmc', '#text': 'PMC123456'}]}}
            >>> PubMedFieldMap.extract_pmcid(record)
            # OUTPUT: '123456'

        """
        pmcid = cls._extract_article_id(record, "pmc", strip_prefix="PMC")
        return pmcid



[docs]
    @classmethod
    def extract_pii(cls, record: NormalizedRecordType) -> Optional[str]:
        """Extracts the Publisher Item Identifier (PII) from the ArticleIdList.

        The lookup is delegated to `_extract_article_id` with IdType 'pii' and no prefix stripping.

        Args:
            record (NormalizedRecordType): Normalized record with 'article_id_list' already extracted

        Returns:
            Optional[str]: PII string or None if not found

        """
        # Dispatch through `cls` (not the hard-coded class) so subclass overrides of the helper are honored,
        # matching how `extract_pmcid` and `extract_doi` call it.
        return cls._extract_article_id(record, "pii")



[docs]
    @classmethod
    def extract_open_access(cls, record: NormalizedRecordType) -> Optional[bool]:
        """Determines if an article is open access based on PMC ID presence.

        The presence of a PMCID indicates the article is available in PubMed Central, which means it is accessible as
        open access. This is a reliable indicator for PubMed records, though it may not capture all open access
        articles (e.g., those available only on publisher websites).

        Args:
            record (NormalizedRecordType): Normalized record with 'article_id_list' already extracted

        Returns:
            Optional[bool]: True if a PMCID is present (open access), False if article IDs exist but none is a
            PMCID, and None if the record has no 'article_id_list.ArticleId' data at all (indeterminate).

        Examples:
            >>> from scholar_flux.api.normalization.pubmed_field_map import PubMedFieldMap
            >>> record = {'article_id_list': {'ArticleId': [{'@IdType': 'pmc', '#text': 'PMC123456'}]}}
            >>> PubMedFieldMap.extract_open_access(record)
            # OUTPUT: True
            >>> record = {'article_id_list': {'ArticleId': [{'@IdType': 'doi', '#text': '10.1234/example'}]}}
            >>> PubMedFieldMap.extract_open_access(record)
            # OUTPUT: False

        """
        # Without any article IDs there is nothing to base a decision on
        if get_nested_data(record, "article_id_list.ArticleId", verbose=False) is None:
            return None
        # Article IDs exist: open access exactly when one of them yields a PMCID.
        # Dispatch through `cls` so subclass overrides of `extract_pmcid` are honored.
        return bool(cls.extract_pmcid(record))



[docs]
    @classmethod
    def extract_authors(cls, record: NormalizedRecordType, field: str = "authors") -> Optional[list[str]]:
        """Extract formatted author names combining ForeName and LastName.

        Entries that are not dictionaries, or that have no (non-empty) 'LastName', are skipped. The given
        name is taken from 'ForeName' and falls back to 'Initials' when 'ForeName' is missing or empty.

        Args:
            record (NormalizedRecordType): Record dictionary holding the nested author entries under `field`
            field (str): The location to extract the nested list of authors from.

        Returns:
            Optional[list[str]]: List of author names in 'ForeName LastName' format, or None if no authors

        Notes:
            For an author with LastName='Smith', ForeName='John':
            Returns ['John Smith']

            For an author with LastName='Smith', Initials='J' and no usable ForeName:
            Returns ['J Smith']

            For an author with only LastName='Smith':
            Returns ['Smith']

        """
        formatted_authors = []
        for author in as_tuple(record.get(field)):
            if not isinstance(author, dict):
                continue
            last_name = author.get("LastName")
            if not last_name:
                continue
            # Use `or` rather than a `.get` default so that a present-but-empty ForeName
            # (e.g. None or '') still falls back to the Initials.
            given = author.get("ForeName") or author.get("Initials")
            formatted_authors.append(f"{given} {last_name}" if given else f"{last_name}")

        return formatted_authors or None



[docs]
    @classmethod
    def extract_date_created(cls, record: NormalizedRecordType) -> Optional[str]:
        """Resolves the record's creation date, falling back to the article date.

        Args:
            record (NormalizedRecordType): Record dictionary holding the 'date_created' and 'article_date' fields

        Returns:
            Optional[str]: ISO formatted date string (YYYY-MM-DD) or None

        Notes:
            The 'date_created' field is formatted via `extract_iso_date` first. Only when that yields no
            usable value is the 'article_date' field formatted and returned instead.

        """
        # `or` keeps the fallback lazy: 'article_date' is only processed when 'date_created' gives nothing
        return cls.extract_iso_date(record, "date_created") or cls.extract_iso_date(record, "article_date")



[docs]
    @classmethod
    def reconstruct_pubmed_url(cls, record: NormalizedRecordType) -> Optional[str]:
        """Builds the PubMed article URL for a record from its PMID.

        The record's 'pmid' value is handed to `reconstruct_url` together with the PubMed URL template.
        Any falsy result is normalized to None.

        Args:
            record (NormalizedRecordType): The record containing the 'pmid' field

        Returns:
            A Reconstructed URL if PMID is valid, None otherwise.

        Examples:
            >>> from scholar_flux.api.normalization.pubmed_field_map import PubMedFieldMap
            >>> PubMedFieldMap.reconstruct_pubmed_url({"pmid": "41418093"})
            # OUTPUT: 'https://pubmed.ncbi.nlm.nih.gov/41418093/'
            >>> PubMedFieldMap.reconstruct_pubmed_url({"pmid": None})
            # OUTPUT: None

        """
        pmid = record.get("pmid")
        reconstructed = cls.reconstruct_url(pmid, url="https://pubmed.ncbi.nlm.nih.gov/{}/")
        if not reconstructed:
            return None
        return reconstructed




# Module-level PubMed field map instance. Each keyword maps a normalized field name to a dotted path
# (or a list of fallback paths, e.g. Article vs. BookDocument locations) in the PubMed record. Fields set
# to None have no direct path; several of them are filled in by `PubMedFieldMap._post_process` instead.
field_map = PubMedFieldMap(
    provider_name="pubmed",
    # Identifiers
    doi=None,  # Filled in post-processing from the ArticleIdList (@IdType = doi) or ELocationID (@EIdType = doi)
    url=None,  # Reconstructed from the PMID in post-processing
    record_id=["MedlineCitation.PMID.#text", "BookDocument.PMID.#text"],
    # Bibliographic
    title=["MedlineCitation.Article.ArticleTitle.#text", "MedlineCitation.Article.ArticleTitle"],
    abstract=[
        "MedlineCitation.Article.Abstract.AbstractText.#text",
        "MedlineCitation.Article.Abstract.AbstractText",
        "BookDocument.Abstract.AbstractText.#text",
    ],
    # Intermediate value: raw author entries, reformatted into name strings by `extract_authors`
    authors=[
        "MedlineCitation.Article.AuthorList.Author",
        "BookDocument.AuthorList.Author",
    ],  # Auto-traverses Author list
    # Publication metadata
    journal="MedlineCitation.Article.Journal.Title",
    publisher=None,  # Not typically in PubMed
    year=[
        "MedlineCitation.Article.Journal.JournalIssue.PubDate.Year",
        "BookDocument.Book.PubDate.Year",
        "MedlineCitation.Article.ArticleDate.Year",
        "MedlineCitation.DateCompleted.Year",
        "MedlineCitation.DateRevised.Year",
    ],
    date_published=[
        "MedlineCitation.Article.Journal.JournalIssue.PubDate",
        "BookDocument.Book.PubDate",
    ],
    date_created="MedlineCitation.DateCompleted",  # `extract_date_created` falls back to 'article_date'
    # Content
    keywords=[
        "MedlineCitation.KeywordList.Keyword.#text",
        "MedlineCitation.KeywordList.Keyword",
    ],  # Auto-traverses Keyword list
    subjects="MedlineCitation.MeshHeadingList.MeshHeading.DescriptorName.#text",  # MeSH descriptor names
    full_text=None,
    # Metrics
    citation_count=None,
    # Access
    open_access=None,  # Extracted via PMCID presence in _post_process
    license="MedlineCitation.Article.Abstract.CopyrightInformation",
    # Metadata
    record_type="MedlineCitation.Article.PublicationTypeList.PublicationType.#text",
    language="MedlineCitation.Article.Language",
    # API-specific fields
    api_specific_fields={
        # Raw structures read by the post-processing extractors
        "article_date": "MedlineCitation.Article.ArticleDate",
        "article_id_list": "PubmedData.ArticleIdList",
        "elocation_id": "MedlineCitation.Article.ELocationID",
        "pmid": ["MedlineCitation.PMID.#text", "BookDocument.PMID.#text"],
        # Generic (unfiltered) ArticleId paths; both values are overridden in _post_process,
        # which filters the ArticleIdList by @IdType ('pmc' / 'pii')
        "pmcid": "PubmedData.ArticleIdList.ArticleId.#text",
        "pii": "PubmedData.ArticleIdList.ArticleId.#text",
        # MeSH terms with qualifiers
        "mesh_terms": "MedlineCitation.MeshHeadingList.MeshHeading.DescriptorName.#text",
        "mesh_qualifiers": "MedlineCitation.MeshHeadingList.MeshHeading.QualifierName.#text",
        "mesh_ui": "MedlineCitation.MeshHeadingList.MeshHeading.DescriptorName.@UI",
        # Journal details
        "issn": "MedlineCitation.Article.Journal.ISSN.#text",
        "iso_abbreviation": "MedlineCitation.Article.Journal.ISOAbbreviation",
        "volume": "MedlineCitation.Article.Journal.JournalIssue.Volume",
        "issue": "MedlineCitation.Article.Journal.JournalIssue.Issue",
        "pages": "MedlineCitation.Article.Pagination.MedlinePgn",
        "start_page": "MedlineCitation.Article.Pagination.StartPage",
        "end_page": "MedlineCitation.Article.Pagination.EndPage",
    },
)

# Public names exported by this module
__all__ = ["PubMedFieldMap", "field_map"]