# scholar_flux.api.normalization.pubmed_field_map.py
"""The scholar_flux.api.normalization.pubmed_field_map.py module defines the normalization mappings used for PubMed."""
from scholar_flux.api.normalization.academic_field_map import AcademicFieldMap
from scholar_flux.utils.record_types import NormalizedRecordType
from scholar_flux.utils.helpers import (
get_nested_data,
unlist_1d,
as_tuple,
)
from typing import Optional
class PubMedFieldMap(AcademicFieldMap):
    """PubMed-specific field mapping with custom transformations.

    The `PubMedFieldMap` builds on the original `AcademicFieldMap` to add a minimal PubMed-specific array of
    post-processing steps that produce final, consistent, normalized record structures across several record types.

    Post-Processed Fields:
        - `PMCID`, `PMID`, and `PII` identifiers
        - The date and year of creation or publication
        - The base URL for the record
        - Authors (after formatting nested authorship fields)
        - The DOI for the record
        - Open Access status
        - Abstract retrieval

    Note:
        PubMed's XML structure varies between Articles and BookDocuments, which is handled via
        fallback paths in the field_map configuration (e.g., multiple paths for 'year' and 'abstract').

        Article identifiers (DOI, PMCID, PII) are extracted from the ArticleIdList by filtering on
        the '@IdType' attribute, with additional fallback logic for DOI via ELocationID.

        Open access status is determined by the presence of a PMCID, indicating the article is
        available in PubMed Central.

    """

    def _post_process(self, record: NormalizedRecordType) -> NormalizedRecordType:
        """Applies PubMed-specific transformations to an individual normalized record.

        The generic post-processing of the parent class runs first; each PubMed-specific field is then
        overwritten in place with the value returned by its dedicated extraction helper.

        Args:
            record (NormalizedRecordType): The normalized PubMed record dictionary to further process.

        Returns:
            NormalizedRecordType: The normalized, post-processed PubMed record with transformations applied

        """
        # NOTE(review): the `extract_*` helpers used below are not defined in this class body;
        # presumably they are inherited from `AcademicFieldMap` -- confirm.
        record = super()._post_process(record)

        # Extract year from date strings if present
        record["year"] = self.extract_year(record)

        # Reconstruct URL from PMID if PMID exists
        record["url"] = self.reconstruct_pubmed_url(record)

        record["doi"] = self.extract_doi(record)

        # Extract and override PMCID (filters by IdType='pmc'); overrides generic extraction, can be None
        record["pmcid"] = self.extract_pmcid(record)

        # Extract and override PII (filters by IdType='pii'); overrides generic extraction, can be None
        record["pii"] = self.extract_pii(record)

        # Extract formatted author names (overrides basic LastName extraction)
        record["authors"] = self.extract_authors(record)

        # Extract formatted publication date
        record["date_published"] = self.extract_iso_date(record, "date_published")

        # Extract formatted creation date
        record["date_created"] = self.extract_date_created(record)

        # Extract open access status based on PMC ID presence.
        # Kept after the 'pmcid' override above so the helper sees the filtered PMCID value.
        record["open_access"] = self.extract_open_access(record)

        record["abstract"] = self.extract_abstract(record)
        return record

    @classmethod
    def _extract_article_id(
        cls,
        record: NormalizedRecordType,
        id_type: str,
        strip_prefix: str = "",
    ) -> Optional[str]:
        """Extracts the article identifier from `ArticleIdList` by the `IdType` attribute.

        This is a helper for extracting DOI, PMCID, PII, and other identifiers
        from PubMed's ArticleIdList structure, which contains multiple identifier types
        distinguished by the '@IdType' attribute. Only the first entry whose '@IdType'
        equals `id_type` is considered.

        Args:
            record (NormalizedRecordType): Normalized record with 'article_id_list' already extracted
            id_type (str): The IdType to filter for (e.g., 'doi', 'pmc', 'pii')
            strip_prefix (str):
                An optional prefix to remove from the identifier (e.g., 'PMC' for PMC IDs). Only a leading
                occurrence is removed; the same text appearing later in the identifier is left untouched.

        Returns:
            Optional[str]: Extracted identifier string, or None if not found or results in empty string

        Examples:
            >>> from scholar_flux.api.normalization.pubmed_field_map import PubMedFieldMap
            >>> record = {'article_id_list': {'ArticleId': [{'@IdType': 'pmc', '#text': 'PMC123456'}]}}
            >>> PubMedFieldMap._extract_article_id(record, 'pmc', strip_prefix='PMC')
            # OUTPUT: '123456'
            >>> PubMedFieldMap._extract_article_id(record, 'doi')
            # OUTPUT: None

        """
        article_ids = as_tuple(get_nested_data(record, "article_id_list.ArticleId", verbose=False))

        for article_id in article_ids:
            # Skip anything that is not a mapping tagged with the requested identifier type
            if not isinstance(article_id, dict) or article_id.get("@IdType") != id_type:
                continue

            value = unlist_1d(article_id.get("#text"))

            # Slice off a leading prefix only: `str.replace` would also delete later occurrences
            # of the prefix text and silently corrupt the identifier.
            if isinstance(value, str) and strip_prefix and value.startswith(strip_prefix):
                value = value[len(strip_prefix) :]

            # Return None if the value is missing or stripping results in an empty string
            return value if value else None

        return None

    @classmethod
    def reconstruct_pubmed_url(cls, record: NormalizedRecordType) -> Optional[str]:
        """Reconstructs the PubMed article URL from the PMID.

        Args:
            record (NormalizedRecordType): The record containing the 'pmid' field

        Returns:
            Optional[str]:
                The URL built by `reconstruct_url` from the record's 'pmid' value, or None when that
                call returns a falsy result (e.g., an empty string).

        Examples:
            >>> from scholar_flux.api.normalization.pubmed_field_map import PubMedFieldMap
            >>> PubMedFieldMap.reconstruct_pubmed_url({"pmid": "41418093"})
            # OUTPUT: 'https://pubmed.ncbi.nlm.nih.gov/41418093/'
            >>> PubMedFieldMap.reconstruct_pubmed_url({"pmid": None})
            # OUTPUT: None

        """
        url = cls.reconstruct_url(record.get("pmid"), url="https://pubmed.ncbi.nlm.nih.gov/{}/")
        # Collapse any falsy result to None so callers get a single "no URL" sentinel
        return url or None
# Module-level PubMed field map instance. Each keyword maps a normalized field name to a dotted path
# (or a list of paths) into a raw PubMed record. A list provides fallback paths for records whose
# structure differs (e.g., Articles vs. BookDocuments) -- presumably tried in the order listed; confirm
# against `AcademicFieldMap`. `None` means no path is configured for the field: it is either unavailable
# or filled in later by `PubMedFieldMap._post_process`.
field_map = PubMedFieldMap(
provider_name="pubmed",
# Identifiers
doi=None, # Extracted from the ArticleIdList or ELocationID where @IdType = doi
url=None, # Reconstructed from the PMID in `_post_process`
record_id=["MedlineCitation.PMID.#text", "BookDocument.PMID.#text"],
# Bibliographic
title=["MedlineCitation.Article.ArticleTitle.#text", "MedlineCitation.Article.ArticleTitle"],
abstract=[
"MedlineCitation.Article.Abstract.AbstractText.#text",
"MedlineCitation.Article.Abstract.AbstractText",
"BookDocument.Abstract.AbstractText.#text",
],
# Raw (nested) author entries; `_post_process` overrides this field with formatted author names
authors=[
"MedlineCitation.Article.AuthorList.Author",
"BookDocument.AuthorList.Author",
], # Auto-traverses Author list
# Publication metadata
journal="MedlineCitation.Article.Journal.Title",
publisher=None, # Not typically in PubMed
year=[
"MedlineCitation.Article.Journal.JournalIssue.PubDate.Year",
"BookDocument.Book.PubDate.Year",
"MedlineCitation.Article.ArticleDate.Year",
"MedlineCitation.DateCompleted.Year",
"MedlineCitation.DateRevised.Year",
],
date_published=[
"MedlineCitation.Article.Journal.JournalIssue.PubDate",
"BookDocument.Book.PubDate",
],
# NOTE: 'date_created' is sourced from DateCompleted; no other path is configured for it here
date_created="MedlineCitation.DateCompleted",
# Content
keywords=[
"MedlineCitation.KeywordList.Keyword.#text",
"MedlineCitation.KeywordList.Keyword",
], # Auto-traverses Keyword list
subjects="MedlineCitation.MeshHeadingList.MeshHeading.DescriptorName.#text", # MeSH descriptor names
full_text=None,
# Metrics
citation_count=None,
# Access
open_access=None, # Extracted via PMCID presence in _post_process
# The license value is taken from the abstract's copyright statement
license="MedlineCitation.Article.Abstract.CopyrightInformation",
# Metadata
record_type="MedlineCitation.Article.PublicationTypeList.PublicationType.#text",
language="MedlineCitation.Article.Language",
# API-specific fields
api_specific_fields={
"article_date": "MedlineCitation.Article.ArticleDate",
# Read by `PubMedFieldMap._extract_article_id` through the 'article_id_list.ArticleId' path
"article_id_list": "PubmedData.ArticleIdList",
"elocation_id": "MedlineCitation.Article.ELocationID",
# 'pmid' is the value `reconstruct_pubmed_url` uses to build the record URL
"pmid": ["MedlineCitation.PMID.#text", "BookDocument.PMID.#text"],
# 'pmcid' and 'pii' share the same unfiltered ArticleId path on purpose: both values are
# overridden in `_post_process` with the result of filtering by '@IdType'
"pmcid": "PubmedData.ArticleIdList.ArticleId.#text",
"pii": "PubmedData.ArticleIdList.ArticleId.#text",
# MeSH terms with qualifiers
"mesh_terms": "MedlineCitation.MeshHeadingList.MeshHeading.DescriptorName.#text",
"mesh_qualifiers": "MedlineCitation.MeshHeadingList.MeshHeading.QualifierName.#text",
"mesh_ui": "MedlineCitation.MeshHeadingList.MeshHeading.DescriptorName.@UI",
# Journal details
"issn": "MedlineCitation.Article.Journal.ISSN.#text",
"iso_abbreviation": "MedlineCitation.Article.Journal.ISOAbbreviation",
"volume": "MedlineCitation.Article.Journal.JournalIssue.Volume",
"issue": "MedlineCitation.Article.Journal.JournalIssue.Issue",
"pages": "MedlineCitation.Article.Pagination.MedlinePgn",
"start_page": "MedlineCitation.Article.Pagination.StartPage",
"end_page": "MedlineCitation.Article.Pagination.EndPage",
},
)
# Public names exported by this module
__all__ = ["PubMedFieldMap", "field_map"]