Source code for scholar_flux.api.normalization.crossref_field_map

# scholar_flux.api.normalization.crossref_field_map.py
"""The scholar_flux.api.normalization.crossref_field_map.py module defines the normalization mappings for Crossref."""
from scholar_flux.api.normalization.academic_field_map import AcademicFieldMap
from scholar_flux.utils.helpers import (
    build_iso_date,
    coerce_int,
    as_tuple,
    unlist_1d,
    coerce_flattened_str,
    infer_text_pattern_search,
    try_compile,
)
from scholar_flux.utils.record_types import NormalizedRecordType
from typing import Optional
import re

# Lookup table: lowercase license-URL fragment -> open-access verdict.
#   True  -> open access (BOAI-compliant license)
#   None  -> debatable (license restrictions conflict with BOAI "any lawful purpose")
#   False -> restricted (subscription / publisher-controlled terms)
# The three groups are unpacked in sequence, so insertion order is preserved.
# The trailing "/" on the Creative Commons fragments keeps one variant from
# matching inside another (e.g. "by/" is not a substring of "by-sa/").
LICENSE_PATTERNS: dict[str, Optional[bool]] = {
    # --- Open (True) ---
    **dict.fromkeys(
        (
            # CC0: public domain dedication, no restrictions
            "creativecommons.org/publicdomain/zero",
            "creativecommons.org/licenses/cc0",
            # CC-BY-SA: attribution + ShareAlike (copyleft, still BOAI)
            "creativecommons.org/licenses/by-sa/",
            # CC-BY: attribution only, the BOAI recommended license
            "creativecommons.org/licenses/by/",
        ),
        True,
    ),
    # --- Debatable (None) ---
    **dict.fromkeys(
        (
            "creativecommons.org/licenses/by-nc-nd/",  # NonCommercial + NoDerivatives
            "creativecommons.org/licenses/by-nc-sa/",  # NonCommercial + ShareAlike
            "creativecommons.org/licenses/by-nc/",  # NonCommercial
            "creativecommons.org/licenses/by-nd/",  # NoDerivatives
        ),
        None,
    ),
    # --- Restricted (False) ---
    **dict.fromkeys(
        (
            # Text-and-data-mining (TDM) licenses: require institutional subscription
            "tdm_license",  # Wiley: doi.wiley.com/10.1002/tdm_license_1.1
            "tdm/userlicense",  # Elsevier: elsevier.com/tdm/userlicense/1.0/
            "/tdm",  # Generic TDM path: springer.com/tdm
            "text-and-data-mining",  # Springer Nature verbose URL
            # Publisher terms pages: not licenses, indicate restricted access
            "termsandconditions",  # Wiley: onlinelibrary.wiley.com/termsAndConditions
            "/core/terms",  # Cambridge: cambridge.org/core/terms
        ),
        False,
    ),
}

# Default pattern for spotting retraction/withdrawal update types.
# Case-sensitive: callers are expected to lower-case the text before searching.
RETRACTION_PATTERN: re.Pattern = re.compile(r"retract|withdraw")


class CrossrefFieldMap(AcademicFieldMap):
    """Field map for Crossref records with Crossref-specific post-processing.

    After the base class has produced a normalized record, `_post_process` rewrites the
    following fields in place:

        - `authors`: formatted author names built from the raw author entries
        - `date_created` / `date_published`: date strings built from Crossref date parts
        - `year`: integer year taken from the first date part
        - `open_access`: status resolved from the record's license URLs
        - `journal`, `title`: delegated to the corresponding `extract_*` helpers
        - `abstract`: extracted with HTML stripping enabled
        - `is_retracted`: derived from the record's `updated-by` entries

    Note:
        Crossref records may contain nested lists and several alternative date fields.
        The field map configuration together with the helpers below smooth over those
        variations.
    """

    def _post_process(self, record: NormalizedRecordType) -> NormalizedRecordType:
        """Applies the Crossref-specific transformations to one normalized record.

        Args:
            record (NormalizedRecordType): The normalized Crossref record to finish processing.

        Returns:
            NormalizedRecordType: The record returned by the parent post-processing step, with
                the Crossref-specific fields overwritten.
        """
        record = super()._post_process(record)

        # People and dates
        record["authors"] = self.extract_authors(record)
        record["date_created"] = self.extract_date_parts(record, field="date_created")
        record["date_published"] = self.extract_date_parts(record)
        record["year"] = self.extract_year(record)

        # Access and venue
        record["open_access"] = self.resolve_open_access(record)
        record["journal"] = self.extract_journal(record)

        # Text content
        record["title"] = self.extract_title(record)
        record["abstract"] = self.extract_abstract(record, strip_html=True, separator=" ", strip=True)

        # Status flags
        record["is_retracted"] = self.check_retraction(record)
        return record

    @classmethod
    def extract_title(cls, record: NormalizedRecordType, field: str = "title") -> Optional[str]:
        """Returns the record title as a single string.

        Crossref may deliver the title as a (possibly nested) list; whatever is stored under
        `field` is handed to `coerce_flattened_str` to be collapsed into one string.

        Args:
            record (NormalizedRecordType): Normalized Crossref record dictionary.
            field (str): The field to read the title from.

        Returns:
            Optional[str]: The flattened title string as produced by `coerce_flattened_str`.
        """
        raw_title = record.get(field)
        return coerce_flattened_str(raw_title)

    @classmethod
    def extract_year(cls, record: NormalizedRecordType, field: str = "year") -> Optional[int]:
        """Returns the year found in the first position of a Crossref date-parts value.

        Args:
            record (NormalizedRecordType): Normalized Crossref record dictionary.
            field (str): The field holding the date parts to read the year from.

        Returns:
            Optional[int]: The first date part coerced with `coerce_int`, or None when the
                field yields no parts.
        """
        parts = as_tuple(unlist_1d(record.get(field)))
        if not parts:
            return None
        return coerce_int(parts[0])

    @classmethod
    def extract_date_parts(cls, record: NormalizedRecordType, field: str = "date_published") -> Optional[str]:
        """Builds a date string from a Crossref date-parts field.

        Args:
            record (NormalizedRecordType): Normalized Crossref record dictionary.
            field (str): The field holding the Crossref date parts (defaults to the publication date).

        Returns:
            Optional[str]: The value produced by `build_iso_date` (intended to be `YYYY-MM-DD`),
                or None when the field yields no parts.

        Note:
            Crossref nests dates as `[[year, month, day]]`. The value is un-nested one level
            and the individual parts are passed positionally to `build_iso_date`.
        """
        parts = as_tuple(unlist_1d(record.get(field)))
        if not parts:
            return None
        return build_iso_date(*parts)

    @classmethod
    def extract_authors(cls, record: NormalizedRecordType, field: str = "author_list") -> Optional[list[str]]:
        """Formats author names from the raw Crossref author entries.

        Args:
            record (NormalizedRecordType): Normalized Crossref record dictionary.
            field (str): The field holding the list of author dictionaries.

        Returns:
            Optional[list[str]]: Names in 'given family' form (or just 'family' when no given
                name is present), or None when no usable author was found.

        Note:
            Only dictionary entries with a non-empty `family` key are kept, so records whose
            contributors carry no family name (e.g. organizational authors) yield None.
            Check the 'publisher' or 'institution' fields for organizational attribution.
        """
        names: list[str] = []
        for entry in as_tuple(record.get(field)):
            # Skip anything that is not an author mapping with a family name.
            if not isinstance(entry, dict) or not entry.get("family"):
                continue
            if entry.get("given"):
                names.append(f"{entry['given']} {entry['family']}")
            else:
                names.append(f"{entry['family']}")
        return names or None

    @classmethod
    def resolve_open_access(cls, record: NormalizedRecordType, field: str = "license") -> Optional[bool]:
        """Derives the open access status from the record's license URLs.

        Each string URL is classified against `LICENSE_PATTERNS`; the individual verdicts are
        then combined: any open license wins, and the record is only reported as restricted
        when every classified URL is restricted.

        Args:
            record (NormalizedRecordType): Normalized Crossref record dictionary.
            field (str): The field holding the license URL(s).

        Returns:
            Optional[bool]: True if open access, False if restricted, None if indeterminate
                (no URLs, unknown licenses, or a mix without any open license).
        """
        verdicts = set()
        for url in as_tuple(record.get(field)):
            if not isinstance(url, str):
                continue
            verdicts.add(
                infer_text_pattern_search(url, LICENSE_PATTERNS, default=None, regex=False, flags=re.IGNORECASE)
            )

        if True in verdicts:
            return True
        if verdicts == {False}:
            return False
        return None

    @classmethod
    def check_retraction(
        cls,
        record: NormalizedRecordType,
        field: str = "updated_by_list",
        pattern: Optional[str | re.Pattern] = None,
    ) -> Optional[bool]:
        """Checks whether the record has been retracted according to its update entries.

        Args:
            record (NormalizedRecordType): Normalized Crossref record dictionary.
            field (str): The field holding the list of update entries to inspect.
            pattern (Optional[str | re.Pattern]): Optional pattern used instead of the module
                default `RETRACTION_PATTERN`. It is matched against the lower-cased update type.

        Returns:
            Optional[bool]: True if an update entry's type matches the retraction pattern,
                None if the status is unknown. False is never returned.

        Note:
            ┌─────────────────────┐       updated-by       ┌─────────────────────┐
            │   Retracted Paper   │ ◄───────────────────── │  Retraction Notice  │
            │ (original article)  │ ─────────────────────► │   (update record)   │
            └─────────────────────┘       update-to        └─────────────────────┘

            Crossref's `update-to` field is on the retraction NOTICE, pointing to the retracted
            paper. The retracted paper itself might instead contain an `updated-by` field
            indicating that the paper has been retracted.

            When retraction status can't be determined for certain due to a lack of
            information, retraction can be verified with the following steps:

                1. Sending a separate crossref search with the `filter='update-type:retraction'`
                   API-specific parameter
                2. Checking the https://gitlab.com/crossref/retraction-watch-data repo (updated daily)

            Source: https://www.crossref.org/documentation/retrieve-metadata/retraction-watch/ (2026)
        """
        # Fall back to the module default when no usable custom pattern is supplied.
        active_pattern = try_compile(pattern) or RETRACTION_PATTERN

        for entry in as_tuple(record.get(field)):
            if not isinstance(entry, dict):
                continue
            update_type = entry.get("type")
            if isinstance(update_type, str) and re.search(active_pattern, update_type.lower()):
                return True  # An update entry marks this paper as retracted

        # No matching entry: absence of evidence, so the status stays unknown.
        return None
# Module-level Crossref mapping: normalized field name -> Crossref response path(s).
# A list of paths gives several alternative source locations for the same field.
field_map = CrossrefFieldMap(
    provider_name="crossref",
    # --- Identifiers ---
    doi="DOI",
    url="URL",
    record_id="DOI",
    # --- Bibliographic ---
    title="title",
    abstract="abstract",
    authors="author",
    # --- Publication metadata ---
    journal="container-title",  # Crossref delivers this as an array
    publisher="publisher",
    # Date fields are nested as [[year, month, day]].
    # NOTE(review): `year` lists `created.date-parts` ahead of the published dates; confirm
    # that the creation date is the intended first choice for the year.
    year=[
        "created.date-parts",
        "published.date-parts",
        "published-print.date-parts",
        "published-online.date-parts",
        "indexed.date-parts",
    ],
    date_published=[
        "published.date-parts",
        "published-print.date-parts",
        "indexed.date-parts",
    ],
    date_created=[
        "created.date-parts",
        "published.date-parts",
        "published-print.date-parts",
        "published-online.date-parts",
        "indexed.date-parts",
    ],
    # --- Content ---
    keywords="subject",
    subjects="subject",  # Array of subject classifications
    full_text=None,
    # --- Metrics ---
    citation_count="is-referenced-by-count",
    # --- Access ---
    open_access=None,  # Resolved in post-processing from the `license` URLs
    license="license.URL",  # One URL per entry of the license array
    is_retracted=None,  # Resolved in post-processing from `updated_by_list`
    # --- Metadata ---
    record_type="type",
    language="language",
    # --- Crossref-only extras ---
    api_specific_fields={
        "author_list": "author",
        "institution": "institution.name",
        "license_list": "license",
        "update_to_list": "update-to",
        "updated_by_list": "updated-by",
        "issn": "ISSN",
        "isbn": "ISBN",
        "volume": "volume",
        "issue": "issue",
        "page": "page",
        "references_count": "reference-count",
        "funder": "funder.name",
    },
)

__all__ = ["CrossrefFieldMap", "field_map"]