# scholar_flux.api.normalization.crossref_field_map.py
"""The scholar_flux.api.normalization.crossref_field_map.py module defines the normalization mappings for Crossref."""
from scholar_flux.api.normalization.academic_field_map import AcademicFieldMap
from scholar_flux.utils.helpers import (
build_iso_date,
coerce_int,
as_tuple,
unlist_1d,
coerce_flattened_str,
infer_text_pattern_search,
try_compile,
)
from scholar_flux.utils.record_types import NormalizedRecordType
from typing import Optional
import re
# Lookup table: license-URL fragment -> open-access verdict.
#   True  : open access (BOAI-compliant license)
#   None  : debatable -- the license carries restrictions that conflict with BOAI's "any lawful purpose"
#   False : restricted -- subscription or publisher-controlled terms
# The verdict groups are unpacked top to bottom, so the resulting dict keeps the same insertion order as a flat
# literal would. NOTE(review): whether that order influences matching depends on how the consuming search helper
# iterates the mapping -- confirm before reordering entries.
LICENSE_PATTERNS: dict[str, Optional[bool]] = {
    # --- Open access (True) ---
    **dict.fromkeys(
        (
            "creativecommons.org/publicdomain/zero",  # CC0: public domain dedication, no restrictions
            "creativecommons.org/licenses/cc0",
            "creativecommons.org/licenses/by-sa/",  # CC BY-SA: ShareAlike (copyleft, still BOAI)
            "creativecommons.org/licenses/by/",  # CC BY: attribution only, the BOAI recommended license
        ),
        True,
    ),
    # --- Debatable (None), listed from most to least restrictive ---
    **dict.fromkeys(
        (
            "creativecommons.org/licenses/by-nc-nd/",  # NonCommercial + NoDerivatives
            "creativecommons.org/licenses/by-nc-sa/",  # NonCommercial + ShareAlike
            "creativecommons.org/licenses/by-nc/",  # NonCommercial
            "creativecommons.org/licenses/by-nd/",  # NoDerivatives
        ),
        None,
    ),
    # --- Restricted (False) ---
    **dict.fromkeys(
        (
            # Text-and-data-mining (TDM) licenses, which require an institutional subscription
            "tdm_license",  # Wiley: doi.wiley.com/10.1002/tdm_license_1.1
            "tdm/userlicense",  # Elsevier: elsevier.com/tdm/userlicense/1.0/
            "/tdm",  # Generic TDM path: springer.com/tdm
            "text-and-data-mining",  # Springer Nature verbose URL
            # Publisher terms pages: not licenses, they indicate restricted access
            "termsandconditions",  # Wiley: onlinelibrary.wiley.com/termsAndConditions
            "/core/terms",  # Cambridge: cambridge.org/core/terms
        ),
        False,
    ),
}

# Default (case-sensitive) pattern for update types that signal a retraction or withdrawal.
RETRACTION_PATTERN: re.Pattern = re.compile(r"retract|withdraw")
class CrossrefFieldMap(AcademicFieldMap):
    """Field map that layers Crossref-specific post-processing on top of `AcademicFieldMap`.

    After the base-class post-processing has run, `_post_process` reassigns the following fields of each
    normalized record:

        - authors
        - date_created and date_published (Crossref stores dates as nested `date-parts` lists)
        - year
        - open_access (resolved from license URLs, see `resolve_open_access`)
        - journal
        - title
        - abstract (requested with HTML stripping and whitespace trimming enabled)
        - is_retracted (see `check_retraction`)

    Note:
        The `extract_*` helpers called in `_post_process` are not defined on this class; they are inherited.
        Crossref records may contain nested lists and several alternative date fields, which is why these fields
        are recomputed rather than used as mapped.
    """

    def _post_process(self, record: NormalizedRecordType) -> NormalizedRecordType:
        """Finalizes a single normalized Crossref record.

        Args:
            record (NormalizedRecordType): The normalized Crossref record dictionary to post-process.

        Returns:
            NormalizedRecordType: The record returned by the parent `_post_process`, with the Crossref-specific
                fields reassigned.
        """
        processed = super()._post_process(record)

        # Each step receives the record as updated by the preceding steps, so the order below is kept stable.
        processed["authors"] = self.extract_authors(processed)
        processed["date_created"] = self.extract_date_parts(processed, field="date_created")
        processed["date_published"] = self.extract_date_parts(processed)
        processed["year"] = self.extract_year(processed)
        processed["open_access"] = self.resolve_open_access(processed)
        processed["journal"] = self.extract_journal(processed)
        processed["title"] = self.extract_title(processed)
        processed["abstract"] = self.extract_abstract(processed, strip_html=True, separator=" ", strip=True)
        processed["is_retracted"] = self.check_retraction(processed)
        return processed

    @classmethod
    def resolve_open_access(cls, record: NormalizedRecordType, field: str = "license") -> Optional[bool]:
        """Derives an open access verdict from the license URLs stored on a record.

        Every string entry found under `field` is looked up against `LICENSE_PATTERNS`; entries that are not
        strings are skipped. The per-URL verdicts are then combined:

            - True if at least one URL resolves to True.
            - False if every verdict collected is False.
            - None otherwise (no string URLs, unrecognized URLs, "debatable" licenses, or a mix of False and None).

        Args:
            record (NormalizedRecordType): Normalized Crossref record dictionary.
            field (str): The record field holding the license URL(s).

        Returns:
            Optional[bool]: True if open access, False if restricted, None if indeterminate.
        """
        verdicts: set[Optional[bool]] = set()
        for candidate in as_tuple(record.get(field)):
            if not isinstance(candidate, str):
                continue
            # NOTE(review): `regex=False` with `re.IGNORECASE` presumably requests literal, case-insensitive
            # matching -- confirm against `infer_text_pattern_search`.
            verdicts.add(
                infer_text_pattern_search(candidate, LICENSE_PATTERNS, default=None, regex=False, flags=re.IGNORECASE)
            )

        if True in verdicts:
            return True
        return False if verdicts == {False} else None

    @classmethod
    def check_retraction(
        cls,
        record: NormalizedRecordType,
        field: str = "updated_by_list",
        pattern: Optional[str | re.Pattern] = None,
    ) -> Optional[bool]:
        """Checks whether the record itself has been retracted, based on its `updated-by` entries.

        Each entry under `field` that is a dictionary with a string `type` is tested: the type is lowercased and
        searched with the retraction pattern. The first match ends the search.

        Args:
            record (NormalizedRecordType): Normalized Crossref record dictionary.
            field (str): The field holding the record's update entries.
            pattern (Optional[str | re.Pattern]): Optional pattern overriding the module default
                (`RETRACTION_PATTERN`). The default is used whenever `try_compile(pattern)` yields a falsy result.
                Because the update type is lowercased before matching, a custom pattern should be written in
                lowercase or be case-insensitive.

        Returns:
            Optional[bool]: True if a matching update entry is found, otherwise None. False is never returned,
                because the absence of an update entry does not prove the paper was not retracted.

        Note:
            Crossref links a retracted paper and its retraction notice in both directions:

                - `update-to` lives on the retraction NOTICE and points to the retracted paper.
                - `updated-by` may appear on the retracted PAPER and points to the notice.

            This method only inspects the second direction. When the status cannot be determined from the record,
            retraction can be verified by:

                1. Sending a separate Crossref search with the `filter='update-type:retraction'` parameter.
                2. Checking the https://gitlab.com/crossref/retraction-watch-data repo (updated daily).

            Source: https://www.crossref.org/documentation/retrieve-metadata/retraction-watch/ (2026)
        """
        matcher = try_compile(pattern) or RETRACTION_PATTERN

        for update in as_tuple(record.get(field)):
            if not isinstance(update, dict):
                continue
            update_type = update.get("type")
            if isinstance(update_type, str) and re.search(matcher, update_type.lower()):
                return True  # An update entry marks this paper as retracted.
        return None
# Module-level Crossref field map instance: pairs each normalized field name with the Crossref key(s) it is read
# from. NOTE(review): dotted strings look like nested-key paths and list values look like ordered fallbacks --
# confirm both against `AcademicFieldMap`.
field_map = CrossrefFieldMap(
    provider_name="crossref",
    # Identifiers
    doi="DOI",
    url="URL",
    record_id="DOI",  # The DOI doubles as the record identifier
    # Bibliographic
    title="title",
    abstract="abstract",
    authors="author",
    # Publication metadata
    journal="container-title",  # Array
    publisher="publisher",
    year=[
        "created.date-parts",
        "published.date-parts",
        "published-print.date-parts",
        "published-online.date-parts",
        "indexed.date-parts",
    ],  # Nested: [[year, month, day]]
    # NOTE(review): unlike `year` and `date_created`, this list omits "published-online.date-parts" -- confirm
    # that the omission is intentional.
    date_published=["published.date-parts", "published-print.date-parts", "indexed.date-parts"],
    date_created=[
        "created.date-parts",
        "published.date-parts",
        "published-print.date-parts",
        "published-online.date-parts",
        "indexed.date-parts",
    ],
    # Content
    keywords="subject",  # Same source key as `subjects`
    subjects="subject",  # Array of subject classifications
    full_text=None,
    # Metrics
    citation_count="is-referenced-by-count",
    # Access
    open_access=None,  # Not mapped directly: resolved in post-processing from the `license` URLs
    license="license.URL",  # License array
    is_retracted=None,  # Not mapped directly: set in post-processing by `check_retraction` (reads `updated_by_list`)
    # Metadata
    record_type="type",
    language="language",
    # Additional Crossref-specific fields carried alongside the core fields
    api_specific_fields={
        "author_list": "author",
        "institution": "institution.name",
        "license_list": "license",
        "update_to_list": "update-to",
        "updated_by_list": "updated-by",
        "issn": "ISSN",
        "isbn": "ISBN",
        "volume": "volume",
        "issue": "issue",
        "page": "page",
        "references_count": "reference-count",
        "funder": "funder.name",
    },
)

# Public names exported by this module.
__all__ = ["CrossrefFieldMap", "field_map"]