# scholar_flux.api.normalization.core_field_map.py
"""The scholar_flux.api.normalization.core_field_map.py module defines the normalization mappings used for Core API."""
from scholar_flux.api.normalization.academic_field_map import AcademicFieldMap
from scholar_flux.utils.helpers import as_tuple, try_none
from scholar_flux.utils.record_types import NormalizedRecordType
from typing import Optional
class CoreFieldMap(AcademicFieldMap):
    """Field map for the CORE API that layers provider-specific post-processing on `AcademicFieldMap`.

    The CORE API serves open access scholarly content aggregated from thousands of repositories worldwide.
    `CoreFieldMap` normalizes CORE records and extracts record fields and cross-platform IDs; its post-processing
    step then finalizes each normalized record so that records retrieved from the CORE API share one structure.

    Post-Processed Fields:
        - Year extraction from various date formats
        - Journal list flattening (CORE can return multiple journal titles)
        - Record ID coercion to string format
        - Open access default (CORE sources are generally all open access)
        - Cross-reference identifier extraction (arXiv, PubMed, MAG IDs)
        - Multi-identifier normalization for entity resolution
    """

    def _post_process(self, record: NormalizedRecordType) -> NormalizedRecordType:
        """Finalizes a single normalized CORE record by applying CORE-specific field transformations.

        Args:
            record (NormalizedRecordType): The normalized CORE API record dictionary to further process.

        Returns:
            NormalizedRecordType: The normalized CORE record after every post-processing step has been applied.
        """
        normalized = super()._post_process(record)

        # Ordered (field, extractor) pairs: every extractor receives the record as processed so far and its
        # result overwrites the named field, so the sequence below is applied strictly top to bottom.
        pipeline = (
            # Year pulled out of date strings (e.g., "2025-12-01" -> 2025)
            ("year", self.extract_year),
            # Record ID coerced into a string
            ("record_id", self.extract_id),
            # Journal list flattened into a semicolon-delimited string
            ("journal", self.extract_journal),
            # Author fields kept as lists when available
            ("authors", self.extract_authors),
            # Cleaned arXiv cross-reference identifier
            ("arxiv_id", self.extract_arxiv_id),
            # Creation date of the article/record, when available
            ("date_created", lambda rec: self.extract_iso_date(rec, "date_created")),
            # Publication date of the article/record, when available
            ("date_published", lambda rec: self.extract_iso_date(rec, "date_published")),
            # PubMed record ID (pmid) resolution
            ("pmid", self.extract_pmid),
            # Microsoft Academic Graph identifier resolution
            ("mag_id", self.extract_mag_id),
            # CORE has no explicit open access field; its articles are generally open access
            ("open_access", lambda _rec: True),
            # OAI IDs attached to the current record
            ("oai_ids", self.extract_oai_ids),
        )
        for field_name, extract in pipeline:
            normalized[field_name] = extract(normalized)

        return normalized
# Module-level `CoreFieldMap` instance for the "core" provider. Each keyword names a normalized field and each
# string value names the CORE API response key it is read from; `None` means no source key is mapped.
# NOTE(review): dotted values such as "authors.name" look like nested paths — confirm how `AcademicFieldMap`
# resolves them.
field_map = CoreFieldMap(
    provider_name="core",
    # ==== Core Identifiers ====
    doi="doi",
    url="downloadUrl",  # Direct PDF/full text link
    record_id="id",
    # ==== Bibliographic ====
    title="title",
    abstract="abstract",
    authors="authors.name",
    # ==== Publication Metadata ====
    journal="journals.title",
    publisher="publisher",
    year="yearPublished",
    date_published="publishedDate",
    date_created="createdDate",
    # ==== Content ====
    # Keywords and subjects are both read from the same `fieldOfStudy` key
    keywords="fieldOfStudy",
    subjects="fieldOfStudy",
    full_text="fullText",
    # ==== Metrics ====
    citation_count="citationCount",
    # ==== Access ====
    open_access=None,  # No source key is mapped; Core sources are generally all open access
    license=None,  # Not provided by Core API
    # ==== Metadata ====
    record_type="documentType",
    language="language.name",
    # Default supplied for `open_access`, which has no mapped source key above
    # NOTE(review): presumably applied when a field has no extracted value — confirm in `AcademicFieldMap`
    default_field_values={"open_access": True},
    api_specific_fields={
        # Cross-reference identifiers for entity resolution
        "arxiv_id": "arxivId",
        "pmid": "pubmedId",
        "mag_id": "magId",
        # OAI identifiers for cross-repository deduplication
        "oai_ids": "oaiIds",
        # Reference/citation data for graph construction
        "references": "references",
    },
)

# Names exported by this module: the field map class and its ready-made CORE instance
__all__ = ["CoreFieldMap", "field_map"]