# scholar_flux.api.normalization.arxiv_field_map.py
"""The scholar_flux.api.normalization.arxiv_field_map.py module defines the normalization mappings used for Arxiv."""
from scholar_flux.api.normalization.academic_field_map import AcademicFieldMap
from scholar_flux.utils.helpers import as_tuple, try_none, infer_text_pattern_search
from scholar_flux.api.validators import validate_url
from scholar_flux.utils.record_types import NormalizedRecordType
from typing import Optional
import re
# Case-insensitive heuristics mapping free-text hints to a record type label.
# Entries keep their declaration order (dicts preserve insertion order), running from the
# most specific hint (book chapters) down to the most generic one (the literal word "journal").
RECORD_TYPE_PATTERNS: dict[re.Pattern[str], str] = {
    re.compile(expression, re.IGNORECASE): label
    for expression, label in (
        # Book chapters: editor markers or an explicit "chapter" mention
        (r"\bIn .+\(Eds?\.\)|\(eds?\.\)|chapter:|book chapter", "book-chapter"),
        # Conference papers, workshops, and proceedings
        (
            r"proceedings? of\b|workshop|conference paper|accepted at.* (conference|[a-z]+ *[1-2][0-9]{3})",
            "proceedings-article",
        ),
        # arXiv-specific statuses: accepted but not yet published
        (r"accepted (by|for publication|in)\b|to appear in\b", "accepted"),
        # arXiv-specific statuses: submitted or under review
        (r"submitted to\b|(under|to be) review|has been submitted", "submitted"),
        # Journal publications, including the literal word "journal"
        (r"published in\b|in press at|open access published|\bjournal\b", "journal-article"),
    )
}
class ArXivFieldMap(AcademicFieldMap):
    """ArXiv specific field mapping with custom transformations.

    The `ArXivFieldMap` implements a minimal set of methods for record normalization to finalize the structure of each
    extracted and normalized record during postprocessing.

    Post-Processed Fields:
        - arXiv, DOI, and record identifiers
        - Year extraction from ISO date strings
        - ISO date extraction for `date_published` and `date_created`
        - PDF URL extraction from link arrays
        - Open access status (always true for arXiv)
        - Author extraction
        - Record type inference from the `journal` or `comment` fields
        - Subject and category normalization

    Note:
        arXiv records use unique identifier and link structures.
        The field map configuration and post-processing logic handle these for consistent output.

    """

    def _post_process(self, record: NormalizedRecordType) -> NormalizedRecordType:
        """Applies arXiv-specific transformations to an individual arXiv record.

        The parent class post-processing runs first; each arXiv-specific field is then overwritten in place, one
        after another, using the `extract_*` helpers available on the instance.

        Args:
            record (NormalizedRecordType): The normalized arXiv API record dictionary to further process.

        Returns:
            NormalizedRecordType: The normalized, post-processed arXiv record with transformations applied.

        """
        record = super()._post_process(record)

        # Extracts year from date strings (e.g., "2026-03-01" -> "2026")
        record["year"] = self.extract_year(record)

        # Each date field is re-derived from its own normalized value via `extract_iso_date`
        record["date_published"] = self.extract_iso_date(record, "date_published")
        record["date_created"] = self.extract_iso_date(record, "date_created")

        # No source field is consulted here: the flag is set to True unconditionally for arXiv records
        record["open_access"] = True

        # Removes the abstract-page URL prefix from the record ID.
        # NOTE(review): `strip_prefix` looks like a regular expression (`s?`) - confirm against `extract_url_id`
        record["record_id"] = self.extract_url_id(record, field="record_id", strip_prefix="https?://arxiv.org/abs/")

        record["pdf_url"] = self.extract_pdf_url(record)
        record["authors"] = self.extract_authors(record)

        # Uses heuristics and record types to determine record type by `journal` or `comment`
        record["record_type"] = self.extract_record_type(record)
        return record
# Module-level field map for the arXiv provider: each keyword names a normalized field and its value is the
# path of the source field in an arXiv record (`None` means no source field is mapped).
field_map = ArXivFieldMap(
    provider_name="arxiv",
    doi="arxiv:doi",
    url="id",  # The arXiv abstract-page URL (arxiv.org/abs/...)
    record_id="id",  # Same source field as `url`; the URL prefix is stripped in `_post_process`
    # Bibliographic
    title="title",
    abstract="summary",
    authors="author.name",
    # Publication metadata
    journal="arxiv:journal_ref",
    publisher=None,  # No publisher field is mapped (arXiv itself)
    year="published",  # Year is extracted from the ISO datetime in `_post_process`
    date_published="published",
    date_created="published",  # Intentionally reads the same source field as `date_published`
    # Content
    keywords=None,  # arXiv doesn't provide keywords
    subjects="arxiv:primary_category.@term",
    full_text=None,
    # Metrics
    citation_count=None,
    # Access
    open_access=None,  # Always true, no explicit field; set in `_post_process`
    license="rights",
    # Metadata
    record_type=None,  # Inferred from journal/comment fields
    language=None,
    # Extra fields beyond the shared normalized schema
    api_specific_fields={
        "primary_category": "arxiv:primary_category.@term",
        "categories": "category.@term",  # All categories (list)
        "comment": "arxiv:comment",
        "updated_date": "updated",
        "url_list": "link",
        "pdf_url": "link.@href",  # Filtered to extract the PDF link
    },
)
# Public API of this module
__all__ = ["ArXivFieldMap", "field_map"]