Source code for scholar_flux.api.normalization.arxiv_field_map

# scholar_flux.api.normalization.arxiv_field_map.py
"""The scholar_flux.api.normalization.arxiv_field_map.py module defines the normalization mappings used for Arxiv."""

from scholar_flux.api.normalization.academic_field_map import AcademicFieldMap
from scholar_flux.utils.helpers import as_tuple, try_none, infer_text_pattern_search
from scholar_flux.api.validators import validate_url
from scholar_flux.utils.record_types import NormalizedRecordType
from typing import Optional
import re

# Case-insensitive patterns mapped to the record type they indicate. Insertion order is kept as listed
# (book chapters first, as the most specific patterns); presumably the first matching pattern wins when the
# mapping is handed to `infer_text_pattern_search` -- confirm against that helper.
RECORD_TYPE_PATTERNS: dict[re.Pattern[str], str] = {
    re.compile(expression, re.IGNORECASE): label
    for expression, label in (
        # Book chapters: editor markers and explicit chapter wording
        (r"\bIn .+\(Eds?\.\)|\(eds?\.\)|chapter:|book chapter", "book-chapter"),
        # Conference papers, workshops, and proceedings
        (
            r"proceedings? of\b|workshop|conference paper|accepted at.* (conference|[a-z]+ *[1-2][0-9]{3})",
            "proceedings-article",
        ),
        # arXiv-specific statuses: accepted or forthcoming work
        (r"accepted (by|for publication|in)\b|to appear in\b", "accepted"),
        # arXiv-specific statuses: submitted or under review
        (r"submitted to\b|(under|to be) review|has been submitted", "submitted"),
        # Journal wording, including the literal word "journal"
        (r"published in\b|in press at|open access published|\bjournal\b", "journal-article"),
    )
}



[docs]
class ArXivFieldMap(AcademicFieldMap):
    """ArXiv specific field mapping with custom transformations.

    The `ArXivFieldMap` implements a minimal set of methods for record normalization to finalize the structure of each
    extracted and normalized record during postprocessing.

    Post-Processed Fields:
        - arXiv, DOI, and record identifiers
        - Year extraction from ISO date strings
        - PDF URL extraction from link arrays
        - Open access status (always true for arXiv)
        - Subject and category normalization

    Note:
        arXiv records use unique identifier and link structures.
        The field map configuration and post-processing logic handle these for consistent output.

    """

    def _post_process(self, record: NormalizedRecordType) -> NormalizedRecordType:
        """Applies arXiv-specific transformations to an individual arXiv record.

        The parent class post-processing runs first. The year, dates, open access flag, record ID, PDF URL, authors,
        and record type are then overwritten in place on the same record.

        Args:
            record (NormalizedRecordType): The normalized arXiv API record dictionary to further process.

        Returns:
            NormalizedRecordType: The normalized, post-processed arXiv record with transformations applied.

        """
        record = super()._post_process(record)

        # Extracts year from date strings (e.g., "2026-03-01" -> "2026")
        record["year"] = self.extract_year(record)

        # Both date_published and date_created extract from the same field
        record["date_published"] = self.extract_iso_date(record, "date_published")
        record["date_created"] = self.extract_iso_date(record, "date_created")

        # No open access value is read from the record: the flag is set to True unconditionally for arXiv
        record["open_access"] = True

        # Removes the abstract-page URL prefix so that only the identifier portion remains
        record["record_id"] = self.extract_url_id(record, field="record_id", strip_prefix="https?://arxiv.org/abs/")

        # Replaces any previously mapped value with the single validated PDF link (or None)
        record["pdf_url"] = self.extract_pdf_url(record)

        record["authors"] = self.extract_authors(record)

        # Uses heuristics and record types to determine record type by `journal` or `comment`
        record["record_type"] = self.extract_record_type(record)
        return record

    @classmethod
    def extract_pdf_url(cls, record: NormalizedRecordType, field: str = "url_list") -> Optional[str]:
        """Extracts a valid PDF URL from the array of URLs corresponding to the record.

        Each entry is expected to be a dictionary carrying `@type` and `@href` keys. Entries that are not
        dictionaries, are not typed `application/pdf`, or fail URL validation are skipped.

        Args:
            record (NormalizedRecordType): A normalized arXiv record dictionary to extract a PDF URL from.
            field (str): The field to extract the list of URLs from.

        Returns:
            Optional[str]: The first validated PDF URL if available, otherwise None.

        """
        for link in as_tuple(record.get(field)):
            if not isinstance(link, dict) or link.get("@type") != "application/pdf":
                continue
            # Look the href up once so that the validated value is the same value that is returned
            href = link.get("@href")
            if validate_url(href or "", verbose=False):
                return href
        return None

    @classmethod
    def extract_record_type(
        cls, record: NormalizedRecordType, journal_field: str = "journal", comment_field: str = "comment"
    ) -> str:
        """Infers a record type from the journal or comment field for an arXiv record using pattern matching.

        The record type is inferred using predefined patterns to determine whether a record is a journal article, a book
        chapter, preprint, etc.

        The possible types are defined within `scholar_flux.api.normalization.arxiv_field_map` as a
        `RECORD_TYPE_PATTERNS` dictionary that maps patterns to record types.

        Args:
            record (NormalizedRecordType): a normalized arXiv record to infer the record type with.
            journal_field (str): The journal field used to infer record type with.
            comment_field (str): The comment field used to infer record type with if a journal field does not exist.

        Returns:
            str: When `journal_field` is available, the record type associated with the matched pattern is returned.
            If no patterns match, then `journal-article` is returned instead.
            When `journal_field` is not available, a record type is extracted by pattern matching against the
            `comment_field`. If no match is found, `preprint` is returned.

        """
        journal = record.get(journal_field)
        if try_none(journal):
            return infer_text_pattern_search(journal, RECORD_TYPE_PATTERNS, default="journal-article")

        # `record.get(comment_field, "")` only falls back when the key is absent: a key that is present with a
        # null value would hand None to the pattern search, so empty values are coerced to "" explicitly
        comment = record.get(comment_field) or ""
        return infer_text_pattern_search(comment, RECORD_TYPE_PATTERNS, default="preprint")




# Module-level arXiv field map: each keyword names a normalized field and its value names the arXiv response
# field to read it from. Fields set to None have no source field configured here.
# NOTE(review): dotted values such as "author.name" look like nested paths into the parsed record -- confirm
# against `AcademicFieldMap`.
field_map = ArXivFieldMap(
    provider_name="arxiv",
    doi="arxiv:doi",
    url="id",  # The arxiv.org/abs/...
    record_id="id",  # Same source as `url`; `_post_process` strips the arxiv.org/abs/ prefix
    # Bibliographic
    title="title",
    abstract="summary",
    authors="author.name",
    # Publication metadata
    journal="arxiv:journal_ref",
    publisher=None,  # arXiv itself
    year="published",  # Extract year from ISO datetime
    date_published="published",
    date_created="published",  # Shares its source field with `date_published`
    # Content
    keywords=None,  # arXiv doesn't provide keywords
    subjects="arxiv:primary_category.@term",
    full_text=None,
    # Metrics
    citation_count=None,
    # Access
    open_access=None,  # Always true, no explicit field (`_post_process` sets it to True)
    license="rights",
    # Metadata
    record_type=None,  # Inferred from journal/comment fields
    language=None,
    api_specific_fields={
        "primary_category": "arxiv:primary_category.@term",
        "categories": "category.@term",  # All categories (list)
        "comment": "arxiv:comment",  # Read by `extract_record_type` when no journal is available
        "updated_date": "updated",
        "url_list": "link",  # Default field scanned by `extract_pdf_url`
        "pdf_url": "link.@href",  # Filtered to extract the PDF link (overwritten in `_post_process`)
    },
)

# Public names exported by this module
__all__ = ["ArXivFieldMap", "field_map"]