# scholar_flux.api.normalization.academic_field_map.py
"""The scholar_flux.api.normalization.academic_field_map implements the `AcademicFieldMap` for record normalization.
This implementation subclasses the `NormalizingFieldMap` class for use in academic record normalization by defining
additional combinations of fields that apply solely to academic APIs and databases.
Architecture Context:
This layer is the third step in a 3 part configuration system tailored to each individual provider.
1. Parameter Map (BaseAPIParameterMap) - Translates search parameters to provider-specific API parameters
2. Metadata Map (ResponseMetadataMap) - Extracts pagination metadata (total hits, records per page)
3. Field Map (AcademicFieldMap) - Normalizes provider-specific fields into a universal schema
All three layers compose via ProviderConfig for complete provider integration:
>>> from scholar_flux.api.providers import provider_registry
>>> config = provider_registry.get("plos")
>>> config.parameter_map # Request building
>>> config.metadata_map # Pagination intelligence
>>> config.field_map # Response normalization
Design Philosophy:
- **Minimal defaults**: Works out-of-box for common use cases
- **Provider-specific when needed**: Subclasses override `_post_process()` for domain logic
- **User-extensible**: Users can customize or replace field maps entirely
This is NOT a rigid framework—each provider handles genuinely different data structures.
The base class provides common helpers, not enforced patterns.
"""
from typing import Any, Optional, Sequence
import datetime
import re
from scholar_flux.api.normalization.normalizing_field_map import NormalizingFieldMap
from scholar_flux.api.validators import validate_url
from scholar_flux.utils.record_types import NormalizedRecordType
from scholar_flux.utils.helpers import (
unlist_1d,
get_nested_data,
try_none,
try_compile,
coerce_str,
coerce_bool,
extract_year,
parse_iso_timestamp,
build_iso_date,
coerce_flattened_str,
strip_html_tags,
as_tuple,
)
URL_PATTERN_SUFFIX = "(?=http)"
URL_PATTERN = try_compile(r"; *|, *|\| *", suffix=URL_PATTERN_SUFFIX)
class AcademicFieldMap(NormalizingFieldMap):
    """Extends the `NormalizingFieldMap` to customize field extraction and processing for academic record normalization.

    This class is used to normalize the names of academic data fields consistently across providers. By default, the
    AcademicFieldMap includes fields for several attributes of academic records including:

        1. Core identifiers (e.g. `doi`, `url`, `record_id`)
        2. Bibliographic metadata (`title`, `abstract`, `authors`)
        3. Publication metadata (`journal`, `publisher`, `year`, `date_published`, `date_created`)
        4. Content and classification (`keywords`, `subjects`, `full_text`)
        5. Metrics and impact (`citation_count`)
        6. Access and rights (`open_access`, `license`)
        7. Document metadata (`record_type`, `language`)
        8. All other fields that are relevant to only the current API (`api_specific_fields`)

    During normalization, the `AcademicFieldMap.fields` property returns all subclassed field mappings as a flattened
    dictionary (excluding private fields prefixed with underscores). Both simple and nested API-specific
    field names are matched and mapped to universal field names.

    Any changes to the instance configuration are automatically detected during normalization by comparing the
    `_cached_fields` to the updated `fields` property.

    Examples:
        >>> from scholar_flux.api.normalization import AcademicFieldMap
        >>> field_map = AcademicFieldMap(provider_name=None, title='article_title', record_id='ID')
        >>> expected_result = field_map.fields | {'provider_name': 'core', 'title': 'Decomposition of Political Tactics', 'record_id': 196}
        >>> result = field_map.apply(dict(provider_name='core', ID=196, article_title='Decomposition of Political Tactics'))
        >>> cached_fields = field_map._cached_fields
        >>> print(result == expected_result)
        >>> result2 = field_map.apply(dict(provider_name='core', ID=196, article_title='Decomposition of Political Tactics'))
        >>> assert cached_fields is field_map._cached_fields
        >>> assert result is not result2

    Note:
        To account for special cases, the `AcademicFieldMap` can be subclassed to perform two-step normalization to
        further process extracted elements.

        1. **Phase 1**:
           The `AcademicFieldMap` extracts nested fields for each record. This class traverses paths like
           'MedlineCitation.Article.AuthorList.Author' (PubMed) or `authorships.institutions.display_name` (OpenAlex)
           to map API-specific fields to universal parameter names

        2. **Phase 2 (Subclasses)**:
           Subclasses can reformat extracted data into finalized fields. For example, `PubMed` prepares the `authors`
           field by combining each author's 'ForeName' and 'LastName' into 'FirstName LastName'. PLOS creates the
           record URL for each article by combining the URL prefix for the website with the `DOI` of the current
           record. The `AcademicFieldMap` defines common (yet optional) class methods to aid in the extraction and
           processing of normalized fields.

    """

    # Core identifiers
    doi: Optional[str | list[str]] = None
    url: Optional[str | list[str]] = None
    record_id: Optional[str | list[str]] = None

    # Bibliographic metadata
    title: Optional[str | list[str]] = None
    abstract: Optional[str | list[str]] = None
    authors: Optional[str | list[str]] = None

    # Publication metadata
    journal: Optional[str | list[str]] = None
    publisher: Optional[str | list[str]] = None
    year: Optional[str | list[str]] = None
    date_published: Optional[str | list[str]] = None
    date_created: Optional[str | list[str]] = None

    # Content and classification
    keywords: Optional[str | list[str]] = None
    subjects: Optional[str | list[str]] = None
    full_text: Optional[str | list[str]] = None

    # Metrics and impact
    citation_count: Optional[str | list[str]] = None

    # Access and rights
    open_access: Optional[str | list[str]] = None
    license: Optional[str | list[str]] = None

    # Document metadata
    record_type: Optional[str | list[str]] = None
    language: Optional[str | list[str]] = None
    is_retracted: Optional[str | list[str]] = None

    @classmethod
    def extract_url(
        cls,
        record: NormalizedRecordType,
        *paths: list[str | int] | str,
        pattern_delimiter: Optional[str | re.Pattern] = URL_PATTERN,
        delimiter_prefix: Optional[str] = None,
        delimiter_suffix: Optional[str] = URL_PATTERN_SUFFIX,
    ) -> Optional[str]:
        """Helper function for extracting a single, primary URL from record based on the path taken to traverse the URL.

        Args:
            record (NormalizedRecordType): The record dictionary to extract the URL from.
            *paths:
                Arbitrary positional path arguments leading to a single URL or list of URLs. Each path can be a string
                or list of keys representing the path needed to find a URL in a nested record. Defaults to the tuple
                ('url', ) if not provided, defaulting to a basic `url` lookup.
            pattern_delimiter (str | Pattern):
                Regex pattern to split URL strings. Defaults to splitting on ';', ',', or '|' separators. A positive
                lookahead `(?=http)` is automatically appended to the delimiter to prevent splitting URLs mid-domain.
                Set to None to disable splitting. Note that if a re.Pattern object is provided, it will be used as is
                without transformation.
            delimiter_prefix (str):
                An optional string appended as a prefix to each element within a pattern. This prefix is `None` by
                default but can be used to identify URLs that directly follow a specific pattern.
            delimiter_suffix (str):
                An optional string appended as a suffix to each element within a pattern. This suffix is used to
                identify `http` schemes (typically associated with URLs) that may directly follow a string delimited
                by the suffix separator.

        Returns:
            Optional[str]: The first valid URL string found at any of the specified paths (stripped of surrounding
            whitespace), or None if no path yields a validated URL.

        Examples:
            >>> from scholar_flux.api.normalization import AcademicFieldMap
            >>> record = {"url": "http://example.com; http://backup.com"}
            >>> AcademicFieldMap.extract_url(record)
            # OUTPUT: 'http://example.com'
            >>> record = {"url": [{"value": "http://example.com"}]}
            >>> AcademicFieldMap.extract_url(record, ["url", 0, "value"], ["url", 0])
            # OUTPUT: 'http://example.com'
            >>> # Semicolon-delimited URLs (common in CrossRef, Springer)
            >>> record = {"url": "http://example.com; http://backup.com"}
            >>> AcademicFieldMap.extract_url(record)
            # OUTPUT: 'http://example.com'

        """
        paths = paths if paths else ("url",)
        # If URLs are delimited by the provided pattern, the pattern delimiter will be used.
        pattern = (
            try_compile(pattern_delimiter, prefix=delimiter_prefix, suffix=delimiter_suffix)
            if pattern_delimiter and isinstance(pattern_delimiter, (str, re.Pattern))
            else None
        )
        for path in paths:
            nested_element = unlist_1d(get_nested_data(record, path, verbose=False))
            url_list: Sequence = (
                re.split(pattern, nested_element)
                if isinstance(nested_element, str) and pattern
                else as_tuple(nested_element)  # nests strings, converts lists, replaces None with an empty tuple
            )
            # Retrieve the first valid URL from the sequence:
            url = next(
                (url for url in url_list if isinstance(url, str) and validate_url(url.strip(), verbose=False)), None
            )
            if url:
                return url.strip()
        return None

    @classmethod
    def extract_id(
        cls, record: NormalizedRecordType, field: str = "record_id", strip_prefix: Optional[str | re.Pattern] = None
    ) -> Optional[str]:
        """Extracts and coerces the ID from the current record into a string.

        Args:
            record (NormalizedRecordType): A normalized record dictionary before or after post-processing
            field (str): The IdType to filter for (e.g., 'arxiv_id', 'pmid', 'mag_id')
            strip_prefix (Optional[str | re.Pattern]):
                An optional prefix to remove from the identifier (e.g., 'PMC' for PMC IDs)

        Returns:
            The record ID as a string, or None if not available

        Examples:
            >>> from scholar_flux.api.normalization import AcademicFieldMap
            >>> AcademicFieldMap.extract_id({"record_id": 12345678})
            '12345678'
            >>> AcademicFieldMap.extract_id({"record_id": "mock_id:123"})
            'mock_id:123'

        """
        record_id = record.get(field)
        parsed_record_id = try_none(coerce_str(record_id) if isinstance(record_id, (str, int)) else None)
        # The prefix is anchored at the start of the string so only a leading prefix is removed.
        prefix_pattern = try_compile(strip_prefix, prefix="^")
        return re.sub(prefix_pattern, "", parsed_record_id) if prefix_pattern and parsed_record_id else parsed_record_id

    @classmethod
    def extract_url_id(
        cls, record: NormalizedRecordType, field: str = "record_id", strip_prefix: Optional[str | re.Pattern] = None
    ) -> Optional[str]:
        """Extracts an ID from the URL of the current record, removing a URL prefix when specified.

        Args:
            record (NormalizedRecordType): The record containing the URL ID to extract
            field (str): The field containing the ID (with or without a prefix)
            strip_prefix (Optional[str | re.Pattern]): The prefix or regex pattern to optionally remove from the URL

        Returns:
            Optional[str]:
                The ID after field extraction and the removal of the string prefix, if provided. If the record field
                doesn't exist, None is returned instead.

        """
        url = record.get(field)
        if not (url and isinstance(url, str)):
            return None
        url = url.strip()
        prefix_pattern = try_compile(strip_prefix, prefix="^")
        # Only strip the prefix when the field actually holds a valid URL; otherwise return the value unmodified.
        url = re.sub(prefix_pattern, "", url) if prefix_pattern and validate_url(url, verbose=False) else url
        return url or None

    @classmethod
    def extract_year(cls, record: NormalizedRecordType, field: str = "year") -> Optional[int]:
        """Extracts the year of publication or record creation from the manuscript/record.

        Args:
            record (NormalizedRecordType): Normalized record dictionary
            field (str): The field to extract the year of publication or record creation from.

        Returns:
            Optional[int]: The year as an integer, or None if not extractable.

        Examples:
            >>> AcademicFieldMap.extract_year({"year": "2024-06-15"})
            2024
            >>> AcademicFieldMap.extract_year({"year": 2024})
            2024
            >>> AcademicFieldMap.extract_year({"year": None})
            None

        """
        year = record.get(field)
        # internally extracts a 4 digit year between 1900 and 2100
        return extract_year(year) if year else None

    @classmethod
    def reconstruct_url(cls, id: Optional[str], url: str) -> Optional[str]:
        """Reconstruct an article URL from the ID of the article.

        Useful for PLOS and PubMed URL reconstruction.

        Args:
            id (Optional[str]): The ID/DOI identifier (e.g., "10.1371/journal.pone.0123456")
            url (str): The URL prefix (e.g. f"https://journals.plos.org/plosone/article?id=")

        Returns:
            Optional[str]: Reconstructed URL if both parts are provided and the result validates, None otherwise.

        Examples:
            >>> from scholar_flux.api.normalization import AcademicFieldMap
            >>> AcademicFieldMap.reconstruct_url(
            ...     id="10.1371/journal.pone.0123456",
            ...     url=f"https://journals.plos.org/plosone/article?id="
            ... )
            # OUTPUT: 'https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0123456'
            >>> AcademicFieldMap.reconstruct_url(None, '')
            # OUTPUT: None
            >>> AcademicFieldMap.reconstruct_url("", None)
            # OUTPUT: None

        """
        # Coerce both parts to stripped strings; non-string inputs are treated as missing.
        id = id.strip() if isinstance(id, str) else ""
        url = url.strip() if isinstance(url, str) else ""
        if id and url:
            # A "{}" placeholder in the prefix marks where the ID goes; otherwise the ID is appended.
            url = url.format(id) if "{}" in url else f"{url}{id}"
            return url if url and validate_url(url, verbose=False) else None
        return None

    @classmethod
    def normalize_doi(cls, record: NormalizedRecordType, field: str = "doi") -> Optional[str]:
        """Normalizes DOI by stripping the https://doi.org/ prefix.

        Args:
            record (NormalizedRecordType): Normalized record containing the 'doi' field to extract.
            field (str): The field to extract the record doi from.

        Returns:
            Optional[str]: Cleaned DOI string without URL prefix, or None if invalid

        Examples:
            >>> from scholar_flux.api.normalization import AcademicFieldMap
            >>> record = {'doi': 'https://doi.org/10.1234/example'}
            >>> AcademicFieldMap.normalize_doi(record)
            # OUTPUT: '10.1234/example'

        """
        doi = record.get(field)
        if isinstance(doi, str):
            cleaned = doi.replace("https://doi.org/", "").strip()
            return cleaned if cleaned else None
        return None

    @classmethod
    def extract_iso_date(cls, record: NormalizedRecordType, field: str = "date_created") -> Optional[str]:
        """Extracts and formats a date from a dictionary or strings in ISO format (%Y-%m-%d).

        Args:
            record (NormalizedRecordType):
                A normalized record having a `date_created` or similar field to extract an ISO date from.
                Note: Users can extract an ISO date from a nested dictionary field if it's formatted with `year`,
                `month`, or `day`. If the nested field is a string, this method will instead attempt to parse it as an
                ISO timestamp otherwise. If the field is a datetime or date, the object will be parsed directly.
            field (str):
                The name of the field containing date information to extract.

        Returns:
            (Optional[str]): An ISO formatted date string (YYYY-MM-DD, YYYY-MM, or YYYY) or None.

        Examples:
            PubDate with Year='2025', Month='Dec', Day='19':
                Returns '2025-12-19'
            PubDate with Year='2025', Month='12':
                Returns '2025-12'
            PLOS with timestamp: '2016-12-08T00:00:00Z'
                Returns '2016-12-08'

        """
        date_data = record.get(field)
        # Accepts both string and datetime/date objects for maximum compatibility with provider and internal data.
        if isinstance(date_data, (str, datetime.datetime, datetime.date)):
            parsed_date = parse_iso_timestamp(date_data) if isinstance(date_data, str) else date_data
            return parsed_date.strftime("%Y-%m-%d") if parsed_date else None
        if isinstance(date_data, dict):
            # Providers vary in key casing (PubMed uses 'Year'/'Month'/'Day'; others use lowercase).
            return build_iso_date(
                year=date_data.get("Year") or date_data.get("year"),
                month=date_data.get("Month") or date_data.get("month"),
                day=date_data.get("Day") or date_data.get("day"),
            )
        return None

    @classmethod
    def extract_authors(cls, record: NormalizedRecordType, field: str = "authors") -> Optional[list[str]]:
        """Filters and cleans the author names list.

        Args:
            record (NormalizedRecordType): Normalized record with an 'authors' field.
            field (str): The field to extract the list of authors from.

        Returns:
            Optional[list[str]]: A list of non-empty author names, or None if empty

        Examples:
            >>> from scholar_flux.api.normalization import AcademicFieldMap
            >>> record = {'authors': 'Evan Doodle; Jane Doe'}
            >>> AcademicFieldMap.extract_authors(record)
            # OUTPUT: ['Evan Doodle', 'Jane Doe']
            >>> record = {'authors': ['Evan Doodle', 'Jane Noah']}
            >>> AcademicFieldMap.extract_authors(record)
            # OUTPUT: ['Evan Doodle', 'Jane Noah']
            >>> record = {'authors': [102, 203]}
            >>> AcademicFieldMap.extract_authors(record)  # returns None, elements aren't strings
            # OUTPUT: None

        """
        authors = record.get(field) or ""
        # Semicolon-delimited strings are split into individual names; lists are used as-is.
        authors = authors.split(";") if isinstance(authors, str) else authors
        authors = [author.strip() for author in as_tuple(authors) if try_none(author) and isinstance(author, str)]
        return authors if authors else None

    @classmethod
    def extract_abstract(
        cls, record: NormalizedRecordType, strip_html: bool = False, field: str = "abstract", **kwargs: Any
    ) -> Optional[str]:
        """Extracts and prepares the abstract for the current record.

        Args:
            record (NormalizedRecordType): Normalized record with 'abstract' already available as a field.
            strip_html (bool): Indicates whether html tags should be checked and removed if found in the abstract.
            field (str): The field where an abstract or text field can be found.
            **kwargs: Additional arguments to pass to `get_text` when stripping html elements.

        Returns:
            Optional[str]: An abstract string or None if not found or not a string/list of strings

        Example:
            >>> from scholar_flux.api.normalization import AcademicFieldMap
            >>> record = {'abstract': 'Analysis of the Placebo effect on...'}
            >>> AcademicFieldMap.extract_abstract(record)
            # OUTPUT: 'Analysis of the Placebo effect on...'
            >>> record = {'abstract': '<h1>Game theory in the technological industry.</h1><p>This study explores...</p>'}
            >>> AcademicFieldMap.extract_abstract(record, strip_html=True, separator=' ')
            # OUTPUT: 'Game theory in the technological industry. This study explores...'

        """
        abstract = record.get(field)
        # A list of paragraph strings is joined into a single space-separated abstract.
        if isinstance(abstract, (tuple, list)) and all(isinstance(paragraph, str) for paragraph in abstract):
            abstract = " ".join(abstract) or None
        if isinstance(abstract, str):
            return strip_html_tags(abstract, verbose=False, **kwargs) if strip_html else abstract
        return None

    @classmethod
    def extract_journal(cls, record: NormalizedRecordType, field: str = "journal") -> Optional[str]:
        """Extracts the publication journal title or a list of journal titles as a semicolon delimited string.

        Args:
            record (NormalizedRecordType): The normalized record dictionary to extract the journal field from.
            field (str): The field to extract the journal from.

        Returns:
            Optional[str]: The journal or journals of publication, joined by a semicolon, or None if not available.

        Examples:
            >>> AcademicFieldMap.extract_journal({"journal": "Nature"})
            # OUTPUT: 'Nature'
            >>> AcademicFieldMap.extract_journal({"journal": ["Nature", "Science"]})
            # OUTPUT: 'Nature; Science'
            >>> AcademicFieldMap.extract_journal({"journal": ["Nature", "", None, "Science"]})
            # OUTPUT: 'Nature; Science'

        """
        journal = record.get(field)
        return coerce_flattened_str(journal) or None

    @classmethod
    def extract_boolean_field(
        cls,
        record: NormalizedRecordType,
        field: str,
        true_values: tuple[str, ...] = ("true", "1", "yes"),
        false_values: tuple[str, ...] = ("false", "0", "no"),
        default: Optional[bool] = None,
    ) -> Optional[bool]:
        """Extracts a field's value from the current record as a boolean ('true'->True/'false'->False/'None'->None).

        Args:
            record (NormalizedRecordType): The normalized record dictionary to extract a boolean value from.
            field (str): The record field to be used for the extraction of a boolean value.
            true_values (tuple[str, ...]): Values to be mapped to True when found.
            false_values (tuple[str, ...]): Values to be mapped to False when found.
            default (Optional[bool]): The value to default to when neither True values or False values can be found.

        Returns:
            Optional[bool]:
                - True if the field appears in the list of the tuple of `true_values`
                - False if the field appears in the list of the tuple of `false_values`
                - The `default` if the observed value cannot be found within `true_values` and `false_values`

        """
        value = record.get(field)  # Maps `None` objects/strings and empty fields to None
        boolean_value = coerce_bool(value, true_values, false_values)
        return boolean_value if boolean_value is not None else default
__all__ = ["AcademicFieldMap"]