Files
2025-12-09 12:13:01 +01:00

420 lines
14 KiB
Python

"""
Unit handling and normalization for financial facts.
This module provides comprehensive unit normalization and conversion capabilities
to address unit inconsistencies across different companies' SEC filings.
Key features:
- Currency unit normalization (USD, EUR, GBP, etc.)
- Share-based unit standardization
- Scale-aware unit matching
- Unit compatibility checking
- Error reporting with unit mismatch details
Usage:
from edgar.entity.unit_handling import UnitNormalizer, UnitResult
# Normalize a unit
normalized = UnitNormalizer.normalize_unit("US DOLLAR") # Returns "USD"
# Check unit compatibility
compatible = UnitNormalizer.are_compatible("USD", "DOLLARS") # Returns True
# Get unit with error details
result = UnitNormalizer.get_normalized_value(fact, target_unit="USD")
"""
import logging
from dataclasses import dataclass, field
from enum import Enum
from typing import Dict, List, Optional

from edgar.entity.models import FinancialFact
# Module-level logger, named after this module via ``__name__``.
logger = logging.getLogger(__name__)
class UnitType(Enum):
    """Broad category that a financial unit is classified into.

    Each member's value is the lowercase name of the category.
    """

    # Monetary units; UnitNormalizer.get_unit_type also reports per-share
    # amounts under this category.
    CURRENCY = "currency"
    # Share counts and share-like units.
    SHARES = "shares"
    # Dimensionless quantities (pure numbers, basis points).
    RATIO = "ratio"
    # Operational counts such as customers, stores or segments.
    BUSINESS = "business"
    # Durations (years, months, days).
    TIME = "time"
    # Surface measures (square feet, square meters).
    AREA = "area"
    # Fallback for any unit that matches none of the categories above.
    OTHER = "other"
@dataclass
class UnitResult:
    """Outcome of normalizing a fact's unit, including failure details.

    Attributes:
        value: The resulting numeric value, or None when normalization failed.
        normalized_unit: Canonical unit name, or None when none was determined.
        original_unit: The unit string as it appeared on the fact.
        success: True when a usable value was produced.
        error_reason: Human-readable failure description (None on success).
        scale_applied: Scale factor that was multiplied into ``value``, if any.
        unit_type: Category of the unit, when it was determined.
        suggestions: Hints for resolving a mismatch; always a list after
            construction, never None.
    """

    value: Optional[float]
    normalized_unit: Optional[str]
    original_unit: str
    success: bool
    error_reason: Optional[str] = None
    scale_applied: Optional[int] = None
    unit_type: Optional[UnitType] = None
    # A default_factory gives every instance its own list and keeps the
    # declared type honest (the previous ``= None`` default contradicted it).
    suggestions: List[str] = field(default_factory=list)

    def __post_init__(self):
        # Callers that still pass ``suggestions=None`` explicitly get an
        # empty list, exactly as before.
        if self.suggestions is None:
            self.suggestions = []
class UnitNormalizer:
    """Normalize raw unit strings and compare the units of financial facts.

    The class only holds lookup tables and class methods; it is never
    instantiated by the code in this module. Every ``*_MAPPINGS`` table maps a
    canonical unit name to the raw spellings accepted for it. Lookups are
    case-insensitive because the reverse index is keyed by the upper-cased
    spelling.
    """

    # Currency unit mappings
    # NOTE: the '¥' sign is listed under both JPY and CNY. The reverse index is
    # filled in table order and later entries overwrite earlier ones, so a bare
    # '¥' normalizes to 'CNY'.
    CURRENCY_MAPPINGS = {
        'USD': ['USD', 'US DOLLAR', 'DOLLARS', 'usd', 'US$', 'DOLLAR'],
        # '\u20ac' is the euro sign; it is written as an escape so that it
        # survives tooling that cannot represent it (it had been lost and
        # replaced by an empty string).
        'EUR': ['EUR', 'EURO', 'EUROS', 'eur', '\u20ac', 'EUROPEAN UNION EURO'],
        'GBP': ['GBP', 'POUND', 'POUNDS', 'gbp', '£', 'BRITISH POUND', 'POUND STERLING'],
        'JPY': ['JPY', 'YEN', 'yen', 'jpy', '¥', 'JAPANESE YEN'],
        'CAD': ['CAD', 'CANADIAN DOLLAR', 'CANADIAN DOLLARS', 'cad'],
        'CHF': ['CHF', 'SWISS FRANC', 'SWISS FRANCS', 'chf'],
        'AUD': ['AUD', 'AUSTRALIAN DOLLAR', 'AUSTRALIAN DOLLARS', 'aud'],
        'CNY': ['CNY', 'YUAN', 'CHINESE YUAN', 'cny', '¥'],
    }

    # Share unit mappings
    SHARE_MAPPINGS = {
        'shares': ['shares', 'share', 'SHARES', 'SHARE', 'STOCK', 'EQUITY'],
        'shares_unit': ['shares_unit', 'share_unit', 'SHARES_UNIT'],
        'partnership_unit': ['USD/PartnershipUnit', 'PartnershipUnit', 'partnership_unit']
    }

    # Ratio/dimensionless unit mappings
    RATIO_MAPPINGS = {
        'pure': ['pure', 'number', 'ratio', 'percent', '%', 'PURE', 'NUMBER'],
        'basis_points': ['bp', 'bps', 'basis_points', 'BASIS_POINTS']
    }

    # Per-share combinations
    PER_SHARE_MAPPINGS = {
        'USD_per_share': ['USD/shares', 'USD per share', 'USD/share', 'usd/shares'],
        'USD_per_share_unit': ['USD/shares_unit', 'USD per share unit', 'USD/share_unit']
    }

    # Business/operational unit mappings
    BUSINESS_MAPPINGS = {
        'customer': ['Customer', 'customer', 'CUSTOMER'],
        'store': ['Store', 'store', 'STORE'],
        'entity': ['Entity', 'entity', 'ENTITY'],
        'segment': ['Segment', 'segment', 'SEGMENT', 'reportable_segment'],
        'instrument': ['instrument', 'INSTRUMENT', 'financial_instrument'],
        'contract': ['USD/Contract', 'contract', 'CONTRACT'],
        'investment': ['USD/Investment', 'investment', 'INVESTMENT']
    }

    # Time-based unit mappings
    TIME_MAPPINGS = {
        'years': ['Year', 'years', 'YEAR', 'YEARS'],
        'months': ['Month', 'months', 'MONTH', 'MONTHS'],
        'days': ['Day', 'days', 'DAY', 'DAYS']
    }

    # Area unit mappings
    AREA_MAPPINGS = {
        'sqft': ['sqft', 'square_feet', 'SQFT', 'sq_ft'],
        'sqm': ['sqm', 'square_meters', 'SQMETER', 'sq_m']
    }

    # Comprehensive mapping combining all categories
    ALL_MAPPINGS = {
        **CURRENCY_MAPPINGS,
        **SHARE_MAPPINGS,
        **RATIO_MAPPINGS,
        **PER_SHARE_MAPPINGS,
        **BUSINESS_MAPPINGS,
        **TIME_MAPPINGS,
        **AREA_MAPPINGS
    }

    # Reverse mapping (upper-cased spelling -> canonical unit); built lazily on
    # first use and then cached on the class.
    _REVERSE_MAPPING: Optional[Dict[str, str]] = None

    @classmethod
    def _build_reverse_mapping(cls) -> Dict[str, str]:
        """Return the spelling-to-canonical-unit index, building it once.

        Returns:
            Dict keyed by the upper-cased variant spelling, valued by the
            canonical unit name. When two canonical units list the same
            spelling, the one that comes later in ``ALL_MAPPINGS`` wins.
        """
        if cls._REVERSE_MAPPING is None:
            cls._REVERSE_MAPPING = {
                variant.upper(): normalized_unit
                for normalized_unit, variants in cls.ALL_MAPPINGS.items()
                for variant in variants
            }
        return cls._REVERSE_MAPPING

    @classmethod
    def normalize_unit(cls, unit: str) -> str:
        """
        Normalize a unit string to its canonical form.

        The lookup ignores case and surrounding whitespace. A unit that is not
        recognized is returned exactly as it was passed in.

        Args:
            unit: Raw unit string from SEC filing

        Returns:
            Normalized unit string, or "" when ``unit`` is empty or None

        Example:
            >>> UnitNormalizer.normalize_unit("US DOLLAR")
            'USD'
            >>> UnitNormalizer.normalize_unit("shares_unit")
            'shares_unit'
        """
        if not unit:
            return ""
        lookup_key = unit.strip().upper()
        normalized = cls._build_reverse_mapping().get(lookup_key)
        return normalized if normalized else unit

    @classmethod
    def get_unit_type(cls, unit: str) -> UnitType:
        """
        Determine the type of a unit.

        Args:
            unit: Unit string (normalized or raw)

        Returns:
            UnitType enum value; UnitType.OTHER when the unit is not in any
            mapping table
        """
        normalized = cls.normalize_unit(unit)
        if normalized in cls.CURRENCY_MAPPINGS:
            return UnitType.CURRENCY
        if normalized in cls.PER_SHARE_MAPPINGS:
            # Per-share units are treated as currency-derived (amount per share)
            return UnitType.CURRENCY
        if normalized in cls.SHARE_MAPPINGS:
            return UnitType.SHARES
        if normalized in cls.RATIO_MAPPINGS:
            return UnitType.RATIO
        if normalized in cls.BUSINESS_MAPPINGS:
            return UnitType.BUSINESS
        if normalized in cls.TIME_MAPPINGS:
            return UnitType.TIME
        if normalized in cls.AREA_MAPPINGS:
            return UnitType.AREA
        return UnitType.OTHER

    @classmethod
    def are_compatible(cls, unit1: str, unit2: str) -> bool:
        """
        Check if two units are compatible for calculations.

        Units are compatible when they normalize to the same canonical unit,
        when both are plain currencies, or when both are one of
        'shares' / 'shares_unit'. Everything else is incompatible.

        Args:
            unit1: First unit
            unit2: Second unit

        Returns:
            True if units are compatible
        """
        norm1 = cls.normalize_unit(unit1)
        norm2 = cls.normalize_unit(unit2)

        # Exact match
        if norm1 == norm2:
            return True

        type1 = cls.get_unit_type(norm1)
        type2 = cls.get_unit_type(norm2)
        if type1 != type2:
            return False

        if type1 == UnitType.CURRENCY:
            # Per-share units must match exactly (USD_per_share !=
            # USD_per_share_unit); the exact-match case already returned above,
            # so any per-share unit reaching this point is a mismatch.
            if norm1 in cls.PER_SHARE_MAPPINGS or norm2 in cls.PER_SHARE_MAPPINGS:
                return False
            return True  # Regular currencies could be converted

        if type1 == UnitType.SHARES:
            # shares and shares_unit are compatible for some calculations
            share_like = ('shares', 'shares_unit')
            return norm1 in share_like and norm2 in share_like

        return False

    @classmethod
    def get_normalized_value(
        cls,
        fact: FinancialFact,
        target_unit: Optional[str] = None,
        apply_scale: bool = True,
        strict_unit_match: bool = False
    ) -> UnitResult:
        """
        Get a normalized value from a financial fact with detailed error reporting.

        No unit conversion is performed. When the fact's unit is merely
        compatible with ``target_unit`` (and ``strict_unit_match`` is False),
        the value is returned as-is, ``normalized_unit`` stays the fact's own
        unit, and for currency units a conversion hint is added to
        ``suggestions``.

        Args:
            fact: FinancialFact to normalize; its ``numeric_value``, ``unit``
                and ``scale`` attributes are read
            target_unit: Desired unit (if None, just normalize existing unit)
            apply_scale: Whether to multiply the value by ``fact.scale``
                (skipped when the scale is None or 0)
            strict_unit_match: If True, require exact unit match. If False, allow compatible units.

        Returns:
            UnitResult with value and metadata. ``success`` is False (and
            ``value`` is None) when the fact has no numeric value or when the
            units do not match.
        """
        # Facts may carry no unit at all; use "" so original_unit is always a str.
        original_unit = fact.unit or ""

        if fact.numeric_value is None:
            return UnitResult(
                value=None,
                normalized_unit=None,
                original_unit=original_unit,
                success=False,
                error_reason="No numeric value available"
            )

        normalized_unit = cls.normalize_unit(original_unit)
        unit_type = cls.get_unit_type(normalized_unit)

        # Apply scale factor if requested
        value = fact.numeric_value
        scale_applied = None
        if apply_scale and fact.scale:
            value *= fact.scale
            scale_applied = fact.scale

        # If no target unit specified, return normalized value
        if target_unit is None:
            return UnitResult(
                value=value,
                normalized_unit=normalized_unit,
                original_unit=original_unit,
                success=True,
                scale_applied=scale_applied,
                unit_type=unit_type
            )

        # Check compatibility with target unit
        target_normalized = cls.normalize_unit(target_unit)

        if normalized_unit == target_normalized:
            # Exact match
            return UnitResult(
                value=value,
                normalized_unit=target_normalized,
                original_unit=original_unit,
                success=True,
                scale_applied=scale_applied,
                unit_type=unit_type
            )

        if not strict_unit_match and cls.are_compatible(normalized_unit, target_normalized):
            # Compatible units - could potentially convert (only if not in strict mode)
            suggestions = []
            if unit_type == UnitType.CURRENCY:
                suggestions.append(f"Consider currency conversion from {normalized_unit} to {target_normalized}")
            return UnitResult(
                value=value,
                normalized_unit=normalized_unit,  # Keep original, mark as compatible
                original_unit=original_unit,
                success=True,
                scale_applied=scale_applied,
                unit_type=unit_type,
                suggestions=suggestions
            )

        # Incompatible units
        return UnitResult(
            value=None,
            normalized_unit=normalized_unit,
            original_unit=original_unit,
            success=False,
            error_reason=f"Unit mismatch: {normalized_unit} is not compatible with {target_normalized}",
            unit_type=unit_type,
            suggestions=cls._get_unit_suggestions(normalized_unit, target_normalized)
        )

    @classmethod
    def _get_unit_suggestions(cls, actual_unit: str, target_unit: str) -> List[str]:
        """Generate helpful suggestions for unit mismatches.

        Args:
            actual_unit: Unit found on the fact
            target_unit: Unit the caller asked for

        Returns:
            List of human-readable hints; may be empty
        """
        suggestions = []
        actual_type = cls.get_unit_type(actual_unit)
        target_type = cls.get_unit_type(target_unit)

        if actual_type != target_type:
            suggestions.append(f"Unit type mismatch: {actual_unit} is {actual_type.value}, "
                               f"but {target_unit} is {target_type.value}")
            # Specific suggestions based on the requested unit type
            if target_type == UnitType.CURRENCY:
                suggestions.append("Consider using a financial amount concept instead of a ratio/count")
            elif target_type == UnitType.SHARES:
                suggestions.append("Consider using a share-based concept instead of a monetary amount")
        # Alternative units in the same category
        elif actual_type == UnitType.CURRENCY:
            suggestions.append("Use currency conversion or specify the correct currency unit")
        elif actual_type == UnitType.SHARES:
            suggestions.append("Try using 'shares' instead of 'shares_unit' or vice versa")

        return suggestions
def apply_scale_factor(value: float, scale: Optional[int]) -> float:
    """
    Multiply a value by its scale factor when one is set.

    Args:
        value: Numeric value
        scale: Scale factor (e.g., 1000 for thousands); None, 0 and 1 all
            leave the value untouched

    Returns:
        ``value * scale``, or ``value`` unchanged when there is nothing to apply
    """
    nothing_to_apply = not scale or scale == 1
    if nothing_to_apply:
        return value
    return value * scale
def format_unit_error(unit_result: UnitResult) -> str:
    """
    Render a failed UnitResult as a message for display to the user.

    Args:
        unit_result: UnitResult with error details

    Returns:
        "No error" for a successful result; otherwise the error reason,
        followed by any suggestions, the original unit and - when it differs
        from the original - the normalized unit
    """
    if unit_result.success:
        return "No error"

    # Collect the fragments in order and join once at the end.
    pieces = [f"Unit handling error: {unit_result.error_reason}"]

    hints = unit_result.suggestions
    if hints:
        pieces.append("\n Suggestions:\n")
        pieces.extend(f" - {hint}\n" for hint in hints)

    pieces.append(f" Original unit: '{unit_result.original_unit}'")
    if unit_result.normalized_unit != unit_result.original_unit:
        pieces.append(f" Normalized to: '{unit_result.normalized_unit}'")

    return "".join(pieces)
# Legacy support - thin aliases kept so existing callers keep working
def normalize_unit_legacy(unit: str) -> str:
    """Backward-compatible alias that delegates to UnitNormalizer.normalize_unit.

    Args:
        unit: Raw unit string

    Returns:
        Whatever UnitNormalizer.normalize_unit returns for ``unit``
    """
    canonical = UnitNormalizer.normalize_unit(unit)
    return canonical
def are_units_compatible_legacy(unit1: str, unit2: str) -> bool:
    """Backward-compatible alias that delegates to UnitNormalizer.are_compatible.

    Args:
        unit1: First unit
        unit2: Second unit

    Returns:
        Whatever UnitNormalizer.are_compatible returns for the two units
    """
    compatible = UnitNormalizer.are_compatible(unit1, unit2)
    return compatible