Files
2025-12-09 12:13:01 +01:00

383 lines
13 KiB
Python

"""
Parser for converting SEC API data to the new Entity Facts format.
This module handles the conversion of raw SEC company facts JSON data
into the new unified FinancialFact model.
"""
import logging
from datetime import date, datetime
from typing import Any, Dict, List, Optional
from edgar.entity.entity_facts import EntityFacts
from edgar.entity.mappings_loader import load_learned_mappings
from edgar.entity.models import DataQuality, FinancialFact
# Module-level logger; EntityFactsParser uses it to report parse warnings,
# errors, and learned-mapping lookup failures.
log = logging.getLogger(__name__)
class EntityFactsParser:
    """
    Parser for converting SEC company facts to EntityFacts.

    This class handles the transformation of raw SEC API data into
    the new unified fact model with proper typing and AI-ready metadata.
    All methods are class or static methods; no instance state is used.
    """

    # Concept mapping for common financial statement items
    STATEMENT_MAPPING = {
        # Income Statement
        'Revenue': 'IncomeStatement',
        'Revenues': 'IncomeStatement',  # Fix for Issue #438 - ensure us-gaap:Revenues maps properly
        'RevenueFromContractWithCustomerExcludingAssessedTax': 'IncomeStatement',
        'SalesRevenueNet': 'IncomeStatement',
        'CostOfRevenue': 'IncomeStatement',
        'GrossProfit': 'IncomeStatement',
        'OperatingExpenses': 'IncomeStatement',
        'OperatingIncomeLoss': 'IncomeStatement',
        'NetIncomeLoss': 'IncomeStatement',
        'EarningsPerShareDiluted': 'IncomeStatement',
        # Balance Sheet
        'Assets': 'BalanceSheet',
        'AssetsCurrent': 'BalanceSheet',
        'CurrentAssets': 'BalanceSheet',
        'AssetsNoncurrent': 'BalanceSheet',
        'Liabilities': 'BalanceSheet',
        'LiabilitiesCurrent': 'BalanceSheet',
        'CurrentLiabilities': 'BalanceSheet',
        'LiabilitiesNoncurrent': 'BalanceSheet',
        'StockholdersEquity': 'BalanceSheet',
        'CashAndCashEquivalentsAtCarryingValue': 'BalanceSheet',
        # Cash Flow
        'NetCashProvidedByUsedInOperatingActivities': 'CashFlow',
        'NetCashProvidedByUsedInInvestingActivities': 'CashFlow',
        'NetCashProvidedByUsedInFinancingActivities': 'CashFlow',
        'CashAndCashEquivalentsPeriodIncreaseDecrease': 'CashFlow',
    }

    # Semantic tags for concepts
    SEMANTIC_TAGS = {
        'Revenue': ['revenue', 'sales', 'operating'],
        'NetIncomeLoss': ['profit', 'earnings', 'bottom_line'],
        'Assets': ['assets', 'resources', 'balance_sheet'],
        'CashAndCashEquivalentsAtCarryingValue': ['cash', 'liquidity', 'current_assets'],
    }

    @classmethod
    def parse_company_facts(cls, json_data: Dict[str, Any]) -> Optional["EntityFacts"]:
        """
        Parse SEC company facts JSON into EntityFacts.

        Walks ``facts -> taxonomy -> concept -> units -> unit -> [fact, ...]``
        and converts every entry via ``_parse_single_fact``.

        Args:
            json_data: Raw JSON from SEC API

        Returns:
            EntityFacts object, or None if no facts were produced or if
            any error occurred while parsing (the error is logged).
        """
        try:
            cik = int(json_data.get('cik', 0))
            entity_name = json_data.get('entityName', 'Unknown')
            facts = []

            # ``or {}`` tolerates an explicit JSON null as well as a missing key.
            facts_data = json_data.get('facts') or {}
            for taxonomy, taxonomy_facts in facts_data.items():
                for concept, concept_data in taxonomy_facts.items():
                    units = concept_data.get('units') or {}
                    label = concept_data.get('label', concept)
                    description = concept_data.get('description', '')

                    for unit, unit_facts in units.items():
                        for fact_data in unit_facts:
                            fact = cls._parse_single_fact(
                                concept=concept,
                                taxonomy=taxonomy,
                                label=label,
                                description=description,
                                unit=unit,
                                fact_data=fact_data,
                            )
                            if fact:
                                facts.append(fact)

            if not facts:
                log.warning("No facts found for CIK %s", cik)
                return None

            return EntityFacts(cik=cik, name=entity_name, facts=facts)
        except Exception as e:
            # Top-level boundary: callers get None instead of an exception.
            # log.exception keeps the traceback so the failure is diagnosable.
            log.exception("Error parsing company facts: %s", e)
            return None

    @classmethod
    def _parse_single_fact(cls,
                           concept: str,
                           taxonomy: str,
                           label: str,
                           description: str,
                           unit: str,
                           fact_data: Dict[str, Any]) -> Optional["FinancialFact"]:
        """
        Parse a single fact from SEC data.

        Args:
            concept: Concept identifier
            taxonomy: Taxonomy namespace
            label: Human-readable label
            description: Concept description
            unit: Unit of measure
            fact_data: Raw fact data (reads the keys 'val', 'end', 'start',
                'filed', 'fy', 'fp', 'form' and 'accn')

        Returns:
            FinancialFact, or None when the fact has no 'val'.
        """
        # Extract core values
        value = fact_data.get('val')
        if value is None:
            return None

        # Parse dates
        period_end = cls._parse_date(fact_data.get('end'))
        period_start = cls._parse_date(fact_data.get('start'))
        filing_date = cls._parse_date(fact_data.get('filed'))

        # A fact with a parseable start date covers a duration; otherwise
        # it is treated as a point-in-time (instant) value.
        period_type = 'duration' if period_start else 'instant'

        # Parse fiscal period info
        fiscal_year = cls._parse_fiscal_year(fact_data.get('fy'))
        fiscal_period = fact_data.get('fp', '')

        # Derived metadata
        statement_type = cls._determine_statement_type(concept)
        semantic_tags = cls._get_semantic_tags(concept)
        structural_info = cls._get_structural_info(concept)
        data_quality = cls._assess_data_quality(fact_data, fiscal_period)
        business_context = cls._generate_business_context(label, description, unit)

        return FinancialFact(
            concept=f"{taxonomy}:{concept}",
            taxonomy=taxonomy,
            label=label,
            value=value,
            numeric_value=cls._coerce_numeric(value),
            unit=cls._clean_unit(unit),
            scale=cls._determine_scale(unit),
            period_start=period_start,
            period_end=period_end,
            period_type=period_type,
            fiscal_year=fiscal_year,
            fiscal_period=fiscal_period,
            filing_date=filing_date,
            form_type=fact_data.get('form', ''),
            accession=fact_data.get('accn', ''),
            data_quality=data_quality,
            is_audited=fiscal_period == 'FY',  # Annual reports are typically audited
            is_restated=False,  # Would need additional logic to detect
            is_estimated=False,  # Would need additional logic to detect
            confidence_score=0.9 if data_quality == DataQuality.HIGH else 0.7,
            semantic_tags=semantic_tags,
            business_context=business_context,
            statement_type=statement_type,
            # Add structural metadata
            depth=structural_info.get('depth'),
            parent_concept=structural_info.get('parent'),
            section=structural_info.get('section'),
            is_abstract=structural_info.get('is_abstract', False),
            is_total=structural_info.get('is_total', False),
            presentation_order=structural_info.get('avg_depth'),
        )

    @staticmethod
    def _coerce_numeric(value: Any) -> Optional[float]:
        """
        Return ``value`` as a float when it is numeric, else None.

        ints and floats are converted directly. Strings are converted only
        when they consist of digits once '-' and '.' are removed.
        """
        if isinstance(value, (int, float)):
            return float(value)
        if isinstance(value, str) and value.replace('-', '').replace('.', '').isdigit():
            try:
                return float(value)
            except ValueError:
                # e.g. "1-2" or "1.2.3" pass the digit pre-check but are not floats
                return None
        return None

    @staticmethod
    def _local_name(concept: str) -> str:
        """Return the concept name with any 'namespace:' prefix removed."""
        return concept.split(':')[-1]

    @staticmethod
    def _parse_date(date_str: Optional[str]) -> Optional[date]:
        """
        Parse date string to date object.

        Tries 'YYYY-MM-DD', 'YYYYMMDD' and 'MM/DD/YYYY' in that order, then
        falls back to ``datetime.fromisoformat``.

        Returns:
            The parsed date, or None for empty or unparseable input.
        """
        if not date_str:
            return None
        try:
            # Try common date formats
            for fmt in ('%Y-%m-%d', '%Y%m%d', '%m/%d/%Y'):
                try:
                    return datetime.strptime(date_str, fmt).date()
                except ValueError:
                    continue
            # If all formats fail, try to parse as ISO format
            return datetime.fromisoformat(date_str).date()
        except Exception:
            # Best-effort: any failure (including non-string input) means "no date"
            return None

    @staticmethod
    def _parse_fiscal_year(fy_value: Any) -> int:
        """Parse fiscal year value; returns 0 when missing or not convertible to int."""
        if not fy_value:
            return 0
        try:
            return int(fy_value)
        except (ValueError, TypeError):
            return 0

    @classmethod
    def _determine_statement_type(cls, concept: str) -> Optional[str]:
        """
        Determine which financial statement a concept belongs to.

        First checks static mappings, then falls back to learned mappings
        with confidence threshold.

        Returns:
            The statement type name, or None when the concept is unknown,
            its learned confidence is below 0.5, or the lookup fails.
        """
        concept = cls._local_name(concept)

        # Check static mappings first (highest confidence)
        if concept in cls.STATEMENT_MAPPING:
            return cls.STATEMENT_MAPPING[concept]

        # Check learned mappings
        try:
            learned_mappings = load_learned_mappings()
            if concept in learned_mappings:
                mapping = learned_mappings[concept]
                # Only use learned mappings that meet the 50% threshold
                if mapping.get('confidence', 0) >= 0.5:
                    return mapping['statement_type']
        except Exception as e:
            log.debug("Error loading learned mappings: %s", e)
        return None

    @classmethod
    def _get_semantic_tags(cls, concept: str) -> List[str]:
        """Get semantic tags for a concept (empty list when none are defined)."""
        return cls.SEMANTIC_TAGS.get(cls._local_name(concept), [])

    @classmethod
    def _get_structural_info(cls, concept: str) -> Dict[str, Any]:
        """
        Get structural metadata for a concept from learned mappings.

        Returns:
            Dict with depth, avg_depth, parent, section, is_abstract and
            is_total, or an empty dict when the concept has no learned
            mapping or the lookup fails.
        """
        concept = cls._local_name(concept)
        try:
            learned_mappings = load_learned_mappings()
            if concept in learned_mappings:
                mapping = learned_mappings[concept]
                avg_depth = mapping.get('avg_depth')
                return {
                    # A missing or zero avg_depth yields depth None
                    'depth': int(avg_depth) if avg_depth else None,
                    # Raw value; _parse_single_fact reads it for presentation_order
                    'avg_depth': avg_depth,
                    'parent': mapping.get('parent'),
                    'section': mapping.get('section'),
                    'is_abstract': mapping.get('is_abstract', False),
                    'is_total': mapping.get('is_total', False),
                }
        except Exception as e:
            log.debug("Error getting structural info: %s", e)
        return {}

    @staticmethod
    def _assess_data_quality(fact_data: Dict[str, Any], fiscal_period: str) -> "DataQuality":
        """
        Assess the quality of a fact.

        Annual ('FY') and quarterly ('Q1'-'Q4') facts are rated HIGH; any
        other fiscal period is rated MEDIUM. ``fact_data`` is currently unused.
        """
        if fiscal_period in ('FY', 'Q1', 'Q2', 'Q3', 'Q4'):
            return DataQuality.HIGH
        return DataQuality.MEDIUM

    @staticmethod
    def _generate_business_context(label: str, description: str, unit: str) -> str:
        """
        Generate business context for a fact.

        Prefers the description when it is longer than the label; otherwise
        uses a canned phrase for labels containing 'Revenue', 'Income' or
        'Assets', and finally the label itself. ``unit`` is currently unused.
        """
        # Handle null/None values
        label = label or ""
        description = description or ""

        # Return description if it's longer and more informative than label
        if description and len(description) > len(label):
            return description

        # Generate context based on label keywords (first match wins)
        if 'Revenue' in label:
            return "Total revenue generated from operations"
        if 'Income' in label:
            return "Net earnings after all expenses and taxes"
        if 'Assets' in label:
            return "Total resources owned by the company"

        # Fall back to the label (already "" when it was missing)
        return label

    @staticmethod
    def _clean_unit(unit: str) -> str:
        """Clean and standardize unit representation; unknown units pass through unchanged."""
        if not unit:
            return ""
        unit_mapping = {
            'USD': 'USD',
            'usd': 'USD',
            'pure': 'number',
            'shares': 'shares',
            'USD/shares': 'USD per share',
        }
        return unit_mapping.get(unit, unit)

    @staticmethod
    def _determine_scale(unit: str) -> Optional[int]:
        """Determine scale factor from unit (placeholder: always returns None)."""
        # SEC data is typically already scaled
        # This would need more sophisticated logic based on the actual data
        return None