""" Parser for converting SEC API data to the new Entity Facts format. This module handles the conversion of raw SEC company facts JSON data into the new unified FinancialFact model. """ import logging from datetime import date, datetime from typing import Any, Dict, List, Optional from edgar.entity.entity_facts import EntityFacts from edgar.entity.mappings_loader import load_learned_mappings from edgar.entity.models import DataQuality, FinancialFact log = logging.getLogger(__name__) class EntityFactsParser: """ Parser for converting SEC company facts to EntityFacts. This class handles the transformation of raw SEC API data into the new unified fact model with proper typing and AI-ready metadata. """ # Concept mapping for common financial statement items STATEMENT_MAPPING = { # Income Statement 'Revenue': 'IncomeStatement', 'Revenues': 'IncomeStatement', # Fix for Issue #438 - ensure us-gaap:Revenues maps properly 'RevenueFromContractWithCustomerExcludingAssessedTax': 'IncomeStatement', 'SalesRevenueNet': 'IncomeStatement', 'CostOfRevenue': 'IncomeStatement', 'GrossProfit': 'IncomeStatement', 'OperatingExpenses': 'IncomeStatement', 'OperatingIncomeLoss': 'IncomeStatement', 'NetIncomeLoss': 'IncomeStatement', 'EarningsPerShareDiluted': 'IncomeStatement', # Balance Sheet 'Assets': 'BalanceSheet', 'AssetsCurrent': 'BalanceSheet', 'CurrentAssets': 'BalanceSheet', 'AssetsNoncurrent': 'BalanceSheet', 'Liabilities': 'BalanceSheet', 'LiabilitiesCurrent': 'BalanceSheet', 'CurrentLiabilities': 'BalanceSheet', 'LiabilitiesNoncurrent': 'BalanceSheet', 'StockholdersEquity': 'BalanceSheet', 'CashAndCashEquivalentsAtCarryingValue': 'BalanceSheet', # Cash Flow 'NetCashProvidedByUsedInOperatingActivities': 'CashFlow', 'NetCashProvidedByUsedInInvestingActivities': 'CashFlow', 'NetCashProvidedByUsedInFinancingActivities': 'CashFlow', 'CashAndCashEquivalentsPeriodIncreaseDecrease': 'CashFlow' } # Semantic tags for concepts SEMANTIC_TAGS = { 'Revenue': ['revenue', 'sales', 'operating'], 'NetIncomeLoss': ['profit', 'earnings', 'bottom_line'], 'Assets': ['assets', 'resources', 'balance_sheet'], 'CashAndCashEquivalentsAtCarryingValue': ['cash', 'liquidity', 'current_assets'] } @classmethod def parse_company_facts(cls, json_data: Dict[str, Any]) -> Optional[EntityFacts]: """ Parse SEC company facts JSON into EntityFacts. Args: json_data: Raw JSON from SEC API Returns: EntityFacts object or None if parsing fails """ try: cik = int(json_data.get('cik', 0)) entity_name = json_data.get('entityName', 'Unknown') facts = [] # Process facts from different taxonomies facts_data = json_data.get('facts', {}) for taxonomy, taxonomy_facts in facts_data.items(): for concept, concept_data in taxonomy_facts.items(): # Process units for this concept units = concept_data.get('units', {}) label = concept_data.get('label', concept) description = concept_data.get('description', '') for unit, unit_facts in units.items(): for fact_data in unit_facts: fact = cls._parse_single_fact( concept=concept, taxonomy=taxonomy, label=label, description=description, unit=unit, fact_data=fact_data ) if fact: facts.append(fact) if not facts: log.warning("No facts found for CIK %s", cik) return None return EntityFacts(cik=cik, name=entity_name, facts=facts) except Exception as e: log.error("Error parsing company facts: %s", e) return None @classmethod def _parse_single_fact(cls, concept: str, taxonomy: str, label: str, description: str, unit: str, fact_data: Dict[str, Any]) -> Optional[FinancialFact]: """ Parse a single fact from SEC data. Args: concept: Concept identifier taxonomy: Taxonomy namespace label: Human-readable label description: Concept description unit: Unit of measure fact_data: Raw fact data Returns: FinancialFact or None if parsing fails """ # Extract core values value = fact_data.get('val') if value is None: return None # Parse dates period_end = cls._parse_date(fact_data.get('end')) period_start = cls._parse_date(fact_data.get('start')) filing_date = cls._parse_date(fact_data.get('filed')) # Determine period type if period_start: period_type = 'duration' else: period_type = 'instant' # Parse fiscal period info fiscal_year = cls._parse_fiscal_year(fact_data.get('fy')) fiscal_period = fact_data.get('fp', '') # Determine numeric value numeric_value = None if isinstance(value, (int, float)): numeric_value = float(value) elif isinstance(value, str) and value.replace('-', '').replace('.', '').isdigit(): try: numeric_value = float(value) except ValueError: pass # Determine statement type statement_type = cls._determine_statement_type(concept) # Get semantic tags semantic_tags = cls._get_semantic_tags(concept) # Get structural metadata from learned mappings structural_info = cls._get_structural_info(concept) # Determine data quality data_quality = cls._assess_data_quality(fact_data, fiscal_period) # Create business context business_context = cls._generate_business_context(label, description, unit) # Clean unit representation clean_unit = cls._clean_unit(unit) # Determine scale scale = cls._determine_scale(unit) return FinancialFact( concept=f"{taxonomy}:{concept}", taxonomy=taxonomy, label=label, value=value, numeric_value=numeric_value, unit=clean_unit, scale=scale, period_start=period_start, period_end=period_end, period_type=period_type, fiscal_year=fiscal_year, fiscal_period=fiscal_period, filing_date=filing_date, form_type=fact_data.get('form', ''), accession=fact_data.get('accn', ''), data_quality=data_quality, is_audited=fiscal_period == 'FY', # Annual reports are typically audited is_restated=False, # Would need additional logic to detect is_estimated=False, # Would need additional logic to detect confidence_score=0.9 if data_quality == DataQuality.HIGH else 0.7, semantic_tags=semantic_tags, business_context=business_context, statement_type=statement_type, # Add structural metadata depth=structural_info.get('depth'), parent_concept=structural_info.get('parent'), section=structural_info.get('section'), is_abstract=structural_info.get('is_abstract', False), is_total=structural_info.get('is_total', False), presentation_order=structural_info.get('avg_depth') ) @staticmethod def _parse_date(date_str: Optional[str]) -> Optional[date]: """Parse date string to date object""" if not date_str: return None try: # Try common date formats for fmt in ['%Y-%m-%d', '%Y%m%d', '%m/%d/%Y']: try: return datetime.strptime(date_str, fmt).date() except ValueError: continue # If all formats fail, try to parse as ISO format return datetime.fromisoformat(date_str).date() except Exception: return None @staticmethod def _parse_fiscal_year(fy_value: Any) -> int: """Parse fiscal year value""" if not fy_value: return 0 try: return int(fy_value) except (ValueError, TypeError): return 0 @classmethod def _determine_statement_type(cls, concept: str) -> Optional[str]: """ Determine which financial statement a concept belongs to. First checks static mappings, then falls back to learned mappings with confidence threshold. """ # Remove namespace if present if ':' in concept: concept = concept.split(':')[-1] # Check static mappings first (highest confidence) if concept in cls.STATEMENT_MAPPING: return cls.STATEMENT_MAPPING[concept] # Check learned mappings try: learned_mappings = load_learned_mappings() if concept in learned_mappings: mapping = learned_mappings[concept] # Only use high-confidence learned mappings if mapping.get('confidence', 0) >= 0.5: # 50% threshold return mapping['statement_type'] except Exception as e: log.debug("Error loading learned mappings: %s", e) return None @classmethod def _get_semantic_tags(cls, concept: str) -> List[str]: """Get semantic tags for a concept""" # Remove namespace if present if ':' in concept: concept = concept.split(':')[-1] return cls.SEMANTIC_TAGS.get(concept, []) @classmethod def _get_structural_info(cls, concept: str) -> Dict[str, Any]: """ Get structural metadata for a concept from learned mappings. Returns dict with depth, parent, section, is_abstract, is_total """ # Remove namespace if present if ':' in concept: concept = concept.split(':')[-1] try: learned_mappings = load_learned_mappings() if concept in learned_mappings: mapping = learned_mappings[concept] return { 'depth': int(mapping.get('avg_depth', 0)) if mapping.get('avg_depth') else None, 'parent': mapping.get('parent'), 'section': mapping.get('section'), 'is_abstract': mapping.get('is_abstract', False), 'is_total': mapping.get('is_total', False) } except Exception as e: log.debug("Error getting structural info: %s", e) return {} @staticmethod def _assess_data_quality(fact_data: Dict[str, Any], fiscal_period: str) -> DataQuality: """Assess the quality of a fact""" # Annual data is typically higher quality if fiscal_period == 'FY': return DataQuality.HIGH # Quarterly data if fiscal_period in ['Q1', 'Q2', 'Q3', 'Q4']: return DataQuality.HIGH # Other data return DataQuality.MEDIUM @staticmethod def _generate_business_context(label: str, description: str, unit: str) -> str: """Generate business context for a fact""" # Handle null/None values if not label: label = "" if not description: description = "" # Return description if it's longer and more informative than label if description and len(description) > len(label): return description # Generate context based on label and unit if label and 'Revenue' in label: return "Total revenue generated from operations" elif label and 'Income' in label: return "Net earnings after all expenses and taxes" elif label and 'Assets' in label: return "Total resources owned by the company" # Return label if available, otherwise empty string return label if label else "" @staticmethod def _clean_unit(unit: str) -> str: """Clean and standardize unit representation""" if not unit: return "" unit_mapping = { 'USD': 'USD', 'usd': 'USD', 'pure': 'number', 'shares': 'shares', 'USD/shares': 'USD per share' } return unit_mapping.get(unit, unit) @staticmethod def _determine_scale(unit: str) -> Optional[int]: """Determine scale factor from unit""" # SEC data is typically already scaled # This would need more sophisticated logic based on the actual data return None