""" Enhanced financial statement that combines hierarchical structure with multi-period display. This module provides an enhanced statement class that uses learned mappings to show multiple periods with proper hierarchical organization. Note: PD011 violations in this file are false positives - .values refers to Dict[str, Optional[float]] on MultiPeriodItem objects, not pandas DataFrames. """ # ruff: noqa: PD011 from collections import defaultdict from dataclasses import dataclass, field from datetime import date from typing import Any, Dict, List, Optional import pandas as pd from rich import box from rich.console import Group from rich.padding import Padding from rich.panel import Panel from rich.table import Table from rich.text import Text from edgar.core import log from edgar.entity.mappings_loader import load_learned_mappings, load_virtual_trees from edgar.entity.models import FinancialFact try: from edgar.entity.terminal_styles import get_current_scheme except ImportError: # Fallback if terminal_styles not available - use professional scheme def get_current_scheme(): return { "abstract_item": "bold blue", "total_item": "bold bright_white", "regular_item": "", "low_confidence_item": "italic", "positive_value": "green", "negative_value": "red", "total_value_prefix": "bold", "separator": "blue", "company_name": "bold bright_white", "statement_type": "bold blue", "panel_border": "white", "empty_value": "bright_black", } from edgar.richtools import repr_rich @dataclass class MultiPeriodStatement: """ A financial statement showing multiple periods with hierarchical structure. Combines the best of both worlds: - Multiple periods side-by-side (like current pivot tables) - Hierarchical organization (from StructuredStatement) - Learned concept mappings for better coverage """ statement_type: str periods: List[str] # Period labels like ["Q1 2024", "Q2 2024"] # Hierarchical items with multi-period values items: List['MultiPeriodItem'] # Metadata company_name: Optional[str] = None cik: Optional[str] = None canonical_coverage: float = 0.0 # Display format control concise_format: bool = False # If True, display as $1.0B, if False display as $1,000,000,000 def __rich__(self): """Create a rich representation with multiple periods.""" # Get color scheme at the start colors = get_current_scheme() # Statement type mapping statement_names = { 'IncomeStatement': 'Income Statement', 'BalanceSheet': 'Balance Sheet', 'CashFlow': 'Cash Flow Statement' } # Title title_parts = [] if self.company_name: title_parts.append((self.company_name, colors["company_name"])) else: title_parts.append(("Financial Statement", colors["total_item"])) title = Text.assemble(*title_parts) # Subtitle statement_display = statement_names.get(self.statement_type, self.statement_type) period_range = f"{self.periods[-1]} to {self.periods[0]}" if len(self.periods) > 1 else self.periods[0] if self.periods else "" subtitle = f"{statement_display} • {period_range}" # Main table with multiple period columns stmt_table = Table( box=box.SIMPLE, show_header=True, padding=(0, 1), expand=True ) # Add concept column stmt_table.add_column("", style="", ratio=2) # Add period columns for period in self.periods: stmt_table.add_column(period, justify="right", style="bold", ratio=1) def add_item_to_table(item: 'MultiPeriodItem', depth: int = 0): """Add an item row to the table.""" indent = " " * depth # Prepare row values row = [] # Concept label if item.is_abstract: row.append(Text(f"{indent}{item.label}", style=colors["abstract_item"])) elif item.is_total: row.append(Text(f"{indent}{item.label}", style=colors["total_item"])) else: # Check if this is a key financial item that should always be prominent important_labels = [ 'Total Revenue', 'Revenue', 'Net Sales', 'Total Net Sales', 'Operating Income', 'Operating Income (Loss)', 'Operating Profit', 'Net Income', 'Net Income (Loss)', 'Net Earnings', 'Gross Profit', 'Gross Margin', 'Cost of Revenue', 'Cost of Goods Sold', 'Operating Expenses', 'Total Operating Expenses', 'Earnings Per Share', 'EPS' ] is_important = any(label in item.label for label in important_labels) # Don't mark important items as low confidence even if score is low if is_important: style = colors["total_item"] # Use bold styling for important items confidence_marker = "" else: style = colors["low_confidence_item"] if item.confidence < 0.8 else colors["regular_item"] confidence_marker = " ◦" if item.confidence < 0.8 else "" row.append(Text(f"{indent}{item.label}{confidence_marker}", style=style)) # Period values for period in self.periods: value_str = item.get_display_value(period, concise_format=self.concise_format) if value_str and value_str != "-": # Color code values value = item.values.get(period) if value and isinstance(value, (int, float)): value_style = colors["negative_value"] if value < 0 else colors["positive_value"] else: value_style = "" if item.is_total: # Combine total style with value color if present total_style = colors["total_value_prefix"] if value_style: total_style = f"{total_style} {value_style}" row.append(Text(value_str, style=total_style)) else: row.append(Text(value_str, style=value_style)) else: row.append("") stmt_table.add_row(*row) # Add separator line after totals if item.is_total and depth == 0: separator_row = [Text("─" * 40, style=colors["separator"])] for _ in self.periods: separator_row.append(Text("─" * 15, style=colors["separator"])) stmt_table.add_row(*separator_row) # Add children for child in item.children: if depth < 3: add_item_to_table(child, depth + 1) # Add all items for item in self.items: add_item_to_table(item) # Combine content content_parts = [ Padding("", (1, 0, 0, 0)), stmt_table ] content = Group(*content_parts) return Panel( content, title=title, subtitle=subtitle, border_style=colors["panel_border"], expand=True ) def to_dataframe(self) -> pd.DataFrame: """ Convert the multi-period statement to a DataFrame. Returns: DataFrame with concepts as rows and periods as columns """ data = [] def collect_items(item: 'MultiPeriodItem', depth: int = 0): """Recursively collect items into flat structure.""" # Create row data row = { 'concept': item.concept, 'label': item.label, 'depth': depth, 'is_abstract': item.is_abstract, 'is_total': item.is_total, 'section': item.section, 'confidence': item.confidence } # Add period values for period in self.periods: row[period] = item.values.get(period) data.append(row) # Process children for child in item.children: collect_items(child, depth + 1) # Collect all items for item in self.items: collect_items(item) # Create DataFrame df = pd.DataFrame(data) # Set concept as index if not df.empty: df = df.set_index('concept') return df def to_llm_context(self, include_metadata: bool = True, include_hierarchy: bool = False, flatten_values: bool = True) -> Dict[str, Any]: """ Generate structured context optimized for LLM consumption. This method creates a clean, structured representation of financial data that LLMs can easily parse and reason about, avoiding complex hierarchies and focusing on key-value pairs with clear semantics. Args: include_metadata: Include metadata about data quality and coverage include_hierarchy: Include parent-child relationships (default False for simplicity) flatten_values: Flatten multi-period values into period-prefixed keys (default True) Returns: Dictionary with structured financial data for LLM analysis Example Output: { "company": "Apple Inc.", "statement_type": "income_statement", "periods": ["FY 2024", "FY 2023"], "currency": "USD", "scale": "actual", "data": { "revenue_fy2024": 391035000000, "revenue_fy2023": 383285000000, "net_income_fy2024": 93736000000, ... }, "key_metrics": { "revenue_growth": 0.02, "profit_margin_fy2024": 0.24, ... }, "metadata": { "total_concepts": 173, "coverage_ratio": 0.85, ... } } """ from datetime import datetime context = { "company": self.company_name or "Unknown", "cik": self.cik or "Unknown", "statement_type": self._get_statement_type_name(), "periods": self.periods, "currency": "USD", # Default, could be enhanced "scale": "actual", # Values are in actual amounts "generated_at": datetime.now().isoformat() } # Prepare main data section data = {} hierarchical_data = [] if include_hierarchy else None def process_item(item: 'MultiPeriodItem', parent_path: str = ""): """Process an item and its children.""" # Skip abstract items unless they have values if item.is_abstract and not any(v is not None for v in item.values.values()): # Still process children for child in item.children: process_item(child, parent_path) return # Create a clean concept key (lowercase, underscored) concept_key = self._create_llm_key(item.concept) if flatten_values: # Create period-specific keys for period in self.periods: value = item.values.get(period) if value is not None: # Create period suffix period_key = period.lower().replace(' ', '_').replace('-', '_') full_key = f"{concept_key}_{period_key}" data[full_key] = value # Also store with label for better readability label_key = f"{self._create_llm_key(item.label)}_{period_key}" if label_key != full_key and label_key not in data: data[label_key] = value else: # Store as nested structure if any(v is not None for v in item.values.values()): data[concept_key] = { "label": item.label, "values": {p: v for p, v in item.values.items() if v is not None}, "is_total": item.is_total } # Add to hierarchical data if requested if include_hierarchy and hierarchical_data is not None: hierarchical_data.append({ "concept": item.concept, "label": item.label, "parent": parent_path or None, "depth": item.depth, "is_total": item.is_total, "values": {p: v for p, v in item.values.items() if v is not None} }) # Process children current_path = f"{parent_path}/{item.concept}" if parent_path else item.concept for child in item.children: process_item(child, current_path) # Process all top-level items for item in self.items: process_item(item) context["data"] = data if include_hierarchy and hierarchical_data: context["hierarchy"] = hierarchical_data # Calculate key metrics and ratios key_metrics = self._calculate_key_metrics(data) if key_metrics: context["key_metrics"] = key_metrics # Add metadata if requested if include_metadata: metadata = { "total_concepts": len([i for i in self._flatten_items() if not i.is_abstract]), "total_values": sum(1 for v in data.values() if v is not None), "periods_count": len(self.periods), "has_comparisons": len(self.periods) > 1, "coverage_ratio": self.coverage if hasattr(self, 'coverage') else None } # Add data quality indicators quality_indicators = [] if metadata["total_concepts"] > 100: quality_indicators.append("comprehensive") elif metadata["total_concepts"] > 50: quality_indicators.append("detailed") else: quality_indicators.append("basic") if metadata["has_comparisons"]: quality_indicators.append("comparable") metadata["quality_indicators"] = quality_indicators context["metadata"] = metadata return context def _get_statement_type_name(self) -> str: """Get clean statement type name for LLM context.""" type_map = { "IncomeStatement": "income_statement", "BalanceSheet": "balance_sheet", "CashFlow": "cash_flow", "CashFlowStatement": "cash_flow" } return type_map.get(self.statement_type, self.statement_type.lower()) def _create_llm_key(self, text: str) -> str: """Create a clean key from concept or label text.""" import re # Remove special characters and convert to snake_case text = re.sub(r'[^\w\s]', '', text) text = re.sub(r'\s+', '_', text.strip()) return text.lower() def _flatten_items(self) -> List['MultiPeriodItem']: """Flatten all items into a single list.""" result = [] def collect(item: 'MultiPeriodItem'): result.append(item) for child in item.children: collect(child) for item in self.items: collect(item) return result def _calculate_key_metrics(self, data: Dict[str, Any]) -> Dict[str, Any]: """Calculate important financial metrics from the data.""" metrics = {} # Try to calculate based on statement type if "income" in self.statement_type.lower(): metrics.update(self._calculate_income_metrics(data)) elif "balance" in self.statement_type.lower(): metrics.update(self._calculate_balance_metrics(data)) elif "cash" in self.statement_type.lower(): metrics.update(self._calculate_cashflow_metrics(data)) return metrics def _calculate_income_metrics(self, data: Dict[str, Any]) -> Dict[str, float]: """Calculate income statement metrics.""" metrics = {} # Find revenue and net income for each period for period in self.periods: period_key = period.lower().replace(' ', '_').replace('-', '_') # Find revenue revenue_keys = [k for k in data.keys() if 'revenue' in k.lower() and period_key in k and 'total' in k.lower()] if not revenue_keys: revenue_keys = [k for k in data.keys() if 'revenue' in k.lower() and period_key in k] if revenue_keys: revenue = data[revenue_keys[0]] # Find net income income_keys = [k for k in data.keys() if 'net_income' in k.lower() and period_key in k] if income_keys: net_income = data[income_keys[0]] # Calculate profit margin if revenue and revenue != 0: metrics[f"profit_margin_{period_key}"] = round(net_income / revenue, 4) # Find operating income op_income_keys = [k for k in data.keys() if 'operating_income' in k.lower() and period_key in k] if op_income_keys: op_income = data[op_income_keys[0]] if revenue and revenue != 0: metrics[f"operating_margin_{period_key}"] = round(op_income / revenue, 4) # Calculate growth rates if we have multiple periods if len(self.periods) >= 2: # Get the two most recent periods recent_period = self.periods[0].lower().replace(' ', '_').replace('-', '_') prior_period = self.periods[1].lower().replace(' ', '_').replace('-', '_') # Revenue growth recent_rev_keys = [k for k in data.keys() if 'revenue' in k.lower() and recent_period in k and 'total' in k.lower()] prior_rev_keys = [k for k in data.keys() if 'revenue' in k.lower() and prior_period in k and 'total' in k.lower()] if recent_rev_keys and prior_rev_keys: recent_rev = data[recent_rev_keys[0]] prior_rev = data[prior_rev_keys[0]] if prior_rev and prior_rev != 0: metrics["revenue_growth_rate"] = round((recent_rev - prior_rev) / prior_rev, 4) return metrics def _calculate_balance_metrics(self, data: Dict[str, Any]) -> Dict[str, float]: """Calculate balance sheet metrics.""" metrics = {} for period in self.periods: period_key = period.lower().replace(' ', '_').replace('-', '_') # Find key balance sheet items assets_keys = [k for k in data.keys() if 'total_assets' in k.lower() and period_key in k] liabilities_keys = [k for k in data.keys() if 'total_liabilities' in k.lower() and period_key in k] equity_keys = [k for k in data.keys() if 'stockholders_equity' in k.lower() and period_key in k] if assets_keys and liabilities_keys: assets = data[assets_keys[0]] liabilities = data[liabilities_keys[0]] # Debt to assets ratio if assets and assets != 0: metrics[f"debt_to_assets_{period_key}"] = round(liabilities / assets, 4) # Equity ratio if equity_keys: equity = data[equity_keys[0]] if assets and assets != 0: metrics[f"equity_ratio_{period_key}"] = round(equity / assets, 4) return metrics def _calculate_cashflow_metrics(self, data: Dict[str, Any]) -> Dict[str, float]: """Calculate cash flow metrics.""" metrics = {} for period in self.periods: period_key = period.lower().replace(' ', '_').replace('-', '_') # Find operating cash flow ocf_keys = [k for k in data.keys() if 'operating_activities' in k.lower() and 'net_cash' in k.lower() and period_key in k] if ocf_keys: ocf = data[ocf_keys[0]] # Find capital expenditures capex_keys = [k for k in data.keys() if 'capital_expenditure' in k.lower() and period_key in k] if not capex_keys: capex_keys = [k for k in data.keys() if 'property_plant_equipment' in k.lower() and 'acquire' in k.lower() and period_key in k] if capex_keys: capex = abs(data[capex_keys[0]]) # Capex is usually negative # Calculate free cash flow metrics[f"free_cash_flow_{period_key}"] = ocf - capex return metrics def __iter__(self): """ Iterate over all items in the statement (flat iteration). Yields items in display order (depth-first traversal). Example: for item in statement: print(f"{item.label}: {item.values}") """ def traverse(item: 'MultiPeriodItem'): yield item for child in item.children: yield from traverse(child) for item in self.items: yield from traverse(item) def iter_hierarchy(self): """ Iterate over items with hierarchy information. Yields tuples of (item, depth, parent) for each item. Example: for item, depth, parent in statement.iter_hierarchy(): indent = " " * depth print(f"{indent}{item.label}") """ def traverse(item: 'MultiPeriodItem', depth: int = 0, parent: Optional['MultiPeriodItem'] = None): yield (item, depth, parent) for child in item.children: yield from traverse(child, depth + 1, item) for item in self.items: yield from traverse(item) def iter_with_values(self): """ Iterate over items that have actual values (skip abstract/empty items). Yields only items with at least one non-None value. Example: for item in statement.iter_with_values(): for period in statement.periods: value = item.values.get(period) if value: print(f"{item.label} ({period}): ${value:,.0f}") """ for item in self: if any(v is not None for v in item.values.values()): yield item def get_items_by_depth(self, max_depth: int = None) -> List['MultiPeriodItem']: """ Get all items up to a specified depth level. Args: max_depth: Maximum depth to include (None for all depths) Returns: List of items up to the specified depth Example: # Get only top-level and first-level items top_items = statement.get_items_by_depth(1) """ result = [] for item, depth, _ in self.iter_hierarchy(): if max_depth is None or depth <= max_depth: result.append(item) return result def find_item(self, concept: str = None, label: str = None) -> Optional['MultiPeriodItem']: """ Find a specific item by concept name or label. Args: concept: Concept name to search for (case-insensitive) label: Label text to search for (case-insensitive) Returns: First matching item or None if not found Example: revenue = statement.find_item(label="Total Revenue") if revenue: print(revenue.values) """ if not concept and not label: return None for item in self: if concept and item.concept.lower() == concept.lower(): return item if label and item.label.lower() == label.lower(): return item return None def to_dict(self, include_empty: bool = False) -> Dict[str, Any]: """ Convert statement to a simple dictionary structure for JSON serialization. Args: include_empty: Include items with no values Returns: Dictionary representation suitable for web APIs Example: data = statement.to_dict() json.dumps(data) # Ready for web API response """ def item_to_dict(item: 'MultiPeriodItem') -> Dict[str, Any]: # Skip items with no values unless requested if not include_empty and not any(v is not None for v in item.values.values()): return None result = { 'concept': item.concept, 'label': item.label, 'values': item.values, 'is_abstract': item.is_abstract, 'is_total': item.is_total, 'depth': item.depth, 'confidence': item.confidence } # Add children if they exist if item.children: children = [] for child in item.children: child_dict = item_to_dict(child) if child_dict: children.append(child_dict) if children: result['children'] = children return result items_data = [] for item in self.items: item_dict = item_to_dict(item) if item_dict: items_data.append(item_dict) return { 'company': self.company_name, 'cik': self.cik, 'statement_type': self._get_statement_type_name(), 'periods': self.periods, 'items': items_data, 'metadata': { 'canonical_coverage': self.canonical_coverage, 'total_items': len(list(self.iter_with_values())), 'concise_format': self.concise_format } } def to_flat_list(self) -> List[Dict[str, Any]]: """ Convert statement to a flat list of items for table rendering. Returns: List of dictionaries, each representing one row Example: rows = statement.to_flat_list() # Perfect for rendering in HTML tables or data grids for row in rows: print(f"{row['label']}: {row['values']}") """ result = [] for item, depth, parent in self.iter_hierarchy(): # Skip empty abstract items if item.is_abstract and not any(v is not None for v in item.values.values()): continue row = { 'concept': item.concept, 'label': item.label, 'depth': depth, 'parent': parent.concept if parent else None, 'is_abstract': item.is_abstract, 'is_total': item.is_total, 'confidence': item.confidence } # Add period values for period in self.periods: row[period] = item.values.get(period) # Also add formatted version row[f"{period}_formatted"] = item.get_display_value(period, self.concise_format) result.append(row) return result def get_period_comparison(self, period1: str, period2: str) -> List[Dict[str, Any]]: """ Get comparison data between two periods. Args: period1: First period to compare period2: Second period to compare Returns: List of items with values, changes, and percentages Example: comparison = statement.get_period_comparison("FY 2024", "FY 2023") for item in comparison: if item['change_percent']: print(f"{item['label']}: {item['change_percent']:.1%} change") """ if period1 not in self.periods or period2 not in self.periods: raise ValueError(f"Periods must be in {self.periods}") result = [] for item in self.iter_with_values(): val1 = item.values.get(period1) val2 = item.values.get(period2) comparison = { 'concept': item.concept, 'label': item.label, 'is_total': item.is_total, period1: val1, period2: val2, f"{period1}_formatted": item.get_display_value(period1, self.concise_format), f"{period2}_formatted": item.get_display_value(period2, self.concise_format) } # Calculate change if both values exist if val1 is not None and val2 is not None and val2 != 0: change = val1 - val2 change_percent = change / abs(val2) comparison['change'] = change comparison['change_percent'] = change_percent comparison['change_formatted'] = f"${change:,.0f}" if abs(change) >= 1 else f"{change:.2f}" else: comparison['change'] = None comparison['change_percent'] = None comparison['change_formatted'] = None result.append(comparison) return result def _create_table(self, for_llm: bool = False) -> Table: """ Create the statement table without Panel wrapper. Args: for_llm: If True, use minimal formatting for LLM consumption Returns: Rich Table object """ # Get color scheme colors = get_current_scheme() # Choose box style based on context box_style = box.MINIMAL if for_llm else box.SIMPLE # Main table with multiple period columns stmt_table = Table( box=box_style, show_header=True, padding=(0, 1), expand=True ) # Add concept column stmt_table.add_column("", style="", ratio=2) # Add period columns for period in self.periods: stmt_table.add_column(period, justify="right", style="bold", ratio=1) def add_item_to_table(item: 'MultiPeriodItem', depth: int = 0): """Add an item row to the table.""" indent = " " * depth # Prepare row values row = [] # Concept label if item.is_abstract: row.append(Text(f"{indent}{item.label}", style=colors["abstract_item"])) elif item.is_total: row.append(Text(f"{indent}{item.label}", style=colors["total_item"])) else: # Check if this is a key financial item that should always be prominent important_labels = [ 'Total Revenue', 'Revenue', 'Net Sales', 'Total Net Sales', 'Operating Income', 'Operating Income (Loss)', 'Operating Profit', 'Net Income', 'Net Income (Loss)', 'Net Earnings', 'Gross Profit', 'Gross Margin', 'Cost of Revenue', 'Cost of Goods Sold', 'Operating Expenses', 'Total Operating Expenses', 'Earnings Per Share', 'EPS' ] is_important = any(label in item.label for label in important_labels) # Don't mark important items as low confidence even if score is low if is_important: style = colors["total_item"] # Use bold styling for important items confidence_marker = "" else: style = colors["low_confidence_item"] if item.confidence < 0.8 else colors["regular_item"] confidence_marker = " ◦" if item.confidence < 0.8 else "" row.append(Text(f"{indent}{item.label}{confidence_marker}", style=style)) # Period values for period in self.periods: value_str = item.get_display_value(period, concise_format=self.concise_format) if value_str and value_str != "-": # Color code values value = item.values.get(period) if value and isinstance(value, (int, float)): value_style = colors["negative_value"] if value < 0 else colors["positive_value"] else: value_style = "" if item.is_total: # Combine total style with value color if present total_style = colors["total_value_prefix"] if value_style: total_style = f"{total_style} {value_style}" row.append(Text(value_str, style=total_style)) else: row.append(Text(value_str, style=value_style)) else: row.append("") stmt_table.add_row(*row) # Add separator line after totals (skip for LLM to save characters) if item.is_total and depth == 0 and not for_llm: separator_row = [Text("─" * 40, style=colors["separator"])] for _ in self.periods: separator_row.append(Text("─" * 15, style=colors["separator"])) stmt_table.add_row(*separator_row) # Add children for child in item.children: if depth < 3: add_item_to_table(child, depth + 1) # Add all items for item in self.items: add_item_to_table(item) return stmt_table def to_llm_string(self) -> str: """ Generate LLM-optimized string representation. Uses minimal formatting optimized for LLM consumption: - No Panel borders (saves ~200 characters) - Minimal table box style (saves ~100 characters per row) - No ANSI color codes (plain text) - Assumes concise_format is already set for number formatting - Omits separator lines after totals Returns: String representation optimized for LLM token usage """ from io import StringIO from rich.console import Console buffer = StringIO() # Disable color/formatting codes for plain text output console = Console( file=buffer, force_terminal=False, # No ANSI codes no_color=True, # Plain text only width=120, legacy_windows=False ) # Create table without Panel wrapper table = self._create_table(for_llm=True) console.print(table) output = buffer.getvalue() return output def __repr__(self) -> str: """String representation using rich formatting.""" return repr_rich(self.__rich__()) @dataclass class MultiPeriodItem: """An item in a multi-period statement with values for each period.""" concept: str label: str values: Dict[str, Optional[float]] # Period -> Value mapping # Hierarchy depth: int parent_concept: Optional[str] children: List['MultiPeriodItem'] = field(default_factory=list) # Metadata is_abstract: bool = False is_total: bool = False section: Optional[str] = None confidence: float = 1.0 def get_display_value(self, period: str, concise_format: bool = False) -> str: """ Get formatted value for a specific period. Args: period: The period to get value for concise_format: If True, use concise format ($1.0B), if False use full numbers with commas Returns: Formatted value string """ value = self.values.get(period) if value is not None: # Check if this is a per-share amount is_per_share = any(indicator in self.concept.lower() or indicator in self.label.lower() for indicator in ['pershare', 'per share', 'earnings per', 'eps']) if is_per_share: # Format per-share amounts with 2 decimal places, no dollar sign return f"{value:.2f}" elif concise_format: # Use concise format ($1.0B, $1.0M, etc.) if abs(value) >= 1_000_000_000: return f"${value/1_000_000_000:.1f}B" elif abs(value) >= 1_000_000: return f"${value/1_000_000:.1f}M" elif abs(value) >= 1_000: return f"${value/1_000:.0f}K" else: return f"${value:.0f}" else: # Use full number format with commas # Format as integer if whole number, otherwise with appropriate decimals if value == int(value): return f"${int(value):,}" else: # Use appropriate decimal places based on magnitude if abs(value) >= 1: return f"${value:,.0f}" else: return f"${value:.2f}" elif self.is_abstract: return "" else: return "-" def validate_fiscal_year_period_end(fiscal_year: int, period_end: date) -> bool: """ Validate that fiscal_year is reasonable given period_end. This handles SEC Facts API data quality issues where comparative periods are mislabeled with incorrect fiscal_year values (Issue #452). Args: fiscal_year: The fiscal year from the fact period_end: The period end date Returns: True if the fiscal_year/period_end combination is valid, False otherwise Examples: >>> # Early January period (52/53-week calendar) >>> validate_fiscal_year_period_end(2022, date(2023, 1, 1)) True >>> validate_fiscal_year_period_end(2023, date(2023, 1, 1)) True >>> validate_fiscal_year_period_end(2024, date(2023, 1, 1)) False >>> # Late December period >>> validate_fiscal_year_period_end(2023, date(2023, 12, 31)) True >>> validate_fiscal_year_period_end(2024, date(2023, 12, 31)) True >>> # Normal period >>> validate_fiscal_year_period_end(2023, date(2023, 6, 30)) True >>> validate_fiscal_year_period_end(2025, date(2023, 6, 30)) False """ year_diff = fiscal_year - period_end.year # Early January (Jan 1-7): fiscal_year should be year-1 (52/53-week calendar) or year # Example: Period ending Jan 1, 2023 → FY 2022 (most common) or FY 2023 (edge case) if period_end.month == 1 and period_end.day <= 7: return year_diff in (-1, 0) # Late December (Dec 25-31): fiscal_year should be year or year+1 # Example: Period ending Dec 31, 2023 → FY 2023 (most common) or FY 2024 (year-end shifts) elif period_end.month == 12 and period_end.day >= 25: return year_diff in (0, 1) # All other dates: fiscal_year should match period_end.year exactly else: return year_diff == 0 def validate_quarterly_period_end(fiscal_period: str, period_end: date, fiscal_year_end_month: int = 12) -> bool: """ Validate that period_end matches the expected month for the fiscal_period. This filters out comparative period data that's mislabeled with incorrect fiscal_period values in the SEC Facts API. Args: fiscal_period: The fiscal period (Q1, Q2, Q3, Q4, FY) period_end: The period end date fiscal_year_end_month: Company's fiscal year end month (default: 12) Returns: True if period_end matches expected month for fiscal_period Examples: >>> # Apple (fiscal year ends in September, month 9) >>> validate_quarterly_period_end('Q3', date(2025, 6, 28), 9) True # Q3 should end in June (3 months before Sept) >>> validate_quarterly_period_end('Q3', date(2024, 9, 28), 9) False # This is Q4, not Q3 """ if fiscal_period == 'FY': # FY should match fiscal year end month return period_end.month == fiscal_year_end_month # Calculate expected month for each quarter based on fiscal year end # Q4 ends in fiscal year end month # Q3 ends 3 months before that # Q2 ends 6 months before that # Q1 ends 9 months before that quarter_offsets = { 'Q1': -9, # 9 months before fiscal year end 'Q2': -6, # 6 months before fiscal year end 'Q3': -3, # 3 months before fiscal year end 'Q4': 0 # Fiscal year end month } if fiscal_period not in quarter_offsets: return False # Calculate expected month offset = quarter_offsets[fiscal_period] expected_month = fiscal_year_end_month + offset # Handle month wrapping if expected_month <= 0: expected_month += 12 elif expected_month > 12: expected_month -= 12 # Allow ±1 month flexibility for 52/53-week calendars month_diff = abs(period_end.month - expected_month) # Handle wrap-around (e.g., month 12 vs month 1 is only 1 month apart) if month_diff > 6: month_diff = 12 - month_diff return month_diff <= 1 def detect_fiscal_year_end(facts: List[FinancialFact]) -> int: """ Detect company's fiscal year end month from FY period_end dates. Returns: Most common month from FY period_end dates (default: 12) """ from collections import Counter # Get all FY facts with period_end fy_facts = [f for f in facts if f.fiscal_period == 'FY' and f.period_end] if not fy_facts: return 12 # Default to December # Find most common period_end month months = [f.period_end.month for f in fy_facts] most_common = Counter(months).most_common(1) return most_common[0][0] if most_common else 12 def calculate_fiscal_year_for_label(period_end: date, fiscal_year_end_month: int) -> int: """ Calculate the fiscal year for period labels based on period_end date. This function addresses Issue #460 where quarterly labels showed incorrect fiscal years because the SEC Facts API provides forward-looking fiscal_year values (the year the quarter contributes to), not the year for labeling purposes. For quarterly periods, the fiscal year label should reflect when the period occurred, not which fiscal year it contributes to. This mirrors the logic from validate_fiscal_year_period_end() but calculates the appropriate fiscal year for labels. Args: period_end: The period end date fiscal_year_end_month: Company's fiscal year end month (1-12) Returns: The fiscal year to use for labeling this period Examples: >>> # Apple (fiscal year ends in September) >>> # Q3 ending June 28, 2024 >>> calculate_fiscal_year_for_label(date(2024, 6, 28), 9) 2024 # Q3 2024, not Q3 2025 >>> # Q4 ending September 28, 2024 >>> calculate_fiscal_year_for_label(date(2024, 9, 28), 9) 2024 # Q4 2024 (fiscal year end) >>> # Q1 ending December 30, 2023 >>> calculate_fiscal_year_for_label(date(2023, 12, 30), 9) 2024 # Q1 2024 (first quarter of FY 2024) >>> # Early January period (52/53-week calendar edge case) >>> calculate_fiscal_year_for_label(date(2023, 1, 1), 12) 2022 # FY 2022 (52/53-week calendar convention) """ # Early January (Jan 1-7): Use prior year (52/53-week calendar convention) if period_end.month == 1 and period_end.day <= 7: return period_end.year - 1 # If period_end is in a month AFTER fiscal year end, it's the NEXT fiscal year # Example: Apple FY ends Sept (month 9) # - Period ending Oct 2023 (month 10) → FY 2024 (first quarter of new fiscal year) # - Period ending Sept 2023 (month 9) → FY 2023 (end of fiscal year) # - Period ending June 2024 (month 6) → FY 2024 (third quarter) if period_end.month > fiscal_year_end_month: # Period is after fiscal year end, so it's in the next fiscal year # Example: Sept FY end, period ends in Oct/Nov/Dec → next year return period_end.year + 1 else: # Period is at or before fiscal year end, use calendar year return period_end.year class EnhancedStatementBuilder: """ Builds multi-period statements with hierarchical structure using learned mappings. """ # Essential concepts that should always be shown if they have data ESSENTIAL_CONCEPTS = { 'BalanceSheet': { # Working Capital 'AccountsReceivable', 'AccountsReceivableNetCurrent', 'Inventory', 'InventoryNet', 'AccountsPayable', 'AccountsPayableCurrent', # Debt 'LongTermDebt', 'LongTermDebtNoncurrent', 'LongTermDebtCurrent', 'ShortTermDebt', 'ShortTermBorrowings', # Equity 'CommonStockSharesOutstanding', 'CommonStockValue', 'RetainedEarningsAccumulatedDeficit', # Other important 'IntangibleAssetsNetExcludingGoodwill', 'Goodwill', 'DeferredRevenueCurrent', 'DeferredRevenueNoncurrent', 'PropertyPlantAndEquipmentNet' }, 'IncomeStatement': { 'CostOfRevenue', 'CostOfGoodsAndServicesSold', 'GrossProfit', 'ResearchAndDevelopmentExpense', 'SellingGeneralAndAdministrativeExpense', 'InterestExpense', 'InterestIncome', 'OtherNonoperatingIncomeExpense' }, 'CashFlowStatement': { # Key adjustments 'DepreciationDepletionAndAmortization', 'DepreciationAndAmortization', # Investment activities 'CapitalExpendituresIncurredButNotYetPaid', 'PaymentsToAcquirePropertyPlantAndEquipment', 'PaymentsToAcquireBusinessesNetOfCashAcquired', 'BusinessAcquisitionsNetOfCashAcquired', # Financing activities 'DividendsPaid', 'PaymentsOfDividends', 'PaymentsOfDividendsCommonStock', 'PaymentsForRepurchaseOfCommonStock', 'PaymentsForRepurchaseOfEquity', 'ProceedsFromIssuanceOfLongTermDebt', 'RepaymentsOfLongTermDebt', # Working capital changes 'IncreaseDecreaseInAccountsReceivable', 'IncreaseDecreaseInInventories', 'IncreaseDecreaseInAccountsPayable' } } # Common concept name variations that should be normalized CONCEPT_NORMALIZATIONS = { # Cost concepts 'CostOfGoodsAndServicesSold': 'CostOfRevenue', 'CostOfGoodsSold': 'CostOfRevenue', 'CostOfSales': 'CostOfRevenue', # Receivables 'AccountsReceivableNetCurrent': 'AccountsReceivable', 'AccountsReceivableNet': 'AccountsReceivable', # Payables 'AccountsPayableCurrent': 'AccountsPayable', # Inventory 'InventoryNet': 'Inventory', # Debt concepts 'LongTermDebtNoncurrent': 'LongTermDebt', 'LongTermDebtAndCapitalLeaseObligations': 'LongTermDebt', 'ShortTermBorrowings': 'ShortTermDebt', # Depreciation concepts 'DepreciationDepletionAndAmortization': 'DepreciationAndAmortization', # Capital expenditure concepts 'PaymentsToAcquirePropertyPlantAndEquipment': 'CapitalExpenditures', 'CapitalExpendituresIncurredButNotYetPaid': 'CapitalExpenditures', # Dividend concepts 'PaymentsOfDividends': 'DividendsPaid', 'PaymentsForDividends': 'DividendsPaid', 'PaymentsOfDividendsCommonStock': 'DividendsPaid', # Share repurchase 'PaymentsForRepurchaseOfEquity': 'PaymentsForRepurchaseOfCommonStock' } def __init__(self): self.learned_mappings = load_learned_mappings() self.virtual_trees = load_virtual_trees() def _normalize_concept(self, concept: str) -> str: """Normalize concept names for matching.""" # Remove namespace prefix if ':' in concept: concept = concept.split(':')[-1] # Apply normalization mappings return self.CONCEPT_NORMALIZATIONS.get(concept, concept) def _is_essential_concept(self, concept: str, statement_type: str) -> bool: """Check if concept is essential for this statement type.""" essential = self.ESSENTIAL_CONCEPTS.get(statement_type, set()) normalized = self._normalize_concept(concept) return normalized in essential or concept in essential def build_multi_period_statement(self, facts: List[FinancialFact], statement_type: str, periods: int = 4, annual: bool = True) -> MultiPeriodStatement: """ Build a multi-period statement with hierarchical structure. Args: facts: List of all facts statement_type: Type of statement periods: Number of periods to include annual: Prefer annual periods over quarterly Returns: MultiPeriodStatement with hierarchical structure and multiple periods """ # Filter facts by statement type # Handle both 'CashFlow' and 'CashFlowStatement' for compatibility if statement_type == 'CashFlow': stmt_facts = [f for f in facts if f.statement_type in ['CashFlow', 'CashFlowStatement']] else: stmt_facts = [f for f in facts if f.statement_type == statement_type] # Use the same logic as FactQuery.latest_periods for consistency # Group facts by unique periods and calculate period info # FIX: Use period_end as part of the key to keep all variations period_info = {} period_facts = defaultdict(list) for fact in stmt_facts: # Include period_end in the key to avoid losing different period_end variations period_key = (fact.fiscal_year, fact.fiscal_period, fact.period_end) # Make period label unique by including period_end when there are duplicates period_label = f"{fact.fiscal_period} {fact.fiscal_year}" # Store period metadata for each unique combination if period_key not in period_info: period_info[period_key] = { 'label': period_label, 'end_date': fact.period_end or date.max, 'is_annual': fact.fiscal_period == 'FY', 'filing_date': fact.filing_date or date.min, 'fiscal_year': fact.fiscal_year, 'fiscal_period': fact.fiscal_period } # Store facts by the unique period key instead of label period_facts[period_key].append(fact) # Create list of periods with their metadata period_list = [] for period_key, info in period_info.items(): period_list.append((period_key, info)) # Detect fiscal year end month for label calculation (Issue #460) # This needs to be calculated before the annual/quarterly split so it's available for both paths fiscal_year_end_month = detect_fiscal_year_end(stmt_facts) if annual: # When annual=True, filter for TRUE annual periods using duration # Some facts are marked as FY but are actually quarterly (90 days vs 363+ days) true_annual_periods = [] for pk, info in period_list: if not info['is_annual']: continue # pk is now (fiscal_year, fiscal_period, period_end) fiscal_year = pk[0] period_end_date = pk[2] # Validate fiscal_year against period_end to filter out mislabeled comparative data # Issue #452: SEC Facts API has inconsistent fiscal_year values for comparatives if not period_end_date: continue # Use strict validation to reject invalid fiscal_year/period_end combinations if not validate_fiscal_year_period_end(fiscal_year, period_end_date): log.debug( f"Skipping invalid fiscal_year={fiscal_year} for period_end={period_end_date} " f"(likely mislabeled comparative data - Issue #452)" ) continue # Skip mislabeled comparative data # Get a fact from this period to check duration period_fact_list = period_facts.get(pk, []) if period_fact_list: # Check if this is truly annual by looking at period duration sample_fact = period_fact_list[0] if sample_fact.period_start and sample_fact.period_end: duration = (sample_fact.period_end - sample_fact.period_start).days # Annual periods are typically 360-370 days, quarterly are ~90 days if duration > 300: # This is truly annual true_annual_periods.append((pk, info)) elif not sample_fact.period_start: # If no period_start, assume it's annual if marked as FY # (this handles instant facts like balance sheet items) true_annual_periods.append((pk, info)) # Group by period year and select most recent comprehensive filing # This approach combines availability (comprehensive data) with recency (latest corrections) # Issue #452: When multiple periods exist for same year (e.g., Jan 1 and Dec 31 both in 2023), # prefer the period where fiscal_year best matches expected value annual_by_period_year = {} for pk, info in true_annual_periods: fiscal_year = pk[0] period_end_date = pk[2] period_year = period_end_date.year if period_end_date else None if period_year: facts_for_period = period_facts.get(pk, []) filing_date = info.get('filing_date') # Only consider periods with substantial data (≥5 facts) to avoid sparse comparative data if len(facts_for_period) >= 5: should_replace = False if period_year not in annual_by_period_year: should_replace = True else: existing_pk, existing_info = annual_by_period_year[period_year] existing_fiscal_year = existing_pk[0] existing_period_end = existing_pk[2] existing_filing_date = existing_info.get('filing_date') # Prefer period where fiscal_year matches expected value # For early January: expect fiscal_year = year - 1 # For normal dates: expect fiscal_year = year is_early_jan = period_end_date.month == 1 and period_end_date.day <= 7 existing_is_early_jan = existing_period_end.month == 1 and existing_period_end.day <= 7 expected_fy = period_year - 1 if is_early_jan else period_year existing_expected_fy = period_year - 1 if existing_is_early_jan else period_year # Score: 0 = matches expected, 1 = doesn't match score = 0 if fiscal_year == expected_fy else 1 existing_score = 0 if existing_fiscal_year == existing_expected_fy else 1 # Replace if current period has better score, or same score but newer filing if score < existing_score: should_replace = True elif score == existing_score and filing_date and existing_filing_date and filing_date > existing_filing_date: should_replace = True if should_replace: annual_by_period_year[period_year] = (pk, info) # Sort by period year (descending) and select sorted_periods = sorted(annual_by_period_year.items(), key=lambda x: x[0], reverse=True) selected_period_info = [period_info for year, period_info in sorted_periods[:periods]] else: # Quarterly mode: Filter out comparative data by validating period_end # fiscal_year_end_month was already calculated at line 1223 and is in scope here valid_quarterly_periods = [] for pk, info in period_list: fiscal_period = info['fiscal_period'] period_end_date = pk[2] # pk is (fiscal_year, fiscal_period, period_end) # Skip if no period_end if not period_end_date: continue # Skip FY periods - we only want Q1/Q2/Q3/Q4 for quarterly mode if fiscal_period == 'FY': continue # Validate period_end matches expected month for fiscal_period if validate_quarterly_period_end(fiscal_period, period_end_date, fiscal_year_end_month): valid_quarterly_periods.append((pk, info)) else: log.debug( f"Skipping invalid period_end={period_end_date} for fiscal_period={fiscal_period} " f"(likely comparative data)" ) # Group by fiscal period label and keep most recent # FIX for Issue #460: Calculate fiscal_year from period_end for quarterly labels quarterly_by_period = {} for pk, info in valid_quarterly_periods: fiscal_period = pk[1] period_end_date = pk[2] # Calculate correct fiscal year for label based on period_end # This fixes Issue #460 where SEC's forward-looking fiscal_year caused # quarterly labels to show 1 year ahead (Q3 2025 instead of Q3 2024) calculated_fiscal_year = calculate_fiscal_year_for_label( period_end_date, fiscal_year_end_month ) period_label = f"{fiscal_period} {calculated_fiscal_year}" # Store the calculated fiscal year in info for later use info_with_calculated_fy = info.copy() info_with_calculated_fy['calculated_fiscal_year'] = calculated_fiscal_year if period_label not in quarterly_by_period: quarterly_by_period[period_label] = (pk, info_with_calculated_fy) else: # If duplicate valid periods exist, prefer most recent filing_date existing_pk, existing_info = quarterly_by_period[period_label] if info['filing_date'] > existing_info['filing_date']: quarterly_by_period[period_label] = (pk, info_with_calculated_fy) # Sort by period end date (newest first) and select requested number sorted_periods = sorted( quarterly_by_period.values(), key=lambda x: x[1]['end_date'], reverse=True ) selected_period_info = sorted_periods[:periods] # Extract period labels and build a mapping for the selected periods # For annual periods, use the fiscal year from facts (most reliable) # For quarterly periods, calculate fiscal year from period_end (Issue #460) selected_periods = [] for pk, info in selected_period_info: if annual and info.get('is_annual') and pk[2]: # pk[2] is period_end # Use fiscal_year from facts if available (handles 52/53-week calendars correctly) # Falls back to period_end.year with early January adjustment for edge cases if 'fiscal_year' in info and info['fiscal_year']: label = f"FY {info['fiscal_year']}" else: period_end = pk[2] # For periods ending Jan 1-7, use prior year (52/53-week calendar convention) # This handles cases like fiscal year ending Jan 1, 2023 being FY 2022 if period_end.month == 1 and period_end.day <= 7: label = f"FY {period_end.year - 1}" else: label = f"FY {period_end.year}" elif not annual and pk[2]: # FIX for Issue #460: For quarterly periods, use the calculated fiscal year # that was stored during grouping (avoids recalculation) fiscal_period = pk[1] period_end = pk[2] calculated_fiscal_year = info.get('calculated_fiscal_year') if calculated_fiscal_year is not None: label = f"{fiscal_period} {calculated_fiscal_year}" else: # Fallback: calculate if not found (shouldn't happen for quarterly) calculated_fiscal_year = calculate_fiscal_year_for_label( period_end, fiscal_year_end_month ) label = f"{fiscal_period} {calculated_fiscal_year}" else: label = info['label'] selected_periods.append(label) # Create a new period_facts dict with labels as keys for the selected periods # CRITICAL: For annual periods, filter facts to only include those with duration > 300 days period_facts_by_label = defaultdict(list) for i, (period_key, info) in enumerate(selected_period_info): label = selected_periods[i] # Use the corrected label facts_for_period = period_facts.get(period_key, []) # If this is an annual period, filter to only include annual facts if annual and info.get('is_annual'): filtered_facts = [] for fact in facts_for_period: # Keep facts with annual duration (>300 days) or instant facts (no period_start) if fact.period_start and fact.period_end: duration = (fact.period_end - fact.period_start).days if duration > 300: filtered_facts.append(fact) else: # Instant facts (balance sheet items) don't have duration filtered_facts.append(fact) period_facts_by_label[label] = filtered_facts else: period_facts_by_label[label] = facts_for_period # Build hierarchical structure using canonical template # Handle statement type naming inconsistencies # Map fact statement types to virtual tree keys statement_type_mapping = { 'CashFlow': 'CashFlowStatement', 'IncomeStatement': 'IncomeStatement', 'BalanceSheet': 'BalanceSheet', 'ComprehensiveIncome': 'ComprehensiveIncome', 'StatementOfEquity': 'StatementOfEquity' } virtual_tree_key = statement_type_mapping.get(statement_type, statement_type) # Also try the exact statement type if mapping doesn't exist if virtual_tree_key not in self.virtual_trees and statement_type in self.virtual_trees: virtual_tree_key = statement_type if virtual_tree_key in self.virtual_trees: items = self._build_with_canonical(period_facts_by_label, selected_periods, virtual_tree_key) canonical_coverage = self._calculate_coverage(stmt_facts, virtual_tree_key) else: items = self._build_from_facts(period_facts_by_label, selected_periods) canonical_coverage = 0.0 return MultiPeriodStatement( statement_type=statement_type, periods=selected_periods, items=items, canonical_coverage=canonical_coverage ) def _build_with_canonical(self, period_facts: Dict[str, List[FinancialFact]], periods: List[str], virtual_tree_key: str) -> List[MultiPeriodItem]: """Build items using canonical structure.""" virtual_tree = self.virtual_trees[virtual_tree_key] items = [] # Create fact maps for each period period_maps = {} for period in periods: period_maps[period] = self._create_fact_map(period_facts.get(period, [])) # For Income Statement, promote essential concepts to top level for visibility if virtual_tree_key == 'IncomeStatement': items = self._build_with_promoted_concepts( virtual_tree, period_maps, periods, virtual_tree_key ) else: # Process root nodes normally for other statements for root_concept in virtual_tree.get('roots', []): item = self._build_canonical_item( root_concept, virtual_tree['nodes'], period_maps, periods, depth=0, statement_type=virtual_tree_key ) if item: items.append(item) # Add orphan facts that have values but aren't in the virtual tree orphan_section = self._add_orphan_facts( period_maps, virtual_tree.get('nodes', {}), periods, virtual_tree_key ) if orphan_section: items.append(orphan_section) # Add calculated metrics for Income Statement if virtual_tree_key == 'IncomeStatement': calculated_items = self._add_calculated_metrics(period_maps, periods, items) if calculated_items: items.extend(calculated_items) # Apply smart aggregation to parent nodes for item in items: self._apply_smart_aggregation(item) # Remove redundant table duplicates for cleaner presentation items = self._deduplicate_table_items(items) return items def _build_with_promoted_concepts(self, virtual_tree: Dict, period_maps: Dict[str, Dict[str, FinancialFact]], periods: List[str], statement_type: str) -> List[MultiPeriodItem]: """Build Income Statement with essential concepts promoted to top level.""" items = [] nodes = virtual_tree['nodes'] # Essential revenue/income concepts to promote ESSENTIAL_CONCEPTS = [ # Revenue concepts (in priority order) 'RevenueFromContractWithCustomerExcludingAssessedTax', 'SalesRevenueNet', 'Revenues', # Cost concepts 'CostOfGoodsAndServicesSold', 'CostOfRevenue', # Profit concepts 'GrossProfit', 'OperatingIncomeLoss', 'NetIncomeLoss', # Earnings per share 'EarningsPerShareBasic', 'EarningsPerShareDiluted' ] # Revenue concepts for deduplication (in priority order) REVENUE_CONCEPTS = [ 'RevenueFromContractWithCustomerExcludingAssessedTax', 'SalesRevenueNet', 'Revenues' ] # First, add the abstract root for structure for root_concept in virtual_tree.get('roots', []): if 'Abstract' in root_concept: item = self._build_canonical_item( root_concept, nodes, period_maps, periods, depth=0, statement_type=statement_type ) if item: # Clear children to rebuild with promoted concepts item.children = [] # Handle revenue deduplication first promoted_added = set() revenue_item = self._create_deduplicated_revenue_item( REVENUE_CONCEPTS, nodes, period_maps, periods, statement_type ) if revenue_item: item.children.append(revenue_item) # Mark all revenue concepts as processed promoted_added.update(REVENUE_CONCEPTS) # Add other promoted concepts that have values for concept in ESSENTIAL_CONCEPTS: if concept not in promoted_added and concept in nodes: # Check if it has values in any period has_values = any( concept in period_maps[p] for p in periods ) if has_values: promoted_item = self._build_canonical_item( concept, nodes, period_maps, periods, depth=1, statement_type=statement_type ) if promoted_item: # Override label for better display if concept == 'CostOfGoodsAndServicesSold': promoted_item.label = 'Cost of Revenue' promoted_item.children = [] # Don't show deep hierarchy item.children.append(promoted_item) promoted_added.add(concept) # Then add other important concepts not in essential list for child_concept in nodes.get(root_concept, {}).get('children', []): if child_concept not in promoted_added: child_item = self._build_canonical_item( child_concept, nodes, period_maps, periods, depth=1, statement_type=statement_type ) if child_item: item.children.append(child_item) items.append(item) break # If no abstract root, just build normally if not items: for root_concept in virtual_tree.get('roots', []): item = self._build_canonical_item( root_concept, nodes, period_maps, periods, depth=0, statement_type=statement_type ) if item: items.append(item) return items def _create_deduplicated_revenue_item(self, revenue_concepts: List[str], nodes: Dict[str, Any], period_maps: Dict[str, Dict[str, FinancialFact]], periods: List[str], statement_type: str) -> Optional[MultiPeriodItem]: """ Create a single deduplicated revenue item by combining multiple revenue concepts. This method implements revenue deduplication for the Facts API path, similar to what was done for XBRL processing. It combines revenue from different concepts across periods to show comprehensive revenue data. When no explicit revenue concepts exist, it attempts to calculate revenue from GrossProfit + CostOfRevenue. Args: revenue_concepts: List of revenue concepts in priority order nodes: Virtual tree nodes period_maps: Period-mapped fact data periods: List of periods statement_type: Statement type Returns: Single MultiPeriodItem with deduplicated revenue data or None if no revenue found """ # Collect all revenue values across all concepts and periods consolidated_values = {} best_label = "Total Revenue" # Default label has_any_revenue = False # Track which concept provides data for each period (for debugging/transparency) source_tracking = {} for period in periods: period_value = None source_concept = None # Try explicit revenue concepts in priority order for this period for concept in revenue_concepts: if concept in period_maps[period]: fact = period_maps[period][concept] if fact.numeric_value is not None: period_value = fact.numeric_value source_concept = concept has_any_revenue = True # Use the label from the first concept we find if period_value is not None and not source_tracking: best_label = fact.label if fact.label else "Total Revenue" break # Found value for this period, use highest priority # If no explicit revenue found, try to calculate from GrossProfit + CostOfRevenue if period_value is None: gross_profit = None cost_of_revenue = None # Look for GrossProfit if 'GrossProfit' in period_maps[period]: gross_profit_fact = period_maps[period]['GrossProfit'] gross_profit = gross_profit_fact.numeric_value # Look for CostOfRevenue if 'CostOfRevenue' in period_maps[period]: cost_fact = period_maps[period]['CostOfRevenue'] cost_of_revenue = cost_fact.numeric_value # Calculate revenue if both components are available if gross_profit is not None and cost_of_revenue is not None: period_value = gross_profit + cost_of_revenue source_concept = 'Calculated_Revenue' has_any_revenue = True # Debug output (disabled) # print(f"DEBUG: Calculated revenue for {period}: ${period_value:,} (GP: ${gross_profit:,} + CoR: ${cost_of_revenue:,})") consolidated_values[period] = period_value if source_concept: source_tracking[period] = source_concept if not has_any_revenue: return None # Override label to be more descriptive best_label = "Total Revenue" # Find the highest priority concept that has data to determine other properties primary_concept = None for concept in revenue_concepts: if any(concept in period_maps[p] for p in periods): primary_concept = concept break # If no explicit revenue concepts, use a calculated concept identifier if not primary_concept: primary_concept = 'TotalRevenue_Consolidated' # Create the deduplicated revenue item revenue_item = MultiPeriodItem( concept=primary_concept, # Use the highest priority concept as the base label=best_label, values=consolidated_values, depth=1, parent_concept=None, is_abstract=False, is_total=True, # Revenue is typically a total section=None, confidence=0.95, # High confidence for deduplicated revenue children=[] ) return revenue_item def _build_canonical_item(self, concept: str, nodes: Dict[str, Any], period_maps: Dict[str, Dict[str, FinancialFact]], periods: List[str], depth: int = 0, statement_type: str = None) -> Optional[MultiPeriodItem]: """Build a single canonical item with multi-period values.""" node = nodes.get(concept, {}) # Get values for each period # Check both original concept and normalized version values = {} has_any_value = False for period in periods: # Try original concept first fact = period_maps[period].get(concept) # If not found, try normalized version if not fact: normalized = self._normalize_concept(concept) fact = period_maps[period].get(normalized) if fact: values[period] = fact.numeric_value has_any_value = True else: values[period] = None # Get label from first fact or node label = None for period in periods: fact = period_maps[period].get(concept) if fact: label = fact.label break if not label: label = node.get('label', concept) # Process children first to see if any have values children_items = [] for child_concept in node.get('children', []): child_item = self._build_canonical_item( child_concept, nodes, period_maps, periods, depth + 1, statement_type=statement_type ) if child_item: children_items.append(child_item) # Determine if we should include this node # Include if ANY of these are true: # 1. It has values # 2. It's abstract (structural node) # 3. It has children with values # 4. It's an essential concept for investors # 5. It has reasonable occurrence rate (>= 0.3) is_essential = statement_type and self._is_essential_concept(concept, statement_type) if not has_any_value and not node.get('is_abstract'): # Skip only if ALL of these are true: # - Not essential # - Low occurrence rate # - No children with values if not is_essential and node.get('occurrence_rate', 0) < 0.3 and not children_items: return None item = MultiPeriodItem( concept=concept, label=label, values=values, depth=depth, parent_concept=None, is_abstract=node.get('is_abstract', False), is_total=node.get('is_total', False), section=node.get('section'), confidence=node.get('occurrence_rate', 1.0), children=children_items ) return item def _add_orphan_facts(self, period_maps: Dict[str, Dict[str, FinancialFact]], virtual_tree_nodes: Dict[str, Any], periods: List[str], statement_type: str) -> Optional[MultiPeriodItem]: """Add valuable facts not in virtual tree as 'Additional Items' section.""" # Find all concepts that have values but aren't in the virtual tree orphan_concepts = set() for period_map in period_maps.values(): for concept in period_map.keys(): # Skip if already in virtual tree if concept not in virtual_tree_nodes: # Check if this is an essential or important concept if self._is_important_orphan(concept, statement_type): orphan_concepts.add(concept) if not orphan_concepts: return None # Create orphan section orphan_section = MultiPeriodItem( concept='AdditionalItems', label='Additional Financial Items', values={}, depth=0, parent_concept=None, is_abstract=True, is_total=False, section='Additional', confidence=1.0 ) # Add each orphan concept as a child for concept in sorted(orphan_concepts): # Get values for each period values = {} label = None has_values = False for period in periods: fact = period_maps[period].get(concept) if fact: values[period] = fact.numeric_value has_values = True if not label: label = fact.label else: values[period] = None if has_values: orphan_item = MultiPeriodItem( concept=concept, label=label or concept, values=values, depth=1, parent_concept='AdditionalItems', is_abstract=False, is_total=self._is_total_concept(concept, label), section='Additional', confidence=0.5 # Lower confidence for orphan facts ) orphan_section.children.append(orphan_item) # Only return if we have actual orphan items return orphan_section if orphan_section.children else None def _is_important_orphan(self, concept: str, statement_type: str) -> bool: """Determine if an orphan concept is important enough to display.""" # Check if it's an essential concept if self._is_essential_concept(concept, statement_type): return True # Check if it's a normalized version of an essential concept normalized = self._normalize_concept(concept) if normalized != concept and self._is_essential_concept(normalized, statement_type): return True # Additional important concepts not in essential list but valuable important_keywords = [ # Balance Sheet 'Debt', 'Receivable', 'Payable', 'Inventory', 'Investment', 'Deferred', 'Accrued', 'Prepaid', 'Goodwill', 'Intangible', # Income Statement 'Revenue', 'Sales', 'Cost', 'Expense', 'Income', 'Profit', 'Loss', 'Research', 'Marketing', 'Administrative', 'Interest', 'Tax', # Cash Flow 'Depreciation', 'Amortization', 'Capital', 'Dividend', 'Acquisition', 'Repurchase', 'Proceeds', 'Payments', 'Working' ] concept_lower = concept.lower() return any(keyword.lower() in concept_lower for keyword in important_keywords) def _is_total_concept(self, concept: str, label: str = None) -> bool: """Determine if a concept represents a total.""" indicators = ['total', 'net', 'gross', 'subtotal', 'aggregate'] concept_lower = concept.lower() label_lower = (label or '').lower() return any(ind in concept_lower or ind in label_lower for ind in indicators) def _add_calculated_metrics(self, period_maps: Dict[str, Dict[str, FinancialFact]], periods: List[str], existing_items: List[MultiPeriodItem]) -> List[MultiPeriodItem]: """Add calculated metrics like Gross Profit if not already present.""" calculated_items = [] # Check if GrossProfit exists in items has_gross_profit = any( self._find_item_by_concept(item, 'GrossProfit') for item in existing_items ) if not has_gross_profit: # Try to calculate Gross Profit = Revenue - Cost of Revenue gross_profit_values = {} has_values = False for period in periods: period_map = period_maps[period] # Find revenue (try various concepts) revenue = None revenue_concepts = [ 'RevenueFromContractWithCustomerExcludingAssessedTax', 'Revenues', 'Revenue', 'SalesRevenueNet', 'TotalRevenues' ] for concept in revenue_concepts: if concept in period_map: revenue = period_map[concept].numeric_value break # Find cost of revenue cost = None cost_concepts = [ 'CostOfRevenue', 'CostOfGoodsAndServicesSold', 'CostOfGoodsSold', 'CostOfSales' ] for concept in cost_concepts: if concept in period_map: cost = period_map[concept].numeric_value break # Calculate if both available if revenue is not None and cost is not None: gross_profit_values[period] = revenue - cost has_values = True else: gross_profit_values[period] = None if has_values: gross_profit_item = MultiPeriodItem( concept='GrossProfit_Calculated', label='Gross Profit (Calculated)', values=gross_profit_values, depth=0, parent_concept=None, is_abstract=False, is_total=True, section='Calculated', confidence=0.8 ) calculated_items.append(gross_profit_item) return calculated_items def _find_item_by_concept(self, item: MultiPeriodItem, concept: str) -> Optional[MultiPeriodItem]: """Recursively find an item by concept name.""" if item.concept == concept: return item for child in item.children: found = self._find_item_by_concept(child, concept) if found: return found return None def _apply_smart_aggregation(self, item: MultiPeriodItem): """Apply smart aggregation to calculate parent values from children.""" # Recursively process children first for child in item.children: self._apply_smart_aggregation(child) # Only aggregate if: # 1. Parent has no values # 2. Parent is not abstract (or is a total) # 3. Has children with values has_any_value = any(v is not None for v in item.values.values()) if not has_any_value and item.children: # Check if this should be aggregated should_aggregate = ( item.is_total or 'total' in item.label.lower() or (not item.is_abstract and self._should_aggregate_children(item)) ) if should_aggregate: # Aggregate values from children for period in item.values.keys(): child_sum = 0 has_child_values = False for child in item.children: child_value = child.values.get(period) if child_value is not None: # Skip if child is also abstract (unless it's a calculated total) if not child.is_abstract or child.is_total: child_sum += child_value has_child_values = True if has_child_values: item.values[period] = child_sum # Mark as aggregated if not item.label.endswith(' (Aggregated)'): item.label = item.label + ' (Aggregated)' def _deduplicate_table_items(self, items: List[MultiPeriodItem]) -> List[MultiPeriodItem]: """ Remove redundant items from Statement [Table] structures when they duplicate primary items. This handles the XBRL quirk where the same concepts appear both: 1. At the top level (primary context) 2. Under Statement [Table] -> Statement [Line Items] (dimensional context) When there are no actual dimensions, these are pure duplicates. """ # First, collect all concepts and their values from non-table contexts primary_concepts = {} def collect_primary_concepts(item: MultiPeriodItem, in_table: bool = False): """Collect concepts that are not in table structures.""" # Check if we're entering a table if 'Table' in item.label and 'Statement' in item.label: in_table = True if not in_table and item.concept and item.values: # Store the concept and its values if any(v is not None for v in item.values.values()): primary_concepts[item.concept] = item.values # Recurse through children for child in item.children: collect_primary_concepts(child, in_table) # Collect all primary (non-table) concepts for item in items: collect_primary_concepts(item) def remove_duplicate_table_items(item: MultiPeriodItem, in_table: bool = False) -> Optional[MultiPeriodItem]: """Remove items from table structures that duplicate primary items.""" # Check if we're entering a table if 'Table' in item.label and 'Statement' in item.label: in_table = True # For table structures, check if ALL children are duplicates # If so, we might want to skip the entire table cleaned_children = [] total_children = 0 duplicate_children = 0 for child in item.children: total_children += 1 cleaned_child = remove_duplicate_table_items(child, in_table) if cleaned_child: cleaned_children.append(cleaned_child) else: duplicate_children += 1 # If most children are duplicates and we have few remaining items, # consider removing the table entirely if cleaned_children and len(cleaned_children) > 2: # Keep the table if it has meaningful content item.children = cleaned_children return item elif not cleaned_children: # Table is entirely duplicates, remove it return None else: # Table has very little unique content, remove it return None # For items within tables, check if they're duplicates if in_table and item.concept in primary_concepts: # Check if values match if item.values == primary_concepts[item.concept]: # This is a duplicate, remove it (but keep exploring children # in case they have unique dimensional breakdowns) has_unique_children = False cleaned_children = [] for child in item.children: cleaned_child = remove_duplicate_table_items(child, in_table) if cleaned_child: cleaned_children.append(cleaned_child) # Check if child has different values if cleaned_child.concept not in primary_concepts or \ cleaned_child.values != primary_concepts.get(cleaned_child.concept): has_unique_children = True if has_unique_children: # Keep this item as a container for unique children item.children = cleaned_children return item else: # Pure duplicate with no unique children return None # For non-duplicate items, clean their children cleaned_children = [] for child in item.children: cleaned_child = remove_duplicate_table_items(child, in_table) if cleaned_child: cleaned_children.append(cleaned_child) item.children = cleaned_children return item # Process all top-level items cleaned_items = [] for item in items: cleaned_item = remove_duplicate_table_items(item) if cleaned_item: cleaned_items.append(cleaned_item) return cleaned_items def _should_aggregate_children(self, item: MultiPeriodItem) -> bool: """Determine if children should be aggregated for this parent.""" # Don't aggregate if children are heterogeneous (mix of assets/liabilities etc) # This is a simplified check - could be more sophisticated aggregatable_parents = [ 'CurrentAssets', 'NonCurrentAssets', 'TotalAssets', 'CurrentLiabilities', 'NonCurrentLiabilities', 'TotalLiabilities', 'OperatingExpenses', 'TotalExpenses', 'TotalRevenue' ] return any(parent in item.concept for parent in aggregatable_parents) def _build_from_facts(self, period_facts: Dict[str, List[FinancialFact]], periods: List[str]) -> List[MultiPeriodItem]: """Build items directly from facts without canonical structure.""" # Simple approach - just list all unique concepts all_concepts = set() concept_labels = {} for period_facts_list in period_facts.values(): for fact in period_facts_list: concept = fact.concept.split(':', 1)[-1] if ':' in fact.concept else fact.concept all_concepts.add(concept) concept_labels[concept] = fact.label items = [] for concept in sorted(all_concepts): values = {} for period in periods: # Find fact for this concept in this period for fact in period_facts.get(period, []): fact_concept = fact.concept.split(':', 1)[-1] if ':' in fact.concept else fact.concept if fact_concept == concept: values[period] = fact.numeric_value break else: values[period] = None item = MultiPeriodItem( concept=concept, label=concept_labels.get(concept, concept), values=values, depth=0, parent_concept=None ) items.append(item) return items def _create_fact_map(self, facts: List[FinancialFact]) -> Dict[str, FinancialFact]: """Create concept -> fact mapping with normalization.""" fact_map = {} for fact in facts: # Get clean concept name without namespace concept = fact.concept.split(':', 1)[-1] if ':' in fact.concept else fact.concept # Store under both original and normalized names # This allows matching both variants fact_map[concept] = fact normalized = self._normalize_concept(concept) if normalized != concept: # Also store under normalized name if different # Prefer normalized if not already present if normalized not in fact_map: fact_map[normalized] = fact # Use most recent fact for duplicates if concept not in fact_map or fact.filing_date > fact_map[concept].filing_date: fact_map[concept] = fact return fact_map def _calculate_coverage(self, facts: List[FinancialFact], virtual_tree_key: str) -> float: """Calculate canonical coverage.""" if virtual_tree_key not in self.virtual_trees: return 0.0 canonical_concepts = set(self.virtual_trees[virtual_tree_key].get('nodes', {}).keys()) if not canonical_concepts: return 0.0 fact_concepts = set() for fact in facts: concept = fact.concept.split(':', 1)[-1] if ':' in fact.concept else fact.concept fact_concepts.add(concept) matched = len(fact_concepts & canonical_concepts) return matched / len(canonical_concepts)