"""
Statement Builder for reconstructing financial statements using canonical structures.

This module provides intelligent statement reconstruction using learned
canonical structures and virtual presentation trees.
"""

import logging
from collections import defaultdict
from dataclasses import dataclass, field
from datetime import date
from typing import Any, Dict, List, Optional, Set

from rich import box
from rich.columns import Columns
from rich.console import Group
from rich.padding import Padding
from rich.panel import Panel
from rich.table import Table
from rich.text import Text

from edgar.entity.mappings_loader import load_canonical_structures, load_virtual_trees
from edgar.entity.models import FinancialFact
from edgar.richtools import repr_rich

log = logging.getLogger(__name__)

# Items whose confidence is strictly below this value are rendered as
# "lower confidence" (dimmed / marked) by every display method in this module.
_LOW_CONFIDENCE_THRESHOLD = 0.8

# A canonical concept that has no matching fact is still emitted when its
# 'occurrence_rate' reaches this value (see _build_canonical_item).
_CORE_OCCURRENCE_THRESHOLD = 0.8

# The rich table rendering does not descend below this nesting depth.
_MAX_TABLE_DEPTH = 3


def _strip_namespace(concept: str) -> str:
    """Return *concept* without the text up to and including its first ':'."""
    return concept.split(':', 1)[1] if ':' in concept else concept


def _value_style(value_str: str, value: Any) -> str:
    """
    Pick the rich style used to colour a formatted value.

    Only dollar-formatted, non-zero numeric values are coloured: red when
    negative, green otherwise. Everything else gets the empty (default) style.
    """
    if value_str.startswith("$") and value and isinstance(value, (int, float)):
        return "red" if value < 0 else "green"
    return ""


@dataclass
class StatementItem:
    """A single item in a reconstructed financial statement."""

    concept: str
    label: str
    value: Optional[float]
    depth: int
    parent_concept: Optional[str]
    children: List['StatementItem'] = field(default_factory=list)

    # Metadata
    is_abstract: bool = False
    is_total: bool = False
    section: Optional[str] = None
    confidence: float = 1.0
    source: str = 'fact'  # 'fact', 'calculated', 'canonical', 'placeholder'

    # Original fact if available
    fact: Optional[FinancialFact] = None

    def to_dict(self) -> Dict[str, Any]:
        """
        Convert to dictionary representation.

        Children are converted recursively. ``parent_concept`` and ``fact``
        are not part of the output.
        """
        return {
            'concept': self.concept,
            'label': self.label,
            'value': self.value,
            'depth': self.depth,
            'is_abstract': self.is_abstract,
            'is_total': self.is_total,
            'section': self.section,
            'confidence': self.confidence,
            'source': self.source,
            'children': [child.to_dict() for child in self.children]
        }

    def get_display_value(self) -> str:
        """
        Get formatted value for display.

        Returns:
            A ``$``-prefixed string scaled to B / M / K when a value is
            present; otherwise ``""`` for abstract items, ``"[Missing]"`` for
            placeholders and ``"-"`` for anything else.
        """
        if self.value is not None:
            magnitude = abs(self.value)
            if magnitude >= 1_000_000_000:
                return f"${self.value/1_000_000_000:.1f}B"
            if magnitude >= 1_000_000:
                return f"${self.value/1_000_000:.1f}M"
            if magnitude >= 1_000:
                return f"${self.value/1_000:.0f}K"
            return f"${self.value:.0f}"
        if self.is_abstract:
            return ""
        if self.source == 'placeholder':
            return "[Missing]"
        return "-"

    def __rich__(self):
        """Create a rich ``Tree`` with this item as root and its children as branches."""
        from rich.tree import Tree

        # Create the node label
        if self.is_abstract:
            label = Text(self.label, style="bold cyan")
        elif self.is_total:
            label = Text(self.label, style="bold yellow")
        else:
            low_confidence = self.confidence < _LOW_CONFIDENCE_THRESHOLD
            style = "dim" if low_confidence else ""
            confidence_marker = " ◦" if low_confidence else ""
            label = Text(f"{self.label}{confidence_marker}", style=style)

        # Append the value unless it is empty or the "-" no-value marker
        value_str = self.get_display_value()
        if value_str and value_str != "-":
            label_with_value = Text.assemble(
                label,
                " ",
                (value_str, _value_style(value_str, self.value))
            )
        else:
            label_with_value = label

        # Create tree with this item as root
        tree = Tree(label_with_value)
        for child in self.children:
            tree.add(child.__rich__())
        return tree

    def __repr__(self) -> str:
        """String representation using rich formatting."""
        return repr_rich(self.__rich__())


@dataclass
class StructuredStatement:
    """A complete structured financial statement."""

    statement_type: str
    fiscal_year: Optional[int]
    fiscal_period: Optional[str]
    period_end: Optional[date]
    items: List[StatementItem]

    # Metadata
    company_name: Optional[str] = None
    cik: Optional[str] = None
    canonical_coverage: float = 0.0
    facts_used: int = 0
    facts_total: int = 0

    def to_dict(self) -> Dict[str, Any]:
        """
        Convert to dictionary representation.

        ``period_end`` is emitted as an ISO-format string (or ``None``) and
        every top-level item is converted with ``StatementItem.to_dict``.
        """
        return {
            'statement_type': self.statement_type,
            'fiscal_year': self.fiscal_year,
            'fiscal_period': self.fiscal_period,
            'period_end': self.period_end.isoformat() if self.period_end else None,
            'company_name': self.company_name,
            'cik': self.cik,
            'canonical_coverage': self.canonical_coverage,
            'facts_used': self.facts_used,
            'facts_total': self.facts_total,
            'items': [item.to_dict() for item in self.items]
        }

    def get_hierarchical_display(self, max_depth: int = 3) -> str:
        """
        Get hierarchical text representation.

        Args:
            max_depth: Deepest nesting level to print; items nested deeper
                than this (and their descendants) are omitted.

        Returns:
            Newline-joined text, one line per item, with totals followed by
            a dashed rule and lower-confidence items suffixed with `` *``.
        """
        lines: List[str] = []

        def add_item(item: StatementItem, indent: int = 0):
            if indent > max_depth:
                return

            indent_str = " " * indent
            value_str = item.get_display_value()

            if item.is_abstract:
                lines.append(f"{indent_str}{item.label}")
            elif item.is_total:
                lines.append(f"{indent_str}{item.label:<40} {value_str:>15}")
                lines.append(f"{indent_str}{'-' * 55}")
            else:
                # Same strict "< threshold" rule as the rich renderers, so an
                # item sitting exactly on the threshold is not flagged here
                # while being shown as fully trusted elsewhere.
                confidence_marker = " *" if item.confidence < _LOW_CONFIDENCE_THRESHOLD else ""
                lines.append(f"{indent_str}{item.label:<40} {value_str:>15}{confidence_marker}")

            for child in item.children:
                add_item(child, indent + 1)

        for item in self.items:
            add_item(item)

        return "\n".join(lines)

    def __rich__(self):
        """
        Create a rich representation of the structured statement.

        The result is a ``Panel`` containing the statement table and, when at
        least one fact was used, a metadata panel plus optional data-quality
        notes laid out in columns.
        """
        # Statement type mapping for better display
        statement_names = {
            'IncomeStatement': 'Income Statement',
            'BalanceSheet': 'Balance Sheet',
            'CashFlow': 'Cash Flow Statement',
            'StatementsOfComprehensiveIncome': 'Comprehensive Income',
            'StatementsOfShareholdersEquity': 'Shareholders Equity'
        }

        # Title: company name when known, generic heading otherwise
        if self.company_name:
            title = Text.assemble((self.company_name, "bold green"))
        else:
            title = Text.assemble(("Financial Statement", "bold"))

        # Subtitle with statement type and period
        statement_display = statement_names.get(self.statement_type, self.statement_type)
        if self.fiscal_period and self.fiscal_year:
            subtitle = f"{statement_display} • {self.fiscal_period} {self.fiscal_year}"
        elif self.period_end:
            subtitle = f"{statement_display} • As of {self.period_end}"
        else:
            subtitle = statement_display

        # Main statement table
        stmt_table = Table(
            box=box.SIMPLE,
            show_header=False,
            padding=(0, 1),
            expand=True
        )
        stmt_table.add_column("Item", style="", ratio=3)
        stmt_table.add_column("Value", justify="right", style="bold", ratio=1)

        def add_item_to_table(item: StatementItem, depth: int = 0):
            """Add an item to the table with proper indentation."""
            indent = " " * depth

            if item.is_abstract:
                # Abstract items are headers
                stmt_table.add_row(
                    Text(f"{indent}{item.label}", style="bold cyan"),
                    ""
                )
            elif item.is_total:
                stmt_table.add_row(
                    Text(f"{indent}{item.label}", style="bold"),
                    Text(item.get_display_value(), style="bold yellow")
                )
                # Add a separator line after top-level totals only
                if depth == 0:
                    stmt_table.add_row("", "")
                    stmt_table.add_row(
                        Text("─" * 40, style="dim"),
                        Text("─" * 15, style="dim")
                    )
            else:
                # Regular items
                low_confidence = item.confidence < _LOW_CONFIDENCE_THRESHOLD
                style = "dim" if low_confidence else ""
                confidence_marker = " ◦" if low_confidence else ""
                label_text = f"{indent}{item.label}{confidence_marker}"

                # Colour code positive/negative values
                value_str = item.get_display_value()
                value_style = _value_style(value_str, item.value)

                stmt_table.add_row(
                    Text(label_text, style=style),
                    Text(value_str, style=value_style) if value_str else ""
                )

            # Add children recursively, limiting depth for display
            if depth < _MAX_TABLE_DEPTH:
                for child in item.children:
                    add_item_to_table(child, depth + 1)

        for item in self.items:
            add_item_to_table(item)

        # Metadata summary
        metadata = Table(box=box.SIMPLE, show_header=False, padding=(0, 1))
        metadata.add_column("Metric", style="dim")
        metadata.add_column("Value", style="bold")

        metadata.add_row("Facts Used", f"{self.facts_used:,}")
        if self.facts_total > 0:
            metadata.add_row("Total Facts", f"{self.facts_total:,}")

        if self.canonical_coverage > 0:
            coverage_pct = self.canonical_coverage * 100
            if coverage_pct >= 50:
                coverage_style = "green"
            elif coverage_pct >= 25:
                coverage_style = "yellow"
            else:
                coverage_style = "red"
            metadata.add_row(
                "Canonical Coverage",
                Text(f"{coverage_pct:.1f}%", style=coverage_style)
            )

        if self.cik:
            metadata.add_row("CIK", self.cik)

        # Data quality indicators (flatten once, reuse for both counts)
        all_items = self._flatten_items()
        quality_notes = []

        low_confidence_count = sum(
            1 for item in all_items
            if not item.is_abstract and item.confidence < _LOW_CONFIDENCE_THRESHOLD
        )
        if low_confidence_count > 0:
            quality_notes.append(
                Text(f"◦ {low_confidence_count} items with lower confidence", style="dim yellow")
            )

        calculated_count = sum(1 for item in all_items if item.source == 'calculated')
        if calculated_count > 0:
            quality_notes.append(
                Text(f"◦ {calculated_count} calculated values", style="dim cyan")
            )

        metadata_panel = Panel(
            metadata,
            title="📊 Statement Metadata",
            border_style="bright_black"
        )

        # Create the main content group
        content_parts = [
            Padding("", (1, 0, 0, 0)),
            stmt_table
        ]

        # Metadata (and quality notes, if any) are shown only when facts were used
        if self.facts_used > 0:
            bottom_content = [metadata_panel]
            if quality_notes:
                quality_panel = Panel(
                    Group(*quality_notes),
                    title="📝 Data Quality Notes",
                    border_style="bright_black"
                )
                bottom_content.append(quality_panel)

            content_parts.append(Padding("", (1, 0)))
            content_parts.append(Columns(bottom_content, equal=True, expand=True))

        content = Group(*content_parts)

        # Create the main panel
        return Panel(
            content,
            title=title,
            subtitle=subtitle,
            border_style="blue",
            expand=True
        )

    def _flatten_items(self) -> List[StatementItem]:
        """Flatten the hierarchical items into a flat list (parents before children)."""
        flat_items: List[StatementItem] = []

        def flatten(item: StatementItem):
            flat_items.append(item)
            for child in item.children:
                flatten(child)

        for item in self.items:
            flatten(item)

        return flat_items

    def __repr__(self) -> str:
        """String representation using rich formatting."""
        return repr_rich(self.__rich__())


class StatementBuilder:
    """
    Builds structured financial statements using canonical templates.

    This class reconstructs complete financial statements by combining
    actual facts with canonical structures, filling in missing concepts
    and maintaining proper hierarchy.
    """

    def __init__(self, cik: Optional[str] = None):
        """
        Initialize the statement builder.

        Args:
            cik: Company CIK for context
        """
        self.cik = cik
        self.canonical_structures = load_canonical_structures()
        self.virtual_trees = load_virtual_trees()

    def build_statement(self,
                        facts: List[FinancialFact],
                        statement_type: str,
                        fiscal_year: Optional[int] = None,
                        fiscal_period: Optional[str] = None,
                        use_canonical: bool = True,
                        include_missing: bool = False) -> StructuredStatement:
        """
        Build a structured financial statement from facts.

        Args:
            facts: List of financial facts
            statement_type: Type of statement (BalanceSheet, IncomeStatement, etc.)
            fiscal_year: Fiscal year to filter for
            fiscal_period: Fiscal period (FY, Q1, Q2, Q3, Q4)
            use_canonical: Whether to use canonical structure for organization
            include_missing: Whether to include placeholder for missing concepts

        Returns:
            StructuredStatement with hierarchical organization
        """
        # Filter facts for this statement and period
        filtered_facts = self._filter_facts(facts, statement_type, fiscal_year, fiscal_period)

        # Create fact lookup
        fact_map = self._create_fact_map(filtered_facts)

        # Get period end date
        period_end = self._get_period_end(filtered_facts)

        if use_canonical and statement_type in self.virtual_trees:
            # Build using canonical structure
            virtual_tree = self.virtual_trees[statement_type]
            items = self._build_with_canonical(fact_map, virtual_tree, include_missing)

            # Facts with no canonical node are appended after the canonical items
            unmatched = self._find_unmatched_facts(fact_map, virtual_tree)
            items.extend(self._create_items_from_facts(unmatched))
        else:
            # Build from facts only
            items = self._build_from_facts(fact_map)

        # Calculate metadata
        facts_used = len(fact_map)
        canonical_coverage = self._calculate_coverage(fact_map, statement_type) if use_canonical else 0.0

        return StructuredStatement(
            statement_type=statement_type,
            fiscal_year=fiscal_year,
            fiscal_period=fiscal_period,
            period_end=period_end,
            items=items,
            cik=self.cik,
            canonical_coverage=canonical_coverage,
            facts_used=facts_used,
            facts_total=len(facts)
        )

    def _filter_facts(self,
                      facts: List[FinancialFact],
                      statement_type: str,
                      fiscal_year: Optional[int],
                      fiscal_period: Optional[str]) -> List[FinancialFact]:
        """
        Filter facts for the requested statement and period.

        A falsy ``fiscal_year`` / ``fiscal_period`` disables that filter.
        """
        return [
            fact for fact in facts
            if fact.statement_type == statement_type
            and (not fiscal_year or fact.fiscal_year == fiscal_year)
            and (not fiscal_period or fact.fiscal_period == fiscal_period)
        ]

    @staticmethod
    def _is_more_recent(candidate: FinancialFact, current: FinancialFact) -> bool:
        """
        Tell whether *candidate* was filed after *current*.

        NOTE(review): guards against a missing ``filing_date`` so the
        comparison cannot raise ``TypeError``; a dated fact always wins over
        an undated one. Confirm whether undated facts can actually occur.
        """
        if candidate.filing_date is None:
            return False
        if current.filing_date is None:
            return True
        return candidate.filing_date > current.filing_date

    def _create_fact_map(self, facts: List[FinancialFact]) -> Dict[str, FinancialFact]:
        """
        Create a map of concept to fact.

        Keys are concept names with any ``prefix:`` removed. When several
        facts share a concept, the one with the latest ``filing_date`` is
        kept (the first one seen wins ties).
        """
        fact_map: Dict[str, FinancialFact] = {}

        for fact in facts:
            concept = _strip_namespace(fact.concept)
            current = fact_map.get(concept)
            if current is None or self._is_more_recent(fact, current):
                fact_map[concept] = fact

        return fact_map

    def _get_period_end(self, facts: List[FinancialFact]) -> Optional[date]:
        """Return the first truthy ``period_end`` among *facts*, or ``None``."""
        return next((fact.period_end for fact in facts if fact.period_end), None)

    def _build_with_canonical(self,
                              fact_map: Dict[str, FinancialFact],
                              virtual_tree: Dict[str, Any],
                              include_missing: bool) -> List[StatementItem]:
        """
        Build statement using canonical structure.

        Walks every concept listed under the tree's ``'roots'`` key and
        returns the items that were produced for them.
        """
        items: List[StatementItem] = []
        processed: Set[str] = set()
        # .get() keeps this consistent with the other readers of 'nodes' and
        # avoids a KeyError on a tree that lists roots but has no node table.
        nodes = virtual_tree.get('nodes', {})

        for root_concept in virtual_tree.get('roots', []):
            item = self._build_canonical_item(
                root_concept,
                nodes,
                fact_map,
                processed,
                include_missing,
                depth=0
            )
            if item:
                items.append(item)

        return items

    def _build_canonical_item(self,
                              concept: str,
                              nodes: Dict[str, Any],
                              fact_map: Dict[str, FinancialFact],
                              processed: Set[str],
                              include_missing: bool,
                              depth: int = 0,
                              parent: Optional[str] = None) -> Optional[StatementItem]:
        """
        Build a single canonical item with children.

        Args:
            concept: Concept to build.
            nodes: Node table of the virtual tree, keyed by concept.
            fact_map: Concept-to-fact lookup.
            processed: Concepts already visited; updated in place so each
                concept is emitted at most once.
            include_missing: Whether concepts without a fact are always kept.
            depth: Nesting depth recorded on the item.
            parent: Concept of the parent item, if any.

        Returns:
            The built item, or ``None`` when the concept was already
            processed or is a missing non-core concrete concept.
        """
        if concept in processed:
            return None
        processed.add(concept)

        node = nodes.get(concept, {})
        fact = fact_map.get(concept)

        # Skip missing concrete concepts unless they are core concepts
        if not fact and not include_missing and not node.get('is_abstract'):
            if node.get('occurrence_rate', 0) < _CORE_OCCURRENCE_THRESHOLD:
                return None

        if fact:
            source = 'fact'
        else:
            source = 'placeholder' if include_missing else 'canonical'

        item = StatementItem(
            concept=concept,
            label=fact.label if fact else node.get('label', concept),
            value=fact.numeric_value if fact else None,
            depth=depth,
            parent_concept=parent,
            is_abstract=node.get('is_abstract', False),
            is_total=node.get('is_total', False),
            section=node.get('section'),
            confidence=node.get('occurrence_rate', 1.0) if not fact else 1.0,
            source=source,
            fact=fact
        )

        # Process children
        for child_concept in node.get('children', []):
            child_item = self._build_canonical_item(
                child_concept,
                nodes,
                fact_map,
                processed,
                include_missing,
                depth + 1,
                concept
            )
            if child_item:
                item.children.append(child_item)

        # Try to calculate total if missing
        if item.is_total and item.value is None and item.children:
            calculated_value = self._calculate_total(item.children)
            if calculated_value is not None:
                item.value = calculated_value
                item.source = 'calculated'

        return item

    def _calculate_total(self, children: List[StatementItem]) -> Optional[float]:
        """
        Calculate total from children values.

        Sums the values of the direct, non-abstract children that have one;
        returns ``None`` when no child contributed a value.
        """
        total = 0
        has_values = False

        for child in children:
            if not child.is_abstract and child.value is not None:
                total += child.value
                has_values = True

        return total if has_values else None

    def _find_unmatched_facts(self,
                              fact_map: Dict[str, FinancialFact],
                              virtual_tree: Dict[str, Any]) -> Dict[str, FinancialFact]:
        """Find facts whose concept is not a key of the tree's ``'nodes'`` table."""
        canonical_concepts = set(virtual_tree.get('nodes', {}).keys())
        return {
            concept: fact
            for concept, fact in fact_map.items()
            if concept not in canonical_concepts
        }

    def _create_items_from_facts(self, facts: Dict[str, FinancialFact]) -> List[StatementItem]:
        """
        Create statement items from unmatched facts.

        Each item is childless, has depth 1 and a confidence of 0.7.
        """
        return [
            StatementItem(
                concept=concept,
                label=fact.label,
                value=fact.numeric_value,
                depth=1,  # Default depth
                parent_concept=None,
                is_abstract=fact.is_abstract,
                is_total=fact.is_total,
                section=fact.section,
                confidence=0.7,  # Lower confidence for unmatched
                source='fact',
                fact=fact
            )
            for concept, fact in facts.items()
        ]

    def _build_from_facts(self, fact_map: Dict[str, FinancialFact]) -> List[StatementItem]:
        """
        Build statement directly from facts without canonical structure.

        Facts are nested under the fact named by their ``parent_concept``.
        A fact becomes a top-level item when it has no parent, when its
        parent is not present in *fact_map*, or when it names itself as
        parent, so that no fact is silently left out of the statement.
        """
        hierarchy: Dict[str, List[str]] = defaultdict(list)
        roots: List[str] = []

        for concept, fact in fact_map.items():
            # fact_map keys have their prefix stripped, so compare like with like
            parent = _strip_namespace(fact.parent_concept) if fact.parent_concept else None
            if parent and parent != concept and parent in fact_map:
                hierarchy[parent].append(concept)
            else:
                roots.append(concept)

        visited: Set[str] = set()
        items: List[StatementItem] = []

        for root_concept in roots:
            item = self._build_fact_item(root_concept, fact_map, hierarchy, visited=visited)
            if item:
                items.append(item)

        # Facts whose parent chain forms a cycle are unreachable from any
        # root; surface them as top-level items instead of dropping them.
        for concept in fact_map:
            if concept not in visited:
                item = self._build_fact_item(concept, fact_map, hierarchy, visited=visited)
                if item:
                    items.append(item)

        return items

    def _build_fact_item(self,
                         concept: str,
                         fact_map: Dict[str, FinancialFact],
                         hierarchy: Dict[str, List[str]],
                         depth: int = 0,
                         visited: Optional[Set[str]] = None) -> Optional[StatementItem]:
        """
        Build item from fact with children.

        Args:
            concept: Concept to build; must be a key of *fact_map*.
            fact_map: Concept-to-fact lookup.
            hierarchy: Parent concept to list of child concepts.
            depth: Nesting depth recorded on the item.
            visited: Optional set of concepts already built. When given it is
                updated in place and a concept found in it is skipped, which
                also stops recursion on cyclic hierarchies.

        Returns:
            The built item, or ``None`` when the concept has no fact or was
            already visited.
        """
        if concept not in fact_map:
            return None

        if visited is not None:
            if concept in visited:
                return None
            visited.add(concept)

        fact = fact_map[concept]
        item = StatementItem(
            concept=concept,
            label=fact.label,
            value=fact.numeric_value,
            depth=depth,
            parent_concept=fact.parent_concept,
            is_abstract=fact.is_abstract,
            is_total=fact.is_total,
            section=fact.section,
            confidence=1.0,
            source='fact',
            fact=fact
        )

        # Add children
        for child_concept in hierarchy.get(concept, []):
            child_item = self._build_fact_item(
                child_concept, fact_map, hierarchy, depth + 1, visited
            )
            if child_item:
                item.children.append(child_item)

        return item

    def _calculate_coverage(self,
                            fact_map: Dict[str, FinancialFact],
                            statement_type: str) -> float:
        """
        Calculate canonical coverage as a fraction between 0.0 and 1.0.

        The fraction is the number of canonical concepts that have a fact,
        divided by the number of canonical concepts for *statement_type*.
        Returns 0.0 when the statement type has no virtual tree or the tree
        has no nodes.
        """
        if statement_type not in self.virtual_trees:
            return 0.0

        canonical_concepts = set(self.virtual_trees[statement_type].get('nodes', {}).keys())
        if not canonical_concepts:
            return 0.0

        matched = len(fact_map.keys() & canonical_concepts)
        return matched / len(canonical_concepts)