Initial commit

2025-12-09 12:13:01 +01:00
commit 8e654ed209
13332 changed files with 2695056 additions and 0 deletions
--- a/venv/lib/python3.10/site-packages/edgar/entity/statement_builder.py
+++ b/venv/lib/python3.10/site-packages/edgar/entity/statement_builder.py
@@ -0,0 +1,731 @@
+"""
+Statement Builder for reconstructing financial statements using canonical structures.
+
+This module provides intelligent statement reconstruction using learned canonical
+structures and virtual presentation trees.
+"""
+
+import logging
+from collections import defaultdict
+from dataclasses import dataclass, field
+from datetime import date
+from typing import Any, Dict, List, Optional, Set
+
+from rich import box
+from rich.columns import Columns
+from rich.console import Group
+from rich.padding import Padding
+from rich.panel import Panel
+from rich.table import Table
+from rich.text import Text
+
+from edgar.entity.mappings_loader import load_canonical_structures, load_virtual_trees
+from edgar.entity.models import FinancialFact
+from edgar.richtools import repr_rich
+
+log = logging.getLogger(__name__)
+
+
+@dataclass
+class StatementItem:
+    """A single item in a reconstructed financial statement."""
+    concept: str
+    label: str
+    value: Optional[float]
+    depth: int
+    parent_concept: Optional[str]
+    children: List['StatementItem'] = field(default_factory=list)
+
+    # Metadata
+    is_abstract: bool = False
+    is_total: bool = False
+    section: Optional[str] = None
+    confidence: float = 1.0
+    source: str = 'fact'  # 'fact', 'calculated', 'canonical', 'placeholder'
+
+    # Original fact if available
+    fact: Optional[FinancialFact] = None
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Serialize this item, and recursively its children, to plain dicts.
+
+        ``parent_concept`` and ``fact`` are not included in the output.
+        """
+        scalar_fields = (
+            'concept', 'label', 'value', 'depth', 'is_abstract',
+            'is_total', 'section', 'confidence', 'source',
+        )
+        payload: Dict[str, Any] = {name: getattr(self, name) for name in scalar_fields}
+        # Children go last so the key order matches the field listing above.
+        payload['children'] = [node.to_dict() for node in self.children]
+        return payload
+
+    def get_display_value(self) -> str:
+        """Return the short, human-readable form of ``value``.
+
+        Numbers are scaled by magnitude and prefixed with ``$``: one decimal
+        with a ``B`` or ``M`` suffix, whole thousands with ``K``, otherwise a
+        whole number. Without a value the result is ``""`` for abstract items,
+        ``"[Missing]"`` for placeholders and ``"-"`` for everything else.
+        """
+        amount = self.value
+
+        # No number to format: choose the stand-in text.
+        if amount is None:
+            if self.is_abstract:
+                return ""
+            return "[Missing]" if self.source == 'placeholder' else "-"
+
+        # The sign stays on the number, so negatives render as "$-1.5M".
+        magnitude = abs(amount)
+        if magnitude >= 1_000_000_000:
+            return f"${amount/1_000_000_000:.1f}B"
+        if magnitude >= 1_000_000:
+            return f"${amount/1_000_000:.1f}M"
+        if magnitude >= 1_000:
+            return f"${amount/1_000:.0f}K"
+        return f"${amount:.0f}"
+
+    def __rich__(self):
+        """Build a ``rich`` tree rooted at this item, with children as branches.
+
+        Abstract items are styled as headers and totals are highlighted.
+        Regular items below 0.8 confidence are dimmed and marked with " ◦".
+        The display value is appended unless it is empty or "-".
+        """
+        from rich.tree import Tree
+
+        low_confidence = self.confidence < 0.8
+        if self.is_abstract:
+            caption = Text(self.label, style="bold cyan")
+        elif self.is_total:
+            caption = Text(self.label, style="bold yellow")
+        else:
+            suffix = " ◦" if low_confidence else ""
+            caption = Text(f"{self.label}{suffix}", style="dim" if low_confidence else "")
+
+        shown = self.get_display_value()
+        if shown and shown != "-":
+            # Only non-zero numeric dollar amounts are coloured by sign.
+            colour = ""
+            if shown.startswith("$") and self.value and isinstance(self.value, (int, float)):
+                colour = "red" if self.value < 0 else "green"
+            caption = Text.assemble(caption, " ", (shown, colour))
+
+        node = Tree(caption)
+        for child in self.children:
+            node.add(child.__rich__())
+        return node
+
+    def __repr__(self) -> str:
+        """Return what ``repr_rich`` produces for this item's rich renderable."""
+        renderable = self.__rich__()
+        return repr_rich(renderable)
+
+
+@dataclass
+class StructuredStatement:
+    """A complete structured financial statement."""
+    statement_type: str
+    fiscal_year: Optional[int]
+    fiscal_period: Optional[str]
+    period_end: Optional[date]
+
+    items: List[StatementItem]
+
+    # Metadata
+    company_name: Optional[str] = None
+    cik: Optional[str] = None
+    canonical_coverage: float = 0.0
+    facts_used: int = 0
+    facts_total: int = 0
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Serialize the statement, including every item, to plain dicts.
+
+        ``period_end`` is emitted as an ISO-8601 string, or ``None`` when unset.
+        """
+        end = self.period_end
+        iso_end = end.isoformat() if end else None
+        serialized_items = [entry.to_dict() for entry in self.items]
+        return dict(
+            statement_type=self.statement_type,
+            fiscal_year=self.fiscal_year,
+            fiscal_period=self.fiscal_period,
+            period_end=iso_end,
+            company_name=self.company_name,
+            cik=self.cik,
+            canonical_coverage=self.canonical_coverage,
+            facts_used=self.facts_used,
+            facts_total=self.facts_total,
+            items=serialized_items,
+        )
+
+    def get_hierarchical_display(self, max_depth: int = 3) -> str:
+        """Render the statement as indented plain text.
+
+        Each nesting level is indented by two spaces. Abstract items print only
+        their label and totals are followed by a dashed rule. Regular items
+        whose confidence is below 0.8 get a trailing `` *`` marker. This is the
+        same threshold the rich renderers in this module use for their
+        low-confidence marker.
+
+        Args:
+            max_depth: Deepest nesting level to print. Top-level items are
+                level 0, and anything nested deeper than ``max_depth`` is
+                omitted.
+
+        Returns:
+            The rendered lines joined with newlines.
+        """
+        lines: List[str] = []
+
+        def add_item(item: StatementItem, indent: int = 0) -> None:
+            if indent > max_depth:
+                return
+
+            indent_str = "  " * indent
+            value_str = item.get_display_value()
+
+            if item.is_abstract:
+                lines.append(f"{indent_str}{item.label}")
+            elif item.is_total:
+                lines.append(f"{indent_str}{item.label:<40} {value_str:>15}")
+                lines.append(f"{indent_str}{'-' * 55}")
+            else:
+                # Strictly below 0.8, so an item at exactly 0.8 is not flagged.
+                confidence_marker = " *" if item.confidence < 0.8 else ""
+                lines.append(f"{indent_str}{item.label:<40} {value_str:>15}{confidence_marker}")
+
+            for child in item.children:
+                add_item(child, indent + 1)
+
+        for item in self.items:
+            add_item(item)
+
+        return "\n".join(lines)
+
+    def __rich__(self):
+        """Build the ``rich`` panel that renders this statement.
+
+        The panel is titled with the company name (or a generic title) and
+        subtitled with the statement type and period. It contains a two-column
+        table of items nested at most three levels below the top. When
+        ``facts_used`` is positive, a metadata panel follows, plus a
+        data-quality panel if any item is low-confidence or calculated.
+        """
+        # Statement type mapping for better display
+        statement_names = {
+            'IncomeStatement': 'Income Statement',
+            'BalanceSheet': 'Balance Sheet',
+            'CashFlow': 'Cash Flow Statement',
+            'StatementsOfComprehensiveIncome': 'Comprehensive Income',
+            'StatementsOfShareholdersEquity': 'Shareholders Equity'
+        }
+
+        # Title with company name and period
+        title_parts = []
+        if self.company_name:
+            title_parts.append((self.company_name, "bold green"))
+        else:
+            title_parts.append(("Financial Statement", "bold"))
+
+        title = Text.assemble(*title_parts)
+
+        # Subtitle with statement type and period
+        statement_display = statement_names.get(self.statement_type, self.statement_type)
+        if self.fiscal_period and self.fiscal_year:
+            subtitle = f"{statement_display} • {self.fiscal_period} {self.fiscal_year}"
+        elif self.period_end:
+            subtitle = f"{statement_display} • As of {self.period_end}"
+        else:
+            subtitle = statement_display
+
+        # Main statement table
+        stmt_table = Table(
+            box=box.SIMPLE,
+            show_header=False,
+            padding=(0, 1),
+            expand=True
+        )
+        stmt_table.add_column("Item", style="", ratio=3)
+        stmt_table.add_column("Value", justify="right", style="bold", ratio=1)
+
+        def add_item_to_table(item: StatementItem, depth: int = 0):
+            """Add an item, then its children, with two-space indentation per level."""
+            indent = "  " * depth
+
+            if item.is_abstract:
+                # Abstract items are headers
+                stmt_table.add_row(
+                    Text(f"{indent}{item.label}", style="bold cyan"),
+                    ""
+                )
+            elif item.is_total:
+                value_text = Text(item.get_display_value(), style="bold yellow")
+                stmt_table.add_row(
+                    Text(f"{indent}{item.label}", style="bold"),
+                    value_text
+                )
+                # Only top-level totals are followed by a blank row and a rule.
+                if depth == 0:
+                    stmt_table.add_row("", "")
+                    stmt_table.add_row(
+                        Text("─" * 40, style="dim"),
+                        Text("─" * 15, style="dim")
+                    )
+            else:
+                # Regular items
+                style = "dim" if item.confidence < 0.8 else ""
+                confidence_marker = " ◦" if item.confidence < 0.8 else ""
+                label_text = f"{indent}{item.label}{confidence_marker}"
+
+                # Colour non-zero numeric dollar amounts by sign. None of these
+                # checks can raise, so no exception handler is needed here.
+                value_str = item.get_display_value()
+                value_style = ""
+                if (value_str and value_str.startswith("$")
+                        and item.value and isinstance(item.value, (int, float))):
+                    value_style = "red" if item.value < 0 else "green"
+
+                stmt_table.add_row(
+                    Text(label_text, style=style),
+                    Text(value_str, style=value_style) if value_str else ""
+                )
+
+            # Add children recursively
+            for child in item.children:
+                if depth < 3:  # Limit depth for display
+                    add_item_to_table(child, depth + 1)
+
+        # Add all items to the table
+        for item in self.items:
+            add_item_to_table(item)
+
+        # Metadata summary
+        metadata = Table(box=box.SIMPLE, show_header=False, padding=(0, 1))
+        metadata.add_column("Metric", style="dim")
+        metadata.add_column("Value", style="bold")
+
+        metadata.add_row("Facts Used", f"{self.facts_used:,}")
+        if self.facts_total > 0:
+            metadata.add_row("Total Facts", f"{self.facts_total:,}")
+
+        if self.canonical_coverage > 0:
+            coverage_pct = self.canonical_coverage * 100
+            coverage_style = "green" if coverage_pct >= 50 else "yellow" if coverage_pct >= 25 else "red"
+            metadata.add_row(
+                "Canonical Coverage",
+                Text(f"{coverage_pct:.1f}%", style=coverage_style)
+            )
+
+        if self.cik:
+            metadata.add_row("CIK", self.cik)
+
+        # Data quality indicators
+        quality_notes = []
+
+        # Flatten once; both counts below walk the same list.
+        flat_items = self._flatten_items()
+
+        # Count items by confidence
+        low_confidence_count = sum(
+            1 for item in flat_items
+            if not item.is_abstract and item.confidence < 0.8
+        )
+
+        if low_confidence_count > 0:
+            quality_notes.append(
+                Text(f"◦ {low_confidence_count} items with lower confidence", style="dim yellow")
+            )
+
+        # Count calculated vs actual values
+        calculated_count = sum(
+            1 for item in flat_items
+            if item.source == 'calculated'
+        )
+
+        if calculated_count > 0:
+            quality_notes.append(
+                Text(f"◦ {calculated_count} calculated values", style="dim cyan")
+            )
+
+        # Combine metadata and quality notes
+        metadata_panel = Panel(
+            metadata,
+            title="📊 Statement Metadata",
+            border_style="bright_black"
+        )
+
+        # Create the main content group
+        content_parts = [
+            Padding("", (1, 0, 0, 0)),
+            stmt_table
+        ]
+
+        # Add metadata in a column layout
+        if self.facts_used > 0:
+            bottom_content = [metadata_panel]
+
+            if quality_notes:
+                quality_panel = Panel(
+                    Group(*quality_notes),
+                    title="📝 Data Quality Notes",
+                    border_style="bright_black"
+                )
+                bottom_content.append(quality_panel)
+
+            content_parts.append(Padding("", (1, 0)))
+            content_parts.append(Columns(bottom_content, equal=True, expand=True))
+
+        content = Group(*content_parts)
+
+        # Create the main panel
+        return Panel(
+            content,
+            title=title,
+            subtitle=subtitle,
+            border_style="blue",
+            expand=True
+        )
+
+    def _flatten_items(self) -> List[StatementItem]:
+        """Return every item in the statement as one flat, pre-order list.
+
+        Each item appears before its own children, and siblings keep their
+        original order.
+        """
+        def walk(nodes):
+            for node in nodes:
+                yield node
+                yield from walk(node.children)
+
+        return list(walk(self.items))
+
+    def __repr__(self) -> str:
+        """Return what ``repr_rich`` produces for this statement's rich panel."""
+        renderable = self.__rich__()
+        return repr_rich(renderable)
+
+
+class StatementBuilder:
+    """
+    Builds structured financial statements using canonical templates.
+
+    This class reconstructs complete financial statements by combining
+    actual facts with canonical structures, filling in missing concepts
+    and maintaining proper hierarchy.
+    """
+
+    def __init__(self, cik: Optional[str] = None):
+        """
+        Initialize the statement builder.
+
+        Stores the CIK and calls the mappings loaders for the canonical
+        structures and the virtual presentation trees.
+
+        Args:
+            cik: Company CIK for context; copied onto each statement returned
+                by ``build_statement``.
+        """
+        self.cik = cik
+        # Loaded here; not read by any method visible in this file.
+        self.canonical_structures = load_canonical_structures()
+        # Looked up by statement type when building and scoring statements.
+        self.virtual_trees = load_virtual_trees()
+
+    def build_statement(self,
+                       facts: List[FinancialFact],
+                       statement_type: str,
+                       fiscal_year: Optional[int] = None,
+                       fiscal_period: Optional[str] = None,
+                       use_canonical: bool = True,
+                       include_missing: bool = False) -> StructuredStatement:
+        """
+        Assemble a hierarchical statement from a flat list of facts.
+
+        Facts are first narrowed to the requested statement and period and
+        reduced to one fact per concept. When ``use_canonical`` is set and a
+        virtual tree exists for ``statement_type``, items follow that tree and
+        facts outside it are appended at the end. Otherwise the hierarchy is
+        taken from the facts themselves.
+
+        Args:
+            facts: All candidate financial facts.
+            statement_type: Statement to build (BalanceSheet, IncomeStatement, etc.).
+            fiscal_year: Keep only facts from this fiscal year, if given.
+            fiscal_period: Keep only facts from this fiscal period
+                (FY, Q1, Q2, Q3, Q4), if given.
+            use_canonical: Organize items by the canonical tree when available.
+            include_missing: Emit placeholder items for canonical concepts
+                that have no fact.
+
+        Returns:
+            The assembled StructuredStatement. ``facts_used`` is the number of
+            distinct concepts kept and ``facts_total`` is ``len(facts)``.
+        """
+        in_scope = self._filter_facts(facts, statement_type, fiscal_year, fiscal_period)
+        by_concept = self._create_fact_map(in_scope)
+        period_end = self._get_period_end(in_scope)
+
+        follow_canonical = use_canonical and statement_type in self.virtual_trees
+        if not follow_canonical:
+            items = self._build_from_facts(by_concept)
+        else:
+            items = self._build_with_canonical(
+                by_concept,
+                self.virtual_trees[statement_type],
+                include_missing
+            )
+            # Facts the canonical tree does not know about go after it.
+            leftovers = self._find_unmatched_facts(by_concept, self.virtual_trees[statement_type])
+            items.extend(self._create_items_from_facts(leftovers))
+
+        if use_canonical:
+            coverage = self._calculate_coverage(by_concept, statement_type)
+        else:
+            coverage = 0.0
+
+        return StructuredStatement(
+            statement_type=statement_type,
+            fiscal_year=fiscal_year,
+            fiscal_period=fiscal_period,
+            period_end=period_end,
+            items=items,
+            cik=self.cik,
+            canonical_coverage=coverage,
+            facts_used=len(by_concept),
+            facts_total=len(facts)
+        )
+
+    def _filter_facts(self, facts: List[FinancialFact],
+                     statement_type: str,
+                     fiscal_year: Optional[int],
+                     fiscal_period: Optional[str]) -> List[FinancialFact]:
+        """Return the facts belonging to the requested statement and period.
+
+        A falsy ``fiscal_year`` or ``fiscal_period`` disables that filter.
+        Input order is preserved.
+        """
+        return [
+            candidate
+            for candidate in facts
+            if candidate.statement_type == statement_type
+            and (not fiscal_year or candidate.fiscal_year == fiscal_year)
+            and (not fiscal_period or candidate.fiscal_period == fiscal_period)
+        ]
+
+    def _create_fact_map(self, facts: List[FinancialFact]) -> Dict[str, FinancialFact]:
+        """Index facts by concept name with any ``prefix:`` stripped.
+
+        When several facts share a concept, the one with the latest
+        ``filing_date`` wins. On a tie the earlier fact in ``facts`` is kept.
+        """
+        newest: Dict[str, FinancialFact] = {}
+
+        for candidate in facts:
+            # Everything after the first ':' (the whole name when there is none).
+            key = candidate.concept.split(':', 1)[-1]
+
+            if key in newest and not candidate.filing_date > newest[key].filing_date:
+                continue
+            newest[key] = candidate
+
+        return newest
+
+    def _get_period_end(self, facts: List[FinancialFact]) -> Optional[date]:
+        """Return the first truthy ``period_end`` among ``facts``, else ``None``."""
+        return next(
+            (candidate.period_end for candidate in facts if candidate.period_end),
+            None,
+        )
+
+    def _build_with_canonical(self, fact_map: Dict[str, FinancialFact],
+                             virtual_tree: Dict[str, Any],
+                             include_missing: bool) -> List[StatementItem]:
+        """Build the top-level items by walking each root of the virtual tree.
+
+        One ``processed`` set is shared across all roots, so a concept
+        reachable from several places is built only once. Roots that produce
+        no item are left out.
+        """
+        seen: Set[str] = set()
+
+        # Lazy on purpose: 'nodes' is only looked up if there is a root to build.
+        built = (
+            self._build_canonical_item(
+                root,
+                virtual_tree['nodes'],
+                fact_map,
+                seen,
+                include_missing,
+                depth=0
+            )
+            for root in virtual_tree.get('roots', [])
+        )
+        return [entry for entry in built if entry]
+
+    def _build_canonical_item(self, concept: str,
+                             nodes: Dict[str, Any],
+                             fact_map: Dict[str, FinancialFact],
+                             processed: Set[str],
+                             include_missing: bool,
+                             depth: int = 0,
+                             parent: Optional[str] = None) -> Optional[StatementItem]:
+        """Build the item for one canonical concept together with its subtree.
+
+        Returns ``None`` if the concept was already processed. It also returns
+        ``None`` if the concept has no fact, is not abstract,
+        ``include_missing`` is off and its ``occurrence_rate`` is below 0.8.
+        In that second case the concept's children are not visited.
+
+        A total with no value of its own takes the sum of its children, and
+        its ``source`` becomes ``'calculated'``.
+        """
+        if concept in processed:
+            return None
+        processed.add(concept)
+
+        node = nodes.get(concept, {})
+        fact = fact_map.get(concept)
+
+        # A concrete concept without a fact is kept only when it is a core
+        # concept (occurrence rate of at least 0.8) or placeholders were requested.
+        droppable = not fact and not include_missing and not node.get('is_abstract')
+        if droppable and node.get('occurrence_rate', 0) < 0.8:
+            return None
+
+        if fact:
+            label = fact.label
+            value = fact.numeric_value
+            confidence = 1.0
+            source = 'fact'
+        else:
+            label = node.get('label', concept)
+            value = None
+            confidence = node.get('occurrence_rate', 1.0)
+            source = 'placeholder' if include_missing else 'canonical'
+
+        item = StatementItem(
+            concept=concept,
+            label=label,
+            value=value,
+            depth=depth,
+            parent_concept=parent,
+            is_abstract=node.get('is_abstract', False),
+            is_total=node.get('is_total', False),
+            section=node.get('section'),
+            confidence=confidence,
+            source=source,
+            fact=fact
+        )
+
+        for child_concept in node.get('children', []):
+            built_child = self._build_canonical_item(
+                child_concept,
+                nodes,
+                fact_map,
+                processed,
+                include_missing,
+                depth + 1,
+                concept
+            )
+            if built_child:
+                item.children.append(built_child)
+
+        # Fill in a missing total from whatever children were built.
+        if item.is_total and item.value is None and item.children:
+            summed = self._calculate_total(item.children)
+            if summed is not None:
+                item.value = summed
+                item.source = 'calculated'
+
+        return item
+
+    def _calculate_total(self, children: List[StatementItem]) -> Optional[float]:
+        """Sum the values of the non-abstract children that have one.
+
+        Returns ``None`` when no child contributes a value.
+        """
+        amounts = [
+            entry.value
+            for entry in children
+            if not entry.is_abstract and entry.value is not None
+        ]
+        return sum(amounts) if amounts else None
+
+    def _find_unmatched_facts(self, fact_map: Dict[str, FinancialFact],
+                             virtual_tree: Dict[str, Any]) -> Dict[str, FinancialFact]:
+        """Return the facts whose concept is not a node of the virtual tree.
+
+        The order of ``fact_map`` is preserved.
+        """
+        known = frozenset(virtual_tree.get('nodes', {}))
+        return {
+            name: entry
+            for name, entry in fact_map.items()
+            if name not in known
+        }
+
+    def _create_items_from_facts(self, facts: Dict[str, FinancialFact]) -> List[StatementItem]:
+        """Wrap each fact in a childless StatementItem, keeping dict order.
+
+        The items get depth 1, no parent and a confidence of 0.7.
+        """
+        return [
+            StatementItem(
+                concept=name,
+                label=entry.label,
+                value=entry.numeric_value,
+                depth=1,  # Default depth
+                parent_concept=None,
+                is_abstract=entry.is_abstract,
+                is_total=entry.is_total,
+                section=entry.section,
+                confidence=0.7,  # Lower confidence for unmatched
+                source='fact',
+                fact=entry
+            )
+            for name, entry in facts.items()
+        ]
+
+    def _build_from_facts(self, fact_map: Dict[str, FinancialFact]) -> List[StatementItem]:
+        """Build statement directly from facts without canonical structure.
+
+        The hierarchy comes from each fact's ``parent_concept``. Facts without
+        a parent become top-level items. Facts whose declared parent is not in
+        ``fact_map`` (or is the fact itself) are treated as orphans: they are
+        appended after the roots as top-level items, with their own children
+        attached and ``parent_concept`` cleared. This keeps them from being
+        silently left out of the statement.
+
+        Args:
+            fact_map: Facts keyed by concept name.
+
+        Returns:
+            Top-level items: roots first, then orphans, each group in
+            ``fact_map`` order.
+        """
+        hierarchy = defaultdict(list)
+        roots = []
+        orphans = []
+
+        for concept, fact in fact_map.items():
+            parent = fact.parent_concept
+            if not parent:
+                roots.append(concept)
+            elif parent in fact_map and parent != concept:
+                hierarchy[parent].append(concept)
+            else:
+                # The parent is not part of this statement, so nothing would
+                # ever reach this fact from a root.
+                orphans.append(concept)
+
+        # Build items recursively
+        items = []
+        for root_concept in roots:
+            item = self._build_fact_item(root_concept, fact_map, hierarchy)
+            if item:
+                items.append(item)
+
+        # Add orphaned facts
+        for orphan_concept in orphans:
+            item = self._build_fact_item(orphan_concept, fact_map, hierarchy)
+            if item:
+                # The declared parent is absent, so the item stands at top level.
+                item.parent_concept = None
+                items.append(item)
+
+        return items
+
+    def _build_fact_item(self, concept: str,
+                        fact_map: Dict[str, FinancialFact],
+                        hierarchy: Dict[str, List[str]],
+                        depth: int = 0) -> Optional[StatementItem]:
+        """Build the item for ``concept`` and, recursively, its children.
+
+        Children are the concepts listed under ``concept`` in ``hierarchy``.
+        Returns ``None`` when ``concept`` has no entry in ``fact_map``.
+        """
+        if concept not in fact_map:
+            return None
+
+        source_fact = fact_map[concept]
+        node = StatementItem(
+            concept=concept,
+            label=source_fact.label,
+            value=source_fact.numeric_value,
+            depth=depth,
+            parent_concept=source_fact.parent_concept,
+            is_abstract=source_fact.is_abstract,
+            is_total=source_fact.is_total,
+            section=source_fact.section,
+            confidence=1.0,
+            source='fact',
+            fact=source_fact
+        )
+
+        descendants = (
+            self._build_fact_item(name, fact_map, hierarchy, depth + 1)
+            for name in hierarchy.get(concept, [])
+        )
+        node.children.extend(entry for entry in descendants if entry)
+
+        return node
+
+    def _calculate_coverage(self, fact_map: Dict[str, FinancialFact],
+                          statement_type: str) -> float:
+        """Return the fraction (0.0 to 1.0) of canonical concepts that have a fact.
+
+        The result is 0.0 when there is no virtual tree for ``statement_type``
+        or the tree has no nodes.
+        """
+        if statement_type not in self.virtual_trees:
+            return 0.0
+
+        known = set(self.virtual_trees[statement_type].get('nodes', {}))
+        if not known:
+            return 0.0
+
+        covered = known.intersection(fact_map)
+        return len(covered) / len(known)