"""
XBRL Statement Ordering - Intelligent Ordering for Multi-Period Statements

This module provides consistent ordering for financial statements across
multiple periods by combining template-based, reference-based, and semantic
positioning strategies.
"""

import re
from enum import Enum
from typing import Dict, List, Optional, Set, Tuple

try:
    from rapidfuzz import fuzz
except ImportError:
    # Fallback to difflib if rapidfuzz is not available
    from difflib import SequenceMatcher

    class fuzz:
        """Minimal stand-in exposing ``ratio`` on a 0-100 scale via difflib."""

        @staticmethod
        def ratio(s1: str, s2: str) -> float:
            return SequenceMatcher(None, s1, s2).ratio() * 100


# One template section: (base position, section name, ordered entries).
TemplateSection = Tuple[int, str, List[str]]


class StatementType(str, Enum):
    """Supported statement types for ordering"""
    INCOME_STATEMENT = "IncomeStatement"
    BALANCE_SHEET = "BalanceSheet"
    CASH_FLOW = "CashFlowStatement"
    EQUITY = "StatementOfEquity"


class FinancialStatementTemplates:
    """Canonical ordering templates for financial statements based on XBRL concepts"""

    # Two normalized labels are treated as the same item when their
    # similarity ratio (0.0-1.0) is strictly greater than this value.
    _FUZZY_MATCH_THRESHOLD = 0.7

    INCOME_STATEMENT_TEMPLATE = [
        # Revenue Section (0-99)
        (0, "revenue_section", [
            # Product/Service Revenue Components
            "us-gaap:SalesRevenueGoodsNet",
            "us-gaap:ProductSales",
            "us-gaap:SalesRevenueServicesNet",
            "us-gaap:SubscriptionRevenue",
            # Contract Revenue
            "us-gaap:RevenueFromContractWithCustomerExcludingAssessedTax",
            "us-gaap:RevenueFromContractWithCustomerIncludingAssessedTax",
            # Total Revenue
            "us-gaap:Revenue",
            "us-gaap:Revenues",
            "us-gaap:SalesRevenueNet",
            "us-gaap:OperatingRevenue"
        ]),

        # Cost Section (100-199)
        (100, "cost_section", [
            "us-gaap:CostOfRevenueAbstract",  # Abstract
            "us-gaap:CostOfRevenue",  # Total
            "us-gaap:CostOfGoodsSold",
            "us-gaap:CostOfGoodsAndServicesSold",
            "us-gaap:CostOfSales",
            "us-gaap:DirectOperatingCosts",
            "us-gaap:CostsAndExpenses"
        ]),

        # Gross Profit (200-299)
        (200, "gross_profit", [
            "us-gaap:GrossProfit"
        ]),

        # Operating Expenses (300-399)
        (300, "operating_expenses", [
            # R&D Expenses
            "us-gaap:ResearchAndDevelopmentCosts",
            "us-gaap:ResearchAndDevelopmentExpense",
            # SG&A Expenses
            "us-gaap:SellingGeneralAndAdministrativeExpense",
            "us-gaap:GeneralAndAdministrativeExpense",
            "us-gaap:AdministrativeExpense",
            "us-gaap:SellingAndMarketingExpense",
            "us-gaap:SellingExpense",
            "us-gaap:MarketingExpense",
            "us-gaap:AdvertisingExpense",
            # Total Operating Expenses
            "us-gaap:NoninterestExpense",
            "us-gaap:OperatingCostsAndExpenses",
            "us-gaap:OperatingExpenses"
        ]),

        # Operating Income (400-499)
        (400, "operating_income", [
            "us-gaap:OperatingIncomeLoss",
            "us-gaap:OperatingIncome",
            "us-gaap:IncomeLossFromContinuingOperationsBeforeInterestAndTaxes"
        ]),

        # Non-Operating (500-599)
        (500, "non_operating", [
            "us-gaap:InterestIncomeExpenseNet",
            "us-gaap:InterestAndDebtExpense",
            "us-gaap:InterestExpense",
            "us-gaap:InterestExpenseNonoperating",  # ADBE uses this for non-operating interest expense
            "us-gaap:InterestIncome",
            "us-gaap:InvestmentIncomeInterest",  # NVIDIA uses this variant
            "us-gaap:OtherNonoperatingIncomeExpense",
            "us-gaap:NonoperatingIncomeExpense",
            "orcl:NonoperatingIncomeExpenseIncludingEliminationOfNetIncomeLossAttributableToNoncontrollingInterests"
        ]),

        # Pre-Tax Income (600-699)
        (600, "pretax_income", [
            "us-gaap:IncomeLossBeforeIncomeTaxes",
            "us-gaap:IncomeLossFromContinuingOperationsBeforeIncomeTaxes",
            "us-gaap:IncomeLossFromContinuingOperationsBeforeIncomeTaxesExtraordinaryItemsNoncontrollingInterest",
            "orcl:IncomeLossFromContinuingOperationsIncludingNoncontrollingInterestBeforeIncomeTaxesExtraordinaryItems"
        ]),

        # Tax (700-799)
        (700, "tax", [
            "us-gaap:IncomeTaxesPaidNet",
            "us-gaap:IncomeTaxExpenseBenefit"
        ]),

        # Net Income (800-899)
        (800, "net_income", [
            "us-gaap:IncomeLossFromContinuingOperationsIncludingPortionAttributableToNoncontrollingInterest",
            "us-gaap:IncomeLossFromContinuingOperations",
            "us-gaap:NetIncome",
            "us-gaap:NetIncomeLoss",
            "us-gaap:ProfitLoss",
            "us-gaap:NetIncomeLossAttributableToNonredeemableNoncontrollingInterest",
            "us-gaap:NetIncomeLossAttributableToNoncontrollingInterest"
        ]),

        # Per Share Data (900-999)
        (900, "per_share", [
            "us-gaap:EarningsPerShareAbstract",
            "us-gaap:EarningsPerShareBasic",
            "us-gaap:EarningsPerShareDiluted",
            "us-gaap:WeightedAverageNumberOfSharesOutstandingAbstract",
            "us-gaap:WeightedAverageNumberOfSharesOutstandingBasic",
            "us-gaap:WeightedAverageNumberOfDilutedSharesOutstanding"
        ])
    ]

    BALANCE_SHEET_TEMPLATE = [
        # Current Assets (0-199)
        (0, "current_assets", [
            "Cash and Cash Equivalents",
            "Cash",
            "Short-term Investments",
            "Marketable Securities",
            "Accounts Receivable",
            "Trade Receivables",
            "Inventory",
            "Prepaid Expenses",
            "Other Current Assets",
            "Total Current Assets"
        ]),

        # Non-Current Assets (200-399)
        (200, "noncurrent_assets", [
            "Property, Plant and Equipment",
            "Property and Equipment",
            "Long-term Investments",
            "Goodwill",
            "Intangible Assets",
            "Other Non-current Assets",
            "Total Non-current Assets",
            "Total Assets"
        ]),

        # Current Liabilities (400-599)
        (400, "current_liabilities", [
            "Accounts Payable",
            "Trade Payables",
            "Accrued Liabilities",
            "Accrued Expenses",
            "Short-term Debt",
            "Current Portion of Long-term Debt",
            "Other Current Liabilities",
            "Total Current Liabilities"
        ]),

        # Non-Current Liabilities (600-799)
        (600, "noncurrent_liabilities", [
            "Long-term Debt",
            "Deferred Revenue",
            "Deferred Tax Liabilities",
            "Other Non-current Liabilities",
            "Total Non-current Liabilities",
            "Total Liabilities"
        ]),

        # Equity (800-999)
        (800, "equity", [
            "Common Stock",
            "Additional Paid-in Capital",
            "Retained Earnings",
            "Accumulated Other Comprehensive Income",
            "Treasury Stock",
            "Total Stockholders' Equity",
            "Total Shareholders' Equity",
            "Total Equity"
        ])
    ]

    def _get_template(self, statement_type: str) -> Optional[List[TemplateSection]]:
        """Return the template for ``statement_type``, or None if there is none."""
        if not statement_type:
            return None

        # Handle different statement type formats
        if statement_type == "IncomeStatement":
            template_name = "INCOME_STATEMENT_TEMPLATE"
        elif statement_type == "BalanceSheet":
            template_name = "BALANCE_SHEET_TEMPLATE"
        else:
            template_name = f"{statement_type.upper()}_TEMPLATE"

        return getattr(self, template_name, None) or None

    def get_template_position(self, item_concept: str, item_label: str,
                              statement_type: str) -> Optional[float]:
        """
        Get template position for an item, prioritizing concept-based matching
        over label matching.

        The position of a match is ``section base + index within the section``.

        Args:
            item_concept: The XBRL concept (e.g., "us-gaap:Revenue")
            item_label: The display label (e.g., "Contract Revenue")
            statement_type: Type of statement ("IncomeStatement", "BalanceSheet", etc.)

        Returns:
            Float position in template, or None if no match found
        """
        template = self._get_template(statement_type)
        if not template:
            return None

        # Strategy 1: Direct concept matching (highest priority)
        if item_concept:
            normalized_concept = self._normalize_xbrl_concept(item_concept)
            for base_pos, _section_name, template_concepts in template:
                for i, template_concept in enumerate(template_concepts):
                    if normalized_concept == self._normalize_xbrl_concept(template_concept):
                        return float(base_pos + i)

        # Strategy 2: Label-based matching as fallback (for compatibility)
        if item_label:
            for base_pos, _section_name, template_concepts in template:
                for i, template_concept in enumerate(template_concepts):
                    if self._labels_match(item_label, template_concept):
                        return float(base_pos + i)

        return None

    def _normalize_xbrl_concept(self, concept: str) -> str:
        """
        Normalize XBRL concept for matching.

        Handles variations in concept format:
        - "us-gaap:Revenue" vs "us-gaap_Revenue"
        - Case sensitivity
        - Namespace prefixes ("us-gaap", "usgaap" and "gaap" all map to "us-gaap")

        Returns an empty string for empty input.
        """
        if not concept:
            return ""

        # Normalize case and separators (: vs _)
        normalized = concept.lower().replace(':', '_')

        # Handle common namespace variations
        # us-gaap, usgaap, gaap all should match
        if normalized.startswith(('us-gaap_', 'usgaap_', 'gaap_')):
            normalized = 'us-gaap_' + normalized.split('_', 1)[1]

        return normalized

    def _fuzzy_equal(self, norm1: str, norm2: str) -> bool:
        """Return True if two normalized strings are equal or similar enough."""
        if norm1 == norm2:
            return True
        similarity = fuzz.ratio(norm1, norm2) / 100.0
        return similarity > self._FUZZY_MATCH_THRESHOLD

    def _labels_match(self, label1: str, label2: str) -> bool:
        """Check if two labels represent the same financial item (fallback for non-concept matching)"""
        if not label1 or not label2:
            return False

        # For XBRL concepts in templates, don't try to match against labels
        if ':' in label2 or '_gaap_' in label2.lower():
            return False

        return self._fuzzy_equal(self._normalize_concept(label1),
                                 self._normalize_concept(label2))

    def _concepts_match(self, concept1: str, concept2: str) -> bool:
        """Check if two concepts represent the same financial item"""
        return self._fuzzy_equal(self._normalize_concept(concept1),
                                 self._normalize_concept(concept2))

    def _normalize_concept(self, concept: str) -> str:
        """
        Normalize a label for comparison.

        Lower-cases the text, strips commas/periods and parenthetical parts,
        drops the standalone words "expense" and "income", and collapses
        runs of whitespace. Returns an empty string for empty input.
        """
        if not concept:
            return ""

        normalized = concept.lower()
        # Collapse whitespace first so a parenthetical that spans a line
        # break is still removed by the (non-DOTALL) pattern below.
        normalized = re.sub(r'\s+', ' ', normalized)
        normalized = re.sub(r'[,\.]', '', normalized)  # Remove punctuation
        normalized = re.sub(r'\(.*?\)', '', normalized)  # Remove parenthetical
        normalized = re.sub(r'\bexpense\b', '', normalized)  # Remove 'expense'
        normalized = re.sub(r'\bincome\b', '', normalized)  # Remove 'income'
        # The removals above can leave doubled spaces behind; collapse again
        # so that e.g. "Other Income, net" and "Other net" normalize equally.
        normalized = re.sub(r'\s+', ' ', normalized)

        return normalized.strip()


class ReferenceOrderingStrategy:
    """Extract ordering from reference statement"""

    def establish_reference_order(self, statements: List[Dict]) -> Dict[str, float]:
        """
        Establish reference ordering from the first statement in ``statements``.

        Each item's index within the statement's 'data' list becomes the
        position of both its concept and its label (when present).

        Returns:
            Dict mapping concept/label -> position; empty if no statements.
        """
        if not statements:
            return {}

        # Strategy: Use most recent statement (statements are ordered newest first)
        reference_statement = statements[0]

        reference_order: Dict[str, float] = {}
        for i, item in enumerate(reference_statement.get('data', [])):
            concept = item.get('concept')
            label = item.get('label')

            # Store by both concept ID and label for flexibility
            if concept:
                reference_order[concept] = float(i)
            if label:
                reference_order[label] = float(i)

        return reference_order


class SemanticPositioning:
    """Position concepts based on financial statement semantics"""

    def __init__(self, statement_type: str):
        self.statement_type = statement_type
        self.section_defaults = self._get_section_defaults()

    def _get_section_defaults(self) -> Dict[str, float]:
        """Default positions for each section when no other guidance available"""
        if self.statement_type == "IncomeStatement":
            return {
                "revenue": 50.0,
                "cost": 150.0,
                "gross_profit": 250.0,
                "expense": 350.0,
                "operating_income": 450.0,
                "non_operating": 550.0,
                "pretax_income": 650.0,
                "tax": 750.0,
                "net_income": 850.0,
                "per_share": 950.0
            }
        elif self.statement_type == "BalanceSheet":
            return {
                "current_assets": 100.0,
                "noncurrent_assets": 300.0,
                "current_liabilities": 500.0,
                "noncurrent_liabilities": 700.0,
                "equity": 900.0
            }
        return {}

    def infer_position(self, concept: str, existing_order: Dict[str, float]) -> float:
        """
        Infer semantic position for a new concept.

        Tries, in order: keyword-based section classification, a parent
        concept already present in ``existing_order``, then the most similar
        existing concept. Falls back to 999.0 when nothing applies.
        """
        # Rule-based positioning
        section = self._classify_concept_section(concept)
        if section:
            return self._position_in_section(concept, section, existing_order)

        # Parent-child relationship positioning
        parent = self._find_parent_concept(concept, existing_order)
        if parent:
            return existing_order[parent] + 0.1  # Just after parent

        # Similarity-based positioning
        similar_concept = self._find_most_similar_concept(concept, existing_order)
        if similar_concept:
            return existing_order[similar_concept] + 0.1

        # Default to end
        return 999.0

    def _classify_concept_section(self, concept: str) -> Optional[str]:
        """
        Classify concept into financial statement section.

        Classification is by case-insensitive substring keywords; the first
        rule that matches wins, so rule order is significant. Returns None
        when no rule matches or the statement type has no rules.
        """
        if not concept:
            return None

        concept_lower = concept.lower()

        def has_any(*terms: str) -> bool:
            return any(term in concept_lower for term in terms)

        if self.statement_type == "IncomeStatement":
            # Revenue indicators
            if has_any('revenue', 'sales') and not has_any('cost', 'expense'):
                return "revenue"
            # Cost indicators
            if has_any('cost of', 'cogs'):
                return "cost"
            # Gross profit
            if has_any('gross profit', 'gross margin'):
                return "gross_profit"
            # Operating expenses
            if has_any('r&d', 'research', 'selling', 'administrative', 'marketing') or \
                    ('expense' in concept_lower and 'tax' not in concept_lower):
                return "expense"
            # Operating income
            if has_any('operating income', 'operating profit'):
                return "operating_income"
            # Non-operating
            if has_any('interest', 'other income', 'nonoperating'):
                return "non_operating"
            # Pre-tax income
            if has_any('before tax', 'pretax'):
                return "pretax_income"
            # Tax
            if 'tax' in concept_lower and 'expense' in concept_lower:
                return "tax"
            # Net income
            if has_any('net income', 'net earnings'):
                return "net_income"
            # Per share
            if has_any('per share', 'earnings per', 'shares outstanding'):
                return "per_share"

        elif self.statement_type == "BalanceSheet":
            if has_any('cash', 'receivable', 'inventory', 'prepaid') or \
                    ('current' in concept_lower and 'asset' in concept_lower):
                return "current_assets"
            if has_any('property', 'equipment', 'goodwill', 'intangible') or \
                    ('asset' in concept_lower and 'current' not in concept_lower):
                return "noncurrent_assets"
            if has_any('payable', 'accrued') or \
                    ('current' in concept_lower and 'liabilit' in concept_lower):
                return "current_liabilities"
            if 'debt' in concept_lower or \
                    ('liabilit' in concept_lower and 'current' not in concept_lower):
                return "noncurrent_liabilities"
            if has_any('equity', 'stock', 'retained earnings', 'capital'):
                return "equity"

        return None

    def _position_in_section(self, concept: str, section: str,
                             existing_order: Dict[str, float]) -> float:
        """
        Position concept within its identified section.

        Places the concept 0.1 after the highest-positioned existing entry
        classified into the same section, or at the section default
        (999.0 if the section has none) when the section is still empty.
        """
        section_positions = [
            pos for label, pos in existing_order.items()
            if self._classify_concept_section(label) == section
        ]

        if not section_positions:
            # Section doesn't exist yet - use template defaults
            return self.section_defaults.get(section, 999.0)

        # Simple strategy: place at end of section
        return max(section_positions) + 0.1

    def _find_parent_concept(self, concept: str,
                             existing_order: Dict[str, float]) -> Optional[str]:
        """
        Find parent concept in hierarchy, e.g. "Software Revenue" -> "Revenue".

        An existing key is a candidate parent when its words are a strict
        subset of the concept's words, or when it is a proper substring of
        the concept (case-insensitive). The candidate with the most words
        is returned; None if there is no candidate.
        """
        if not concept:
            return None

        concept_lower = concept.lower()
        concept_words = set(concept_lower.split())

        candidates = []
        for existing_concept in existing_order:
            if not existing_concept:
                continue

            existing_lower = existing_concept.lower()
            existing_words = set(existing_lower.split())

            is_word_subset = (existing_words.issubset(concept_words)
                              and len(existing_words) < len(concept_words))
            is_substring = (existing_lower in concept_lower
                            and existing_lower != concept_lower)
            if is_word_subset or is_substring:
                candidates.append((existing_concept, len(existing_words)))

        if candidates:
            # Return the most specific parent (most words in common)
            return max(candidates, key=lambda x: x[1])[0]

        return None

    def _find_most_similar_concept(self, concept: str,
                                   existing_order: Dict[str, float]) -> Optional[str]:
        """
        Find most similar existing concept.

        Returns the key of ``existing_order`` with the highest
        case-insensitive similarity to ``concept``, provided it exceeds 0.5;
        otherwise None.
        """
        if not concept:
            return None

        concept_lower = concept.lower()
        best_match = None
        best_similarity = 0.0

        for existing_concept in existing_order:
            if not existing_concept:
                continue

            similarity = fuzz.ratio(concept_lower, existing_concept.lower()) / 100.0
            if similarity > best_similarity and similarity > 0.5:  # Minimum threshold
                best_similarity = similarity
                best_match = existing_concept

        return best_match


class StatementOrderingManager:
    """Manages consistent ordering across multi-period statements"""

    def __init__(self, statement_type: str):
        self.statement_type = statement_type
        self.templates = FinancialStatementTemplates()
        self.reference_strategy = ReferenceOrderingStrategy()
        self.semantic_positioning = SemanticPositioning(statement_type)

    def determine_ordering(self, statements: List[Dict]) -> Dict[str, float]:
        """
        Determine unified ordering for all concepts across statements.

        Args:
            statements: Statement dicts; each item of a statement's 'data'
                list may carry a 'concept' and/or a 'label'.

        Returns:
            Dict mapping concept -> sort_key (float for interpolation)
        """
        if not statements:
            return {}

        all_concepts = self._extract_all_concepts(statements)

        # Strategy 1: Template-based ordering (highest priority)
        template_positioned = self._apply_template_ordering(all_concepts, statements)

        # Strategy 2: Reference statement ordering for non-template items
        reference_positioned = self._apply_reference_ordering(
            all_concepts, statements, template_positioned
        )

        # Strategy 3: Semantic positioning for orphan concepts
        semantic_positioned = self._apply_semantic_positioning(
            all_concepts, template_positioned, reference_positioned
        )

        # Strategy 4: Section-aware consolidation to maintain template groupings
        final_ordering = self._consolidate_section_ordering(
            semantic_positioned, template_positioned, statements
        )

        return final_ordering

    def _extract_all_concepts(self, statements: List[Dict]) -> set:
        """Extract all unique concepts and labels from statements"""
        all_concepts: Set[str] = set()

        for statement in statements:
            for item in statement.get('data', []):
                concept = item.get('concept')
                label = item.get('label')
                if concept:
                    all_concepts.add(concept)
                if label:
                    all_concepts.add(label)

        return all_concepts

    def _apply_template_ordering(self, concepts: set,
                                 statements: List[Dict]) -> Dict[str, float]:
        """
        Apply template-based ordering for known concepts using concept-first matching.

        Returns:
            Dict mapping each concept/label that matched the template to its
            template position. A match on a concept is mirrored onto its
            label (and vice versa) when that counterpart is in ``concepts``.
        """
        template_order: Dict[str, float] = {}

        # Index the statements once instead of rescanning them per concept.
        known_concepts: Set[str] = set()
        label_to_xbrl: Dict[str, str] = {}
        concept_to_label: Dict[str, str] = {}

        for statement in statements:
            # Within one statement only the first item carrying a concept
            # supplies that concept's label.
            seen_in_statement: Set[str] = set()
            for item in statement.get('data', []):
                concept = item.get('concept')
                label = item.get('label')
                if not concept:
                    continue

                known_concepts.add(concept)
                if label:
                    label_to_xbrl[label] = concept

                if concept not in seen_in_statement:
                    seen_in_statement.add(concept)
                    # The earliest statement that yields a non-empty label wins.
                    if label and concept not in concept_to_label:
                        concept_to_label[concept] = label

        # Iterate in sorted order: set iteration order is not stable across
        # processes, and later assignments below may overwrite earlier ones.
        for concept_or_label in sorted(concepts, key=str):
            # Determine if this is a concept or label
            is_concept = concept_or_label in known_concepts
            is_label = concept_or_label in label_to_xbrl

            # Get the actual XBRL concept and label for this item
            if is_concept:
                xbrl_concept = concept_or_label
                corresponding_label = concept_to_label.get(concept_or_label)
            elif is_label:
                xbrl_concept = label_to_xbrl.get(concept_or_label)
                corresponding_label = concept_or_label
            else:
                # Neither concept nor label found in mappings
                xbrl_concept = None
                corresponding_label = concept_or_label

            # Try concept-based matching first, then label-based
            template_pos = self.templates.get_template_position(
                item_concept=xbrl_concept,
                item_label=corresponding_label,
                statement_type=self.statement_type
            )
            if template_pos is None:
                continue

            template_order[concept_or_label] = template_pos

            # IMPORTANT: If we found a template position for a concept,
            # also apply it to the corresponding label (and vice versa)
            # This ensures consistent ordering regardless of whether the
            # stitcher uses concept or label as the key
            if is_concept and corresponding_label and corresponding_label in concepts:
                template_order[corresponding_label] = template_pos
            elif is_label and xbrl_concept and xbrl_concept in concepts:
                template_order[xbrl_concept] = template_pos

        return template_order

    def _apply_reference_ordering(self, concepts: set, statements: List[Dict],
                                  template_positioned: Dict[str, float]) -> Dict[str, float]:
        """
        Apply reference statement ordering for remaining concepts.

        Returns a copy of ``template_positioned`` extended with the reference
        position of every concept that has one and no template position.
        """
        reference_order = self.reference_strategy.establish_reference_order(statements)

        combined_order = template_positioned.copy()

        for concept in concepts:
            if concept not in combined_order and concept in reference_order:
                combined_order[concept] = reference_order[concept]

        return combined_order

    def _apply_semantic_positioning(self, concepts: set,
                                    template_positioned: Dict[str, float],
                                    reference_positioned: Dict[str, float]) -> Dict[str, float]:
        """
        Apply semantic positioning for orphan concepts.

        Returns a copy of ``reference_positioned`` extended with an inferred
        position for every concept not yet positioned.
        """
        final_order = reference_positioned.copy()

        # Each inferred position depends on what has been placed so far, so
        # iterate in sorted order to make the outcome reproducible.
        for concept in sorted(concepts, key=str):
            if concept not in final_order:
                final_order[concept] = self.semantic_positioning.infer_position(
                    concept, final_order
                )

        return final_order

    def _consolidate_section_ordering(self, semantic_positioned: Dict[str, float],
                                      template_positioned: Dict[str, float],
                                      statements: List[Dict]) -> Dict[str, float]:
        """
        Consolidate ordering to maintain template section groupings.

        This prevents reference ordering from breaking up logical template sections
        like per-share data (EPS + Shares Outstanding).

        Template-positioned items are re-packed per section at
        ``section base + i * 0.1`` in template order; other items keep their
        position unless it falls inside a section's occupied range, in which
        case they are moved just after that section.
        """
        # Identify template sections and their concepts
        template_sections = self._identify_template_sections(template_positioned)

        # Separate template-positioned from non-template items
        template_items: Dict[str, float] = {}
        non_template_items: Dict[str, float] = {}
        for concept, position in semantic_positioned.items():
            if concept in template_positioned:
                template_items[concept] = position
            else:
                non_template_items[concept] = position

        final_ordering: Dict[str, float] = {}

        # Process template sections in order
        for section_name, section_concepts in template_sections.items():
            section_items = [c for c in section_concepts if c in template_items]
            if not section_items:
                continue

            # Use the template base position for this section to ensure strong grouping
            section_base_pos = self._get_section_base_position(section_name)

            # For critical sections like per_share, use an even stronger override
            if section_name == "per_share":
                # Force per-share items to be at the very end, regardless of hierarchy
                section_base_pos = 950.0

            # Ensure all items in this section stay grouped together
            ordered = sorted(section_items, key=lambda x: template_items.get(x, 999.0))
            for i, item in enumerate(ordered):
                final_ordering[item] = section_base_pos + i * 0.1

        # A template item that belongs to no section keeps its position
        # rather than being dropped from the result.
        for concept, position in template_items.items():
            final_ordering.setdefault(concept, position)

        # Add non-template items, adjusting positions to avoid breaking template sections
        section_ranges = self._get_section_ranges(final_ordering, template_sections)

        for concept, position in non_template_items.items():
            final_ordering[concept] = self._find_insertion_point(position, section_ranges)

        return final_ordering

    def _get_template(self) -> Optional[List[TemplateSection]]:
        """Return the template for this manager's statement type, if supported."""
        if self.statement_type == "IncomeStatement":
            return self.templates.INCOME_STATEMENT_TEMPLATE
        if self.statement_type == "BalanceSheet":
            return self.templates.BALANCE_SHEET_TEMPLATE
        return None

    def _get_section_base_position(self, section_name: str) -> float:
        """Get the base position for a template section (999.0 if unknown)"""
        template = self._get_template()
        if template is None:
            return 999.0

        for base_pos, name, _concepts in template:
            if name == section_name:
                return float(base_pos)

        return 999.0

    def _identify_template_sections(self,
                                    template_positioned: Dict[str, float]) -> Dict[str, List[str]]:
        """
        Identify which concepts belong to which template sections.

        Membership is derived from the template position itself: a template
        position is ``section base + index``, so a key belongs to the section
        whose ``[base, base + len(entries))`` interval contains its position.
        This assigns every key to at most one section; re-matching keys
        against template entries with fuzzy comparison could place a label
        in several sections or in none.

        Returns:
            Dict mapping section name -> keys in that section, in template
            section order; sections without any key are omitted.
        """
        template = self._get_template()
        if template is None:
            return {}

        sections: Dict[str, List[str]] = {}
        for base_pos, section_name, template_concepts in template:
            upper_bound = base_pos + len(template_concepts)
            section_concepts = [
                concept for concept, position in template_positioned.items()
                if base_pos <= position < upper_bound
            ]
            if section_concepts:
                sections[section_name] = section_concepts

        return sections

    def _concept_matches_template(self, concept: str, template_concept: str) -> bool:
        """Check if a concept matches a template concept"""
        # For XBRL concepts, do direct comparison
        if ':' in template_concept or '_gaap_' in template_concept.lower():
            return self._normalize_xbrl_concept(concept) == self._normalize_xbrl_concept(template_concept)

        # For labels, use fuzzy matching
        return self._labels_match(concept, template_concept)

    def _get_section_ranges(self, final_ordering: Dict[str, float],
                            template_sections: Dict[str, List[str]]) -> List[Tuple[float, float, str]]:
        """
        Get the position ranges occupied by each template section.

        Returns:
            Sorted list of (min position, max position, section name) for
            every section with at least one key in ``final_ordering``.
        """
        ranges = []

        for section_name, concepts in template_sections.items():
            section_positions = [final_ordering[c] for c in concepts if c in final_ordering]
            if section_positions:
                ranges.append((min(section_positions), max(section_positions), section_name))

        return sorted(ranges)

    def _find_insertion_point(self, desired_position: float,
                              section_ranges: List[Tuple[float, float, str]]) -> float:
        """
        Find appropriate insertion point that doesn't break template sections.

        Returns ``desired_position`` unchanged unless it lies inside a
        section's (inclusive) range, in which case the position 1.0 past the
        end of that section is returned.
        """
        for min_pos, max_pos, _section_name in section_ranges:
            if min_pos <= desired_position <= max_pos:
                # Position conflicts with a template section: place after it
                return max_pos + 1.0

        # No conflicts, use desired position
        return desired_position

    def _normalize_xbrl_concept(self, concept: str) -> str:
        """Delegate to templates class for concept normalization"""
        return self.templates._normalize_xbrl_concept(concept)

    def _labels_match(self, label1: str, label2: str) -> bool:
        """Delegate to templates class for label matching"""
        return self.templates._labels_match(label1, label2)