""" XBRL Parser - Top-level integration module for XBRL parsing. This module provides the XBRL class, which integrates all components of the XBRL parsing system: - Instance Document Parser - Presentation Linkbase Parser - Calculation Linkbase Parser - Definition Linkbase Parser The XBRL class provides a unified interface for working with XBRL data, organizing facts according to presentation hierarchies, validating calculations, and handling dimensional qualifiers. """ import datetime from pathlib import Path from textwrap import dedent from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union if TYPE_CHECKING: from edgar.xbrl.facts import FactQuery from edgar.xbrl.models import Fact, Footnote import pandas as pd from rich import box from rich.table import Column, Table from rich.table import Table as RichTable from edgar.attachments import Attachments from edgar.core import log from edgar.richtools import repr_rich, strip_ansi_text from edgar.xbrl.core import STANDARD_LABEL from edgar.xbrl.models import PresentationNode from edgar.xbrl.parsers import XBRLParser from edgar.xbrl.period_selector import select_periods from edgar.xbrl.periods import get_period_views from edgar.xbrl.rendering import RenderedStatement, generate_rich_representation, render_statement from edgar.xbrl.statement_resolver import StatementResolver from edgar.xbrl.statements import statement_to_concepts class XBRLFilingWithNoXbrlData(Exception): """Exception raised when a filing does not contain XBRL data.""" def __init__(self, message: str): super().__init__(message) class XBRLAttachments: """ An adapter for the Attachments class that provides easy access to the XBRL documents. """ def __init__(self, attachments: Attachments): self._documents = dict() if attachments.data_files: for attachment in attachments.data_files: if attachment.document_type in ["XML", 'EX-101.INS'] and attachment.extension.endswith( ('.xml', '.XML')): content = attachment.content if ' bool: """ Determine if a statement should display dimensioned line items. Args: statement_type: Type of statement ('BalanceSheet', 'IncomeStatement', etc.) role_definition: The definition of the statement role Returns: bool: True if dimensions should be displayed, False otherwise """ # Look for keywords in role definition that suggest dimensional breakdowns dimension_keywords = [ 'segment', 'geography', 'geographic', 'region', 'product', 'business', 'by country', 'by region', 'by product', 'by segment', 'revenues by' ] role_def_lower = role_definition.lower() if role_definition else "" # For core financial statements, check if they contain segment information if statement_type in ['BalanceSheet', 'IncomeStatement', 'CashFlowStatement', 'StatementOfEquity', 'ComprehensiveIncome']: # Allow dimensional display if the role definition suggests segment/product breakdown if any(keyword in role_def_lower for keyword in dimension_keywords): return True # For income statements specifically, check if there are segment-related dimensional facts if statement_type == 'IncomeStatement': # Check if there are facts with ProductOrServiceAxis dimensions try: # Look for revenue facts with ProductOrServiceAxis dimensions revenue_concepts = [ 'us-gaap:RevenueFromContractWithCustomerExcludingAssessedTax', 'us-gaap:Revenues', 'us-gaap:SalesRevenueNet' ] for _fact_key, fact in self.parser.facts.items(): # Check if this is a revenue-related concept concept_name = fact.element_id if hasattr(fact, 'element_id') else getattr(fact, 'concept', str(fact)) if any(revenue_concept in concept_name for revenue_concept in revenue_concepts): # Check if this fact has ProductOrServiceAxis dimension context = self.parser.contexts.get(fact.context_ref) if context and hasattr(context, 'dimensions') and context.dimensions: for dim_name, _dim_value in context.dimensions.items(): if 'ProductOrServiceAxis' in dim_name: return True return False except Exception: # If any error occurs, default to False return False # For other core statements, skip dimensional display by default return False # For non-core statements, check if they contain dimensional breakdowns return any(keyword in role_def_lower for keyword in dimension_keywords) @property def element_catalog(self): return self.parser.element_catalog @property def contexts(self): return self.parser.contexts @property def footnotes(self): """Access to XBRL footnotes.""" return self.parser.footnotes @property def _facts(self): return self.parser.facts @property def units(self): return self.parser.units @property def presentation_roles(self): return self.parser.presentation_roles @property def presentation_trees(self): return self.parser.presentation_trees @property def calculation_roles(self): return self.parser.calculation_roles @property def calculation_trees(self): return self.parser.calculation_trees @property def definition_roles(self): return self.parser.definition_roles @property def tables(self): return self.parser.tables @property def axes(self): return self.parser.axes @property def domains(self): return self.parser.domains @property def entity_info(self): return self.parser.entity_info @property def reporting_periods(self): return self.parser.reporting_periods @property def period_of_report(self) -> Optional[str]: if 'document_period_end_date' in self.entity_info: period = self.entity_info['document_period_end_date'] return period.strftime('%Y-%m-%d') if isinstance(period, datetime.date) else period return None @property def entity_name(self): return self.entity_info.get('entity_name') @property def document_type(self): return self.entity_info.get('document_type') @property def context_period_map(self): return self.parser.context_period_map @classmethod def from_directory(cls, directory_path: Union[str, Path]) -> 'XBRL': """ Parse all XBRL files in a directory. Args: directory_path: Path to directory containing XBRL files Returns: XBRL object with parsed data """ xbrl = cls() xbrl.parser.parse_directory(directory_path) # Try to create legacy instance as well for compatibility directory = Path(directory_path) for file_path in directory.glob("*"): if file_path.is_file() and file_path.name.lower().endswith('.xml') and ' 'XBRL': """ Create an XBRL object from individual files. Args: instance_file: Path to instance document file schema_file: Path to schema file presentation_file: Path to presentation linkbase file calculation_file: Path to calculation linkbase file definition_file: Path to definition linkbase file label_file: Path to label linkbase file Returns: XBRL object with parsed data """ xbrl = cls() # Parse schema first if schema_file: xbrl.parser.parse_schema(schema_file) # Parse linkbase files if label_file: xbrl.parser.parse_labels(label_file) if presentation_file: xbrl.parser.parse_presentation(presentation_file) if calculation_file: xbrl.parser.parse_calculation(calculation_file) if definition_file: xbrl.parser.parse_definition(definition_file) # Parse instance last if instance_file: xbrl.parser.parse_instance(instance_file) return xbrl @classmethod def from_filing(cls, filing) -> Optional['XBRL']: """ Create an XBRL object from a Filing object. Args: filing: Filing object with attachments containing XBRL files Returns: XBRL object with parsed data """ if filing.form.endswith("/A"): log.warning(dedent(f""" {filing} is an amended filing and may not contain full XBRL data e.g. some statements might be missing. Consider using the original filing instead if available with `get_filings(form="10-K", amendments=False)` """)) xbrl = cls() xbrl_attachments = XBRLAttachments(filing.attachments) if xbrl_attachments.empty: log.warning(f"No XBRL attachments found in filing {filing}") return None if xbrl_attachments.get('schema'): xbrl.parser.parse_schema_content(xbrl_attachments.get('schema').content) if xbrl_attachments.get('label'): xbrl.parser.parse_labels_content(xbrl_attachments.get('label').content) if xbrl_attachments.get('presentation'): xbrl.parser.parse_presentation_content(xbrl_attachments.get('presentation').content) if xbrl_attachments.get('calculation'): xbrl.parser.parse_calculation_content(xbrl_attachments.get('calculation').content) if xbrl_attachments.get('definition'): xbrl.parser.parse_definition_content(xbrl_attachments.get('definition').content) if xbrl_attachments.get('instance'): xbrl.parser.parse_instance_content(xbrl_attachments.get('instance').content) return xbrl @property def statements(self): from edgar.xbrl.statements import Statements return Statements(self) @property def facts(self): from edgar.xbrl.facts import FactsView if not hasattr(self, '_facts_view'): self._facts_view = FactsView(self) return self._facts_view @property def current_period(self): """ Convenient access to current period financial data. Provides simplified access to the most recent period's financial data without comparative information. This addresses common use cases where users only need the current period data. Returns: CurrentPeriodView: Interface for accessing current period data Example: >>> xbrl = filing.xbrl() >>> current = xbrl.current_period >>> balance_sheet = current.balance_sheet() >>> income = current.income_statement(raw_concepts=True) """ from edgar.xbrl.current_period import CurrentPeriodView if not hasattr(self, '_current_period_view'): self._current_period_view = CurrentPeriodView(self) return self._current_period_view def query(self, include_dimensions: bool = True, include_contexts: bool = False, include_element_info: bool = False) -> 'FactQuery': """ Start a new query for XBRL facts. """ fact_query = self.facts.query() if not include_dimensions: fact_query = fact_query.exclude_dimensions() if not include_contexts: fact_query = fact_query.exclude_contexts() if not include_element_info: fact_query = fact_query.exclude_element_info() return fact_query def get_all_statements(self) -> List[Dict[str, Any]]: """ Get all available financial statements. Returns: List of statement metadata (role, definition, element count) """ # Return cached result if available if self._all_statements_cached is not None: return self._all_statements_cached statements = [] # Reset indices self._statement_indices = {} self._statement_by_standard_name = {} self._statement_by_primary_concept = {} self._statement_by_role_uri = {} self._statement_by_role_name = {} for role, tree in self.presentation_trees.items(): # Check if this role appears to be a financial statement role_def = tree.definition.lower() statement_type = None primary_concept = next(iter(tree.all_nodes)) statement_category = None # First try to match using statement_to_concepts (for backward compatibility) for statement_alias, statement_info in statement_to_concepts.items(): if primary_concept == statement_info.concept: if 'parenthetical' in role_def: statement_type = f"{statement_alias}Parenthetical" else: statement_type = statement_alias if 'BalanceSheet' not in statement_type: break # If we didn't find a match, try additional patterns for notes and disclosures if not statement_type: if 'us-gaap_NotesToFinancialStatementsAbstract' in primary_concept or 'note' in role_def: statement_type = "Notes" statement_category = "note" elif 'us-gaap_DisclosuresAbstract' in primary_concept or 'disclosure' in role_def: statement_type = "Disclosures" statement_category = "disclosure" elif 'us-gaap_AccountingPoliciesAbstract' in primary_concept or 'accounting policies' in role_def: statement_type = "AccountingPolicies" statement_category = "note" elif 'us-gaap_SegmentDisclosureAbstract' in primary_concept or 'segment' in role_def: statement_type = "SegmentDisclosure" statement_category = "disclosure" # Try to extract role name from URI role_name = role.split('/')[-1] if '/' in role else role.split('#')[-1] if '#' in role else '' # Create the statement metadata statement = { 'role': role, 'definition': tree.definition, 'element_count': len(tree.all_nodes), 'type': statement_type, 'primary_concept': primary_concept, 'role_name': role_name, 'category': statement_category # This will be None for backward compatibility unless set above } statements.append(statement) # Build lookup indices # By role URI self._statement_by_role_uri[role] = statement # By role name (short name) if role_name: role_name_lower = role_name.lower() if role_name_lower not in self._statement_by_role_name: self._statement_by_role_name[role_name_lower] = [] self._statement_by_role_name[role_name_lower].append(statement) # By standard name if statement_type: if statement_type not in self._statement_by_standard_name: self._statement_by_standard_name[statement_type] = [] self._statement_by_standard_name[statement_type].append(statement) # By primary concept if primary_concept: if primary_concept not in self._statement_by_primary_concept: self._statement_by_primary_concept[primary_concept] = [] self._statement_by_primary_concept[primary_concept].append(statement) # Also index by definition (without spaces, lowercase) if statement['definition']: def_key = statement['definition'].lower().replace(' ', '') if def_key not in self._statement_indices: self._statement_indices[def_key] = [] self._statement_indices[def_key].append(statement) # Cache the result self._all_statements_cached = statements return statements def get_statement_by_type(self, statement_type: str, include_dimensions: bool = True) -> Optional[Dict[str, Any]]: """ Get the first statement matching the given type. Args: statement_type: Type of statement ('BalanceSheet', 'IncomeStatement', 'Notes', etc.) include_dimensions: Whether to include dimensional segment data (default: True) Returns: Statement data if found, None otherwise """ # Use find_statement instead of the flawed index-based lookup matching_statements, found_role, actual_statement_type = self.find_statement(statement_type) if not found_role or not matching_statements: return None # Get statement data using the found role statement_data = self.get_statement(found_role, should_display_dimensions=include_dimensions) if statement_data: # Extract periods from the statement data periods = {} for item in statement_data: for period_id, _value in item.get('values', {}).items(): if period_id not in periods: # Get period label from reporting_periods period_label = period_id for period in self.reporting_periods: if period['key'] == period_id: period_label = period['label'] break periods[period_id] = {'label': period_label} return { 'role': found_role, 'definition': matching_statements[0]['definition'], 'statement_type': actual_statement_type, 'periods': periods, 'data': statement_data } return None @classmethod def stitch_statements(cls, xbrl_list: List['XBRL'], statement_type: str = 'IncomeStatement', period_type: str = 'RECENT_PERIODS', max_periods: int = 3, standard: bool = True) -> Dict[str, Any]: """ Stitch together statements from multiple XBRL objects. Args: xbrl_list: List of XBRL objects, should be from the same company and ordered by date statement_type: Type of statement to stitch ('IncomeStatement', 'BalanceSheet', etc.) period_type: Type of period view to generate max_periods: Maximum number of periods to include (default: 3) standard: Whether to use standardized concept labels (default: True) Returns: Stitched statement data """ from edgar.xbrl.stitching import stitch_statements as _stitch_statements return _stitch_statements(xbrl_list, statement_type, period_type, max_periods, standard) def render_stitched_statement(self, stitched_data: Dict[str, Any], statement_title: str, statement_type: str) -> 'RichTable': """ Render a stitched statement. Args: stitched_data: Stitched statement data statement_title: Title of the statement statement_type: Type of statement ('BalanceSheet', 'IncomeStatement', etc.) Returns: RichTable: A formatted table representation of the stitched statement """ from edgar.xbrl.stitching import render_stitched_statement as _render_stitched_statement return _render_stitched_statement(stitched_data, statement_title, statement_type, self.entity_info) def get_statement(self, role_or_type: str, period_filter: Optional[str] = None, should_display_dimensions: Optional[bool] = None) -> List[Dict[str, Any]]: """ Get a financial statement by role URI, statement type, or statement short name. Args: role_or_type: Can be one of: - Extended link role URI (e.g. "http://apple.com/role/ConsolidatedStatementOfIncome") - Statement type name (e.g. "BalanceSheet") - Statement short name (e.g. "ConsolidatedStatementOfIncome") period_filter: Optional period key to filter facts should_display_dimensions: Whether to display dimensions for this statement. If None, the method will determine based on statement type and role. Returns: List of line items with values """ # Use the centralized statement finder to get statement information matching_statements, found_role, actual_statement_type = self.find_statement(role_or_type) # If no matching statement found, return empty list if not found_role or found_role not in self.presentation_trees: return [] tree = self.presentation_trees[found_role] # Find the root element root_id = tree.root_element_id # If should_display_dimensions wasn't provided, determine it from the statement type and role if should_display_dimensions is None: role_definition = "" if matching_statements: role_definition = matching_statements[0]['definition'] # Determine whether to display dimensions should_display_dimensions = self._is_dimension_display_statement(actual_statement_type, role_definition) # Generate line items recursively line_items = [] self._generate_line_items(root_id, tree.all_nodes, line_items, period_filter, None, should_display_dimensions) # Apply revenue deduplication for income statements to fix Issue #438 if actual_statement_type == 'IncomeStatement': from edgar.xbrl.deduplication_strategy import RevenueDeduplicator line_items = RevenueDeduplicator.deduplicate_statement_items(line_items) return line_items def _generate_line_items(self, element_id: str, nodes: Dict[str, PresentationNode], result: List[Dict[str, Any]], period_filter: Optional[str] = None, path: List[str] = None, should_display_dimensions: bool = False) -> None: """ Recursively generate line items for a statement. Args: element_id: Current element ID nodes: Dictionary of presentation nodes result: List to append line items to period_filter: Optional period key to filter facts path: Current path in hierarchy should_display_dimensions: Whether to display dimensions for this statement """ if element_id not in nodes: return # Update path if path is None: path = [] current_path = path + [element_id] # Get node information node = nodes[element_id] # Get label label = node.display_label # Get values and decimals across periods values = {} decimals = {} # Store decimals info for each period units = {} # Store unit_ref for each period period_types = {} # Store period_type ('instant' or 'duration') for each period # Issue #463: Get balance and weight from element catalog and calculation trees # (same approach as FactsView.get_facts()) balance = None # Debit/credit classification from XBRL schema weight = None # Calculation weight from calculation linkbase # Get balance from element catalog element_id_normalized = element_id.replace(':', '_') if element_id_normalized in self.element_catalog: element = self.element_catalog[element_id_normalized] balance = element.balance if balance is None: # Fallback to static US-GAAP mapping from edgar.xbrl.parsers.concepts import get_balance_type balance = get_balance_type(element_id) # Get weight from calculation trees (Issue #463) if hasattr(self, 'calculation_trees') and self.calculation_trees: for calc_tree in self.calculation_trees.values(): if element_id_normalized in calc_tree.all_nodes: calc_node = calc_tree.all_nodes[element_id_normalized] weight = calc_node.weight break # Use first weight found # Calculate preferred_sign from preferred_label (for Issue #463) # This determines display transformation: -1 = negate, 1 = as-is, None = not specified preferred_sign_value = None if node.preferred_label: # Check if this is a negatedLabel (indicates value should be negated for display) # Use pattern matching to support any XBRL namespace version (2003, 2009, future versions) # Matches: 'negatedLabel', 'negatedTerseLabel', 'http://www.xbrl.org/YYYY/role/negated*Label', etc. label_lower = node.preferred_label.lower() is_negated = 'negated' in label_lower and ( label_lower.startswith('negated') or # Short form: 'negatedLabel' '/role/negated' in label_lower # Full URI: 'http://www.xbrl.org/*/role/negated*' ) preferred_sign_value = -1 if is_negated else 1 # Find facts for any of these concept names all_relevant_facts = self._find_facts_for_element(node.element_name, period_filter) # Group facts by period for better selection facts_by_period = {} # Process all found facts and group by period for context_id, wrapped_fact in all_relevant_facts.items(): # Get period key for this context period_key = self.context_period_map.get(context_id) if not period_key: continue # Skip if no period key found # Initialize period entry if not exists if period_key not in facts_by_period: facts_by_period[period_key] = [] # Add this fact to the period facts_by_period[period_key].append((context_id, wrapped_fact)) # should_display_dimensions is now passed as a parameter from the calling method # Process facts by period, with different handling based on statement type from collections import defaultdict dimensioned_facts = defaultdict(list) # For dimensioned statement types for period_key, period_facts in facts_by_period.items(): if should_display_dimensions: # For statements that should display dimensions, group facts by dimension for context_id, wrapped_fact in period_facts: fact = wrapped_fact['fact'] dimension_info = wrapped_fact['dimension_info'] dimension_key = wrapped_fact['dimension_key'] if dimension_info: # Use the dimension_key we already generated dim_key_str = dimension_key # Store dimensioned fact with the full dimension metadata dimensioned_facts[dim_key_str].append((period_key, fact, dimension_info)) else: # This is a non-dimensioned fact for this concept, use in the main item if not values.get(period_key): values[period_key] = fact.numeric_value if fact.numeric_value is not None else fact.value # Store the decimals info for proper scaling if fact.decimals is not None: try: if fact.decimals == 'INF': decimals[period_key] = 0 # Infinite precision, no scaling else: decimals[period_key] = int(fact.decimals) except (ValueError, TypeError): decimals[period_key] = 0 # Default # Store unit_ref for this period units[period_key] = fact.unit_ref # Store period_type from context if context_id in self.contexts: context = self.contexts[context_id] if hasattr(context, 'period') and context.period: pt = context.period.get('type') if isinstance(context.period, dict) else getattr(context.period, 'type', None) period_types[period_key] = pt else: # For standard financial statements, prefer non-dimensioned facts # If only one fact, use it if len(period_facts) == 1: context_id, wrapped_fact = period_facts[0] fact = wrapped_fact['fact'] else: # Multiple facts for same period - prioritize based on dimensions # Sort facts by preference: no dimensions first, then by dimension count (fewer dimensions preferred) sorted_facts = [] for ctx_id, wrapped_fact in period_facts: dimension_count = len(wrapped_fact['dimension_info']) sorted_facts.append((dimension_count, ctx_id, wrapped_fact)) # Sort by dimension count (no dimensions or fewer dimensions first) sorted_facts.sort() # Use the first fact (with fewest dimensions) _, context_id, wrapped_fact = sorted_facts[0] fact = wrapped_fact['fact'] # Store the value values[period_key] = fact.numeric_value if fact.numeric_value is not None else fact.value # Store the decimals info for proper scaling if fact.decimals is not None: try: if fact.decimals == 'INF': decimals[period_key] = 0 # Infinite precision, no scaling else: decimals[period_key] = int(fact.decimals) except (ValueError, TypeError): decimals[period_key] = 0 # Default if decimals can't be converted # Store unit_ref for this period units[period_key] = fact.unit_ref # Store period_type from context if context_id in self.contexts: context = self.contexts[context_id] if hasattr(context, 'period') and context.period: pt = context.period.get('type') if isinstance(context.period, dict) else getattr(context.period, 'type', None) period_types[period_key] = pt # Create preferred_signs dict for all periods (same value for all periods of this concept) preferred_signs = {} if preferred_sign_value is not None: for period_key in values.keys(): preferred_signs[period_key] = preferred_sign_value # For dimensional statements with dimension data, handle the parent item specially if should_display_dimensions and dimensioned_facts: # Create parent line item with total values AND dimensional children # This ensures users see both the total (e.g., Total Revenue = $25,500M) # and the dimensional breakdown (e.g., Auto Revenue = $19,878M, Energy = $3,014M) line_item = { 'concept': element_id, 'name': node.element_name, 'all_names': [node.element_name], 'label': label, # Keep original label, don't add colon 'values': values, # Show the total values 'decimals': decimals, # Include decimals for formatting 'units': units, # Include unit_ref for each period 'period_types': period_types, # Include period_type for each period 'preferred_signs': preferred_signs, # Include preferred_sign for display (Issue #463) 'balance': balance, # Include balance (debit/credit) for display (Issue #463) 'weight': weight, # Include calculation weight for metadata (Issue #463) 'level': node.depth, 'preferred_label': node.preferred_label, 'is_abstract': node.is_abstract, # Issue #450: Use node's actual abstract flag 'children': node.children, 'has_values': len(values) > 0, # True if we have total values 'has_dimension_children': True # Mark as having dimension children } else: # Non-dimensional case: Create normal line item with values line_item = { 'concept': element_id, 'name': node.element_name, 'all_names': [node.element_name], 'label': label, 'values': values, 'decimals': decimals, # Add decimals info for formatting 'units': units, # Include unit_ref for each period 'period_types': period_types, # Include period_type for each period 'preferred_signs': preferred_signs, # Include preferred_sign for display (Issue #463) 'balance': balance, # Include balance (debit/credit) for display (Issue #463) 'weight': weight, # Include calculation weight for metadata (Issue #463) 'level': node.depth, 'preferred_label': node.preferred_label, 'is_abstract': node.is_abstract, 'children': node.children, 'has_values': len(values) > 0 # Flag to indicate if we found values } # Add to result result.append(line_item) # For dimensional statements, add dimensioned facts as child line items if should_display_dimensions and dimensioned_facts: # Add each dimension as a child line item with increased depth for dim_key, facts_list in dimensioned_facts.items(): dim_values = {} dim_decimals = {} dim_units = {} # Store unit_ref for each period dim_period_types = {} # Store period_type for each period dim_metadata = None # Store metadata from the first fact # Collect values for each period for fact_data in facts_list: try: # Unpack with consistent 3-part tuples from our updated code period_key, fact, dimensions_info = fact_data # Store the dimension metadata from the first fact if dim_metadata is None: dim_metadata = dimensions_info # Extract value from fact dim_values[period_key] = fact.numeric_value if fact.numeric_value is not None else fact.value except (ValueError, TypeError, IndexError) as e: # Try to handle older format (period_key, fact) tuple for backward compatibility try: if isinstance(fact_data, tuple) and len(fact_data) == 2: period_key, fact = fact_data dim_values[ period_key] = fact.numeric_value if fact.numeric_value is not None else fact.value except Exception: # Log the error and continue log.warning(f"Error processing dimension fact data: {e}") continue # Store decimals if fact.decimals is not None: try: if fact.decimals == 'INF': dim_decimals[period_key] = 0 else: dim_decimals[period_key] = int(fact.decimals) except (ValueError, TypeError): dim_decimals[period_key] = 0 # Store unit_ref for this period dim_units[period_key] = fact.unit_ref # Store period_type from context context_id = fact.context_ref if context_id in self.contexts: context = self.contexts[context_id] if hasattr(context, 'period') and context.period: pt = context.period.get('type') if isinstance(context.period, dict) else getattr(context.period, 'type', None) dim_period_types[period_key] = pt # For better display, use the member label for dimension items, # but make sure we don't add the parent concept name as well # Default to the full dimension key (e.g., "Region: Americas") display_label = dim_key # Try various member label formats based on dimension structure if dim_metadata: if len(dim_metadata) == 1: # For single dimensions, just use the member label (e.g., "Americas") display_label = dim_metadata[0]['member_label'] else: # For multiple dimensions, create a combined label with all member names # (e.g., "Americas - iPhone") member_labels = [info['member_label'] for info in dim_metadata if 'member_label' in info] if member_labels: display_label = " - ".join(member_labels) # Create preferred_signs dict for dimensional line items (same value for all periods) dim_preferred_signs = {} if preferred_sign_value is not None: for period_key in dim_values.keys(): dim_preferred_signs[period_key] = preferred_sign_value # Create dimension line item dim_line_item = { 'concept': element_id, # Use same concept 'name': node.element_name, 'all_names': [node.element_name], 'label': display_label, # Use optimized dimension label 'full_dimension_label': dim_key, # Keep full dimension notation for reference 'values': dim_values, 'decimals': dim_decimals, 'units': dim_units, # Include unit_ref for each period 'period_types': dim_period_types, # Include period_type for each period 'preferred_signs': dim_preferred_signs, # Include preferred_sign for display (Issue #463) 'level': node.depth + 1, # Increase depth by 1 'preferred_label': node.preferred_label, 'is_abstract': False, 'children': [], 'has_values': len(dim_values) > 0, 'is_dimension': True, # Mark as a dimension item 'dimension_metadata': dim_metadata # Store full dimension information } # Add to result result.append(dim_line_item) # Process children for child_id in node.children: self._generate_line_items(child_id, nodes, result, period_filter, current_path, should_display_dimensions) def _find_facts_for_element(self, element_name: str, period_filter: Optional[str] = None, dimensions: Optional[Dict[str, str]] = None) -> Dict[str, Any]: """ Find facts for a specific element, optionally filtered by period and dimensions. Args: element_name: Element name to find facts for period_filter: Optional period key to filter contexts dimensions: Optional dictionary of dimension names to dimension values to filter by Returns: Dictionary of facts by context ID with dimension information attached """ if not element_name: return {} # No element name provided relevant_facts = {} # Check each context for context_id in self.contexts: # Use parser's get_fact method which handles normalization internally fact = self.parser.get_fact(element_name, context_id) if fact: # If period filter is specified, check if context matches period if period_filter: period_key = self.context_period_map.get(context_id) if period_key != period_filter: continue # Skip if period doesn't match # If dimensions are specified, check if context has matching dimensions if dimensions: context = self.contexts.get(context_id) if not context or not hasattr(context, 'dimensions'): continue # Skip if context doesn't have dimensions # Check if all specified dimensions match matches_all_dimensions = True for dim_name, dim_value in dimensions.items(): # Normalize dimension name if it contains a colon normalized_dim_name = dim_name.replace(':', '_') # Check if this dimension exists and matches the expected value if normalized_dim_name not in context.dimensions or context.dimensions[ normalized_dim_name] != dim_value: matches_all_dimensions = False break if not matches_all_dimensions: continue # Skip if dimensions don't match # Get the context and extract dimension information context = self.contexts.get(context_id) # Create a wrapper around the fact with dimension information wrapped_fact = { 'fact': fact, 'dimension_info': [], 'dimension_key': "" } if context and hasattr(context, 'dimensions') and context.dimensions: # Build rich dimension information with formatted labels dimension_info = [] dim_keys = [] for dim_name, dim_value in sorted(context.dimensions.items()): dim_value = dim_value.replace(":", "_") # Initialize with technical names dim_label = dim_name mem_label = dim_value # Get richer label information from element catalog dim_element = None mem_element = None # Try to get human-readable dimension name if dim_name in self.element_catalog: dim_element = self.element_catalog[dim_name] # Try different label roles in order of preference for role in ['http://www.xbrl.org/2003/role/terseLabel', 'http://www.xbrl.org/2003/role/label', 'http://www.xbrl.org/2003/role/verboseLabel']: if role in dim_element.labels: dim_label = dim_element.labels[role] break # Try to get human-readable member name if dim_value in self.element_catalog: mem_element = self.element_catalog[dim_value] # Try different label roles in order of preference for role in ['http://www.xbrl.org/2003/role/terseLabel', 'http://www.xbrl.org/2003/role/label', 'http://www.xbrl.org/2003/role/verboseLabel']: if role in mem_element.labels: mem_label = mem_element.labels[role] break # Clean up labels (remove [Axis], [Member], etc.) dim_label = dim_label.replace('[Axis]', '').replace('[Domain]', '').strip() mem_label = mem_label.replace('[Member]', '').strip() # Format key for display format_key = f"{dim_label}: {mem_label}" dim_keys.append(format_key) # Store rich dimension information dimension_info.append({ 'dimension': dim_name, 'member': dim_value, 'dimension_label': dim_label, 'member_label': mem_label, 'format_key': format_key, 'dimension_element': dim_element, 'member_element': mem_element }) # Store dimension information in the wrapper wrapped_fact['dimension_info'] = dimension_info wrapped_fact['dimension_key'] = ", ".join(sorted(dim_keys)) # If we get here, all filters passed relevant_facts[context_id] = wrapped_fact return relevant_facts def get_period_views(self, statement_type: str) -> List[Dict[str, Any]]: """ Get available period views for a statement type. Args: statement_type: Type of statement to get period views for Returns: List of period view options with name, description, and period keys """ return get_period_views(self, statement_type) def get_statements_by_category(self, category: str) -> List[Dict[str, Any]]: """ Get all statements matching a specific category. Args: category: Category of statements to find ('statement', 'note', 'disclosure', 'document', or 'other') Returns: List of statement metadata matching the category """ # Ensure indices are built if not self._all_statements_cached: self.get_all_statements() result = [] # Find all statements with matching category for stmt in self._all_statements_cached: if stmt.get('category') == category: result.append(stmt) return result def find_statement(self, statement_type: str, is_parenthetical: bool = False) -> Tuple[ List[Dict[str, Any]], Optional[str], str]: """ Find a statement by type, role, or name. Args: statement_type: Type of statement (e.g., "BalanceSheet") or role URI or statement name is_parenthetical: Whether to look for a parenthetical statement Returns: Tuple of: - List of matching statements - Found role URI (or None if not found) - Actual statement type (may be different from input if matched by role/name) """ # Initialize statement resolver if not already done if self._statement_resolver is None: self._statement_resolver = StatementResolver(self) # Use the enhanced statement resolver matching_statements, found_role, actual_statement_type, confidence = self._statement_resolver.find_statement( statement_type, is_parenthetical ) # For backward compatibility, ensure indices are built if not self._all_statements_cached: self.get_all_statements() # If we couldn't find anything with the resolver, fall back to the old implementation if not matching_statements: # Original implementation (fallback) matching_statements = [] found_role = None actual_statement_type = statement_type # Try to find the statement by standard name first if statement_type in self._statement_by_standard_name: matching_statements = self._statement_by_standard_name[statement_type] if matching_statements: found_role = matching_statements[0]['role'] # If not found by standard name, try by role URI if not matching_statements and statement_type.startswith( 'http') and statement_type in self._statement_by_role_uri: matching_statements = [self._statement_by_role_uri[statement_type]] found_role = statement_type # If not found, try by role name (case-insensitive) if not matching_statements: role_or_type_lower = statement_type.lower() if role_or_type_lower in self._statement_by_role_name: matching_statements = self._statement_by_role_name[role_or_type_lower] if matching_statements: found_role = matching_statements[0]['role'] # If still not found, try by definition if not matching_statements: def_key = statement_type.lower().replace(' ', '') if def_key in self._statement_indices: matching_statements = self._statement_indices[def_key] if matching_statements: found_role = matching_statements[0]['role'] # If still not found, try partial matching on role name if not matching_statements: for role_name, statements in self._statement_by_role_name.items(): if statement_type.lower() in role_name: matching_statements = statements found_role = statements[0]['role'] break # Update actual statement type if we found a match if matching_statements and matching_statements[0]['type']: actual_statement_type = matching_statements[0]['type'] return matching_statements, found_role, actual_statement_type def render_statement(self, statement_type: str = "BalanceSheet", period_filter: Optional[str] = None, period_view: Optional[str] = None, standard: bool = True, show_date_range: bool = False, parenthetical: bool = False, include_dimensions: bool = True) -> Optional[RenderedStatement]: """ Render a statement in a rich table format similar to how it would appear in an actual filing. Args: statement_type: Type of statement to render (e.g., "BalanceSheet", "IncomeStatement") or a specific statement role/name (e.g., "CONSOLIDATEDBALANCESHEETS") period_filter: Optional period key to filter by specific reporting period period_view: Optional name of a predefined period view (e.g., "Quarterly: Current vs Previous") standard: Whether to use standardized concept labels (default: True) show_date_range: Whether to show full date ranges for duration periods (default: False) parenthetical: Whether to look for a parenthetical statement (default: False) include_dimensions: Whether to include dimensional segment data (default: True) Returns: RichTable: A formatted table representation of the statement """ # Find the statement using the unified statement finder with parenthetical support matching_statements, found_role, actual_statement_type = self.find_statement(statement_type, parenthetical) # Get statement definition from matching statements role_definition = "" if matching_statements: role_definition = matching_statements[0]['definition'] # Determine if this statement should display dimensions should_display_dimensions = include_dimensions and self._is_dimension_display_statement(actual_statement_type, role_definition) # Get the statement data with dimension display flag statement_data = self.get_statement(statement_type, period_filter, should_display_dimensions) if not statement_data: return None # Get the statement title statement_info = statement_to_concepts.get(actual_statement_type) if statement_info: statement_title = statement_info.title else: # Try to get a nice title from the role definition if role_definition: statement_title = role_definition.split(' - ')[-1].strip() else: statement_title = statement_type # Add "Parenthetical" to the title if appropriate if parenthetical: statement_title = f"{statement_title} (Parenthetical)" # Get periods to display using unified period selection periods_to_display = select_periods( self, actual_statement_type, max_periods=4 ) # Render the statement return render_statement( statement_data, periods_to_display, statement_title, actual_statement_type, self.entity_info, standard, show_date_range, show_comparisons=True, xbrl_instance=self ) def to_pandas(self, statement_role: Optional[str] = None, standard: bool = True) -> Dict[str, pd.DataFrame]: """ Convert XBRL data to pandas DataFrames. Args: statement_role: Optional role URI to convert only a specific statement standard: Whether to use standardized concept labels (default: True) Returns: Dictionary of DataFrames for different aspects of the XBRL data """ dataframes = {} # Convert contexts to DataFrame context_data = [] for context_id, context in self.contexts.items(): ctx_dict = context.model_dump() ctx_dict['context_id'] = context_id # Extract entity info if 'entity' in ctx_dict and ctx_dict['entity']: ctx_dict['entity_identifier'] = ctx_dict['entity'].get('identifier') ctx_dict['entity_scheme'] = ctx_dict['entity'].get('scheme') # Extract period info if 'period' in ctx_dict and ctx_dict['period']: ctx_dict['period_type'] = ctx_dict['period'].get('type') if ctx_dict['period_type'] == 'instant': ctx_dict['period_instant'] = ctx_dict['period'].get('instant') elif ctx_dict['period_type'] == 'duration': ctx_dict['period_start'] = ctx_dict['period'].get('startDate') ctx_dict['period_end'] = ctx_dict['period'].get('endDate') # Extract dimensions if 'dimensions' in ctx_dict and ctx_dict['dimensions']: for dim_name, dim_value in ctx_dict['dimensions'].items(): dim_key = f"dim_{dim_name.replace(':', '_')}" ctx_dict[dim_key] = dim_value context_data.append(ctx_dict) if context_data: dataframes['contexts'] = pd.DataFrame(context_data) # Convert facts to DataFrame fact_data = [] for fact_key, fact in self._facts.items(): fact_dict = fact.model_dump() fact_dict['fact_key'] = fact_key # Try to get additional information if fact.context_ref in self.contexts: context = self.contexts[fact.context_ref] # Add period information if 'period' in context.model_dump() and context.period: fact_dict['period_type'] = context.period.get('type') if fact_dict['period_type'] == 'instant': fact_dict['period_instant'] = context.period.get('instant') elif fact_dict['period_type'] == 'duration': fact_dict['period_start'] = context.period.get('startDate') fact_dict['period_end'] = context.period.get('endDate') # Add entity information if 'entity' in context.model_dump() and context.entity: fact_dict['entity_identifier'] = context.entity.get('identifier') # Add dimensions if 'dimensions' in context.model_dump() and context.dimensions: for dim_name, dim_value in context.dimensions.items(): dim_key = f"dim_{dim_name.replace(':', '_')}" fact_dict[dim_key] = dim_value # Try to get element information element_id = fact.element_id if element_id in self.element_catalog: element = self.element_catalog[element_id] fact_dict['element_name'] = element.name fact_dict['element_type'] = element.data_type fact_dict['element_period_type'] = element.period_type fact_dict['element_balance'] = element.balance # Add label label = None if element.labels: # Try standard label first label = element.labels.get(STANDARD_LABEL) if not label: # Take first available label label = next(iter(element.labels.values()), None) fact_dict['element_label'] = label fact_data.append(fact_dict) if fact_data: dataframes['facts'] = pd.DataFrame(fact_data) # Convert entity info to DataFrame if self.entity_info: dataframes['entity_info'] = pd.DataFrame([self.entity_info]) # Convert specific statement if requested if statement_role: # Try direct role URI statement_data = self.get_statement(statement_role) # If not found, try by statement type if not statement_data and not statement_role.startswith('http'): # Find the role for this statement type all_statements = self.get_all_statements() matching_statements = [stmt for stmt in all_statements if stmt['type'] == statement_role] if matching_statements: role = matching_statements[0]['role'] statement_data = self.get_statement(role) # Convert statement data to DataFrame if found if statement_data: # Apply standardization if requested if standard: # Get statement type for context stmt_type = statement_role if not stmt_type.startswith('http'): stmt_type = statement_role else: # Try to determine statement type from role all_statements = self.get_all_statements() for stmt in all_statements: if stmt['role'] == statement_role: stmt_type = stmt['type'] break # Add statement type to each item for item in statement_data: item['statement_type'] = stmt_type # Apply standardization from edgar.xbrl.standardization import ConceptMapper, initialize_default_mappings, standardize_statement mapper = ConceptMapper(initialize_default_mappings(read_only=True)) statement_data = standardize_statement(statement_data, mapper) # Create rows for the DataFrame rows = [] # Add columns for all found periods all_periods = set() for item in statement_data: for period in item.get('values', {}).keys(): all_periods.add(period) # Sort periods (typically instant or duration_start_end format) sorted_periods = sorted(all_periods) for item in statement_data: row = { 'concept': item['concept'], 'label': item['label'], 'level': item['level'], 'is_abstract': item['is_abstract'], 'has_values': item.get('has_values', False), } # Add original label if standardized if 'original_label' in item: row['original_label'] = item['original_label'] # Add period values for period in sorted_periods: value = item.get('values', {}).get(period) row[period] = value rows.append(row) if rows: dataframes['statement'] = pd.DataFrame(rows) # Rename columns to remove duration/instant prefixes dataframes['statement'].columns = [ col.replace('duration_', '').replace('instant_', '') for col in dataframes['statement'].columns ] return dataframes def get_footnotes_for_fact(self, fact_id: str) -> List['Footnote']: """Get all footnotes associated with a specific fact ID. Args: fact_id: The ID of the fact to get footnotes for Returns: List of Footnote objects associated with the fact """ footnotes = [] # First check if any fact has this ID and get its footnote references for fact in self.parser.facts.values(): if fact.fact_id == fact_id: # Get the footnote objects for each footnote ID for footnote_id in fact.footnotes: if footnote_id in self.parser.footnotes: footnotes.append(self.parser.footnotes[footnote_id]) break return footnotes def get_facts_with_footnotes(self) -> Dict[str, 'Fact']: """Get all facts that have associated footnotes. Returns: Dictionary of fact_key -> Fact for all facts with footnotes """ facts_with_footnotes = {} for key, fact in self.parser.facts.items(): if fact.footnotes: facts_with_footnotes[key] = fact return facts_with_footnotes def get_currency_for_fact(self, element_name: str, period_key: str) -> Optional[str]: """ Get currency for a specific fact/period on-demand with caching. Args: element_name: The XBRL element name period_key: The period key to look up Returns: Currency measure string (e.g., 'iso4217:EUR') or None if not found """ # Create cache key cache_key = f"{element_name}_{period_key}" # Check cache first if not hasattr(self, '_currency_cache'): self._currency_cache = {} if cache_key in self._currency_cache: return self._currency_cache[cache_key] # Find facts for this element and period facts = self._find_facts_for_element(element_name, period_key) # Look for the first fact with currency information currency_measure = None for _, wrapped_fact in facts.items(): fact = wrapped_fact['fact'] if hasattr(fact, 'unit_ref') and fact.unit_ref and fact.unit_ref in self.units: unit_info = self.units[fact.unit_ref] if 'measure' in unit_info: currency_measure = unit_info['measure'] break # Cache the result (including None values to avoid repeated lookups) self._currency_cache[cache_key] = currency_measure return currency_measure def __rich__(self): """Rich representation for pretty printing in console.""" return generate_rich_representation(self) def __repr__(self): return repr_rich(self.__rich__()) def text(self, max_tokens: int = 2000) -> str: """ Get AI-optimized text representation of XBRL document. Returns a compact Markdown-KV format optimized for LLM consumption, including entity information, filing details, period coverage, available statements, and common usage patterns. This format uses 64.7% fewer tokens than the visual repr() format while retaining all essential information. Args: max_tokens: Target token budget (currently not enforced, reserved for future use) Returns: Compact Markdown-KV text representation optimized for AI consumption Example: >>> xbrl = filing.xbrl() >>> text = xbrl.text() >>> print(text) **Entity:** Apple Inc. (AAPL) **CIK:** 0000320193 **Form:** 10-K ... """ lines = [] # Entity information if self.entity_info: entity_name = self.entity_info.get('entity_name', 'Unknown Entity') ticker = self.entity_info.get('ticker', '') cik = self.entity_info.get('identifier', '') # Entity line with ticker if available entity_line = f"**Entity:** {entity_name}" if ticker: entity_line += f" ({ticker})" lines.append(entity_line) if cik: lines.append(f"**CIK:** {cik}") # Filing details doc_type = self.entity_info.get('document_type', '') if doc_type: lines.append(f"**Form:** {doc_type}") fiscal_year = self.entity_info.get('fiscal_year', '') fiscal_period = self.entity_info.get('fiscal_period', '') period_end = self.entity_info.get('document_period_end_date', '') if fiscal_period and fiscal_year: period_display = f"Fiscal Year {fiscal_year}" if fiscal_period == 'FY' else f"{fiscal_period} {fiscal_year}" if period_end: period_display += f" (ended {period_end})" lines.append(f"**Fiscal Period:** {period_display}") # Data volume lines.append(f"**Facts:** {len(self._facts):,}") lines.append(f"**Contexts:** {len(self.contexts):,}") # Period coverage if self.reporting_periods: lines.append("") lines.append("**Available Data Coverage:**") # Categorize periods annual_periods = [] quarterly_periods = [] for period in self.reporting_periods[:10]: label = period.get('label', '') if not label: continue if 'Annual:' in label or 'FY' in label.upper(): # Extract fiscal year import re year_match = re.search(r'to .* (\d{4})', label) if year_match: annual_periods.append(f"FY {year_match.group(1)}") else: annual_periods.append(label) elif 'Quarterly:' in label or any(q in label for q in ['Q1', 'Q2', 'Q3', 'Q4']): clean_label = label.replace('Quarterly:', '').strip() quarterly_periods.append(clean_label) if annual_periods: lines.append(f" Annual: {', '.join(annual_periods[:3])}") if quarterly_periods: lines.append(f" Quarterly: {', '.join(quarterly_periods[:2])}") # Available statements statements = self.get_all_statements() if statements: lines.append("") lines.append("**Available Statements:**") # Group by core vs other statements core_statements = set() other_statements = [] core_types = {'IncomeStatement', 'BalanceSheet', 'CashFlowStatement', 'StatementOfEquity', 'ComprehensiveIncome'} for stmt in statements: stmt_type = stmt.get('type', '') if stmt_type in core_types: core_statements.add(stmt_type) elif stmt_type: other_statements.append(stmt_type) # Show core statements first (in consistent order) if core_statements: ordered_core = [s for s in ['IncomeStatement', 'ComprehensiveIncome', 'BalanceSheet', 'StatementOfEquity', 'CashFlowStatement'] if s in core_statements] lines.append(f" Core: {', '.join(ordered_core)}") if other_statements and len(other_statements) <= 5: lines.append(f" Other: {', '.join(other_statements)}") elif other_statements: lines.append(f" Other: {len(other_statements)} additional statements") # Common actions (compact version) lines.append("") lines.append("**Common Actions:**") lines.append(" # List all available statements") lines.append(" xbrl.statements") lines.append("") lines.append(" # View core financial statements") lines.append(" stmt = xbrl.statements.income_statement()") lines.append(" stmt = xbrl.statements.balance_sheet()") lines.append(" stmt = xbrl.statements.cash_flow_statement()") lines.append(" stmt = xbrl.statements.statement_of_equity()") lines.append(" stmt = xbrl.statements.comprehensive_income()") lines.append("") lines.append(" # Get current period only (returns XBRL with filtered context)") lines.append(" current = xbrl.current_period") lines.append(" stmt = current.income_statement()") lines.append("") lines.append(" # Convert statement to DataFrame") lines.append(" df = stmt.to_dataframe()") lines.append("") lines.append(" # Query specific facts") lines.append(" revenue = xbrl.facts.query().by_concept('Revenue').to_dataframe()") lines.append("") lines.append("💡 Use xbrl.docs for comprehensive API guide") return "\n".join(lines) @property def docs(self): """ Get comprehensive documentation for the XBRL class. Returns a Docs object with detailed API documentation including usage patterns, examples, and guidance for working with XBRL data. The documentation is searchable using the .search() method. Returns: Docs: Documentation object with rich display and search capabilities Example: >>> xbrl.docs # Display full documentation >>> xbrl.docs.search("extract revenue") # Search for specific topics """ from edgar.richtools import Docs return Docs(self)