Initial commit
This commit is contained in:
72
venv/lib/python3.10/site-packages/edgar/xbrl/__init__.py
Normal file
72
venv/lib/python3.10/site-packages/edgar/xbrl/__init__.py
Normal file
@@ -0,0 +1,72 @@
|
||||
"""
|
||||
XBRL2 Module - Enhanced XBRL Processing for EdgarTools
|
||||
|
||||
This module provides enhanced parsing and processing of XBRL data,
|
||||
with support for statement standardization and multi-period statement stitching.
|
||||
|
||||
Example usage:
|
||||
|
||||
from edgar import Company
|
||||
from edgar.xbrl import XBRL, XBRLS
|
||||
|
||||
# Parse a single filing
|
||||
company = Company("AAPL")
|
||||
filing = company.latest_10k()
|
||||
xbrl = XBRL.from_filing(filing)
|
||||
|
||||
# Access statements from a single filing
|
||||
balance_sheet = xbrl.statements.balance_sheet()
|
||||
income_statement = xbrl.statements.income_statement()
|
||||
|
||||
# Render the statement or convert to DataFrame
|
||||
print(balance_sheet.render())
|
||||
df = income_statement.to_dataframe()
|
||||
|
||||
# For multi-period analysis, use XBRLS to stitch statements together
|
||||
filings = company.latest("10-K", 3) # Get 3 years of 10-K filings
|
||||
xbrls = XBRLS.from_filings(filings)
|
||||
|
||||
# Access stitched statements showing multiple years of data
|
||||
stitched_income = xbrls.statements.income_statement()
|
||||
|
||||
# Render the stitched statement or convert to DataFrame
|
||||
print(stitched_income.render())
|
||||
df = stitched_income.to_dataframe()
|
||||
"""
|
||||
|
||||
from edgar.xbrl.facts import FactQuery, FactsView
|
||||
from edgar.xbrl.rendering import RenderedStatement
|
||||
from edgar.xbrl.standardization import StandardConcept
|
||||
from edgar.xbrl.statements import Statement, Statements, StitchedStatement, StitchedStatements
|
||||
|
||||
# Export statement stitching functionality
|
||||
from edgar.xbrl.stitching import (
|
||||
XBRLS,
|
||||
StatementStitcher,
|
||||
StitchedFactQuery,
|
||||
StitchedFactsView,
|
||||
render_stitched_statement,
|
||||
stitch_statements,
|
||||
to_pandas,
|
||||
)
|
||||
from edgar.xbrl.xbrl import XBRL, XBRLFilingWithNoXbrlData
|
||||
|
||||
__all__ = [
|
||||
'XBRL',
|
||||
'XBRLFilingWithNoXbrlData',
|
||||
'XBRLS',
|
||||
'Statements',
|
||||
'Statement',
|
||||
'StitchedStatements',
|
||||
'StitchedStatement',
|
||||
'StandardConcept',
|
||||
'StatementStitcher',
|
||||
'stitch_statements',
|
||||
'render_stitched_statement',
|
||||
'RenderedStatement',
|
||||
'to_pandas',
|
||||
'FactsView',
|
||||
'FactQuery',
|
||||
'StitchedFactsView',
|
||||
'StitchedFactQuery'
|
||||
]
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,180 @@
|
||||
"""
|
||||
Abstract concept detection for XBRL elements.
|
||||
|
||||
This module provides utilities to determine if an XBRL concept should be marked as abstract,
|
||||
using multiple fallback strategies when taxonomy schema information is not available.
|
||||
|
||||
Background:
|
||||
-----------
|
||||
EdgarTools currently only parses company-specific XSD schema files included in SEC filings.
|
||||
Standard taxonomy schemas (US-GAAP, DEI, etc.) are referenced externally and not parsed.
|
||||
This means concepts from standard taxonomies are added to the element catalog without their
|
||||
abstract attribute information, defaulting to abstract=False.
|
||||
|
||||
Solution:
|
||||
---------
|
||||
This module implements a multi-tier fallback strategy for abstract detection:
|
||||
1. Trust schema abstract attribute (if available and True)
|
||||
2. Check known abstract concepts (explicit list)
|
||||
3. Pattern matching on concept name
|
||||
4. Structural heuristics (has children but no values)
|
||||
|
||||
See: Issue #450 - Statement of Equity rendering problems
|
||||
"""
|
||||
|
||||
import re
|
||||
from typing import List, Set
|
||||
|
||||
# Known abstract concepts from US-GAAP taxonomy
|
||||
# These are explicitly marked abstract="true" in the US-GAAP taxonomy schemas
|
||||
# Known abstract concepts from US-GAAP taxonomy.
# These are explicitly marked abstract="true" in the US-GAAP taxonomy schemas.
KNOWN_ABSTRACT_CONCEPTS: Set[str] = {
    # Statement abstracts
    'us-gaap_StatementOfFinancialPositionAbstract',
    'us-gaap_StatementOfStockholdersEquityAbstract',
    'us-gaap_StatementOfIncomeAndComprehensiveIncomeAbstract',
    'us-gaap_StatementOfCashFlowsAbstract',
    'us-gaap_IncomeStatementAbstract',

    # Roll forward abstracts
    'us-gaap_IncreaseDecreaseInStockholdersEquityRollForward',
    'us-gaap_PropertyPlantAndEquipmentRollForward',
    'us-gaap_IntangibleAssetsRollForward',
    'us-gaap_LongTermDebtRollForward',

    # Reconciliation abstracts
    'us-gaap_AdjustmentsToReconcileNetIncomeLossToCashProvidedByUsedInOperatingActivitiesAbstract',

    # Table and axis abstracts
    'us-gaap_StatementTable',
    'us-gaap_StatementLineItems',
    'us-gaap_StatementEquityComponentsAxis',
    'us-gaap_EquityComponentDomain',

    # Accounting policies
    'us-gaap_AccountingPoliciesAbstract',
    'us-gaap_SignificantAccountingPoliciesTextBlock',

    # Disclosure abstracts
    'us-gaap_DisclosureTextBlockAbstract',

    # Document and entity information (DEI)
    'dei_DocumentInformationAbstract',
    'dei_EntityInformationAbstract',
    'dei_CoverAbstract',
}

# Patterns that indicate a concept is likely abstract, based on XBRL
# naming conventions. The list can be extended at runtime via
# add_abstract_pattern(), so patterns are matched on each call rather
# than pre-compiled into a frozen structure.
ABSTRACT_CONCEPT_PATTERNS: List[str] = [
    r'.*Abstract$',     # Ends with "Abstract"
    r'.*RollForward$',  # Ends with "RollForward" (roll forward tables)
    r'.*Table$',        # Ends with "Table" (dimensional tables)
    r'.*Axis$',         # Ends with "Axis" (dimensional axes)
    r'.*Domain$',       # Ends with "Domain" (dimension domains)
    r'.*LineItems$',    # Ends with "LineItems" (line item tables)
    r'.*TextBlock$',    # Ends with "TextBlock" (disclosure text blocks)
]


def is_abstract_concept(
    concept_name: str,
    schema_abstract: bool = False,
    has_children: bool = False,
    has_values: bool = False
) -> bool:
    """
    Determine if an XBRL concept should be marked as abstract using multiple fallback strategies.

    Strategy priority:
    1. Trust schema if it explicitly says abstract=True
    2. Check against known abstract concepts list
    3. Apply pattern matching on concept name
    4. Use structural heuristics (has children but no values)
    5. Default to False

    Args:
        concept_name: The XBRL concept name (e.g., "us-gaap_StatementOfStockholdersEquityAbstract")
        schema_abstract: The abstract attribute from the schema (if available)
        has_children: Whether this concept has children in the presentation tree
        has_values: Whether this concept has fact values in the instance

    Returns:
        True if the concept should be marked as abstract, False otherwise

    Examples:
        >>> is_abstract_concept('us-gaap_StatementOfStockholdersEquityAbstract')
        True

        >>> is_abstract_concept('us-gaap_Revenue')
        False

        >>> is_abstract_concept('us-gaap_SomethingRollForward')
        True

        >>> is_abstract_concept('us-gaap_UnknownConcept', has_children=True, has_values=False)
        True
    """
    # Strategy 1: Trust schema if it says True
    if schema_abstract:
        return True

    # Strategy 2: Check known abstract concepts
    if concept_name in KNOWN_ABSTRACT_CONCEPTS:
        return True

    # Strategy 3: Pattern matching on the concept name
    if any(re.match(pattern, concept_name) for pattern in ABSTRACT_CONCEPT_PATTERNS):
        return True

    # Strategy 4: Structural heuristics.
    # A concept with children in the presentation tree but no fact values
    # is likely an abstract header.
    if has_children and not has_values:
        return True

    # No strategy matched. schema_abstract is necessarily falsy here, so
    # return an explicit bool instead of echoing it back (callers passing a
    # falsy non-bool such as None/0 would otherwise violate the -> bool
    # contract).
    return False
|
||||
|
||||
|
||||
def add_known_abstract_concept(concept_name: str) -> None:
    """
    Register a concept as known-abstract at runtime.

    Useful when a newly discovered abstract concept matches neither the
    shipped known-abstracts list nor any naming pattern.

    Args:
        concept_name: The XBRL concept name to register
    """
    # update() mutates the shared module-level set in place.
    KNOWN_ABSTRACT_CONCEPTS.update({concept_name})
|
||||
|
||||
|
||||
def add_abstract_pattern(pattern: str) -> None:
    """
    Append a regex pattern to the abstract-detection pattern list.

    Args:
        pattern: Regular expression pattern identifying abstract concepts
    """
    # extend() mutates the shared module-level list in place.
    ABSTRACT_CONCEPT_PATTERNS.extend([pattern])
|
||||
|
||||
|
||||
def get_known_abstract_concepts() -> Set[str]:
    """
    Return a snapshot of the known abstract concepts.

    Returns:
        A new set containing the concept names currently known to be
        abstract; mutating it does not affect the module state.
    """
    return set(KNOWN_ABSTRACT_CONCEPTS)
|
||||
|
||||
|
||||
def get_abstract_patterns() -> List[str]:
    """
    Return a snapshot of the abstract concept patterns.

    Returns:
        A new list of the regex patterns currently used to identify
        abstract concepts; mutating it does not affect the module state.
    """
    return list(ABSTRACT_CONCEPT_PATTERNS)
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
125
venv/lib/python3.10/site-packages/edgar/xbrl/analysis/fraud.py
Normal file
125
venv/lib/python3.10/site-packages/edgar/xbrl/analysis/fraud.py
Normal file
@@ -0,0 +1,125 @@
|
||||
"""Financial fraud detection module.
|
||||
|
||||
This module provides tools for detecting potential financial fraud and anomalies:
|
||||
- Benford's Law Analysis for digit distribution anomalies
|
||||
- Altman Z-Score for bankruptcy risk
|
||||
- Beneish M-Score for earnings manipulation
|
||||
- Piotroski F-Score for financial strength
|
||||
"""
|
||||
|
||||
import math
|
||||
from collections import Counter
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from ..standardization import StandardConcept
|
||||
from .metrics import AltmanZScore, BeneishMScore, PiotroskiFScore
|
||||
|
||||
|
||||
@dataclass
|
||||
class BenfordResult:
|
||||
"""Results from Benford's Law analysis."""
|
||||
observed_dist: Dict[int, float] # Observed digit distribution
|
||||
expected_dist: Dict[int, float] # Expected Benford distribution
|
||||
chi_square: float # Chi-square statistic
|
||||
p_value: float # P-value for goodness of fit
|
||||
anomalous: bool # Whether distribution is significantly different
|
||||
|
||||
def __repr__(self) -> str:
|
||||
return f"{'Anomalous' if self.anomalous else 'Normal'} (p={self.p_value:.3f})"
|
||||
|
||||
class FraudDetector:
|
||||
"""Detect potential financial fraud using multiple methods."""
|
||||
|
||||
def __init__(self, xbrl):
|
||||
"""Initialize with an XBRL instance."""
|
||||
self.xbrl = xbrl
|
||||
self.altman = AltmanZScore(xbrl)
|
||||
self.beneish = BeneishMScore(xbrl)
|
||||
self.piotroski = PiotroskiFScore(xbrl)
|
||||
|
||||
def analyze_digit_distribution(self, values: List[float], significance: float = 0.05) -> Optional[BenfordResult]:
|
||||
"""Analyze digit distribution using Benford's Law.
|
||||
|
||||
Args:
|
||||
values: List of numeric values to analyze
|
||||
significance: P-value threshold for anomaly detection
|
||||
|
||||
Returns:
|
||||
BenfordResult with analysis results, or None if insufficient data
|
||||
"""
|
||||
if len(values) < 10: # Need reasonable sample size
|
||||
return None
|
||||
|
||||
# Get first digits
|
||||
first_digits = [int(str(abs(float(v))).lstrip('0')[0]) for v in values if v != 0]
|
||||
if not first_digits:
|
||||
return None
|
||||
|
||||
# Calculate observed distribution
|
||||
digit_counts = Counter(first_digits)
|
||||
total = len(first_digits)
|
||||
observed_dist = {d: digit_counts.get(d, 0) / total for d in range(1, 10)}
|
||||
|
||||
# Calculate expected Benford distribution
|
||||
expected_dist = {d: math.log10(1 + 1/d) for d in range(1, 10)}
|
||||
|
||||
# Perform chi-square test
|
||||
chi_square = 0
|
||||
for d in range(1, 10):
|
||||
expected = expected_dist[d] * total
|
||||
observed = digit_counts.get(d, 0)
|
||||
chi_square += (observed - expected) ** 2 / expected
|
||||
|
||||
# Get p-value (8 degrees of freedom for digits 1-9)
|
||||
from scipy.stats import chi2
|
||||
p_value = 1 - chi2.cdf(chi_square, 8)
|
||||
|
||||
return BenfordResult(
|
||||
observed_dist=observed_dist,
|
||||
expected_dist=expected_dist,
|
||||
chi_square=chi_square,
|
||||
p_value=p_value,
|
||||
anomalous=p_value < significance
|
||||
)
|
||||
|
||||
def analyze_all(self) -> Dict[str, Any]:
|
||||
"""Run all fraud detection analyses.
|
||||
|
||||
Returns:
|
||||
Dict containing:
|
||||
- altman_z: Altman Z-Score results
|
||||
- beneish_m: Beneish M-Score results
|
||||
- piotroski_f: Piotroski F-Score results
|
||||
- benford: Benford's Law analysis results
|
||||
"""
|
||||
# Get financial values for Benford analysis
|
||||
values = []
|
||||
for concept in [
|
||||
StandardConcept.TOTAL_ASSETS,
|
||||
StandardConcept.TOTAL_LIABILITIES,
|
||||
StandardConcept.TOTAL_EQUITY,
|
||||
StandardConcept.REVENUE,
|
||||
StandardConcept.NET_INCOME,
|
||||
StandardConcept.OPERATING_INCOME,
|
||||
StandardConcept.OPERATING_CASH_FLOW
|
||||
]:
|
||||
if hasattr(self.xbrl.statements, 'balance_sheet'):
|
||||
bs_value = self.altman._get_value(concept)
|
||||
if bs_value:
|
||||
values.append(bs_value)
|
||||
if hasattr(self.xbrl.statements, 'income_statement'):
|
||||
is_value = self.altman._get_value(concept, "IncomeStatement")
|
||||
if is_value:
|
||||
values.append(is_value)
|
||||
if hasattr(self.xbrl.statements, 'cash_flow'):
|
||||
cf_value = self.altman._get_value(concept, "CashFlow")
|
||||
if cf_value:
|
||||
values.append(cf_value)
|
||||
|
||||
return {
|
||||
'altman_z': self.altman.calculate(),
|
||||
'beneish_m': self.beneish.calculate(),
|
||||
'piotroski_f': self.piotroski.calculate(),
|
||||
'benford': self.analyze_digit_distribution(values)
|
||||
}
|
||||
411
venv/lib/python3.10/site-packages/edgar/xbrl/analysis/metrics.py
Normal file
411
venv/lib/python3.10/site-packages/edgar/xbrl/analysis/metrics.py
Normal file
@@ -0,0 +1,411 @@
|
||||
"""Financial metrics and analysis module.
|
||||
|
||||
This module provides various financial metrics and analysis tools including:
|
||||
- Altman Z-Score for bankruptcy prediction
|
||||
- Beneish M-Score for earnings manipulation detection
|
||||
- Piotroski F-Score for financial strength assessment
|
||||
- Montier C-Score for earnings manipulation detection
|
||||
"""
|
||||
|
||||
from dataclasses import dataclass
|
||||
from typing import Dict, Optional
|
||||
|
||||
from ..standardization import MappingStore, StandardConcept
|
||||
|
||||
|
||||
@dataclass
class MetricResult:
    """Container for metric calculation results with metadata."""
    value: float                  # The computed metric value
    components: Dict[str, float]  # Named intermediate ratios/signals behind the value
    interpretation: str           # Human-readable reading of the value
    period: str                   # Label of the reporting period the value refers to

    def __repr__(self) -> str:
        """Render as 'value (interpretation)' with two decimal places."""
        return "{:.2f} ({})".format(self.value, self.interpretation)
|
||||
|
||||
class FinancialMetrics:
    """Base class for financial metrics calculations.

    At construction, each available financial statement (balance sheet,
    income statement, cash flow) of the given XBRL instance is converted to
    a DataFrame and cached, together with the label of its first period.
    Subclasses look up standardized concepts via ``_get_value``.
    """

    def __init__(self, xbrl):
        """Initialize with an XBRL instance."""
        self.xbrl = xbrl
        # Cached per-statement DataFrames; remain None when the statement
        # is absent from the filing.
        self._balance_sheet_df = None
        self._income_stmt_df = None
        self._cash_flow_df = None
        # First period label of each statement (presumably the most recent
        # period — TODO confirm the ordering of .periods).
        self._bs_period = None
        self._is_period = None
        self._cf_period = None

        # Initialize concept mappings (StandardConcept -> company-specific
        # concept names).
        self._mapping_store = MappingStore()

        # Initialize dataframes if statements exist
        if self.xbrl.statements.balance_sheet:
            bs = self.xbrl.statements.balance_sheet
            self._balance_sheet_df = bs.to_dataframe()
            self._bs_period = bs.periods[0].label

        if self.xbrl.statements.income_statement:
            is_ = self.xbrl.statements.income_statement
            self._income_stmt_df = is_.to_dataframe()
            self._is_period = is_.periods[0].label

        if self.xbrl.statements.cash_flow:
            cf = self.xbrl.statements.cash_flow
            self._cash_flow_df = cf.to_dataframe()
            self._cf_period = cf.periods[0].label

    def _get_value(self, label: StandardConcept, statement_type: str = "BalanceSheet", period_offset: int = 0) -> Optional[float]:
        """Safely extract a numeric value using the standardized label from the appropriate statement.

        Args:
            label: The standardized concept to retrieve
            statement_type: Type of financial statement ("BalanceSheet", "IncomeStatement", "CashFlow")
            period_offset: Offset from current period (0 for current, -1 for prior, etc.)

        Returns:
            The numeric value if found, None otherwise
        """
        try:
            # Resolve the standardized label to the company's concept names.
            concepts = self._mapping_store.get_company_concepts(label)
            if not concepts:
                return None

            # Select the cached DataFrame for the requested statement type;
            # unknown types (or a missing statement) fall through to None.
            df = None
            if statement_type == "BalanceSheet" and self._balance_sheet_df is not None:
                df = self._balance_sheet_df
            elif statement_type == "IncomeStatement" and self._income_stmt_df is not None:
                df = self._income_stmt_df
            elif statement_type == "CashFlow" and self._cash_flow_df is not None:
                df = self._cash_flow_df

            if df is None:
                return None

            # Get all available periods (one DataFrame column per period).
            periods = df.columns.tolist()
            if not periods:
                return None

            # Get target period based on offset.
            # NOTE(review): offset -1 selects the *last* column
            # (periods[-1]); that equals "the prior period" only when the
            # frame has exactly two period columns or the columns run
            # newest-to-oldest with the prior period last — confirm the
            # column ordering produced by to_dataframe().
            try:
                target_period = periods[period_offset]
            except IndexError:
                return None

            # Try each mapped concept until one is present in the frame;
            # missing rows raise KeyError and are skipped.
            for concept in concepts:
                try:
                    return df.loc[concept, target_period]
                except KeyError:
                    continue

            return None
        except ValueError:
            # Any ValueError during mapping/lookup is treated as
            # "value unavailable" rather than propagated.
            return None
|
||||
|
||||
class AltmanZScore(FinancialMetrics):
    """Calculate Altman Z-Score for bankruptcy prediction."""

    def calculate(self) -> Optional[MetricResult]:
        """Calculate Altman Z-Score.

        Z-Score = 1.2X₁ + 1.4X₂ + 3.3X₃ + 0.6X₄ + 1.0X₅
        where:
            X₁ = Working Capital / Total Assets
            X₂ = Retained Earnings / Total Assets
            X₃ = EBIT / Total Assets
            X₄ = Market Value of Equity / Total Liabilities
            X₅ = Sales / Total Assets

        Returns:
            MetricResult with the score, its five component ratios and an
            interpretation, or None when any input is unavailable or a
            denominator (total assets / total liabilities) is zero.
        """
        # Get required values
        working_capital = self._get_working_capital()
        total_assets = self._get_value(StandardConcept.TOTAL_ASSETS)
        retained_earnings = self._get_value(StandardConcept.RETAINED_EARNINGS)
        ebit = self._get_value(StandardConcept.OPERATING_INCOME, "IncomeStatement")
        market_value = self._get_value(StandardConcept.TOTAL_EQUITY)  # Using book value as proxy
        total_liabilities = self._get_value(StandardConcept.TOTAL_LIABILITIES)
        revenue = self._get_value(StandardConcept.REVENUE, "IncomeStatement")

        # Require every input to be present. BUGFIX: check 'is None' rather
        # than truthiness (the previous all([...]) check) so that legitimate
        # zero values — e.g. zero retained earnings or zero working capital —
        # no longer abort the calculation. This matches _get_working_capital's
        # explicit None handling.
        inputs = [working_capital, total_assets, retained_earnings, ebit,
                  market_value, total_liabilities, revenue]
        if any(v is None for v in inputs):
            return None

        # Normalize to float for the arithmetic below.
        working_capital = float(working_capital)
        total_assets = float(total_assets)
        retained_earnings = float(retained_earnings)
        ebit = float(ebit)
        market_value = float(market_value)
        total_liabilities = float(total_liabilities)
        revenue = float(revenue)

        # Denominators must be non-zero (the previous truthiness check
        # rejected these implicitly; keep returning None for them).
        if total_assets == 0 or total_liabilities == 0:
            return None

        # Calculate ratios
        x1 = working_capital / total_assets
        x2 = retained_earnings / total_assets
        x3 = ebit / total_assets
        x4 = market_value / total_liabilities
        x5 = revenue / total_assets

        # Calculate Z-Score
        z_score = 1.2*x1 + 1.4*x2 + 3.3*x3 + 0.6*x4 + 1.0*x5

        # Interpret score (standard Altman cut-offs)
        if z_score > 2.99:
            interpretation = "Safe Zone: Low probability of financial distress"
        elif z_score > 1.81:
            interpretation = "Grey Zone: Moderate risk of financial distress"
        else:
            interpretation = "Distress Zone: High risk of financial distress"

        return MetricResult(
            value=z_score,
            components={
                'working_capital_to_assets': x1,
                'retained_earnings_to_assets': x2,
                'ebit_to_assets': x3,
                'equity_to_liabilities': x4,
                'sales_to_assets': x5
            },
            interpretation=interpretation,
            period=self._bs_period if self._bs_period is not None else ""
        )

    def _get_working_capital(self) -> Optional[float]:
        """Working capital = current assets - current liabilities, or None if either is missing."""
        current_assets = self._get_value(StandardConcept.TOTAL_CURRENT_ASSETS)
        current_liab = self._get_value(StandardConcept.TOTAL_CURRENT_LIABILITIES)

        if current_assets is None or current_liab is None:
            return None

        return current_assets - current_liab
|
||||
|
||||
class BeneishMScore(FinancialMetrics):
    """Calculate Beneish M-Score for earnings manipulation detection."""

    def calculate(self) -> Optional[MetricResult]:
        """Calculate Beneish M-Score.

        M-Score = -4.84 + 0.92*DSRI + 0.528*GMI + 0.404*AQI + 0.892*SGI + 0.115*DEPI
                  - 0.172*SGAI + 4.679*TATA - 0.327*LVGI

        where:
            DSRI = Days Sales in Receivables Index
            GMI = Gross Margin Index
            AQI = Asset Quality Index
            SGI = Sales Growth Index
            DEPI = Depreciation Index
            SGAI = SG&A Expense Index
            TATA = Total Accruals to Total Assets
            LVGI = Leverage Index

        A score greater than -2.22 indicates a high probability of earnings manipulation.

        Returns:
            MetricResult with the score, its eight component indexes and an
            interpretation, or None when any current- or prior-year input is
            missing or zero.
        """
        # Get current year values
        receivables = self._get_value(StandardConcept.ACCOUNTS_RECEIVABLE)
        revenue = self._get_value(StandardConcept.REVENUE, "IncomeStatement")
        gross_profit = self._get_value(StandardConcept.GROSS_PROFIT, "IncomeStatement")
        total_assets = self._get_value(StandardConcept.TOTAL_ASSETS)
        ppe = self._get_value(StandardConcept.PROPERTY_PLANT_EQUIPMENT)
        depreciation = self._get_value(StandardConcept.DEPRECIATION_AMORTIZATION, "IncomeStatement")
        sga = self._get_value(StandardConcept.SGA_EXPENSE, "IncomeStatement")
        total_liabilities = self._get_value(StandardConcept.TOTAL_LIABILITIES)

        # Get prior year values (period_offset=-1; availability depends on
        # the filing containing a comparative period)
        prior_receivables = self._get_value(StandardConcept.ACCOUNTS_RECEIVABLE, period_offset=-1)
        prior_revenue = self._get_value(StandardConcept.REVENUE, "IncomeStatement", period_offset=-1)
        prior_gross_profit = self._get_value(StandardConcept.GROSS_PROFIT, "IncomeStatement", period_offset=-1)
        prior_total_assets = self._get_value(StandardConcept.TOTAL_ASSETS, period_offset=-1)
        prior_ppe = self._get_value(StandardConcept.PROPERTY_PLANT_EQUIPMENT, period_offset=-1)
        prior_depreciation = self._get_value(StandardConcept.DEPRECIATION_AMORTIZATION, "IncomeStatement", period_offset=-1)
        prior_sga = self._get_value(StandardConcept.SGA_EXPENSE, "IncomeStatement", period_offset=-1)
        prior_total_liabilities = self._get_value(StandardConcept.TOTAL_LIABILITIES, period_offset=-1)

        # Check if we have all required values.
        # NOTE: this is a truthiness check, so zero values are treated as
        # missing too. That is load-bearing here: most of these inputs feed
        # ratio denominators below (revenue, ppe, total_assets, ...), and
        # letting zeros through would raise ZeroDivisionError.
        if not all([receivables, revenue, gross_profit, total_assets, ppe, depreciation, sga, total_liabilities,
                   prior_receivables, prior_revenue, prior_gross_profit, prior_total_assets, prior_ppe,
                   prior_depreciation, prior_sga, prior_total_liabilities]):
            return None

        # Cast to float to help type checker
        receivables = float(receivables)  # type: ignore
        revenue = float(revenue)  # type: ignore
        gross_profit = float(gross_profit)  # type: ignore
        total_assets = float(total_assets)  # type: ignore
        ppe = float(ppe)  # type: ignore
        depreciation = float(depreciation)  # type: ignore
        sga = float(sga)  # type: ignore
        total_liabilities = float(total_liabilities)  # type: ignore

        prior_receivables = float(prior_receivables)  # type: ignore
        prior_revenue = float(prior_revenue)  # type: ignore
        prior_gross_profit = float(prior_gross_profit)  # type: ignore
        prior_total_assets = float(prior_total_assets)  # type: ignore
        prior_ppe = float(prior_ppe)  # type: ignore
        prior_depreciation = float(prior_depreciation)  # type: ignore
        prior_sga = float(prior_sga)  # type: ignore
        prior_total_liabilities = float(prior_total_liabilities)  # type: ignore

        # Calculate component indexes (each is a current-vs-prior ratio)
        dsri = (receivables / revenue) / (prior_receivables / prior_revenue)
        gmi = (prior_gross_profit / prior_revenue) / (gross_profit / revenue)
        aqi = ((total_assets - ppe) / total_assets) / ((prior_total_assets - prior_ppe) / prior_total_assets)
        sgi = revenue / prior_revenue
        depi = (prior_depreciation / prior_ppe) / (depreciation / ppe)
        sgai = (sga / revenue) / (prior_sga / prior_revenue)
        # NOTE(review): TATA here approximates total accruals by the change
        # in total assets over current total assets — a simplification of
        # Beneish's accrual definition; confirm this is intended.
        tata = (total_assets - prior_total_assets) / total_assets
        lvgi = (total_liabilities / total_assets) / (prior_total_liabilities / prior_total_assets)

        # Calculate M-Score
        m_score = -4.84 + 0.92*dsri + 0.528*gmi + 0.404*aqi + 0.892*sgi + \
                 0.115*depi - 0.172*sgai + 4.679*tata - 0.327*lvgi

        # Interpret score (standard Beneish threshold of -2.22)
        if m_score > -2.22:
            interpretation = "High probability of earnings manipulation"
        else:
            interpretation = "Low probability of earnings manipulation"

        return MetricResult(
            value=m_score,
            components={
                'dsri': dsri,
                'gmi': gmi,
                'aqi': aqi,
                'sgi': sgi,
                'depi': depi,
                'sgai': sgai,
                'tata': tata,
                'lvgi': lvgi
            },
            interpretation=interpretation,
            period=self._bs_period if self._bs_period is not None else ""
        )
|
||||
|
||||
class PiotroskiFScore(FinancialMetrics):
    """Calculate Piotroski F-Score for financial strength assessment."""

    def calculate(self) -> Optional[MetricResult]:
        """Calculate Piotroski F-Score.

        The F-Score is the sum of 9 binary signals (0 or 1) across three categories:

        Profitability:
            1. Return on Assets (ROA) > 0
            2. Operating Cash Flow > 0
            3. ROA(t) > ROA(t-1)
            4. Cash flow from operations > ROA

        Leverage, Liquidity and Source of Funds:
            5. Long-term debt ratio(t) < Long-term debt ratio(t-1)
            6. Current ratio(t) > Current ratio(t-1)
            7. No new shares issued

        Operating Efficiency:
            8. Gross margin(t) > Gross margin(t-1)
            9. Asset turnover(t) > Asset turnover(t-1)

        A score of 8-9 indicates a strong company, while 0-2 indicates a weak company.
        Signals whose inputs are unavailable are skipped (contribute nothing).

        Returns:
            MetricResult with the total score and per-signal components, or
            None when the core profitability inputs are unavailable.
        """
        scores = {}
        total_score = 0

        # Get current year values
        net_income = self._get_value(StandardConcept.NET_INCOME, "IncomeStatement")
        total_assets = self._get_value(StandardConcept.TOTAL_ASSETS)
        operating_cash_flow = self._get_value(StandardConcept.OPERATING_CASH_FLOW, "CashFlow")
        long_term_debt = self._get_value(StandardConcept.LONG_TERM_DEBT)
        current_assets = self._get_value(StandardConcept.TOTAL_CURRENT_ASSETS)
        current_liab = self._get_value(StandardConcept.TOTAL_CURRENT_LIABILITIES)
        shares_outstanding = self._get_value(StandardConcept.SHARES_OUTSTANDING)
        revenue = self._get_value(StandardConcept.REVENUE, "IncomeStatement")
        gross_profit = self._get_value(StandardConcept.GROSS_PROFIT, "IncomeStatement")

        # Get prior year values
        prior_net_income = self._get_value(StandardConcept.NET_INCOME, "IncomeStatement", -1)
        prior_total_assets = self._get_value(StandardConcept.TOTAL_ASSETS, "BalanceSheet", -1)
        prior_long_term_debt = self._get_value(StandardConcept.LONG_TERM_DEBT, "BalanceSheet", -1)
        prior_current_assets = self._get_value(StandardConcept.TOTAL_CURRENT_ASSETS, "BalanceSheet", -1)
        prior_current_liab = self._get_value(StandardConcept.TOTAL_CURRENT_LIABILITIES, "BalanceSheet", -1)
        prior_shares_outstanding = self._get_value(StandardConcept.SHARES_OUTSTANDING, "BalanceSheet", -1)
        prior_revenue = self._get_value(StandardConcept.REVENUE, "IncomeStatement", -1)
        prior_gross_profit = self._get_value(StandardConcept.GROSS_PROFIT, "IncomeStatement", -1)

        # The core profitability inputs are mandatory. Check 'is None' for
        # numerators so that a legitimate zero net income / cash flow does
        # not abort the whole score; total_assets is the ROA denominator and
        # must be both present and non-zero (truthiness covers both).
        if net_income is None or not total_assets or operating_cash_flow is None:
            return None

        # Cast to float for the arithmetic below.
        net_income = float(net_income)
        total_assets = float(total_assets)
        operating_cash_flow = float(operating_cash_flow)

        # 1. ROA > 0
        roa = net_income / total_assets
        scores['roa_positive'] = 1 if roa > 0 else 0
        total_score += scores['roa_positive']

        # 2. Operating Cash Flow > 0
        scores['cfoa_positive'] = 1 if operating_cash_flow > 0 else 0
        total_score += scores['cfoa_positive']

        # 3. ROA(t) > ROA(t-1)
        # prior_total_assets is a denominator, so it must be present AND
        # non-zero (plain truthiness check).
        if prior_net_income is not None and prior_total_assets:
            prior_roa = float(prior_net_income) / float(prior_total_assets)
            scores['roa_higher'] = 1 if roa > prior_roa else 0
            total_score += scores['roa_higher']

        # 4. Cash flow from operations > ROA
        scores['quality_earnings'] = 1 if operating_cash_flow / total_assets > roa else 0
        total_score += scores['quality_earnings']

        # 5. Long-term debt ratio(t) < Long-term debt ratio(t-1)
        # BUGFIX: the original never checked prior_total_assets here and
        # crashed with a TypeError (float(None)) when only the prior debt
        # value resolved. Debt values themselves may legitimately be zero.
        if (long_term_debt is not None and prior_long_term_debt is not None
                and prior_total_assets):
            ltdr = float(long_term_debt) / total_assets
            prior_ltdr = float(prior_long_term_debt) / float(prior_total_assets)
            scores['leverage_lower'] = 1 if ltdr < prior_ltdr else 0
            total_score += scores['leverage_lower']

        # 6. Current ratio(t) > Current ratio(t-1)
        # Liabilities are denominators (must be non-zero); assets only need
        # to be present.
        if (current_assets is not None and prior_current_assets is not None
                and current_liab and prior_current_liab):
            curr_ratio = float(current_assets) / float(current_liab)
            prior_curr_ratio = float(prior_current_assets) / float(prior_current_liab)
            scores['liquidity_higher'] = 1 if curr_ratio > prior_curr_ratio else 0
            total_score += scores['liquidity_higher']

        # 7. No new shares issued
        if shares_outstanding is not None and prior_shares_outstanding is not None:
            scores['no_dilution'] = 1 if float(shares_outstanding) <= float(prior_shares_outstanding) else 0
            total_score += scores['no_dilution']

        # 8. Gross margin(t) > Gross margin(t-1)
        # Revenues are denominators (must be non-zero); profits may be zero.
        if (gross_profit is not None and prior_gross_profit is not None
                and revenue and prior_revenue):
            margin = float(gross_profit) / float(revenue)
            prior_margin = float(prior_gross_profit) / float(prior_revenue)
            scores['margin_higher'] = 1 if margin > prior_margin else 0
            total_score += scores['margin_higher']

        # 9. Asset turnover(t) > Asset turnover(t-1)
        # BUGFIX: the original guarded only revenue/prior_revenue and crashed
        # with a TypeError when prior_total_assets was None.
        if (revenue is not None and prior_revenue is not None
                and prior_total_assets):
            turnover = float(revenue) / total_assets
            prior_turnover = float(prior_revenue) / float(prior_total_assets)
            scores['turnover_higher'] = 1 if turnover > prior_turnover else 0
            total_score += scores['turnover_higher']

        # Interpret score
        if total_score >= 8:
            interpretation = "Strong financial position"
        elif total_score >= 5:
            interpretation = "Moderate financial position"
        else:
            interpretation = "Weak financial position"

        return MetricResult(
            value=total_score,
            components=scores,
            interpretation=interpretation,
            period=self._bs_period if self._bs_period is not None else ""
        )
|
||||
1296
venv/lib/python3.10/site-packages/edgar/xbrl/analysis/ratios.py
Normal file
1296
venv/lib/python3.10/site-packages/edgar/xbrl/analysis/ratios.py
Normal file
File diff suppressed because it is too large
Load Diff
477
venv/lib/python3.10/site-packages/edgar/xbrl/core.py
Normal file
477
venv/lib/python3.10/site-packages/edgar/xbrl/core.py
Normal file
@@ -0,0 +1,477 @@
|
||||
"""
|
||||
Core utilities for XBRL processing.
|
||||
|
||||
This module provides common functions used throughout the XBRL parser.
|
||||
"""
|
||||
|
||||
from datetime import datetime
|
||||
from typing import Any, Dict, List, Optional, Tuple, Union
|
||||
|
||||
# Constants for label roles
|
||||
STANDARD_LABEL = "http://www.xbrl.org/2003/role/label"
|
||||
TERSE_LABEL = "http://www.xbrl.org/2003/role/terseLabel"
|
||||
PERIOD_START_LABEL = "http://www.xbrl.org/2003/role/periodStartLabel"
|
||||
PERIOD_END_LABEL = "http://www.xbrl.org/2003/role/periodEndLabel"
|
||||
TOTAL_LABEL = "http://www.xbrl.org/2003/role/totalLabel"
|
||||
|
||||
# XML namespaces
|
||||
NAMESPACES = {
|
||||
"xlink": "http://www.w3.org/1999/xlink",
|
||||
"xsd": "http://www.w3.org/2001/XMLSchema",
|
||||
"xbrli": "http://www.xbrl.org/2003/instance",
|
||||
"link": "http://www.xbrl.org/2003/linkbase"
|
||||
}
|
||||
|
||||
|
||||
def parse_date(date_str: str) -> datetime.date:
    """
    Parse an XBRL date string to a date object.

    Args:
        date_str: Date string in YYYY-MM-DD format

    Returns:
        datetime.date object

    Raises:
        ValueError: If the string is empty, malformed, or not a valid
            calendar date (e.g. "2023-09-31" or "2023-02-30").
    """
    if not date_str:
        raise ValueError("Empty date string provided")

    try:
        # strptime fully validates the calendar date: it rejects impossible
        # day values such as February 30 or September 31, and February 29
        # in non-leap years, so no additional day-of-month checks are needed.
        return datetime.strptime(date_str, '%Y-%m-%d').date()
    except (ValueError, TypeError) as e:
        # Provide more specific error message
        raise ValueError(f"Invalid date format or value: {date_str} - {str(e)}") from e
|
||||
|
||||
|
||||
def format_date(date_obj: datetime.date) -> str:
    """
    Format a date object to a human-readable string.

    Args:
        date_obj: datetime.date object

    Returns:
        Formatted date string with abbreviated month and no zero-padded
        day (e.g., "Sep 5, 2023" or "Dec 31, 2024")
    """
    # Build the day from the integer attribute rather than stripping the
    # leading zero out of strftime('%d') afterwards; '%-d' is not portable.
    month_part = date_obj.strftime('%b')
    year_part = date_obj.strftime('%Y')
    return f"{month_part} {date_obj.day}, {year_part}"
|
||||
|
||||
|
||||
def extract_element_id(href: str) -> str:
    """
    Extract element ID from an XLink href.

    Args:
        href: XLink href attribute value (e.g., "schema.xsd#us-gaap_Assets")

    Returns:
        The fragment after the last '#', or the whole string if no '#'
        is present.
    """
    # rpartition places the original string in the last slot when the
    # separator is absent, matching split('#')[-1] semantics.
    return href.rpartition('#')[2]
|
||||
|
||||
|
||||
def classify_duration(days: int) -> str:
    """
    Classify a duration in days as quarterly, semi-annual, annual, etc.

    Args:
        days: Duration in days

    Returns:
        Description of the duration (e.g., "Quarterly", "Annual");
        "Period" if the length matches no standard reporting window.
    """
    # Inclusive (low, high, name) bands for common fiscal reporting windows.
    bands = (
        (85, 95, "Quarterly"),
        (175, 185, "Semi-Annual"),
        (265, 285, "Nine Months"),
        (350, 380, "Annual"),
    )
    for low, high, name in bands:
        if low <= days <= high:
            return name
    return "Period"
|
||||
|
||||
|
||||
# Labels containing any of these keywords denote share counts, per-share
# amounts, or ratios — values that must not influence the monetary display
# scale of a statement.
_SCALE_EXEMPT_KEYWORDS = (
    'shares', 'share', 'stock', 'eps', 'earnings per share',
    'weighted average', 'number of', 'per common share', 'per share',
    'per basic', 'per diluted', 'outstanding', 'issued',
    'ratio', 'margin', 'percentage', 'rate', 'per cent'
)


def _is_scale_exempt_item(item: Dict[str, Any]) -> bool:
    """Return True if the item has no values or looks like a share count/ratio."""
    if not item.get('has_values', False) or not item.get('values'):
        return True
    label_lower = item['label'].lower()
    return any(keyword in label_lower for keyword in _SCALE_EXEMPT_KEYWORDS)


def determine_dominant_scale(statement_data: List[Dict[str, Any]],
                             periods_to_display: List[Tuple[str, str]]) -> int:
    """
    Determine the dominant scale (thousands, millions, billions) for a statement.

    This looks at all monetary values in the statement and determines the most
    appropriate scale to use for the "In millions/billions/thousands" note.
    Share counts, per-share amounts and ratios are excluded from consideration
    (see _is_scale_exempt_item).

    Args:
        statement_data: The statement data with items and values
        periods_to_display: List of period keys and labels to consider

    Returns:
        int: The dominant scale (-3 for thousands, -6 for millions,
             -9 for billions, 0 for no scaling). Defaults to -6 (millions)
             when no scale can be inferred.
    """
    # Strategy 1: use the XBRL 'decimals' attributes if any are present.
    all_decimals = []
    for item in statement_data:
        if _is_scale_exempt_item(item):
            continue
        for period_key, _ in periods_to_display:
            if period_key in item.get('decimals', {}):
                decimals = item['decimals'][period_key]
                if isinstance(decimals, int):
                    all_decimals.append(decimals)

    if all_decimals:
        # Count occurrences of each scale bucket.
        scale_counts = {
            -9: 0,  # billions
            -6: 0,  # millions
            -3: 0,  # thousands
            0: 0    # no scaling
        }

        for decimals in all_decimals:
            if decimals <= -9:
                scale_counts[-9] += 1
            elif decimals <= -6:
                scale_counts[-6] += 1
            elif decimals <= -3:
                scale_counts[-3] += 1
            else:
                scale_counts[0] += 1

        # Find the most common scale, excluding "no scaling" — any real
        # scale beats leaving values unscaled.
        most_common_scale = 0
        max_count = 0
        for scale, count in scale_counts.items():
            if scale != 0 and count > max_count:
                max_count = count
                most_common_scale = scale

        return most_common_scale

    # Strategy 2: no decimals information — infer from value magnitudes.
    all_values = []
    for item in statement_data:
        if _is_scale_exempt_item(item):
            continue
        for period_key, _ in periods_to_display:
            value = item['values'].get(period_key)
            if isinstance(value, (int, float)) and value != 0:
                all_values.append(abs(value))

    if all_values:
        # Use the median to avoid outliers dominating the choice.
        all_values.sort()
        median_value = all_values[len(all_values) // 2]

        if median_value >= 1_000_000_000:
            return -9  # billions
        elif median_value >= 1_000_000:
            return -6  # millions
        elif median_value >= 1_000:
            return -3  # thousands

    # Default to millions if we couldn't determine a scale
    return -6
|
||||
|
||||
|
||||
def get_currency_symbol(unit_measure: Optional[str]) -> str:
    """
    Get the appropriate currency symbol from a unit measure string.

    Args:
        unit_measure: Unit measure string (e.g., 'iso4217:USD', 'iso4217:EUR')

    Returns:
        Currency symbol (e.g., '$', '€', '£'); '$' (USD) when the unit
        is missing or unrecognized.
    """
    # ISO 4217 unit-measure -> display symbol lookup table.
    symbol_by_code = {
        'iso4217:USD': '$',
        'iso4217:EUR': '€',
        'iso4217:GBP': '£',
        'iso4217:JPY': '¥',
        'iso4217:CAD': 'C$',
        'iso4217:AUD': 'A$',
        'iso4217:CHF': 'CHF',
        'iso4217:CNY': '¥',
        'iso4217:INR': '₹',
        'iso4217:KRW': '₩',
        'iso4217:BRL': 'R$',
        'iso4217:MXN': 'MX$',
        'iso4217:SEK': 'kr',
        'iso4217:NOK': 'kr',
        'iso4217:DKK': 'kr',
        'iso4217:PLN': 'zł',
        'iso4217:CZK': 'Kč',
        'iso4217:HUF': 'Ft',
        'iso4217:RUB': '₽',
        'iso4217:ZAR': 'R',
        'iso4217:SGD': 'S$',
        'iso4217:HKD': 'HK$',
        'iso4217:TWD': 'NT$',
        'iso4217:THB': '฿',
        'iso4217:MYR': 'RM',
        'iso4217:IDR': 'Rp',
        'iso4217:PHP': '₱',
        'iso4217:VND': '₫',
        'iso4217:ILS': '₪',
        'iso4217:TRY': '₺',
        'iso4217:AED': 'AED',
        'iso4217:SAR': 'SR',
        'iso4217:EGP': 'E£',
        'iso4217:NGN': '₦',
    }

    if not unit_measure:
        return "$"  # Default to USD

    # Unknown codes also fall back to USD.
    return symbol_by_code.get(unit_measure, '$')
|
||||
|
||||
|
||||
def format_value(value: Union[int, float, str], is_monetary: bool, scale: int,
                 decimals: Optional[int] = None, currency_symbol: Optional[str] = None) -> str:
    """
    Format a value with appropriate scaling and formatting.

    Negative values are rendered in accounting style with parentheses,
    e.g. "$(1,234)" for monetary values and "(1,234)" otherwise.
    Zero renders as an empty string (blank statement cell); non-numeric
    input is passed through via str().

    Args:
        value: The value to format
        is_monetary: Whether the value is monetary
        scale: The scale to apply (-3 for thousands, -6 for millions, -9 for billions)
        decimals: XBRL decimals attribute value (optional)
        currency_symbol: Currency symbol to use for monetary values (default: '$')

    Returns:
        Formatted value string
    """
    # Handle non-numeric or zero values.
    # NOTE: bool is a subclass of int, so True/False would be formatted as
    # numbers here — callers are expected to pass numeric or string values.
    if not isinstance(value, (int, float)) or value == 0:
        return "" if value == 0 else str(value)

    # Apply scaling — divide down to the display unit implied by `scale`.
    scaled_value = value
    if scale <= -9:  # Billions
        scaled_value = value / 1_000_000_000
    elif scale <= -6:  # Millions
        scaled_value = value / 1_000_000
    elif scale <= -3:  # Thousands
        scaled_value = value / 1_000

    # Determine decimal places to show
    if isinstance(decimals, int):
        if decimals >= 0:
            # Positive decimals - show up to 2 decimal places
            decimal_places = min(2, decimals)
        else:
            # For negative decimals, adjust based on scaling: decimals + N
            # converts XBRL precision (power of ten) into digits remaining
            # after dividing by 10^N, clamped to [0, 2].
            if scale <= -9:  # Billions
                decimal_places = min(2, max(0, decimals + 9))
            elif scale <= -6:  # Millions
                decimal_places = min(2, max(0, decimals + 6))
            elif scale <= -3:  # Thousands
                decimal_places = min(2, max(0, decimals + 3))
            else:
                # For unscaled values, respect the decimals attribute.
                # If decimals is negative, show that many zeros to the left of decimal.
                # E.g., decimals=-2 means precision to hundreds place (two zeros after decimal).
                # NOTE(review): this shows fractional digits for a value that
                # is only precise to hundreds — looks intentional per the
                # comment above, but worth confirming against rendered output.
                decimal_places = max(0, -decimals)
    else:
        # Default decimal places when no XBRL decimals attribute is given.
        if is_monetary:
            decimal_places = 0  # Standard for financial statements
        else:
            # For non-monetary values, check if it's effectively a whole number
            if abs(round(value) - value) < 0.001:
                decimal_places = 0  # Effectively whole numbers
            else:
                decimal_places = 2  # Show 2 decimals for actual fractional values

    # Thousands separator plus the chosen number of fractional digits.
    decimal_format = f",.{decimal_places}f"

    # Format with currency symbol if monetary, otherwise just format the number
    if is_monetary:
        # Use the provided currency symbol or default to '$'
        symbol = currency_symbol if currency_symbol is not None else '$'
        if value < 0:
            # Accounting convention: symbol outside the parentheses.
            return f"{symbol}({abs(scaled_value):{decimal_format}})"
        else:
            return f"{symbol}{scaled_value:{decimal_format}}"
    else:
        # For non-monetary values, use parentheses for negative numbers
        if value < 0:
            return f"({abs(scaled_value):{decimal_format}})"
        else:
            return f"{scaled_value:{decimal_format}}"
|
||||
|
||||
|
||||
def find_previous_fiscal_year_period(instant_periods: List[Dict[str, Any]],
                                     prev_fiscal_year: int,
                                     fiscal_month: int,
                                     fiscal_day: int) -> Optional[Dict[str, Any]]:
    """
    Find the previous fiscal year period using simple matching logic.

    Args:
        instant_periods: List of instant periods sorted by date (most recent first)
        prev_fiscal_year: Previous fiscal year to find
        fiscal_month: Fiscal year end month
        fiscal_day: Fiscal year end day

    Returns:
        Previous fiscal year period or None if not found
    """
    # The first entry is the current period; scan the remainder only.
    for candidate in instant_periods[1:]:
        try:
            candidate_date = parse_date(candidate['date'])
        except (ValueError, TypeError):
            # Unparseable dates are simply skipped.
            continue

        # Match year and month exactly; allow the day to drift by up to a
        # week to accommodate 52/53-week fiscal calendars.
        if (candidate_date.year == prev_fiscal_year
                and candidate_date.month == fiscal_month
                and abs(candidate_date.day - fiscal_day) <= 7):
            return candidate

    return None
|
||||
|
||||
|
||||
def get_unit_display_name(unit_ref: Optional[str]) -> Optional[str]:
    """
    Convert unit_ref to human-readable unit name.

    Maps XBRL unit references to standard display names:
    - 'U-Monetary' / 'iso4217:USD' -> 'usd'
    - 'U-Shares' / 'shares' -> 'shares'
    - 'U-USD-per-shares' -> 'usdPerShare'
    - etc.

    Args:
        unit_ref: XBRL unit reference string

    Returns:
        Human-readable unit name or None if unit_ref is None

    Examples:
        >>> get_unit_display_name('U-Monetary')
        'usd'
        >>> get_unit_display_name('U-Shares')
        'shares'
        >>> get_unit_display_name('U-USD-per-shares')
        'usdPerShare'
    """
    if not unit_ref:
        return None

    ref = unit_ref.lower()

    # Per-share (ratio) units must be classified before the plain share and
    # monetary checks, because their refs contain both tokens.
    if 'per' in ref and 'share' in ref:
        if 'usd' in ref or 'monetary' in ref:
            return 'usdPerShare'
        if 'eur' in ref:
            return 'eurPerShare'
        if 'gbp' in ref:
            return 'gbpPerShare'
        return 'perShare'

    # Share counts (but not per-share, handled above).
    if 'share' in ref:
        return 'shares'

    # Monetary units by currency.
    if 'monetary' in ref or 'iso4217:usd' in ref or ref == 'usd':
        return 'usd'
    if 'eur' in ref:
        return 'eur'
    if 'gbp' in ref:
        return 'gbp'
    if 'jpy' in ref:
        return 'jpy'

    # Pure numbers / ratios (no unit).
    if 'pure' in ref or 'number' in ref:
        return 'number'

    # Fallback: strip common prefixes and normalize to lowercase.
    return unit_ref.replace('U-', '').replace('iso4217:', '').lower()
|
||||
|
||||
|
||||
def is_point_in_time(period_type: Optional[str]) -> Optional[bool]:
    """
    Determine if a period type represents a point-in-time value.

    Args:
        period_type: XBRL period type ('instant' or 'duration')

    Returns:
        True for 'instant' periods, False for 'duration' periods, None if period_type is None

    Examples:
        >>> is_point_in_time('instant')
        True
        >>> is_point_in_time('duration')
        False
        >>> is_point_in_time(None)
        None
    """
    # Tri-state result: propagate None, otherwise test for 'instant'.
    return None if period_type is None else period_type == 'instant'
|
||||
876
venv/lib/python3.10/site-packages/edgar/xbrl/current_period.py
Normal file
876
venv/lib/python3.10/site-packages/edgar/xbrl/current_period.py
Normal file
@@ -0,0 +1,876 @@
|
||||
"""
|
||||
Current Period API - Convenient access to current period financial data.
|
||||
|
||||
This module provides the CurrentPeriodView class that offers simplified access
|
||||
to the most recent period's financial data without comparative information,
|
||||
addressing GitHub issue #425.
|
||||
|
||||
Key features:
|
||||
- Automatic detection of the current (most recent) period
|
||||
- Direct access to balance sheet, income statement, and cash flow data
|
||||
- Support for raw XBRL concept names (unprocessed)
|
||||
- Notes and disclosures access
|
||||
- Beginner-friendly API design
|
||||
"""
|
||||
|
||||
from datetime import date, datetime
|
||||
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from edgar.core import log
|
||||
from edgar.richtools import repr_rich
|
||||
from edgar.xbrl.exceptions import StatementNotFound
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from edgar.xbrl.statements import Statement
|
||||
|
||||
|
||||
class CurrentPeriodView:
|
||||
"""
|
||||
Convenient access to current period financial data.
|
||||
|
||||
This class provides simplified access to the most recent period's
|
||||
financial data without comparative information. It automatically
|
||||
detects the current period and provides easy access to key statements.
|
||||
|
||||
Example usage:
|
||||
>>> xbrl = filing.xbrl()
|
||||
>>> current = xbrl.current_period
|
||||
>>> balance_sheet = current.balance_sheet()
|
||||
>>> income_statement = current.income_statement(raw_concepts=True)
|
||||
"""
|
||||
|
||||
    def __init__(self, xbrl):
        """
        Initialize CurrentPeriodView with an XBRL object.

        Args:
            xbrl: XBRL object containing parsed financial data
        """
        self.xbrl = xbrl
        # Lazily-computed caches, populated on first access to period_key /
        # period_label via _detect_current_period().
        self._current_period_key = None
        self._current_period_label = None
|
||||
|
||||
    @property
    def period_key(self) -> str:
        """
        Get the current period key (most recent period).

        The current period is determined by:
        1. Document period end date if available
        2. Most recent period in reporting periods
        3. Fallback to any available period

        Returns:
            Period key string (e.g., "instant_2024-12-31" or "duration_2024-01-01_2024-12-31")
        """
        # Detection is cached; repeated property access is cheap.
        if self._current_period_key is None:
            self._current_period_key = self._detect_current_period()
        return self._current_period_key
|
||||
|
||||
    @property
    def period_label(self) -> str:
        """
        Get the human-readable label for the current period.

        Returns:
            Human-readable period label (e.g., "December 31, 2024" or "Year Ended December 31, 2024")
        """
        if self._current_period_label is None:
            self._detect_current_period()  # This sets both key and label
        # NOTE(review): detection can return without setting the label (e.g.
        # when no reporting periods exist), so fall back to the raw key.
        return self._current_period_label or self.period_key
|
||||
|
||||
    def _detect_current_period(self) -> str:
        """
        Detect the current (most recent) period from available data.

        Strategy:
        1. Use document period end date to find matching instant period
        2. If no instant match, find most recent duration period ending on document period end
        3. Fall back to most recent period by end date
        4. Final fallback to first available period

        Returns:
            Period key for the current period (empty string if no reporting
            periods are available)
        """
        if not self.xbrl.reporting_periods:
            log.warning("No reporting periods found in XBRL data")
            # NOTE: this branch does not populate the cached key/label.
            return ""

        # Try to use document period end date if available
        document_period_end = None
        if hasattr(self.xbrl, 'period_of_report') and self.xbrl.period_of_report:
            try:
                if isinstance(self.xbrl.period_of_report, str):
                    document_period_end = datetime.strptime(self.xbrl.period_of_report, '%Y-%m-%d').date()
                elif isinstance(self.xbrl.period_of_report, (date, datetime)):
                    document_period_end = self.xbrl.period_of_report
                    # Normalize datetime to date for comparison with period end dates.
                    if isinstance(document_period_end, datetime):
                        document_period_end = document_period_end.date()
            except (ValueError, TypeError):
                log.debug(f"Could not parse document period end date: {self.xbrl.period_of_report}")

        # Parse each period key into (end_date, key, label) for sorting.
        periods_by_date = []
        for period in self.xbrl.reporting_periods:
            period_key = period['key']
            period_label = period.get('label', period_key)
            end_date = None

            try:
                if period_key.startswith('instant_'):
                    # Format: "instant_2024-12-31"
                    date_str = period_key.split('_', 1)[1]
                    end_date = datetime.strptime(date_str, '%Y-%m-%d').date()
                elif period_key.startswith('duration_'):
                    # Format: "duration_2024-01-01_2024-12-31"
                    parts = period_key.split('_')
                    if len(parts) >= 3:
                        date_str = parts[2]  # End date
                        end_date = datetime.strptime(date_str, '%Y-%m-%d').date()

                if end_date:
                    periods_by_date.append((end_date, period_key, period_label))
            except (ValueError, IndexError):
                log.debug(f"Could not parse period key: {period_key}")
                continue

        if not periods_by_date:
            # Fallback to first available period if no dates could be parsed
            first_period = self.xbrl.reporting_periods[0]
            self._current_period_key = first_period['key']
            self._current_period_label = first_period.get('label', first_period['key'])
            log.debug(f"Using fallback period: {self._current_period_key}")
            return self._current_period_key

        # Sort by date (most recent first)
        periods_by_date.sort(key=lambda x: x[0], reverse=True)

        # Strategy 1: If we have document period end, look for exact matches.
        # Prefer instant periods over duration periods when both match document end date
        if document_period_end:
            instant_match = None
            duration_match = None

            # The scan keeps the LAST match of each kind; with multiple
            # matches on the same end date this is order-dependent on
            # periods_by_date (ties share the same end date).
            for end_date, period_key, period_label in periods_by_date:
                if end_date == document_period_end:
                    if period_key.startswith('instant_'):
                        instant_match = (period_key, period_label)
                    elif period_key.startswith('duration_'):
                        duration_match = (period_key, period_label)

            # Prefer instant match if available
            if instant_match:
                self._current_period_key = instant_match[0]
                self._current_period_label = instant_match[1]
                log.debug(f"Found instant period matching document end date: {instant_match[0]}")
                return self._current_period_key
            elif duration_match:
                self._current_period_key = duration_match[0]
                self._current_period_label = duration_match[1]
                log.debug(f"Found duration period matching document end date: {duration_match[0]}")
                return self._current_period_key

        # Strategy 2: Use most recent period
        most_recent = periods_by_date[0]
        self._current_period_key = most_recent[1]
        self._current_period_label = most_recent[2]

        log.debug(f"Selected most recent period: {self._current_period_key} ({self._current_period_label})")
        return self._current_period_key
|
||||
|
||||
def _get_appropriate_period_for_statement(self, statement_type: str) -> str:
|
||||
"""
|
||||
Get the appropriate period type for the given statement type.
|
||||
|
||||
Balance sheet items are point-in-time (instant periods).
|
||||
Income statement and cash flow items represent activities over time (duration periods).
|
||||
|
||||
Args:
|
||||
statement_type: Type of statement ('BalanceSheet', 'IncomeStatement', etc.)
|
||||
|
||||
Returns:
|
||||
Period key appropriate for the statement type
|
||||
"""
|
||||
# Statements that use instant periods (point in time)
|
||||
instant_statements = {
|
||||
'BalanceSheet',
|
||||
'StatementOfEquity',
|
||||
'StatementOfFinancialPosition'
|
||||
}
|
||||
|
||||
# Statements that use duration periods (period of time)
|
||||
duration_statements = {
|
||||
'IncomeStatement',
|
||||
'CashFlowStatement',
|
||||
'ComprehensiveIncome',
|
||||
'StatementOfOperations',
|
||||
'StatementOfCashFlows'
|
||||
}
|
||||
|
||||
if statement_type in instant_statements:
|
||||
# Use the current instant period
|
||||
return self.period_key
|
||||
elif statement_type in duration_statements:
|
||||
# Find the most recent duration period with the same end date
|
||||
if not self.xbrl.reporting_periods:
|
||||
return self.period_key # Fallback to current period
|
||||
|
||||
# Get the end date from the current period (which might be instant)
|
||||
current_end_date = None
|
||||
current_period_key = self.period_key
|
||||
|
||||
if current_period_key.startswith('instant_'):
|
||||
# Extract date from instant period
|
||||
date_str = current_period_key.split('_', 1)[1]
|
||||
try:
|
||||
from datetime import datetime
|
||||
current_end_date = datetime.strptime(date_str, '%Y-%m-%d').date()
|
||||
except (ValueError, IndexError):
|
||||
return self.period_key # Fallback
|
||||
elif current_period_key.startswith('duration_'):
|
||||
# Extract end date from duration period
|
||||
parts = current_period_key.split('_')
|
||||
if len(parts) >= 3:
|
||||
try:
|
||||
from datetime import datetime
|
||||
current_end_date = datetime.strptime(parts[2], '%Y-%m-%d').date()
|
||||
except (ValueError, IndexError):
|
||||
return self.period_key # Fallback
|
||||
|
||||
if current_end_date:
|
||||
# Look for a duration period ending on the same date
|
||||
# Prefer annual periods, then quarterly, then other durations
|
||||
matching_periods = []
|
||||
|
||||
for period in self.xbrl.reporting_periods:
|
||||
period_key = period['key']
|
||||
if period_key.startswith('duration_'):
|
||||
parts = period_key.split('_')
|
||||
if len(parts) >= 3:
|
||||
try:
|
||||
from datetime import datetime
|
||||
end_date = datetime.strptime(parts[2], '%Y-%m-%d').date()
|
||||
if end_date == current_end_date:
|
||||
period_type = period.get('period_type', '')
|
||||
priority = 1 if period_type == 'Annual' else (2 if period_type == 'Quarterly' else 3)
|
||||
matching_periods.append((priority, period_key, period.get('label', period_key)))
|
||||
except (ValueError, IndexError):
|
||||
continue
|
||||
|
||||
if matching_periods:
|
||||
# Sort by priority (1=Annual, 2=Quarterly, 3=Other) and return the best match
|
||||
matching_periods.sort(key=lambda x: x[0])
|
||||
selected_period = matching_periods[0][1]
|
||||
log.debug(f"Selected duration period for {statement_type}: {selected_period}")
|
||||
return selected_period
|
||||
|
||||
# Fallback: use current period even if it's not ideal
|
||||
return self.period_key
|
||||
else:
|
||||
# Unknown statement type, use current period
|
||||
log.debug(f"Unknown statement type {statement_type}, using current period: {self.period_key}")
|
||||
return self.period_key
|
||||
|
||||
def balance_sheet(self, raw_concepts: bool = False, as_statement: bool = True) -> Union[pd.DataFrame, 'Statement']:
|
||||
"""
|
||||
Get current period balance sheet data.
|
||||
|
||||
Args:
|
||||
raw_concepts: If True, preserve original XBRL concept names
|
||||
(e.g., "us-gaap:Assets" instead of "Assets")
|
||||
as_statement: If True, return a Statement object (default),
|
||||
if False, return DataFrame
|
||||
|
||||
Returns:
|
||||
Statement object with rich formatting by default,
|
||||
or pandas DataFrame if as_statement=False
|
||||
|
||||
Example:
|
||||
>>> stmt = xbrl.current_period.balance_sheet()
|
||||
>>> print(stmt) # Rich formatted table
|
||||
|
||||
>>> df = xbrl.current_period.balance_sheet(as_statement=False)
|
||||
>>> assets = df[df['label'].str.contains('Assets', case=False)]['value'].iloc[0]
|
||||
"""
|
||||
if as_statement:
|
||||
return self._get_statement_object('BalanceSheet')
|
||||
return self._get_statement_dataframe('BalanceSheet', raw_concepts=raw_concepts)
|
||||
|
||||
def income_statement(self, raw_concepts: bool = False, as_statement: bool = True) -> Union[pd.DataFrame, 'Statement']:
|
||||
"""
|
||||
Get current period income statement data.
|
||||
|
||||
Args:
|
||||
raw_concepts: If True, preserve original XBRL concept names
|
||||
(e.g., "us-gaap:Revenues" instead of "Revenue")
|
||||
as_statement: If True, return a Statement object (default),
|
||||
if False, return DataFrame
|
||||
|
||||
Returns:
|
||||
Statement object with rich formatting by default,
|
||||
or pandas DataFrame if as_statement=False
|
||||
|
||||
Example:
|
||||
>>> stmt = xbrl.current_period.income_statement()
|
||||
>>> print(stmt) # Rich formatted table
|
||||
|
||||
>>> df = xbrl.current_period.income_statement(as_statement=False, raw_concepts=True)
|
||||
>>> revenue = df[df['concept'].str.contains('Revenues')]['value'].iloc[0]
|
||||
"""
|
||||
if as_statement:
|
||||
return self._get_statement_object('IncomeStatement')
|
||||
return self._get_statement_dataframe('IncomeStatement', raw_concepts=raw_concepts)
|
||||
|
||||
def cashflow_statement(self, raw_concepts: bool = False, as_statement: bool = True) -> Union[pd.DataFrame, 'Statement']:
|
||||
"""
|
||||
Get current period cash flow statement data.
|
||||
|
||||
Args:
|
||||
raw_concepts: If True, preserve original XBRL concept names
|
||||
(e.g., "us-gaap:NetCashProvidedByUsedInOperatingActivities")
|
||||
as_statement: If True, return a Statement object (default),
|
||||
if False, return DataFrame
|
||||
|
||||
Returns:
|
||||
Statement object with rich formatting by default,
|
||||
or pandas DataFrame if as_statement=False
|
||||
|
||||
Example:
|
||||
>>> stmt = xbrl.current_period.cashflow_statement()
|
||||
>>> print(stmt) # Rich formatted table
|
||||
|
||||
>>> df = xbrl.current_period.cashflow_statement(as_statement=False)
|
||||
>>> operating_cf = df[df['label'].str.contains('Operating')]['value'].iloc[0]
|
||||
"""
|
||||
if as_statement:
|
||||
return self._get_statement_object('CashFlowStatement')
|
||||
return self._get_statement_dataframe('CashFlowStatement', raw_concepts=raw_concepts)
|
||||
|
||||
def statement_of_equity(self, raw_concepts: bool = False, as_statement: bool = True) -> Union[pd.DataFrame, 'Statement']:
|
||||
"""
|
||||
Get current period statement of equity data.
|
||||
|
||||
Args:
|
||||
raw_concepts: If True, preserve original XBRL concept names
|
||||
as_statement: If True, return a Statement object (default),
|
||||
if False, return DataFrame
|
||||
|
||||
Returns:
|
||||
Statement object with rich formatting by default,
|
||||
or pandas DataFrame if as_statement=False
|
||||
"""
|
||||
if as_statement:
|
||||
return self._get_statement_object('StatementOfEquity')
|
||||
return self._get_statement_dataframe('StatementOfEquity', raw_concepts=raw_concepts)
|
||||
|
||||
def comprehensive_income(self, raw_concepts: bool = False, as_statement: bool = True) -> Union[pd.DataFrame, 'Statement']:
|
||||
"""
|
||||
Get current period comprehensive income statement data.
|
||||
|
||||
Args:
|
||||
raw_concepts: If True, preserve original XBRL concept names
|
||||
as_statement: If True, return a Statement object (default),
|
||||
if False, return DataFrame
|
||||
|
||||
Returns:
|
||||
Statement object with rich formatting by default,
|
||||
or pandas DataFrame if as_statement=False
|
||||
"""
|
||||
if as_statement:
|
||||
return self._get_statement_object('ComprehensiveIncome')
|
||||
return self._get_statement_dataframe('ComprehensiveIncome', raw_concepts=raw_concepts)
|
||||
|
||||
def _get_statement_dataframe(self, statement_type: str, raw_concepts: bool = False) -> pd.DataFrame:
    """
    Internal method to get statement data as DataFrame for current period.

    Args:
        statement_type: Type of statement ('BalanceSheet', 'IncomeStatement', etc.)
        raw_concepts: Whether to preserve raw XBRL concept names

    Returns:
        pandas DataFrame with statement data filtered to current period

    Raises:
        StatementNotFound: If the requested statement type is not available
    """
    try:
        # Select appropriate period based on statement type
        period_filter = self._get_appropriate_period_for_statement(statement_type)

        # Get raw statement data filtered to current period
        statement_data = self.xbrl.get_statement(statement_type, period_filter=period_filter)

        if not statement_data:
            entity_name = getattr(self.xbrl, 'entity_name', 'Unknown')
            raise StatementNotFound(
                statement_type=statement_type,
                confidence=0.0,
                found_statements=[],
                entity_name=entity_name,
                reason=f"No data found for {statement_type} in period {self.period_label}"
            )

        # Convert to DataFrame: one row per line item with a value in the period
        rows = []
        for item in statement_data:
            values = item.get('values', {})
            current_value = values.get(period_filter)

            if current_value is not None:
                row = {
                    'concept': self._get_concept_name(item, raw_concepts),
                    'label': item.get('label', ''),
                    'value': current_value,
                    'level': item.get('level', 0),
                    'is_abstract': item.get('is_abstract', False)
                }

                # Add original concept name if raw_concepts is requested
                if raw_concepts:
                    row['standardized_label'] = item.get('label', '')
                    # Try to get original concept names from all_names
                    all_names = item.get('all_names', [])
                    if all_names:
                        row['original_concept'] = all_names[0]  # First is usually original

                # Add dimension information if present
                if item.get('is_dimension', False):
                    row['dimension_label'] = item.get('full_dimension_label', '')
                    row['is_dimension'] = True

                rows.append(row)

        if not rows:
            # Create empty DataFrame with expected structure so callers can
            # rely on column names even when no values exist for the period.
            columns = ['concept', 'label', 'value', 'level', 'is_abstract']
            if raw_concepts:
                columns.extend(['standardized_label', 'original_concept'])
            return pd.DataFrame(columns=columns)

        return pd.DataFrame(rows)

    except StatementNotFound:
        # Bug fix: the generic handler below used to catch the
        # StatementNotFound raised above, log it as an unexpected error and
        # re-raise a new exception with a vaguer "Failed to retrieve" reason.
        # Propagate the specific error unchanged instead.
        raise
    except Exception as e:
        log.error(f"Error retrieving {statement_type} for current period: {str(e)}")
        entity_name = getattr(self.xbrl, 'entity_name', 'Unknown')
        raise StatementNotFound(
            statement_type=statement_type,
            confidence=0.0,
            found_statements=[],
            entity_name=entity_name,
            reason=f"Failed to retrieve {statement_type}: {str(e)}"
        ) from e
|
||||
|
||||
def _get_statement_object(self, statement_type: str) -> 'Statement':
    """
    Internal method to get statement as a Statement object for current period.

    Args:
        statement_type: Type of statement ('BalanceSheet', 'IncomeStatement', etc.)

    Returns:
        Statement object with current period filtering applied

    Raises:
        StatementNotFound: If the requested statement type is not available
    """
    try:
        # Select appropriate period based on statement type
        period_filter = self._get_appropriate_period_for_statement(statement_type)

        # Find the statement using the unified statement finder
        matching_statements, found_role, actual_statement_type = self.xbrl.find_statement(statement_type)

        if not found_role:
            entity_name = getattr(self.xbrl, 'entity_name', 'Unknown')
            raise StatementNotFound(
                statement_type=statement_type,
                confidence=0.0,
                found_statements=[],
                entity_name=entity_name,
                reason=f"No matching {statement_type} found for current period {self.period_label}"
            )

        # Wrap in CurrentPeriodStatement so rendering/data access is
        # restricted to the current period
        statement = CurrentPeriodStatement(
            self.xbrl,
            found_role,
            canonical_type=statement_type,
            period_filter=period_filter,
            period_label=self.period_label
        )

        return statement

    except StatementNotFound:
        # Bug fix: previously the generic handler below swallowed the
        # StatementNotFound raised above and re-raised a new one with a less
        # informative "Failed to retrieve" reason. Re-raise it unchanged.
        raise
    except Exception as e:
        log.error(f"Error retrieving {statement_type} statement object for current period: {str(e)}")
        entity_name = getattr(self.xbrl, 'entity_name', 'Unknown')
        raise StatementNotFound(
            statement_type=statement_type,
            confidence=0.0,
            found_statements=[],
            entity_name=entity_name,
            reason=f"Failed to retrieve {statement_type} statement: {str(e)}"
        ) from e
|
||||
|
||||
def _get_concept_name(self, item: Dict[str, Any], raw_concepts: bool) -> str:
    """
    Get the appropriate concept name based on raw_concepts flag.

    Args:
        item: Statement line item dictionary
        raw_concepts: Whether to use raw XBRL concept names

    Returns:
        Concept name (raw or processed)
    """
    if not raw_concepts:
        # Processed concept name is used as-is.
        return item.get('concept', '')

    names = item.get('all_names', [])
    if not names:
        # No original names recorded — fall back to the processed concept.
        return item.get('concept', '')

    candidate = names[0]
    # Normalized names use '_' where the raw XBRL form used ':'; restore the
    # colon only for well-known taxonomy prefixes.
    if '_' in candidate and ':' not in candidate:
        prefix, _sep, local = candidate.partition('_')
        if prefix in ('us-gaap', 'dei', 'srt'):
            return f"{prefix}:{local}"
    return candidate
|
||||
|
||||
def notes(self, section_name: Optional[str] = None) -> List[Dict[str, Any]]:
    """
    Get notes to financial statements for the current period.

    Args:
        section_name: Optional specific note section to retrieve
            (e.g., "inventory", "revenue recognition")

    Returns:
        List of note sections with their content

    Note:
        This is a placeholder implementation. Full notes access would require
        additional development to parse and structure note content.
    """
    note_statements = []
    wanted = section_name.lower() if section_name else None

    # Get all statements and filter for notes
    for stmt in self.xbrl.get_all_statements():
        stmt_type = (stmt.get('type') or '').lower()
        definition = (stmt.get('definition') or '').lower()

        # Check if this looks like a note section
        if not ('note' in stmt_type or 'note' in definition or
                'disclosure' in stmt_type or 'disclosure' in definition):
            continue

        # If a specific section was requested, skip non-matching sections
        if wanted is not None and wanted not in definition and wanted not in stmt_type:
            continue

        # Fix: the original duplicated this dict literal verbatim in both the
        # filtered and unfiltered branches; build it exactly once instead.
        note_statements.append({
            'section_name': stmt.get('definition', 'Untitled Note'),
            'type': stmt.get('type', ''),
            'role': stmt.get('role', ''),
            'element_count': stmt.get('element_count', 0)
        })

    return note_statements
|
||||
|
||||
def get_fact(self, concept: str, raw_concept: bool = False) -> Any:
    """
    Get a specific fact value for the current period.

    Args:
        concept: XBRL concept name to look up
        raw_concept: If True, treat concept as raw XBRL name (with colons)

    Returns:
        Fact value if found, None otherwise

    Example:
        >>> revenue = xbrl.current_period.get_fact('Revenues')
        >>> revenue_raw = xbrl.current_period.get_fact('us-gaap:Revenues', raw_concept=True)
    """
    try:
        # Raw XBRL names use ':' while internal lookup keys use '_'.
        if raw_concept and ':' in concept:
            concept = concept.replace(':', '_')

        # Look up facts restricted to the current period
        matches = self.xbrl._find_facts_for_element(concept, period_filter=self.period_key)
        if not matches:
            return None

        # Use the first matching fact, preferring its numeric value.
        wrapped_fact = next(iter(matches.values()))
        fact = wrapped_fact['fact']
        return fact.value if fact.numeric_value is None else fact.numeric_value
    except Exception as e:
        # Best-effort lookup: failures are logged at debug level, not raised.
        log.debug(f"Error retrieving fact {concept}: {str(e)}")
        return None
|
||||
|
||||
def to_dict(self) -> Dict[str, Any]:
    """
    Convert current period data to a dictionary format.

    Returns:
        Dictionary with current period information and key financial data
    """
    statements: Dict[str, Any] = {}

    # Collect the key statements; an unavailable statement is recorded as
    # None, while an empty one simply leaves its key absent.
    for stmt_type in ['BalanceSheet', 'IncomeStatement', 'CashFlowStatement']:
        try:
            frame = self._get_statement_dataframe(stmt_type, raw_concepts=False)
            if not frame.empty:
                # Convert DataFrame to list of dicts for JSON serialization
                statements[stmt_type] = frame.to_dict('records')
        except StatementNotFound:
            statements[stmt_type] = None

    return {
        'period_key': self.period_key,
        'period_label': self.period_label,
        'entity_name': getattr(self.xbrl, 'entity_name', None),
        'document_type': getattr(self.xbrl, 'document_type', None),
        'statements': statements,
    }
|
||||
|
||||
def debug_info(self) -> Dict[str, Any]:
    """
    Get debugging information about the current period and data availability.

    Returns:
        Dictionary with detailed debugging information: the current period
        key/label, entity metadata, every reporting period found in the
        filing, and per-statement availability details.
    """
    info = {
        'current_period_key': self.period_key,
        'current_period_label': self.period_label,
        'total_reporting_periods': len(self.xbrl.reporting_periods),
        'entity_name': getattr(self.xbrl, 'entity_name', 'Unknown'),
        'document_period_end': getattr(self.xbrl, 'period_of_report', None),
        'periods': [],
        'statements': {}
    }

    # Add all periods with basic info
    for period in self.xbrl.reporting_periods:
        period_info = {
            'key': period['key'],
            'label': period.get('label', 'No label'),
            # Period keys encode their type: 'instant_...' vs duration keys
            'type': 'instant' if 'instant_' in period['key'] else 'duration'
        }
        info['periods'].append(period_info)

    # Check statement availability
    statement_types = ['BalanceSheet', 'IncomeStatement', 'CashFlowStatement']
    for stmt_type in statement_types:
        try:
            # Get the period that would be used for this statement
            period_for_stmt = self._get_appropriate_period_for_statement(stmt_type)

            # Get raw statement data
            raw_data = self.xbrl.get_statement(stmt_type, period_filter=period_for_stmt)

            if raw_data:
                # Count items with values for the selected period —
                # a statement is "available" only if at least one item has one
                items_with_values = sum(1 for item in raw_data
                                        if period_for_stmt in item.get('values', {}))

                info['statements'][stmt_type] = {
                    'period_used': period_for_stmt,
                    'raw_data_items': len(raw_data),
                    'items_with_values': items_with_values,
                    'available': items_with_values > 0,
                    'error': None
                }
            else:
                info['statements'][stmt_type] = {
                    'period_used': period_for_stmt,
                    'raw_data_items': 0,
                    'items_with_values': 0,
                    'available': False,
                    'error': 'No raw data returned'
                }

        except Exception as e:
            # Availability probing is best-effort: any failure is recorded in
            # the result rather than raised, so debug_info itself never throws.
            info['statements'][stmt_type] = {
                'period_used': None,
                'raw_data_items': 0,
                'items_with_values': 0,
                'available': False,
                'error': str(e)
            }

    return info
|
||||
|
||||
def __repr__(self) -> str:
    """String representation showing current period info."""
    name = getattr(self.xbrl, 'entity_name', 'Unknown Entity')
    return "CurrentPeriodView(entity='{}', period='{}')".format(name, self.period_label)
|
||||
|
||||
def __str__(self) -> str:
    """User-friendly string representation."""
    name = getattr(self.xbrl, 'entity_name', 'Unknown Entity')
    return "Current Period Data for {}\nPeriod: {}".format(name, self.period_label)
|
||||
|
||||
|
||||
class CurrentPeriodStatement:
    """
    A Statement object that applies current period filtering.

    This class wraps a regular Statement object and ensures that only
    the current period data is shown when rendering or accessing data.
    """

    def __init__(self, xbrl, role_or_type: str, canonical_type: Optional[str] = None,
                 period_filter: Optional[str] = None, period_label: Optional[str] = None):
        """
        Initialize with period filtering.

        Args:
            xbrl: XBRL object containing parsed data
            role_or_type: Role URI, statement type, or statement short name
            canonical_type: Optional canonical statement type
            period_filter: Period key to filter to
            period_label: Human-readable period label
        """
        self.xbrl = xbrl
        self.role_or_type = role_or_type
        self.canonical_type = canonical_type
        self.period_filter = period_filter
        self.period_label = period_label

        # Create the underlying Statement object (imported locally to avoid
        # a circular import with edgar.xbrl.statements)
        from edgar.xbrl.statements import Statement
        self._statement = Statement(xbrl, role_or_type, canonical_type, skip_concept_check=True)

    def render(self, standard: bool = True, show_date_range: bool = False,
               include_dimensions: bool = True) -> Any:
        """
        Render the statement as a formatted table for current period only.

        Args:
            standard: Whether to use standardized concept labels
            show_date_range: Whether to show full date ranges for duration periods
            include_dimensions: Whether to include dimensional segment data

        Returns:
            Rich Table containing the rendered statement for current period
        """
        # Use the canonical type for rendering if available, otherwise use the role
        rendering_type = self.canonical_type if self.canonical_type else self.role_or_type

        return self.xbrl.render_statement(
            rendering_type,
            period_filter=self.period_filter,
            standard=standard,
            show_date_range=show_date_range,
            include_dimensions=include_dimensions
        )

    def get_raw_data(self) -> List[Dict[str, Any]]:
        """
        Get the raw statement data filtered to current period.

        Returns:
            List of line items with values for current period only
        """
        return self._statement.get_raw_data(period_filter=self.period_filter)

    def get_dataframe(self, raw_concepts: bool = False) -> pd.DataFrame:
        """
        Convert the statement to a DataFrame for current period.

        Args:
            raw_concepts: If True, preserve original XBRL concept names

        Returns:
            pandas DataFrame with current period data only
        """
        # Get raw data for current period
        raw_data = self.get_raw_data()

        # Convert to DataFrame format similar to CurrentPeriodView
        rows = []
        for item in raw_data:
            values = item.get('values', {})
            current_value = values.get(self.period_filter)

            if current_value is not None:
                concept_name = item.get('concept', '')
                if raw_concepts:
                    # Try to get original concept name; normalized names use
                    # '_' where the raw XBRL form used ':'
                    all_names = item.get('all_names', [])
                    if all_names:
                        original = all_names[0]
                        if '_' in original and ':' not in original:
                            parts = original.split('_', 1)
                            if len(parts) == 2 and parts[0] in ['us-gaap', 'dei', 'srt']:
                                concept_name = f"{parts[0]}:{parts[1]}"
                            else:
                                concept_name = original
                        else:
                            concept_name = original

                row = {
                    'concept': concept_name,
                    'label': item.get('label', ''),
                    'value': current_value,
                    'level': item.get('level', 0),
                    'is_abstract': item.get('is_abstract', False)
                }

                # Add original concept name if raw_concepts is requested
                if raw_concepts:
                    row['standardized_label'] = item.get('label', '')
                    all_names = item.get('all_names', [])
                    if all_names:
                        row['original_concept'] = all_names[0]

                # Add dimension information if present
                if item.get('is_dimension', False):
                    row['dimension_label'] = item.get('full_dimension_label', '')
                    row['is_dimension'] = True

                rows.append(row)

        if not rows:
            # Consistency fix: previously this returned a column-less
            # DataFrame when no rows matched, unlike the sibling
            # _get_statement_dataframe helper. Return an empty frame with the
            # expected column structure instead.
            columns = ['concept', 'label', 'value', 'level', 'is_abstract']
            if raw_concepts:
                columns.extend(['standardized_label', 'original_concept'])
            return pd.DataFrame(columns=columns)

        return pd.DataFrame(rows)

    def calculate_ratios(self) -> Dict[str, float]:
        """Calculate common financial ratios for this statement."""
        return self._statement.calculate_ratios()

    def __rich__(self) -> Any:
        """Rich console representation."""
        return self.render()

    def __repr__(self) -> str:
        """String representation."""
        return repr_rich(self.__rich__())

    def __str__(self) -> str:
        """User-friendly string representation."""
        return repr(self)
|
||||
@@ -0,0 +1,232 @@
|
||||
"""
|
||||
Revenue Deduplication Strategy for Issue #438
|
||||
|
||||
This module implements intelligent deduplication for revenue concepts
|
||||
that may have the same underlying value but different GAAP concept names.
|
||||
|
||||
The strategy:
|
||||
1. Identify groups of items with the same value in the same period
|
||||
2. Apply hierarchical precedence rules to choose the most appropriate concept
|
||||
3. Filter out less specific concepts when duplicates exist
|
||||
|
||||
Revenue Concept Hierarchy (most to least preferred):
|
||||
1. us-gaap:RevenueFromContractWithCustomerExcludingAssessedTax (most specific - ASC 606)
|
||||
2. us-gaap:Revenues (standard general concept)
|
||||
3. us-gaap:SalesRevenueNet (less common)
|
||||
4. us-gaap:Revenue (least specific)
|
||||
"""
|
||||
|
||||
import logging
from collections import defaultdict
from typing import Any, Dict, List, Set

log = logging.getLogger(__name__)


class RevenueDeduplicator:
    """
    Handles deduplication of revenue concepts in financial statements.
    """

    # Revenue concept precedence (higher number = higher precedence)
    REVENUE_CONCEPT_PRECEDENCE = {
        'us-gaap:RevenueFromContractWithCustomerExcludingAssessedTax': 100,  # Most specific (ASC 606)
        'us-gaap:Revenues': 90,  # Standard concept
        'us-gaap:SalesRevenueNet': 80,  # Alternative concept
        'us-gaap:Revenue': 70,  # Generic concept
        'us-gaap:TotalRevenuesAndGains': 60,  # Broader concept
    }

    # Additional revenue-related concepts that might cause duplicates
    REVENUE_RELATED_CONCEPTS = {
        'RevenueFromContractWithCustomerExcludingAssessedTax',
        'Revenues',
        'Revenue',
        'SalesRevenueNet',
        'TotalRevenuesAndGains',
        'RevenueFromContractWithCustomer',
        'TotalRevenues'
    }

    @classmethod
    def deduplicate_statement_items(cls, statement_items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """
        Remove duplicate revenue concepts from statement items.

        Args:
            statement_items: List of statement line items

        Returns:
            Filtered list with duplicates removed
        """
        if not statement_items:
            return statement_items

        # Cluster items by (period, value) — identical values in the same
        # period are the duplicate candidates.
        buckets = cls._group_by_period_value(statement_items)

        # For each cluster of revenue items sharing a value, mark every item
        # except the highest-precedence one for removal.
        drop_indices: Set[int] = set()
        for entries in buckets.values():
            if len(entries) > 1 and cls._are_revenue_duplicates(entries):
                drop_indices.update(cls._select_duplicates_to_remove(entries))

        # Rebuild the list, logging each dropped duplicate.
        result = []
        for idx, entry in enumerate(statement_items):
            if idx in drop_indices:
                log.debug("Removed duplicate revenue item: %s = %s", entry.get('label', 'Unknown'), entry.get('values', {}))
            else:
                result.append(entry)

        removed_count = len(statement_items) - len(result)
        if removed_count > 0:
            log.info("Revenue deduplication: removed %d duplicate items", removed_count)

        return result

    @classmethod
    def _group_by_period_value(cls, statement_items: List[Dict[str, Any]]) -> Dict[tuple, List[tuple]]:
        """
        Group statement items by (period, value) pairs.

        Returns:
            Dict mapping (period, value) to list of (index, item) tuples
        """
        buckets: Dict[tuple, List[tuple]] = defaultdict(list)
        for idx, entry in enumerate(statement_items):
            for period, amount in entry.get('values', {}).items():
                # Zero and missing amounts cannot be meaningful duplicates.
                if amount is not None and amount != 0:
                    buckets[(period, amount)].append((idx, entry))
        return buckets

    @classmethod
    def _are_revenue_duplicates(cls, indexed_items: List[tuple]) -> bool:
        """
        Check if a group of items are revenue duplicates.

        Args:
            indexed_items: List of (index, item) tuples

        Returns:
            True if these items are revenue duplicates
        """
        # Two or more revenue concepts sharing a value are potential duplicates.
        matches = sum(1 for _, entry in indexed_items if cls._is_revenue_concept(entry))
        return matches > 1

    @classmethod
    def _is_revenue_concept(cls, item: Dict[str, Any]) -> bool:
        """
        Check if an item represents a revenue concept.
        """
        concept = item.get('concept', '')
        all_names = item.get('all_names', [])
        label = item.get('label', '').lower()

        # Exclusions first: cost/expense style items are never revenue even
        # when their names mention it.
        exclusion_terms = ['cost', 'expense', 'loss', 'depreciation', 'amortization']
        candidates = [concept] + all_names + [label]
        if any(excl in name.lower() for name in candidates for excl in exclusion_terms):
            return False

        # Known revenue concept names in the concept or its aliases.
        if any(term in name for name in [concept] + all_names for term in cls.REVENUE_RELATED_CONCEPTS):
            return True

        # Fall back to the label text (cost-like labels already excluded above).
        if any(term in label for term in ['revenue', 'sales']) and not any(excl in label for excl in exclusion_terms):
            return True

        return False

    @classmethod
    def _select_duplicates_to_remove(cls, indexed_items: List[tuple]) -> Set[int]:
        """
        Select which items to remove from a duplicate group.

        Args:
            indexed_items: List of (index, item) tuples

        Returns:
            Set of indices to remove
        """
        if len(indexed_items) <= 1:
            return set()

        # Rank by precedence score (index breaks ties deterministically).
        ranked = sorted(
            ((cls._get_precedence_score(entry), idx) for idx, entry in indexed_items),
            reverse=True,
        )

        # Keep the top-ranked item; everything else is a duplicate.
        return {idx for _score, idx in ranked[1:]}

    @classmethod
    def _get_precedence_score(cls, item: Dict[str, Any]) -> int:
        """
        Get the precedence score for a revenue concept.

        Higher scores are preferred and will be kept.
        """
        names = [item.get('concept', '')] + item.get('all_names', [])

        # Exact matches in the precedence table win outright.
        for name in names:
            exact = cls.REVENUE_CONCEPT_PRECEDENCE.get(name)
            if exact is not None:
                return exact

        # Otherwise try partial matches (handles namespace-prefix variants).
        for name in names:
            for canonical, score in cls.REVENUE_CONCEPT_PRECEDENCE.items():
                if canonical.split(':')[-1] in name:
                    return score

        # Default score for unrecognized revenue concepts
        return 50

    @classmethod
    def get_deduplication_stats(cls, original_items: List[Dict[str, Any]],
                                deduplicated_items: List[Dict[str, Any]]) -> Dict[str, Any]:
        """
        Generate statistics about the deduplication process.
        """
        original_count = len(original_items)
        deduplicated_count = len(deduplicated_items)
        removed_count = original_count - deduplicated_count

        # Count revenue items before and after
        original_revenue_count = sum(1 for item in original_items if cls._is_revenue_concept(item))
        deduplicated_revenue_count = sum(1 for item in deduplicated_items if cls._is_revenue_concept(item))

        return {
            'original_total_items': original_count,
            'deduplicated_total_items': deduplicated_count,
            'removed_items': removed_count,
            'original_revenue_items': original_revenue_count,
            'deduplicated_revenue_items': deduplicated_revenue_count,
            'removed_revenue_items': original_revenue_count - deduplicated_revenue_count,
            'deduplication_performed': removed_count > 0
        }
|
||||
567
venv/lib/python3.10/site-packages/edgar/xbrl/docs/Statement.md
Normal file
567
venv/lib/python3.10/site-packages/edgar/xbrl/docs/Statement.md
Normal file
@@ -0,0 +1,567 @@
|
||||
# Statement Class Documentation
|
||||
|
||||
## Overview
|
||||
|
||||
The `Statement` class represents a single financial statement extracted from XBRL data. It provides methods for viewing, manipulating, and analyzing financial statement data including income statements, balance sheets, cash flow statements, and disclosure notes.
|
||||
|
||||
A Statement object contains:
|
||||
- **Line items** with values across multiple periods
|
||||
- **Hierarchy** showing the structure and relationships
|
||||
- **Metadata** including concept names and labels
|
||||
- **Period information** for time-series analysis
|
||||
|
||||
## Getting a Statement
|
||||
|
||||
### From XBRL
|
||||
|
||||
```python
|
||||
# Get XBRL data first
|
||||
xbrl = filing.xbrl()
|
||||
|
||||
# Access specific statements
|
||||
income = xbrl.statements.income_statement()
|
||||
balance = xbrl.statements.balance_sheet()
|
||||
cashflow = xbrl.statements.cash_flow_statement()
|
||||
equity = xbrl.statements.statement_of_equity()
|
||||
|
||||
# By name
|
||||
cover_page = xbrl.statements['CoverPage']
|
||||
|
||||
# By index
|
||||
first_statement = xbrl.statements[0]
|
||||
```
|
||||
|
||||
## Viewing Statements
|
||||
|
||||
### Rich Display
|
||||
|
||||
```python
|
||||
# Print statement to see formatted table
|
||||
print(income)
|
||||
|
||||
# Shows:
|
||||
# - Statement title
|
||||
# - Line items with hierarchical structure
|
||||
# - Values for multiple periods
|
||||
# - Proper number formatting
|
||||
```
|
||||
|
||||
### Text Representation
|
||||
|
||||
```python
|
||||
# Get plain text version
|
||||
text = str(income)
|
||||
|
||||
# Or explicitly
|
||||
text_output = income.text()
|
||||
```
|
||||
|
||||
## Converting to DataFrame
|
||||
|
||||
### Basic Conversion
|
||||
|
||||
```python
|
||||
# Convert statement to pandas DataFrame
|
||||
df = income.to_dataframe()
|
||||
|
||||
# DataFrame structure:
|
||||
# - Index: Line item labels or concepts
|
||||
# - Columns: Period dates
|
||||
# - Values: Financial amounts
|
||||
```
|
||||
|
||||
### With Period Filter
|
||||
|
||||
```python
|
||||
# Filter to specific periods
|
||||
df = income.to_dataframe(period_filter='2024')
|
||||
|
||||
# Only includes periods matching the filter
|
||||
```
|
||||
|
||||
### Accessing Specific Data
|
||||
|
||||
```python
|
||||
# Convert to DataFrame for easy analysis
|
||||
df = income.to_dataframe()
|
||||
|
||||
# Access specific line items
|
||||
revenue = df.loc['Revenue']
|
||||
net_income = df.loc['Net Income']
|
||||
|
||||
# Access specific periods
|
||||
current_period = df.iloc[:, 0] # First column (most recent)
|
||||
prior_period = df.iloc[:, 1] # Second column
|
||||
|
||||
# Specific cell
|
||||
current_revenue = df.loc['Revenue', df.columns[0]]
|
||||
```
|
||||
|
||||
## Statement Properties
|
||||
|
||||
### Available Periods
|
||||
|
||||
```python
|
||||
# Get list of periods in the statement
|
||||
periods = statement.periods
|
||||
|
||||
# Each period is a date string (YYYY-MM-DD)
|
||||
for period in periods:
|
||||
print(f"Data available for: {period}")
|
||||
```
|
||||
|
||||
### Statement Name and Type
|
||||
|
||||
```python
|
||||
# Get statement information
|
||||
name = statement.name # Statement display name
|
||||
concept = statement.concept # XBRL concept identifier
|
||||
```
|
||||
|
||||
### Raw Data Access
|
||||
|
||||
```python
|
||||
# Get underlying statement data structure
|
||||
raw_data = statement.get_raw_data()
|
||||
|
||||
# Returns list of dictionaries with:
|
||||
# - concept: XBRL concept name
|
||||
# - label: Display label
|
||||
# - values: Dict of period -> value
|
||||
# - level: Hierarchy depth
|
||||
# - all_names: All concept variations
|
||||
```
|
||||
|
||||
## Rendering and Display
|
||||
|
||||
### Custom Rendering
|
||||
|
||||
```python
|
||||
# Render with specific options
|
||||
rendered = statement.render()
|
||||
|
||||
# Rendered statement has rich formatting
|
||||
print(rendered)
|
||||
```
|
||||
|
||||
### Text Export
|
||||
|
||||
```python
|
||||
# Get markdown-formatted text
|
||||
markdown_text = statement.text()
|
||||
|
||||
# Suitable for:
|
||||
# - AI/LLM consumption
|
||||
# - Documentation
|
||||
# - Text-based analysis
|
||||
```
|
||||
|
||||
## Working with Statement Data
|
||||
|
||||
### Calculate Growth Rates
|
||||
|
||||
```python
|
||||
# Convert to DataFrame
|
||||
df = income.to_dataframe()
|
||||
|
||||
# Calculate period-over-period growth
|
||||
if len(df.columns) >= 2:
|
||||
current = df.iloc[:, 0]
|
||||
prior = df.iloc[:, 1]
|
||||
|
||||
# Growth rate
|
||||
growth = ((current - prior) / prior * 100).round(2)
|
||||
|
||||
# Create comparison DataFrame
|
||||
comparison = pd.DataFrame({
|
||||
'Current': current,
|
||||
'Prior': prior,
|
||||
'Growth %': growth
|
||||
})
|
||||
|
||||
print(comparison)
|
||||
```
|
||||
|
||||
### Extract Specific Metrics
|
||||
|
||||
```python
|
||||
# Get income statement metrics
|
||||
df = income.to_dataframe()
|
||||
|
||||
# Extract key metrics from most recent period
|
||||
current = df.iloc[:, 0]
|
||||
|
||||
metrics = {
|
||||
'Revenue': current.get('Revenue', 0),
|
||||
'Operating Income': current.get('Operating Income', 0),
|
||||
'Net Income': current.get('Net Income', 0),
|
||||
}
|
||||
|
||||
# Calculate derived metrics
|
||||
if metrics['Revenue'] > 0:
|
||||
metrics['Operating Margin'] = (
|
||||
metrics['Operating Income'] / metrics['Revenue'] * 100
|
||||
)
|
||||
metrics['Net Margin'] = (
|
||||
metrics['Net Income'] / metrics['Revenue'] * 100
|
||||
)
|
||||
```
|
||||
|
||||
### Filter Line Items
|
||||
|
||||
```python
|
||||
# Convert to DataFrame
|
||||
df = balance.to_dataframe()
|
||||
|
||||
# Filter for specific items
|
||||
asset_items = df[df.index.str.contains('Asset', case=False)]
|
||||
liability_items = df[df.index.str.contains('Liabilit', case=False)]
|
||||
|
||||
# Get subtotals
|
||||
if 'Current Assets' in df.index:
|
||||
current_assets = df.loc['Current Assets']
|
||||
```
|
||||
|
||||
### Time Series Analysis
|
||||
|
||||
```python
|
||||
# Get multiple periods
|
||||
df = income.to_dataframe()
|
||||
|
||||
# Plot revenue trend
|
||||
if 'Revenue' in df.index:
|
||||
revenue_series = df.loc['Revenue']
|
||||
|
||||
# Convert to numeric and plot
|
||||
import matplotlib.pyplot as plt
|
||||
revenue_series.plot(kind='line', title='Revenue Trend')
|
||||
plt.show()
|
||||
```
|
||||
|
||||
## Common Workflows
|
||||
|
||||
### Compare Current vs Prior Period
|
||||
|
||||
```python
|
||||
# Get income statement
|
||||
income = xbrl.statements.income_statement()
|
||||
df = income.to_dataframe()
|
||||
|
||||
# Ensure we have at least 2 periods
|
||||
if len(df.columns) >= 2:
|
||||
# Create comparison
|
||||
comparison = pd.DataFrame({
|
||||
'Current': df.iloc[:, 0],
|
||||
'Prior': df.iloc[:, 1],
|
||||
'Change': df.iloc[:, 0] - df.iloc[:, 1],
|
||||
'Change %': ((df.iloc[:, 0] - df.iloc[:, 1]) / df.iloc[:, 1] * 100).round(2)
|
||||
})
|
||||
|
||||
# Show key metrics
|
||||
key_items = ['Revenue', 'Operating Income', 'Net Income']
|
||||
for item in key_items:
|
||||
if item in comparison.index:
|
||||
print(f"\n{item}:")
|
||||
print(comparison.loc[item])
|
||||
```
|
||||
|
||||
### Extract All Periods to CSV
|
||||
|
||||
```python
|
||||
# Get statement
|
||||
statement = xbrl.statements.income_statement()
|
||||
|
||||
# Convert and save
|
||||
df = statement.to_dataframe()
|
||||
df.to_csv('income_statement.csv')
|
||||
|
||||
print(f"Exported {len(df)} line items across {len(df.columns)} periods")
|
||||
```
|
||||
|
||||
### Build Financial Ratios
|
||||
|
||||
```python
|
||||
# Get both income statement and balance sheet
|
||||
income = xbrl.statements.income_statement()
|
||||
balance = xbrl.statements.balance_sheet()
|
||||
|
||||
# Convert to DataFrames
|
||||
income_df = income.to_dataframe()
|
||||
balance_df = balance.to_dataframe()
|
||||
|
||||
# Extract values (most recent period)
|
||||
revenue = income_df.loc['Revenue', income_df.columns[0]]
|
||||
net_income = income_df.loc['Net Income', income_df.columns[0]]
|
||||
total_assets = balance_df.loc['Assets', balance_df.columns[0]]
|
||||
total_equity = balance_df.loc['Equity', balance_df.columns[0]]
|
||||
|
||||
# Calculate ratios
|
||||
ratios = {
|
||||
'Net Profit Margin': (net_income / revenue * 100).round(2),
|
||||
'ROA': (net_income / total_assets * 100).round(2),
|
||||
'ROE': (net_income / total_equity * 100).round(2),
|
||||
'Asset Turnover': (revenue / total_assets).round(2),
|
||||
}
|
||||
|
||||
print("Financial Ratios:")
|
||||
for ratio, value in ratios.items():
|
||||
print(f" {ratio}: {value}")
|
||||
```
|
||||
|
||||
### Search for Specific Items
|
||||
|
||||
```python
|
||||
# Get statement as DataFrame
|
||||
df = income.to_dataframe()
|
||||
|
||||
# Search for items containing keywords
|
||||
research_costs = df[df.index.str.contains('Research', case=False)]
|
||||
tax_items = df[df.index.str.contains('Tax', case=False)]
|
||||
|
||||
# Or get raw data with concept names
|
||||
raw = income.get_raw_data()
|
||||
research_concepts = [
|
||||
item for item in raw
|
||||
if 'research' in item['label'].lower()
|
||||
]
|
||||
```
|
||||
|
||||
### Aggregate Subcategories
|
||||
|
||||
```python
|
||||
# Get statement
|
||||
df = balance.to_dataframe()
|
||||
|
||||
# Define categories (adjust based on actual labels)
|
||||
current_asset_categories = [
|
||||
'Cash and Cash Equivalents',
|
||||
'Accounts Receivable',
|
||||
'Inventory',
|
||||
'Other Current Assets'
|
||||
]
|
||||
|
||||
# Sum categories
|
||||
current_assets_sum = sum([
|
||||
df.loc[cat, df.columns[0]]
|
||||
for cat in current_asset_categories
|
||||
if cat in df.index
|
||||
])
|
||||
|
||||
# Verify against reported total
|
||||
if 'Current Assets' in df.index:
|
||||
reported_total = df.loc['Current Assets', df.columns[0]]
|
||||
print(f"Calculated: {current_assets_sum}")
|
||||
print(f"Reported: {reported_total}")
|
||||
print(f"Difference: {current_assets_sum - reported_total}")
|
||||
```
|
||||
|
||||
## Integration with Analysis Tools
|
||||
|
||||
### With Pandas
|
||||
|
||||
```python
|
||||
# Statement integrates seamlessly with pandas
|
||||
df = statement.to_dataframe()
|
||||
|
||||
# Use all pandas functionality
|
||||
summary = df.describe()
|
||||
correlations = df.T.corr()
|
||||
rolling_avg = df.T.rolling(window=4).mean()
|
||||
```
|
||||
|
||||
### With NumPy
|
||||
|
||||
```python
|
||||
import numpy as np
|
||||
|
||||
# Convert to numpy array for numerical operations
|
||||
df = statement.to_dataframe()
|
||||
values = df.values
|
||||
|
||||
# Numerical analysis
|
||||
mean_values = np.mean(values, axis=1)
|
||||
std_values = np.std(values, axis=1)
|
||||
growth_rates = np.diff(values, axis=1) / values[:, :-1]
|
||||
```
|
||||
|
||||
### Export for Visualization
|
||||
|
||||
```python
|
||||
# Prepare data for plotting
|
||||
df = income.to_dataframe()
|
||||
|
||||
# Select key items
|
||||
plot_items = ['Revenue', 'Operating Income', 'Net Income']
|
||||
plot_data = df.loc[plot_items].T
|
||||
|
||||
# Plot with matplotlib
|
||||
import matplotlib.pyplot as plt
|
||||
plot_data.plot(kind='bar', figsize=(12, 6))
|
||||
plt.title('Income Statement Trends')
|
||||
plt.xlabel('Period')
|
||||
plt.ylabel('Amount (USD)')
|
||||
plt.xticks(rotation=45)
|
||||
plt.tight_layout()
|
||||
plt.show()
|
||||
```
|
||||
|
||||
## Error Handling
|
||||
|
||||
### Missing Line Items
|
||||
|
||||
```python
|
||||
# Check if item exists before accessing
|
||||
df = statement.to_dataframe()
|
||||
|
||||
if 'Revenue' in df.index:
|
||||
revenue = df.loc['Revenue']
|
||||
else:
|
||||
print("Revenue not found in statement")
|
||||
# Try alternative names
|
||||
for alt in ['Revenues', 'Total Revenue', 'Net Revenue']:
|
||||
if alt in df.index:
|
||||
revenue = df.loc[alt]
|
||||
break
|
||||
```
|
||||
|
||||
### Handling Different Formats
|
||||
|
||||
```python
|
||||
# Companies may use different labels
|
||||
def find_item(df, possible_names):
|
||||
"""Find item by trying multiple possible names."""
|
||||
for name in possible_names:
|
||||
if name in df.index:
|
||||
return df.loc[name]
|
||||
return None
|
||||
|
||||
# Usage
|
||||
revenue_names = ['Revenue', 'Revenues', 'Total Revenue', 'Net Sales']
|
||||
revenue = find_item(df, revenue_names)
|
||||
|
||||
if revenue is not None:
|
||||
print(f"Found revenue: {revenue}")
|
||||
else:
|
||||
print("Revenue not found under common names")
|
||||
```
|
||||
|
||||
### Incomplete Period Data
|
||||
|
||||
```python
|
||||
# Check data availability
|
||||
df = statement.to_dataframe()
|
||||
|
||||
# Check for null values
|
||||
missing_data = df.isnull().sum()
|
||||
if missing_data.any():
|
||||
print("Periods with missing data:")
|
||||
print(missing_data[missing_data > 0])
|
||||
|
||||
# Fill missing with 0 or forward fill
|
||||
df_filled = df.fillna(0) # Replace NaN with 0
|
||||
# or
|
||||
df_filled = df.ffill()  # Forward fill (fillna(method='ffill') is deprecated in pandas 2.x)
|
||||
```
|
||||
|
||||
## Best Practices
|
||||
|
||||
1. **Always convert to DataFrame for analysis**:
|
||||
```python
|
||||
df = statement.to_dataframe() # Easier to work with
|
||||
```
|
||||
|
||||
2. **Check item names before accessing**:
|
||||
```python
|
||||
if 'Revenue' in df.index:
|
||||
revenue = df.loc['Revenue']
|
||||
```
|
||||
|
||||
3. **Handle multiple naming conventions**:
|
||||
```python
|
||||
# Try variations
|
||||
for name in ['Revenue', 'Revenues', 'Total Revenue']:
|
||||
if name in df.index:
|
||||
revenue = df.loc[name]
|
||||
break
|
||||
```
|
||||
|
||||
4. **Validate calculated values**:
|
||||
```python
|
||||
# Check against reported totals
|
||||
calculated = sum(components)
|
||||
reported = df.loc['Total']
|
||||
assert abs(calculated - reported) < 0.01, "Mismatch!"
|
||||
```
|
||||
|
||||
5. **Use period filters appropriately**:
|
||||
```python
|
||||
# Filter to specific years
|
||||
df_2024 = statement.to_dataframe(period_filter='2024')
|
||||
```
|
||||
|
||||
## Performance Tips
|
||||
|
||||
### Caching DataFrames
|
||||
|
||||
```python
|
||||
# Cache the DataFrame if using repeatedly
|
||||
df_cache = statement.to_dataframe()
|
||||
|
||||
# Reuse cached version
|
||||
revenue = df_cache.loc['Revenue']
|
||||
net_income = df_cache.loc['Net Income']
|
||||
# ... more operations
|
||||
```
|
||||
|
||||
### Selective Period Loading
|
||||
|
||||
```python
|
||||
# If you only need recent data
|
||||
current_only = xbrl.current_period.income_statement()
|
||||
df = current_only.to_dataframe() # Smaller, faster
|
||||
```
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### "KeyError: Line item not found"
|
||||
|
||||
**Cause**: Item label doesn't match exactly
|
||||
|
||||
**Solution**:
|
||||
```python
|
||||
# List all available items
|
||||
print(df.index.tolist())
|
||||
|
||||
# Or search for pattern
|
||||
matching = df[df.index.str.contains('keyword', case=False)]
|
||||
```
|
||||
|
||||
### "Empty DataFrame"
|
||||
|
||||
**Cause**: Statement has no data or wrong period filter
|
||||
|
||||
**Solution**:
|
||||
```python
|
||||
# Check raw data
|
||||
raw = statement.get_raw_data()
|
||||
print(f"Statement has {len(raw)} items")
|
||||
|
||||
# Check periods
|
||||
print(f"Available periods: {statement.periods}")
|
||||
```
|
||||
|
||||
### "Index error when accessing columns"
|
||||
|
||||
**Cause**: Fewer periods than expected
|
||||
|
||||
**Solution**:
|
||||
```python
|
||||
# Check column count first
|
||||
if len(df.columns) >= 2:
|
||||
current = df.iloc[:, 0]
|
||||
prior = df.iloc[:, 1]
|
||||
else:
|
||||
print("Insufficient periods for comparison")
|
||||
```
|
||||
|
||||
This guide covers the essential patterns for working with Statement objects in edgartools. For information on accessing statements from XBRL, see the XBRL documentation.
|
||||
587
venv/lib/python3.10/site-packages/edgar/xbrl/docs/XBRL.md
Normal file
587
venv/lib/python3.10/site-packages/edgar/xbrl/docs/XBRL.md
Normal file
@@ -0,0 +1,587 @@
|
||||
# XBRL Class Documentation
|
||||
|
||||
## Overview
|
||||
|
||||
The `XBRL` class is the primary interface for working with XBRL (eXtensible Business Reporting Language) financial data from SEC filings. It provides structured access to financial statements, facts, and related data extracted from filings like 10-K, 10-Q, and 8-K reports.
|
||||
|
||||
XBRL documents contain:
|
||||
- **Financial statements** (Income Statement, Balance Sheet, Cash Flow, etc.)
|
||||
- **Facts** - Individual data points with values, periods, and dimensions
|
||||
- **Contexts** - Time periods and dimensional information
|
||||
- **Presentation** - How facts are organized into statements
|
||||
|
||||
## Getting XBRL Data
|
||||
|
||||
### From a Filing
|
||||
|
||||
```python
|
||||
# Get XBRL from any filing with financial data
|
||||
filing = company.get_filings(form="10-K").latest()
|
||||
xbrl = filing.xbrl()
|
||||
```
|
||||
|
||||
### Quick Check
|
||||
|
||||
```python
|
||||
# Print XBRL to see what's available
|
||||
print(xbrl)
|
||||
# Shows: company info, available statements, periods, and usage examples
|
||||
```
|
||||
|
||||
## Accessing Financial Statements
|
||||
|
||||
### Core Statement Methods
|
||||
|
||||
The XBRL class provides convenient methods for accessing standard financial statements:
|
||||
|
||||
```python
|
||||
# Access core financial statements
|
||||
income = xbrl.statements.income_statement()
|
||||
balance = xbrl.statements.balance_sheet()
|
||||
cashflow = xbrl.statements.cash_flow_statement()
|
||||
equity = xbrl.statements.statement_of_equity()
|
||||
comprehensive = xbrl.statements.comprehensive_income()
|
||||
```
|
||||
|
||||
### Access by Name
|
||||
|
||||
You can access any statement by its exact name as it appears in the filing:
|
||||
|
||||
```python
|
||||
# List all available statements
|
||||
print(xbrl.statements)
|
||||
|
||||
# Access specific statement by name
|
||||
cover_page = xbrl.statements['CoverPage']
|
||||
disclosure = xbrl.statements['CONDENSED CONSOLIDATED BALANCE SHEETS Unaudited']
|
||||
```
|
||||
|
||||
### Access by Index
|
||||
|
||||
Statements can also be accessed by their index position:
|
||||
|
||||
```python
|
||||
# Get statement by index (0-based)
|
||||
first_statement = xbrl.statements[0]
|
||||
sixth_statement = xbrl.statements[6]
|
||||
```
|
||||
|
||||
## Working with Periods
|
||||
|
||||
### Current Period Only
|
||||
|
||||
To work with just the most recent period's data:
|
||||
|
||||
```python
|
||||
# Get current period XBRL view
|
||||
current = xbrl.current_period
|
||||
|
||||
# Access statements for current period
|
||||
current_income = current.income_statement()
|
||||
current_balance = current.balance_sheet()
|
||||
```
|
||||
|
||||
### Multi-Period Statements
|
||||
|
||||
By default, statements include multiple periods for comparison:
|
||||
|
||||
```python
|
||||
# Get income statement with comparative periods
|
||||
income = xbrl.statements.income_statement()
|
||||
# Typically includes current year/quarter and prior periods
|
||||
|
||||
# Convert to DataFrame to see all periods
|
||||
df = income.to_dataframe()
|
||||
print(df.columns) # Shows all available periods
|
||||
```
|
||||
|
||||
### Available Periods
|
||||
|
||||
```python
|
||||
# See what periods are available
|
||||
for period in xbrl.reporting_periods:
|
||||
print(f"Period: {period['label']}, Key: {period['key']}")
|
||||
```
|
||||
|
||||
## Querying Facts
|
||||
|
||||
The `.facts` property provides a powerful query interface for finding specific data points:
|
||||
|
||||
### Basic Fact Queries
|
||||
|
||||
```python
|
||||
# Get all revenue facts
|
||||
revenue_facts = xbrl.facts.query().by_concept('Revenue').to_dataframe()
|
||||
|
||||
# Get net income facts
|
||||
net_income = xbrl.facts.query().by_concept('NetIncome').to_dataframe()
|
||||
|
||||
# Search by label instead of concept name
|
||||
revenue = xbrl.facts.query().by_label('Revenue').to_dataframe()
|
||||
```
|
||||
|
||||
### Filter by Period
|
||||
|
||||
```python
|
||||
# Get facts for a specific period
|
||||
period_key = "duration_2024-01-01_2024-12-31"
|
||||
facts_2024 = xbrl.facts.query().by_period_key(period_key).to_dataframe()
|
||||
|
||||
# Filter by fiscal year
|
||||
facts_fy2024 = xbrl.facts.query().by_fiscal_year(2024).to_dataframe()
|
||||
|
||||
# Filter by fiscal period
|
||||
q1_facts = xbrl.facts.query().by_fiscal_period("Q1").to_dataframe()
|
||||
```
|
||||
|
||||
### Filter by Statement Type
|
||||
|
||||
```python
|
||||
# Get all income statement facts
|
||||
income_facts = xbrl.facts.query().by_statement_type("IncomeStatement").to_dataframe()
|
||||
|
||||
# Get all balance sheet facts
|
||||
balance_facts = xbrl.facts.query().by_statement_type("BalanceSheet").to_dataframe()
|
||||
```
|
||||
|
||||
### Chaining Filters
|
||||
|
||||
```python
|
||||
# Combine multiple filters
|
||||
revenue_2024 = (xbrl.facts.query()
|
||||
.by_concept('Revenue')
|
||||
.by_fiscal_year(2024)
|
||||
.by_period_type('duration')
|
||||
.to_dataframe())
|
||||
```
|
||||
|
||||
### Pattern Matching
|
||||
|
||||
```python
|
||||
# Find all concepts matching a pattern (case-insensitive)
|
||||
asset_facts = xbrl.facts.query().by_concept('Asset', exact=False).to_dataframe()
|
||||
|
||||
# Search labels with pattern
|
||||
liability_facts = xbrl.facts.query().by_label('liabilities', exact=False).to_dataframe()
|
||||
```
|
||||
|
||||
## Converting to DataFrames
|
||||
|
||||
### Statement to DataFrame
|
||||
|
||||
```python
|
||||
# Convert any statement to pandas DataFrame
|
||||
income = xbrl.statements.income_statement()
|
||||
df = income.to_dataframe()
|
||||
|
||||
# DataFrame has:
|
||||
# - One row per line item
|
||||
# - One column per period
|
||||
# - Index is the concept/label
|
||||
```
|
||||
|
||||
### Facts to DataFrame
|
||||
|
||||
```python
|
||||
# Query returns DataFrame directly
|
||||
df = xbrl.facts.query().by_concept('Revenue').to_dataframe()
|
||||
|
||||
# DataFrame columns:
|
||||
# - concept: XBRL concept name
|
||||
# - label: Human-readable label
|
||||
# - value: Fact value
|
||||
# - period: Period identifier
|
||||
# - start: Period start date (for duration)
|
||||
# - end: Period end date
|
||||
# - unit: Unit of measure (e.g., USD)
|
||||
# - dimensions: Dimensional breakdowns (if any)
|
||||
```
|
||||
|
||||
## Advanced Patterns
|
||||
|
||||
### Finding Specific Disclosures
|
||||
|
||||
```python
|
||||
# Get statements organized by category
|
||||
categories = xbrl.statements.get_statements_by_category()
|
||||
|
||||
# View all disclosures
|
||||
disclosures = categories['disclosure']
|
||||
for disc in disclosures:
|
||||
print(f"{disc['index']}: {disc['title']}")
|
||||
|
||||
# View all notes
|
||||
notes = categories['note']
|
||||
for note in notes:
|
||||
print(f"{note['index']}: {note['title']}")
|
||||
|
||||
# Get core financial statements
|
||||
core_statements = categories['statement']
|
||||
|
||||
# Or list all statements to find specific ones
|
||||
all_statements = xbrl.get_all_statements()
|
||||
for stmt in all_statements:
|
||||
print(f"{stmt['type']}: {stmt['title']}")
|
||||
|
||||
# Access by exact name or index
|
||||
risk_factors = xbrl.statements['RiskFactorsDisclosure']
|
||||
# Or by index from the category list
|
||||
first_disclosure = xbrl.statements[disclosures[0]['index']]
|
||||
```
|
||||
|
||||
### Cross-Period Analysis
|
||||
|
||||
```python
|
||||
# Get multi-period income statement
|
||||
income = xbrl.statements.income_statement()
|
||||
df = income.to_dataframe()
|
||||
|
||||
# Calculate year-over-year growth
|
||||
if len(df.columns) >= 2:
|
||||
current = df.iloc[:, 0]
|
||||
prior = df.iloc[:, 1]
|
||||
growth = ((current - prior) / prior * 100).round(2)
|
||||
print(f"Revenue growth: {growth.loc['Revenue']}%")
|
||||
```
|
||||
|
||||
### Working with Dimensions
|
||||
|
||||
```python
|
||||
# Query facts with specific dimensional breakdowns
|
||||
segment_revenue = (xbrl.facts.query()
|
||||
.by_concept('Revenue')
|
||||
.by_dimension('Segment', 'ProductSegment')
|
||||
.to_dataframe())
|
||||
|
||||
# Group by dimensions
|
||||
segment_totals = segment_revenue.groupby('dimensions')['value'].sum()
|
||||
```
|
||||
|
||||
### Custom Fact Filtering
|
||||
|
||||
```python
|
||||
# Use custom filter function
|
||||
large_amounts = xbrl.facts.query().by_value(lambda v: abs(v) > 1000000).to_dataframe()
|
||||
|
||||
# Custom filter with lambda
|
||||
recent_facts = xbrl.facts.query().by_custom(
|
||||
lambda fact: fact['end'] >= '2024-01-01'
|
||||
).to_dataframe()
|
||||
```
|
||||
|
||||
## Common Workflows
|
||||
|
||||
### Extract Revenue from Income Statement
|
||||
|
||||
```python
|
||||
# Method 1: Via statement
|
||||
income = xbrl.statements.income_statement()
|
||||
df = income.to_dataframe()
|
||||
revenue = df.loc['Revenue']
|
||||
|
||||
# Method 2: Via facts query
|
||||
revenue_facts = xbrl.facts.query().by_concept('Revenues').to_dataframe()
|
||||
latest_revenue = revenue_facts.iloc[0]['value']
|
||||
```
|
||||
|
||||
### Compare Current vs Prior Year
|
||||
|
||||
```python
|
||||
# Get current period data
|
||||
current = xbrl.current_period
|
||||
current_income = current.income_statement()
|
||||
current_df = current_income.to_dataframe()
|
||||
|
||||
# Get full multi-period data
|
||||
full_income = xbrl.statements.income_statement()
|
||||
full_df = full_income.to_dataframe()
|
||||
|
||||
# Compare
|
||||
if len(full_df.columns) >= 2:
|
||||
comparison = pd.DataFrame({
|
||||
'Current': full_df.iloc[:, 0],
|
||||
'Prior': full_df.iloc[:, 1],
|
||||
'Change': full_df.iloc[:, 0] - full_df.iloc[:, 1]
|
||||
})
|
||||
print(comparison)
|
||||
```
|
||||
|
||||
### Extract Specific Disclosure Data
|
||||
|
||||
```python
|
||||
# Find debt-related disclosures
|
||||
all_statements = xbrl.get_all_statements()
|
||||
debt_statements = [s for s in all_statements if 'debt' in s['title'].lower()]
|
||||
|
||||
# Access first debt disclosure
|
||||
if debt_statements:
|
||||
debt_disclosure = xbrl.statements[debt_statements[0]['type']]
|
||||
debt_df = debt_disclosure.to_dataframe()
|
||||
```
|
||||
|
||||
### Export All Core Statements
|
||||
|
||||
```python
|
||||
# Export all core financial statements to CSV
|
||||
statements_to_export = {
|
||||
'income_statement': xbrl.statements.income_statement(),
|
||||
'balance_sheet': xbrl.statements.balance_sheet(),
|
||||
'cash_flow': xbrl.statements.cash_flow_statement(),
|
||||
}
|
||||
|
||||
for name, stmt in statements_to_export.items():
|
||||
if stmt:
|
||||
df = stmt.to_dataframe()
|
||||
df.to_csv(f"{name}.csv")
|
||||
```
|
||||
|
||||
### Build Custom Financial Summary
|
||||
|
||||
```python
|
||||
# Extract key metrics from multiple statements
|
||||
metrics = {}
|
||||
|
||||
# Revenue and profit from income statement
|
||||
income = xbrl.statements.income_statement()
|
||||
income_df = income.to_dataframe()
|
||||
metrics['Revenue'] = income_df.loc['Revenue', income_df.columns[0]]
|
||||
metrics['Net Income'] = income_df.loc['Net Income', income_df.columns[0]]
|
||||
|
||||
# Assets from balance sheet
|
||||
balance = xbrl.statements.balance_sheet()
|
||||
balance_df = balance.to_dataframe()
|
||||
metrics['Total Assets'] = balance_df.loc['Assets', balance_df.columns[0]]
|
||||
|
||||
# Cash flow from operations
|
||||
cashflow = xbrl.statements.cash_flow_statement()
|
||||
cashflow_df = cashflow.to_dataframe()
|
||||
metrics['Operating Cash Flow'] = cashflow_df.loc['Operating Activities', cashflow_df.columns[0]]
|
||||
|
||||
# Create summary DataFrame
|
||||
summary = pd.DataFrame([metrics])
|
||||
print(summary)
|
||||
```
|
||||
|
||||
## Entity Information
|
||||
|
||||
### Access Filing Metadata
|
||||
|
||||
```python
|
||||
# Get entity and filing information
|
||||
entity_info = xbrl.entity_info
|
||||
|
||||
print(f"Company: {entity_info.get('entity_name')}")
|
||||
print(f"Ticker: {entity_info.get('trading_symbol')}")
|
||||
print(f"CIK: {entity_info.get('entity_identifier')}")
|
||||
print(f"Form: {entity_info.get('document_type')}")
|
||||
print(f"Fiscal Year: {entity_info.get('document_fiscal_year_focus')}")
|
||||
print(f"Fiscal Period: {entity_info.get('document_fiscal_period_focus')}")
|
||||
```
|
||||
|
||||
## Error Handling
|
||||
|
||||
### Missing Statements
|
||||
|
||||
```python
|
||||
from edgar.xbrl.xbrl import StatementNotFound
|
||||
|
||||
try:
|
||||
equity = xbrl.statements.statement_of_equity()
|
||||
except StatementNotFound:
|
||||
print("Statement of equity not available in this filing")
|
||||
equity = None
|
||||
```
|
||||
|
||||
### Empty Query Results
|
||||
|
||||
```python
|
||||
# Query returns empty DataFrame if no matches
|
||||
results = xbrl.facts.query().by_concept('NonexistentConcept').to_dataframe()
|
||||
|
||||
if results.empty:
|
||||
print("No facts found matching query")
|
||||
```
|
||||
|
||||
### Handling Multiple Formats
|
||||
|
||||
```python
|
||||
# Some companies use different concept names
|
||||
revenue_concepts = ['Revenue', 'Revenues', 'SalesRevenue', 'RevenueFromContractWithCustomer']
|
||||
|
||||
for concept in revenue_concepts:
|
||||
revenue = xbrl.facts.query().by_concept(concept).to_dataframe()
|
||||
if not revenue.empty:
|
||||
print(f"Found revenue under concept: {concept}")
|
||||
break
|
||||
```
|
||||
|
||||
## Performance Considerations
|
||||
|
||||
### Caching
|
||||
|
||||
```python
|
||||
# Facts are cached after first access
|
||||
facts = xbrl.facts # First call - loads data
|
||||
facts2 = xbrl.facts # Subsequent calls use cache
|
||||
```
|
||||
|
||||
### Limiting Results
|
||||
|
||||
```python
|
||||
# Use limit() to reduce memory usage for large result sets
|
||||
sample_facts = xbrl.facts.query().limit(100).to_dataframe()
|
||||
```
|
||||
|
||||
### Efficient Filtering
|
||||
|
||||
```python
|
||||
# Apply specific filters early in the query chain
|
||||
# Good: specific filters first
|
||||
revenue = (xbrl.facts.query()
|
||||
.by_statement_type("IncomeStatement") # Narrow down first
|
||||
.by_concept("Revenue") # Then more specific
|
||||
.to_dataframe())
|
||||
|
||||
# Less efficient: broad query then filter
|
||||
all_facts = xbrl.facts.query().to_dataframe()
|
||||
revenue = all_facts[all_facts['concept'] == 'Revenue']
|
||||
```
|
||||
|
||||
## Data Structure Reference
|
||||
|
||||
### Key Properties
|
||||
|
||||
| Property | Type | Description |
|
||||
|----------|------|-------------|
|
||||
| `statements` | Statements | Access to financial statements |
|
||||
| `facts` | FactsView | Query interface for facts |
|
||||
| `entity_info` | dict | Company and filing metadata |
|
||||
| `reporting_periods` | list | Available time periods |
|
||||
| `contexts` | dict | XBRL contexts (periods + dimensions) |
|
||||
| `units` | dict | Units of measure |
|
||||
| `current_period` | CurrentPeriodView | Current period only |
|
||||
|
||||
### Fact DataFrame Columns
|
||||
|
||||
When you convert facts to a DataFrame using `.to_dataframe()`, you get:
|
||||
|
||||
- `concept`: XBRL element name (e.g., 'Revenues', 'Assets')
|
||||
- `label`: Human-readable label
|
||||
- `value`: Fact value (numeric or text)
|
||||
- `period`: Period identifier
|
||||
- `start`: Period start date (for duration periods)
|
||||
- `end`: Period end date
|
||||
- `unit`: Unit of measure (e.g., 'USD', 'shares')
|
||||
- `dimensions`: Dictionary of dimensional breakdowns
|
||||
- `decimals`: Precision indicator
|
||||
|
||||
## Integration with Other Classes
|
||||
|
||||
### With Filing
|
||||
|
||||
```python
|
||||
# XBRL comes from filing
|
||||
filing = company.get_filings(form="10-K").latest()
|
||||
xbrl = filing.xbrl()
|
||||
|
||||
# Access back to filing if needed
|
||||
# (Store reference if you need it)
|
||||
```
|
||||
|
||||
### With Company
|
||||
|
||||
```python
|
||||
# Get multiple filings and compare XBRL data
|
||||
filings = company.get_filings(form="10-Q", count=4)
|
||||
|
||||
revenue_trend = []
|
||||
for filing in filings:
|
||||
xbrl = filing.xbrl()
|
||||
revenue = xbrl.facts.query().by_concept('Revenue').to_dataframe()
|
||||
if not revenue.empty:
|
||||
revenue_trend.append({
|
||||
'filing_date': filing.filing_date,
|
||||
'revenue': revenue.iloc[0]['value']
|
||||
})
|
||||
|
||||
trend_df = pd.DataFrame(revenue_trend)
|
||||
```
|
||||
|
||||
## Best Practices
|
||||
|
||||
1. **Check statement availability** before accessing:
|
||||
```python
|
||||
print(xbrl) # See what's available
|
||||
```
|
||||
|
||||
2. **Use query chaining** for complex filters:
|
||||
```python
|
||||
results = (xbrl.facts.query()
|
||||
.by_statement_type("IncomeStatement")
|
||||
.by_fiscal_year(2024)
|
||||
.by_period_type("duration")
|
||||
.to_dataframe())
|
||||
```
|
||||
|
||||
3. **Handle missing data gracefully**:
|
||||
```python
|
||||
try:
|
||||
stmt = xbrl.statements.equity_statement()
|
||||
except StatementNotFound:
|
||||
stmt = None
|
||||
```
|
||||
|
||||
4. **Convert to DataFrame for analysis**:
|
||||
```python
|
||||
df = statement.to_dataframe() # Easier to work with
|
||||
```
|
||||
|
||||
5. **Use current_period for latest data**:
|
||||
```python
|
||||
current = xbrl.current_period
|
||||
latest_income = current.income_statement()
|
||||
```
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### "Statement not found"
|
||||
|
||||
**Cause**: Statement doesn't exist in this filing or uses non-standard name
|
||||
|
||||
**Solution**:
|
||||
```python
|
||||
# List all available statements
|
||||
print(xbrl.statements)
|
||||
|
||||
# Or check available types
|
||||
all_statements = xbrl.get_all_statements()
|
||||
statement_types = [s['type'] for s in all_statements]
|
||||
```
|
||||
|
||||
### "No facts found"
|
||||
|
||||
**Cause**: Concept name doesn't match or no data for period
|
||||
|
||||
**Solution**:
|
||||
```python
|
||||
# Try pattern matching
|
||||
results = xbrl.facts.query().by_concept('Revenue', exact=False).to_dataframe()
|
||||
|
||||
# Or search by label
|
||||
results = xbrl.facts.query().by_label('revenue').to_dataframe()
|
||||
```
|
||||
|
||||
### "Empty DataFrame"
|
||||
|
||||
**Cause**: Period filter too restrictive or no data available
|
||||
|
||||
**Solution**:
|
||||
```python
|
||||
# Check available periods
|
||||
print(xbrl.reporting_periods)
|
||||
|
||||
# Query without period filter
|
||||
all_revenue = xbrl.facts.query().by_concept('Revenue').to_dataframe()
|
||||
```
|
||||
|
||||
This comprehensive guide covers the essential patterns for working with XBRL data in edgartools. For more examples, see the Filing and Statement documentation.
|
||||
311
venv/lib/python3.10/site-packages/edgar/xbrl/examples.py
Normal file
311
venv/lib/python3.10/site-packages/edgar/xbrl/examples.py
Normal file
@@ -0,0 +1,311 @@
|
||||
"""
|
||||
Examples demonstrating how to use the XBRL2 module.
|
||||
|
||||
This module provides multiple examples demonstrating different ways to use the XBRL2 module.
|
||||
"""
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
from rich import print
|
||||
from rich.console import Console
|
||||
|
||||
from edgar import Company, Filing
|
||||
from edgar.xbrl.statements import Statements
|
||||
from edgar.xbrl.xbrl import XBRL
|
||||
|
||||
|
||||
def render_financial_statements(ticker="AAPL"):
    """
    Demonstrate how to render financial statements in a tabular format.

    Fetches the latest 10-K filing for *ticker*, builds an XBRL object,
    and renders the core statements (balance sheet, income statement,
    cash flow) using several available period views.

    Args:
        ticker: Stock ticker symbol of the company to analyze.
                Defaults to "AAPL".
    """
    # Bug fix: previously hard-coded Company("AAPL"), silently ignoring
    # the ticker argument (cf. using_statements_api, which uses it).
    company = Company(ticker)

    # Get the latest 10-K filing for the company
    filing = company.latest("10-K")

    # Parse the filing's XBRL data into a structured object
    xbrl = XBRL.from_filing(filing)

    # Display entity information (company name, CIK, fiscal focus, etc.)
    print("\n[bold]Entity Information:[/bold]")
    for key, value in xbrl.entity_info.items():
        print(f"{key}: {value}")

    # Display available reporting periods; instants are balance-sheet
    # dates, durations are income/cash-flow periods.
    print("\n[bold]Available Reporting Periods:[/bold]")
    for i, period in enumerate(xbrl.reporting_periods):
        if period['type'] == 'instant':
            print(f"{i + 1}. As of {period['date']}")
        else:
            print(f"{i + 1}. {period['start_date']} to {period['end_date']}")

    # Show the named period views offered for each statement type
    print("\n[bold]Available Period Views for Balance Sheet:[/bold]")
    bs_views = xbrl.get_period_views("BalanceSheet")
    for view in bs_views:
        print(f"- {view['name']}: {view['description']}")

    print("\n[bold]Available Period Views for Income Statement:[/bold]")
    is_views = xbrl.get_period_views("IncomeStatement")
    for view in is_views:
        print(f"- {view['name']}: {view['description']}")

    # Render Balance Sheet using the default view
    print("\n[bold]Balance Sheet (Default View):[/bold]")
    balance_sheet = xbrl.render_statement("BalanceSheet")
    print(balance_sheet)

    # Render Balance Sheet with Current vs. Previous Period view if available
    if bs_views and any(v['name'] == 'Current vs. Previous Period' for v in bs_views):
        print("\n[bold]Balance Sheet (Current vs. Previous Period):[/bold]")
        current_vs_prev_bs = xbrl.render_statement("BalanceSheet", period_view="Current vs. Previous Period")
        print(current_vs_prev_bs)

    # Render Income Statement using the default view
    print("\n[bold]Income Statement (Default View):[/bold]")
    income_statement = xbrl.render_statement("IncomeStatement")
    print(income_statement)

    # Render Income Statement with Annual Comparison view if available
    if is_views and any(v['name'] == 'Annual Comparison' for v in is_views):
        print("\n[bold]Income Statement (Annual Comparison):[/bold]")
        annual_is = xbrl.render_statement("IncomeStatement", period_view="Annual Comparison")
        print(annual_is)

    # Render Cash Flow Statement
    print("\n[bold]Cash Flow Statement:[/bold]")
    cash_flow = xbrl.render_statement("CashFlowStatement")
    print(cash_flow)

    # Demonstrate rendering a single specific period
    if xbrl.reporting_periods:
        # Use the most recent instant period for the Balance Sheet
        instant_periods = [p for p in xbrl.reporting_periods if p['type'] == 'instant']

        if instant_periods:
            period = instant_periods[0]  # Most recent period
            period_key = f"instant_{period['date']}"

            print(f"\n[bold]Balance Sheet (As of {period['date']} only):[/bold]")
            single_period_bs = xbrl.render_statement("BalanceSheet", period_filter=period_key)
            print(single_period_bs)

        # Use the most recent duration period for the Income Statement
        duration_periods = [p for p in xbrl.reporting_periods if p['type'] == 'duration']

        if duration_periods:
            period = duration_periods[0]  # Most recent period
            period_key = f"duration_{period['start_date']}_{period['end_date']}"

            print(f"\n[bold]Income Statement ({period['start_date']} to {period['end_date']} only):[/bold]")
            single_period_is = xbrl.render_statement("IncomeStatement", period_filter=period_key)
            print(single_period_is)
|
||||
|
||||
def using_statements_api(ticker="TSLA"):
    """
    Demonstrates the use of the user-friendly Statements API.

    Fetches the company's latest 10-K, wraps the parsed XBRL in a
    ``Statements`` object, and prints each primary statement, the available
    period views, and a DataFrame conversion.

    Args:
        ticker: Ticker symbol of the company to demonstrate with.
    """
    company = Company(ticker)

    # Get the latest filing
    filing = company.latest("10-K")

    # Create an XBRL object
    xbrl = XBRL.from_filing(filing)

    # Create a Statements object for easier access
    statements = Statements(xbrl)

    # Display available statements
    print("\n[bold]Available Statements:[/bold]")
    print(statements)

    # Display balance sheet
    print("\n[bold]Balance Sheet:[/bold]")
    balance_sheet = statements.balance_sheet()
    print(balance_sheet)

    # Display income statement
    print("\n[bold]Income Statement:[/bold]")
    income_statement = statements.income_statement()
    print(income_statement)

    # Display cash flow statement
    print("\n[bold]Cash Flow Statement:[/bold]")
    cash_flow = statements.cashflow_statement()
    print(cash_flow)

    # Get available period views
    print("\n[bold]Available Period Views for Income Statement:[/bold]")
    period_views = statements.get_period_views("IncomeStatement")
    for view in period_views:
        print(f"- {view['name']}: {view['description']}")

    # Display with specific period view if available
    if period_views:
        view_name = period_views[0]['name']
        print(f"\n[bold]Income Statement with {view_name} Period View:[/bold]")
        income_statement_view = statements.income_statement(period_view=view_name)
        print(income_statement_view)

    # Display three-column view if available
    # NOTE: matched purely by "Three" appearing in the view name.
    print("\n[bold]Three-Column Statement View (if available):[/bold]")
    period_views = statements.get_period_views("BalanceSheet")
    three_year_view = next((v for v in period_views if "Three" in v['name']), None)
    if three_year_view:
        print(f"\n[bold]Balance Sheet with Three Periods ({three_year_view['name']}):[/bold]")
        print(f"Description: {three_year_view['description']}")
        three_col_bs = statements.balance_sheet(period_view=three_year_view['name'])
        print(three_col_bs)
    else:
        print("[yellow]No three-period view available for this filing.[/yellow]")

    # Convert to dataframe
    print("\n[bold]Converting to DataFrame:[/bold]")
    df = statements.to_dataframe("IncomeStatement")
    print(f"DataFrame shape: {df.shape}")
    print(df.head(3))
||||
|
||||
def example_with_real_filing():
    """
    Example using a real filing from SEC.

    Fetches a known Apple 10-K by accession number, parses its XBRL, and
    prints entity information plus the balance sheet.

    Note: This requires internet access.
    """
    # Using print directly with rich formatting instead of console
    print("[bold]Example with Real Filing[/bold]")

    try:
        # Get a filing with XBRL attachments
        filing = Filing.get('0000320193-23-000077')  # Apple 10-K
        print(f"Retrieved filing: {filing.form} for {filing.company} ({filing.filing_date})")

        # Parse XBRL data
        xbrl = XBRL.from_filing(filing)

        # Create Statements object
        statements = Statements(xbrl)

        # Display entity information
        print("\n[bold]Entity Information:[/bold]")
        entity_info = {
            'entity_name': xbrl.entity_info.get('entity_name'),
            'ticker': xbrl.entity_info.get('ticker'),
            'document_type': xbrl.entity_info.get('document_type'),
            'fiscal_year': xbrl.entity_info.get('fiscal_year'),
            'fiscal_period': xbrl.entity_info.get('fiscal_period')
        }
        for key, value in entity_info.items():
            print(f"{key}: {value}")

        # Display balance sheet
        print("\n[bold]Balance Sheet:[/bold]")
        balance_sheet = statements.balance_sheet()
        print(balance_sheet)

    except Exception as e:
        # Broad catch is deliberate here: the demo should degrade gracefully
        # (e.g. when run offline) rather than crash.
        print(f"[bold red]Error loading real filing: {str(e)}[/bold red]")
        print("[yellow]Note: This example requires internet access to fetch filings from SEC EDGAR.[/yellow]")
||||
|
||||
|
||||
def standardized_statements_example():
    """
    Demonstrates the use of standardized concept labels.

    Parses XBRL from the local "aapl" sample directory next to this file,
    then prints each statement twice — with original labels and with
    ``standard=True`` — and compares the resulting DataFrames.
    """
    # Path to XBRL files (local sample data; no network access needed)
    sample_dir = Path(__file__).parent / "aapl"

    # Create an XBRL object by parsing the directory
    xbrl = XBRL.from_directory(sample_dir)

    # Create a Statements object for easier access
    statements = Statements(xbrl)

    # Display original income statement
    print("\n[bold]Income Statement (Original Labels):[/bold]")
    income_statement = statements.income_statement()
    print(income_statement)

    # Display standardized income statement
    print("\n[bold]Income Statement (Standardized Labels):[/bold]")
    income_statement_std = statements.income_statement(standard=True)
    print(income_statement_std)

    # Display original balance sheet
    print("\n[bold]Balance Sheet (Original Labels):[/bold]")
    balance_sheet = statements.balance_sheet()
    print(balance_sheet)

    # Display standardized balance sheet
    print("\n[bold]Balance Sheet (Standardized Labels):[/bold]")
    balance_sheet_std = statements.balance_sheet(standard=True)
    print(balance_sheet_std)

    # Show standardized statement with a specific period view
    period_views = statements.get_period_views("BalanceSheet")
    if period_views:
        view_name = period_views[0]['name']
        print(f"\n[bold]Balance Sheet ({view_name}) with Standardized Labels:[/bold]")
        balance_sheet_view_std = statements.balance_sheet(period_view=view_name, standard=True)
        print(balance_sheet_view_std)

    # Demonstrate standardized DataFrames
    print("\n[bold]Converting to DataFrame with Standardized Labels:[/bold]")

    # Original DataFrame
    print("\n[bold]Original DataFrame:[/bold]")
    df_orig = statements.to_dataframe("IncomeStatement", standard=False)
    if not df_orig.empty:
        print(f"DataFrame shape: {df_orig.shape}")
        print(df_orig[['concept', 'label']].head(3))

    # Standardized DataFrame
    print("\n[bold]Standardized DataFrame:[/bold]")
    df_std = statements.to_dataframe("IncomeStatement", standard=True)
    if not df_std.empty:
        print(f"DataFrame shape: {df_std.shape}")
        # 'original_label' is only present when standardization renamed rows
        if 'original_label' in df_std.columns:
            print(df_std[['concept', 'label', 'original_label']].head(3))
        else:
            print(df_std[['concept', 'label']].head(3))
||||
|
||||
if __name__ == "__main__":
    # Interactive menu driver for the example functions above.
    # NOTE(review): `console` is created but never used below; output relies
    # on rich-style markup in plain print calls — confirm rich's print is in
    # scope at the top of this file.
    console = Console()
    print("[bold cyan]XBRL2 Module Examples[/bold cyan]")
    print("[yellow]Choose an example to run:[/yellow]")
    print("1. Render Financial Statements (Direct XBRL API)")
    print("2. Using Statements API (User-friendly API)")
    print("3. Example with Real Filing (Requires Internet)")
    print("4. Standardized Statements (Concept Standardization)")
    print("5. Run All Examples")

    try:
        choice = input("\nEnter your choice (1-5): ")

        if choice == "1":
            render_financial_statements()
        elif choice == "2":
            using_statements_api()
        elif choice == "3":
            example_with_real_filing()
        elif choice == "4":
            standardized_statements_example()
        elif choice == "5":
            # Run every example in sequence, separated by horizontal rules.
            print("\n[bold]Running All Examples[/bold]\n")
            print("\n[bold cyan]Example 1: Render Financial Statements[/bold cyan]\n")
            render_financial_statements()
            print("\n" + "-" * 80 + "\n")
            print("\n[bold cyan]Example 2: Using Statements API[/bold cyan]\n")
            using_statements_api()
            print("\n" + "-" * 80 + "\n")
            print("\n[bold cyan]Example 3: Example with Real Filing[/bold cyan]\n")
            example_with_real_filing()
            print("\n" + "-" * 80 + "\n")
            print("\n[bold cyan]Example 4: Standardized Statements[/bold cyan]\n")
            standardized_statements_example()
        else:
            print("[bold red]Invalid choice. Please run the script again and select a valid option.[/bold red]")

    except KeyboardInterrupt:
        # Ctrl-C exits the menu cleanly instead of printing a traceback.
        print("\n[yellow]Examples cancelled by user.[/yellow]")
    except Exception as e:
        print(f"[bold red]Error running examples: {str(e)}[/bold red]")
||||
36
venv/lib/python3.10/site-packages/edgar/xbrl/exceptions.py
Normal file
36
venv/lib/python3.10/site-packages/edgar/xbrl/exceptions.py
Normal file
@@ -0,0 +1,36 @@
|
||||
"""
|
||||
XBRL-specific exceptions.
|
||||
"""
|
||||
|
||||
from dataclasses import dataclass
|
||||
from typing import List
|
||||
|
||||
|
||||
@dataclass
class StatementNotFound(Exception):
    """Exception raised when a statement cannot be resolved with sufficient confidence.

    Carries enough context (entity, CIK, reporting period, candidate
    statements, match confidence) to produce a useful diagnostic message.
    """
    statement_type: str
    confidence: float
    found_statements: List[str]
    entity_name: str = "Unknown"
    cik: str = "Unknown"
    period_of_report: str = "Unknown"
    reason: str = ""

    def __str__(self):
        """Assemble the diagnostic message from its sentence fragments."""
        header = (
            f"Failed to resolve {self.statement_type} for {self.entity_name} "
            f"(CIK: {self.cik}, Period: {self.period_of_report})"
        )

        # A positive confidence means something matched, just not well enough.
        confidence_part = (
            f"Low confidence match: {self.confidence:.2f}"
            if self.confidence > 0
            else "No matching statements found"
        )

        found_part = (
            f"Found statements: {self.found_statements}"
            if self.found_statements
            else "No statements available"
        )

        segments = [header, confidence_part, found_part]
        if self.reason:
            segments.append(self.reason)
        return ". ".join(segments)
||||
1513
venv/lib/python3.10/site-packages/edgar/xbrl/facts.py
Normal file
1513
venv/lib/python3.10/site-packages/edgar/xbrl/facts.py
Normal file
File diff suppressed because it is too large
Load Diff
313
venv/lib/python3.10/site-packages/edgar/xbrl/models.py
Normal file
313
venv/lib/python3.10/site-packages/edgar/xbrl/models.py
Normal file
@@ -0,0 +1,313 @@
|
||||
"""
|
||||
Data models for XBRL parsing.
|
||||
|
||||
This module defines the core data structures used throughout the XBRL parser.
|
||||
"""
|
||||
|
||||
from typing import Any, Dict, List, Optional, Union
|
||||
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
# Constants for label roles
# Role URIs identifying the different kinds of labels an element may carry
# in the XBRL label linkbase.
STANDARD_LABEL = "http://www.xbrl.org/2003/role/label"          # default label
TERSE_LABEL = "http://www.xbrl.org/2003/role/terseLabel"        # short form
PERIOD_START_LABEL = "http://www.xbrl.org/2003/role/periodStartLabel"
PERIOD_END_LABEL = "http://www.xbrl.org/2003/role/periodEndLabel"
TOTAL_LABEL = "http://www.xbrl.org/2003/role/totalLabel"        # total rows
||||
|
||||
|
||||
def select_display_label(
    labels: Dict[str, str],
    preferred_label: Optional[str] = None,
    standard_label: Optional[str] = None,
    element_id: Optional[str] = None,
    element_name: Optional[str] = None
) -> str:
    """
    Pick the best display label for an element, applying standardization last.

    The raw label is resolved with a fixed priority ladder (preferred role,
    terse, standard, any available label, element name, element id).  If a
    standardization mapping recognizes the concept, the standardized label
    replaces the locally chosen one.

    Args:
        labels: Available labels keyed by label role URI.
        preferred_label: Label role preferred by the presentation linkbase.
        standard_label: Pre-resolved standard label text, if any.
        element_id: Element identifier; used both for the standardization
            lookup and as the last-resort label.
        element_name: Element name fallback.

    Returns:
        The chosen (possibly standardized) label, or "" when nothing at all
        is available.
    """
    available = labels or {}

    # Walk the priority ladder; the first branch that matches wins.
    if preferred_label and available and preferred_label in available:
        chosen = available[preferred_label]
    elif available and TERSE_LABEL in available:
        chosen = available[TERSE_LABEL]
    elif standard_label:
        chosen = standard_label
    elif available and STANDARD_LABEL in available:
        chosen = available[STANDARD_LABEL]
    elif available:
        chosen = next(iter(available.values()), "")
    elif element_name:
        chosen = element_name
    else:
        chosen = element_id or ""

    # Standardization pass: map the concept to a cross-company label.
    if element_id and chosen:
        try:
            from edgar.xbrl.standardization.core import initialize_default_mappings

            # The mapping store is memoized on the function object so it is
            # only built once per process.
            store = getattr(select_display_label, '_mapping_store', None)
            if store is None:
                store = initialize_default_mappings(read_only=True)
                select_display_label._mapping_store = store

            standardized = store.get_standard_concept(element_id)
            if standardized:
                return standardized

        except ImportError:
            # Standardization package unavailable; keep the chosen label.
            pass
        except Exception:
            # Standardization must never break basic label selection.
            pass

    return chosen
||||
|
||||
|
||||
class ElementCatalog:
    """
    Metadata record for a single XBRL element.

    Collects the schema-level properties of an element (data type, period
    type, balance, abstractness) together with its labels keyed by role URI.
    This is the base data structure for element metadata as described in the
    design document.

    Attributes:
        name: Element name, e.g. "us-gaap_NetIncome".
        data_type: Data type such as "monetary" or "string".
        period_type: "instant" or "duration".
        balance: "debit", "credit", or None.
        abstract: True for abstract elements.
        labels: Mapping of label role URI -> label text.
    """

    def __init__(self,
                 name: str,
                 data_type: str,
                 period_type: str,
                 balance: Optional[str] = None,
                 abstract: bool = False,
                 labels: Optional[Dict[str, str]] = None
                 ):
        self.name = name
        self.data_type = data_type
        self.period_type = period_type
        self.balance = balance
        self.abstract = abstract
        # Never share one dict across instances (mutable-default trap).
        self.labels = {} if labels is None else labels

    def __str__(self) -> str:
        """The element's name doubles as its string form."""
        return self.name
||||
|
||||
|
||||
class Context(BaseModel):
    """
    An XBRL context defining entity, period, and dimensional information.

    This corresponds to the Context Registry in the design document.
    """
    context_id: str
    # Entity information keyed by attribute name (contents not constrained here).
    entity: Dict[str, Any] = Field(default_factory=dict)
    # Period data; keys used below are 'type' plus either 'instant' or
    # 'startDate'/'endDate'.
    period: Dict[str, Any] = Field(default_factory=dict)
    # Dimension element id -> member element id for dimensional contexts.
    dimensions: Dict[str, str] = Field(default_factory=dict)

    @property
    def period_string(self) -> str:
        """Return a human-readable string representation of the period."""
        # 'instant' and 'duration' are the dated period types; anything else
        # is rendered as the open-ended "Forever" period.
        if self.period.get('type') == 'instant':
            return f"As of {self.period.get('instant')}"
        elif self.period.get('type') == 'duration':
            return f"From {self.period.get('startDate')} to {self.period.get('endDate')}"
        else:
            return "Forever"
||||
|
||||
|
||||
class Fact(BaseModel):
    """
    An XBRL fact with value and references to context, unit, and element.

    This corresponds to the Fact Database in the design document.

    The instance_id field is used to differentiate between duplicate facts
    that share the same element_id and context_ref. When a fact has no
    duplicates, instance_id will be None.

    The fact_id field preserves the original id attribute from the XML element,
    enabling linkage with footnotes.
    """
    element_id: str
    context_ref: str
    # Raw textual value as it appeared in the instance document.
    value: str
    unit_ref: Optional[str] = None
    decimals: Optional[Union[int, str]] = None  # int or "INF"
    # Numeric interpretation of `value` when applicable (None otherwise).
    numeric_value: Optional[float] = None
    # Identifiers of footnotes attached to this fact.
    footnotes: List[str] = Field(default_factory=list)
    instance_id: Optional[int] = None
    fact_id: Optional[str] = None  # Original id attribute from the XML
||||
|
||||
|
||||
class Footnote(BaseModel):
    """
    Represents an XBRL footnote with its text content and related facts.

    Footnotes are linked to facts via footnoteArc elements that connect
    fact IDs to footnote IDs using xlink:from and xlink:to attributes.
    """
    footnote_id: str
    text: str
    # Language tag of the footnote text.
    lang: Optional[str] = "en-US"
    role: Optional[str] = None
    # IDs of the facts this footnote annotates (see Fact.fact_id).
    related_fact_ids: List[str] = Field(default_factory=list)
||||
|
||||
|
||||
class PresentationNode(BaseModel):
    """
    A node in the presentation hierarchy.

    This corresponds to the Presentation Node in the design document.
    """
    element_id: str
    # Parent element_id within the same tree (None for the root).
    parent: Optional[str] = None
    # Child element_ids, ordered by presentation order.
    children: List[str] = Field(default_factory=list)
    order: float = 0.0
    # Preferred label role declared in the presentation linkbase, if any.
    preferred_label: Optional[str] = None
    depth: int = 0

    # Additional information linked from element catalog
    element_name: Optional[str] = None
    standard_label: Optional[str] = None
    is_abstract: bool = False
    labels: Dict[str, str] = Field(default_factory=dict)

    @property
    def display_label(self) -> str:
        """
        Return the appropriate label for display, prioritizing user-friendly options.

        Label selection priority:
        1. Preferred label (if specified in presentation linkbase)
        2. Terse label (for more concise display)
        3. Label (standard label)
        4. Element ID (fallback)
        """
        # Delegates to the module-level helper so selection (and any
        # standardization) logic lives in one place.
        return select_display_label(
            labels=self.labels,
            standard_label=self.standard_label,
            preferred_label=self.preferred_label,
            element_id=self.element_id
        )
||||
|
||||
|
||||
class PresentationTree(BaseModel):
    """
    A presentation tree for a specific role.

    This corresponds to the Presentation Hierarchy in the design document.
    """
    role_uri: str
    definition: str
    root_element_id: str
    # Flat map element_id -> node covering every node in this tree.
    all_nodes: Dict[str, PresentationNode] = Field(default_factory=dict)
    order: int = 0
||||
|
||||
|
||||
class CalculationNode(BaseModel):
    """
    A node in the calculation hierarchy.

    This corresponds to the Calculation Node in the design document.
    """
    element_id: str
    children: List[str] = Field(default_factory=list)
    parent: Optional[str] = None
    # Calculation weight applied to this node's contribution to its parent.
    weight: float = 1.0
    order: float = 0.0

    # Information linked from schema
    balance_type: Optional[str] = None  # "debit", "credit", or None
    period_type: Optional[str] = None  # "instant" or "duration"
||||
|
||||
|
||||
class CalculationTree(BaseModel):
    """
    A calculation tree for a specific role.

    This corresponds to the Calculation Network in the design document.
    """
    role_uri: str
    definition: str
    root_element_id: str
    # Flat map element_id -> node for every node in this tree.
    all_nodes: Dict[str, CalculationNode] = Field(default_factory=dict)
||||
|
||||
|
||||
class Axis(BaseModel):
    """
    A dimensional axis (dimension) in XBRL.

    This corresponds to the Axis (Dimension) in the design document.
    """
    element_id: str
    label: str
    domain_id: Optional[str] = None
    default_member_id: Optional[str] = None
    # Typed dimensions reference a typedDomainRef instead of an explicit domain.
    is_typed_dimension: bool = False
    typed_domain_ref: str = ""
||||
|
||||
|
||||
class Domain(BaseModel):
    """
    A domain in an XBRL dimensional structure.

    This corresponds to the Domain in the design document.
    """
    element_id: str
    label: str
    members: List[str] = Field(default_factory=list)  # List of domain member element IDs
    parent: Optional[str] = None  # Parent domain element ID
||||
|
||||
|
||||
class Table(BaseModel):
    """
    A dimensional table (hypercube) in XBRL.

    This corresponds to the Table (Hypercube) in the design document.
    """
    element_id: str
    label: str
    role_uri: str
    axes: List[str] = Field(default_factory=list)  # List of axis element IDs
    line_items: List[str] = Field(default_factory=list)  # List of line item element IDs
    # Whether the hypercube is declared closed.
    closed: bool = False
    # Context container the dimensions bind to; defaults to "segment".
    context_element: str = "segment"
||||
|
||||
|
||||
class XBRLProcessingError(Exception):
    """Exception raised for errors during XBRL processing.

    Used by parser components to wrap lower-level parsing failures with
    contextual information (the original exception is chained as the cause).
    """
    pass
||||
@@ -0,0 +1,27 @@
|
||||
"""
|
||||
XBRL Parser Components.
|
||||
|
||||
This package provides specialized parser components for different aspects
|
||||
of XBRL document processing. Each parser handles a specific responsibility
|
||||
in the XBRL parsing workflow.
|
||||
"""
|
||||
|
||||
from .base import BaseParser
|
||||
from .calculation import CalculationParser
|
||||
from .coordinator import XBRLParser
|
||||
from .definition import DefinitionParser
|
||||
from .instance import InstanceParser
|
||||
from .labels import LabelsParser
|
||||
from .presentation import PresentationParser
|
||||
from .schema import SchemaParser
|
||||
|
||||
__all__ = [
|
||||
'BaseParser',
|
||||
'XBRLParser',
|
||||
'SchemaParser',
|
||||
'LabelsParser',
|
||||
'PresentationParser',
|
||||
'CalculationParser',
|
||||
'DefinitionParser',
|
||||
'InstanceParser',
|
||||
]
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
148
venv/lib/python3.10/site-packages/edgar/xbrl/parsers/base.py
Normal file
148
venv/lib/python3.10/site-packages/edgar/xbrl/parsers/base.py
Normal file
@@ -0,0 +1,148 @@
|
||||
"""
|
||||
Base parser functionality for XBRL parsing components.
|
||||
|
||||
This module provides common utilities and base functionality shared across
|
||||
all XBRL parser components.
|
||||
"""
|
||||
|
||||
from typing import Any, Dict
|
||||
|
||||
from lxml import etree as ET
|
||||
|
||||
from edgar.core import log
|
||||
from edgar.xbrl.core import NAMESPACES
|
||||
|
||||
|
||||
class BaseParser:
    """Base class for XBRL parser components with common functionality.

    Provides shared XML parsing, attribute extraction, and logging helpers
    used by the specialized linkbase/schema/instance parsers.
    """

    def __init__(self):
        """Initialize base parser with common data structures."""
        # Shared XML namespace map available to all concrete parsers.
        self.namespaces = NAMESPACES

    def _safe_parse_xml(self, content: str) -> ET.Element:
        """
        Safely parse XML content with lxml, handling encoding declarations properly.

        Args:
            content: XML content as string or bytes

        Returns:
            parsed XML root element
        """
        xml_parser = ET.XMLParser(remove_blank_text=True, recover=True)
        # Always hand lxml bytes: str input with an encoding declaration is
        # rejected otherwise.
        raw = content.encode('utf-8') if isinstance(content, str) else content
        return ET.XML(raw, xml_parser)

    def _parse_order_attribute(self, arc) -> float:
        """Parse order attribute from arc, checking both order and xlink:order."""
        # Prefer the XLink-namespaced attribute (XBRL standard), then the
        # plain 'order' attribute.
        raw_order = arc.get('{http://www.w3.org/1999/xlink}order')
        if raw_order is None:
            raw_order = arc.get('order')

        if raw_order is None:
            # Debug aid: show what attributes the arc actually carries.
            all_attrs = dict(arc.attrib) if hasattr(arc, 'attrib') else {}
            log.debug(f"No order attribute found. Available attributes: {all_attrs}")
            return 0.0

        log.debug(f"Found order attribute: {raw_order}")
        try:
            return float(raw_order)
        except (ValueError, TypeError):
            # Unparseable order values fall back to 0.0.
            return 0.0

    def _extract_role_info(self, role_element) -> Dict[str, Any]:
        """
        Extract role information from a role element.

        Args:
            role_element: XML element containing role definition

        Returns:
            Dictionary with role information
        """
        role_uri = role_element.get('roleURI', '')

        definition_elem = role_element.find('.//{http://www.xbrl.org/2003/linkbase}definition')
        if definition_elem is not None:
            definition = definition_elem.text or ''
        else:
            # No human-readable definition: derive one from the URI tail.
            definition = role_uri.split('/')[-1].replace('_', ' ') if role_uri else ''

        return {'uri': role_uri, 'definition': definition}

    def _get_element_namespace_and_name(self, element_id: str) -> tuple[str, str]:
        """
        Extract namespace and local name from an element ID.

        Args:
            element_id: Element identifier (may include namespace prefix)

        Returns:
            Tuple of (namespace, local_name)
        """
        if ':' not in element_id:
            return '', element_id

        prefix, local_name = element_id.split(':', 1)
        # Well-known SEC/FASB taxonomy prefixes mapped to their namespace URIs.
        known_namespaces = {
            'us-gaap': 'http://fasb.org/us-gaap/2024',
            'dei': 'http://xbrl.sec.gov/dei/2024',
            'invest': 'http://xbrl.sec.gov/invest/2013-01-31',
            'country': 'http://xbrl.sec.gov/country/2023',
            'currency': 'http://xbrl.sec.gov/currency/2023',
            'exch': 'http://xbrl.sec.gov/exch/2023',
            'naics': 'http://xbrl.sec.gov/naics/2023',
            'sic': 'http://xbrl.sec.gov/sic/2023',
            'stpr': 'http://xbrl.sec.gov/stpr/2023',
        }
        fallback = f'http://unknown.namespace/{prefix}'
        return known_namespaces.get(prefix, fallback), local_name

    def _normalize_element_id(self, element_id: str) -> str:
        """
        Normalize element ID to a consistent format.

        Args:
            element_id: Original element identifier

        Returns:
            Normalized element identifier
        """
        # "prefix:Name" becomes "prefix_Name"; anything else passes through.
        return element_id.replace(':', '_', 1) if ':' in element_id else element_id

    def _log_parsing_progress(self, component: str, count: int, total: int = None):
        """
        Log parsing progress for debugging.

        Args:
            component: Name of component being parsed
            count: Number of items processed
            total: Total number of items (optional)
        """
        # NOTE: a total of 0 is treated like "no total" (truthiness test),
        # matching the historical behavior.
        message = f"Parsed {count}/{total} {component}" if total else f"Parsed {count} {component}"
        log.debug(message)
||||
@@ -0,0 +1,223 @@
|
||||
"""
|
||||
Calculation parser for XBRL documents.
|
||||
|
||||
This module handles parsing of XBRL calculation linkbases and building
|
||||
calculation trees with weights for validation.
|
||||
"""
|
||||
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional, Union
|
||||
|
||||
from edgar.xbrl.core import NAMESPACES, extract_element_id
|
||||
from edgar.xbrl.models import CalculationNode, CalculationTree, ElementCatalog, Fact, XBRLProcessingError
|
||||
|
||||
from .base import BaseParser
|
||||
|
||||
|
||||
class CalculationParser(BaseParser):
|
||||
"""Parser for XBRL calculation linkbases."""
|
||||
|
||||
    def __init__(self, calculation_roles: Dict[str, Dict[str, Any]],
                 calculation_trees: Dict[str, CalculationTree],
                 element_catalog: Dict[str, ElementCatalog],
                 facts: Dict[str, Fact]):
        """
        Initialize calculation parser with data structure references.

        The dictionaries are shared with the coordinator; this parser fills
        calculation_roles and calculation_trees in place while parsing.

        Args:
            calculation_roles: Reference to calculation roles dictionary
            calculation_trees: Reference to calculation trees dictionary
            element_catalog: Reference to element catalog dictionary
            facts: Reference to facts dictionary
        """
        super().__init__()

        # Store references to data structures
        self.calculation_roles = calculation_roles
        self.calculation_trees = calculation_trees
        self.element_catalog = element_catalog
        self.facts = facts
||||
|
||||
    def parse_calculation(self, file_path: Union[str, Path]) -> None:
        """Parse calculation linkbase file and build calculation trees.

        Args:
            file_path: Path to the calculation linkbase (XML) file.

        Raises:
            XBRLProcessingError: If the file cannot be read or parsed; the
                original exception is chained as the cause.
        """
        try:
            content = Path(file_path).read_text()
            self.parse_calculation_content(content)
        except Exception as e:
            raise XBRLProcessingError(f"Error parsing calculation file {file_path}: {str(e)}") from e
||||
|
||||
    def parse_calculation_content(self, content: str) -> None:
        """Parse calculation linkbase content and build calculation trees.

        For every calculationLink, records its role in self.calculation_roles,
        resolves each calculationArc's from/to locators to element IDs, and
        hands the resulting relationships to _build_calculation_tree.

        Args:
            content: XML text of a calculation linkbase.

        Raises:
            XBRLProcessingError: If the XML cannot be parsed or processed.
        """
        try:
            # Use safe XML parsing method
            root = self._safe_parse_xml(content)

            # Extract calculation links
            calculation_links = root.findall('.//{http://www.xbrl.org/2003/linkbase}calculationLink')

            for link in calculation_links:
                role = link.get('{http://www.w3.org/1999/xlink}role')
                if not role:
                    # An extended link without a role cannot be indexed.
                    continue

                # Store role information (definition derived from the URI tail)
                role_id = role.split('/')[-1] if '/' in role else role
                role_def = role_id.replace('_', ' ')

                self.calculation_roles[role] = {
                    'roleUri': role,
                    'definition': role_def,
                    'roleId': role_id
                }

                # Extract arcs
                arcs = link.findall('.//{http://www.xbrl.org/2003/linkbase}calculationArc')

                # Create relationships list
                relationships = []

                for arc in arcs:
                    from_ref = arc.get('{http://www.w3.org/1999/xlink}from')
                    to_ref = arc.get('{http://www.w3.org/1999/xlink}to')
                    order = self._parse_order_attribute(arc)
                    # Missing weight attribute defaults to 1.0.
                    weight = float(arc.get('weight', '1.0'))

                    if not from_ref or not to_ref:
                        continue

                    # Find locators for from/to references (matched by xlink:label)
                    from_loc = link.find(f'.//*[@{{{NAMESPACES["xlink"]}}}label="{from_ref}"]')
                    to_loc = link.find(f'.//*[@{{{NAMESPACES["xlink"]}}}label="{to_ref}"]')

                    if from_loc is None or to_loc is None:
                        # Dangling arc: skip silently.
                        continue

                    from_href = from_loc.get('{http://www.w3.org/1999/xlink}href')
                    to_href = to_loc.get('{http://www.w3.org/1999/xlink}href')

                    if not from_href or not to_href:
                        continue

                    # Extract element IDs from the locator hrefs
                    from_element = extract_element_id(from_href)
                    to_element = extract_element_id(to_href)

                    # Add relationship
                    relationships.append({
                        'from_element': from_element,
                        'to_element': to_element,
                        'order': order,
                        'weight': weight
                    })

                # Build calculation tree for this role
                if relationships:
                    self._build_calculation_tree(role, relationships)

        except Exception as e:
            raise XBRLProcessingError(f"Error parsing calculation content: {str(e)}") from e
||||
|
||||
def _build_calculation_tree(self, role: str, relationships: List[Dict[str, Any]]) -> None:
    """
    Build a calculation tree from relationships.

    Groups relationships by source element, identifies root elements
    (elements that appear as a calculation source but never as a target),
    builds one node subtree per root into a shared node dictionary, and
    registers the resulting CalculationTree under the role URI.

    Args:
        role: Extended link role URI
        relationships: List of relationships (from_element, to_element, order, weight)
    """
    # Group relationships by source element. Targets are tracked only for
    # membership: the original per-target relationship lists were never
    # consumed, so a set is sufficient (and cheaper).
    from_map: Dict[str, List[Dict[str, Any]]] = {}
    to_elements = set()

    for rel in relationships:
        from_map.setdefault(rel['from_element'], []).append(rel)
        to_elements.add(rel['to_element'])

    # Root elements appear as 'from' but never as 'to'.
    root_elements = set(from_map.keys()) - to_elements

    if not root_elements:
        return  # Cyclic or empty graph: no root elements found

    # NOTE(review): next(iter(...)) over a set records an arbitrary root
    # when several exist; all roots are still built into all_nodes below.
    tree = CalculationTree(
        role_uri=role,
        definition=self.calculation_roles[role]['definition'],
        root_element_id=next(iter(root_elements)),
        all_nodes={}
    )

    # Build each root's subtree into the shared node dictionary.
    for root_id in root_elements:
        self._build_calculation_subtree(root_id, None, from_map, tree.all_nodes)

    # Add tree to collection
    self.calculation_trees[role] = tree
|
||||
|
||||
def _build_calculation_subtree(self, element_id: str, parent_id: Optional[str],
                               from_map: Dict[str, List[Dict[str, Any]]],
                               all_nodes: Dict[str, CalculationNode]) -> None:
    """
    Recursively build a calculation subtree.

    Creates a CalculationNode for element_id, attaches catalog metadata
    when available, registers the node in all_nodes, then recurses into
    the element's calculation children in ascending arc order.

    Args:
        element_id: Current element ID
        parent_id: Parent element ID
        from_map: Map of relationships by source element
        all_nodes: Dictionary to store all nodes
    """
    current = CalculationNode(
        element_id=element_id,
        parent=parent_id,
        children=[]
    )

    # Attach balance/period metadata from the element catalog when present.
    catalog_entry = None
    if element_id in self.element_catalog:
        catalog_entry = self.element_catalog[element_id]
    else:
        # The catalog may key elements with the other separator style
        # (colon vs underscore), so try the swapped form as well.
        swapped_id = element_id.replace(':', '_') if ':' in element_id else element_id.replace('_', ':')
        if swapped_id in self.element_catalog:
            catalog_entry = self.element_catalog[swapped_id]

    if catalog_entry:
        current.balance_type = catalog_entry.balance
        current.period_type = catalog_entry.period_type

    all_nodes[element_id] = current

    # Recurse into calculation children, ordered by their arc order.
    if element_id in from_map:
        for rel in sorted(from_map[element_id], key=lambda r: r['order']):
            child_id = rel['to_element']

            # Record the child on this node before descending into it.
            current.children.append(child_id)

            self._build_calculation_subtree(
                child_id, element_id, from_map, all_nodes
            )

            # Stamp the arc's weight and order onto the freshly built child.
            if child_id in all_nodes:
                all_nodes[child_id].weight = rel['weight']
                all_nodes[child_id].order = rel['order']
|
||||
382
venv/lib/python3.10/site-packages/edgar/xbrl/parsers/concepts.py
Normal file
382
venv/lib/python3.10/site-packages/edgar/xbrl/parsers/concepts.py
Normal file
@@ -0,0 +1,382 @@
|
||||
"""
|
||||
Shared XBRL concept definitions for balance types and deprecated normalization lists.
|
||||
|
||||
This module contains balance type mappings for common US-GAAP concepts to support
|
||||
the balance column in DataFrame exports without parsing full taxonomy schemas.
|
||||
|
||||
DEPRECATED: Static normalization concept lists (CONSISTENT_POSITIVE_CONCEPTS,
|
||||
LEGITIMATE_NEGATIVE_CONCEPTS) are kept for historical reference but no longer used.
|
||||
Testing confirmed that SEC XBRL instance data is already consistent across companies.
|
||||
See Issue #463 analysis for details.
|
||||
"""
|
||||
|
||||
# =============================================================================
|
||||
# DEPRECATED CONCEPT LISTS (No longer used as of Issue #463)
|
||||
# =============================================================================
|
||||
# These lists were created to work around perceived inconsistencies in XBRL data.
|
||||
# Testing revealed that raw SEC instance data is ALREADY consistent across companies.
|
||||
#
|
||||
# Historical context:
|
||||
# - Issues #290, #334, #451 reported negative values for expenses
|
||||
# - Root cause: EdgarTools was misusing calculation weights for display logic
|
||||
# - These lists fixed symptoms but not the actual problem
|
||||
# - Issue #463 removed calculation weight application during parsing
|
||||
# - Result: Raw values preserved as-is (matching SEC CompanyFacts API)
|
||||
#
|
||||
# Kept for historical reference and potential future use cases.
|
||||
# =============================================================================
|
||||
|
||||
# DEPRECATED (Issue #463): retained for historical reference only.
# Every concept below is recorded in three naming variants:
# 'us-gaap_<Name>', 'us_gaap_<Name>', and the bare local name '<Name>'.
_CONSISTENT_POSITIVE_LOCAL_NAMES = (
    # Research and Development Expenses
    'ResearchAndDevelopmentExpense',

    # Selling, General & Administrative Expenses
    'SellingGeneralAndAdministrativeExpense',

    # General and Administrative Expenses (separate from SG&A)
    'GeneralAndAdministrativeExpense',

    # Selling Expenses
    'SellingExpense',

    # Marketing and Advertising Expenses
    'SellingAndMarketingExpense',
    'MarketingExpense',
    'AdvertisingExpense',

    # Share-based Compensation Expenses
    'AllocatedShareBasedCompensationExpense',
    'ShareBasedCompensationArrangementByShareBasedPaymentAwardExpenseRecognized',

    # Operating Expenses (general)
    'OperatingExpenses',

    # Professional Services Expenses
    'ProfessionalServiceFees',

    # Compensation and Benefits
    'LaborAndRelatedExpense',
    'EmployeeBenefitsExpense',

    # Cost of Revenue and Cost of Goods/Services Sold (Issue #290, #451)
    'CostOfRevenue',
    'CostOfGoodsAndServicesSold',
    'CostOfGoodsSold',
    'CostOfServices',

    # Income Tax Expense (Issue #451)
    'IncomeTaxExpenseBenefit',
    'IncomeTaxRecoveryExpense',

    # Cash Flow Statement - Financing Activities (cash outflows)
    # These represent uses of cash that should always be positive
    'PaymentsForRepurchaseOfCommonStock',
    'PaymentsOfDividends',
    'PaymentsOfDividendsCommonStock',
    'PaymentsOfDividendsPreferredStockAndPreferenceStock',
)

# Expand each local name into its three accepted spellings.
CONSISTENT_POSITIVE_CONCEPTS = {
    variant
    for local_name in _CONSISTENT_POSITIVE_LOCAL_NAMES
    for variant in (f'us-gaap_{local_name}', f'us_gaap_{local_name}', local_name)
}
|
||||
|
||||
# DEPRECATED: Concepts that can legitimately be negative
|
||||
# This list is no longer used but kept for historical reference.
|
||||
# DEPRECATED: Concepts that can legitimately be negative.
# No longer used but kept for historical reference. Each concept is
# expanded into its three accepted spellings, mirroring the positive list.
_LEGITIMATE_NEGATIVE_LOCAL_NAMES = (
    # Interest expense/income that can be net negative
    'InterestIncomeExpenseNet',

    # Foreign exchange gains/losses
    'ForeignCurrencyTransactionGainLossBeforeTax',

    # Restructuring reversals/credits
    'RestructuringChargesAndReversals',
)

LEGITIMATE_NEGATIVE_CONCEPTS = {
    variant
    for local_name in _LEGITIMATE_NEGATIVE_LOCAL_NAMES
    for variant in (f'us-gaap_{local_name}', f'us_gaap_{local_name}', local_name)
}
|
||||
|
||||
# US-GAAP Balance Type Mappings (Issue #463)
|
||||
#
|
||||
# This mapping provides balance types for common US-GAAP concepts to support
|
||||
# the balance column in DataFrame exports without requiring full taxonomy parsing.
|
||||
#
|
||||
# Balance types:
|
||||
# - "debit": Assets, Expenses (increase with debits, decrease with credits)
|
||||
# - "credit": Liabilities, Equity, Revenue (increase with credits, decrease with debits)
|
||||
#
|
||||
# TODO: Eventually replace with full US-GAAP taxonomy parser that follows schema imports
|
||||
#
|
||||
US_GAAP_BALANCE_TYPES = {
    # ============================================================================
    # ASSETS (Balance: debit)
    # ============================================================================

    # Current Assets
    'us-gaap:Cash': 'debit',
    'Cash': 'debit',  # Short form
    'us-gaap:CashAndCashEquivalentsAtCarryingValue': 'debit',
    'CashAndCashEquivalentsAtCarryingValue': 'debit',  # Short form
    'us-gaap:CashEquivalentsAtCarryingValue': 'debit',
    'us-gaap:RestrictedCashAndCashEquivalents': 'debit',
    'us-gaap:MarketableSecurities': 'debit',
    'us-gaap:AvailableForSaleSecuritiesDebtSecurities': 'debit',
    'us-gaap:ShortTermInvestments': 'debit',
    'us-gaap:AccountsReceivableNetCurrent': 'debit',
    'us-gaap:AccountsReceivableGrossCurrent': 'debit',
    'us-gaap:Inventory': 'debit',
    'us-gaap:InventoryNet': 'debit',
    'us-gaap:PrepaidExpenseAndOtherAssetsCurrent': 'debit',
    'us-gaap:DeferredTaxAssetsNetCurrent': 'debit',
    'us-gaap:OtherAssetsCurrent': 'debit',
    'us-gaap:AssetsCurrent': 'debit',

    # Non-Current Assets
    'us-gaap:PropertyPlantAndEquipmentNet': 'debit',
    'us-gaap:PropertyPlantAndEquipmentGross': 'debit',
    'us-gaap:Land': 'debit',
    'us-gaap:BuildingsAndImprovementsGross': 'debit',
    'us-gaap:MachineryAndEquipmentGross': 'debit',
    'us-gaap:Goodwill': 'debit',
    'us-gaap:IntangibleAssetsNetExcludingGoodwill': 'debit',
    'us-gaap:IntangibleAssetsGrossExcludingGoodwill': 'debit',
    'us-gaap:LongTermInvestments': 'debit',
    'us-gaap:DeferredTaxAssetsNetNoncurrent': 'debit',
    'us-gaap:OtherAssetsNoncurrent': 'debit',
    'us-gaap:AssetsNoncurrent': 'debit',
    'us-gaap:Assets': 'debit',
    'Assets': 'debit',  # Short form

    # ============================================================================
    # LIABILITIES (Balance: credit)
    # ============================================================================

    # Current Liabilities
    'us-gaap:AccountsPayableCurrent': 'credit',
    'us-gaap:AccruedLiabilitiesCurrent': 'credit',
    'us-gaap:DeferredRevenueCurrent': 'credit',
    'us-gaap:ContractWithCustomerLiabilityCurrent': 'credit',
    'us-gaap:ShortTermBorrowings': 'credit',
    'us-gaap:LongTermDebtCurrent': 'credit',
    'us-gaap:CommercialPaper': 'credit',
    'us-gaap:AccruedIncomeTaxesCurrent': 'credit',
    'us-gaap:DividendsPayableCurrent': 'credit',
    'us-gaap:OtherLiabilitiesCurrent': 'credit',
    'us-gaap:LiabilitiesCurrent': 'credit',

    # Non-Current Liabilities
    'us-gaap:LongTermDebtNoncurrent': 'credit',
    'us-gaap:LongTermDebtAndCapitalLeaseObligations': 'credit',
    'us-gaap:DeferredRevenueNoncurrent': 'credit',
    'us-gaap:DeferredTaxLiabilitiesNoncurrent': 'credit',
    'us-gaap:PensionAndOtherPostretirementDefinedBenefitPlansLiabilitiesNoncurrent': 'credit',
    'us-gaap:OtherLiabilitiesNoncurrent': 'credit',
    'us-gaap:LiabilitiesNoncurrent': 'credit',
    'us-gaap:Liabilities': 'credit',

    # ============================================================================
    # EQUITY (Balance: credit)
    # ============================================================================

    'us-gaap:CommonStockValue': 'credit',
    'us-gaap:CommonStockSharesIssued': 'credit',
    'us-gaap:CommonStockSharesOutstanding': 'credit',
    'us-gaap:PreferredStockValue': 'credit',
    'us-gaap:AdditionalPaidInCapital': 'credit',
    'us-gaap:AdditionalPaidInCapitalCommonStock': 'credit',
    'us-gaap:RetainedEarningsAccumulatedDeficit': 'credit',
    'us-gaap:TreasuryStockValue': 'debit',  # Contra-equity (debit balance)
    'us-gaap:AccumulatedOtherComprehensiveIncomeLossNetOfTax': 'credit',
    'us-gaap:StockholdersEquity': 'credit',
    'us-gaap:StockholdersEquityIncludingPortionAttributableToNoncontrollingInterest': 'credit',
    'us-gaap:LiabilitiesAndStockholdersEquity': 'credit',

    # ============================================================================
    # REVENUE (Balance: credit)
    # ============================================================================

    'us-gaap:Revenues': 'credit',
    'Revenues': 'credit',  # Short form
    'Revenue': 'credit',  # Short form (singular)
    'us-gaap:RevenueFromContractWithCustomerExcludingAssessedTax': 'credit',
    'RevenueFromContractWithCustomerExcludingAssessedTax': 'credit',  # Short form
    'us-gaap:RevenueFromContractWithCustomerIncludingAssessedTax': 'credit',
    'RevenueFromContractWithCustomerIncludingAssessedTax': 'credit',  # Short form
    'us-gaap:SalesRevenueNet': 'credit',
    'us-gaap:SalesRevenueGoodsNet': 'credit',
    'us-gaap:SalesRevenueServicesNet': 'credit',
    'us-gaap:InterestAndDividendIncomeOperating': 'credit',
    'us-gaap:InterestIncomeOther': 'credit',
    'us-gaap:InvestmentIncomeInterest': 'credit',
    'us-gaap:GainLossOnSaleOfPropertyPlantEquipment': 'credit',
    'us-gaap:GainLossOnInvestments': 'credit',
    'us-gaap:OtherNonoperatingIncomeExpense': 'credit',

    # ============================================================================
    # EXPENSES & COSTS (Balance: debit)
    # ============================================================================

    # Cost of Revenue
    'us-gaap:CostOfRevenue': 'debit',
    'us-gaap:CostOfGoodsAndServicesSold': 'debit',
    'us-gaap:CostOfGoodsSold': 'debit',
    'us-gaap:CostOfServices': 'debit',

    # Operating Expenses
    'us-gaap:ResearchAndDevelopmentExpense': 'debit',
    'us-gaap:SellingGeneralAndAdministrativeExpense': 'debit',
    'us-gaap:GeneralAndAdministrativeExpense': 'debit',
    'us-gaap:SellingExpense': 'debit',
    'us-gaap:SellingAndMarketingExpense': 'debit',
    'us-gaap:MarketingExpense': 'debit',
    'us-gaap:AdvertisingExpense': 'debit',
    'us-gaap:DepreciationDepletionAndAmortization': 'debit',
    'us-gaap:Depreciation': 'debit',
    'us-gaap:AmortizationOfIntangibleAssets': 'debit',
    'us-gaap:RestructuringCharges': 'debit',
    'us-gaap:AssetImpairmentCharges': 'debit',
    'us-gaap:ShareBasedCompensation': 'debit',

    # Other Expenses
    'us-gaap:InterestExpense': 'debit',
    'us-gaap:InterestExpenseDebt': 'debit',
    'us-gaap:IncomeTaxExpenseBenefit': 'debit',
    'us-gaap:ProvisionForDoubtfulAccounts': 'debit',

    # ============================================================================
    # INCOME & TOTALS (Balance: credit)
    # ============================================================================

    'us-gaap:GrossProfit': 'credit',
    'us-gaap:OperatingIncomeLoss': 'credit',
    'us-gaap:IncomeLossFromContinuingOperationsBeforeIncomeTaxesExtraordinaryItemsNoncontrollingInterest': 'credit',
    'us-gaap:IncomeLossFromContinuingOperations': 'credit',
    'us-gaap:NetIncomeLoss': 'credit',
    'us-gaap:NetIncomeLossAvailableToCommonStockholdersBasic': 'credit',
    'us-gaap:NetIncomeLossAvailableToCommonStockholdersDiluted': 'credit',
    'us-gaap:ComprehensiveIncomeNetOfTax': 'credit',

    # ============================================================================
    # CASH FLOW STATEMENT
    # ============================================================================

    # Operating Activities
    'us-gaap:NetCashProvidedByUsedInOperatingActivities': 'debit',
    'us-gaap:DepreciationAndAmortization': 'debit',
    'us-gaap:ShareBasedCompensationArrangementByShareBasedPaymentAwardExpenseRecognized': 'debit',
    'us-gaap:DeferredIncomeTaxExpenseBenefit': 'debit',

    # Investing Activities
    'us-gaap:NetCashProvidedByUsedInInvestingActivities': 'debit',
    'us-gaap:PaymentsToAcquirePropertyPlantAndEquipment': 'credit',  # Cash outflow
    'us-gaap:PaymentsToAcquireBusinessesNetOfCashAcquired': 'credit',  # Cash outflow
    'us-gaap:PaymentsToAcquireMarketableSecurities': 'credit',  # Cash outflow
    'us-gaap:ProceedsFromSaleOfPropertyPlantAndEquipment': 'debit',  # Cash inflow
    'us-gaap:ProceedsFromSaleOfAvailableForSaleSecuritiesDebt': 'debit',  # Cash inflow

    # Financing Activities
    'us-gaap:NetCashProvidedByUsedInFinancingActivities': 'debit',
    'us-gaap:ProceedsFromIssuanceOfCommonStock': 'debit',  # Cash inflow
    'us-gaap:ProceedsFromIssuanceOfLongTermDebt': 'debit',  # Cash inflow
    'us-gaap:RepaymentsOfLongTermDebt': 'credit',  # Cash outflow
    'us-gaap:PaymentsOfDividends': 'credit',  # Cash outflow
    'us-gaap:PaymentsOfDividendsCommonStock': 'credit',  # Cash outflow
    'us-gaap:PaymentsForRepurchaseOfCommonStock': 'credit',  # Cash outflow
}


def get_balance_type(concept: str) -> 'str | None':
    """
    Get the balance type for a concept.

    Looks up the balance type from the static US-GAAP mapping, handling
    both colon and underscore namespace separators. The return annotation
    is ``str | None`` because unknown concepts yield ``None`` (the original
    ``-> str`` annotation was incorrect on the miss path).

    Args:
        concept: The concept name (e.g., 'us-gaap:Revenue' or 'us-gaap_Revenue' or 'us_gaap_Revenue')

    Returns:
        Balance type ('debit', 'credit', or None if not found)

    Example:
        >>> get_balance_type('us-gaap:Cash')
        'debit'
        >>> get_balance_type('us-gaap_Revenues')
        'credit'
        >>> get_balance_type('us_gaap_Revenues')
        'credit'
        >>> get_balance_type('UnknownConcept') is None
        True
    """
    # Candidate keys are tried in order: as given, prefix-normalized,
    # then a crude all-underscores-to-colons fallback.
    candidates = [concept]

    # Normalize known namespace prefix variations to the canonical
    # 'us-gaap:' form:  us_gaap_Cash -> us-gaap:Cash,  us-gaap_Cash -> us-gaap:Cash
    if 'us_gaap' in concept:
        candidates.append(
            concept.replace('us_gaap_', 'us-gaap:').replace('us_gaap:', 'us-gaap:')
        )
    elif 'us-gaap' in concept:
        candidates.append(concept.replace('us-gaap_', 'us-gaap:'))

    # Simple fallback: treat every underscore as a namespace separator.
    candidates.append(concept.replace('_', ':'))

    for candidate in candidates:
        balance = US_GAAP_BALANCE_TYPES.get(candidate)
        if balance is not None:
            return balance

    return None
|
||||
@@ -0,0 +1,291 @@
|
||||
"""
|
||||
XBRL Parser Coordinator.
|
||||
|
||||
This module provides the main XBRLParser class that coordinates parsing
|
||||
workflow across all specialized parser components while maintaining
|
||||
API compatibility with the original monolithic parser.
|
||||
"""
|
||||
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional, Union
|
||||
|
||||
from edgar.core import log
|
||||
from edgar.xbrl.models import (
|
||||
Axis,
|
||||
CalculationTree,
|
||||
Context,
|
||||
Domain,
|
||||
ElementCatalog,
|
||||
Fact,
|
||||
PresentationTree,
|
||||
Table,
|
||||
XBRLProcessingError,
|
||||
)
|
||||
|
||||
from .calculation import CalculationParser
|
||||
from .definition import DefinitionParser
|
||||
from .instance import InstanceParser
|
||||
from .labels import LabelsParser
|
||||
from .presentation import PresentationParser
|
||||
from .schema import SchemaParser
|
||||
|
||||
|
||||
class XBRLParser:
|
||||
"""
|
||||
Coordinated XBRL parser that delegates to specialized component parsers.
|
||||
|
||||
This class maintains full API compatibility with the original monolithic
|
||||
XBRLParser while providing improved maintainability through component separation.
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
"""Initialize the coordinated XBRL parser with all data structures."""
|
||||
# Core data structures
|
||||
self.element_catalog: Dict[str, ElementCatalog] = {}
|
||||
self.contexts: Dict[str, Context] = {}
|
||||
self.facts: Dict[str, Fact] = {}
|
||||
self.units: Dict[str, Any] = {}
|
||||
self.footnotes: Dict[str, Any] = {}
|
||||
|
||||
# Presentation structures
|
||||
self.presentation_roles: Dict[str, Dict[str, Any]] = {}
|
||||
self.presentation_trees: Dict[str, PresentationTree] = {}
|
||||
|
||||
# Calculation structures
|
||||
self.calculation_roles: Dict[str, Dict[str, Any]] = {}
|
||||
self.calculation_trees: Dict[str, CalculationTree] = {}
|
||||
|
||||
# Definition (dimensional) structures
|
||||
self.definition_roles: Dict[str, Dict[str, Any]] = {}
|
||||
self.tables: Dict[str, List[Table]] = {}
|
||||
self.axes: Dict[str, Axis] = {}
|
||||
self.domains: Dict[str, Domain] = {}
|
||||
|
||||
# Entity information
|
||||
self.entity_info: Dict[str, Any] = {}
|
||||
self.dei_facts: Dict[str, Fact] = {}
|
||||
|
||||
# Reporting periods
|
||||
self.reporting_periods: List[Dict[str, Any]] = []
|
||||
|
||||
# Mapping of context IDs to period identifiers for easy lookup
|
||||
self.context_period_map: Dict[str, str] = {}
|
||||
|
||||
# Initialize component parsers
|
||||
self._init_parsers()
|
||||
|
||||
def _init_parsers(self):
|
||||
"""Initialize all component parsers with shared data structures."""
|
||||
# Create component parsers with references to shared data structures
|
||||
self.schema_parser = SchemaParser(
|
||||
element_catalog=self.element_catalog
|
||||
)
|
||||
|
||||
self.labels_parser = LabelsParser(
|
||||
element_catalog=self.element_catalog
|
||||
)
|
||||
|
||||
self.presentation_parser = PresentationParser(
|
||||
presentation_roles=self.presentation_roles,
|
||||
presentation_trees=self.presentation_trees,
|
||||
element_catalog=self.element_catalog
|
||||
)
|
||||
|
||||
self.calculation_parser = CalculationParser(
|
||||
calculation_roles=self.calculation_roles,
|
||||
calculation_trees=self.calculation_trees,
|
||||
element_catalog=self.element_catalog,
|
||||
facts=self.facts
|
||||
)
|
||||
|
||||
self.definition_parser = DefinitionParser(
|
||||
definition_roles=self.definition_roles,
|
||||
tables=self.tables,
|
||||
axes=self.axes,
|
||||
domains=self.domains,
|
||||
element_catalog=self.element_catalog
|
||||
)
|
||||
|
||||
self.instance_parser = InstanceParser(
|
||||
contexts=self.contexts,
|
||||
facts=self.facts,
|
||||
units=self.units,
|
||||
footnotes=self.footnotes,
|
||||
calculation_trees=self.calculation_trees,
|
||||
entity_info=self.entity_info,
|
||||
reporting_periods=self.reporting_periods,
|
||||
context_period_map=self.context_period_map
|
||||
)
|
||||
|
||||
# Set up cross-references for embedded linkbase processing
|
||||
self.schema_parser.set_linkbase_parsers(
|
||||
labels_parser=self.labels_parser,
|
||||
presentation_parser=self.presentation_parser,
|
||||
calculation_parser=self.calculation_parser,
|
||||
definition_parser=self.definition_parser
|
||||
)
|
||||
|
||||
def _create_normalized_fact_key(self, element_id: str, context_ref: str, instance_id: Optional[int] = None) -> str:
|
||||
"""
|
||||
Create a normalized fact key using underscore format.
|
||||
|
||||
Args:
|
||||
element_id: The element ID
|
||||
context_ref: The context reference
|
||||
instance_id: Optional instance ID for duplicate facts
|
||||
|
||||
Returns:
|
||||
Normalized key in format: element_id_context_ref[_instance_id]
|
||||
"""
|
||||
return self.instance_parser._create_normalized_fact_key(element_id, context_ref, instance_id)
|
||||
|
||||
def get_facts_by_key(self, element_id: str, context_ref: str) -> List[Fact]:
|
||||
"""Get all facts matching the given element ID and context reference.
|
||||
|
||||
This method handles both single facts and duplicate facts using the hybrid storage approach.
|
||||
For single facts, it returns a list with one fact. For duplicates, it returns all instances.
|
||||
|
||||
Args:
|
||||
element_id: The element ID to look up
|
||||
context_ref: The context reference
|
||||
|
||||
Returns:
|
||||
List of matching facts
|
||||
"""
|
||||
# Create base key for lookup
|
||||
base_key = self._create_normalized_fact_key(element_id, context_ref)
|
||||
|
||||
# Check if single fact exists
|
||||
if base_key in self.facts:
|
||||
return [self.facts[base_key]]
|
||||
|
||||
# Check for duplicate facts (with instance IDs)
|
||||
matching_facts = []
|
||||
instance_id = 0
|
||||
while True:
|
||||
instance_key = self._create_normalized_fact_key(element_id, context_ref, instance_id)
|
||||
if instance_key in self.facts:
|
||||
matching_facts.append(self.facts[instance_key])
|
||||
instance_id += 1
|
||||
else:
|
||||
break
|
||||
|
||||
return matching_facts
|
||||
|
||||
def get_fact(self, element_id: str, context_ref: str) -> Optional[Fact]:
|
||||
"""Get a single fact by element ID and context reference.
|
||||
|
||||
Returns the first fact if multiple instances exist.
|
||||
|
||||
Args:
|
||||
element_id: The element ID to look up
|
||||
context_ref: The context reference
|
||||
|
||||
Returns:
|
||||
The fact if found, None otherwise
|
||||
"""
|
||||
facts = self.get_facts_by_key(element_id, context_ref)
|
||||
return facts[0] if facts else None
|
||||
|
||||
def parse_directory(self, directory_path: Union[str, Path]) -> None:
|
||||
"""
|
||||
Parse all XBRL files in a directory.
|
||||
|
||||
Args:
|
||||
directory_path: Path to directory containing XBRL files
|
||||
"""
|
||||
try:
|
||||
directory = Path(directory_path)
|
||||
if not directory.is_dir():
|
||||
raise XBRLProcessingError(f"Directory not found: {directory_path}")
|
||||
|
||||
log.debug(f"Parsing XBRL directory: {directory}")
|
||||
|
||||
# Parse schema files first to build element catalog
|
||||
schema_files = list(directory.glob('*.xsd'))
|
||||
for schema_file in schema_files:
|
||||
log.debug(f"Parsing schema: {schema_file}")
|
||||
self.schema_parser.parse_schema(schema_file)
|
||||
|
||||
# Parse linkbase files
|
||||
linkbase_patterns = [
|
||||
('*_lab.xml', self.labels_parser.parse_labels),
|
||||
('*_pre.xml', self.presentation_parser.parse_presentation),
|
||||
('*_cal.xml', self.calculation_parser.parse_calculation),
|
||||
('*_def.xml', self.definition_parser.parse_definition),
|
||||
]
|
||||
|
||||
for pattern, parser_method in linkbase_patterns:
|
||||
linkbase_files = list(directory.glob(pattern))
|
||||
for linkbase_file in linkbase_files:
|
||||
log.debug(f"Parsing linkbase: {linkbase_file}")
|
||||
parser_method(linkbase_file)
|
||||
|
||||
# Parse instance files last (they depend on schemas and linkbases)
|
||||
instance_files = list(directory.glob('*.xml'))
|
||||
# Filter out linkbase files
|
||||
instance_files = [f for f in instance_files if not any(
|
||||
f.name.endswith(suffix) for suffix in ['_lab.xml', '_pre.xml', '_cal.xml', '_def.xml']
|
||||
)]
|
||||
|
||||
for instance_file in instance_files:
|
||||
log.debug(f"Parsing instance: {instance_file}")
|
||||
self.instance_parser.parse_instance(instance_file)
|
||||
|
||||
log.info(f"Successfully parsed XBRL directory with {len(self.facts)} facts")
|
||||
|
||||
except Exception as e:
|
||||
raise XBRLProcessingError(f"Error parsing directory {directory_path}: {str(e)}") from e
|
||||
|
||||
# Delegate methods to component parsers for API compatibility
|
||||
def parse_schema(self, file_path: Union[str, Path]) -> None:
|
||||
"""Parse schema file and extract element information."""
|
||||
return self.schema_parser.parse_schema(file_path)
|
||||
|
||||
def parse_schema_content(self, content: str) -> None:
|
||||
"""Parse schema content and extract element information."""
|
||||
return self.schema_parser.parse_schema_content(content)
|
||||
|
||||
def parse_labels(self, file_path: Union[str, Path]) -> None:
|
||||
"""Parse label linkbase file and extract label information."""
|
||||
return self.labels_parser.parse_labels(file_path)
|
||||
|
||||
def parse_labels_content(self, content: str) -> None:
|
||||
"""Parse label linkbase content and extract label information."""
|
||||
return self.labels_parser.parse_labels_content(content)
|
||||
|
||||
def parse_presentation(self, file_path: Union[str, Path]) -> None:
|
||||
"""Parse presentation linkbase file and build presentation trees."""
|
||||
return self.presentation_parser.parse_presentation(file_path)
|
||||
|
||||
def parse_presentation_content(self, content: str) -> None:
|
||||
"""Parse presentation linkbase content and build presentation trees."""
|
||||
return self.presentation_parser.parse_presentation_content(content)
|
||||
|
||||
def parse_calculation(self, file_path: Union[str, Path]) -> None:
|
||||
"""Parse calculation linkbase file and build calculation trees."""
|
||||
return self.calculation_parser.parse_calculation(file_path)
|
||||
|
||||
def parse_calculation_content(self, content: str) -> None:
|
||||
"""Parse calculation linkbase content and build calculation trees."""
|
||||
return self.calculation_parser.parse_calculation_content(content)
|
||||
|
||||
def parse_definition(self, file_path: Union[str, Path]) -> None:
|
||||
"""Parse definition linkbase file and build dimensional structures."""
|
||||
return self.definition_parser.parse_definition(file_path)
|
||||
|
||||
def parse_definition_content(self, content: str) -> None:
    """Parse definition linkbase content and build dimensional structures.

    Thin delegation to the component definition parser.
    """
    return self.definition_parser.parse_definition_content(content)
|
||||
|
||||
def parse_instance(self, file_path: Union[str, Path]) -> None:
    """Parse instance document file and extract contexts, facts, and units.

    Thin delegation to the component instance parser.
    """
    return self.instance_parser.parse_instance(file_path)
|
||||
|
||||
def parse_instance_content(self, content: str) -> None:
    """Parse instance document content and extract contexts, facts, and units.

    Thin delegation to the component instance parser.
    """
    return self.instance_parser.parse_instance_content(content)
|
||||
|
||||
def count_facts(self, content: str) -> tuple:
    """Count the number of facts in the instance document.

    Thin delegation to the component instance parser; returns its
    (unique_facts_count, total_fact_instances) tuple.
    """
    return self.instance_parser.count_facts(content)
|
||||
@@ -0,0 +1,235 @@
|
||||
"""
|
||||
Definition parser for XBRL documents.
|
||||
|
||||
This module handles parsing of XBRL definition linkbases and building
|
||||
dimensional structures like tables, axes, and domains.
|
||||
"""
|
||||
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Union
|
||||
|
||||
from edgar.xbrl.core import NAMESPACES, STANDARD_LABEL, extract_element_id
|
||||
from edgar.xbrl.models import Axis, Domain, ElementCatalog, Table, XBRLProcessingError
|
||||
|
||||
from .base import BaseParser
|
||||
|
||||
|
||||
class DefinitionParser(BaseParser):
    """Parser for XBRL definition linkbases.

    Processes definition arcs (XBRL Dimensions arcroles) to build the
    dimensional structures of a taxonomy: tables (hypercubes), axes
    (dimensions) and domains with their members.
    """

    def __init__(self, definition_roles: Dict[str, Dict[str, Any]],
                 tables: Dict[str, List[Table]],
                 axes: Dict[str, Axis],
                 domains: Dict[str, Domain],
                 element_catalog: Dict[str, ElementCatalog]):
        """
        Initialize definition parser with data structure references.

        The dictionaries are shared with the owning XBRL parser and are
        mutated in place as linkbases are parsed.

        Args:
            definition_roles: Reference to definition roles dictionary
            tables: Reference to tables dictionary (role URI -> tables)
            axes: Reference to axes dictionary (element id -> Axis)
            domains: Reference to domains dictionary (element id -> Domain)
            element_catalog: Reference to element catalog dictionary
        """
        super().__init__()

        # Store references (not copies) to the shared data structures
        self.definition_roles = definition_roles
        self.tables = tables
        self.axes = axes
        self.domains = domains
        self.element_catalog = element_catalog

    def parse_definition(self, file_path: Union[str, Path]) -> None:
        """Parse definition linkbase file and build dimensional structures.

        Raises:
            XBRLProcessingError: If the file cannot be read or parsed.
        """
        try:
            # XBRL linkbases are XML (UTF-8 by default); read with an explicit
            # encoding so behavior does not depend on the platform locale.
            content = Path(file_path).read_text(encoding="utf-8")
            self.parse_definition_content(content)
        except Exception as e:
            raise XBRLProcessingError(f"Error parsing definition file {file_path}: {str(e)}") from e

    def parse_definition_content(self, content: str) -> None:
        """Parse definition linkbase content and build dimensional structures.

        Extracts every definitionLink, records its role, collects the
        definition arcs (with their arcroles) and hands the resulting
        relationships to the dimensional-structure builder.

        Raises:
            XBRLProcessingError: If the content cannot be parsed.
        """
        try:
            root = self._safe_parse_xml(content)

            # Extract definition links
            definition_links = root.findall('.//{http://www.xbrl.org/2003/linkbase}definitionLink')

            for link in definition_links:
                role = link.get('{http://www.w3.org/1999/xlink}role')
                if not role:
                    continue

                # Store role information; a human-readable definition is
                # derived from the last path segment of the role URI.
                role_id = role.split('/')[-1] if '/' in role else role
                role_def = role_id.replace('_', ' ')

                self.definition_roles[role] = {
                    'roleUri': role,
                    'definition': role_def,
                    'roleId': role_id
                }

                # Extract arcs
                arcs = link.findall('.//{http://www.xbrl.org/2003/linkbase}definitionArc')

                relationships = []

                for arc in arcs:
                    from_ref = arc.get('{http://www.w3.org/1999/xlink}from')
                    to_ref = arc.get('{http://www.w3.org/1999/xlink}to')
                    order = self._parse_order_attribute(arc)

                    # The arcrole identifies the kind of dimensional relationship
                    arcrole = arc.get('{http://www.w3.org/1999/xlink}arcrole')
                    if not from_ref or not to_ref or not arcrole:
                        continue

                    # Resolve the from/to locator labels to element hrefs
                    from_loc = link.find(f'.//*[@{{{NAMESPACES["xlink"]}}}label="{from_ref}"]')
                    to_loc = link.find(f'.//*[@{{{NAMESPACES["xlink"]}}}label="{to_ref}"]')

                    if from_loc is None or to_loc is None:
                        continue

                    from_href = from_loc.get('{http://www.w3.org/1999/xlink}href')
                    to_href = to_loc.get('{http://www.w3.org/1999/xlink}href')

                    if not from_href or not to_href:
                        continue

                    # Add relationship with arcrole
                    relationships.append({
                        'from_element': extract_element_id(from_href),
                        'to_element': extract_element_id(to_href),
                        'order': order,
                        'arcrole': arcrole
                    })

                # Build tables/axes/domains from this link's relationships
                self._process_dimensional_relationships(role, relationships)

        except Exception as e:
            raise XBRLProcessingError(f"Error parsing definition content: {str(e)}") from e

    def _process_dimensional_relationships(self, role: str, relationships: List[Dict[str, Any]]) -> None:
        """
        Process dimensional relationships to build tables, axes, and domains.

        Args:
            role: Extended link role URI
            relationships: List of dimensional relationships
        """
        # XBRL Dimensions arcrole URIs
        HYPERCUBE_DIMENSION = "http://xbrl.org/int/dim/arcrole/hypercube-dimension"
        DIMENSION_DOMAIN = "http://xbrl.org/int/dim/arcrole/dimension-domain"
        DOMAIN_MEMBER = "http://xbrl.org/int/dim/arcrole/domain-member"
        ALL = "http://xbrl.org/int/dim/arcrole/all"

        # Group relationships by arcrole
        grouped_rels = {}
        for rel in relationships:
            grouped_rels.setdefault(rel['arcrole'], []).append(rel)

        # hypercube-dimension arcs: table -> axis
        hypercube_axes = {}  # Map of hypercubes to their axes
        for rel in grouped_rels.get(HYPERCUBE_DIMENSION, []):
            table_id = rel['from_element']
            axis_id = rel['to_element']

            hypercube_axes.setdefault(table_id, []).append(axis_id)

            # Create the axis on first sight
            if axis_id not in self.axes:
                self.axes[axis_id] = Axis(
                    element_id=axis_id,
                    label=self._get_element_label(axis_id)
                )

        # dimension-domain arcs: axis -> default domain
        for rel in grouped_rels.get(DIMENSION_DOMAIN, []):
            axis_id = rel['from_element']
            domain_id = rel['to_element']

            # Link domain to axis
            if axis_id in self.axes:
                self.axes[axis_id].domain_id = domain_id

            # Create the domain on first sight
            if domain_id not in self.domains:
                self.domains[domain_id] = Domain(
                    element_id=domain_id,
                    label=self._get_element_label(domain_id)
                )

        # domain-member arcs: domain -> members (builds the hierarchy)
        domain_members = {}
        for rel in grouped_rels.get(DOMAIN_MEMBER, []):
            domain_id = rel['from_element']
            member_id = rel['to_element']

            domain_members.setdefault(domain_id, []).append(member_id)

            # Also create the domain if it wasn't seen via dimension-domain
            if domain_id not in self.domains:
                self.domains[domain_id] = Domain(
                    element_id=domain_id,
                    label=self._get_element_label(domain_id)
                )

        # Update domains with their members
        for domain_id, members in domain_members.items():
            if domain_id in self.domains:
                self.domains[domain_id].members = members

        # 'all' arcs: line items -> hypercube; materialize the tables
        tables_by_role = []
        for rel in grouped_rels.get(ALL, []):
            line_items_id = rel['to_element']
            table_id = rel['from_element']

            # Only materialize tables that have axes defined
            if table_id in hypercube_axes:
                tables_by_role.append(Table(
                    element_id=table_id,
                    label=self._get_element_label(table_id),
                    role_uri=role,
                    axes=hypercube_axes[table_id],
                    line_items=[line_items_id],
                    closed=False  # Default
                ))

        if tables_by_role:
            self.tables[role] = tables_by_role

    def _get_element_label(self, element_id: str) -> str:
        """Get the label for an element, falling back to the element ID if not found."""
        if element_id in self.element_catalog and self.element_catalog[element_id].labels:
            # Use standard label if available
            standard_label = self.element_catalog[element_id].labels.get(STANDARD_LABEL)
            if standard_label:
                return standard_label
        return element_id  # Fallback to element ID
|
||||
768
venv/lib/python3.10/site-packages/edgar/xbrl/parsers/instance.py
Normal file
768
venv/lib/python3.10/site-packages/edgar/xbrl/parsers/instance.py
Normal file
@@ -0,0 +1,768 @@
|
||||
"""
|
||||
Instance parser for XBRL documents.
|
||||
|
||||
This module handles parsing of XBRL instance documents including facts, contexts,
|
||||
units, footnotes, and entity information extraction.
|
||||
"""
|
||||
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Union
|
||||
|
||||
from lxml import etree as ET
|
||||
|
||||
from edgar.core import log
|
||||
from edgar.xbrl.core import NAMESPACES, classify_duration
|
||||
from edgar.xbrl.models import Context, Fact, XBRLProcessingError
|
||||
|
||||
from .base import BaseParser
|
||||
|
||||
|
||||
class InstanceParser(BaseParser):
|
||||
"""Parser for XBRL instance documents."""
|
||||
|
||||
def __init__(self, contexts: Dict[str, Context], facts: Dict[str, Fact],
             units: Dict[str, Any], footnotes: Dict[str, Any],
             calculation_trees: Dict[str, Any], entity_info: Dict[str, Any],
             reporting_periods: List[Dict[str, Any]], context_period_map: Dict[str, str]):
    """
    Initialize instance parser with data structure references.

    All arguments are shared, mutable references owned by the coordinating
    parser; this parser fills them in place rather than returning results.

    Args:
        contexts: Reference to contexts dictionary
        facts: Reference to facts dictionary
        units: Reference to units dictionary
        footnotes: Reference to footnotes dictionary
        calculation_trees: Reference to calculation trees dictionary
        entity_info: Reference to entity info dictionary
        reporting_periods: Reference to reporting periods list
        context_period_map: Reference to context period map
    """
    super().__init__()

    # Store references to data structures (shared, mutated in place)
    self.contexts = contexts
    self.facts = facts
    self.units = units
    self.footnotes = footnotes
    self.calculation_trees = calculation_trees
    self.entity_info = entity_info
    self.reporting_periods = reporting_periods
    self.context_period_map = context_period_map

    # DEI facts extracted during entity info processing
    # (concept name -> Fact; populated by _extract_entity_info)
    self.dei_facts: Dict[str, Fact] = {}
|
||||
|
||||
def _create_normalized_fact_key(self, element_id: str, context_ref: str, instance_id: Union[int, None] = None) -> str:
    """
    Create a normalized fact key using underscore format.

    Only the first ':' (the namespace separator) is replaced, so an
    element ID like 'us-gaap:Revenue' normalizes to 'us-gaap_Revenue';
    already-normalized IDs pass through unchanged.

    Args:
        element_id: The element ID ('prefix:name' or already normalized)
        context_ref: The context reference
        instance_id: Optional instance ID for duplicate facts
            (annotation fixed: default is None, so the type is Optional)

    Returns:
        Normalized key in format: element_id_context_ref[_instance_id]
    """
    # str.replace with count=1 is equivalent to splitting on the first
    # ':' and rejoining with '_', and is a no-op when there is no colon.
    key = f"{element_id.replace(':', '_', 1)}_{context_ref}"
    if instance_id is not None:
        key = f"{key}_{instance_id}"
    return key
|
||||
|
||||
def parse_instance(self, file_path: Union[str, Path]) -> None:
    """Parse instance document file and extract contexts, facts, and units.

    Args:
        file_path: Path to the XBRL instance document.

    Raises:
        XBRLProcessingError: If the file cannot be read or parsed.
    """
    try:
        # XBRL instance documents are XML (UTF-8 by default); read with an
        # explicit encoding so results don't depend on the platform locale.
        content = Path(file_path).read_text(encoding="utf-8")
        self.parse_instance_content(content)
    except Exception as e:
        raise XBRLProcessingError(f"Error parsing instance file {file_path}: {str(e)}") from e
|
||||
|
||||
def parse_instance_content(self, content: str) -> None:
    """Parse instance document content and extract contexts, facts, and units.

    Orchestrates the extraction passes over a single parsed XML tree.

    Raises:
        XBRLProcessingError: If the content cannot be parsed.
    """
    try:
        # lxml parser tuned for large filings: recover from minor errors,
        # drop ignorable whitespace, and allow very large/deep trees.
        xml_parser = ET.XMLParser(remove_blank_text=True, recover=True, huge_tree=True)

        # lxml parses bytes fastest; encode only when we were given text.
        raw = content.encode('utf-8') if isinstance(content, str) else content
        root = ET.XML(raw, xml_parser)

        # Extraction order matters: facts reference contexts and units,
        # so those dependencies are resolved first.
        self._extract_contexts(root)
        self._extract_units(root)
        self._extract_facts(root)
        self._extract_footnotes(root)

        # Post-processing once all raw data has been extracted.
        self._extract_entity_info()
        self._build_reporting_periods()

    except Exception as e:
        raise XBRLProcessingError(f"Error parsing instance content: {str(e)}") from e
|
||||
|
||||
def count_facts(self, content: str) -> tuple:
    """Count the number of facts in the instance document.

    Counts both unique facts (distinct element/context combinations) and
    total fact instances (every occurrence, including duplicates).

    Args:
        content: Raw XML of the instance document (str or bytes).

    Returns:
        tuple: (unique_facts_count, total_fact_instances)
    """

    # Use lxml's optimized parser with smart string handling and recovery mode
    parser = ET.XMLParser(remove_blank_text=True, recover=True, huge_tree=True)

    # Convert to bytes for faster parsing if not already
    if isinstance(content, str):
        content_bytes = content.encode('utf-8')
    else:
        content_bytes = content

    # Parse content with optimized settings
    root = ET.XML(content_bytes, parser)

    # Fast path to identify non-fact elements to skip
    skip_tag_endings = {'}context', '}unit', '}schemaRef'}

    # Track both total instances and unique facts
    total_fact_instances = 0  # Total number of fact references in the document
    unique_facts = set()      # Set of unique element_id + context_ref combinations
    create_key = self._create_normalized_fact_key  # hoist attribute lookup

    def count_element(element):
        """Process a single element as a potential fact."""
        nonlocal total_fact_instances

        tag = element.tag
        # Comments and processing instructions have non-string tags and are
        # never facts.  (Fixes an AttributeError on tag.endswith below.)
        if not isinstance(tag, str):
            return

        # Skip known non-fact elements
        for ending in skip_tag_endings:
            if tag.endswith(ending):
                return

        # Get context reference - key check to identify facts
        context_ref = element.get('contextRef')
        if context_ref is None:
            return

        # Extract element namespace and name from '{namespace}local-name'
        if '}' in tag:
            namespace, element_name = tag.split('}', 1)
            namespace = namespace[1:]  # Faster than strip('{')
        else:
            element_name = tag
            namespace = None

        # Map the namespace URI to a prefix.  Guard against namespace being
        # None for unqualified tags -- previously this raised AttributeError.
        prefix = None
        if namespace:
            for std_prefix, std_uri_base in NAMESPACES.items():
                if namespace.startswith(std_uri_base):
                    prefix = std_prefix
                    break
            if not prefix:
                # Fall back to the last path segment of the namespace URI
                parts = namespace.split('/')
                prefix = parts[-1] if parts else ''

        element_id = f"{prefix}:{element_name}" if prefix else element_name

        # Normalized key (underscore format) identifies the unique fact
        unique_facts.add(create_key(element_id, context_ref))

        # Increment total instances count
        total_fact_instances += 1

    # Optimize traversal using lxml's iterchildren/iterdescendants if available
    if hasattr(root, 'iterchildren'):
        for child in root.iterchildren():
            count_element(child)
            for descendant in child.iterdescendants():
                count_element(descendant)
    else:
        # Fallback for ElementTree
        for child in root:
            count_element(child)
            for descendant in child.findall('.//*'):
                count_element(descendant)

    # Return tuple of counts (unique_facts_count, total_fact_instances)
    return len(unique_facts), total_fact_instances
|
||||
|
||||
def _extract_contexts(self, root: ET.Element) -> None:
    """Extract contexts from instance document.

    Builds a Context object per <xbrli:context>, capturing the entity
    identifier, segment dimensions (explicit and typed) and the period
    (instant, duration or forever), and stores it in ``self.contexts``.

    Raises:
        XBRLProcessingError: If context extraction fails.
    """
    try:
        # Find all context elements
        for context_elem in root.findall('.//{http://www.xbrl.org/2003/instance}context'):
            context_id = context_elem.get('id')
            if not context_id:
                continue

            # Create context object
            context = Context(context_id=context_id)

            # Extract entity information
            entity_elem = context_elem.find('.//{http://www.xbrl.org/2003/instance}entity')
            if entity_elem is not None:
                # Get identifier (e.g. the filer's CIK, qualified by its scheme)
                identifier_elem = entity_elem.find('.//{http://www.xbrl.org/2003/instance}identifier')
                if identifier_elem is not None:
                    scheme = identifier_elem.get('scheme', '')
                    identifier = identifier_elem.text
                    context.entity = {
                        'scheme': scheme,
                        'identifier': identifier
                    }

                # Get segment dimensions if present
                segment_elem = entity_elem.find('.//{http://www.xbrl.org/2003/instance}segment')
                if segment_elem is not None:
                    # Extract explicit dimensions (dimension QName -> member QName)
                    for dim_elem in segment_elem.findall('.//{http://xbrl.org/2006/xbrldi}explicitMember'):
                        dimension = dim_elem.get('dimension')
                        value = dim_elem.text
                        if dimension and value:
                            context.dimensions[dimension] = value

                    # Extract typed dimensions
                    for dim_elem in segment_elem.findall('.//{http://xbrl.org/2006/xbrldi}typedMember'):
                        dimension = dim_elem.get('dimension')
                        if dimension:
                            # The typed dimension value is the text content of the
                            # first child element; only that first child is used
                            # (the break below stops after one iteration).
                            for child in dim_elem:
                                # Extract the text content, which contains the actual typed member value
                                if child.text and child.text.strip():
                                    context.dimensions[dimension] = child.text.strip()
                                else:
                                    # Fallback to tag if no text content
                                    context.dimensions[dimension] = child.tag
                                break

            # Extract period information.  The three checks run in sequence,
            # so if a document (invalidly) carried several period forms the
            # last assignment wins: instant < duration < forever.
            period_elem = context_elem.find('.//{http://www.xbrl.org/2003/instance}period')
            if period_elem is not None:
                # Check for instant period
                instant_elem = period_elem.find('.//{http://www.xbrl.org/2003/instance}instant')
                if instant_elem is not None and instant_elem.text:
                    context.period = {
                        'type': 'instant',
                        'instant': instant_elem.text
                    }

                # Check for duration period
                start_elem = period_elem.find('.//{http://www.xbrl.org/2003/instance}startDate')
                end_elem = period_elem.find('.//{http://www.xbrl.org/2003/instance}endDate')
                if start_elem is not None and end_elem is not None and start_elem.text and end_elem.text:
                    context.period = {
                        'type': 'duration',
                        'startDate': start_elem.text,
                        'endDate': end_elem.text
                    }

                # Check for forever period
                forever_elem = period_elem.find('.//{http://www.xbrl.org/2003/instance}forever')
                if forever_elem is not None:
                    context.period = {
                        'type': 'forever'
                    }

            # Add context to registry
            self.contexts[context_id] = context

    except Exception as e:
        raise XBRLProcessingError(f"Error extracting contexts: {str(e)}") from e
|
||||
|
||||
def _extract_units(self, root: ET.Element) -> None:
    """Extract units from instance document.

    Handles both simple units (a single measure, e.g. iso4217:USD) and
    divide units (numerator/denominator measures, e.g. USD per share).

    Raises:
        XBRLProcessingError: If unit extraction fails.
    """
    try:
        # Find all unit elements
        for unit_elem in root.findall('.//{http://www.xbrl.org/2003/instance}unit'):
            unit_id = unit_elem.get('id')
            if not unit_id:
                continue

            # Simple unit: look only at DIRECT children.  A descendant
            # search ('.//measure') would also match the measures nested
            # inside a <divide> element, misclassifying ratio units as
            # simple and making the divide branch below unreachable.
            measure_elem = unit_elem.find('{http://www.xbrl.org/2003/instance}measure')
            if measure_elem is not None and measure_elem.text:
                self.units[unit_id] = {
                    'type': 'simple',
                    'measure': measure_elem.text
                }
                continue

            # Divide unit: numerator and denominator measure lists
            divide_elem = unit_elem.find('.//{http://www.xbrl.org/2003/instance}divide')
            if divide_elem is not None:
                numerator_elem = divide_elem.find('.//{http://www.xbrl.org/2003/instance}unitNumerator')
                denominator_elem = divide_elem.find('.//{http://www.xbrl.org/2003/instance}unitDenominator')

                if numerator_elem is not None and denominator_elem is not None:
                    # Get measures on each side of the division
                    numerator_measures = [elem.text for elem in numerator_elem.findall('.//{http://www.xbrl.org/2003/instance}measure') if elem.text]
                    denominator_measures = [elem.text for elem in denominator_elem.findall('.//{http://www.xbrl.org/2003/instance}measure') if elem.text]

                    self.units[unit_id] = {
                        'type': 'divide',
                        'numerator': numerator_measures,
                        'denominator': denominator_measures
                    }

    except Exception as e:
        raise XBRLProcessingError(f"Error extracting units: {str(e)}") from e
|
||||
|
||||
def _extract_facts(self, root: ET.Element) -> None:
    """Extract facts from instance document.

    Populates ``self.facts`` keyed by normalized element/context keys.
    When the same element/context pair occurs more than once, the facts
    are disambiguated with a numeric ``instance_id`` suffix (the first
    occurrence is retroactively re-keyed with instance_id=0).

    Raises:
        XBRLProcessingError: If fact extraction fails.
    """
    try:
        # Get direct access to nsmap if using lxml (much faster than regex extraction)
        if hasattr(root, 'nsmap'):
            # Leverage lxml's native nsmap functionality
            prefix_map = {uri: prefix for prefix, uri in root.nsmap.items() if prefix is not None}
        else:
            # Fallback for ElementTree - precompile regex patterns for namespace extraction
            xmlns_pattern = '{http://www.w3.org/2000/xmlns/}'
            prefix_map = {}

            # Extract namespace declarations from root
            for attr_name, attr_value in root.attrib.items():
                if attr_name.startswith(xmlns_pattern) or attr_name.startswith('xmlns:'):
                    # Extract the prefix more efficiently
                    if attr_name.startswith(xmlns_pattern):
                        prefix = attr_name[len(xmlns_pattern):]
                    else:
                        prefix = attr_name.split(':', 1)[1]
                    prefix_map[attr_value] = prefix

        # Initialize counters and tracking
        fact_count = 0
        facts_dict = {}
        base_keys = {}  # base key -> list with one entry per occurrence seen

        # Fast path to identify non-fact elements to skip - compile as set for O(1) lookup
        skip_tag_endings = {
            'schemaRef',
            'roleRef',
            'arcroleRef',
            'linkbaseRef',
            'context',
            'unit'
        }

        def process_element(element):
            """Process a single element as a potential fact."""
            nonlocal fact_count

            # Skip annotation nodes and other non element nodes
            if not ET.iselement(element):
                return
            # Skip known non-fact elements - faster check with set membership
            # If the tag is not a string, try calling () to get the string value (in rare cases)
            # NOTE(review): a non-comment node with attributes would fall
            # through here with a callable tag and fail on tag.endswith
            # below -- presumed not to occur in practice; confirm.
            if callable(element.tag):
                if isinstance(element, ET._Comment):
                    return
                if not element.values():
                    return
            tag = element.tag
            for ending in skip_tag_endings:
                if tag.endswith(ending):
                    return

            # Get context reference - key check to identify facts
            context_ref = element.get('contextRef')
            if not context_ref:
                return

            # Get fact ID if present (for footnote linkage)
            fact_id = element.get('id')

            # Extract element namespace and name - optimized split
            if '}' in tag:
                namespace, element_name = tag.split('}', 1)
                namespace = namespace[1:]  # Faster than strip('{')

                # Try to extract prefix from the namespace
                prefix = prefix_map.get(namespace)
                if not prefix:
                    parts = namespace.split('/')
                    prefix = parts[-1] if parts else ''
            else:
                element_name = tag
                prefix = ''

            # Construct element ID with optimized string concatenation
            element_id = f"{prefix}:{element_name}" if prefix else element_name

            # Get unit reference
            unit_ref = element.get('unitRef')

            # Get value - optimize string handling
            value = element.text
            if not value or not value.strip():
                # Only check children if text is empty - use direct iteration for speed
                for sub_elem in element:
                    sub_text = sub_elem.text
                    if sub_text and sub_text.strip():
                        value = sub_text
                        break

            # Optimize string handling - inline conditional
            value = value.strip() if value else ""

            # Get decimals attribute - direct access
            decimals = element.get('decimals')

            # Optimize numeric conversion with faster try/except
            numeric_value = None
            if value:
                try:
                    numeric_value = float(value)
                except (ValueError, TypeError):
                    pass

            # Create base key for duplicate detection
            base_key = self._create_normalized_fact_key(element_id, context_ref)

            # Handle duplicates
            instance_id = None
            if base_key in base_keys:
                # This is a duplicate - convert existing fact to use instance_id if needed
                if base_key in facts_dict:
                    existing_fact = facts_dict[base_key]
                    # Move existing fact to new key with instance_id=0
                    del facts_dict[base_key]
                    existing_fact.instance_id = 0
                    facts_dict[self._create_normalized_fact_key(element_id, context_ref, 0)] = existing_fact
                # Add new fact with next instance_id
                instance_id = len(base_keys[base_key])
                base_keys[base_key].append(True)
            else:
                # First instance of this fact
                base_keys[base_key] = [True]

            # Create fact object
            fact = Fact(
                element_id=element_id,
                context_ref=context_ref,
                value=value,
                unit_ref=unit_ref,
                decimals=decimals,
                numeric_value=numeric_value,
                instance_id=instance_id,
                fact_id=fact_id
            )

            # Store fact with appropriate key
            key = self._create_normalized_fact_key(element_id, context_ref, instance_id)
            facts_dict[key] = fact
            fact_count += 1

        # Use lxml's optimized traversal methods
        if hasattr(root, 'iterchildren'):
            # Use lxml's optimized traversal methods
            for child in root.iterchildren():
                process_element(child)
                # Process nested elements with optimized iteration
                for descendant in child.iterdescendants():
                    process_element(descendant)
        else:
            # Fallback for ElementTree
            for child in root:
                process_element(child)
                for descendant in child.findall('.//*'):
                    process_element(descendant)

        # Update instance facts
        self.facts.update(facts_dict)

        log.debug(f"Extracted {fact_count} facts ({len(base_keys)} unique fact identifiers)")

    except Exception as e:
        raise XBRLProcessingError(f"Error extracting facts: {str(e)}") from e
|
||||
|
||||
def _extract_footnotes(self, root: ET.Element) -> None:
    """Extract footnotes from instance document.

    Footnotes in XBRL are linked to facts via footnoteLink elements that contain:
    1. footnote elements with the actual text content
    2. footnoteArc elements that connect fact IDs to footnote IDs

    Failures are logged, not raised: footnotes are optional.
    """
    try:
        from edgar.xbrl.models import Footnote

        # Find all footnoteLink elements
        for footnote_link in root.findall('.//{http://www.xbrl.org/2003/linkbase}footnoteLink'):
            # First, extract all footnote definitions
            for footnote_elem in footnote_link.findall('{http://www.xbrl.org/2003/linkbase}footnote'):
                # Try both 'id' and 'xlink:label' attributes
                footnote_id = footnote_elem.get('id') or footnote_elem.get('{http://www.w3.org/1999/xlink}label')
                if not footnote_id:
                    continue

                # Get footnote attributes
                lang = footnote_elem.get('{http://www.w3.org/XML/1998/namespace}lang', 'en-US')
                role = footnote_elem.get('{http://www.w3.org/1999/xlink}role')

                # Extract text content, handling XHTML formatting
                footnote_text = ""
                # Check for XHTML content
                xhtml_divs = footnote_elem.findall('.//{http://www.w3.org/1999/xhtml}div')
                if xhtml_divs:
                    # Concatenate all text within XHTML elements
                    for div in xhtml_divs:
                        footnote_text += "".join(div.itertext()).strip()
                else:
                    # Fall back to direct text content
                    footnote_text = "".join(footnote_elem.itertext()).strip()

                # Create Footnote object
                footnote = Footnote(
                    footnote_id=footnote_id,
                    text=footnote_text,
                    lang=lang,
                    role=role,
                    related_fact_ids=[]
                )
                self.footnotes[footnote_id] = footnote

            # Second, process footnoteArc elements to link facts to footnotes
            for arc_elem in footnote_link.findall('{http://www.xbrl.org/2003/linkbase}footnoteArc'):
                fact_id = arc_elem.get('{http://www.w3.org/1999/xlink}from')
                footnote_id = arc_elem.get('{http://www.w3.org/1999/xlink}to')

                if fact_id and footnote_id:
                    # Add fact ID to footnote's related facts
                    if footnote_id in self.footnotes:
                        self.footnotes[footnote_id].related_fact_ids.append(fact_id)
                    else:
                        log.warning(f"Footnote arc references undefined footnote: {footnote_id}")

                    # Also update the fact's footnotes list if we can find it
                    # This requires finding the fact by its fact_id
                    # (linear scan over all facts per arc; footnote arcs are
                    # assumed rare relative to facts)
                    for fact in self.facts.values():
                        if fact.fact_id == fact_id:
                            if footnote_id not in fact.footnotes:
                                fact.footnotes.append(footnote_id)
                            break

        log.debug(f"Extracted {len(self.footnotes)} footnotes")

    except Exception as e:
        # Log the error but don't fail - footnotes are optional
        log.warning(f"Error extracting footnotes: {str(e)}")
|
||||
|
||||
def _extract_entity_info(self) -> None:
    """Extract entity information from contexts and DEI facts.

    Populates ``self.entity_info`` in place (preserving unrelated existing
    keys) with registrant name, ticker, numeric identifier, document type,
    fiscal-period metadata and report-type flags, and caches all DEI facts
    in ``self.dei_facts`` keyed by concept name.  Never raises: any failure
    is logged as a warning and the method returns with whatever was gathered.
    """
    try:
        # Extract CIK/identifier from the first context.  Kept only when
        # purely numeric; leading zeros are stripped.
        identifier = None
        if self.contexts:
            first = next(iter(self.contexts.values()))
            ident = first.entity.get('identifier')
            if ident and ident.isdigit():
                identifier = ident.lstrip('0')

        # Collect all DEI facts into a dict: concept -> Fact.
        # Element IDs may appear in either 'dei:Concept' or 'dei_Concept' form.
        self.dei_facts: Dict[str, Fact] = {}
        for fact in self.facts.values():
            eid = fact.element_id
            if eid.startswith('dei:'):
                concept = eid.split(':', 1)[1]
            elif eid.startswith('dei_'):
                concept = eid.split('_', 1)[1]
            else:
                continue
            self.dei_facts[concept] = fact

        # Helper: return the value of the first DEI fact found among the
        # given alias concept names, or None if none are present.
        def get_dei(*names):
            for n in names:
                f = self.dei_facts.get(n)
                if f:
                    return f.value
            return None

        # Build entity_info preserving existing keys
        self.entity_info.update({
            'entity_name': get_dei('EntityRegistrantName'),
            'ticker': get_dei('TradingSymbol'),
            'identifier': identifier,
            'document_type': get_dei('DocumentType'),
            'reporting_end_date': None,
            'document_period_end_date':get_dei('DocumentPeriodEndDate'),
            'fiscal_year': get_dei('DocumentFiscalYearFocus','FiscalYearFocus','FiscalYear'),
            'fiscal_period': get_dei('DocumentFiscalPeriodFocus','FiscalPeriodFocus'),
            'fiscal_year_end_month': None,
            'fiscal_year_end_day': None,
            'annual_report': False,
            'quarterly_report': False,
            'amendment': False,
        })

        # Determine reporting_end_date as the latest instant date seen
        # across all contexts (malformed dates are skipped).
        for ctx in self.contexts.values():
            period = getattr(ctx, 'period', {})
            if period.get('type') == 'instant':
                ds = period.get('instant')
                if ds:
                    try:
                        dt_obj = datetime.strptime(ds, '%Y-%m-%d').date()
                        curr = self.entity_info['reporting_end_date']
                        if curr is None or dt_obj > curr:
                            self.entity_info['reporting_end_date'] = dt_obj
                    except Exception:
                        pass

        # Parse fiscal year end date into month/day.  Values commonly use
        # the XBRL gMonthDay form '--MM-DD'; the leading '--' is stripped
        # before splitting on the first hyphen.
        fye = get_dei('CurrentFiscalYearEndDate','FiscalYearEnd')
        if fye:
            try:
                s = fye
                if s.startswith('--'):
                    s = s[2:]
                if '-' in s:
                    m, d = s.split('-', 1)
                    if m.isdigit() and d.isdigit():
                        self.entity_info['fiscal_year_end_month'] = int(m)
                        self.entity_info['fiscal_year_end_day'] = int(d)
            except Exception:
                pass

        # Flags based on document_type.
        # NOTE(review): exact equality means amended filings such as
        # '10-K/A' set amendment=True but annual_report=False — confirm
        # this classification is intended.
        dt_val = self.entity_info['document_type'] or ''
        self.entity_info['annual_report'] = (dt_val == '10-K')
        self.entity_info['quarterly_report'] = (dt_val == '10-Q')
        self.entity_info['amendment'] = ('/A' in dt_val)

        log.debug(f"Entity info: {self.entity_info}")
    except Exception as e:
        log.warning(f"Warning: Error extracting entity info: {str(e)}")
||||
def _build_reporting_periods(self) -> None:
    """Build reporting periods from contexts.

    Rebuilds ``self.reporting_periods`` (a list of period dicts, newest
    first) and ``self.context_period_map`` (context ID -> period key) from
    ``self.contexts``.  Period keys take the form ``instant_<date>`` or
    ``duration_<start>_<end>`` with ISO dates.  Never raises: on any error
    the periods list is cleared and a debug message is logged.
    """
    try:
        # Clear existing periods so the method is safe to call repeatedly
        self.reporting_periods.clear()
        self.context_period_map.clear()

        # Collect unique periods from contexts, grouping context IDs that
        # share the same instant date or the same (start, end) range
        instant_periods = {}
        duration_periods = {}

        for context_id, context in self.contexts.items():
            if 'period' in context.model_dump() and 'type' in context.period:
                period_type = context.period.get('type')

                if period_type == 'instant':
                    date_str = context.period.get('instant')
                    if date_str:
                        if date_str not in instant_periods:
                            instant_periods[date_str] = []

                        # Add context ID to this period
                        instant_periods[date_str].append(context_id)

                        # Map context to period key
                        period_key = f"instant_{date_str}"
                        self.context_period_map[context_id] = period_key

                elif period_type == 'duration':
                    start_date = context.period.get('startDate')
                    end_date = context.period.get('endDate')
                    if start_date and end_date:
                        duration_key = f"{start_date}_{end_date}"
                        if duration_key not in duration_periods:
                            duration_periods[duration_key] = []

                        # Add context ID to this period
                        duration_periods[duration_key].append(context_id)

                        # Map context to period key
                        period_key = f"duration_{start_date}_{end_date}"
                        self.context_period_map[context_id] = period_key

        # Process instant periods into period dicts with parsed date objects
        for date_str, context_ids in instant_periods.items():
            try:
                date_obj = datetime.strptime(date_str, '%Y-%m-%d').date()
                formatted_date = date_obj.strftime('%B %d, %Y')

                period = {
                    'type': 'instant',
                    'date': date_str,
                    'date_obj': date_obj,
                    'label': formatted_date,
                    'context_ids': context_ids,
                    'key': f"instant_{date_str}"
                }
                self.reporting_periods.append(period)
            except (ValueError, TypeError):
                # Skip invalid dates
                continue

        # Process duration periods
        for period_key, context_ids in duration_periods.items():
            # Safe unpack: ISO dates contain no underscores, so the key
            # 'YYYY-MM-DD_YYYY-MM-DD' splits into exactly two parts
            start_date, end_date = period_key.split('_')
            try:
                start_obj = datetime.strptime(start_date, '%Y-%m-%d').date()
                end_obj = datetime.strptime(end_date, '%Y-%m-%d').date()
                formatted_start = start_obj.strftime('%B %d, %Y')
                formatted_end = end_obj.strftime('%B %d, %Y')

                # Calculate duration in days
                days = (end_obj - start_obj).days

                # Determine period type (e.g. quarterly/annual) based on duration
                period_description = classify_duration(days)

                period = {
                    'type': 'duration',
                    'start_date': start_date,
                    'end_date': end_date,
                    'start_obj': start_obj,
                    'end_obj': end_obj,
                    'days': days,
                    'period_type': period_description,
                    'label': f"{period_description}: {formatted_start} to {formatted_end}",
                    'context_ids': context_ids,
                    'key': f"duration_{start_date}_{end_date}"
                }
                self.reporting_periods.append(period)
            except (ValueError, TypeError):
                # Skip invalid dates
                continue

        # Sort periods by date (most recent first); instants sort by their
        # date, durations by their end date
        self.reporting_periods.sort(key=lambda p: p['date_obj'] if p['type'] == 'instant' else p['end_obj'], reverse=True)

        # Debug printout to verify periods are extracted
        if len(self.reporting_periods) > 0:
            log.debug(f"Found {len(self.reporting_periods)} reporting periods.")
            log.debug(f"First period: {self.reporting_periods[0]['label']}")
        else:
            log.debug("Warning: No reporting periods found!")

        # Debug context period map
        log.debug(f"Context period map has {len(self.context_period_map)} entries.")

    except Exception as e:
        # Log error but don't fail; leave no stale periods behind
        log.debug(f"Warning: Error building reporting periods: {str(e)}")
        self.reporting_periods.clear()
||||
149
venv/lib/python3.10/site-packages/edgar/xbrl/parsers/labels.py
Normal file
149
venv/lib/python3.10/site-packages/edgar/xbrl/parsers/labels.py
Normal file
@@ -0,0 +1,149 @@
|
||||
"""
|
||||
Labels parser for XBRL documents.
|
||||
|
||||
This module handles parsing of XBRL label linkbases and extracting
|
||||
element labels for display purposes.
|
||||
"""
|
||||
|
||||
from pathlib import Path
|
||||
from typing import Dict, Union
|
||||
|
||||
from lxml import etree as ET
|
||||
|
||||
from edgar.xbrl.core import STANDARD_LABEL, extract_element_id
|
||||
from edgar.xbrl.models import ElementCatalog, XBRLProcessingError
|
||||
|
||||
from .base import BaseParser
|
||||
|
||||
|
||||
class LabelsParser(BaseParser):
    """Parser for XBRL label linkbases.

    Reads ``label``/``loc``/``labelArc`` triples from a label linkbase and
    attaches the resulting labels (keyed by label role URI) to entries of
    the shared element catalog, creating placeholder entries for elements
    not yet cataloged.
    """

    def __init__(self, element_catalog: Dict[str, ElementCatalog]):
        """
        Initialize labels parser with data structure references.

        Args:
            element_catalog: Reference to element catalog dictionary; shared
                with the coordinating parser and updated in place.
        """
        super().__init__()

        # Store references to data structures
        self.element_catalog = element_catalog

    def parse_labels(self, file_path: Union[str, Path]) -> None:
        """Parse label linkbase file and extract label information.

        Args:
            file_path: Path to the label linkbase XML file.

        Raises:
            XBRLProcessingError: If the file cannot be read or parsed.
        """
        try:
            # XBRL linkbases are XML documents, effectively always UTF-8.
            # An explicit encoding makes the read deterministic across
            # platforms instead of depending on the locale default (which
            # can fail on e.g. Windows cp1252 for non-ASCII labels).
            content = Path(file_path).read_text(encoding='utf-8')
            self.parse_labels_content(content)
        except Exception as e:
            raise XBRLProcessingError(f"Error parsing label file {file_path}: {str(e)}") from e

    def parse_labels_content(self, content: str) -> None:
        """Parse label linkbase content and extract label information.

        Raises:
            XBRLProcessingError: If the content cannot be parsed.
        """
        try:
            # Register namespaces once for faster XPath lookups
            nsmap = {
                'link': 'http://www.xbrl.org/2003/linkbase',
                'xlink': 'http://www.w3.org/1999/xlink',
                'xml': 'http://www.w3.org/XML/1998/namespace'
            }

            # lxml parser with recover=True tolerates minor XML defects
            # found in real-world filings
            parser = ET.XMLParser(remove_blank_text=True, recover=True)
            root = ET.XML(content.encode('utf-8'), parser)

            # Namespaced XPath is much faster than findall with '//'
            label_arcs = root.xpath('//link:labelArc', namespaces=nsmap)
            labels = root.xpath('//link:label', namespaces=nsmap)

            # label_id -> lang -> role -> label text
            label_lookup = {}

            # Cache fully-qualified attribute names to avoid rebuilding
            # them for every node
            xlink_label = '{http://www.w3.org/1999/xlink}label'
            xlink_role = '{http://www.w3.org/1999/xlink}role'
            xml_lang = '{http://www.w3.org/XML/1998/namespace}lang'
            default_role = 'http://www.xbrl.org/2003/role/label'

            # Process labels in a single pass with direct attribute access
            for label in labels:
                label_id = label.get(xlink_label)
                if not label_id:
                    continue

                # Empty labels carry no information - skip early
                text = label.text
                if text is None:
                    continue

                role = label.get(xlink_role, default_role)
                lang = label.get(xml_lang, 'en-US')

                # Create nested dictionaries only when needed
                if label_id not in label_lookup:
                    label_lookup[label_id] = {}

                if lang not in label_lookup[label_id]:
                    label_lookup[label_id][lang] = {}

                label_lookup[label_id][lang][role] = text

            # Cache attribute lookups for arcs
            xlink_from = '{http://www.w3.org/1999/xlink}from'
            xlink_to = '{http://www.w3.org/1999/xlink}to'
            xlink_href = '{http://www.w3.org/1999/xlink}href'

            # Lookup table of locator hrefs by xlink:label for O(1) access
            # per arc (instead of an XPath query per arc)
            loc_by_label = {}
            for loc in root.xpath('//link:loc', namespaces=nsmap):
                loc_label = loc.get(xlink_label)
                if loc_label:
                    loc_by_label[loc_label] = loc.get(xlink_href)

            # Connect labels to elements using arcs
            for arc in label_arcs:
                from_ref = arc.get(xlink_from)
                to_ref = arc.get(xlink_to)

                if not from_ref or not to_ref or to_ref not in label_lookup:
                    continue

                href = loc_by_label.get(from_ref)
                if not href:
                    continue

                # Extract element ID from href
                element_id = extract_element_id(href)

                # Only US-English labels are attached to the catalog;
                # labels in other languages are intentionally ignored.
                if 'en-US' in label_lookup[to_ref]:
                    element_labels = label_lookup[to_ref]['en-US']

                    catalog_entry = self.element_catalog.get(element_id)
                    if catalog_entry:
                        catalog_entry.labels.update(element_labels)
                    else:
                        # Create placeholder in catalog; data type / period
                        # details are filled in by the schema parser
                        self.element_catalog[element_id] = ElementCatalog(
                            name=element_id,
                            data_type="",
                            period_type="duration",
                            labels=element_labels
                        )

        except Exception as e:
            raise XBRLProcessingError(f"Error parsing label content: {str(e)}") from e

    def get_element_label(self, element_id: str) -> str:
        """Get the label for an element, falling back to the element ID if not found."""
        if element_id in self.element_catalog and self.element_catalog[element_id].labels:
            # Use standard label if available
            standard_label = self.element_catalog[element_id].labels.get(STANDARD_LABEL)
            if standard_label:
                return standard_label
        return element_id  # Fallback to element ID
||||
@@ -0,0 +1,249 @@
|
||||
"""
|
||||
Presentation parser for XBRL documents.
|
||||
|
||||
This module handles parsing of XBRL presentation linkbases and building
|
||||
presentation trees for financial statement structure.
|
||||
"""
|
||||
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional, Union
|
||||
|
||||
from lxml import etree as ET
|
||||
|
||||
from edgar.xbrl.core import extract_element_id
|
||||
from edgar.xbrl.models import ElementCatalog, PresentationNode, PresentationTree, XBRLProcessingError
|
||||
|
||||
from .base import BaseParser
|
||||
|
||||
|
||||
class PresentationParser(BaseParser):
    """Parser for XBRL presentation linkbases.

    Builds one :class:`PresentationTree` per extended link role, describing
    the display hierarchy (nesting, ordering, preferred labels) of concepts
    in a financial statement.  Results are written into the shared
    ``presentation_roles`` and ``presentation_trees`` dictionaries.
    """

    def __init__(self, presentation_roles: Dict[str, Dict[str, Any]],
                 presentation_trees: Dict[str, PresentationTree],
                 element_catalog: Dict[str, ElementCatalog]):
        """
        Initialize presentation parser with data structure references.

        Args:
            presentation_roles: Reference to presentation roles dictionary
            presentation_trees: Reference to presentation trees dictionary
            element_catalog: Reference to element catalog dictionary
        """
        super().__init__()

        # Store references to data structures (shared with the coordinator;
        # mutated in place rather than returned)
        self.presentation_roles = presentation_roles
        self.presentation_trees = presentation_trees
        self.element_catalog = element_catalog

    def parse_presentation(self, file_path: Union[str, Path]) -> None:
        """Parse presentation linkbase file and build presentation trees.

        Raises:
            XBRLProcessingError: If the file cannot be read or parsed.
        """
        try:
            content = Path(file_path).read_text()
            self.parse_presentation_content(content)
        except Exception as e:
            raise XBRLProcessingError(f"Error parsing presentation file {file_path}: {str(e)}") from e

    def parse_presentation_content(self, content: str) -> None:
        """Parse presentation linkbase content and build presentation trees.

        Raises:
            XBRLProcessingError: If the content cannot be parsed.
        """
        try:
            # Optimize: Register namespaces for faster XPath lookups
            nsmap = {
                'link': 'http://www.xbrl.org/2003/linkbase',
                'xlink': 'http://www.w3.org/1999/xlink'
            }

            # Optimize: lxml parser; recover=True tolerates minor XML defects
            parser = ET.XMLParser(remove_blank_text=True, recover=True)
            root = ET.XML(content.encode('utf-8'), parser)

            # Optimize: Use XPath with namespaces for faster extraction
            presentation_links = root.xpath('//link:presentationLink', namespaces=nsmap)

            # Optimize: Cache fully-qualified attribute names
            xlink_role = '{http://www.w3.org/1999/xlink}role'
            xlink_from = '{http://www.w3.org/1999/xlink}from'
            xlink_to = '{http://www.w3.org/1999/xlink}to'
            xlink_label = '{http://www.w3.org/1999/xlink}label'
            xlink_href = '{http://www.w3.org/1999/xlink}href'

            for link in presentation_links:
                role = link.get(xlink_role)
                if not role:
                    continue

                # Store role information; a human-readable definition is
                # derived from the last path segment of the role URI
                role_id = role.split('/')[-1] if '/' in role else role
                role_def = role_id.replace('_', ' ')

                self.presentation_roles[role] = {
                    'roleUri': role,
                    'definition': role_def,
                    'roleId': role_id
                }

                # Optimize: Pre-build locator map (xlink:label -> href) to
                # avoid an XPath lookup per arc
                loc_map = {}
                for loc in link.xpath('.//link:loc', namespaces=nsmap):
                    label = loc.get(xlink_label)
                    if label:
                        loc_map[label] = loc.get(xlink_href)

                # Optimize: Extract arcs using direct xpath with context
                arcs = link.xpath('.//link:presentationArc', namespaces=nsmap)

                # Create relationships map - pre-allocate with known size
                relationships = []
                relationships_append = relationships.append  # Local function reference for speed

                # Process arcs with optimized locator lookups
                for arc in arcs:
                    from_ref = arc.get(xlink_from)
                    to_ref = arc.get(xlink_to)

                    if not from_ref or not to_ref:
                        continue

                    # Optimize: Use cached locator references instead of expensive XPath lookups
                    from_href = loc_map.get(from_ref)
                    to_href = loc_map.get(to_ref)

                    if not from_href or not to_href:
                        continue

                    # Parse order attribute correctly (helper from BaseParser)
                    order = self._parse_order_attribute(arc)

                    preferred_label = arc.get('preferredLabel')

                    # Extract element IDs from hrefs
                    from_element = extract_element_id(from_href)
                    to_element = extract_element_id(to_href)

                    # Add relationship using local function reference
                    relationships_append({
                        'from_element': from_element,
                        'to_element': to_element,
                        'order': order,
                        'preferred_label': preferred_label
                    })

                # Build presentation tree for this role if we have relationships
                if relationships:
                    self._build_presentation_tree(role, relationships)

        except Exception as e:
            raise XBRLProcessingError(f"Error parsing presentation content: {str(e)}") from e

    def _build_presentation_tree(self, role: str, relationships: List[Dict[str, Any]]) -> None:
        """
        Build a presentation tree from relationships.

        Args:
            role: Extended link role URI
            relationships: List of relationships (from_element, to_element, order, preferred_label)
        """
        # Group relationships by source element (from_map) and by target
        # element (to_map, used only for root detection)
        from_map = {}
        to_map = {}

        for rel in relationships:
            from_element = rel['from_element']
            to_element = rel['to_element']

            if from_element not in from_map:
                from_map[from_element] = []
            from_map[from_element].append(rel)

            if to_element not in to_map:
                to_map[to_element] = []
            to_map[to_element].append(rel)

        # Find root elements (appear as 'from' but not as 'to')
        root_elements = set(from_map.keys()) - set(to_map.keys())

        if not root_elements:
            return  # No root elements found (e.g. cyclic relationships)

        # Create presentation tree.
        # NOTE(review): when there are multiple roots, root_element_id
        # records an arbitrary one (set iteration order) even though all
        # roots' subtrees are added to all_nodes below — confirm intended.
        tree = PresentationTree(
            role_uri=role,
            definition=self.presentation_roles[role]['definition'],
            root_element_id=next(iter(root_elements)),
            all_nodes={}
        )

        # Build tree recursively; every root's subtree shares all_nodes
        for root_id in root_elements:
            self._build_presentation_subtree(root_id, None, 0, from_map, tree.all_nodes)

        # Add tree to collection
        self.presentation_trees[role] = tree

    def _build_presentation_subtree(self, element_id: str, parent_id: Optional[str], depth: int,
                                    from_map: Dict[str, List[Dict[str, Any]]],
                                    all_nodes: Dict[str, PresentationNode]) -> None:
        """
        Recursively build a presentation subtree.

        Args:
            element_id: Current element ID
            parent_id: Parent element ID (None for a root)
            depth: Current depth in tree (0 for a root)
            from_map: Map of relationships by source element
            all_nodes: Dictionary to store all nodes, keyed by element ID
        """
        # Create node
        node = PresentationNode(
            element_id=element_id,
            parent=parent_id,
            children=[],
            depth=depth
        )

        # Add element information if available in the shared catalog
        if element_id in self.element_catalog:
            elem_info = self.element_catalog[element_id]
            node.element_name = elem_info.name
            node.standard_label = elem_info.labels.get('http://www.xbrl.org/2003/role/label', elem_info.name)

            # Use enhanced abstract detection (Issue #450 fix)
            # The element catalog may not have correct abstract info for standard taxonomy concepts
            # (local import — presumably to avoid an import cycle; confirm)
            from edgar.xbrl.abstract_detection import is_abstract_concept
            node.is_abstract = is_abstract_concept(
                concept_name=elem_info.name,
                schema_abstract=elem_info.abstract,
                has_children=False,  # Will be updated after children are processed
                has_values=False  # Will be determined later when facts are loaded
            )

            node.labels = elem_info.labels

        # Add to collection
        all_nodes[element_id] = node

        # Process children
        if element_id in from_map:
            # Sort children by order
            children = sorted(from_map[element_id], key=lambda r: r['order'])

            for rel in children:
                child_id = rel['to_element']

                # Add child to parent's children list
                node.children.append(child_id)

                # Set preferred label
                preferred_label = rel['preferred_label']

                # Recursively build child subtree
                self._build_presentation_subtree(
                    child_id, element_id, depth + 1, from_map, all_nodes
                )

                # Update preferred label and order after child is built
                if child_id in all_nodes:
                    if preferred_label:
                        all_nodes[child_id].preferred_label = preferred_label
                    all_nodes[child_id].order = rel['order']
||||
210
venv/lib/python3.10/site-packages/edgar/xbrl/parsers/schema.py
Normal file
210
venv/lib/python3.10/site-packages/edgar/xbrl/parsers/schema.py
Normal file
@@ -0,0 +1,210 @@
|
||||
"""
|
||||
Schema parser for XBRL documents.
|
||||
|
||||
This module handles parsing of XBRL taxonomy schemas and element catalog
|
||||
creation with element definitions and properties.
|
||||
"""
|
||||
|
||||
from pathlib import Path
|
||||
from typing import Dict, Union
|
||||
|
||||
from lxml import etree as ET
|
||||
|
||||
from edgar.core import log
|
||||
from edgar.xbrl.models import ElementCatalog, XBRLProcessingError
|
||||
|
||||
from .base import BaseParser
|
||||
|
||||
|
||||
class SchemaParser(BaseParser):
    """Parser for XBRL taxonomy schemas.

    Extracts element declarations into the shared element catalog and, when
    a schema embeds linkbases inside ``xsd:appinfo`` annotations, delegates
    those sections to the label/presentation/calculation/definition parsers
    wired in via :meth:`set_linkbase_parsers`.
    """

    def __init__(self, element_catalog: Dict[str, ElementCatalog]):
        """
        Initialize schema parser with data structure references.

        Args:
            element_catalog: Reference to element catalog dictionary
        """
        super().__init__()

        # Store references to data structures
        self.element_catalog = element_catalog

        # Callbacks for parsing embedded linkbases; left as None until the
        # coordinator wires them in via set_linkbase_parsers()
        self.parse_labels_content = None
        self.parse_presentation_content = None
        self.parse_calculation_content = None
        self.parse_definition_content = None

    def set_linkbase_parsers(self, labels_parser, presentation_parser, calculation_parser, definition_parser):
        """
        Set references to other parsers for embedded linkbase processing.

        Args:
            labels_parser: LabelsParser instance
            presentation_parser: PresentationParser instance
            calculation_parser: CalculationParser instance
            definition_parser: DefinitionParser instance
        """
        self.parse_labels_content = labels_parser.parse_labels_content
        self.parse_presentation_content = presentation_parser.parse_presentation_content
        self.parse_calculation_content = calculation_parser.parse_calculation_content
        self.parse_definition_content = definition_parser.parse_definition_content

    def parse_schema(self, file_path: Union[str, Path]) -> None:
        """Parse schema file and extract element information.

        Raises:
            XBRLProcessingError: If the file cannot be read or parsed.
        """
        try:
            content = Path(file_path).read_text()
            self.parse_schema_content(content)
        except Exception as e:
            raise XBRLProcessingError(f"Error parsing schema file {file_path}: {str(e)}") from e

    def parse_schema_content(self, content: str) -> None:
        """Parse schema content and extract element information.

        Raises:
            XBRLProcessingError: If the content cannot be parsed.
        """
        try:
            # Use the safe XML parsing helper (from BaseParser)
            root = self._safe_parse_xml(content)

            # Extract element declarations
            for element in root.findall('.//{http://www.w3.org/2001/XMLSchema}element'):
                # Prefer the 'id' attribute; fall back to 'name'
                element_id = element.get('id') or element.get('name')
                if not element_id:
                    continue

                # Extract element properties
                data_type = element.get('type', '')

                # Check for balance and period type
                # First check as attributes on the element (modern XBRL style)
                balance_type = element.get('{http://www.xbrl.org/2003/instance}balance')
                period_type = element.get('{http://www.xbrl.org/2003/instance}periodType')
                abstract = element.get('abstract', 'false').lower() == 'true'

                # If not found as attributes, look in nested annotations (legacy style)
                if not balance_type or not period_type:
                    annotation = element.find('.//{http://www.w3.org/2001/XMLSchema}annotation')
                    if annotation is not None:
                        for appinfo in annotation.findall('.//{http://www.w3.org/2001/XMLSchema}appinfo'):
                            if not balance_type:
                                balance_element = appinfo.find('.//{http://www.xbrl.org/2003/instance}balance')
                                if balance_element is not None:
                                    balance_type = balance_element.text

                            if not period_type:
                                period_element = appinfo.find('.//{http://www.xbrl.org/2003/instance}periodType')
                                if period_element is not None:
                                    period_type = period_element.text

                # Create element catalog entry.
                # NOTE(review): this overwrites any existing entry (labels={}),
                # so a placeholder created earlier by the labels parser would
                # lose its labels — confirm schema is parsed before labels.
                self.element_catalog[element_id] = ElementCatalog(
                    name=element_id,
                    data_type=data_type,
                    period_type=period_type or "duration",  # Default to duration
                    balance=balance_type,
                    abstract=abstract,
                    labels={}
                )

            # Extract embedded linkbases if present
            embedded_linkbases = self._extract_embedded_linkbases(content)

            # If embedded linkbases were found, parse them via the wired-in
            # parser callbacks (each skipped when its callback is unset)
            if embedded_linkbases and 'linkbases' in embedded_linkbases:
                if 'label' in embedded_linkbases['linkbases'] and self.parse_labels_content:
                    label_content = embedded_linkbases['linkbases']['label']
                    self.parse_labels_content(label_content)

                if 'presentation' in embedded_linkbases['linkbases'] and self.parse_presentation_content:
                    presentation_content = embedded_linkbases['linkbases']['presentation']
                    self.parse_presentation_content(presentation_content)

                if 'calculation' in embedded_linkbases['linkbases'] and self.parse_calculation_content:
                    calculation_content = embedded_linkbases['linkbases']['calculation']
                    self.parse_calculation_content(calculation_content)

                if 'definition' in embedded_linkbases['linkbases'] and self.parse_definition_content:
                    definition_content = embedded_linkbases['linkbases']['definition']
                    self.parse_definition_content(definition_content)

        except Exception as e:
            raise XBRLProcessingError(f"Error parsing schema content: {str(e)}") from e

    def _extract_embedded_linkbases(self, schema_content: str) -> Dict[str, Dict[str, str]]:
        """
        Extract embedded linkbases and role types from the schema file.

        Args:
            schema_content: XML content of the schema file

        Returns:
            Dictionary containing embedded linkbases (reassembled as
            standalone linkbase XML strings, keyed by linkbase type) and
            role type information keyed by role URI.  On any error the
            partially-filled (possibly empty) dictionary is returned.
        """
        embedded_data = {
            'linkbases': {},
            'role_types': {}
        }

        try:
            # Use the safe XML parsing helper
            root = self._safe_parse_xml(schema_content)

            # Create namespace map for use with XPath
            nsmap = {
                'xsd': 'http://www.w3.org/2001/XMLSchema',
                'link': 'http://www.xbrl.org/2003/linkbase'
            }

            # Find all appinfo elements using optimized XPath
            for appinfo in root.xpath('.//xsd:appinfo', namespaces=nsmap):
                # Extract role types
                for role_type in appinfo.xpath('./link:roleType', namespaces=nsmap):
                    role_uri = role_type.get('roleURI')
                    role_id = role_type.get('id')

                    # Use optimized XPath to find definition
                    definition = role_type.find('./link:definition', nsmap)
                    definition_text = definition.text if definition is not None else ""

                    # Use optimized XPath to find usedOn elements
                    used_on = [elem.text for elem in role_type.xpath('./link:usedOn', namespaces=nsmap) if elem.text]

                    if role_uri:
                        embedded_data['role_types'][role_uri] = {
                            'id': role_id,
                            'definition': definition_text,
                            'used_on': used_on
                        }

                # Find the linkbase element with optimized XPath
                linkbase = appinfo.find('./link:linkbase', nsmap)
                if linkbase is not None:
                    # Extract the entire linkbase element as a string - with proper encoding
                    linkbase_string = ET.tostring(linkbase, encoding='unicode', method='xml')

                    # Extract each type of linkbase with optimized XPath
                    for linkbase_type in ['presentation', 'label', 'calculation', 'definition']:
                        # Use direct child XPath for better performance
                        xpath_expr = f'./link:{linkbase_type}Link'
                        linkbase_elements = linkbase.xpath(xpath_expr, namespaces=nsmap)

                        if linkbase_elements:
                            # Convert all linkbase elements of this type to strings
                            linkbase_strings = [
                                ET.tostring(elem, encoding='unicode', method='xml')
                                for elem in linkbase_elements
                            ]

                            # Join multiple linkbase elements efficiently.
                            # The opening <link:linkbase ...> tag is reused so
                            # its namespace declarations stay in scope for the
                            # fragment.  NOTE(review): assumes the serialized
                            # opening tag contains no '>' inside attribute
                            # values — confirm for exotic inputs.
                            linkbase_header = linkbase_string.split('>', 1)[0] + '>'
                            embedded_data['linkbases'][linkbase_type] = (
                                f"{linkbase_header}\n" +
                                '\n'.join(linkbase_strings) +
                                "\n</link:linkbase>"
                            )

            return embedded_data
        except Exception as e:
            # Log the error but don't fail - just return empty embedded data
            log.warning(f"Warning: Error extracting embedded linkbases: {str(e)}")
            return embedded_data
||||
@@ -0,0 +1,312 @@
|
||||
"""
|
||||
Enhanced period selection with data availability checking.
|
||||
|
||||
This module provides functions to verify that selected periods have sufficient
|
||||
data before displaying them to investors.
|
||||
"""
|
||||
|
||||
from typing import Dict, List, Optional, Set, Tuple
|
||||
|
||||
|
||||
def count_facts_for_period(xbrl_instance, period_key: str, statement_type: Optional[str] = None) -> int:
    """
    Count the number of facts available for a specific period.

    Args:
        xbrl_instance: XBRL instance exposing ``_facts`` (mapping of fact key
            to fact objects with a ``context_ref`` attribute) and ``contexts``
            (mapping of context ID to context objects with ``model_dump()``).
        period_key: Period key to check - either ``instant_<YYYY-MM-DD>`` or
            ``duration_<start>_<end>`` with ISO dates.
        statement_type: Reserved for future filtering by statement type;
            currently unused by the counting logic.

    Returns:
        Number of facts whose context period matches ``period_key``; 0 for a
        malformed or unrecognized key.
    """
    # Parse the period key into match criteria up front so the per-fact loop
    # below only performs dictionary comparisons.
    if period_key.startswith('instant_'):
        period_type = 'instant'
        period_date = period_key[len('instant_'):]
    elif period_key.startswith('duration_'):
        # Expected shape: duration_<start>_<end>; ISO dates contain no
        # underscores, so positional split is safe.  A prefix match (rather
        # than a substring test) avoids misparsing unrelated keys.
        parts = period_key.split('_')
        if len(parts) < 3:
            return 0
        period_type = 'duration'
        start_date, end_date = parts[1], parts[2]
    else:
        return 0

    # Count facts matching this period
    fact_count = 0
    for fact in xbrl_instance._facts.values():
        # Facts without a resolvable context cannot be attributed to a period
        context = xbrl_instance.contexts.get(fact.context_ref)
        if not context:
            continue

        # Check if the context's period matches the requested one
        period_data = context.model_dump().get('period', {})
        if period_type == 'instant':
            if period_data.get('type') == 'instant' and period_data.get('instant') == period_date:
                fact_count += 1
        else:  # duration
            if (period_data.get('type') == 'duration' and
                    period_data.get('startDate') == start_date and
                    period_data.get('endDate') == end_date):
                fact_count += 1

    return fact_count
||||
|
||||
|
||||
def get_essential_concepts_for_statement(statement_type: str) -> Set[str]:
    """
    Return the essential US-GAAP concepts expected for a statement type.

    These are the minimum concepts investors expect to see. Unknown
    statement types yield an empty set.
    """
    balance_sheet_concepts = {
        # Core balance sheet items
        'Assets', 'AssetsCurrent',
        'Liabilities', 'LiabilitiesCurrent',
        'StockholdersEquity', 'StockholdersEquityIncludingPortionAttributableToNoncontrollingInterest',
        # Common important items
        'CashAndCashEquivalentsAtCarryingValue', 'Cash',
        'AccountsReceivableNetCurrent', 'AccountsReceivable',
        'Inventory', 'InventoryNet',
        'PropertyPlantAndEquipmentNet',
        'AccountsPayableCurrent', 'AccountsPayable',
        'LongTermDebt', 'LongTermDebtNoncurrent',
    }
    income_statement_concepts = {
        # Core income items
        'Revenues', 'RevenueFromContractWithCustomerExcludingAssessedTax', 'SalesRevenueNet',
        'CostOfRevenue', 'CostOfGoodsAndServicesSold', 'CostOfGoodsSold',
        'GrossProfit',
        'OperatingExpenses', 'OperatingCostsAndExpenses',
        'OperatingIncomeLoss', 'IncomeLossFromContinuingOperationsBeforeIncomeTaxesExtraordinaryItemsNoncontrollingInterest',
        'NetIncomeLoss', 'ProfitLoss',
        # Common important items
        'ResearchAndDevelopmentExpense',
        'SellingGeneralAndAdministrativeExpense',
        'EarningsPerShareBasic', 'EarningsPerShareDiluted',
    }
    cash_flow_concepts = {
        # Core cash flow items
        'NetCashProvidedByUsedInOperatingActivities',
        'NetCashProvidedByUsedInInvestingActivities',
        'NetCashProvidedByUsedInFinancingActivities',
        'CashAndCashEquivalentsPeriodIncreaseDecrease',
        # Common important items
        'NetIncomeLoss',
        'DepreciationDepletionAndAmortization', 'DepreciationAndAmortization',
        'PaymentsToAcquirePropertyPlantAndEquipment',
        'PaymentsOfDividends', 'PaymentsOfDividendsCommonStock',
    }

    by_statement = {
        'BalanceSheet': balance_sheet_concepts,
        'IncomeStatement': income_statement_concepts,
        'CashFlowStatement': cash_flow_concepts,
    }
    return by_statement.get(statement_type, set())
|
||||
|
||||
|
||||
def _period_matches(period_data: Dict, period_type: str,
                    period_date: Optional[str],
                    start_date: Optional[str], end_date: Optional[str]) -> bool:
    """Return True if a context's period data matches the parsed period key."""
    if period_type == 'instant':
        return (period_data.get('type') == 'instant' and
                period_data.get('instant') == period_date)
    return (period_data.get('type') == 'duration' and
            period_data.get('startDate') == start_date and
            period_data.get('endDate') == end_date)


def check_period_data_quality(xbrl_instance, period_key: str, statement_type: str) -> Dict[str, object]:
    """
    Check the data quality for a specific period.

    Args:
        xbrl_instance: XBRL instance exposing ``_facts``, ``contexts`` and
            ``element_catalog``
        period_key: Period key (e.g. 'instant_2024-09-28' or
            'duration_2024-01-01_2024-03-31')
        statement_type: Statement type whose essential concepts are checked

    Returns:
        Dictionary with quality metrics:
        - fact_count: Total number of facts
        - meaningful_fact_count: Number of facts with meaningful (non-empty) values
        - essential_coverage: Fraction (0.0-1.0) of essential concepts found
        - has_sufficient_data: Boolean indicating if period should be displayed
        - missing_essentials: List of missing essential concepts
        - found_essentials: List of essential concepts that were found
        - has_meaningful_data: Boolean indicating if period has meaningful values
          (fixes Issue #408)
    """
    # Count total facts for the period
    fact_count = count_facts_for_period(xbrl_instance, period_key, statement_type)

    # Count of facts with meaningful (non-empty) values - Fix for Issue #408
    meaningful_fact_count = 0

    # Essential concepts expected for this statement type
    essential_concepts = get_essential_concepts_for_statement(statement_type)
    found_essentials = set()
    missing_essentials = set()

    # Parse period key for context matching
    period_date = start_date = end_date = None
    if period_key.startswith('instant_'):
        period_type = 'instant'
        period_date = period_key.replace('instant_', '')
    else:
        period_type = 'duration'
        parts = period_key.split('_')
        if len(parts) >= 3:
            start_date = parts[1]
            end_date = parts[2]
        else:
            # Malformed duration key: return a COMPLETE result dict.
            # BUG FIX: this early return previously omitted the
            # 'meaningful_fact_count', 'has_meaningful_data' and
            # 'found_essentials' keys, causing a KeyError in callers such as
            # filter_periods_with_data().
            return {
                'fact_count': fact_count,
                'meaningful_fact_count': 0,
                'essential_coverage': 0.0,
                'has_sufficient_data': False,
                'has_meaningful_data': False,
                'missing_essentials': list(essential_concepts),
                'found_essentials': [],
            }

    # Check which essential concepts appear in facts for this period
    for concept in essential_concepts:
        concept_found = False
        for _fact_key, fact in xbrl_instance._facts.items():
            if concept_found:
                break
            # Substring match against the element name in the catalog
            element = xbrl_instance.element_catalog.get(fact.element_id)
            if element and concept in element.name:
                context = xbrl_instance.contexts.get(fact.context_ref)
                if context:
                    period_data = context.model_dump().get('period', {})
                    if _period_matches(period_data, period_type, period_date, start_date, end_date):
                        found_essentials.add(concept)
                        concept_found = True
        if not concept_found:
            missing_essentials.add(concept)

    # Count meaningful facts (non-empty values) - Fix for Issue #408
    for _fact_key, fact in xbrl_instance._facts.items():
        context = xbrl_instance.contexts.get(fact.context_ref)
        if not context:
            continue
        period_data = context.model_dump().get('period', {})
        if not _period_matches(period_data, period_type, period_date, start_date, end_date):
            continue

        fact_value = getattr(fact, 'value', None)
        if fact_value is None:
            continue
        str_value = str(fact_value).strip()
        if str_value and str_value.lower() not in ['', 'nan', 'none']:
            try:
                import pandas as pd
                numeric_value = pd.to_numeric(str_value, errors='coerce')
                if not pd.isna(numeric_value):
                    meaningful_fact_count += 1
            except Exception:
                # Not numeric (or pandas unavailable); a non-empty string
                # may still be meaningful
                if len(str_value) > 0:
                    meaningful_fact_count += 1

    # Calculate coverage of the essential concept set
    essential_coverage = len(found_essentials) / len(essential_concepts) if essential_concepts else 0.0

    # Require at least 50% essential coverage or 20+ facts
    has_sufficient_data = essential_coverage >= 0.5 or fact_count >= 20

    # A period has meaningful data if at least one fact carries a non-empty
    # value (fixes Issue #408)
    has_meaningful_data = meaningful_fact_count > 0

    return {
        'fact_count': fact_count,
        'meaningful_fact_count': meaningful_fact_count,
        'essential_coverage': essential_coverage,
        'has_sufficient_data': has_sufficient_data,
        'has_meaningful_data': has_meaningful_data,
        'missing_essentials': list(missing_essentials),
        'found_essentials': list(found_essentials)
    }
|
||||
|
||||
|
||||
def filter_periods_with_data(xbrl_instance, periods: List[Tuple[str, str]],
                             statement_type: str,
                             min_fact_count: int = 10) -> List[Tuple[str, str]]:
    """
    Filter periods to only include those with sufficient data.

    Args:
        xbrl_instance: XBRL instance
        periods: List of (period_key, label) tuples
        statement_type: Type of statement
        min_fact_count: Minimum number of facts required

    Returns:
        Filtered list of periods with sufficient data
    """
    kept: List[Tuple[str, str]] = []

    for period_key, label in periods:
        quality = check_period_data_quality(xbrl_instance, period_key, statement_type)

        # Keep the period only when it has sufficient data AND meaningful
        # (non-empty) values - fixes Issue #408
        meets_bar = (quality['has_sufficient_data']
                     and quality['fact_count'] >= min_fact_count
                     and quality['has_meaningful_data'])
        if meets_bar:
            kept.append((period_key, label))

    return kept
|
||||
|
||||
|
||||
def determine_investor_preferred_periods(xbrl_instance, statement_type: str) -> List[Tuple[str, str]]:
    """
    Enhanced period selection that prioritizes what investors want to see.

    For Annual Reports:
        1. Current fiscal year
        2. Prior fiscal year (YoY comparison)
        3. Two years ago (3-year trend)

    For Quarterly Reports:
        1. Current quarter
        2. Same quarter prior year (YoY)
        3. Current YTD
        4. Prior year YTD

    Only includes periods with sufficient data.
    """
    from edgar.xbrl.period_selector import select_periods

    # Unified period selection provides the candidate set
    candidates = select_periods(xbrl_instance, statement_type)

    # Keep only candidates backed by actual data
    selected = filter_periods_with_data(
        xbrl_instance, candidates, statement_type, min_fact_count=10
    )

    # Too strict? Retry with a lower fact threshold when filtering dropped
    # the comparison periods investors need.
    if len(selected) < 2 and len(candidates) >= 2:
        selected = filter_periods_with_data(
            xbrl_instance, candidates, statement_type, min_fact_count=5
        )

    return selected
|
||||
622
venv/lib/python3.10/site-packages/edgar/xbrl/period_selector.py
Normal file
622
venv/lib/python3.10/site-packages/edgar/xbrl/period_selector.py
Normal file
@@ -0,0 +1,622 @@
|
||||
"""
|
||||
Unified Period Selection System
|
||||
|
||||
A streamlined, single-responsibility approach to XBRL period selection that:
|
||||
- Consolidates logic from legacy periods.py and smart_periods.py
|
||||
- Always applies document date filtering to prevent future period bugs
|
||||
- Preserves essential fiscal intelligence while eliminating complexity
|
||||
- Provides a single, clear entry point for all period selection
|
||||
|
||||
This replaces 1,275 lines of dual-system complexity with ~200 lines of focused logic.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from datetime import date, datetime
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def select_periods(xbrl, statement_type: str, max_periods: int = 4) -> List[Tuple[str, str]]:
    """
    Single entry point for period selection.

    Args:
        xbrl: XBRL instance with reporting_periods and entity_info
        statement_type: 'BalanceSheet', 'IncomeStatement', 'CashFlowStatement', etc.
        max_periods: Maximum number of periods to return

    Returns:
        List of (period_key, period_label) tuples, most recent first
    """
    all_periods = xbrl.reporting_periods
    document_end_date = xbrl.period_of_report

    if not all_periods:
        logger.warning("No reporting periods available for %s", xbrl.entity_name)
        return []

    # Step 1: Always filter by document date first (prevents future date bugs)
    filtered_periods = _filter_by_document_date(all_periods, document_end_date)
    if not filtered_periods:
        logger.warning("No valid periods found after document date filtering for %s", xbrl.entity_name)
        # Fall back to the unfiltered period list
        return [(p['key'], p['label']) for p in all_periods[:max_periods]]

    try:
        # Step 2: Statement-specific candidate selection
        if statement_type == 'BalanceSheet':
            candidate_periods = _select_balance_sheet_periods(filtered_periods, max_periods)
        else:
            # Income / cash flow statements use duration periods
            candidate_periods = _select_duration_periods(filtered_periods, xbrl.entity_info, max_periods)

        # Step 3: Drop candidates with insufficient data
        periods_with_data = _filter_periods_with_sufficient_data(xbrl, candidate_periods, statement_type)
        if periods_with_data:
            return periods_with_data

        # No candidate had sufficient data; return them all rather than nothing
        logger.warning("No periods with sufficient data found for %s %s, returning all candidates", xbrl.entity_name, statement_type)
        return candidate_periods

    except Exception as e:
        logger.error("Period selection failed for %s %s: %s", xbrl.entity_name, statement_type, e)
        # Final fallback: document-date-filtered periods, most recent first
        return [(p['key'], p['label']) for p in filtered_periods[:max_periods]]
|
||||
|
||||
|
||||
def _filter_by_document_date(periods: List[Dict], document_end_date: Optional[str]) -> List[Dict]:
|
||||
"""
|
||||
Filter periods to only include those that end on or before the document date.
|
||||
|
||||
This prevents the future date bug where periods from 2026-2029 were selected
|
||||
for a 2024 filing.
|
||||
"""
|
||||
if not document_end_date:
|
||||
return periods
|
||||
|
||||
try:
|
||||
doc_end_date = datetime.strptime(document_end_date, '%Y-%m-%d').date()
|
||||
except (ValueError, TypeError):
|
||||
logger.debug("Could not parse document end date: %s", document_end_date)
|
||||
return periods
|
||||
|
||||
filtered_periods = []
|
||||
for period in periods:
|
||||
try:
|
||||
if period['type'] == 'instant':
|
||||
period_date = datetime.strptime(period['date'], '%Y-%m-%d').date()
|
||||
if period_date <= doc_end_date:
|
||||
filtered_periods.append(period)
|
||||
else: # duration
|
||||
period_end_date = datetime.strptime(period['end_date'], '%Y-%m-%d').date()
|
||||
if period_end_date <= doc_end_date:
|
||||
filtered_periods.append(period)
|
||||
except (ValueError, TypeError):
|
||||
# If we can't parse the period date, include it to be safe
|
||||
filtered_periods.append(period)
|
||||
|
||||
return filtered_periods
|
||||
|
||||
|
||||
def _select_balance_sheet_periods(periods: List[Dict], max_periods: int) -> List[Tuple[str, str]]:
    """
    Select instant periods for balance sheet statements.

    Balance sheets are point-in-time snapshots, so only instant periods
    qualify. We deliberately over-select candidates (wider net) and rely on
    downstream data-quality filtering to pick the best ones.
    """
    instants = [p for p in periods if p['type'] == 'instant']
    if not instants:
        logger.warning("No instant periods found for balance sheet")
        return []

    # Most recent first
    ordered = _sort_periods_by_date(instants, 'instant')

    # Issue #464: checking only the first 4 periods missed prior fiscal year
    # ends; many filings carry extra instant periods (quarterly, mid-year)
    # with minimal data, so take up to 10 candidates, capped at 3x max_periods.
    candidate_count = min(10, len(ordered))
    cap = max_periods * 3

    chosen: List[Tuple[str, str]] = []
    for period in ordered[:candidate_count]:
        chosen.append((period['key'], period['label']))
        if len(chosen) >= cap:
            break

    return chosen
|
||||
|
||||
|
||||
def _select_duration_periods(periods: List[Dict], entity_info: Dict[str, Any], max_periods: int) -> List[Tuple[str, str]]:
    """
    Select duration periods for income/cash flow statements with fiscal
    intelligence.

    Annual filings prefer truly-annual periods ranked by fiscal-year
    alignment; everything else falls through to the quarterly logic.
    """
    duration_periods = [p for p in periods if p['type'] == 'duration']
    if not duration_periods:
        logger.warning("No duration periods found for income/cash flow statement")
        return []

    fiscal_period = entity_info.get('fiscal_period', 'FY')

    # Annual report: pick annual periods scored by fiscal year-end alignment
    if fiscal_period == 'FY':
        annual = _get_annual_periods(duration_periods)
        if annual:
            ranked = _score_fiscal_alignment(
                annual,
                entity_info.get('fiscal_year_end_month'),
                entity_info.get('fiscal_year_end_day'),
            )
            return [(p['key'], p['label']) for p in ranked[:max_periods]]

    # Quarterly report, or annual filing without annual periods
    return _select_quarterly_periods(duration_periods, max_periods)
|
||||
|
||||
|
||||
def _select_quarterly_periods(duration_periods: List[Dict], max_periods: int) -> List[Tuple[str, str]]:
    """
    Select quarterly periods with intelligent investor-focused logic.

    For quarterly filings, investors typically want:
    1. Current quarter (most recent quarterly period)
    2. Same quarter from prior year (YoY comparison)
    3. Year-to-date current year (6-month, 9-month YTD)
    4. Year-to-date prior year (comparative YTD)

    Issue #464 Fix: Cast wider net by checking more quarterly periods and returning
    more candidates (max_periods * 3) to let data quality filtering select the best ones.
    This mirrors the successful Balance Sheet fix from v4.20.1.
    """
    if not duration_periods:
        return []

    # Categorize periods by duration to identify types
    quarterly_periods = []  # ~90 days (80-100)
    ytd_periods = []  # 180-280 days (semi-annual, 9-month YTD)

    for period in duration_periods:
        try:
            start_date = datetime.strptime(period['start_date'], '%Y-%m-%d').date()
            end_date = datetime.strptime(period['end_date'], '%Y-%m-%d').date()
            duration_days = (end_date - start_date).days

            if 80 <= duration_days <= 100:  # Quarterly
                quarterly_periods.append(period)
            elif 150 <= duration_days <= 285:  # YTD (semi-annual to 9-month)
                ytd_periods.append(period)
            # Skip periods that are too short (<80 days) or too long (>285 days but <300)

        except (ValueError, TypeError, KeyError):
            # Unparseable/incomplete periods are ignored entirely
            continue

    # Sort periods by end date (most recent first)
    quarterly_periods = _sort_periods_by_date(quarterly_periods, 'duration')
    ytd_periods = _sort_periods_by_date(ytd_periods, 'duration')

    selected_periods = []

    # 1. Add current quarter (most recent quarterly period)
    if quarterly_periods:
        current_quarter = quarterly_periods[0]
        selected_periods.append((current_quarter['key'], current_quarter['label']))

        # 2. Find same quarter from prior year for YoY comparison
        # Issue #464: Check more quarterly periods to find prior year matches
        try:
            current_end = datetime.strptime(current_quarter['end_date'], '%Y-%m-%d').date()
            target_year = current_end.year - 1

            # Check up to 12 quarterly periods instead of just a few
            check_count = min(12, len(quarterly_periods) - 1)
            for period in quarterly_periods[1:check_count + 1]:
                period_end = datetime.strptime(period['end_date'], '%Y-%m-%d').date()
                # Same quarter if same month and within 15 days, previous year
                if (period_end.year == target_year and
                    period_end.month == current_end.month and
                    abs(period_end.day - current_end.day) <= 15):
                    selected_periods.append((period['key'], period['label']))
                    break
        except (ValueError, TypeError, KeyError):
            # No usable prior-year match; continue with what we have
            pass

    # 3. Add current year YTD (most recent YTD period)
    if ytd_periods:
        current_ytd = ytd_periods[0]
        # Avoid duplicates - check if this YTD period is already selected as quarterly
        if not any(current_ytd['key'] == key for key, _ in selected_periods):
            selected_periods.append((current_ytd['key'], current_ytd['label']))

    # 4. Add additional YTD candidates for data quality filtering to choose from
    # Issue #464: Cast wider net instead of strict matching to handle fiscal year differences
    # Example: AAPL current YTD ends June 29, prior YTD ends July 1 (different months)
    # Let data quality filtering choose the best periods based on fact counts
    if len(selected_periods) < max_periods * 3:
        added_keys = {key for key, _ in selected_periods}
        check_count = min(8, len(ytd_periods) - 1)
        for period in ytd_periods[1:check_count + 1]:  # Skip first (already added as current_ytd)
            if period['key'] not in added_keys and len(selected_periods) < max_periods * 3:
                selected_periods.append((period['key'], period['label']))
                added_keys.add(period['key'])

    # If we still don't have enough periods, add other quarterly periods
    # Issue #464: Check more periods and return more candidates
    if len(selected_periods) < max_periods * 3:
        added_keys = {key for key, _ in selected_periods}
        check_count = min(12, len(quarterly_periods))
        for period in quarterly_periods[:check_count]:
            if period['key'] not in added_keys and len(selected_periods) < max_periods * 3:
                selected_periods.append((period['key'], period['label']))
                added_keys.add(period['key'])

    # Issue #464: Return max_periods * 3 candidates instead of just max_periods
    # Let data quality filtering in _filter_periods_with_sufficient_data choose the best ones
    # This mirrors the successful Balance Sheet fix from v4.20.1
    return selected_periods[:max_periods * 3]
|
||||
|
||||
|
||||
def _get_annual_periods(duration_periods: List[Dict]) -> List[Dict]:
    """
    Keep only truly annual duration periods (see _is_annual_period).

    Consolidates the 300-day logic that was duplicated across both legacy
    systems.
    """
    return [period for period in duration_periods if _is_annual_period(period)]
|
||||
|
||||
|
||||
def _is_annual_period(period: Dict) -> bool:
|
||||
"""
|
||||
Determine if a period is truly annual (300-400 days).
|
||||
|
||||
Annual periods should be approximately one year, allowing for:
|
||||
- Leap years (366 days)
|
||||
- Slight variations in fiscal year end dates
|
||||
- But rejecting multi-year cumulative periods
|
||||
"""
|
||||
try:
|
||||
start_date = datetime.strptime(period['start_date'], '%Y-%m-%d').date()
|
||||
end_date = datetime.strptime(period['end_date'], '%Y-%m-%d').date()
|
||||
duration_days = (end_date - start_date).days
|
||||
# Annual periods should be between 300-400 days
|
||||
# This rejects quarterly (~90 days) and multi-year (>400 days) periods
|
||||
return 300 < duration_days <= 400
|
||||
except (ValueError, TypeError, KeyError):
|
||||
return False
|
||||
|
||||
|
||||
def _score_fiscal_alignment(periods: List[Dict], fiscal_month: Optional[int], fiscal_day: Optional[int]) -> List[Dict]:
    """
    Rank periods by how well they align with the entity's fiscal year end.

    Without fiscal information the periods are simply sorted by date.
    Each returned period dict carries an added 'fiscal_score' key.
    """
    if fiscal_month is None or fiscal_day is None:
        # No fiscal info available, just sort by date
        return _sort_periods_by_date(periods, 'duration')

    annotated: List[Dict] = []
    for period in periods:
        entry = period.copy()
        try:
            end = datetime.strptime(period['end_date'], '%Y-%m-%d').date()
            entry['fiscal_score'] = _calculate_fiscal_alignment_score(end, fiscal_month, fiscal_day)
        except (ValueError, TypeError, KeyError):
            # Unscorable periods sink to the bottom
            entry['fiscal_score'] = 0
        annotated.append(entry)

    # Highest score first, ties broken by most recent end date
    annotated.sort(key=lambda p: (p.get('fiscal_score', 0), p.get('end_date', '')), reverse=True)
    return annotated
|
||||
|
||||
|
||||
def _calculate_fiscal_alignment_score(end_date: date, fiscal_month: int, fiscal_day: int) -> int:
|
||||
"""
|
||||
Calculate fiscal year alignment score (0-100).
|
||||
|
||||
Consolidated from the legacy system's fiscal alignment logic.
|
||||
"""
|
||||
if end_date.month == fiscal_month and end_date.day == fiscal_day:
|
||||
return 100 # Perfect fiscal year end match
|
||||
elif end_date.month == fiscal_month and abs(end_date.day - fiscal_day) <= 15:
|
||||
return 75 # Same month, within 15 days
|
||||
elif abs(end_date.month - fiscal_month) <= 1:
|
||||
return 50 # Adjacent month
|
||||
else:
|
||||
return 25 # Different quarter
|
||||
|
||||
|
||||
def _sort_periods_by_date(periods: List[Dict], period_type: str) -> List[Dict]:
|
||||
"""
|
||||
Sort periods by date (most recent first).
|
||||
|
||||
Handles both instant and duration periods correctly.
|
||||
"""
|
||||
def get_sort_key(period):
|
||||
try:
|
||||
if period_type == 'instant':
|
||||
return datetime.strptime(period['date'], '%Y-%m-%d').date()
|
||||
else: # duration
|
||||
return datetime.strptime(period['end_date'], '%Y-%m-%d').date()
|
||||
except (ValueError, TypeError, KeyError):
|
||||
return date.min # Sort problematic periods to the end
|
||||
|
||||
return sorted(periods, key=get_sort_key, reverse=True)
|
||||
|
||||
|
||||
def _calculate_dynamic_thresholds(facts_by_period: Dict, statement_type: str) -> int:
|
||||
"""
|
||||
Calculate minimum fact threshold based on actual data distribution.
|
||||
|
||||
This adapts to company size - small companies get lower thresholds,
|
||||
large companies maintain high standards.
|
||||
|
||||
Args:
|
||||
facts_by_period: Pre-grouped facts by period key
|
||||
statement_type: Statement type to analyze
|
||||
|
||||
Returns:
|
||||
Minimum fact count threshold for this company/statement
|
||||
"""
|
||||
# Collect fact counts for this statement type across all periods
|
||||
statement_fact_counts = []
|
||||
|
||||
for period_key, period_facts in facts_by_period.items():
|
||||
statement_facts = [
|
||||
f for f in period_facts
|
||||
if f.get('statement_type') == statement_type
|
||||
]
|
||||
if statement_facts:
|
||||
statement_fact_counts.append(len(statement_facts))
|
||||
|
||||
if not statement_fact_counts:
|
||||
# No data for this statement type - use conservative default
|
||||
return 10
|
||||
|
||||
# Sort to find the richest periods
|
||||
statement_fact_counts.sort(reverse=True)
|
||||
|
||||
# Strategy: Use 40% of the richest period's fact count as minimum
|
||||
# This adapts to company size while still filtering sparse periods
|
||||
richest_period_facts = statement_fact_counts[0]
|
||||
|
||||
# Calculate adaptive threshold
|
||||
adaptive_threshold = int(richest_period_facts * 0.4)
|
||||
|
||||
# Apply floor and ceiling
|
||||
MIN_FLOOR = 10 # Never go below 10 facts
|
||||
MAX_CEILING = {
|
||||
'BalanceSheet': 40,
|
||||
'IncomeStatement': 25,
|
||||
'CashFlowStatement': 20
|
||||
}
|
||||
|
||||
threshold = max(MIN_FLOOR, min(adaptive_threshold, MAX_CEILING.get(statement_type, 30)))
|
||||
|
||||
logger.debug("Dynamic threshold for %s: %d (richest period: %d facts, 40%% = %d)",
|
||||
statement_type, threshold, richest_period_facts, adaptive_threshold)
|
||||
|
||||
return threshold
|
||||
|
||||
|
||||
def _calculate_dynamic_concept_diversity(facts_by_period: Dict, statement_type: str) -> int:
|
||||
"""
|
||||
Calculate minimum concept diversity based on actual data.
|
||||
|
||||
Returns:
|
||||
Minimum unique concept count for this company/statement
|
||||
"""
|
||||
if statement_type != 'BalanceSheet':
|
||||
return 0 # Only apply to Balance Sheets for now
|
||||
|
||||
# Find maximum concept diversity across periods
|
||||
max_concepts = 0
|
||||
for period_facts in facts_by_period.values():
|
||||
statement_facts = [
|
||||
f for f in period_facts
|
||||
if f.get('statement_type') == statement_type
|
||||
]
|
||||
unique_concepts = len(set(f.get('concept') for f in statement_facts if f.get('concept')))
|
||||
max_concepts = max(max_concepts, unique_concepts)
|
||||
|
||||
# Require 30% of maximum concept diversity, but at least 5
|
||||
diversity_threshold = max(5, int(max_concepts * 0.3))
|
||||
|
||||
logger.debug("Dynamic concept diversity for %s: %d (max concepts: %d)",
|
||||
statement_type, diversity_threshold, max_concepts)
|
||||
|
||||
return diversity_threshold
|
||||
|
||||
|
||||
# Enhanced essential concept patterns with multiple variations.
# Each statement type maps to a list of pattern groups; a group counts as
# "found" when ANY of its patterns substring-matches a fact concept
# (case-insensitively, see _check_essential_concepts_flexible).
ESSENTIAL_CONCEPT_PATTERNS = {
    'BalanceSheet': [
        # Pattern groups - any match in group counts as finding that concept
        ['Assets', 'AssetsCurrent', 'AssetsNoncurrent', 'AssetsFairValueDisclosure'],
        ['Liabilities', 'LiabilitiesCurrent', 'LiabilitiesNoncurrent', 'LiabilitiesAndStockholdersEquity'],
        # Equity comes in many entity-structure flavors (corporations,
        # partnerships, LLCs)
        ['Equity', 'StockholdersEquity', 'ShareholdersEquity', 'PartnersCapital',
         'MembersEquity', 'ShareholdersEquityIncludingPortionAttributableToNoncontrollingInterest']
    ],
    'IncomeStatement': [
        ['Revenue', 'Revenues', 'SalesRevenue', 'SalesRevenueNet', 'RevenueFromContractWithCustomer'],
        ['NetIncome', 'NetIncomeLoss', 'ProfitLoss', 'NetIncomeLossAvailableToCommonStockholdersBasic'],
        ['OperatingIncome', 'OperatingIncomeLoss', 'IncomeLossFromOperations']
    ],
    'CashFlowStatement': [
        # The three canonical cash-flow sections
        ['OperatingCashFlow', 'NetCashProvidedByUsedInOperatingActivities',
         'CashProvidedByUsedInOperatingActivities'],
        ['InvestingCashFlow', 'NetCashProvidedByUsedInInvestingActivities',
         'CashProvidedByUsedInInvestingActivities'],
        ['FinancingCashFlow', 'NetCashProvidedByUsedInFinancingActivities',
         'CashProvidedByUsedInFinancingActivities']
    ]
}
|
||||
|
||||
|
||||
def _check_essential_concepts_flexible(statement_facts: List[Dict], statement_type: str) -> int:
    """
    Count essential concept groups present using flexible pattern matching.

    Returns the number of groups (not individual patterns) from
    ESSENTIAL_CONCEPT_PATTERNS for which at least one pattern appears as a
    case-insensitive substring of some fact concept.
    """
    concept_groups = ESSENTIAL_CONCEPT_PATTERNS.get(statement_type, [])
    if not concept_groups:
        return 0

    # Lower-case every fact concept once, up front
    fact_concepts = [f.get('concept', '').lower() for f in statement_facts if f.get('concept')]

    matched_groups = 0
    for pattern_group in concept_groups:
        for pattern in pattern_group:
            needle = pattern.lower()
            if any(needle in concept for concept in fact_concepts):
                logger.debug("Essential concept matched: %s (from group %s)",
                             pattern, pattern_group[0])
                matched_groups += 1
                break  # one hit is enough for this group

    return matched_groups
|
||||
|
||||
|
||||
def _filter_periods_with_sufficient_data(xbrl, candidate_periods: List[Tuple[str, str]], statement_type: str) -> List[Tuple[str, str]]:
    """
    Filter periods to only include those with sufficient financial data.

    This prevents selection of periods that exist in the taxonomy but have
    no meaningful financial facts (like the Alphabet 2019 case).

    Issue #464: Added statement-specific fact count checks and concept diversity
    requirements to prevent showing sparse historical periods with only 1-2 concepts.

    Performance optimization: Retrieves all facts once and works with in-memory data
    instead of creating 40+ DataFrames per statement rendering.

    Args:
        xbrl: XBRL instance providing ``facts.get_facts()``.
        candidate_periods: (period_key, period_label) tuples to vet.
        statement_type: Statement type the facts must belong to.

    Returns:
        The subset of candidate_periods that passed all data-sufficiency checks,
        in the original order.
    """
    MIN_FACTS_THRESHOLD = 10  # Minimum facts needed for a period to be considered viable

    # PERFORMANCE FIX: Get all facts once at the start (single operation)
    all_facts = xbrl.facts.get_facts()  # Returns List[Dict] - fast!

    # Pre-group facts by period_key (O(n) operation, done once)
    facts_by_period = {}
    for fact in all_facts:
        period_key = fact.get('period_key')
        if period_key:
            facts_by_period.setdefault(period_key, []).append(fact)

    # Pre-group facts by statement type within each period
    statement_facts_by_period = {
        period_key: [f for f in period_facts if f.get('statement_type') == statement_type]
        for period_key, period_facts in facts_by_period.items()
    }

    # DYNAMIC THRESHOLDS: Calculate based on this company's data distribution
    statement_min_facts = _calculate_dynamic_thresholds(facts_by_period, statement_type)
    min_concept_diversity = _calculate_dynamic_concept_diversity(facts_by_period, statement_type)

    # Get essential concept groups for this statement type
    required_concept_groups = len(ESSENTIAL_CONCEPT_PATTERNS.get(statement_type, []))

    periods_with_data = []

    # Loop through candidates using pre-computed groups (no DataFrame conversions!)
    for period_key, period_label in candidate_periods:
        try:
            # Get pre-grouped facts (fast list access, not DataFrame query)
            statement_facts = statement_facts_by_period.get(period_key, [])
            period_facts = facts_by_period.get(period_key, [])

            statement_fact_count = len(statement_facts)
            total_fact_count = len(period_facts)

            # Check statement-specific threshold
            if statement_fact_count < statement_min_facts:
                logger.debug("Period %s has insufficient %s facts (%d < %d)",
                             period_label, statement_type, statement_fact_count, statement_min_facts)
                continue

            # Fallback check for total facts
            if total_fact_count < MIN_FACTS_THRESHOLD:
                logger.debug("Period %s has insufficient facts (%d < %d)",
                             period_label, total_fact_count, MIN_FACTS_THRESHOLD)
                continue

            # Compute the unique-concept count once; it serves both the
            # balance-sheet diversity gate and the success log below.
            unique_concepts = len({f.get('concept') for f in statement_facts if f.get('concept')})

            # Check concept diversity (Issue #464)
            if statement_type == 'BalanceSheet' and unique_concepts < min_concept_diversity:
                logger.debug("Period %s lacks concept diversity (%d < %d unique concepts)",
                             period_label, unique_concepts, min_concept_diversity)
                continue

            # FLEXIBLE CONCEPT MATCHING: Check essential concepts using pattern groups
            essential_concept_count = _check_essential_concepts_flexible(statement_facts, statement_type)

            # Require at least half the essential concept groups
            min_essential_required = max(1, required_concept_groups // 2)
            if essential_concept_count >= min_essential_required:
                periods_with_data.append((period_key, period_label))
                logger.debug("Period %s has sufficient data: %d %s facts, %d unique concepts, %d/%d essential concepts",
                             period_label, statement_fact_count, statement_type,
                             unique_concepts,
                             essential_concept_count, required_concept_groups)
            else:
                logger.debug("Period %s lacks essential concepts: %d/%d present",
                             period_label, essential_concept_count, required_concept_groups)

        except Exception as e:
            logger.warning("Error checking data for period %s: %s", period_label, e)
            # Be more conservative - don't include if we can't verify
            continue

    return periods_with_data
|
||||
|
||||
|
||||
# Legacy compatibility functions - to be removed after migration
|
||||
def determine_periods_to_display(xbrl_instance, statement_type: str) -> List[Tuple[str, str]]:
    """Legacy compatibility wrapper.

    Deprecated shim kept so existing callers keep working; it logs a
    deprecation warning and delegates to select_periods().
    """
    logger.warning("Using legacy compatibility wrapper - update to use select_periods() directly")
    selected = select_periods(xbrl_instance, statement_type)
    return selected
|
||||
|
||||
|
||||
def select_smart_periods(xbrl, statement_type: str, max_periods: int = 4) -> List[Tuple[str, str]]:
    """Legacy compatibility wrapper.

    Deprecated shim retained for backwards compatibility; it logs a
    deprecation warning and delegates to select_periods(), forwarding
    the max_periods cap unchanged.
    """
    logger.warning("Using legacy compatibility wrapper - update to use select_periods() directly")
    chosen = select_periods(xbrl, statement_type, max_periods)
    return chosen
|
||||
693
venv/lib/python3.10/site-packages/edgar/xbrl/periods.py
Normal file
693
venv/lib/python3.10/site-packages/edgar/xbrl/periods.py
Normal file
@@ -0,0 +1,693 @@
|
||||
"""
|
||||
Period handling functionality for XBRL statements.
|
||||
|
||||
This module provides functions for handling periods in XBRL statements, including:
|
||||
- Determining available period views for different statement types
|
||||
- Selecting appropriate periods for display
|
||||
- Handling fiscal year and quarter information
|
||||
"""
|
||||
|
||||
from datetime import datetime
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
|
||||
# Configuration for different statement types
#
# Each entry drives period selection for one statement type:
#   - period_type: 'instant' for point-in-time statements (balance sheets),
#     'duration' for statements covering a span (income, cash flow)
#   - max_periods: default cap on how many periods are displayed
#   - allow_annual_comparison: when True and the filing is annual, fiscal-
#     year-aligned comparison periods may be offered
#   - views: named predefined period views; a view is offered only when at
#     least 'requires_min_periods' matching periods exist and shows at most
#     its own 'max_periods'. 'annual_only' restricts a view to annual
#     filings; 'mixed_view' combines YTD and quarterly periods.
STATEMENT_TYPE_CONFIG = {
    # Point-in-time statement: compares up to three recent balance dates.
    'BalanceSheet': {
        'period_type': 'instant',
        'max_periods': 3,
        'allow_annual_comparison': True,
        'views': [
            {
                'name': 'Three Recent Periods',
                'description': 'Shows three most recent reporting periods',
                'max_periods': 3,
                'requires_min_periods': 3
            },
            {
                'name': 'Current vs. Previous Period',
                'description': 'Shows the current period and the previous period',
                'max_periods': 2,
                'requires_min_periods': 1
            },
            {
                'name': 'Three-Year Annual Comparison',
                'description': 'Shows three fiscal years for comparison',
                'max_periods': 3,
                'requires_min_periods': 3,
                'annual_only': True
            }
        ]
    },
    # Duration statement: also supports a mixed YTD/quarterly breakdown view.
    'IncomeStatement': {
        'period_type': 'duration',
        'max_periods': 3,
        'allow_annual_comparison': True,
        'views': [
            {
                'name': 'Three Recent Periods',
                'description': 'Shows three most recent reporting periods',
                'max_periods': 3,
                'requires_min_periods': 3
            },
            {
                'name': 'YTD and Quarterly Breakdown',
                'description': 'Shows YTD figures and quarterly breakdown',
                'max_periods': 5,
                'requires_min_periods': 2,
                'mixed_view': True
            }
        ]
    },
    'StatementOfEquity': {
        'period_type': 'duration',
        'max_periods': 3,
        'views': [
            {
                'name': 'Three Recent Periods',
                'description': 'Shows three most recent reporting periods',
                'max_periods': 3,
                'requires_min_periods': 1
            }
        ]
    },
    'ComprehensiveIncome': {
        'period_type': 'duration',
        'max_periods': 3,
        'views': [
            {
                'name': 'Three Recent Periods',
                'description': 'Shows three most recent reporting periods',
                'max_periods': 3,
                'requires_min_periods': 1
            }
        ]
    },
    # Single-period statement types: only the current period is shown.
    'CoverPage': {
        'period_type': 'instant',
        'max_periods': 1,
        'views': [
            {
                'name': 'Current Period',
                'description': 'Shows the current reporting period',
                'max_periods': 1,
                'requires_min_periods': 1
            }
        ]
    },
    'Notes': {
        'period_type': 'instant',
        'max_periods': 1,
        'views': [
            {
                'name': 'Current Period',
                'description': 'Shows the current reporting period',
                'max_periods': 1,
                'requires_min_periods': 1
            }
        ]
    }
}
|
||||
|
||||
def sort_periods(periods: List[Dict], period_type: str) -> List[Dict]:
    """Order *periods* most-recent-first.

    Instant periods are ordered by their single 'date' value; duration
    periods by ('end_date', 'start_date'), so periods sharing an end date
    break the tie on the later start date.
    """
    if period_type == 'instant':
        def sort_key(period):
            return period['date']
    else:
        def sort_key(period):
            return (period['end_date'], period['start_date'])
    return sorted(periods, key=sort_key, reverse=True)
|
||||
|
||||
def filter_periods_by_document_end_date(periods: List[Dict], document_period_end_date: str, period_type: str) -> List[Dict]:
    """Keep only periods ending on or before the document period end date.

    If the document end date is missing or unparseable, the input list is
    returned unchanged. Periods whose own date cannot be parsed are kept
    rather than dropped.
    """
    if not document_period_end_date:
        return periods

    try:
        cutoff = datetime.strptime(document_period_end_date, '%Y-%m-%d').date()
    except (ValueError, TypeError):
        # Unparseable document end date: nothing reliable to filter against.
        return periods

    # Instant periods carry a single 'date'; duration periods an 'end_date'.
    date_field = 'date' if period_type == 'instant' else 'end_date'

    def keep(period):
        try:
            return datetime.strptime(period[date_field], '%Y-%m-%d').date() <= cutoff
        except (ValueError, TypeError):
            # If we can't parse the period date, include it to be safe.
            return True

    return [period for period in periods if keep(period)]
|
||||
|
||||
def filter_periods_by_type(periods: List[Dict], period_type: str) -> List[Dict]:
    """Return only the entries of *periods* whose 'type' equals *period_type*."""
    return [entry for entry in periods if entry['type'] == period_type]
|
||||
|
||||
def calculate_fiscal_alignment_score(end_date: datetime.date, fiscal_month: int, fiscal_day: int) -> int:
    """Score (0-100) how closely *end_date* matches the fiscal year end.

    100 = exact month and day match; 75 = same month within 15 days;
    50 = adjacent month within 15 days of the day; 0 = no alignment.
    """
    month_delta = abs(end_date.month - fiscal_month)
    day_delta = abs(end_date.day - fiscal_day)

    if month_delta == 0 and day_delta == 0:
        return 100  # exact fiscal year-end match
    if month_delta == 0 and day_delta <= 15:
        return 75   # same month, within about two weeks
    if month_delta <= 1 and day_delta <= 15:
        return 50   # neighbouring month, close to the day
    return 0
|
||||
|
||||
|
||||
def generate_period_view(view_config: Dict[str, Any], periods: List[Dict], is_annual: bool = False) -> Optional[Dict[str, Any]]:
    """Generate a period view based on configuration and available periods.

    Args:
        view_config: Configuration for the view (from STATEMENT_TYPE_CONFIG)
        periods: List of periods to choose from
        is_annual: Whether this is an annual report

    Returns:
        Dictionary with view name, description, and period keys if view is valid,
        None if view cannot be generated with available periods
    """
    # Guard: not enough periods for this view.
    if len(periods) < view_config['requires_min_periods']:
        return None

    # Guard: annual-only views are suppressed for non-annual reports.
    if view_config.get('annual_only', False) and not is_annual:
        return None

    # Slicing naturally caps at len(periods), so no explicit min() is needed.
    shown = periods[:view_config['max_periods']]
    return {
        'name': view_config['name'],
        'description': view_config['description'],
        'period_keys': [entry['key'] for entry in shown]
    }
|
||||
|
||||
|
||||
def generate_mixed_view(view_config: Dict[str, Any], ytd_periods: List[Dict],
                        quarterly_periods: List[Dict]) -> Optional[Dict[str, Any]]:
    """Generate a mixed view combining YTD and quarterly periods.

    The most recent YTD period comes first, followed by up to four recent
    quarters (skipping any key already present).

    Args:
        view_config: Configuration for the view
        ytd_periods: List of year-to-date periods
        quarterly_periods: List of quarterly periods

    Returns:
        Dictionary with view configuration if valid, None otherwise
    """
    # Both kinds of periods must be present for a mixed view.
    if not (ytd_periods and quarterly_periods):
        return None

    # Start with the current YTD key.
    selected_keys = [ytd_periods[0]['key']]

    # Append up to four recent quarters, avoiding duplicate keys.
    for quarter in quarterly_periods[:4]:
        key = quarter['key']
        if key not in selected_keys:
            selected_keys.append(key)

    if len(selected_keys) < view_config['requires_min_periods']:
        return None

    return {
        'name': view_config['name'],
        'description': view_config['description'],
        'period_keys': selected_keys[:view_config['max_periods']]
    }
|
||||
|
||||
|
||||
def get_period_views(xbrl_instance, statement_type: str) -> List[Dict[str, Any]]:
    """
    Get available period views for a statement type.

    Looks up the statement's configuration in STATEMENT_TYPE_CONFIG, filters
    and sorts the instance's reporting periods, and builds each configured
    view via generate_period_view / generate_mixed_view.

    Args:
        xbrl_instance: XBRL instance with context and entity information
        statement_type: Type of statement to get period views for

    Returns:
        List of period view options with name, description, and period keys
    """
    period_views = []

    # Get statement configuration; unknown statement types have no views.
    config = STATEMENT_TYPE_CONFIG.get(statement_type)
    if not config:
        return period_views

    # Get useful entity info for period selection
    entity_info = xbrl_instance.entity_info
    fiscal_period_focus = entity_info.get('fiscal_period')
    annual_report = fiscal_period_focus == 'FY'

    # Get all periods
    all_periods = xbrl_instance.reporting_periods
    document_period_end_date = xbrl_instance.period_of_report

    # Filter and sort periods by type
    period_type = config['period_type']
    periods = filter_periods_by_type(all_periods, period_type)
    # Filter by document period end date to exclude periods after the reporting period
    periods = filter_periods_by_document_end_date(periods, document_period_end_date, period_type)
    periods = sort_periods(periods, period_type)

    # If this statement type allows annual comparison and this is an annual report,
    # filter for annual periods (those whose end date aligns with the entity's
    # fiscal year end, per calculate_fiscal_alignment_score).
    annual_periods = []
    if config.get('allow_annual_comparison') and annual_report:
        fiscal_month = entity_info.get('fiscal_year_end_month')
        fiscal_day = entity_info.get('fiscal_year_end_day')

        if fiscal_month is not None and fiscal_day is not None:
            for period in periods:
                try:
                    # Instant periods carry 'date'; duration periods 'end_date'.
                    date_field = 'date' if period_type == 'instant' else 'end_date'
                    end_date = datetime.strptime(period[date_field], '%Y-%m-%d').date()
                    score = calculate_fiscal_alignment_score(end_date, fiscal_month, fiscal_day)
                    if score > 0:  # Any alignment is good enough for a view
                        annual_periods.append(period)
                except (ValueError, TypeError):
                    # Skip periods with unparseable dates.
                    continue

    # Generate views based on configuration
    for view_config in config.get('views', []):
        if view_config.get('mixed_view'):
            # Special handling for mixed YTD/quarterly views.
            # NOTE(review): relies on 'ytd'/'quarterly' flags being set on the
            # period dicts upstream — not visible here; confirm with callers.
            ytd_periods = [p for p in periods if p.get('ytd')]
            quarterly_periods = [p for p in periods if p.get('quarterly')]
            view = generate_mixed_view(view_config, ytd_periods, quarterly_periods)
        elif view_config.get('annual_only'):
            # Views that should only show annual periods
            view = generate_period_view(view_config, annual_periods, annual_report)
        else:
            # Standard views using all periods
            view = generate_period_view(view_config, periods, annual_report)

        # Helpers return None when a view can't be built with available periods.
        if view:
            period_views.append(view)

    return period_views
|
||||
|
||||
def determine_periods_to_display(
        xbrl_instance,
        statement_type: str,
        period_filter: Optional[str] = None,
        period_view: Optional[str] = None
) -> List[Tuple[str, str]]:
    """
    Determine which periods should be displayed for a statement.

    Uses smart period selection, which balances investor needs
    with data availability for optimal financial analysis.

    Resolution order: an explicit period_filter wins; then a named
    period_view; then the unified period selector; finally the legacy
    statement-type-specific heuristics below.

    Args:
        xbrl_instance: XBRL instance with context and entity information
        statement_type: Type of statement ('BalanceSheet', 'IncomeStatement', etc.)
        period_filter: Optional period key to filter by specific reporting period
        period_view: Optional name of a predefined period view

    Returns:
        List of tuples with period keys and labels to display
    """
    periods_to_display = []

    # If a specific period is requested, use only that
    if period_filter:
        for period in xbrl_instance.reporting_periods:
            if period['key'] == period_filter:
                periods_to_display.append((period_filter, period['label']))
                break
        return periods_to_display

    # If a period view is specified, use that
    if period_view:
        available_views = get_period_views(xbrl_instance, statement_type)
        matching_view = next((view for view in available_views if view['name'] == period_view), None)

        if matching_view:
            for period_key in matching_view['period_keys']:
                for period in xbrl_instance.reporting_periods:
                    if period['key'] == period_key:
                        periods_to_display.append((period_key, period['label']))
                        break
        return periods_to_display

    # Use unified period selection system with fallback to legacy logic
    try:
        from edgar.xbrl.period_selector import select_periods
        return select_periods(xbrl_instance, statement_type)
    except Exception as e:
        # Log the error and fall back to legacy logic
        import logging
        logging.warning("Unified period selection failed, using legacy logic: %s", e)
        # Continue to legacy logic below

    # If no specific periods requested, use default logic based on statement type
    all_periods = xbrl_instance.reporting_periods
    entity_info = xbrl_instance.entity_info
    fiscal_period_focus = entity_info.get('fiscal_period')
    document_period_end_date = xbrl_instance.period_of_report

    # Filter periods by statement type
    if statement_type == 'BalanceSheet':
        instant_periods = filter_periods_by_type(all_periods, 'instant')
        # Filter by document period end date to exclude periods after the reporting period
        instant_periods = filter_periods_by_document_end_date(instant_periods, document_period_end_date, 'instant')
        instant_periods = sort_periods(instant_periods, 'instant')

        # Get fiscal information for better period matching
        fiscal_period_focus = entity_info.get('fiscal_period')
        fiscal_year_focus = entity_info.get('fiscal_year')
        fiscal_year_end_month = entity_info.get('fiscal_year_end_month')
        fiscal_year_end_day = entity_info.get('fiscal_year_end_day')

        if instant_periods:
            # Take latest instant period that is not later than document_period_end_date
            current_period = instant_periods[0]  # Most recent
            period_key = current_period['key']
            periods_to_display.append((period_key, current_period['label']))

            # Try to find appropriate comparison period
            try:
                current_date = datetime.strptime(current_period['date'], '%Y-%m-%d').date()

                # Use fiscal information if available for better matching
                if fiscal_year_end_month is not None and fiscal_year_end_day is not None:
                    # Check if this is a fiscal year end report
                    is_fiscal_year_end = False
                    if fiscal_period_focus == 'FY' or (
                            current_date.month == fiscal_year_end_month and
                            abs(current_date.day - fiscal_year_end_day) <= 7):
                        is_fiscal_year_end = True

                    if is_fiscal_year_end and fiscal_year_focus:
                        # For fiscal year end, find the previous fiscal year end period
                        prev_fiscal_year = (int(fiscal_year_focus) - 1
                                            if isinstance(fiscal_year_focus, (int, str)) and
                                            str(fiscal_year_focus).isdigit()
                                            else current_date.year - 1)

                        # Look for a comparable period from previous fiscal year
                        for period in instant_periods[1:]:  # Skip the current one
                            try:
                                period_date = datetime.strptime(period['date'], '%Y-%m-%d').date()
                                # Check if this period is from the previous fiscal year and around fiscal year end
                                if (period_date.year == prev_fiscal_year and
                                        period_date.month == fiscal_year_end_month and
                                        abs(period_date.day - fiscal_year_end_day) <= 15):
                                    periods_to_display.append((period['key'], period['label']))
                                    break
                            except (ValueError, TypeError):
                                continue

                # If no appropriate period found yet, try generic date-based comparison
                if len(periods_to_display) == 1:
                    # Look for a period from previous year with similar date pattern
                    prev_year = current_date.year - 1

                    for period in instant_periods[1:]:  # Skip the current one
                        try:
                            period_date = datetime.strptime(period['date'], '%Y-%m-%d').date()
                            # If from previous year with similar month/day
                            if period_date.year == prev_year:
                                periods_to_display.append((period['key'], period['label']))
                                break
                        except (ValueError, TypeError):
                            continue

                # Only add additional comparable periods (up to a total of 3)
                # For annual reports, only add periods that are also fiscal year ends
                is_annual_report = (fiscal_period_focus == 'FY')
                added_period_keys = [key for key, _ in periods_to_display]

                for period in instant_periods[1:]:  # Skip current period
                    if len(periods_to_display) >= 3:
                        break  # Stop when we have 3 periods

                    # For annual reports, only add periods that are fiscal year ends
                    # ENHANCED: Ensure we're selecting true annual period ends, not quarterly
                    if is_annual_report and fiscal_year_end_month is not None and fiscal_year_end_day is not None:
                        try:
                            # Check if this period is close to the fiscal year end
                            period_date = datetime.strptime(period['date'], '%Y-%m-%d').date()

                            # STRICT CHECK: For annual reports, be more selective
                            # The period should be within a reasonable range of fiscal year end
                            is_fiscal_year_end = (
                                period_date.month == fiscal_year_end_month and
                                abs(period_date.day - fiscal_year_end_day) <= 15  # Allow some flexibility
                            )

                            # Additional check: Ensure this is approximately 1 year before previous periods
                            if is_fiscal_year_end and len(periods_to_display) > 0:
                                # NOTE(review): assumes the period key embeds its date
                                # after the last underscore — confirm key format upstream.
                                prev_date_str = (periods_to_display[-1][0].split('_')[-1]
                                                 if '_' in periods_to_display[-1][0] else None)
                                if prev_date_str:
                                    try:
                                        prev_date = datetime.strptime(prev_date_str, '%Y-%m-%d').date()
                                        year_diff = abs((prev_date - period_date).days)
                                        # Should be approximately 365 days apart (allow 350-380 range)
                                        if not (350 <= year_diff <= 380):
                                            is_fiscal_year_end = False
                                    except (ValueError, TypeError):
                                        pass

                            # Only include this period if it's a fiscal year end
                            if not is_fiscal_year_end:
                                continue  # Skip non-fiscal-year-end periods
                        except (ValueError, TypeError):
                            continue  # Skip periods with invalid dates

                    # Don't add periods we've already added
                    period_key = period['key']
                    if period_key not in added_period_keys:
                        periods_to_display.append((period_key, period['label']))

            except (ValueError, TypeError):
                # If date parsing failed, still try to select appropriate periods
                # For annual reports, we should only show fiscal year end periods
                is_annual_report = (fiscal_period_focus == 'FY')

                added_count = 0
                for i, period in enumerate(instant_periods):
                    if i == 0:
                        continue  # Skip first period which should already be added

                    if added_count >= 2:  # Already added 2 more (for a total of 3)
                        break

                    # For annual reports, only add periods that are close to fiscal year end
                    if (is_annual_report and fiscal_year_end_month is not None and
                            fiscal_year_end_day is not None):
                        try:
                            period_date = datetime.strptime(period['date'], '%Y-%m-%d').date()
                            # Only add periods close to fiscal year end
                            if (period_date.month != fiscal_year_end_month or
                                    abs(period_date.day - fiscal_year_end_day) > 15):
                                continue  # Skip periods that aren't fiscal year ends
                        except (ValueError, TypeError):
                            continue  # Skip periods with invalid dates

                    periods_to_display.append((period['key'], period['label']))
                    added_count += 1

    elif statement_type in ['IncomeStatement', 'CashFlowStatement']:
        duration_periods = filter_periods_by_type(all_periods, 'duration')
        # Filter by document period end date to exclude periods after the reporting period
        duration_periods = filter_periods_by_document_end_date(duration_periods, document_period_end_date, 'duration')
        duration_periods = sort_periods(duration_periods, 'duration')
        if duration_periods:
            # For annual reports, prioritize annual periods
            if fiscal_period_focus == 'FY':
                # Get fiscal year end information if available
                fiscal_year_end_month = entity_info.get('fiscal_year_end_month')
                fiscal_year_end_day = entity_info.get('fiscal_year_end_day')

                # First pass: Find all periods that are approximately a year long
                # CRITICAL FIX: Apply strict duration filtering to ensure we only get annual periods
                # Some facts are marked as FY but are actually quarterly (90 days vs 363+ days)
                candidate_annual_periods = []
                for period in duration_periods:
                    try:
                        start_date = datetime.strptime(period['start_date'], '%Y-%m-%d').date()
                        end_date = datetime.strptime(period['end_date'], '%Y-%m-%d').date()
                        days = (end_date - start_date).days
                        # STRICT CHECK: Annual periods must be > 300 days
                        # This filters out quarterly periods incorrectly marked as FY
                        if days > 300:  # Truly annual period (not quarterly)
                            # Add a score to each period for later sorting
                            # Default score is 0 (will be increased for fiscal year matches)
                            period_with_score = period.copy()
                            period_with_score['fiscal_alignment_score'] = 0
                            period_with_score['duration_days'] = days  # Store for debugging
                            candidate_annual_periods.append(period_with_score)
                    except (ValueError, TypeError):
                        continue

                # Second pass: Score periods based on alignment with fiscal year pattern.
                # CONSISTENCY: reuse calculate_fiscal_alignment_score, which applies
                # exactly the same 100/75/50/0 thresholds this loop previously inlined.
                if fiscal_year_end_month is not None and fiscal_year_end_day is not None:
                    for period in candidate_annual_periods:
                        try:
                            end_date = datetime.strptime(period['end_date'], '%Y-%m-%d').date()
                            period['fiscal_alignment_score'] = calculate_fiscal_alignment_score(
                                end_date, fiscal_year_end_month, fiscal_year_end_day)
                        except (ValueError, TypeError):
                            continue

                # Sort periods by fiscal alignment (higher score first) and then by recency (end date)
                annual_periods = sorted(
                    candidate_annual_periods,
                    key=lambda x: (x['fiscal_alignment_score'], x['end_date']),
                    reverse=True  # Highest score and most recent first
                )

                if annual_periods:
                    # Take up to 3 best matching annual periods (prioritizing fiscal year alignment)
                    for period in annual_periods[:3]:
                        periods_to_display.append((period['key'], period['label']))
                    return periods_to_display

            # For quarterly reports, apply intelligent period selection
            else:
                # First, categorize periods by duration to identify meaningful financial periods
                quarterly_periods = []  # 85-95 days (one quarter)
                ytd_periods = []  # 175-185 days (two quarters), 265-275 days (three quarters)
                annual_periods = []  # 350-380 days (full year for comparisons)

                current_year = None
                if document_period_end_date:
                    try:
                        current_year = datetime.strptime(document_period_end_date, '%Y-%m-%d').year
                    except (ValueError, TypeError):
                        pass

                # Categorize all duration periods by their length
                # ENHANCED: More strict duration checking to avoid misclassification
                for period in duration_periods:
                    try:
                        start_date = datetime.strptime(period['start_date'], '%Y-%m-%d').date()
                        end_date = datetime.strptime(period['end_date'], '%Y-%m-%d').date()
                        days = (end_date - start_date).days

                        # Skip single-day or very short periods (less than 30 days)
                        if days < 30:
                            continue

                        # Categorize by duration with stricter checks
                        if 80 <= days <= 100:  # Quarterly period (~90 days), slightly wider range
                            period['period_type'] = 'quarterly'
                            period['days'] = days
                            quarterly_periods.append(period)
                        elif 170 <= days <= 190:  # Semi-annual/YTD for Q2 (~180 days)
                            period['period_type'] = 'semi-annual'
                            period['days'] = days
                            ytd_periods.append(period)
                        elif 260 <= days <= 280:  # YTD for Q3 (~270 days)
                            period['period_type'] = 'three-quarters'
                            period['days'] = days
                            ytd_periods.append(period)
                        elif days > 300:  # Annual period for comparisons (strict check)
                            period['period_type'] = 'annual'
                            period['days'] = days
                            annual_periods.append(period)
                    except (ValueError, TypeError):
                        continue

                # Build the optimal set of periods for quarterly reporting
                selected_periods = []

                # 1. Add the most recent quarterly period (current quarter)
                if quarterly_periods:
                    # Find the most recent quarterly period
                    recent_quarterly = quarterly_periods[0]  # Already sorted by end date
                    selected_periods.append(recent_quarterly)

                    # Try to find the same quarter from previous year for comparison
                    if current_year:
                        for qp in quarterly_periods[1:]:
                            try:
                                qp_end = datetime.strptime(qp['end_date'], '%Y-%m-%d').date()
                                recent_end = datetime.strptime(recent_quarterly['end_date'], '%Y-%m-%d').date()
                                # Same quarter, previous year (within 15 days tolerance)
                                if (qp_end.year == current_year - 1 and
                                        qp_end.month == recent_end.month and
                                        abs(qp_end.day - recent_end.day) <= 15):
                                    selected_periods.append(qp)
                                    break
                            except (ValueError, TypeError):
                                continue

                # 2. Add the most recent YTD period if available
                if ytd_periods:
                    # Find the YTD period that ends closest to the document period end
                    selected_periods.append(ytd_periods[0])

                # 3. If we don't have enough periods yet, add more quarterly periods
                if len(selected_periods) < 3:
                    for period in quarterly_periods:
                        if period not in selected_periods and len(selected_periods) < 3:
                            selected_periods.append(period)

                # 4. If still not enough, consider annual periods for year-over-year comparison
                if len(selected_periods) < 3 and annual_periods:
                    for period in annual_periods:
                        if len(selected_periods) < 3:
                            selected_periods.append(period)

                # Convert selected periods to display format
                for period in selected_periods[:3]:  # Limit to 3 periods
                    periods_to_display.append((period['key'], period['label']))

    # For other statement types (not covered by specific logic above)
    else:
        # Get configuration for this statement type, or use defaults
        statement_info = STATEMENT_TYPE_CONFIG.get(statement_type, {})

        if not statement_info:
            # For unknown statement types, determine preferences based on fiscal period
            if fiscal_period_focus == 'FY':
                # For annual reports, prefer duration periods and show comparisons
                statement_info = {
                    'period_type': 'duration',
                    'max_periods': 3,
                    'allow_annual_comparison': True
                }
            else:
                # For interim reports, accept either type but limit to current period
                statement_info = {
                    'period_type': 'either',
                    'max_periods': 1,
                    'allow_annual_comparison': False
                }

        # Select periods based on determined preferences
        period_type = statement_info.get('period_type', 'either')
        max_periods = statement_info.get('max_periods', 1)

        if period_type == 'instant' or period_type == 'either':
            instant_periods = filter_periods_by_type(all_periods, 'instant')
            instant_periods = filter_periods_by_document_end_date(instant_periods, document_period_end_date, 'instant')
            instant_periods = sort_periods(instant_periods, 'instant')
            if instant_periods:
                for period in instant_periods[:max_periods]:
                    periods_to_display.append((period['key'], period['label']))

        # Duration fallback: used for duration statements, or for 'either'
        # when no instant periods were found above.
        if (period_type == 'duration' or (period_type == 'either' and not periods_to_display)):
            duration_periods = filter_periods_by_type(all_periods, 'duration')
            duration_periods = filter_periods_by_document_end_date(duration_periods, document_period_end_date, 'duration')
            duration_periods = sort_periods(duration_periods, 'duration')
            if duration_periods:
                for period in duration_periods[:max_periods]:
                    periods_to_display.append((period['key'], period['label']))

    return periods_to_display
|
||||
1821
venv/lib/python3.10/site-packages/edgar/xbrl/rendering.py
Normal file
1821
venv/lib/python3.10/site-packages/edgar/xbrl/rendering.py
Normal file
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,44 @@
|
||||
# XBRL2 Standardization
|
||||
|
||||
This package provides functionality for standardizing XBRL concepts across different company filings.
|
||||
|
||||
## Overview
|
||||
|
||||
The standardization module maps company-specific XBRL concepts to standardized concept names,
|
||||
enabling consistent presentation of financial statements regardless of the filing entity.
|
||||
|
||||
This is particularly useful for:
|
||||
- Comparing financial data across different companies
|
||||
- Building standardized reports and visualizations
|
||||
- Creating consistent financial datasets for analysis
|
||||
|
||||
## Components
|
||||
|
||||
- `StandardConcept`: An enumeration of standard financial statement concepts
|
||||
- `MappingStore`: Storage for mappings between company-specific and standard concepts
|
||||
- `ConceptMapper`: Maps company-specific concepts to standard concepts using various techniques
|
||||
- `standardize_statement`: Function to standardize a statement's labels
|
||||
|
||||
## Usage
|
||||
|
||||
```python
|
||||
from edgar.xbrl.standardization import StandardConcept, initialize_default_mappings, ConceptMapper,
|
||||
standardize_statement
|
||||
|
||||
# Get the default mappings
|
||||
store = initialize_default_mappings()
|
||||
|
||||
# Create a mapper
|
||||
mapper = ConceptMapper(store)
|
||||
|
||||
# Standardize a statement
|
||||
standardized_data = standardize_statement(statement_data, mapper)
|
||||
```
|
||||
|
||||
## Concept Mappings
|
||||
|
||||
The standardized concept mappings are stored in the `concept_mappings.json` file included
|
||||
in this package. This file maps standard concept names to lists of company-specific concept IDs.
|
||||
|
||||
The file is automatically loaded when initializing the `MappingStore` and can be extended
|
||||
with new mappings as needed.
|
||||
@@ -0,0 +1,17 @@
|
||||
"""
|
||||
XBRL concept standardization package.
|
||||
|
||||
This package provides functionality to map company-specific XBRL concepts
|
||||
to standardized concept names, enabling consistent presentation of financial
|
||||
statements regardless of the filing entity.
|
||||
"""
|
||||
|
||||
from edgar.xbrl.standardization.core import ConceptMapper, MappingStore, StandardConcept, initialize_default_mappings, standardize_statement
|
||||
|
||||
__all__ = [
|
||||
'StandardConcept',
|
||||
'MappingStore',
|
||||
'ConceptMapper',
|
||||
'standardize_statement',
|
||||
'initialize_default_mappings'
|
||||
]
|
||||
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,21 @@
|
||||
{
|
||||
"concept_mappings": {
|
||||
"Sales and Service Revenue": [
|
||||
"brka_SalesAndServiceRevenue"
|
||||
]
|
||||
},
|
||||
"hierarchy_rules": {
|
||||
"Revenue": {
|
||||
"components": [
|
||||
"Sales and Service Revenue",
|
||||
"Operating Lease Revenue"
|
||||
],
|
||||
"description": "Total revenue comprises sales/service revenue and operating lease income for holding company"
|
||||
}
|
||||
},
|
||||
"business_context": {
|
||||
"entity_type": "holding_company",
|
||||
"industry": "diversified_conglomerate",
|
||||
"description": "Berkshire Hathaway operates diverse businesses including insurance, utilities, railroads, and manufacturing"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,64 @@
|
||||
{
|
||||
"entity_info": {
|
||||
"name": "Microsoft Corporation",
|
||||
"cik": "0000789019",
|
||||
"ticker": "MSFT",
|
||||
"description": "Microsoft-specific concept mappings for unique business terminology"
|
||||
},
|
||||
|
||||
"concept_mappings": {
|
||||
"_comment_msft_revenue": "Microsoft uses specific revenue categorization that differs from standard tech companies",
|
||||
|
||||
"Product Revenue": [
|
||||
"msft_ProductRevenue",
|
||||
"msft_WindowsCommercialRevenue",
|
||||
"msft_WindowsConsumerRevenue",
|
||||
"msft_OfficeCommercialRevenue"
|
||||
],
|
||||
|
||||
"Service Revenue": [
|
||||
"msft_ServiceRevenue",
|
||||
"msft_CloudServicesRevenue",
|
||||
"msft_ConsultingServicesRevenue"
|
||||
],
|
||||
|
||||
"Subscription Revenue": [
|
||||
"msft_Office365CommercialRevenue",
|
||||
"msft_Office365ConsumerRevenue",
|
||||
"msft_DynamicsRevenue"
|
||||
],
|
||||
|
||||
"Platform Revenue": [
|
||||
"msft_AzureRevenue",
|
||||
"msft_XboxContentAndServicesRevenue"
|
||||
],
|
||||
|
||||
"_comment_msft_expenses": "Microsoft has unique expense categorizations for sales and marketing vs G&A",
|
||||
|
||||
"Sales and Marketing Expense": [
|
||||
"msft_SalesAndMarketingExpense",
|
||||
"msft_AdvertisingAndPromotionExpense"
|
||||
],
|
||||
|
||||
"Technical Support Expense": [
|
||||
"msft_TechnicalSupportExpense",
|
||||
"msft_CustomerSupportExpense"
|
||||
]
|
||||
},
|
||||
|
||||
"hierarchy_rules": {
|
||||
"_comment": "Rules for handling Microsoft-specific hierarchical relationships",
|
||||
|
||||
"revenue_hierarchy": {
|
||||
"parent": "Revenue",
|
||||
"children": ["Product Revenue", "Service Revenue", "Subscription Revenue", "Platform Revenue"],
|
||||
"calculation_rule": "sum"
|
||||
},
|
||||
|
||||
"expense_hierarchy": {
|
||||
"parent": "Operating Expenses",
|
||||
"children": ["Sales and Marketing Expense", "Technical Support Expense"],
|
||||
"calculation_rule": "sum"
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,54 @@
|
||||
{
|
||||
"metadata": {
|
||||
"entity_identifier": "tsla",
|
||||
"company_name": "Tesla, Inc.",
|
||||
"cik": "1318605",
|
||||
"priority": "high",
|
||||
"created_date": "2024-06-25",
|
||||
"last_updated": "2024-06-25",
|
||||
"description": "Tesla-specific concept mappings to handle automotive, energy, and service revenue streams"
|
||||
},
|
||||
"concept_mappings": {
|
||||
"Automotive Revenue": [
|
||||
"tsla_AutomotiveRevenue",
|
||||
"tsla_AutomotiveSales",
|
||||
"tsla_VehicleRevenue"
|
||||
],
|
||||
"Automotive Leasing Revenue": [
|
||||
"tsla_AutomotiveLeasing",
|
||||
"tsla_AutomotiveLeasingRevenue",
|
||||
"tsla_VehicleLeasingRevenue"
|
||||
],
|
||||
"Energy Revenue": [
|
||||
"tsla_EnergyGenerationAndStorageRevenue",
|
||||
"tsla_EnergyRevenue",
|
||||
"tsla_SolarRevenue",
|
||||
"tsla_EnergyStorageRevenue"
|
||||
],
|
||||
"Service Revenue": [
|
||||
"tsla_ServicesAndOtherRevenue",
|
||||
"tsla_ServiceRevenue",
|
||||
"tsla_SuperchargerRevenue"
|
||||
]
|
||||
},
|
||||
"hierarchy_rules": {
|
||||
"Revenue": {
|
||||
"children": [
|
||||
"Automotive Revenue",
|
||||
"Energy Revenue",
|
||||
"Service Revenue"
|
||||
]
|
||||
},
|
||||
"Automotive Revenue": {
|
||||
"children": [
|
||||
"Automotive Leasing Revenue"
|
||||
]
|
||||
}
|
||||
},
|
||||
"business_context": {
|
||||
"primary_revenue_streams": ["automotive", "energy", "services"],
|
||||
"revenue_model": "product_and_service",
|
||||
"key_metrics": ["vehicle_deliveries", "energy_deployments"],
|
||||
"industry": "automotive_technology"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,353 @@
|
||||
{
|
||||
"_comment_revenue_hierarchy": "REVENUE HIERARCHY FIX: Separated total revenue from component revenue types to prevent duplicate labels. Contract and product revenue are components that should have distinct labels from total revenue.",
|
||||
"Revenue": [
|
||||
"us-gaap_Revenue",
|
||||
"us-gaap_Revenues",
|
||||
"us-gaap_SalesRevenueNet",
|
||||
"us-gaap_OperatingRevenue"
|
||||
],
|
||||
"Contract Revenue": [
|
||||
"us-gaap_RevenueFromContractWithCustomerExcludingAssessedTax",
|
||||
"us-gaap_RevenueFromContractWithCustomerIncludingAssessedTax"
|
||||
],
|
||||
"Product Revenue": [
|
||||
"us-gaap_SalesRevenueGoodsNet",
|
||||
"us-gaap_ProductSales"
|
||||
],
|
||||
"Operating Lease Revenue": [
|
||||
"us-gaap_OperatingLeaseLeaseIncome"
|
||||
],
|
||||
"_comment_cost_of_revenue_hierarchy": "COST OF REVENUE HIERARCHY FIX: Separated different cost types to prevent duplicate labels. Different business models (manufacturing, service, mixed) use different cost concepts that should have distinct labels for clarity.",
|
||||
"Cost of Revenue": [
|
||||
"us-gaap_CostOfRevenueAbstract"
|
||||
],
|
||||
"Total Cost of Revenue": [
|
||||
"us-gaap_CostOfRevenue"
|
||||
],
|
||||
"Cost of Goods Sold": [
|
||||
"us-gaap_CostOfGoodsSold"
|
||||
],
|
||||
"Cost of Goods and Services Sold": [
|
||||
"us-gaap_CostOfGoodsAndServicesSold"
|
||||
],
|
||||
"Cost of Sales": [
|
||||
"us-gaap_CostOfSales"
|
||||
],
|
||||
"Cost of Goods and Services Excluding Depreciation": [
|
||||
"us-gaap_CostOfGoodsAndServiceExcludingDepreciationDepletionAndAmortization"
|
||||
],
|
||||
"Direct Operating Costs": [
|
||||
"us-gaap_DirectOperatingCosts"
|
||||
],
|
||||
"Costs and Expenses": [
|
||||
"us-gaap_CostsAndExpenses"
|
||||
],
|
||||
"Gross Profit": [
|
||||
"us-gaap_GrossProfit"
|
||||
],
|
||||
"Operating Expenses": [
|
||||
"us-gaap_NoninterestExpense",
|
||||
"us-gaap_OperatingCostsAndExpenses",
|
||||
"us-gaap_OperatingExpenses"
|
||||
],
|
||||
"Research and Development Expense": [
|
||||
"us-gaap_ResearchAndDevelopmentCosts",
|
||||
"us-gaap_ResearchAndDevelopmentExpense"
|
||||
],
|
||||
"_comment_sga_hierarchy": "SG&A HIERARCHY FIX: Separated total SG&A from components to prevent duplicate labels. Previously all three concepts below mapped to 'Selling, General and Administrative Expense' causing confusion when companies report both total and components.",
|
||||
"Selling, General and Administrative Expense": [
|
||||
"us-gaap_SellingGeneralAndAdministrativeExpense"
|
||||
],
|
||||
"General and Administrative Expense": [
|
||||
"us-gaap_GeneralAndAdministrativeExpense",
|
||||
"us-gaap_AdministrativeExpense"
|
||||
],
|
||||
"Selling Expense": [
|
||||
"us-gaap_SellingAndMarketingExpense",
|
||||
"us-gaap_SellingExpense"
|
||||
],
|
||||
"Marketing Expense": [
|
||||
"us-gaap_MarketingExpense",
|
||||
"us-gaap_AdvertisingExpense"
|
||||
],
|
||||
"Operating Income": [
|
||||
"us-gaap_OperatingIncomeLoss",
|
||||
"us-gaap_OperatingIncome",
|
||||
"us-gaap_IncomeLossFromContinuingOperationsBeforeInterestAndTaxes"
|
||||
],
|
||||
"Nonoperating Income/Expense": [
|
||||
"orcl_NonoperatingIncomeExpenseIncludingEliminationOfNetIncomeLossAttributableToNoncontrollingInterests",
|
||||
"us-gaap_NonoperatingIncomeExpense"
|
||||
],
|
||||
"Interest Expense": [
|
||||
"us-gaap_InterestAndDebtExpense",
|
||||
"us-gaap_InterestExpense",
|
||||
"us-gaap_InterestIncomeExpenseNet"
|
||||
],
|
||||
"Interest Expense (operating)": [
|
||||
"us-gaap_InterestExpenseOperating"
|
||||
],
|
||||
"Interest Expense (non-operating)": [
|
||||
"us-gaap_InterestExpenseNonoperating"
|
||||
],
|
||||
"_comment_income_before_tax_hierarchy": "INCOME BEFORE TAX HIERARCHY FIX: Separated total income before tax from component types to prevent duplicate labels. Continuing operations and extraordinary items are components that should have distinct labels.",
|
||||
"Income Before Tax": [
|
||||
"us-gaap_IncomeLossBeforeIncomeTaxes"
|
||||
],
|
||||
"Income Before Tax from Continuing Operations": [
|
||||
"us-gaap_IncomeLossFromContinuingOperationsBeforeIncomeTaxes",
|
||||
"us-gaap_IncomeLossFromContinuingOperationsBeforeIncomeTaxesExtraordinaryItemsNoncontrollingInterest",
|
||||
"orcl_IncomeLossFromContinuingOperationsIncludingNoncontrollingInterestBeforeIncomeTaxesExtraordinaryItems"
|
||||
],
|
||||
"Income Tax Expense": [
|
||||
"us-gaap_IncomeTaxesPaidNet",
|
||||
"us-gaap_IncomeTaxExpenseBenefit"
|
||||
],
|
||||
"_comment_net_income_hierarchy": "NET INCOME HIERARCHY FIX: Separated total net income from component income types to prevent duplicate labels. Continuing operations income and profit/loss are components that should have distinct labels from total net income.",
|
||||
"Net Income": [
|
||||
"us-gaap_NetIncome",
|
||||
"us-gaap_NetIncomeLoss"
|
||||
],
|
||||
"Net Income from Continuing Operations": [
|
||||
"us-gaap_IncomeLossFromContinuingOperationsIncludingPortionAttributableToNoncontrollingInterest",
|
||||
"us-gaap_IncomeLossFromContinuingOperations"
|
||||
],
|
||||
"Profit or Loss": [
|
||||
"us-gaap_ProfitLoss"
|
||||
],
|
||||
"Net Income Attributable to Noncontrolling Interest": [
|
||||
"us-gaap_NetIncomeLossAttributableToNonredeemableNoncontrollingInterest",
|
||||
"us-gaap_NetIncomeLossAttributableToNoncontrollingInterest"
|
||||
],
|
||||
"Basic Net Income Available to Common Shareholders": [
|
||||
"us-gaap_NetIncomeLossAvailableToCommonStockholdersBasic"
|
||||
],
|
||||
"Diluted Net Income Available to Common Shareholders": [
|
||||
"us-gaap_NetIncomeLossAvailableToCommonStockholdersDiluted"
|
||||
],
|
||||
"Accumulated Other Comprehensive Income/Loss": [
|
||||
"us-gaap_AccumulatedOtherComprehensiveIncomeLossNetOfTax"
|
||||
],
|
||||
"Earnings Per Share": [
|
||||
"us-gaap_EarningsPerShareAbstract"
|
||||
],
|
||||
"Earnings Per Share (Basic)": [
|
||||
"us-gaap_EarningsPerShareBasic"
|
||||
],
|
||||
"Earnings Per Share (Diluted)": [
|
||||
"us-gaap_EarningsPerShareDiluted"
|
||||
],
|
||||
"Shares Outstanding": [
|
||||
"us-gaap_WeightedAverageNumberOfSharesOutstandingAbstract"
|
||||
],
|
||||
"Shares Outstanding (Basic)": [
|
||||
"us-gaap_WeightedAverageNumberOfSharesOutstandingBasic"
|
||||
],
|
||||
"Shares Outstanding (Diluted)": [
|
||||
"us-gaap_WeightedAverageNumberOfDilutedSharesOutstanding"
|
||||
],
|
||||
"Cash and Cash Equivalents": [
|
||||
"us-gaap_CashEquivalentsAtCarryingValue",
|
||||
"us-gaap_Cash",
|
||||
"us-gaap_CashAndCashEquivalentsAtCarryingValue",
|
||||
"us-gaap_CashCashEquivalentsAndShortTermInvestments"
|
||||
],
|
||||
"Accounts Receivable": [
|
||||
"us-gaap_AccountsReceivableNet",
|
||||
"us-gaap_ReceivablesNetCurrent",
|
||||
"us-gaap_AccountsReceivableNetCurrent",
|
||||
"us-gaap_AccountsReceivableGross"
|
||||
],
|
||||
"Inventory": [
|
||||
"us-gaap_InventoryGross",
|
||||
"us-gaap_InventoryFinishedGoods",
|
||||
"us-gaap_InventoryNet"
|
||||
],
|
||||
"Prepaid Expenses": [
|
||||
"us-gaap_PrepaidExpenseAndOtherAssetsCurrent",
|
||||
"us-gaap_PrepaidExpenseCurrent"
|
||||
],
|
||||
"Current Marketable Securities": [
|
||||
"us-gaap_AvailableForSaleSecuritiesDebtSecuritiesCurrent",
|
||||
"us-gaap_MarketableSecuritiesCurrent"
|
||||
],
|
||||
"Non Current Marketable Securities": [
|
||||
"us-gaap_MarketableSecuritiesNoncurrent"
|
||||
],
|
||||
"Total Current Assets": [
|
||||
"us-gaap_AssetsCurrent"
|
||||
],
|
||||
"Total Non Current Assets": [
|
||||
"us-gaap_AssetsNoncurrent"
|
||||
],
|
||||
"Property, Plant and Equipment": [
|
||||
"us-gaap_PropertyPlantAndEquipmentGross",
|
||||
"us-gaap_PropertyPlantAndEquipmentNet",
|
||||
"us-gaap_FixedAssets"
|
||||
],
|
||||
"Goodwill": [
|
||||
"us-gaap_Goodwill"
|
||||
],
|
||||
"Intangible Assets": [
|
||||
"us-gaap_IntangibleAssetsNetIncludingGoodwill",
|
||||
"us-gaap_IntangibleAssetsNetExcludingGoodwill",
|
||||
"us-gaap_FiniteLivedIntangibleAssetsNet"
|
||||
],
|
||||
"Total Assets": [
|
||||
"us-gaap_Assets",
|
||||
"us-gaap_AssetsTotal"
|
||||
],
|
||||
"Long-Term Investments": [
|
||||
"us-gaap_LongTermInvestments"
|
||||
],
|
||||
"Accounts Payable": [
|
||||
"us-gaap_AccountsPayableCurrent",
|
||||
"us-gaap_AccountsPayableTradeCurrent"
|
||||
],
|
||||
"Accrued Liabilities": [
|
||||
"us-gaap_OtherAccruedLiabilitiesCurrent",
|
||||
"us-gaap_AccruedLiabilitiesCurrent",
|
||||
"us-gaap_EmployeeRelatedLiabilitiesCurrent"
|
||||
],
|
||||
"Short Term Debt": [
|
||||
"us-gaap_DebtCurrent",
|
||||
"us-gaap_ShortTermBorrowings",
|
||||
"us-gaap_LongTermDebtCurrent"
|
||||
],
|
||||
"Total Current Liabilities": [
|
||||
"us-gaap_LiabilitiesCurrent"
|
||||
],
|
||||
"Total Non Current Liabilities": [
|
||||
"us-gaap_LiabilitiesNoncurrent"
|
||||
],
|
||||
"Long Term Debt": [
|
||||
"us-gaap_LongTermDebtAndCapitalLeaseObligations",
|
||||
"us-gaap_LongTermDebt",
|
||||
"us-gaap_LongTermBorrowings",
|
||||
"us-gaap_LongTermDebtNoncurrent"
|
||||
],
|
||||
"Notes Payable, Current": [
|
||||
"us-gaap_NotesPayableCurrent"
|
||||
],
|
||||
"Notes Payable, Non Current": [
|
||||
"us-gaap_LongTermNotesAndLoans"
|
||||
],
|
||||
"Deferred Revenue": [
|
||||
"us-gaap_DeferredRevenueNoncurrent",
|
||||
"us-gaap_DeferredRevenueCurrent",
|
||||
"us-gaap_DeferredRevenue"
|
||||
],
|
||||
"Total Liabilities": [
|
||||
"us-gaap_LiabilitiesTotal",
|
||||
"us-gaap_Liabilities"
|
||||
],
|
||||
"Common Stock Shares Outstanding": [
|
||||
"us-gaap_CommonStockSharesOutstanding"
|
||||
],
|
||||
"Common Stock Shares Issued": [
|
||||
"us-gaap_CommonStockSharesIssued"
|
||||
],
|
||||
"Common Stock": [
|
||||
"us-gaap_CommonStocksIncludingAdditionalPaidInCapital",
|
||||
"us-gaap_StockholdersEquityCommonStock",
|
||||
"us-gaap_CommonStockValue"
|
||||
],
|
||||
"Preferred Stock": [
|
||||
"us-gaap_PreferredStockValue"
|
||||
],
|
||||
"Treasury Stock Common Value": [
|
||||
"us-gaap_TreasuryStockCommonValue",
|
||||
"us-gaap_TreasuryStockValue"
|
||||
],
|
||||
"Retained Earnings": [
|
||||
"us-gaap_RetainedEarnings",
|
||||
"us-gaap_RetainedEarningsAccumulatedDeficit"
|
||||
],
|
||||
"Minority Interest": [
|
||||
"us-gaap_MinorityInterest",
|
||||
"us-gaap_NoncontrollingInterest"
|
||||
],
|
||||
"Total Stockholders' Equity": [
|
||||
"us-gaap_EquityAttributableToParent",
|
||||
"us-gaap_StockholdersEquity",
|
||||
"us-gaap_StockholdersEquityIncludingPortionAttributableToNoncontrollingInterest",
|
||||
"us-gaap_StockholdersEquityAttributableToParent"
|
||||
],
|
||||
"Total Liabilities and Stockholders' Equity": [
|
||||
"us-gaap_LiabilitiesAndStockholdersEquity"
|
||||
],
|
||||
"Net Cash from Operating Activities": [
|
||||
"us-gaap_NetCashProvidedByUsedInOperatingActivities",
|
||||
"us-gaap_NetCashProvidedByUsedInOperatingActivitiesContinuingOperations"
|
||||
],
|
||||
"Net Cash from Investing Activities": [
|
||||
"us-gaap_NetCashProvidedByUsedInInvestingActivities",
|
||||
"us-gaap_NetCashProvidedByUsedInInvestingActivitiesContinuingOperations"
|
||||
],
|
||||
"Net Cash from Financing Activities": [
|
||||
"us-gaap_NetCashProvidedByUsedInFinancingActivitiesContinuingOperations",
|
||||
"us-gaap_NetCashProvidedByUsedInFinancingActivities"
|
||||
],
|
||||
"Net Change in Cash": [
|
||||
"us-gaap_IncreaseDecreaseInCashAndCashEquivalents",
|
||||
"us-gaap_CashAndCashEquivalentsPeriodIncreaseDecrease",
|
||||
"us-gaap_CashCashEquivalentsRestrictedCashAndRestrictedCashEquivalentsPeriodIncreaseDecreaseIncludingExchangeRateEffect"
|
||||
],
|
||||
"Payments for Property, Plant and Equipment": [
|
||||
"us-gaap_PaymentsToAcquirePropertyPlantAndEquipment"
|
||||
],
|
||||
"Payments of Dividends": [
|
||||
"us-gaap_PaymentsOfDividends"
|
||||
],
|
||||
"Tax Withholding for Share-Based Compensation": [
|
||||
"us-gaap_PaymentsRelatedToTaxWithholdingForShareBasedCompensation"
|
||||
],
|
||||
"Payments to Acquire Businesses": [
|
||||
"us-gaap_PaymentsToAcquireBusinessesNetOfCashAcquired"
|
||||
],
|
||||
"Proceeds from Issuance of Common Stock": [
|
||||
"us-gaap_ProceedsFromIssuanceOfCommonStock"
|
||||
],
|
||||
"Proceeds from Issuance of Long-Term Debt": [
|
||||
"us-gaap_ProceedsFromIssuanceOfLongTermDebt"
|
||||
],
|
||||
"Proceeds from Maturities, Prepayments and Calls of Securities": [
|
||||
"us-gaap_ProceedsFromMaturitiesPrepaymentsAndCallsOfAvailableForSaleSecurities"
|
||||
],
|
||||
"Proceeds from Sale and Maturity of Other Investments": [
|
||||
"us-gaap_ProceedsFromSaleAndMaturityOfOtherInvestments"
|
||||
],
|
||||
"Proceeds from Sale of Debt Securities, ": [
|
||||
"us-gaap_ProceedsFromSaleOfAvailableForSaleSecuritiesDebt"
|
||||
],
|
||||
"Proceeds from (Repayments of) Commercial Paper": [
|
||||
"us-gaap_ProceedsFromRepaymentsOfCommercialPaper"
|
||||
],
|
||||
"Other Assets": [
|
||||
"us-gaap_OtherAssets"
|
||||
],
|
||||
"Other Current Assets": [
|
||||
"us-gaap_OtherAssetsCurrent"
|
||||
],
|
||||
"Other Non Current Assets": [
|
||||
"us-gaap_OtherAssetsNoncurrent"
|
||||
],
|
||||
"Deferred Tax Assets": [
|
||||
"us-gaap_DeferredIncomeTaxAssetsNet"
|
||||
],
|
||||
"Other Liabilities": [
|
||||
"us-gaap_OtherLiabilities"
|
||||
|
||||
],
|
||||
"Other Current Liabilities": [
|
||||
"us-gaap_OtherLiabilitiesCurrent"
|
||||
],
|
||||
"Other Non Current Liabilities": [
|
||||
"us-gaap_OtherLiabilitiesNoncurrent"
|
||||
],
|
||||
"Depreciation and Amortization": [
|
||||
"us-gaap_AmortizationOfIntangibleAssets",
|
||||
"us-gaap_Depreciation",
|
||||
"us-gaap_DepreciationAndAmortization"
|
||||
]
|
||||
}
|
||||
@@ -0,0 +1,817 @@
|
||||
"""
|
||||
Module for standardizing XBRL concepts across different company filings.
|
||||
|
||||
This module provides functionality to map company-specific XBRL concepts
|
||||
to standardized concept names, enabling consistent presentation of financial
|
||||
statements regardless of the filing entity.
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
from difflib import SequenceMatcher
|
||||
from enum import Enum
|
||||
from typing import Any, Dict, List, Optional, Set, Tuple
|
||||
|
||||
import pandas as pd
|
||||
|
||||
|
||||
class StandardConcept(str, Enum):
|
||||
"""
|
||||
Standardized concept names for financial statements.
|
||||
|
||||
The enum value (string) is the display label used for presentation.
|
||||
These labels should match keys in concept_mappings.json.
|
||||
"""
|
||||
# Balance Sheet - Assets
|
||||
CASH_AND_EQUIVALENTS = "Cash and Cash Equivalents"
|
||||
ACCOUNTS_RECEIVABLE = "Accounts Receivable"
|
||||
INVENTORY = "Inventory"
|
||||
PREPAID_EXPENSES = "Prepaid Expenses"
|
||||
TOTAL_CURRENT_ASSETS = "Total Current Assets"
|
||||
PROPERTY_PLANT_EQUIPMENT = "Property, Plant and Equipment"
|
||||
GOODWILL = "Goodwill"
|
||||
INTANGIBLE_ASSETS = "Intangible Assets"
|
||||
TOTAL_ASSETS = "Total Assets"
|
||||
|
||||
# Balance Sheet - Liabilities
|
||||
ACCOUNTS_PAYABLE = "Accounts Payable"
|
||||
ACCRUED_LIABILITIES = "Accrued Liabilities"
|
||||
SHORT_TERM_DEBT = "Short Term Debt"
|
||||
TOTAL_CURRENT_LIABILITIES = "Total Current Liabilities"
|
||||
LONG_TERM_DEBT = "Long Term Debt"
|
||||
DEFERRED_REVENUE = "Deferred Revenue"
|
||||
TOTAL_LIABILITIES = "Total Liabilities"
|
||||
|
||||
# Balance Sheet - Equity
|
||||
COMMON_STOCK = "Common Stock"
|
||||
RETAINED_EARNINGS = "Retained Earnings"
|
||||
TOTAL_EQUITY = "Total Stockholders' Equity"
|
||||
|
||||
# Income Statement - Revenue Hierarchy
|
||||
REVENUE = "Revenue"
|
||||
CONTRACT_REVENUE = "Contract Revenue"
|
||||
PRODUCT_REVENUE = "Product Revenue"
|
||||
SERVICE_REVENUE = "Service Revenue"
|
||||
SUBSCRIPTION_REVENUE = "Subscription Revenue"
|
||||
LEASING_REVENUE = "Leasing Revenue"
|
||||
|
||||
# Industry-Specific Revenue Concepts
|
||||
AUTOMOTIVE_REVENUE = "Automotive Revenue"
|
||||
AUTOMOTIVE_LEASING_REVENUE = "Automotive Leasing Revenue"
|
||||
ENERGY_REVENUE = "Energy Revenue"
|
||||
SOFTWARE_REVENUE = "Software Revenue"
|
||||
HARDWARE_REVENUE = "Hardware Revenue"
|
||||
PLATFORM_REVENUE = "Platform Revenue"
|
||||
|
||||
# Income Statement - Expenses
|
||||
COST_OF_REVENUE = "Cost of Revenue"
|
||||
COST_OF_GOODS_SOLD = "Cost of Goods Sold"
|
||||
COST_OF_GOODS_AND_SERVICES_SOLD = "Cost of Goods and Services Sold"
|
||||
COST_OF_SALES = "Cost of Sales"
|
||||
COSTS_AND_EXPENSES = "Costs and Expenses"
|
||||
DIRECT_OPERATING_COSTS = "Direct Operating Costs"
|
||||
GROSS_PROFIT = "Gross Profit"
|
||||
OPERATING_EXPENSES = "Operating Expenses"
|
||||
RESEARCH_AND_DEVELOPMENT = "Research and Development Expense"
|
||||
|
||||
# Enhanced Expense Hierarchy
|
||||
SELLING_GENERAL_ADMIN = "Selling, General and Administrative Expense"
|
||||
SELLING_EXPENSE = "Selling Expense"
|
||||
GENERAL_ADMIN_EXPENSE = "General and Administrative Expense"
|
||||
MARKETING_EXPENSE = "Marketing Expense"
|
||||
SALES_EXPENSE = "Sales Expense"
|
||||
|
||||
# Other Income Statement
|
||||
OPERATING_INCOME = "Operating Income"
|
||||
INTEREST_EXPENSE = "Interest Expense"
|
||||
INCOME_BEFORE_TAX = "Income Before Tax"
|
||||
INCOME_BEFORE_TAX_CONTINUING_OPS = "Income Before Tax from Continuing Operations"
|
||||
INCOME_TAX_EXPENSE = "Income Tax Expense"
|
||||
NET_INCOME = "Net Income"
|
||||
NET_INCOME_CONTINUING_OPS = "Net Income from Continuing Operations"
|
||||
NET_INCOME_NONCONTROLLING = "Net Income Attributable to Noncontrolling Interest"
|
||||
PROFIT_OR_LOSS = "Profit or Loss"
|
||||
|
||||
# Cash Flow Statement
|
||||
CASH_FROM_OPERATIONS = "Net Cash from Operating Activities"
|
||||
CASH_FROM_INVESTING = "Net Cash from Investing Activities"
|
||||
CASH_FROM_FINANCING = "Net Cash from Financing Activities"
|
||||
NET_CHANGE_IN_CASH = "Net Change in Cash"
|
||||
|
||||
@classmethod
|
||||
def get_from_label(cls, label: str) -> Optional['StandardConcept']:
|
||||
"""
|
||||
Get a StandardConcept enum by its label value.
|
||||
|
||||
Args:
|
||||
label: The label string to look up
|
||||
|
||||
Returns:
|
||||
The corresponding StandardConcept or None if not found
|
||||
"""
|
||||
for concept in cls:
|
||||
if concept.value == label:
|
||||
return concept
|
||||
return None
|
||||
|
||||
@classmethod
|
||||
def get_all_values(cls) -> Set[str]:
|
||||
"""
|
||||
Get all label values defined in the enum.
|
||||
|
||||
Returns:
|
||||
Set of all label strings
|
||||
"""
|
||||
return {concept.value for concept in cls}
|
||||
|
||||
|
||||
class MappingStore:
|
||||
"""
|
||||
Storage for mappings between company-specific concepts and standard concepts.
|
||||
|
||||
Attributes:
|
||||
source (str): Path to the JSON file storing the mappings
|
||||
mappings (Dict[str, Set[str]]): Dictionary mapping standard concepts to sets of company concepts
|
||||
company_mappings (Dict[str, Dict]): Company-specific mappings loaded from company_mappings/
|
||||
merged_mappings (Dict[str, List[Tuple]]): Merged mappings with priority scoring
|
||||
"""
|
||||
|
||||
def __init__(self, source: Optional[str] = None, validate_with_enum: bool = False, read_only: bool = False):
|
||||
"""
|
||||
Initialize the mapping store.
|
||||
|
||||
Args:
|
||||
source: Path to the JSON file storing the mappings. If None, uses default location.
|
||||
validate_with_enum: Whether to validate JSON keys against StandardConcept enum
|
||||
read_only: If True, never save changes back to the file (used in testing)
|
||||
"""
|
||||
self.read_only = read_only
|
||||
|
||||
|
||||
if source is None:
|
||||
# Try a few different ways to locate the file, handling both development
|
||||
# and installed package scenarios
|
||||
self.source = None
|
||||
|
||||
# Default to a file in the same directory as this module (development mode)
|
||||
module_dir = os.path.dirname(os.path.abspath(__file__))
|
||||
potential_path = os.path.join(module_dir, "concept_mappings.json")
|
||||
if os.path.exists(potential_path):
|
||||
self.source = potential_path
|
||||
|
||||
# If not found, try to load from package data (installed package)
|
||||
if self.source is None:
|
||||
try:
|
||||
import importlib.resources as pkg_resources
|
||||
try:
|
||||
# For Python 3.9+
|
||||
with pkg_resources.files('edgar.xbrl.standardization').joinpath('concept_mappings.json').open('r') as f:
|
||||
# Just read the file to see if it exists, we'll load it properly later
|
||||
f.read(1)
|
||||
self.source = potential_path # Use the same path as before
|
||||
except (ImportError, FileNotFoundError, AttributeError):
|
||||
# Fallback for older Python versions
|
||||
try:
|
||||
import pkg_resources as legacy_resources
|
||||
if legacy_resources.resource_exists('edgar.xbrl.standardization', 'concept_mappings.json'):
|
||||
self.source = potential_path # Use the same path as before
|
||||
except (ImportError, FileNotFoundError):
|
||||
pass
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
# If we still haven't found the file, use the default path anyway
|
||||
# (it will fail gracefully in _load_mappings)
|
||||
if self.source is None:
|
||||
self.source = potential_path
|
||||
else:
|
||||
self.source = source
|
||||
|
||||
self.mappings = self._load_mappings()
|
||||
|
||||
# Load company-specific mappings (always enabled)
|
||||
self.company_mappings = self._load_all_company_mappings()
|
||||
self.merged_mappings = self._create_merged_mappings()
|
||||
self.hierarchy_rules = self._load_hierarchy_rules()
|
||||
|
||||
# Validate the loaded mappings against StandardConcept enum
|
||||
if validate_with_enum:
|
||||
self.validate_against_enum()
|
||||
|
||||
def validate_against_enum(self) -> Tuple[bool, List[str]]:
|
||||
"""
|
||||
Validate that all keys in the mappings exist in StandardConcept enum.
|
||||
|
||||
Returns:
|
||||
Tuple of (is_valid, list_of_missing_keys)
|
||||
"""
|
||||
standard_values = StandardConcept.get_all_values()
|
||||
json_keys = set(self.mappings.keys())
|
||||
|
||||
# Find keys in JSON that aren't in enum
|
||||
missing_in_enum = json_keys - standard_values
|
||||
|
||||
# Find enum values not in JSON (just for information)
|
||||
missing_in_json = standard_values - json_keys
|
||||
|
||||
import logging
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
if missing_in_enum:
|
||||
logger.warning("Found %d keys in concept_mappings.json that don't exist in StandardConcept enum: %s", len(missing_in_enum), sorted(missing_in_enum))
|
||||
|
||||
if missing_in_json:
|
||||
logger.info("Found %d StandardConcept values without mappings in concept_mappings.json: %s", len(missing_in_json), sorted(missing_in_json))
|
||||
|
||||
return len(missing_in_enum) == 0, list(missing_in_enum)
|
||||
|
||||
def to_dataframe(self) -> pd.DataFrame:
|
||||
"""
|
||||
Convert mappings to a pandas DataFrame for analysis and visualization.
|
||||
|
||||
Returns:
|
||||
DataFrame with columns for standard_concept and company_concept
|
||||
"""
|
||||
try:
|
||||
import pandas as pd
|
||||
except ImportError:
|
||||
raise ImportError("pandas is required for to_dataframe() but is not installed") from None
|
||||
|
||||
rows = []
|
||||
for standard_concept, company_concepts in self.mappings.items():
|
||||
for company_concept in company_concepts:
|
||||
rows.append({
|
||||
'standard_concept': standard_concept,
|
||||
'company_concept': company_concept
|
||||
})
|
||||
|
||||
return pd.DataFrame(rows)
|
||||
|
||||
|
||||
def _load_all_company_mappings(self) -> Dict[str, Dict]:
|
||||
"""Load all company-specific mapping files from company_mappings/ directory."""
|
||||
mappings = {}
|
||||
company_dir = os.path.join(os.path.dirname(self.source or __file__), "company_mappings")
|
||||
|
||||
if os.path.exists(company_dir):
|
||||
for file in os.listdir(company_dir):
|
||||
if file.endswith("_mappings.json"):
|
||||
entity_id = file.replace("_mappings.json", "")
|
||||
try:
|
||||
with open(os.path.join(company_dir, file), 'r') as f:
|
||||
company_data = json.load(f)
|
||||
mappings[entity_id] = company_data
|
||||
except (FileNotFoundError, json.JSONDecodeError) as e:
|
||||
import logging
|
||||
logger = logging.getLogger(__name__)
|
||||
logger.warning("Failed to load %s: %s", file, e)
|
||||
|
||||
return mappings
|
||||
|
||||
def _create_merged_mappings(self) -> Dict[str, List[Tuple[str, str, int]]]:
|
||||
"""Create merged mappings with priority scoring.
|
||||
|
||||
Priority levels:
|
||||
1. Core mappings (lowest)
|
||||
2. Company mappings (higher)
|
||||
3. Company-specific matches (highest when company detected)
|
||||
|
||||
Returns:
|
||||
Dict mapping standard concepts to list of (company_concept, source, priority) tuples
|
||||
"""
|
||||
merged = {}
|
||||
|
||||
# Add core mappings (priority 1 - lowest)
|
||||
for std_concept, company_concepts in self.mappings.items():
|
||||
merged[std_concept] = []
|
||||
for concept in company_concepts:
|
||||
merged[std_concept].append((concept, "core", 1))
|
||||
|
||||
# Add company mappings (priority 2 - higher)
|
||||
for entity_id, company_data in self.company_mappings.items():
|
||||
concept_mappings = company_data.get("concept_mappings", {})
|
||||
priority_level = 2
|
||||
|
||||
for std_concept, company_concepts in concept_mappings.items():
|
||||
if std_concept not in merged:
|
||||
merged[std_concept] = []
|
||||
for concept in company_concepts:
|
||||
merged[std_concept].append((concept, entity_id, priority_level))
|
||||
|
||||
return merged
|
||||
|
||||
def _load_hierarchy_rules(self) -> Dict[str, Dict]:
    """Load hierarchy rules from company mappings.

    Returns:
        A single dict of all company hierarchy rules; on key collisions
        later companies overwrite earlier ones (dict.update semantics).
    """
    combined: Dict[str, Dict] = {}
    for payload in self.company_mappings.values():
        combined.update(payload.get("hierarchy_rules", {}))
    return combined
|
||||
def _detect_entity_from_concept(self, concept: str) -> Optional[str]:
    """Detect entity identifier from concept name prefix.

    A concept like ``tsla_Revenue`` yields ``"tsla"`` when that prefix is a
    known company in ``self.company_mappings``; otherwise None.
    """
    prefix, separator, _ = concept.partition('_')
    if not separator:
        # No underscore - nothing that looks like an entity prefix
        return None
    candidate = prefix.lower()
    return candidate if candidate in self.company_mappings else None
|
||||
def _load_mappings(self) -> Dict[str, Set[str]]:
    """
    Load mappings from the JSON file.

    Tries the filesystem path in ``self.source`` first, then falls back to
    package-resource lookups (modern importlib.resources, then legacy
    pkg_resources). Failures at every level are tolerated: the method
    degrades to returning empty mappings rather than raising.

    Returns:
        Dictionary mapping standard concepts to sets of company concepts
    """
    data = None

    # First try direct file access
    try:
        with open(self.source, 'r') as f:
            data = json.load(f)
    except (FileNotFoundError, IOError, PermissionError):
        # If direct file access fails, try package resources
        try:
            try:
                # Modern importlib.resources approach (Python 3.9+)
                import importlib.resources as pkg_resources
                try:
                    # For Python 3.9+
                    with pkg_resources.files('edgar.xbrl.standardization').joinpath('concept_mappings.json').open('r') as f:
                        data = json.load(f)
                except (ImportError, FileNotFoundError, AttributeError):
                    # Fallback to legacy pkg_resources (AttributeError covers
                    # older importlib.resources without the files() API)
                    import pkg_resources as legacy_resources
                    resource_string = legacy_resources.resource_string('edgar.xbrl.standardization', 'concept_mappings.json')
                    data = json.loads(resource_string)
            except ImportError:
                # Neither resource mechanism is importable - give up quietly
                pass
        except Exception:
            # If all attempts fail, log a warning. The broad catch is
            # deliberate: loading mappings is best-effort and must not
            # prevent the store from being constructed.
            import logging
            logger = logging.getLogger(__name__)
            logger.warning("Could not load concept_mappings.json. Standardization will be limited.")

    # If we have data, process it based on its structure
    if data:
        # Check if the structure is flat or nested: a nested file groups
        # concepts under statement-type keys whose values are dicts.
        if any(isinstance(value, dict) for value in data.values()):
            # Nested structure by statement type - flatten, dropping the
            # statement-type level (later statement types overwrite earlier
            # ones on duplicate standard-concept keys)
            flattened = {}
            for _statement_type, concepts in data.items():
                for standard_concept, company_concepts in concepts.items():
                    flattened[standard_concept] = set(company_concepts)
            return flattened
        else:
            # Flat structure: standard concept -> list of company concepts
            return {k: set(v) for k, v in data.items()}

    # If all methods fail, return empty mappings
    # The initialize_default_mappings function will create a file if needed
    return {}
|
||||
def _save_mappings(self) -> None:
    """Save mappings to the JSON file, unless in read_only mode."""
    if self.read_only:
        # Read-only stores (e.g. under test) never touch disk
        return

    # Make sure the target directory exists before writing
    target_dir = os.path.dirname(self.source)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    # JSON has no set type, so serialize the value sets as lists
    payload = {standard: list(concepts) for standard, concepts in self.mappings.items()}
    with open(self.source, 'w') as f:
        json.dump(payload, f, indent=2)
|
||||
def add(self, company_concept: str, standard_concept: str) -> None:
    """
    Add a mapping from a company concept to a standard concept.

    The store is persisted immediately after the mapping is recorded
    (a no-op when the store is read-only).

    Args:
        company_concept: The company-specific concept
        standard_concept: The standard concept
    """
    self.mappings.setdefault(standard_concept, set()).add(company_concept)
    self._save_mappings()
|
||||
def get_standard_concept(self, company_concept: str, context: Dict = None) -> Optional[str]:
    """
    Get the standard concept for a given company concept with priority-based resolution.

    Args:
        company_concept: The company-specific concept
        context: Optional context information (not used in current implementation)

    Returns:
        The standard concept or None if not found
    """
    # Priority-based resolution over the merged (core + company) mappings.
    if self.merged_mappings:
        # Detect company from concept prefix (e.g. 'tsla:Revenue' -> 'tsla')
        detected_entity = self._detect_entity_from_concept(company_concept)

        matches = []
        for standard, entries in self.merged_mappings.items():
            for mapped_concept, source, priority in entries:
                if mapped_concept != company_concept:
                    continue
                # An exact company match outranks every generic mapping
                score = 4 if (detected_entity and source == detected_entity) else priority
                matches.append((standard, score, source))

        if matches:
            winner = max(matches, key=lambda m: m[1])
            import logging
            logger = logging.getLogger(__name__)
            logger.debug("Mapping applied: %s -> %s (source: %s, priority: %s)",
                         company_concept, winner[0], winner[2], winner[1])
            return winner[0]

    # Fallback: plain membership test against the core mappings
    for standard, concepts in self.mappings.items():
        if company_concept in concepts:
            return standard
    return None
|
||||
def get_company_concepts(self, standard_concept: str) -> Set[str]:
    """
    Get all company concepts mapped to a standard concept.

    Args:
        standard_concept: The standard concept

    Returns:
        Set of company concepts mapped to the standard concept
    """
    # Unknown standard concepts yield an empty set rather than raising
    concepts = self.mappings.get(standard_concept)
    return concepts if concepts is not None else set()
|
||||
|
||||
class ConceptMapper:
    """
    Maps company-specific concepts to standard concepts using various techniques.

    Attributes:
        mapping_store (MappingStore): Storage for concept mappings
        pending_mappings (Dict): Low-confidence mappings pending review
        _cache (Dict): In-memory cache of mapped concepts
    """

    def __init__(self, mapping_store: "MappingStore"):
        """
        Initialize the concept mapper.

        Args:
            mapping_store: Storage for concept mappings
        """
        self.mapping_store = mapping_store
        self.pending_mappings = {}
        # Cache for faster lookups of previously mapped concepts
        self._cache = {}
        # Precompute lowercased standard concept values for faster comparison
        self._std_concept_values = [(concept, concept.value.lower()) for concept in StandardConcept]

        # Statement-specific keyword sets for faster contextual matching
        self._bs_keywords = {'assets', 'liabilities', 'equity', 'cash', 'debt', 'inventory', 'receivable', 'payable'}
        self._is_keywords = {'revenue', 'sales', 'income', 'expense', 'profit', 'loss', 'tax', 'earnings'}
        self._cf_keywords = {'cash', 'operating', 'investing', 'financing', 'activities'}

    def map_concept(self, company_concept: str, label: str, context: Dict[str, Any]) -> Optional[str]:
        """
        Map a company concept to a standard concept.

        Results (including misses) are cached per (concept, statement_type).

        Args:
            company_concept: The company-specific concept
            label: The label for the concept
            context: Additional context information (statement type, calculation relationships, etc.)

        Returns:
            The standard concept or None if no mapping found
        """
        # Use cache for faster lookups
        cache_key = (company_concept, context.get('statement_type', ''))
        if cache_key in self._cache:
            return self._cache[cache_key]

        # Check if we already have a mapping in the store
        standard_concept = self.mapping_store.get_standard_concept(company_concept)
        if standard_concept:
            self._cache[cache_key] = standard_concept
            return standard_concept

        # Cache negative results too to avoid repeated inference
        self._cache[cache_key] = None
        return None

    def _infer_mapping(self, company_concept: str, label: str, context: Dict[str, Any]) -> Tuple[Optional[str], float]:
        """
        Infer a mapping between a company concept and a standard concept.

        Uses fast-path keyword matches first, then exact label matches, then
        SequenceMatcher similarity over a statement-type-filtered candidate
        set, with contextual score boosts.

        Args:
            company_concept: The company-specific concept
            label: The label for the concept
            context: Additional context information

        Returns:
            Tuple of (standard_concept, confidence); (None, 0.0) below the
            0.5 confidence threshold.
        """
        # Fast path for common patterns
        label_lower = label.lower()

        # Quick matching for common concepts without full sequence matching
        if "total assets" in label_lower:
            return StandardConcept.TOTAL_ASSETS.value, 0.95
        elif "revenue" in label_lower and len(label_lower) < 30:  # Only match short labels to avoid false positives
            return StandardConcept.REVENUE.value, 0.9
        elif "net income" in label_lower and "parent" not in label_lower:
            return StandardConcept.NET_INCOME.value, 0.9

        # Faster direct match checking with precomputed lowercase values
        for std_concept, std_value_lower in self._std_concept_values:
            if std_value_lower == label_lower:
                return std_concept.value, 1.0  # Perfect match

        # Fall back to sequence matching for similarity
        best_match = None
        best_score = 0

        statement_type = context.get("statement_type", "")

        # Statement type based filtering to reduce unnecessary comparisons:
        # only compare against concepts whose value shares a statement keyword
        # with the label.
        limited_concepts = []
        if statement_type == "BalanceSheet":
            if any(kw in label_lower for kw in self._bs_keywords):
                limited_concepts = [c for c, v in self._std_concept_values
                                    if any(kw in v for kw in self._bs_keywords)]
        elif statement_type == "IncomeStatement":
            if any(kw in label_lower for kw in self._is_keywords):
                limited_concepts = [c for c, v in self._std_concept_values
                                    if any(kw in v for kw in self._is_keywords)]
        elif statement_type == "CashFlowStatement":
            if any(kw in label_lower for kw in self._cf_keywords):
                limited_concepts = [c for c, v in self._std_concept_values
                                    if any(kw in v for kw in self._cf_keywords)]

        # Use limited concepts if available, otherwise use all
        concepts_to_check = limited_concepts if limited_concepts else [c for c, _ in self._std_concept_values]

        # Calculate similarities for candidate concepts
        for std_concept in concepts_to_check:
            similarity = SequenceMatcher(None, label_lower, std_concept.value.lower()).ratio()
            if similarity > best_score:
                best_score = similarity
                best_match = std_concept.value

        # Apply specific contextual rules based on statement type: boost the
        # score when the best match agrees with strong keyword evidence.
        if statement_type == "BalanceSheet":
            if "assets" in label_lower and "total" in label_lower:
                if best_match == StandardConcept.TOTAL_ASSETS.value:
                    best_score = min(1.0, best_score + 0.2)
            elif "liabilities" in label_lower and "total" in label_lower:
                if best_match == StandardConcept.TOTAL_LIABILITIES.value:
                    best_score = min(1.0, best_score + 0.2)
            elif "equity" in label_lower and ("total" in label_lower or "stockholders" in label_lower):
                if best_match == StandardConcept.TOTAL_EQUITY.value:
                    best_score = min(1.0, best_score + 0.2)

        elif statement_type == "IncomeStatement":
            if any(term in label_lower for term in ["revenue", "sales"]):
                if best_match == StandardConcept.REVENUE.value:
                    best_score = min(1.0, best_score + 0.2)
            elif "net income" in label_lower:
                if best_match == StandardConcept.NET_INCOME.value:
                    best_score = min(1.0, best_score + 0.2)

        # Promote to 0.5 confidence if score close enough to help match
        # more items that are almost at threshold
        if 0.45 <= best_score < 0.5:
            best_score = 0.5

        # If confidence is too low, return None
        if best_score < 0.5:
            return None, 0.0

        return best_match, best_score

    def learn_mappings(self, filings: List[Dict[str, Any]]) -> None:
        """
        Learn mappings from a list of filings.

        High-confidence inferences (>= 0.9) are persisted to the mapping
        store and cached; medium-confidence ones (>= 0.5) are queued in
        ``pending_mappings`` for manual review.

        Args:
            filings: List of dicts with XBRL data
        """
        # Pre-filter to only process unmapped concepts
        mapped_concepts = set()
        for _std_concept, company_concepts in self.mapping_store.mappings.items():
            mapped_concepts.update(company_concepts)

        # Process only unmapped filings
        unmapped_filings = [f for f in filings if f.get("concept") not in mapped_concepts]

        # Batch of high-confidence mappings to add, plus the statement type
        # each concept was seen in (needed for the cache key below).
        mappings_to_add = {}
        concept_statement_types = {}

        for filing in unmapped_filings:
            concept = filing["concept"]
            label = filing["label"]
            context = {
                "statement_type": filing.get("statement_type", ""),
                "calculation_parent": filing.get("calculation_parent", ""),
                "position": filing.get("position", "")
            }

            # Infer mapping and confidence
            standard_concept, confidence = self._infer_mapping(concept, label, context)

            # Handle based on confidence
            if standard_concept and confidence >= 0.9:
                mappings_to_add.setdefault(standard_concept, set()).add(concept)
                concept_statement_types[concept] = context["statement_type"]
            elif standard_concept and confidence >= 0.5:
                self.pending_mappings.setdefault(standard_concept, []).append((concept, confidence, label))

        # Batch add all mappings at once
        for std_concept, concepts in mappings_to_add.items():
            for concept in concepts:
                self.mapping_store.add(concept, std_concept)
                # BUG FIX: the cache key previously reused the stale loop
                # variable `filing` from the loop above, caching every concept
                # under the *last* filing's statement type. Use the statement
                # type recorded for this specific concept instead.
                cache_key = (concept, concept_statement_types.get(concept, ""))
                self._cache[cache_key] = std_concept

    def save_pending_mappings(self, destination: str) -> None:
        """
        Save pending mappings to a file.

        Args:
            destination: Path to save the pending mappings
        """
        # Convert to serializable format
        serializable_mappings = {}
        for std_concept, mappings in self.pending_mappings.items():
            serializable_mappings[std_concept] = [
                {"concept": c, "confidence": conf, "label": lbl}
                for c, conf, lbl in mappings
            ]

        with open(destination, 'w') as f:
            json.dump(serializable_mappings, f, indent=2)
|
||||
|
||||
def standardize_statement(statement_data: List[Dict[str, Any]], mapper: "ConceptMapper") -> List[Dict[str, Any]]:
    """
    Standardize labels in a statement using the concept mapper.

    Args:
        statement_data: List of statement line items
        mapper: ConceptMapper instance

    Returns:
        Statement data with standardized labels where possible
    """
    # First pass - identify which items need standardization and prepare
    # their mapping context. Keyed by item index for O(1) lookup in the
    # second pass (the original re-scanned the list per item, i.e. O(n^2)).
    to_standardize: Dict[int, Tuple[str, str, Dict[str, Any]]] = {}
    statement_type = statement_data[0].get("statement_type", "") if statement_data else ""

    for i, item in enumerate(statement_data):
        # Skip abstract elements and dimensions as they don't need standardization
        if item.get("is_abstract", False) or item.get("is_dimension", False):
            continue

        concept = item.get("concept", "")
        if not concept:
            continue

        label = item.get("label", "")
        if not label:
            continue

        # Build minimal context once, reuse for the mapping call
        context = {
            "statement_type": item.get("statement_type", "") or statement_type,
            "level": item.get("level", 0),
            "is_total": "total" in label.lower() or item.get("is_total", False)
        }
        to_standardize[i] = (concept, label, context)

    # If no items need standardization, return early with unchanged data
    if not to_standardize:
        return statement_data

    # Second pass - build the result, copying an item only when its label
    # actually changes
    result = []
    for i, item in enumerate(statement_data):
        entry = to_standardize.get(i)
        if entry is None:
            # Items that don't need standardization are used as-is
            result.append(item)
            continue

        concept, label, context = entry
        standard_label = mapper.map_concept(concept, label, context)

        if standard_label:
            # Create a shallow copy only when needed, preserving the original
            # label for reference
            standardized_item = item.copy()
            standardized_item["label"] = standard_label
            standardized_item["original_label"] = label
            result.append(standardized_item)
        else:
            # No mapping found, use original item
            result.append(item)

    return result
|
||||
|
||||
def create_default_mappings_file(file_path: str) -> None:
    """
    Create the initial concept_mappings.json file with default mappings.
    This can be called during package installation or initialization.

    Existing files are never overwritten.

    Args:
        file_path: Path where to create the file
    """
    # Ensure directory exists
    parent = os.path.dirname(file_path)
    if parent and not os.path.exists(parent):
        os.makedirs(parent, exist_ok=True)

    # The file already exists, don't overwrite it
    if os.path.exists(file_path):
        return

    # A minimal seed of mappings to get started
    seed_mappings = {
        StandardConcept.REVENUE.value: [
            "us-gaap_Revenue",
            "us-gaap_SalesRevenueNet",
            "us-gaap_Revenues"
        ],
        StandardConcept.NET_INCOME.value: [
            "us-gaap_NetIncome",
            "us-gaap_NetIncomeLoss",
            "us-gaap_ProfitLoss"
        ],
        StandardConcept.TOTAL_ASSETS.value: [
            "us-gaap_Assets",
            "us-gaap_AssetsTotal"
        ]
    }

    # Write the file
    with open(file_path, 'w') as f:
        json.dump(seed_mappings, f, indent=2)
|
||||
# Initialize MappingStore - only loads from JSON
def initialize_default_mappings(read_only: bool = False) -> MappingStore:
    """
    Initialize a MappingStore with mappings from the concept_mappings.json file.

    Args:
        read_only: If True, prevent writing changes back to the file (used in testing)

    Returns:
        MappingStore initialized with mappings from JSON file
    """
    store = MappingStore(read_only=read_only)

    # Seed the JSON file with minimal defaults when it's missing. Skipped in
    # read_only mode so tests never create files as a side effect.
    if not (read_only or os.path.exists(store.source)):
        create_default_mappings_file(store.source)

    return store
||||
@@ -0,0 +1,872 @@
|
||||
"""
|
||||
Statement Resolution for XBRL data.
|
||||
|
||||
This module provides a robust system for identifying and matching XBRL financial statements,
|
||||
notes, and disclosures regardless of taxonomy variations and company-specific customizations.
|
||||
"""
|
||||
|
||||
import re
|
||||
from dataclasses import dataclass, field
|
||||
from enum import Enum
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
|
||||
from edgar.core import log
|
||||
from edgar.xbrl.exceptions import StatementNotFound
|
||||
from edgar.xbrl.statements import statement_to_concepts
|
||||
|
||||
|
||||
class StatementCategory(Enum):
    """Categories of XBRL presentation sections."""
    # Core financial statements (balance sheet, income statement, etc.)
    FINANCIAL_STATEMENT = "statement"
    # Notes to the financial statements
    NOTE = "note"
    # Standalone disclosures (e.g. segment reporting)
    DISCLOSURE = "disclosure"
    DOCUMENT = "document"  # For cover page, signatures, etc.
    # Anything that doesn't fit the categories above
    OTHER = "other"
|
||||
|
||||
|
||||
|
||||
@dataclass
class ConceptPattern:
    """Pattern for matching statement concepts across different taxonomies."""
    # Regex pattern applied to concept names
    pattern: str
    # Relative weight of a match for this pattern (1.0 = full weight)
    weight: float = 1.0
||||
|
||||
@dataclass
class StatementType:
    """Detailed information about a statement type for matching."""
    name: str
    primary_concepts: List[str]
    category: StatementCategory = StatementCategory.FINANCIAL_STATEMENT  # Default to financial statement
    alternative_concepts: List[str] = field(default_factory=list)
    concept_patterns: List[str] = field(default_factory=list)
    key_concepts: List[str] = field(default_factory=list)
    role_patterns: List[str] = field(default_factory=list)
    title: str = ""
    supports_parenthetical: bool = False
    weight_map: Dict[str, float] = field(default_factory=dict)

    def match_concept(self, concept_name: str) -> bool:
        """Check if a concept name matches this statement type's concepts.

        Exact matches against primary/alternative concepts win first;
        otherwise the concept patterns are tried as regexes.
        """
        if concept_name in self.primary_concepts or concept_name in self.alternative_concepts:
            return True
        return any(re.match(pattern, concept_name) for pattern in self.concept_patterns)

    def match_role(self, role_uri: str, role_name: str = "", role_def: str = "") -> bool:
        """Check if role information matches this statement type.

        A case-insensitive substring match of the statement name in any of
        the role URI, role name, or role definition counts as a match, as
        does any configured role pattern.
        """
        needle = self.name.lower()
        for haystack in (role_uri, role_name, role_def):
            if haystack and needle in haystack.lower():
                return True
        return any(
            re.match(pattern, role_uri) or (role_name and re.match(pattern, role_name))
            for pattern in self.role_patterns
        )
|
||||
|
||||
# Registry of statement types with matching information.
# Keys are canonical statement-type names; values describe how to recognize
# each statement across US-GAAP and IFRS taxonomies via primary/alternative
# abstract concepts, regex patterns over concept names, key line-item
# concepts, and regex patterns over presentation role URIs/names.
statement_registry = {
    # Balance sheet / statement of financial position
    "BalanceSheet": StatementType(
        name="BalanceSheet",
        category=StatementCategory.FINANCIAL_STATEMENT,
        primary_concepts=["us-gaap_StatementOfFinancialPositionAbstract"],
        alternative_concepts=[
            "us-gaap_BalanceSheetAbstract",
            "ifrs-full_StatementOfFinancialPositionAbstract"  # IFRS equivalent
        ],
        concept_patterns=[
            r".*_StatementOfFinancialPositionAbstract$",
            r".*_BalanceSheetAbstract$",
            r".*_ConsolidatedBalanceSheetsAbstract$",
            r".*_CondensedConsolidatedBalanceSheetsUnauditedAbstract$"
        ],
        key_concepts=[
            "us-gaap_Assets", "us-gaap_Liabilities", "us-gaap_StockholdersEquity",
            "ifrs-full_Assets", "ifrs-full_Liabilities", "ifrs-full_Equity"  # IFRS equivalents
        ],
        role_patterns=[
            r".*[Bb]alance[Ss]heet.*",
            r".*[Ss]tatement[Oo]f[Ff]inancial[Pp]osition.*",
            r".*StatementConsolidatedBalanceSheets.*"
        ],
        title="Consolidated Balance Sheets",
        supports_parenthetical=True,
        weight_map={"assets": 0.3, "liabilities": 0.3, "equity": 0.4}
    ),

    # Income statement / statement of operations
    "IncomeStatement": StatementType(
        name="IncomeStatement",
        category=StatementCategory.FINANCIAL_STATEMENT,
        primary_concepts=["us-gaap_IncomeStatementAbstract"],
        alternative_concepts=[
            "us-gaap_StatementOfIncomeAbstract",
            "ifrs-full_IncomeStatementAbstract"  # IFRS equivalent
        ],
        concept_patterns=[
            r".*_IncomeStatementAbstract$",
            r".*_StatementOfIncomeAbstract$",
            r".*_ConsolidatedStatementsOfIncomeAbstract$",
            r".*_CondensedConsolidatedStatementsOfIncomeUnauditedAbstract$"
        ],
        key_concepts=[
            "us-gaap_Revenues", "us-gaap_NetIncomeLoss",
            "ifrs-full_Revenue", "ifrs-full_ProfitLoss"  # IFRS equivalents
        ],
        role_patterns=[
            r".*[Ii]ncome[Ss]tatement.*",
            r".*[Ss]tatement[Oo]f[Ii]ncome.*",
            r".*[Oo]perations.*",
            r".*StatementConsolidatedStatementsOfIncome.*"
        ],
        title="Consolidated Statement of Income",
        supports_parenthetical=True,
        weight_map={"revenues": 0.4, "netIncomeLoss": 0.6}
    ),

    # Statement of cash flows
    "CashFlowStatement": StatementType(
        name="CashFlowStatement",
        category=StatementCategory.FINANCIAL_STATEMENT,
        primary_concepts=["us-gaap_StatementOfCashFlowsAbstract"],
        alternative_concepts=["ifrs-full_StatementOfCashFlowsAbstract"],  # IFRS equivalent
        concept_patterns=[
            r".*_StatementOfCashFlowsAbstract$",
            r".*_CashFlowsAbstract$",
            r".*_ConsolidatedStatementsOfCashFlowsAbstract$",
            r".*_CondensedConsolidatedStatementsOfCashFlowsUnauditedAbstract$"
        ],
        key_concepts=[
            "us-gaap_NetCashProvidedByUsedInOperatingActivities",
            "us-gaap_CashAndCashEquivalentsPeriodIncreaseDecrease",
            "ifrs-full_CashFlowsFromUsedInOperatingActivities",  # IFRS equivalents
            "ifrs-full_IncreaseDecreaseInCashAndCashEquivalents"
        ],
        role_patterns=[
            r".*[Cc]ash[Ff]low.*",
            r".*[Ss]tatement[Oo]f[Cc]ash[Ff]lows.*",
            r".*StatementConsolidatedStatementsOfCashFlows.*"
        ],
        title="Consolidated Statement of Cash Flows",
        supports_parenthetical=False
    ),

    # Statement of stockholders'/shareholders' equity (or partners' capital)
    "StatementOfEquity": StatementType(
        name="StatementOfEquity",
        category=StatementCategory.FINANCIAL_STATEMENT,
        primary_concepts=["us-gaap_StatementOfStockholdersEquityAbstract"],
        alternative_concepts=[
            "us-gaap_StatementOfShareholdersEquityAbstract",
            "us-gaap_StatementOfPartnersCapitalAbstract"
        ],
        concept_patterns=[
            r".*_StatementOfStockholdersEquityAbstract$",
            r".*_StatementOfShareholdersEquityAbstract$",
            r".*_StatementOfChangesInEquityAbstract$",
            r".*_ConsolidatedStatementsOfShareholdersEquityAbstract$"
        ],
        key_concepts=["us-gaap_StockholdersEquity", "us-gaap_CommonStock", "us-gaap_RetainedEarnings"],
        role_patterns=[
            r".*[Ee]quity.*",
            r".*[Ss]tockholders.*",
            r".*[Ss]hareholders.*",
            r".*[Cc]hanges[Ii]n[Ee]quity.*",
            r".*StatementConsolidatedStatementsOfStockholdersEquity.*"
        ],
        title="Consolidated Statement of Equity",
        supports_parenthetical=False
    ),

    # Statement of (other) comprehensive income
    "ComprehensiveIncome": StatementType(
        name="ComprehensiveIncome",
        category=StatementCategory.FINANCIAL_STATEMENT,
        primary_concepts=["us-gaap_StatementOfIncomeAndComprehensiveIncomeAbstract"],
        alternative_concepts=["us-gaap_StatementOfComprehensiveIncomeAbstract"],
        concept_patterns=[
            r".*_ComprehensiveIncomeAbstract$",
            r".*_StatementOfComprehensiveIncomeAbstract$",
            r".*_ConsolidatedStatementsOfComprehensiveIncomeAbstract$"
        ],
        key_concepts=["us-gaap_ComprehensiveIncomeNetOfTax"],
        role_patterns=[
            r".*[Cc]omprehensive[Ii]ncome.*",
            r".*[Oo]ther[Cc]omprehensive.*",
            r".*StatementConsolidatedStatementsOfComprehensiveIncome.*"
        ],
        title="Consolidated Statement of Comprehensive Income",
        supports_parenthetical=True
    ),

    # Notes to the financial statements
    "Notes": StatementType(
        name="Notes",
        category=StatementCategory.NOTE,
        primary_concepts=["us-gaap_NotesToFinancialStatementsAbstract"],
        alternative_concepts=[],
        concept_patterns=[
            r".*_NotesToFinancialStatementsAbstract$",
            r".*_NotesAbstract$"
        ],
        key_concepts=[],
        role_patterns=[
            r".*[Nn]otes[Tt]o[Ff]inancial[Ss]tatements.*",
            r".*[Nn]ote\s+\d+.*",
            r".*[Nn]otes.*"
        ],
        title="Notes to Financial Statements",
        supports_parenthetical=False
    ),

    # Significant accounting policies note
    "AccountingPolicies": StatementType(
        name="AccountingPolicies",
        category=StatementCategory.NOTE,
        primary_concepts=["us-gaap_AccountingPoliciesAbstract"],
        alternative_concepts=[],
        concept_patterns=[
            r".*_AccountingPoliciesAbstract$",
            r".*_SignificantAccountingPoliciesAbstract$"
        ],
        key_concepts=["us-gaap_SignificantAccountingPoliciesTextBlock"],
        role_patterns=[
            r".*[Aa]ccounting[Pp]olicies.*",
            r".*[Ss]ignificant[Aa]ccounting[Pp]olicies.*"
        ],
        title="Significant Accounting Policies",
        supports_parenthetical=False
    ),

    # Generic disclosures section
    "Disclosures": StatementType(
        name="Disclosures",
        category=StatementCategory.DISCLOSURE,
        primary_concepts=["us-gaap_DisclosuresAbstract"],
        alternative_concepts=[],
        concept_patterns=[
            r".*_DisclosuresAbstract$",
            r".*_DisclosureAbstract$"
        ],
        key_concepts=[],
        role_patterns=[
            r".*[Dd]isclosure.*"
        ],
        title="Disclosures",
        supports_parenthetical=False
    ),

    # Segment reporting disclosure
    "SegmentDisclosure": StatementType(
        name="SegmentDisclosure",
        category=StatementCategory.DISCLOSURE,
        primary_concepts=["us-gaap_SegmentDisclosureAbstract"],
        alternative_concepts=[],
        concept_patterns=[
            r".*_SegmentDisclosureAbstract$",
            r".*_SegmentReportingDisclosureAbstract$"
        ],
        key_concepts=["us-gaap_SegmentReportingDisclosureTextBlock"],
        role_patterns=[
            r".*[Ss]egment.*",
            r".*[Ss]egment[Rr]eporting.*",
            r".*[Ss]egment[Ii]nformation.*"
        ],
        title="Segment Information",
        supports_parenthetical=False
    ),

    # Filing cover page (dei taxonomy)
    "CoverPage": StatementType(
        name="CoverPage",
        category=StatementCategory.DOCUMENT,
        primary_concepts=["dei_CoverAbstract"],
        concept_patterns=[r".*_CoverAbstract$"],
        key_concepts=["dei_EntityRegistrantName", "dei_DocumentType"],
        role_patterns=[r".*[Cc]over.*"],
        title="Cover Page",
        supports_parenthetical=False
    )
}
||||
|
||||
|
||||
class StatementResolver:
    """
    Resolves statement identifiers to actual XBRL statement roles.

    This class provides a multi-layered approach to statement matching,
    handling taxonomy variations and company-specific customizations.
    """

    def __init__(self, xbrl):
        """
        Initialize with an XBRL object.

        Args:
            xbrl: XBRL object containing parsed data
        """
        self.xbrl = xbrl
        self._cache = {}

        # Lookup tables; populated by _initialize_indices() below.
        self._statement_by_role_uri = {}
        self._statement_by_role_name = {}
        self._statement_by_primary_concept = {}
        self._statement_by_type = {}
        self._statement_by_role_def = {}

        # Map legacy statement-type names onto canonical registry keys.
        self._legacy_to_registry = {}
        for old_name, old_info in statement_to_concepts.items():
            if old_name in statement_registry:
                # Already a canonical name: identity mapping.
                self._legacy_to_registry[old_name] = old_name
                continue

            # Otherwise look for a registry entry that shares the concept.
            for canonical_name, entry in statement_registry.items():
                if (old_info.concept in entry.primary_concepts
                        or old_info.concept in entry.alternative_concepts):
                    self._legacy_to_registry[old_name] = canonical_name
                    break

        # Build the lookup tables immediately so the resolver is ready to use.
        self._initialize_indices()
|
||||
|
||||
def _initialize_indices(self):
|
||||
"""Build lookup indices for fast statement retrieval."""
|
||||
# Get all statements
|
||||
statements = self.xbrl.get_all_statements()
|
||||
|
||||
# Reset indices
|
||||
self._statement_by_role_uri = {}
|
||||
self._statement_by_role_name = {}
|
||||
self._statement_by_primary_concept = {}
|
||||
self._statement_by_type = {}
|
||||
self._statement_by_role_def = {}
|
||||
|
||||
# Build indices
|
||||
for stmt in statements:
|
||||
role = stmt.get('role', '')
|
||||
role_name = stmt.get('role_name', '').lower() if stmt.get('role_name') else ''
|
||||
primary_concept = stmt.get('primary_concept', '')
|
||||
stmt_type = stmt.get('type', '')
|
||||
role_def = stmt.get('definition', '').lower() if stmt.get('definition') else ''
|
||||
|
||||
# By role URI
|
||||
self._statement_by_role_uri[role] = stmt
|
||||
|
||||
# By role name
|
||||
if role_name:
|
||||
if role_name not in self._statement_by_role_name:
|
||||
self._statement_by_role_name[role_name] = []
|
||||
self._statement_by_role_name[role_name].append(stmt)
|
||||
|
||||
# By primary concept
|
||||
if primary_concept:
|
||||
if primary_concept not in self._statement_by_primary_concept:
|
||||
self._statement_by_primary_concept[primary_concept] = []
|
||||
self._statement_by_primary_concept[primary_concept].append(stmt)
|
||||
|
||||
# By statement type
|
||||
if stmt_type:
|
||||
if stmt_type not in self._statement_by_type:
|
||||
self._statement_by_type[stmt_type] = []
|
||||
self._statement_by_type[stmt_type].append(stmt)
|
||||
|
||||
# By role definition (without spaces, lowercase)
|
||||
if role_def:
|
||||
def_key = role_def.replace(' ', '')
|
||||
if def_key not in self._statement_by_role_def:
|
||||
self._statement_by_role_def[def_key] = []
|
||||
self._statement_by_role_def[def_key].append(stmt)
|
||||
|
||||
def _match_by_primary_concept(self, statement_type: str, is_parenthetical: bool = False) -> Tuple[List[Dict[str, Any]], Optional[str], float]:
|
||||
"""
|
||||
Match statements using primary concept names.
|
||||
|
||||
Args:
|
||||
statement_type: Statement type to match
|
||||
is_parenthetical: Whether to look for a parenthetical statement
|
||||
|
||||
Returns:
|
||||
Tuple of (matching statements, found role, confidence score)
|
||||
"""
|
||||
# Convert legacy types to registry types if needed
|
||||
if statement_type in self._legacy_to_registry:
|
||||
registry_type = self._legacy_to_registry[statement_type]
|
||||
else:
|
||||
registry_type = statement_type
|
||||
|
||||
# Check if this is a known statement type
|
||||
if registry_type not in statement_registry:
|
||||
return [], None, 0.0
|
||||
|
||||
# Get registry information
|
||||
registry_entry = statement_registry[registry_type]
|
||||
|
||||
# Try to match by primary concepts
|
||||
matched_statements = []
|
||||
|
||||
for concept in registry_entry.primary_concepts + registry_entry.alternative_concepts:
|
||||
if concept in self._statement_by_primary_concept:
|
||||
for stmt in self._statement_by_primary_concept[concept]:
|
||||
# Handle parenthetical check
|
||||
if registry_entry.supports_parenthetical:
|
||||
role_def = stmt.get('definition', '').lower()
|
||||
is_role_parenthetical = 'parenthetical' in role_def
|
||||
|
||||
# Skip if parenthetical status doesn't match
|
||||
if is_parenthetical != is_role_parenthetical:
|
||||
continue
|
||||
|
||||
matched_statements.append(stmt)
|
||||
|
||||
# If we found matching statements, return with high confidence
|
||||
if matched_statements:
|
||||
return matched_statements, matched_statements[0]['role'], 0.9
|
||||
|
||||
return [], None, 0.0
|
||||
|
||||
def _match_by_concept_pattern(self, statement_type: str, is_parenthetical: bool = False) -> Tuple[List[Dict[str, Any]], Optional[str], float]:
|
||||
"""
|
||||
Match statements using regex patterns on concept names to handle custom company namespaces.
|
||||
|
||||
Args:
|
||||
statement_type: Statement type to match
|
||||
is_parenthetical: Whether to look for a parenthetical statement
|
||||
|
||||
Returns:
|
||||
Tuple of (matching statements, found role, confidence score)
|
||||
"""
|
||||
# Convert legacy types to registry types if needed
|
||||
if statement_type in self._legacy_to_registry:
|
||||
registry_type = self._legacy_to_registry[statement_type]
|
||||
else:
|
||||
registry_type = statement_type
|
||||
|
||||
# Check if this is a known statement type
|
||||
if registry_type not in statement_registry:
|
||||
return [], None, 0.0
|
||||
|
||||
# Get registry information
|
||||
registry_entry = statement_registry[registry_type]
|
||||
concept_patterns = registry_entry.concept_patterns
|
||||
|
||||
if not concept_patterns:
|
||||
return [], None, 0.0
|
||||
|
||||
# Get all statements to check against patterns
|
||||
all_statements = self.xbrl.get_all_statements()
|
||||
|
||||
# Check each statement's primary concept against our patterns
|
||||
matched_statements = []
|
||||
for stmt in all_statements:
|
||||
primary_concept = stmt.get('primary_concept', '')
|
||||
|
||||
# Skip if no primary concept
|
||||
if not primary_concept:
|
||||
continue
|
||||
|
||||
# Check if this concept matches any of our patterns
|
||||
for pattern in concept_patterns:
|
||||
if re.match(pattern, primary_concept):
|
||||
# For parenthetical statements, check the role definition
|
||||
if registry_entry.supports_parenthetical:
|
||||
role_def = stmt.get('definition', '').lower()
|
||||
is_role_parenthetical = 'parenthetical' in role_def
|
||||
|
||||
# Skip if parenthetical status doesn't match
|
||||
if is_parenthetical != is_role_parenthetical:
|
||||
continue
|
||||
|
||||
matched_statements.append(stmt)
|
||||
break # Found a match, no need to check other patterns
|
||||
|
||||
# If we found matching statements, return with high confidence
|
||||
if matched_statements:
|
||||
return matched_statements, matched_statements[0]['role'], 0.85
|
||||
|
||||
return [], None, 0.0
|
||||
|
||||
def _match_by_role_pattern(self, statement_type: str, is_parenthetical: bool = False) -> Tuple[List[Dict[str, Any]], Optional[str], float]:
|
||||
"""
|
||||
Match statements using role URI or role name patterns.
|
||||
|
||||
Args:
|
||||
statement_type: Statement type to match
|
||||
is_parenthetical: Whether to look for a parenthetical statement
|
||||
|
||||
Returns:
|
||||
Tuple of (matching statements, found role, confidence score)
|
||||
"""
|
||||
# Convert legacy types to registry types if needed
|
||||
if statement_type in self._legacy_to_registry:
|
||||
registry_type = self._legacy_to_registry[statement_type]
|
||||
else:
|
||||
registry_type = statement_type
|
||||
|
||||
# Check if this is a known statement type
|
||||
if registry_type not in statement_registry:
|
||||
return [], None, 0.0
|
||||
|
||||
# Get registry information
|
||||
registry_entry = statement_registry[registry_type]
|
||||
role_patterns = registry_entry.role_patterns
|
||||
|
||||
if not role_patterns:
|
||||
return [], None, 0.0
|
||||
|
||||
# Get all statements
|
||||
all_statements = self.xbrl.get_all_statements()
|
||||
|
||||
# Check each statement's role and role name against our patterns
|
||||
matched_statements = []
|
||||
for stmt in all_statements:
|
||||
role = stmt.get('role', '')
|
||||
role_name = stmt.get('role_name', '')
|
||||
|
||||
# Check if role matches any pattern
|
||||
for pattern in role_patterns:
|
||||
if (re.search(pattern, role, re.IGNORECASE) or
|
||||
(role_name and re.search(pattern, role_name, re.IGNORECASE))):
|
||||
# For parenthetical statements, check the role definition
|
||||
if registry_entry.supports_parenthetical:
|
||||
role_def = stmt.get('definition', '').lower()
|
||||
is_role_parenthetical = 'parenthetical' in role_def
|
||||
|
||||
# Skip if parenthetical status doesn't match
|
||||
if is_parenthetical != is_role_parenthetical:
|
||||
continue
|
||||
|
||||
matched_statements.append(stmt)
|
||||
break # Found a match, no need to check other patterns
|
||||
|
||||
# If we found matching statements, return with good confidence
|
||||
if matched_statements:
|
||||
return matched_statements, matched_statements[0]['role'], 0.75
|
||||
|
||||
return [], None, 0.0
|
||||
|
||||
    def _match_by_content(self, statement_type: str) -> Tuple[List[Dict[str, Any]], Optional[str], float]:
        """
        Match statements by analyzing their content against key concepts.

        Scores every statement's presentation tree by how many of the
        registry entry's key concepts it contains, optionally weighted by
        the entry's weight_map, and returns the single best-scoring
        statement if it clears a 0.4 threshold.

        Args:
            statement_type: Statement type to match

        Returns:
            Tuple of (matching statements, found role, confidence score)
        """
        # Convert legacy types to registry types if needed
        if statement_type in self._legacy_to_registry:
            registry_type = self._legacy_to_registry[statement_type]
        else:
            registry_type = statement_type

        # Check if this is a known statement type
        if registry_type not in statement_registry:
            return [], None, 0.0

        # Get registry information
        registry_entry = statement_registry[registry_type]
        key_concepts = registry_entry.key_concepts

        # Content matching is only possible for types that declare key concepts.
        if not key_concepts:
            return [], None, 0.0

        # Get all statements
        all_statements = self.xbrl.get_all_statements()

        # Score each statement based on presence of key concepts
        statement_scores = []

        for stmt in all_statements:
            role = stmt.get('role', '')
            # Statements without a parsed presentation tree cannot be scored.
            if role not in self.xbrl.presentation_trees:
                continue

            # Get concept nodes for this role
            tree = self.xbrl.presentation_trees[role]
            all_nodes = set(tree.all_nodes.keys())

            # Count matching key concepts
            matches = 0
            total_weight = 0.0

            for concept in key_concepts:
                # Normalize concept name (colon-qualified -> underscore form)
                normalized = concept.replace(':', '_')

                if concept in all_nodes or normalized in all_nodes:
                    matches += 1
                    # Add weighting if available; unlisted concepts weigh 1.0.
                    concept_key = concept.split('_')[-1].lower()
                    weight = registry_entry.weight_map.get(concept_key, 1.0)
                    total_weight += weight

            # Calculate confidence score (weighted by presence of key concepts)
            if key_concepts:
                # Base confidence on percentage of key concepts found
                confidence = matches / len(key_concepts)

                # Apply weighting if available
                # NOTE(review): whenever at least one concept matched, this
                # weighted score replaces (not blends with) the ratio above —
                # confirm that override is intentional.
                if total_weight > 0:
                    confidence = min(total_weight / sum(registry_entry.weight_map.values()), 1.0)
            else:
                confidence = 0.0

            if confidence > 0:
                statement_scores.append((stmt, confidence))

        # Sort by confidence score, best first
        statement_scores.sort(key=lambda x: x[1], reverse=True)

        # Return best match if above threshold
        if statement_scores and statement_scores[0][1] >= 0.4:
            best_match, confidence = statement_scores[0]
            return [best_match], best_match['role'], min(confidence + 0.2, 0.85)  # Boost confidence but cap at 0.85

        return [], None, 0.0
|
||||
|
||||
def _match_by_standard_name(self, statement_type: str) -> Tuple[List[Dict[str, Any]], Optional[str], float]:
|
||||
"""
|
||||
Match statements by standard statement type name.
|
||||
|
||||
Args:
|
||||
statement_type: Statement type to match
|
||||
|
||||
Returns:
|
||||
Tuple of (matching statements, found role, confidence score)
|
||||
"""
|
||||
# Check if we have statements of this type
|
||||
if statement_type in self._statement_by_type:
|
||||
statements = self._statement_by_type[statement_type]
|
||||
if statements:
|
||||
return statements, statements[0]['role'], 0.95
|
||||
|
||||
return [], None, 0.0
|
||||
|
||||
def _match_by_role_definition(self, statement_type: str) -> Tuple[List[Dict[str, Any]], Optional[str], float]:
|
||||
"""
|
||||
Match statements by role definition text.
|
||||
|
||||
Args:
|
||||
statement_type: Statement type or definition text to match
|
||||
|
||||
Returns:
|
||||
Tuple of (matching statements, found role, confidence score)
|
||||
"""
|
||||
# Clean statement type for matching
|
||||
clean_type = statement_type.lower().replace(' ', '')
|
||||
|
||||
# Try exact match
|
||||
if clean_type in self._statement_by_role_def:
|
||||
statements = self._statement_by_role_def[clean_type]
|
||||
if statements:
|
||||
return statements, statements[0]['role'], 0.85
|
||||
|
||||
# Try partial match
|
||||
for def_key, statements in self._statement_by_role_def.items():
|
||||
if clean_type in def_key:
|
||||
return statements, statements[0]['role'], 0.65
|
||||
|
||||
if def_key in clean_type:
|
||||
return statements, statements[0]['role'], 0.55
|
||||
|
||||
return [], None, 0.0
|
||||
|
||||
def _get_best_guess(self, statement_type: str) -> Tuple[List[Dict[str, Any]], Optional[str], float]:
|
||||
"""
|
||||
Make a best guess when all other methods fail.
|
||||
|
||||
Args:
|
||||
statement_type: Statement type to guess
|
||||
|
||||
Returns:
|
||||
Tuple of (matching statements, found role, confidence score)
|
||||
"""
|
||||
# Try partial matching on role names
|
||||
clean_type = statement_type.lower()
|
||||
|
||||
for role_name, statements in self._statement_by_role_name.items():
|
||||
if clean_type in role_name or role_name in clean_type:
|
||||
return statements, statements[0]['role'], 0.4
|
||||
|
||||
# If we have statements of any type, return the first one with very low confidence
|
||||
all_statements = self.xbrl.get_all_statements()
|
||||
if all_statements:
|
||||
# Try to find a primary financial statement
|
||||
for stmt_type in ['BalanceSheet', 'IncomeStatement', 'CashFlowStatement']:
|
||||
if stmt_type in self._statement_by_type:
|
||||
statements = self._statement_by_type[stmt_type]
|
||||
if statements:
|
||||
return statements, statements[0]['role'], 0.2
|
||||
|
||||
# Last resort: return first statement
|
||||
return [all_statements[0]], all_statements[0]['role'], 0.1
|
||||
|
||||
return [], None, 0.0
|
||||
|
||||
def find_statement(self, statement_type: str, is_parenthetical: bool = False,
|
||||
category_filter: Optional[StatementCategory] = None) -> Tuple[List[Dict[str, Any]], Optional[str], str, float]:
|
||||
"""
|
||||
Find a statement by type, with multi-layered fallback approach.
|
||||
|
||||
Args:
|
||||
statement_type: Statement type or identifier
|
||||
is_parenthetical: Whether to look for parenthetical version
|
||||
category_filter: Optional filter to only match statements of a specific category
|
||||
|
||||
Returns:
|
||||
Tuple of (matching_statements, found_role, canonical_statement_type, confidence_score)
|
||||
|
||||
Note:
|
||||
For standard statement types like "BalanceSheet", "IncomeStatement", etc., the
|
||||
canonical_statement_type will be the input statement_type, allowing downstream
|
||||
code to still recognize and apply type-specific logic.
|
||||
"""
|
||||
# Check cache first
|
||||
category_key = str(category_filter.value) if category_filter else "None"
|
||||
cache_key = f"{statement_type}_{is_parenthetical}_{category_key}"
|
||||
if cache_key in self._cache:
|
||||
return self._cache[cache_key]
|
||||
|
||||
# If this is a role URI we already know, return immediately
|
||||
if statement_type in self._statement_by_role_uri:
|
||||
stmt = self._statement_by_role_uri[statement_type]
|
||||
|
||||
# Apply category filter if specified
|
||||
if category_filter:
|
||||
# Get category from statement or determine based on type
|
||||
stmt_category = None
|
||||
if 'category' in stmt and stmt['category']:
|
||||
stmt_category = stmt['category']
|
||||
elif stmt['type'] in statement_registry:
|
||||
stmt_category = statement_registry[stmt['type']].category.value
|
||||
|
||||
# Skip if category doesn't match
|
||||
if stmt_category != category_filter.value:
|
||||
result = ([], None, statement_type, 0.0)
|
||||
self._cache[cache_key] = result
|
||||
return result
|
||||
|
||||
result = ([stmt], statement_type, stmt.get('type', statement_type), 1.0)
|
||||
self._cache[cache_key] = result
|
||||
return result
|
||||
|
||||
# Check if this is a canonical statement type from the registry
|
||||
is_canonical_type = statement_type in statement_registry
|
||||
|
||||
# Try standard name matching first (exact type match)
|
||||
match = self._match_by_standard_name(statement_type)
|
||||
if match[0] and match[2] > 0.9: # Very high confidence
|
||||
statements, role, conf = match
|
||||
# For canonical types, preserve the original statement_type
|
||||
canonical_type = statement_type if is_canonical_type else statements[0].get('type', statement_type)
|
||||
result = (statements, role, canonical_type, conf)
|
||||
self._cache[cache_key] = result
|
||||
return result
|
||||
|
||||
# Try primary concept matching
|
||||
match = self._match_by_primary_concept(statement_type, is_parenthetical)
|
||||
if match[0] and match[2] > 0.8: # High confidence
|
||||
statements, role, conf = match
|
||||
# For canonical types, preserve the original statement_type
|
||||
canonical_type = statement_type if is_canonical_type else statements[0].get('type', statement_type)
|
||||
result = (statements, role, canonical_type, conf)
|
||||
self._cache[cache_key] = result
|
||||
return result
|
||||
|
||||
# Try custom namespace matching
|
||||
match = self._match_by_concept_pattern(statement_type, is_parenthetical)
|
||||
if match[0] and match[2] > 0.8: # High confidence
|
||||
statements, role, conf = match
|
||||
# For canonical types, preserve the original statement_type
|
||||
canonical_type = statement_type if is_canonical_type else statements[0].get('type', statement_type)
|
||||
result = (statements, role, canonical_type, conf)
|
||||
self._cache[cache_key] = result
|
||||
return result
|
||||
|
||||
# Try role pattern matching
|
||||
match = self._match_by_role_pattern(statement_type, is_parenthetical)
|
||||
if match[0] and match[2] > 0.7: # Good confidence
|
||||
statements, role, conf = match
|
||||
# For canonical types, preserve the original statement_type
|
||||
canonical_type = statement_type if is_canonical_type else statements[0].get('type', statement_type)
|
||||
result = (statements, role, canonical_type, conf)
|
||||
self._cache[cache_key] = result
|
||||
return result
|
||||
|
||||
# Try content-based analysis
|
||||
match = self._match_by_content(statement_type)
|
||||
if match[0] and match[2] > 0.6: # Moderate confidence
|
||||
statements, role, conf = match
|
||||
# For canonical types, preserve the original statement_type
|
||||
canonical_type = statement_type if is_canonical_type else statements[0].get('type', statement_type)
|
||||
result = (statements, role, canonical_type, conf)
|
||||
self._cache[cache_key] = result
|
||||
return result
|
||||
|
||||
# Try role definition matching
|
||||
match = self._match_by_role_definition(statement_type)
|
||||
if match[0] and match[2] > 0.5: # Lower confidence but still useful
|
||||
statements, role, conf = match
|
||||
# For canonical types, preserve the original statement_type
|
||||
canonical_type = statement_type if is_canonical_type else statements[0].get('type', statement_type)
|
||||
result = (statements, role, canonical_type, conf)
|
||||
self._cache[cache_key] = result
|
||||
return result
|
||||
|
||||
# No good match found, return best guess with low confidence
|
||||
statements, role, conf = self._get_best_guess(statement_type)
|
||||
if conf < 0.4:
|
||||
# Get entity context for detailed error reporting
|
||||
entity_name = getattr(self.xbrl, 'entity_name', 'Unknown')
|
||||
cik = getattr(self.xbrl, 'cik', 'Unknown')
|
||||
period_of_report = getattr(self.xbrl, 'period_of_report', 'Unknown')
|
||||
|
||||
if len(statements) == 0:
|
||||
raise StatementNotFound(
|
||||
statement_type=statement_type,
|
||||
confidence=conf,
|
||||
found_statements=[],
|
||||
entity_name=entity_name,
|
||||
cik=cik,
|
||||
period_of_report=period_of_report,
|
||||
reason="No statements available in XBRL data"
|
||||
)
|
||||
elif conf < 0.3:
|
||||
found_statements = [s['definition'] for s in statements]
|
||||
raise StatementNotFound(
|
||||
statement_type=statement_type,
|
||||
confidence=conf,
|
||||
found_statements=found_statements,
|
||||
entity_name=entity_name,
|
||||
cik=cik,
|
||||
period_of_report=period_of_report,
|
||||
reason="Confidence threshold not met"
|
||||
)
|
||||
else:
|
||||
log.warn(
|
||||
f"No good match found for statement type '{statement_type}'. The best guess has low confidence: {conf:.2f}")
|
||||
if statements:
|
||||
# For canonical types, preserve the original statement_type
|
||||
canonical_type = statement_type if is_canonical_type else statements[0].get('type', statement_type)
|
||||
result = (statements, role, canonical_type, conf)
|
||||
else:
|
||||
result = ([], None, statement_type, 0.0)
|
||||
|
||||
self._cache[cache_key] = result
|
||||
return result
|
||||
1514
venv/lib/python3.10/site-packages/edgar/xbrl/statements.py
Normal file
1514
venv/lib/python3.10/site-packages/edgar/xbrl/statements.py
Normal file
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,27 @@
|
||||
"""
|
||||
XBRL Statement Stitching Package
|
||||
|
||||
This package provides functionality to combine multiple XBRL statements
|
||||
across different time periods into a unified view, handling concept
|
||||
consistency issues and normalizing data representation.
|
||||
"""
|
||||
|
||||
# Import standardize_statement for backwards compatibility with tests
|
||||
from edgar.xbrl.standardization import standardize_statement
|
||||
from edgar.xbrl.stitching.core import StatementStitcher, stitch_statements
|
||||
from edgar.xbrl.stitching.periods import determine_optimal_periods
|
||||
from edgar.xbrl.stitching.query import StitchedFactQuery, StitchedFactsView
|
||||
from edgar.xbrl.stitching.utils import render_stitched_statement, to_pandas
|
||||
from edgar.xbrl.stitching.xbrls import XBRLS
|
||||
|
||||
__all__ = [
|
||||
'XBRLS',
|
||||
'StatementStitcher',
|
||||
'stitch_statements',
|
||||
'determine_optimal_periods',
|
||||
'render_stitched_statement',
|
||||
'to_pandas',
|
||||
'standardize_statement',
|
||||
'StitchedFactsView',
|
||||
'StitchedFactQuery'
|
||||
]
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
621
venv/lib/python3.10/site-packages/edgar/xbrl/stitching/core.py
Normal file
621
venv/lib/python3.10/site-packages/edgar/xbrl/stitching/core.py
Normal file
@@ -0,0 +1,621 @@
|
||||
"""
|
||||
XBRL Statement Stitching - Core Functionality
|
||||
|
||||
This module contains the core StatementStitcher class and related functionality
|
||||
for combining multiple XBRL statements across different time periods.
|
||||
"""
|
||||
|
||||
from collections import defaultdict
|
||||
from datetime import datetime
|
||||
from enum import Enum
|
||||
from typing import Any, Dict, List, Optional, Set, Tuple, Union
|
||||
|
||||
from edgar.xbrl.core import format_date, parse_date
|
||||
from edgar.xbrl.standardization import ConceptMapper, initialize_default_mappings, standardize_statement
|
||||
from edgar.xbrl.stitching.ordering import StatementOrderingManager
|
||||
from edgar.xbrl.stitching.periods import determine_optimal_periods
|
||||
from edgar.xbrl.stitching.presentation import VirtualPresentationTree
|
||||
|
||||
|
||||
class StatementStitcher:
    """
    Combines multiple statements across time periods into a unified view.

    This class handles the complexities of combining financial statements
    from different periods, including:
    - Normalizing concepts that change over time
    - Aligning periods correctly
    - Handling missing data points
    - Providing both standardized and company-specific views
    """

    class PeriodType(str, Enum):
        """Types of period views available for stitched statements"""
        RECENT_PERIODS = "Most Recent Periods"
        RECENT_YEARS = "Recent Years"
        THREE_YEAR_COMPARISON = "Three-Year Comparison"
        THREE_QUARTERS = "Three Recent Quarters"
        ANNUAL_COMPARISON = "Annual Comparison"
        QUARTERLY_TREND = "Quarterly Trend"
        ALL_PERIODS = "All Available Periods"

    def __init__(self, concept_mapper: Optional[ConceptMapper] = None):
        """
        Initialize a StatementStitcher instance.

        Args:
            concept_mapper: Optional ConceptMapper for standardizing concepts.
                If None, a default mapper is created.
        """
        if concept_mapper is None:
            # No mapper supplied: build one from the default mapping store.
            self.mapping_store = initialize_default_mappings()
            self.concept_mapper = ConceptMapper(self.mapping_store)
        else:
            # Reuse the caller's mapper and expose its store for convenience.
            self.concept_mapper = concept_mapper
            self.mapping_store = concept_mapper.mapping_store

        # Initialize data structures (reset again at the start of each stitch)
        self.periods = []  # Ordered list of period identifiers
        self.period_dates = {}  # Maps period ID to display dates
        self.data = defaultdict(dict)  # {concept: {period: value}}
        self.concept_metadata = {}  # Metadata for each concept (level, etc.)
        self.ordering_manager = None  # Will be initialized during stitching
        self.original_statement_order = []  # Track original order for hierarchy context
|
||||
|
||||
    def stitch_statements(
        self,
        statements: List[Dict[str, Any]],
        period_type: Union[PeriodType, str] = PeriodType.RECENT_PERIODS,
        max_periods: Optional[int] = None,
        standard: bool = True
    ) -> Dict[str, Any]:
        """
        Stitch multiple statements into a unified view.

        Args:
            statements: List of statement data from different filings,
                ordered most-recent first (index 0 is used as the reference
                for statement ordering and hierarchy context)
            period_type: Type of period view to generate
            max_periods: Maximum number of periods to include; defaults to
                len(statements) + 2 when not provided
            standard: Whether to use standardized concept labels

        Returns:
            Dictionary with stitched statement data
        """
        # Reset state so each call starts from a clean slate.
        self.periods = []
        self.period_dates = {}
        self.data = defaultdict(dict)
        self.concept_metadata = {}
        self.original_statement_order = []

        # Initialize ordering manager for this statement type
        statement_type = statements[0].get('statement_type', 'IncomeStatement') if statements else 'IncomeStatement'
        self.ordering_manager = StatementOrderingManager(statement_type)

        # Capture original statement order from the most recent (first) statement for hierarchy context
        if statements:
            reference_statement = statements[0]
            self.original_statement_order = []
            for item in reference_statement.get('data', []):
                concept = item.get('concept')
                label = item.get('label')
                # Record both concept ids and labels so downstream ordering
                # can resolve either form.
                if concept:
                    self.original_statement_order.append(concept)
                if label and label not in self.original_statement_order:
                    self.original_statement_order.append(label)

        # Extract and sort all periods (newest first, de-duplicated)
        all_periods = self._extract_periods(statements)

        # Set max_periods if not provided
        max_periods = max_periods or len(statements) + 2  # Allow for the last statement to have 3 periods

        # Select appropriate periods based on period_type
        selected_periods = self._select_periods(all_periods, period_type, max_periods)
        self.periods = selected_periods

        # Process each statement
        for _i, statement in enumerate(statements):
            # Only process statements that have periods in our selection
            statement_periods = set(statement['periods'].keys())
            relevant_periods = statement_periods.intersection(set(selected_periods))

            if not relevant_periods:
                continue

            # Standardize the statement if needed
            if standard:
                processed_data = self._standardize_statement_data(statement)
            else:
                processed_data = statement['data']

            # Store data for each item
            self._integrate_statement_data(processed_data, statement['periods'], relevant_periods)

        # Format the stitched data
        return self._format_output_with_ordering(statements)
|
||||
|
||||
    def _extract_periods(self, statements: List[Dict[str, Any]]) -> List[Tuple[str, datetime]]:
        """
        Extract and sort all periods from the statements, de-duplicating periods with the same date.

        Period IDs are expected in the forms ``instant_<date>`` or
        ``duration_<start>_<end>``; malformed or unparseable IDs are skipped.
        As a side effect, self.period_dates is populated with a display date
        for every retained period ID.

        Args:
            statements: List of statement data, ordered most-recent filing first
                (lower index wins when two filings report the same period)

        Returns:
            List of (period_id, end_date) tuples, sorted by date (newest first)
        """
        # Use a dictionary to track unique periods by their end date
        # This will handle cases where different period_ids reference the same date
        unique_periods = {}  # key: date string, value: (period_id, datetime, statement_index)

        for i, statement in enumerate(statements):
            # Use statement index (i) to prioritize more recent filings
            # Lower index = more recent filing
            for period_id, period_info in statement['periods'].items():
                # Extract end date for sorting
                try:
                    # Initialize normalized_key to silence the type checker
                    normalized_key = ""

                    if period_id.startswith('instant_'):
                        date_str = period_id.split('_')[1]
                        # Format the date consistently with single statements
                        try:
                            date_obj = parse_date(date_str)
                            display_date = format_date(date_obj)
                        except ValueError:
                            # Fall back to original label if parsing fails
                            display_date = period_info['label']
                        period_type = 'instant'
                        # For instant periods, create a normalized key with just the date
                        normalized_key = f"{period_type}_{date_str}"
                    else:  # duration
                        # For durations, extract both start and end dates
                        parts = period_id.split('_')
                        if len(parts) >= 3:
                            start_date_str = parts[1]
                            end_date_str = parts[2]
                            start_date = parse_date(start_date_str)
                            end_date = parse_date(end_date_str)
                            date_str = end_date_str  # Use end date for sorting

                            # Format end date consistently - for stitched statements,
                            # we only need the end date for duration periods as that's what users compare
                            display_date = format_date(end_date)
                            period_type = 'duration'
                            # Create a normalized key that combines period type, start date, and end date
                            normalized_key = f"{period_type}_{format_date(start_date)}_{format_date(end_date)}"
                        else:
                            # Skip malformed period IDs
                            continue

                    # Parse the end date for sorting
                    end_date = parse_date(date_str)

                    # Check if we already have this period (by normalized key)
                    if normalized_key in unique_periods:
                        existing_idx = unique_periods[normalized_key][2]
                        # Only replace if this statement is from a more recent filing
                        if i < existing_idx:
                            unique_periods[normalized_key] = (period_id, end_date, i)
                            self.period_dates[period_id] = display_date
                    else:
                        # Add new period
                        unique_periods[normalized_key] = (period_id, end_date, i)
                        self.period_dates[period_id] = display_date

                except (ValueError, TypeError, IndexError):
                    # Skip periods with invalid dates
                    continue

        # Extract and sort the unique periods
        all_periods = [(period_id, end_date) for period_id, end_date, _ in unique_periods.values()]

        # Sort by date, newest first
        return sorted(all_periods, key=lambda x: x[1], reverse=True)
|
||||
|
||||
def _select_periods(
    self,
    all_periods: List[Tuple[str, Union[str, datetime]]],
    period_type: Union[PeriodType, str],
    max_periods: int
) -> List[str]:
    """
    Select appropriate periods based on period_type.

    Args:
        all_periods: List of (period_id, end_date) tuples, ordered newest first
        period_type: Type of period view to generate
        max_periods: Maximum number of periods to include

    Returns:
        List of selected period IDs
    """
    if isinstance(period_type, str):
        try:
            period_type = StatementStitcher.PeriodType(period_type)
        except ValueError:
            # Default to recent periods if string doesn't match enum
            period_type = StatementStitcher.PeriodType.RECENT_PERIODS

    # Split into instant periods (balance-sheet style) and duration periods
    instants = [(pid, date) for pid, date in all_periods if pid.startswith('instant_')]
    durations = [(pid, date) for pid, date in all_periods if not pid.startswith('instant_')]

    def _durations_of_length(min_days: int, max_days: int) -> List[str]:
        """Collect up to max_periods duration period IDs whose span is within [min_days, max_days] days."""
        selected = []
        for pid, _date in durations:
            if not pid.startswith('duration_'):
                continue
            # Period IDs look like 'duration_<start>_<end>'; malformed IDs are skipped
            parts = pid.split('_')
            try:
                span = (parse_date(parts[2]) - parse_date(parts[1])).days
            except (ValueError, TypeError, IndexError):
                continue
            if min_days <= span <= max_days:
                selected.append(pid)
                if len(selected) >= max_periods:
                    break
        return selected

    if period_type == StatementStitcher.PeriodType.THREE_YEAR_COMPARISON:
        # For balance sheets, keep at most one year-end instant per calendar year
        year_ends = []
        years_seen = set()
        for pid, date in instants:
            # NOTE(review): `date` comes from all_periods tuples; assumes parse_date
            # accepts whatever type was stored there (str or datetime) — confirm.
            year = parse_date(date).year
            if year not in years_seen and len(year_ends) < max_periods:
                year_ends.append(pid)
                years_seen.add(year)
        return year_ends

    if period_type == StatementStitcher.PeriodType.THREE_QUARTERS:
        # Quarterly income-statement periods are assumed to span roughly 90 days
        return _durations_of_length(80, 95)

    if period_type == StatementStitcher.PeriodType.ANNUAL_COMPARISON:
        # Annual income-statement periods are assumed to span roughly 365 days
        return _durations_of_length(350, 380)

    # RECENT_PERIODS, ALL_PERIODS, and any unrecognized value:
    # take the most recent periods up to max_periods (input is newest first)
    return [pid for pid, _ in all_periods[:max_periods]]
|
||||
|
||||
def _standardize_statement_data(self, statement: Dict[str, Any]) -> List[Dict[str, Any]]:
    """
    Standardize the statement data using the concept mapper.

    Args:
        statement: Statement data

    Returns:
        Standardized statement data
    """
    # Tag every line item with the statement type so the mapper has
    # context for better concept mapping
    items = statement['data']
    stmt_type = statement.get('statement_type', '')
    for line_item in items:
        line_item['statement_type'] = stmt_type

    # Delegate the actual label standardization to the shared concept mapper
    return standardize_statement(items, self.concept_mapper)
|
||||
|
||||
def _integrate_statement_data(
    self,
    statement_data: List[Dict[str, Any]],
    period_map: Dict[str, Dict[str, str]],
    relevant_periods: Set[str]
) -> None:
    """
    Integrate statement data from one statement into the stitched view.

    Mutates self.data and self.concept_metadata in place.

    Args:
        statement_data: Statement data (list of line-item dicts with
            'concept', 'label', 'values', 'decimals', ...)
        period_map: Map of period IDs to period information
            (NOTE: not referenced in this method body)
        relevant_periods: Set of periods from this statement to include
    """
    # Map to track concepts by their underlying concept ID, not just label.
    # This helps merge rows that represent the same concept but have
    # different labels across filings.
    concept_to_label_map = {}

    for item in statement_data:
        concept = item.get('concept')
        label = item.get('label')

        # Skip items without concept or label
        if not concept or not label:
            continue

        # Skip abstract items with no children (headers without data)
        if item.get('is_abstract', False) and not item.get('children'):
            continue

        # Skip dimension items (axis/member/table rows carry no line-item data)
        if any(bracket in label for bracket in ['[Axis]', '[Domain]', '[Member]', '[Line Items]', '[Table]', '[Abstract]']):
            continue

        # Use concept as the primary key for identifying the same financial
        # line item — more reliable than labels, which vary across filings.
        # If we've already seen this concept, reuse the existing label key so
        # rows representing the same concept merge into one.
        if concept in concept_to_label_map:
            concept_key = concept_to_label_map[concept]
        else:
            # For a new concept, the current label becomes the row key
            concept_key = label
            # Remember this mapping for future occurrences
            concept_to_label_map[concept] = concept_key

        # Store metadata about the concept (level, abstract status, etc.).
        # For an already-seen concept, only refresh metadata when this
        # statement covers a more recent period, so labels come from the
        # most recent filing.
        if concept_key not in self.concept_metadata:
            self.concept_metadata[concept_key] = {
                'level': item.get('level', 0),
                'is_abstract': item.get('is_abstract', False),
                # Treat anything labelled "...total..." as a total row
                'is_total': item.get('is_total', False) or 'total' in label.lower(),
                'original_concept': concept,
                'latest_label': label  # Store the original label too
            }
        else:
            # Recency is determined by position in self.periods
            # (earlier indices are more recent periods).

            # Find the periods in this statement that were selected for output
            statement_periods = [p for p in relevant_periods if p in self.periods]
            if statement_periods:
                # Get the most recent period in this statement
                most_recent_period = min(statement_periods, key=lambda p: self.periods.index(p))
                most_recent_idx = self.periods.index(most_recent_period)

                # Find the earliest (most recent index) period where we
                # already hold data for this concept.
                # NOTE(review): self.data[concept_key] is read before any
                # assignment here — assumes self.data is a defaultdict; confirm.
                existing_periods = [p for p in self.data[concept_key].keys() if p in self.periods]
                if existing_periods:
                    earliest_existing_idx = min(self.periods.index(p) for p in existing_periods)

                    # If this statement has more recent data, adopt its label
                    if most_recent_idx < earliest_existing_idx:
                        new_concept_key = label

                        # If the label changes, migrate existing data to the new key
                        if new_concept_key != concept_key:
                            # Copy existing data to the new key
                            # NOTE(review): the old concept_key entries in
                            # self.data / self.concept_metadata are not removed
                            # after migration — presumably filtered downstream;
                            # confirm no duplicate rows result.
                            if new_concept_key not in self.data:
                                self.data[new_concept_key] = self.data[concept_key].copy()

                            # Update metadata under the new key
                            self.concept_metadata[new_concept_key] = self.concept_metadata[concept_key].copy()
                            self.concept_metadata[new_concept_key]['latest_label'] = label

                            # Route future occurrences of this concept to the new key
                            concept_to_label_map[concept] = new_concept_key
                            concept_key = new_concept_key
                        else:
                            # Same label — just refresh the latest label
                            self.concept_metadata[concept_key]['latest_label'] = label

        # Store values for relevant periods
        for period_id in relevant_periods:
            if period_id in self.periods:  # Only include selected periods
                value = item.get('values', {}).get(period_id)
                if value is not None:
                    self.data[concept_key][period_id] = {
                        'value': value,
                        'decimals': item.get('decimals', {}).get(period_id, 0)
                    }
|
||||
|
||||
def _format_output_with_ordering(self, statements: List[Dict[str, Any]]) -> Dict[str, Any]:
    """
    Format the stitched data for rendering with intelligent ordering using virtual presentation tree.

    Args:
        statements: Original statements for ordering reference

    Returns:
        Stitched statement data in the expected format
    """
    # Ask the ordering manager (when present) for a unified concept ordering
    ordering = self.ordering_manager.determine_ordering(statements) if self.ordering_manager else {}

    # The virtual presentation tree preserves hierarchy while applying
    # semantic ordering across filings
    tree = VirtualPresentationTree(self.ordering_manager)
    nodes = tree.build_tree(
        concept_metadata=self.concept_metadata,
        concept_ordering=ordering,
        original_statement_order=self.original_statement_order
    )

    output: Dict[str, Any] = {
        'periods': [(pid, self.period_dates.get(pid, pid)) for pid in self.periods],
        'statement_data': []
    }

    for node in nodes:
        concept, meta = node.concept, node.metadata

        # Gather this concept's values/decimals for each selected period
        values = {}
        decimals = {}
        per_period = self.data[concept]
        for pid in self.periods:
            if pid in per_period:
                values[pid] = per_period[pid]['value']
                decimals[pid] = per_period[pid]['decimals']

        row = {
            # Prefer the most recent label; fall back to the concept key itself
            'label': meta.get('latest_label', concept),
            'level': meta['level'],
            'is_abstract': meta['is_abstract'],
            'is_total': meta['is_total'],
            'concept': meta['original_concept'],
            'values': values,
            'decimals': decimals,
            'has_values': bool(values)
        }

        # Keep rows that carry data, plus abstract header rows
        if row['has_values'] or row['is_abstract']:
            output['statement_data'].append(row)

    return output
|
||||
|
||||
def _format_output(self) -> Dict[str, Any]:
    """
    Backward compatibility method - calls the new ordering-aware method.

    Returns:
        Stitched statement data in the expected format
    """
    # Passing no reference statements yields the legacy (alphabetical)
    # ordering behaviour this method originally provided.
    return self._format_output_with_ordering(statements=[])
|
||||
|
||||
|
||||
def stitch_statements(
    xbrl_list: List[Any],
    statement_type: str = 'IncomeStatement',
    period_type: Union[StatementStitcher.PeriodType, str] = StatementStitcher.PeriodType.RECENT_PERIODS,
    max_periods: int = 3,
    standard: bool = True,
    use_optimal_periods: bool = True,
    include_dimensions: bool = False
) -> Dict[str, Any]:
    """
    Stitch together statements from multiple XBRL objects.

    Args:
        xbrl_list: List of XBRL objects, should be from the same company and ordered by date
        statement_type: Type of statement to stitch ('IncomeStatement', 'BalanceSheet', etc.)
        period_type: Type of period view to generate
        max_periods: Maximum number of periods to include (default: 3)
        standard: Whether to use standardized concept labels (default: True)
        use_optimal_periods: Whether to use the entity info to determine optimal periods (default: True)
        include_dimensions: Whether to include dimensional segment data (default: False for stitching)

    Returns:
        Stitched statement data
    """

    def _make_period_label(metadata: Dict[str, Any]) -> str:
        """Build a display label like 'FY 2023' or 'Q2 2023' from period metadata."""
        display_date = metadata['display_date']
        fiscal_period = metadata.get('fiscal_period')
        if metadata['period_type'] == 'instant':
            return f"FY {display_date}" if fiscal_period == 'FY' else display_date
        # Duration periods: add fiscal quarter/year info if available
        if fiscal_period == 'FY':
            return f"FY {display_date}"
        if fiscal_period in ['Q1', 'Q2', 'Q3', 'Q4']:
            return f"{fiscal_period} {display_date}"
        return display_date

    # Initialize the stitcher
    stitcher = StatementStitcher()

    # Collect statements of the specified type from each XBRL object
    statements = []

    if use_optimal_periods:
        # Use the entity info to determine the best period from each filing
        optimal_periods = determine_optimal_periods(xbrl_list, statement_type, max_periods=max_periods)

        # Limit to max_periods if needed
        if len(optimal_periods) > max_periods:
            optimal_periods = optimal_periods[:max_periods]

        # Extract the XBRL objects that contain our optimal periods
        for period_metadata in optimal_periods:
            xbrl = xbrl_list[period_metadata['xbrl_index']]
            statement = xbrl.get_statement_by_type(statement_type, include_dimensions=include_dimensions)
            if not statement:
                continue

            # Only include the specific period from this statement
            period_key = period_metadata['period_key']
            if period_key not in statement['periods']:
                continue

            # Keep just the selected period, relabelled with fiscal info.
            # BUGFIX: the label is computed in a helper so the `period_type`
            # *parameter* is no longer clobbered inside this loop — previously
            # it was overwritten with 'instant'/'duration', silently changing
            # the stitcher's period selection below.
            statements.append({
                'role': statement['role'],
                'definition': statement['definition'],
                'statement_type': statement['statement_type'],
                'periods': {
                    period_key: {
                        'label': _make_period_label(period_metadata),
                        'original_label': statement['periods'][period_key]['label']
                    }
                },
                'data': statement['data']
            })
    else:
        # Traditional approach without using entity info
        for xbrl in xbrl_list:
            # Get statement data for the specified type
            statement = xbrl.find_statement(statement_type)
            if statement:
                statements.append(statement)

    # Stitch the statements
    return stitcher.stitch_statements(statements, period_type, max_periods, standard)
|
||||
@@ -0,0 +1,833 @@
|
||||
"""
|
||||
XBRL Statement Ordering - Intelligent Ordering for Multi-Period Statements
|
||||
|
||||
This module provides consistent ordering for financial statements across multiple periods
|
||||
by combining template-based, reference-based, and semantic positioning strategies.
|
||||
"""
|
||||
|
||||
import re
|
||||
from enum import Enum
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
|
||||
try:
    # Prefer rapidfuzz when installed; it exposes the same ratio() API
    from rapidfuzz import fuzz
except ImportError:
    # Fallback to difflib if rapidfuzz is not available
    from difflib import SequenceMatcher

    class fuzz:
        """Minimal difflib-backed stand-in exposing rapidfuzz's ``ratio`` API."""

        @staticmethod
        def ratio(s1: str, s2: str) -> float:
            # difflib reports similarity in 0..1; rapidfuzz uses 0..100
            matcher = SequenceMatcher(a=s1, b=s2)
            return matcher.ratio() * 100
|
||||
|
||||
|
||||
class StatementType(str, Enum):
    """Supported statement types for ordering.

    Mixes in ``str`` so members compare equal to the plain statement-type
    strings used elsewhere in the stitching code (e.g. ``"IncomeStatement"``).
    """
    INCOME_STATEMENT = "IncomeStatement"
    BALANCE_SHEET = "BalanceSheet"
    CASH_FLOW = "CashFlowStatement"
    EQUITY = "StatementOfEquity"
|
||||
|
||||
|
||||
class FinancialStatementTemplates:
    """Canonical ordering templates for financial statements based on XBRL concepts.

    Each template is a list of ``(base_position, section_name, concepts)``
    tuples; an item's position is ``base_position + index`` of its first
    matching concept within the section.
    """

    INCOME_STATEMENT_TEMPLATE = [
        # Revenue Section (0-99)
        (0, "revenue_section", [
            # Product/Service Revenue Components
            "us-gaap:SalesRevenueGoodsNet",
            "us-gaap:ProductSales",
            "us-gaap:SalesRevenueServicesNet",
            "us-gaap:SubscriptionRevenue",
            # Contract Revenue
            "us-gaap:RevenueFromContractWithCustomerExcludingAssessedTax",
            "us-gaap:RevenueFromContractWithCustomerIncludingAssessedTax",
            # Total Revenue
            "us-gaap:Revenue",
            "us-gaap:Revenues",
            "us-gaap:SalesRevenueNet",
            "us-gaap:OperatingRevenue"
        ]),

        # Cost Section (100-199)
        (100, "cost_section", [
            "us-gaap:CostOfRevenueAbstract",  # Abstract
            "us-gaap:CostOfRevenue",  # Total
            "us-gaap:CostOfGoodsSold",
            "us-gaap:CostOfGoodsAndServicesSold",
            "us-gaap:CostOfSales",
            "us-gaap:DirectOperatingCosts",
            "us-gaap:CostsAndExpenses"
        ]),

        # Gross Profit (200-299)
        (200, "gross_profit", [
            "us-gaap:GrossProfit"
        ]),

        # Operating Expenses (300-399)
        (300, "operating_expenses", [
            # R&D Expenses
            "us-gaap:ResearchAndDevelopmentCosts",
            "us-gaap:ResearchAndDevelopmentExpense",
            # SG&A Expenses
            "us-gaap:SellingGeneralAndAdministrativeExpense",
            "us-gaap:GeneralAndAdministrativeExpense",
            "us-gaap:AdministrativeExpense",
            "us-gaap:SellingAndMarketingExpense",
            "us-gaap:SellingExpense",
            "us-gaap:MarketingExpense",
            "us-gaap:AdvertisingExpense",
            # Total Operating Expenses
            "us-gaap:NoninterestExpense",
            "us-gaap:OperatingCostsAndExpenses",
            "us-gaap:OperatingExpenses"
        ]),

        # Operating Income (400-499)
        (400, "operating_income", [
            "us-gaap:OperatingIncomeLoss",
            "us-gaap:OperatingIncome",
            "us-gaap:IncomeLossFromContinuingOperationsBeforeInterestAndTaxes"
        ]),

        # Non-Operating (500-599)
        (500, "non_operating", [
            "us-gaap:InterestIncomeExpenseNet",
            "us-gaap:InterestAndDebtExpense",
            "us-gaap:InterestExpense",
            "us-gaap:InterestExpenseNonoperating",  # ADBE uses this for non-operating interest expense
            "us-gaap:InterestIncome",
            "us-gaap:InvestmentIncomeInterest",  # NVIDIA uses this variant
            "us-gaap:OtherNonoperatingIncomeExpense",
            "us-gaap:NonoperatingIncomeExpense",
            "orcl:NonoperatingIncomeExpenseIncludingEliminationOfNetIncomeLossAttributableToNoncontrollingInterests"
        ]),

        # Pre-Tax Income (600-699)
        (600, "pretax_income", [
            "us-gaap:IncomeLossBeforeIncomeTaxes",
            "us-gaap:IncomeLossFromContinuingOperationsBeforeIncomeTaxes",
            "us-gaap:IncomeLossFromContinuingOperationsBeforeIncomeTaxesExtraordinaryItemsNoncontrollingInterest",
            "orcl:IncomeLossFromContinuingOperationsIncludingNoncontrollingInterestBeforeIncomeTaxesExtraordinaryItems"
        ]),

        # Tax (700-799)
        (700, "tax", [
            "us-gaap:IncomeTaxesPaidNet",
            "us-gaap:IncomeTaxExpenseBenefit"
        ]),

        # Net Income (800-899)
        (800, "net_income", [
            "us-gaap:IncomeLossFromContinuingOperationsIncludingPortionAttributableToNoncontrollingInterest",
            "us-gaap:IncomeLossFromContinuingOperations",
            "us-gaap:NetIncome",
            "us-gaap:NetIncomeLoss",
            "us-gaap:ProfitLoss",
            "us-gaap:NetIncomeLossAttributableToNonredeemableNoncontrollingInterest",
            "us-gaap:NetIncomeLossAttributableToNoncontrollingInterest"
        ]),

        # Per Share Data (900-999)
        (900, "per_share", [
            "us-gaap:EarningsPerShareAbstract",
            "us-gaap:EarningsPerShareBasic",
            "us-gaap:EarningsPerShareDiluted",
            "us-gaap:WeightedAverageNumberOfSharesOutstandingAbstract",
            "us-gaap:WeightedAverageNumberOfSharesOutstandingBasic",
            "us-gaap:WeightedAverageNumberOfDilutedSharesOutstanding"
        ])
    ]

    # Balance-sheet sections are keyed by display label rather than concept
    BALANCE_SHEET_TEMPLATE = [
        # Current Assets (0-199)
        (0, "current_assets", [
            "Cash and Cash Equivalents",
            "Cash",
            "Short-term Investments",
            "Marketable Securities",
            "Accounts Receivable",
            "Trade Receivables",
            "Inventory",
            "Prepaid Expenses",
            "Other Current Assets",
            "Total Current Assets"
        ]),

        # Non-Current Assets (200-399)
        (200, "noncurrent_assets", [
            "Property, Plant and Equipment",
            "Property and Equipment",
            "Long-term Investments",
            "Goodwill",
            "Intangible Assets",
            "Other Non-current Assets",
            "Total Non-current Assets",
            "Total Assets"
        ]),

        # Current Liabilities (400-599)
        (400, "current_liabilities", [
            "Accounts Payable",
            "Trade Payables",
            "Accrued Liabilities",
            "Accrued Expenses",
            "Short-term Debt",
            "Current Portion of Long-term Debt",
            "Other Current Liabilities",
            "Total Current Liabilities"
        ]),

        # Non-Current Liabilities (600-799)
        (600, "noncurrent_liabilities", [
            "Long-term Debt",
            "Deferred Revenue",
            "Deferred Tax Liabilities",
            "Other Non-current Liabilities",
            "Total Non-current Liabilities",
            "Total Liabilities"
        ]),

        # Equity (800-999)
        (800, "equity", [
            "Common Stock",
            "Additional Paid-in Capital",
            "Retained Earnings",
            "Accumulated Other Comprehensive Income",
            "Treasury Stock",
            "Total Stockholders' Equity",
            "Total Shareholders' Equity",
            "Total Equity"
        ])
    ]

    def get_template_position(self, item_concept: str, item_label: str, statement_type: str) -> Optional[float]:
        """
        Get template position for an item, prioritizing concept-based matching over label matching.

        Args:
            item_concept: The XBRL concept (e.g., "us-gaap:Revenue")
            item_label: The display label (e.g., "Contract Revenue")
            statement_type: Type of statement ("IncomeStatement", "BalanceSheet", etc.)

        Returns:
            Float position in template, or None if no match found
        """
        # Handle different statement type formats
        if statement_type == "IncomeStatement":
            template_name = "INCOME_STATEMENT_TEMPLATE"
        elif statement_type == "BalanceSheet":
            template_name = "BALANCE_SHEET_TEMPLATE"
        else:
            template_name = f"{statement_type.upper()}_TEMPLATE"

        template = getattr(self, template_name, None)
        if not template:
            return None

        # Strategy 1: Direct concept matching (highest priority)
        if item_concept:
            normalized_concept = self._normalize_xbrl_concept(item_concept)
            for base_pos, _section_name, template_concepts in template:
                for i, template_concept in enumerate(template_concepts):
                    template_normalized = self._normalize_xbrl_concept(template_concept)
                    if normalized_concept == template_normalized:
                        return float(base_pos + i)

        # Strategy 2: Label-based matching as fallback (for compatibility)
        if item_label:
            for base_pos, _section_name, template_concepts in template:
                for i, template_concept in enumerate(template_concepts):
                    if self._labels_match(item_label, template_concept):
                        return float(base_pos + i)

        return None

    def _normalize_xbrl_concept(self, concept: str) -> str:
        """
        Normalize XBRL concept for matching.

        Handles variations in concept format:
        - "us-gaap:Revenue" vs "us-gaap_Revenue"
        - Case sensitivity
        - Namespace prefixes
        """
        if not concept:
            return ""

        # Normalize separators (: vs _)
        normalized = concept.lower()
        normalized = normalized.replace(':', '_')

        # Handle common namespace variations:
        # us-gaap, usgaap, gaap should all match
        if normalized.startswith('us-gaap_') or normalized.startswith('usgaap_'):
            normalized = 'us-gaap_' + normalized.split('_', 1)[1]
        elif normalized.startswith('gaap_'):
            normalized = 'us-gaap_' + normalized.split('_', 1)[1]

        return normalized

    def _labels_match(self, label1: str, label2: str) -> bool:
        """Check if two labels represent the same financial item (fallback for non-concept matching)"""
        if not label1 or not label2:
            return False

        # For XBRL concepts in templates, don't try to match against labels
        if ':' in label2 or '_gaap_' in label2.lower():
            return False

        # Shared exact/fuzzy comparison on normalized labels
        return self._normalized_match(label1, label2)

    def _concepts_match(self, concept1: str, concept2: str) -> bool:
        """Check if two concepts represent the same financial item"""
        # Same normalized exact/fuzzy comparison as label matching
        return self._normalized_match(concept1, concept2)

    def _normalized_match(self, text1: str, text2: str) -> bool:
        """Normalize both strings and compare: exact match, else fuzzy ratio above 0.7.

        Extracted so _labels_match and _concepts_match share one implementation.
        """
        norm1 = self._normalize_concept(text1)
        norm2 = self._normalize_concept(text2)

        # Exact match
        if norm1 == norm2:
            return True

        # Fuzzy matching for similar concepts (threshold chosen empirically)
        similarity = fuzz.ratio(norm1, norm2) / 100.0
        return similarity > 0.7

    def _normalize_concept(self, concept: str) -> str:
        """Normalize concept/label text for comparison."""
        if not concept:
            return ""

        # Remove common variations
        normalized = concept.lower()
        normalized = re.sub(r'\s+', ' ', normalized)  # Normalize whitespace
        normalized = re.sub(r'[,\.]', '', normalized)  # Remove punctuation
        normalized = re.sub(r'\(.*?\)', '', normalized)  # Remove parenthetical
        normalized = re.sub(r'\bexpense\b', '', normalized)  # Remove 'expense' suffix
        normalized = re.sub(r'\bincome\b', '', normalized)  # Remove 'income' suffix for matching
        return normalized.strip()
|
||||
|
||||
|
||||
class ReferenceOrderingStrategy:
    """Extract ordering from reference statement"""

    def establish_reference_order(self, statements: List[Dict]) -> Dict[str, float]:
        """Establish reference ordering from best available statement.

        Uses the first statement (statements are ordered newest first) and
        records each data item's index, keyed by both its concept ID and its
        display label for flexible lookup.
        """
        if not statements:
            return {}

        positions: Dict[str, float] = {}
        newest = statements[0]
        for index, entry in enumerate(newest.get('data', [])):
            concept_id = entry.get('concept')
            if not concept_id:
                continue
            positions[concept_id] = float(index)
            display_label = entry.get('label')
            if display_label:
                positions[display_label] = float(index)

        return positions
|
||||
|
||||
|
||||
class SemanticPositioning:
|
||||
"""Position concepts based on financial statement semantics"""
|
||||
|
||||
def __init__(self, statement_type: str):
    # Statement type (e.g. "IncomeStatement", "BalanceSheet") drives both
    # section classification and the default positions below.
    self.statement_type = statement_type
    # Fallback per-section positions, used when no reference ordering exists.
    self.section_defaults = self._get_section_defaults()
|
||||
|
||||
def _get_section_defaults(self) -> Dict[str, float]:
|
||||
"""Default positions for each section when no other guidance available"""
|
||||
if self.statement_type == "IncomeStatement":
|
||||
return {
|
||||
"revenue": 50.0,
|
||||
"cost": 150.0,
|
||||
"gross_profit": 250.0,
|
||||
"expense": 350.0,
|
||||
"operating_income": 450.0,
|
||||
"non_operating": 550.0,
|
||||
"pretax_income": 650.0,
|
||||
"tax": 750.0,
|
||||
"net_income": 850.0,
|
||||
"per_share": 950.0
|
||||
}
|
||||
elif self.statement_type == "BalanceSheet":
|
||||
return {
|
||||
"current_assets": 100.0,
|
||||
"noncurrent_assets": 300.0,
|
||||
"current_liabilities": 500.0,
|
||||
"noncurrent_liabilities": 700.0,
|
||||
"equity": 900.0
|
||||
}
|
||||
return {}
|
||||
|
||||
def infer_position(self, concept: str, existing_order: Dict[str, float]) -> float:
    """Infer semantic position for a new concept.

    Tries, in order: rule-based section placement, positioning just after a
    parent concept, positioning after the most similar known concept, and
    finally a catch-all position at the end.
    """
    # Rule-based positioning by financial-statement section
    section = self._classify_concept_section(concept)
    if section:
        return self._position_in_section(concept, section, existing_order)

    # Hierarchy: place immediately after a detected parent concept
    parent = self._find_parent_concept(concept, existing_order)
    if parent:
        return existing_order[parent] + 0.1

    # Textual similarity: place after the closest known concept
    closest = self._find_most_similar_concept(concept, existing_order)
    if closest:
        return existing_order[closest] + 0.1

    return 999.0  # unknown concepts sink to the end
|
||||
|
||||
def _classify_concept_section(self, concept: str) -> Optional[str]:
|
||||
"""Classify concept into financial statement section"""
|
||||
if not concept:
|
||||
return None
|
||||
|
||||
concept_lower = concept.lower()
|
||||
|
||||
if self.statement_type == "IncomeStatement":
|
||||
# Revenue indicators
|
||||
if any(term in concept_lower for term in ['revenue', 'sales']) and not any(term in concept_lower for term in ['cost', 'expense']):
|
||||
return "revenue"
|
||||
# Cost indicators
|
||||
elif any(term in concept_lower for term in ['cost of', 'cogs']):
|
||||
return "cost"
|
||||
# Gross profit
|
||||
elif 'gross profit' in concept_lower or 'gross margin' in concept_lower:
|
||||
return "gross_profit"
|
||||
# Operating expenses
|
||||
elif any(term in concept_lower for term in ['r&d', 'research', 'selling', 'administrative', 'marketing']) or ('expense' in concept_lower and 'tax' not in concept_lower):
|
||||
return "expense"
|
||||
# Operating income
|
||||
elif 'operating income' in concept_lower or 'operating profit' in concept_lower:
|
||||
return "operating_income"
|
||||
# Non-operating
|
||||
elif any(term in concept_lower for term in ['interest', 'other income', 'nonoperating']):
|
||||
return "non_operating"
|
||||
# Pre-tax income
|
||||
elif 'before tax' in concept_lower or 'pretax' in concept_lower:
|
||||
return "pretax_income"
|
||||
# Tax
|
||||
elif 'tax' in concept_lower and 'expense' in concept_lower:
|
||||
return "tax"
|
||||
# Net income
|
||||
elif 'net income' in concept_lower or 'net earnings' in concept_lower:
|
||||
return "net_income"
|
||||
# Per share
|
||||
elif any(term in concept_lower for term in ['per share', 'earnings per', 'shares outstanding']):
|
||||
return "per_share"
|
||||
|
||||
elif self.statement_type == "BalanceSheet":
|
||||
if any(term in concept_lower for term in ['cash', 'receivable', 'inventory', 'prepaid']) or ('current' in concept_lower and 'asset' in concept_lower):
|
||||
return "current_assets"
|
||||
elif any(term in concept_lower for term in ['property', 'equipment', 'goodwill', 'intangible']) or ('asset' in concept_lower and 'current' not in concept_lower):
|
||||
return "noncurrent_assets"
|
||||
elif any(term in concept_lower for term in ['payable', 'accrued']) or ('current' in concept_lower and 'liabilit' in concept_lower):
|
||||
return "current_liabilities"
|
||||
elif 'debt' in concept_lower or ('liabilit' in concept_lower and 'current' not in concept_lower):
|
||||
return "noncurrent_liabilities"
|
||||
elif any(term in concept_lower for term in ['equity', 'stock', 'retained earnings', 'capital']):
|
||||
return "equity"
|
||||
|
||||
return None
|
||||
|
||||
def _position_in_section(self, concept: str, section: str, existing_order: Dict[str, float]) -> float:
    """Position concept within its identified section."""
    # Collect the sort positions of concepts already assigned to this section.
    positions = [
        pos for label, pos in existing_order.items()
        if self._classify_concept_section(label) == section
    ]

    if not positions:
        # Section is not represented yet - fall back to the template default slot.
        return self.section_defaults.get(section, 999.0)

    # Simple strategy: append just after the section's current last entry.
    return max(positions) + 0.1
|
||||
|
||||
def _find_parent_concept(self, concept: str, existing_order: Dict[str, float]) -> Optional[str]:
|
||||
"""Find parent concept in hierarchy"""
|
||||
if not concept:
|
||||
return None
|
||||
|
||||
# Look for hierarchical relationships
|
||||
# e.g., "Software Revenue" -> "Revenue"
|
||||
concept_words = set(concept.lower().split())
|
||||
|
||||
candidates = []
|
||||
for existing_concept in existing_order.keys():
|
||||
if not existing_concept:
|
||||
continue
|
||||
|
||||
existing_words = set(existing_concept.lower().split())
|
||||
|
||||
# Check if existing concept is a parent (subset of words)
|
||||
# Also check for common patterns like "expense" being a parent of "X expense"
|
||||
if (existing_words.issubset(concept_words) and len(existing_words) < len(concept_words)) or \
|
||||
(existing_concept.lower() in concept.lower() and existing_concept.lower() != concept.lower()):
|
||||
candidates.append((existing_concept, len(existing_words)))
|
||||
|
||||
if candidates:
|
||||
# Return the most specific parent (most words in common)
|
||||
return max(candidates, key=lambda x: x[1])[0]
|
||||
|
||||
return None
|
||||
|
||||
def _find_most_similar_concept(self, concept: str, existing_order: Dict[str, float]) -> Optional[str]:
    """Find most similar existing concept via fuzzy string matching."""
    if not concept:
        return None

    min_similarity = 0.5  # ignore weak matches below this threshold
    concept_lower = concept.lower()

    best_concept = None
    best_score = 0.0

    for candidate in existing_order.keys():
        if not candidate:
            continue

        # Normalized fuzzy ratio in [0.0, 1.0].
        score = fuzz.ratio(concept_lower, candidate.lower()) / 100.0
        if score > min_similarity and score > best_score:
            best_score = score
            best_concept = candidate

    return best_concept
|
||||
|
||||
|
||||
class StatementOrderingManager:
    """Manages consistent ordering across multi-period statements"""

    def __init__(self, statement_type: str):
        # Statement type (e.g. "IncomeStatement", "BalanceSheet") selects the
        # template and semantic rules used by the strategies below.
        self.statement_type = statement_type
        self.templates = FinancialStatementTemplates()
        self.reference_strategy = ReferenceOrderingStrategy()
        self.semantic_positioning = SemanticPositioning(statement_type)

    def determine_ordering(self, statements: List[Dict]) -> Dict[str, float]:
        """
        Determine unified ordering for all concepts across statements.

        Strategies are applied in priority order: template positions first,
        then reference-statement order, then semantic inference for orphans,
        and finally a section-aware consolidation pass.

        Returns:
            Dict mapping concept -> sort_key (float for interpolation)
        """
        if not statements:
            return {}

        all_concepts = self._extract_all_concepts(statements)

        # Strategy 1: Template-based ordering (highest priority)
        template_positioned = self._apply_template_ordering(all_concepts, statements)

        # Strategy 2: Reference statement ordering for non-template items
        reference_positioned = self._apply_reference_ordering(
            all_concepts, statements, template_positioned
        )

        # Strategy 3: Semantic positioning for orphan concepts
        semantic_positioned = self._apply_semantic_positioning(
            all_concepts, template_positioned, reference_positioned
        )

        # Strategy 4: Section-aware consolidation to maintain template groupings
        final_ordering = self._consolidate_section_ordering(
            semantic_positioned, template_positioned, statements
        )

        return final_ordering

    def _extract_all_concepts(self, statements: List[Dict]) -> set:
        """Extract all unique concepts from statements.

        Both XBRL concept identifiers and display labels are collected, since
        downstream code may key items by either one.
        """
        all_concepts = set()

        for statement in statements:
            for item in statement.get('data', []):
                concept = item.get('concept')
                label = item.get('label')
                if concept:
                    all_concepts.add(concept)
                if label:
                    all_concepts.add(label)

        return all_concepts

    def _apply_template_ordering(self, concepts: set, statements: List[Dict]) -> Dict[str, float]:
        """Apply template-based ordering for known concepts using concept-first matching"""
        template_order = {}

        # Build a mapping of concepts/labels to their actual XBRL concepts for better matching
        concept_to_xbrl = {}
        label_to_xbrl = {}

        for statement in statements:
            for item in statement.get('data', []):
                concept = item.get('concept')
                label = item.get('label')

                if concept and label:
                    concept_to_xbrl[concept] = concept
                    label_to_xbrl[label] = concept
                elif concept:
                    concept_to_xbrl[concept] = concept

        # Apply template ordering with concept priority
        for concept_or_label in concepts:
            # Determine if this is a concept or label
            is_concept = concept_or_label in concept_to_xbrl
            is_label = concept_or_label in label_to_xbrl

            # Get the actual XBRL concept and label for this item
            if is_concept:
                xbrl_concept = concept_or_label
                # Try to find the corresponding label
                # (first statement item carrying this concept wins)
                corresponding_label = None
                for stmt in statements:
                    for item in stmt.get('data', []):
                        if item.get('concept') == concept_or_label:
                            corresponding_label = item.get('label')
                            break
                    if corresponding_label:
                        break
            elif is_label:
                xbrl_concept = label_to_xbrl.get(concept_or_label)
                corresponding_label = concept_or_label
            else:
                # Neither concept nor label found in mappings
                xbrl_concept = None
                corresponding_label = concept_or_label

            # Try concept-based matching first, then label-based
            template_pos = self.templates.get_template_position(
                item_concept=xbrl_concept,
                item_label=corresponding_label,
                statement_type=self.statement_type
            )

            if template_pos is not None:
                template_order[concept_or_label] = template_pos

                # IMPORTANT: If we found a template position for a concept,
                # also apply it to the corresponding label (and vice versa)
                # This ensures consistent ordering regardless of whether the
                # stitcher uses concept or label as the key
                if is_concept and corresponding_label and corresponding_label in concepts:
                    template_order[corresponding_label] = template_pos
                elif is_label and xbrl_concept and xbrl_concept in concepts:
                    template_order[xbrl_concept] = template_pos

        return template_order

    def _apply_reference_ordering(self, concepts: set, statements: List[Dict],
                                  template_positioned: Dict[str, float]) -> Dict[str, float]:
        """Apply reference statement ordering for remaining concepts.

        Template positions always win; reference positions only fill gaps.
        """
        reference_order = self.reference_strategy.establish_reference_order(statements)

        combined_order = template_positioned.copy()

        for concept in concepts:
            if concept not in combined_order and concept in reference_order:
                combined_order[concept] = reference_order[concept]

        return combined_order

    def _apply_semantic_positioning(self, concepts: set, template_positioned: Dict[str, float],
                                    reference_positioned: Dict[str, float]) -> Dict[str, float]:
        """Apply semantic positioning for orphan concepts.

        NOTE(review): `template_positioned` is unused here - `reference_positioned`
        already includes the template positions. Kept for signature symmetry.
        """
        final_order = reference_positioned.copy()

        # Position remaining concepts using semantic rules
        for concept in concepts:
            if concept not in final_order:
                semantic_pos = self.semantic_positioning.infer_position(concept, final_order)
                final_order[concept] = semantic_pos

        return final_order

    def _consolidate_section_ordering(self, semantic_positioned: Dict[str, float],
                                      template_positioned: Dict[str, float],
                                      statements: List[Dict]) -> Dict[str, float]:
        """
        Consolidate ordering to maintain template section groupings.

        This prevents reference ordering from breaking up logical template sections
        like per-share data (EPS + Shares Outstanding).
        """
        # Identify template sections and their concepts
        template_sections = self._identify_template_sections(template_positioned)

        # Separate template-positioned from non-template items
        template_items = {}
        non_template_items = {}

        for concept, position in semantic_positioned.items():
            if concept in template_positioned:
                template_items[concept] = position
            else:
                non_template_items[concept] = position

        # Re-organize to ensure section integrity
        final_ordering = {}

        # Process template sections in order
        for section_name, section_concepts in template_sections.items():
            # Find all template items (concepts and labels) that belong to this section
            section_template_items = []

            for concept in section_concepts:
                if concept in template_items:
                    section_template_items.append(concept)

            # Also find labels that correspond to concepts in this section
            # by checking if any template_items have the same template position
            section_template_positions = set()
            for concept in section_concepts:
                if concept in template_positioned:
                    section_template_positions.add(template_positioned[concept])

            # Find labels that have the same template positions as section concepts
            for item, pos in template_items.items():
                if pos in section_template_positions and item not in section_template_items:
                    section_template_items.append(item)

            if section_template_items:
                # Use the template base position for this section to ensure strong grouping
                section_base_pos = self._get_section_base_position(section_name)

                # For critical sections like per_share, use an even stronger override
                if section_name == "per_share":
                    # Force per-share items to be at the very end, regardless of hierarchy
                    section_base_pos = 950.0

                # Ensure all items in this section stay grouped together
                # (0.1 increments keep relative order within the section)
                for i, item in enumerate(sorted(section_template_items,
                                                key=lambda x: template_items.get(x, 999.0))):
                    final_ordering[item] = section_base_pos + i * 0.1

        # Add non-template items, adjusting positions to avoid breaking template sections
        section_ranges = self._get_section_ranges(final_ordering, template_sections)

        for concept, position in non_template_items.items():
            # Find appropriate insertion point that doesn't break template sections
            adjusted_position = self._find_insertion_point(position, section_ranges)
            final_ordering[concept] = adjusted_position

        return final_ordering

    def _get_section_base_position(self, section_name: str) -> float:
        """Get the base position for a template section.

        Returns 999.0 (sort last) for unknown statement types or sections.
        """
        if self.statement_type == "IncomeStatement":
            template = self.templates.INCOME_STATEMENT_TEMPLATE
        elif self.statement_type == "BalanceSheet":
            template = self.templates.BALANCE_SHEET_TEMPLATE
        else:
            return 999.0

        for base_pos, name, _concepts in template:
            if name == section_name:
                return float(base_pos)

        return 999.0

    def _identify_template_sections(self, template_positioned: Dict[str, float]) -> Dict[str, List[str]]:
        """Identify which concepts belong to which template sections"""
        sections = {}

        # Get the template for this statement type
        if self.statement_type == "IncomeStatement":
            template = self.templates.INCOME_STATEMENT_TEMPLATE
        elif self.statement_type == "BalanceSheet":
            template = self.templates.BALANCE_SHEET_TEMPLATE
        else:
            return {}

        # Build mapping of concepts to sections
        for _base_pos, section_name, template_concepts in template:
            section_concepts = []

            for concept in template_positioned.keys():
                # Check if this concept matches any template concept in this section
                for template_concept in template_concepts:
                    if self._concept_matches_template(concept, template_concept):
                        section_concepts.append(concept)
                        break

            if section_concepts:
                sections[section_name] = section_concepts

        return sections

    def _concept_matches_template(self, concept: str, template_concept: str) -> bool:
        """Check if a concept matches a template concept"""
        # For XBRL concepts, do direct comparison
        # (template entries with a namespace prefix or '_gaap_' are XBRL ids)
        if ':' in template_concept or '_gaap_' in template_concept.lower():
            return self._normalize_xbrl_concept(concept) == self._normalize_xbrl_concept(template_concept)

        # For labels, use fuzzy matching
        return self._labels_match(concept, template_concept)

    def _get_section_ranges(self, final_ordering: Dict[str, float],
                            template_sections: Dict[str, List[str]]) -> List[Tuple[float, float, str]]:
        """Get the position ranges occupied by each template section.

        Returns (min_pos, max_pos, section_name) tuples sorted by position.
        """
        ranges = []

        for section_name, concepts in template_sections.items():
            section_positions = [final_ordering[c] for c in concepts if c in final_ordering]

            if section_positions:
                min_pos = min(section_positions)
                max_pos = max(section_positions)
                ranges.append((min_pos, max_pos, section_name))

        return sorted(ranges)

    def _find_insertion_point(self, desired_position: float,
                              section_ranges: List[Tuple[float, float, str]]) -> float:
        """Find appropriate insertion point that doesn't break template sections"""

        # Check if desired position conflicts with any template section
        for min_pos, max_pos, section_name in section_ranges:
            if min_pos <= desired_position <= max_pos:
                # Position conflicts with a template section
                # Place it just before the section (unless it should logically be after)

                # Special handling for per-share section
                # NOTE(review): desired_position < min_pos can never hold inside
                # this branch (the range check requires min_pos <= desired_position),
                # so the "before per-share" path looks unreachable - confirm intent.
                if section_name == "per_share" and desired_position < min_pos:
                    # Items that should come before per-share data
                    return min_pos - 1.0
                else:
                    # Place after the section
                    return max_pos + 1.0

        # No conflicts, use desired position
        return desired_position

    def _normalize_xbrl_concept(self, concept: str) -> str:
        """Delegate to templates class for concept normalization"""
        return self.templates._normalize_xbrl_concept(concept)

    def _labels_match(self, label1: str, label2: str) -> bool:
        """Delegate to templates class for label matching"""
        return self.templates._labels_match(label1, label2)
|
||||
@@ -0,0 +1,547 @@
|
||||
"""
|
||||
XBRL Statement Stitching - Period Optimization (Refactored)
|
||||
|
||||
This module provides functionality to determine optimal periods for stitching
|
||||
statements across multiple XBRL filings, handling period selection and
|
||||
fiscal period matching.
|
||||
|
||||
Refactored to use a clean class-based architecture for better maintainability,
|
||||
testability, and extensibility.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from dataclasses import dataclass
|
||||
from datetime import date
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
|
||||
from edgar.xbrl.core import format_date, parse_date
|
||||
from edgar.xbrl.xbrl import XBRL
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
class PeriodSelectionConfig:
    """Configuration for period selection behavior"""

    # Duration ranges for different period types:
    # (min_days, max_days) windows used to classify a duration period.
    annual_duration_range: Tuple[int, int] = (350, 380)
    quarterly_duration_range: Tuple[int, int] = (80, 100)
    q2_ytd_range: Tuple[int, int] = (175, 190)  # half-year year-to-date
    q3_ytd_range: Tuple[int, int] = (260, 285)  # nine-month year-to-date
    q4_annual_range: Tuple[int, int] = (350, 380)  # Q4 YTD spans the same window as a full year

    # Target durations for optimization:
    # candidates inside a window are ranked by proximity to these day counts.
    target_annual_days: int = 365
    target_quarterly_days: int = 90
    target_q2_ytd_days: int = 180
    target_q3_ytd_days: int = 270

    # Behavior flags
    # NOTE(review): these two flags are not read by the classes visible in this
    # module chunk - confirm they are consumed elsewhere before relying on them.
    require_exact_matches: bool = True
    allow_fallback_when_no_doc_date: bool = True
    max_periods_default: int = 8  # cap on returned periods when caller gives none
|
||||
|
||||
|
||||
class PeriodMatcher:
    """Handles exact period matching logic.

    Provides exact-date lookups for instant and duration periods, plus
    duration-window filtering. The instant/duration matchers previously
    duplicated the same parse-and-compare loop; they now share a single
    private helper parameterized by the date key.
    """

    def __init__(self, config: PeriodSelectionConfig):
        # Config is retained for parity with sibling components; the matching
        # methods themselves operate only on their arguments.
        self.config = config

    def find_exact_instant_match(self, periods: List[Dict], target_date: date) -> Optional[Dict]:
        """Find instant period that exactly matches target date."""
        return self._find_exact_match(periods, 'date', target_date)

    def find_exact_duration_match(self, periods: List[Dict], target_date: date) -> Optional[Dict]:
        """Find duration period that ends exactly on target date."""
        return self._find_exact_match(periods, 'end_date', target_date)

    def _find_exact_match(self, periods: List[Dict], date_key: str, target_date: date) -> Optional[Dict]:
        """Return the first period whose `date_key` parses to `target_date`.

        Periods whose date is missing or unparseable are logged and skipped
        rather than aborting the whole search.
        """
        for period in periods:
            try:
                if parse_date(period[date_key]) == target_date:
                    return period
            except (ValueError, TypeError, KeyError) as e:
                logger.warning("Failed to parse period %s '%s': %s",
                               date_key, period.get(date_key), e)
                continue
        return None

    def filter_by_duration_range(self, periods: List[Dict], min_days: int, max_days: int, target_days: int) -> List[Dict]:
        """Filter periods by duration and sort by proximity to target.

        Periods missing 'duration_days' have it computed from start/end dates
        (on a copy, so the caller's dicts are not mutated); periods whose
        dates cannot be parsed are skipped.
        """
        filtered_periods = []

        for period in periods:
            duration_days = period.get('duration_days')
            if duration_days is None:
                try:
                    start_date = parse_date(period['start_date'])
                    end_date = parse_date(period['end_date'])
                    duration_days = (end_date - start_date).days
                    # Annotate a copy so the input dict stays untouched.
                    period = period.copy()
                    period['duration_days'] = duration_days
                except (ValueError, TypeError) as e:
                    logger.warning("Failed to calculate duration for period: %s", e)
                    continue

            if min_days <= duration_days <= max_days:
                filtered_periods.append(period)

        # Closest to the target duration first.
        filtered_periods.sort(key=lambda x: abs(x['duration_days'] - target_days))
        return filtered_periods
|
||||
|
||||
|
||||
class FiscalPeriodClassifier:
    """Classifies and filters periods based on fiscal information.

    The annual / quarterly / YTD classifiers previously repeated the same
    window-filter-then-sort logic three times; it now lives in the shared
    `_rank_by_duration` helper.
    """

    def __init__(self, config: PeriodSelectionConfig):
        self.config = config

    def classify_annual_periods(self, periods: List[Dict]) -> List[Dict]:
        """Identify annual periods (350-380 days), closest to 365 days first."""
        min_days, max_days = self.config.annual_duration_range
        return self._rank_by_duration(periods, min_days, max_days,
                                      self.config.target_annual_days)

    def classify_quarterly_periods(self, periods: List[Dict]) -> List[Dict]:
        """Identify quarterly periods (80-100 days), closest to 90 days first."""
        min_days, max_days = self.config.quarterly_duration_range
        return self._rank_by_duration(periods, min_days, max_days,
                                      self.config.target_quarterly_days)

    def classify_ytd_periods(self, periods: List[Dict], fiscal_period: str) -> List[Dict]:
        """Identify YTD periods based on fiscal quarter.

        Returns an empty list for fiscal periods without a YTD notion
        (anything other than Q2/Q3/Q4).
        """
        if fiscal_period not in ['Q2', 'Q3', 'Q4']:
            return []

        # Expected duration window and ideal duration per fiscal quarter.
        duration_ranges = {
            'Q2': self.config.q2_ytd_range,
            'Q3': self.config.q3_ytd_range,
            'Q4': self.config.q4_annual_range
        }
        target_durations = {
            'Q2': self.config.target_q2_ytd_days,
            'Q3': self.config.target_q3_ytd_days,
            'Q4': self.config.target_annual_days
        }

        min_days, max_days = duration_ranges[fiscal_period]
        return self._rank_by_duration(periods, min_days, max_days,
                                      target_durations[fiscal_period])

    def _rank_by_duration(self, periods: List[Dict], min_days: int,
                          max_days: int, target_days: int) -> List[Dict]:
        """Keep periods whose 'duration_days' lies in [min_days, max_days],
        sorted by proximity to `target_days` (missing durations count as 0)."""
        in_range = [p for p in periods
                    if min_days <= p.get('duration_days', 0) <= max_days]
        in_range.sort(key=lambda p: abs(p.get('duration_days', 0) - target_days))
        return in_range

    def get_expected_durations(self, fiscal_period: str) -> Dict[str, Tuple[int, int]]:
        """Get expected duration ranges for fiscal period.

        Returns a mapping such as {'annual': (350, 380)} for FY, or
        {'quarterly': ..., 'ytd': ...} for quarters; empty for unknown values.
        """
        if fiscal_period == 'FY':
            return {'annual': self.config.annual_duration_range}
        elif fiscal_period in ['Q1', 'Q2', 'Q3', 'Q4']:
            durations = {'quarterly': self.config.quarterly_duration_range}
            if fiscal_period == 'Q2':
                durations['ytd'] = self.config.q2_ytd_range
            elif fiscal_period == 'Q3':
                durations['ytd'] = self.config.q3_ytd_range
            elif fiscal_period == 'Q4':
                durations['ytd'] = self.config.q4_annual_range
            return durations
        else:
            return {}
|
||||
|
||||
|
||||
class StatementTypeSelector:
    """Handles statement-specific period selection logic"""

    def __init__(self, matcher: PeriodMatcher, classifier: FiscalPeriodClassifier):
        self.matcher = matcher
        self.classifier = classifier

    def select_balance_sheet_periods(self, xbrl: XBRL, doc_period_end_date: Optional[date]) -> List[Dict]:
        """Select instant periods for balance sheets.

        Requires an exact match on the document period end date when one is
        available; otherwise falls back to the most recent instant period.
        """
        # Filter for instant periods only
        instant_periods = [p for p in xbrl.reporting_periods if p['type'] == 'instant']

        if not instant_periods:
            return []

        # If we have document_period_end_date, find exact match
        if doc_period_end_date:
            exact_match = self.matcher.find_exact_instant_match(instant_periods, doc_period_end_date)
            if exact_match:
                return [exact_match]
            else:
                # No exact match found - don't use fallback to prevent fiscal year boundary issues
                logger.info("No exact instant period match found for %s", doc_period_end_date)
                return []

        # No document_period_end_date available - use most recent period
        instant_periods.sort(key=lambda x: x['date'], reverse=True)
        return [instant_periods[0]]

    def select_income_statement_periods(self, xbrl: XBRL, doc_period_end_date: Optional[date],
                                        fiscal_period: str) -> List[Dict]:
        """Select duration periods for income statements"""
        return self._select_duration_periods(xbrl, doc_period_end_date, fiscal_period)

    def select_cash_flow_periods(self, xbrl: XBRL, doc_period_end_date: Optional[date],
                                 fiscal_period: str) -> List[Dict]:
        """Select duration periods for cash flow statements"""
        return self._select_duration_periods(xbrl, doc_period_end_date, fiscal_period)

    def _select_duration_periods(self, xbrl: XBRL, doc_period_end_date: Optional[date],
                                 fiscal_period: str) -> List[Dict]:
        """Common logic for selecting duration periods.

        Enriches each duration period with 'duration_days', then either
        narrows to periods ending exactly on the document date, or uses the
        fallback heuristics when no document date is known.
        """
        # Filter for duration periods only
        duration_periods = [p for p in xbrl.reporting_periods if p['type'] == 'duration']

        if not duration_periods:
            return []

        # Add duration_days to all periods (on copies, preserving the originals)
        enriched_periods = []
        for period in duration_periods:
            try:
                start_date = parse_date(period['start_date'])
                end_date = parse_date(period['end_date'])
                period_copy = period.copy()
                period_copy['duration_days'] = (end_date - start_date).days
                enriched_periods.append(period_copy)
            except (ValueError, TypeError) as e:
                logger.warning("Failed to parse period dates: %s", e)
                continue

        if not enriched_periods:
            return []

        # If we have document_period_end_date, find periods that end exactly on that date
        if doc_period_end_date:
            matching_periods = []
            for period in enriched_periods:
                try:
                    end_date = parse_date(period['end_date'])
                    if end_date == doc_period_end_date:
                        matching_periods.append(period)
                except (ValueError, TypeError):
                    continue

            if matching_periods:
                return self._select_appropriate_durations(matching_periods, fiscal_period)
            else:
                # No exact match found - don't use fallback
                logger.info("No exact duration period match found for %s", doc_period_end_date)
                return []

        # No document_period_end_date - use fallback logic
        return self._select_fallback_periods(enriched_periods, fiscal_period)

    def _select_appropriate_durations(self, periods: List[Dict], fiscal_period: str) -> List[Dict]:
        """Select appropriate duration periods based on fiscal period.

        FY filings yield one annual period; quarterly filings yield the best
        quarterly period plus, when applicable, the best YTD period.
        """
        selected_periods = []

        is_annual = fiscal_period == 'FY'

        if is_annual:
            # For annual reports, select annual periods
            annual_periods = self.classifier.classify_annual_periods(periods)
            if annual_periods:
                selected_periods.append(annual_periods[0])
        else:
            # For quarterly reports, select quarterly period
            quarterly_periods = self.classifier.classify_quarterly_periods(periods)
            if quarterly_periods:
                selected_periods.append(quarterly_periods[0])

            # Also select YTD period if appropriate
            ytd_periods = self.classifier.classify_ytd_periods(periods, fiscal_period)
            if ytd_periods:
                selected_periods.append(ytd_periods[0])

        return selected_periods

    def _select_fallback_periods(self, periods: List[Dict], fiscal_period: str) -> List[Dict]:
        """Fallback period selection when no document_period_end_date is available.

        NOTE(review): the quarterly branch returns `selected_periods` directly,
        so the "most recent period" last resort at the bottom is reachable only
        from the annual branch - confirm that asymmetry is intentional.
        """
        is_annual = fiscal_period == 'FY'

        if is_annual:
            # For annual reports, prefer periods closest to 365 days
            annual_periods = self.classifier.classify_annual_periods(periods)
            if annual_periods:
                # Sort by end date and take the most recent
                annual_periods.sort(key=lambda x: x['end_date'], reverse=True)
                return [annual_periods[0]]
        else:
            # For quarterly reports, prefer quarterly duration
            quarterly_periods = self.classifier.classify_quarterly_periods(periods)
            selected_periods = []

            if quarterly_periods:
                quarterly_periods.sort(key=lambda x: x['end_date'], reverse=True)
                selected_periods.append(quarterly_periods[0])

            # Add YTD period if available
            ytd_periods = self.classifier.classify_ytd_periods(periods, fiscal_period)
            if ytd_periods:
                ytd_periods.sort(key=lambda x: x['end_date'], reverse=True)
                selected_periods.append(ytd_periods[0])

            return selected_periods

        # If no appropriate periods found, return the most recent period
        periods.sort(key=lambda x: x['end_date'], reverse=True)
        return [periods[0]]
|
||||
|
||||
|
||||
class PeriodMetadataEnricher:
    """Handles period metadata enrichment"""

    def enrich_period_metadata(self, period: Dict, xbrl_index: int, entity_info: Dict,
                               doc_period_end_date: Optional[date], fiscal_period: str,
                               fiscal_year: str) -> Dict[str, Any]:
        """Add comprehensive metadata to period"""
        # Base metadata shared by both instant and duration periods.
        enriched = {
            'xbrl_index': xbrl_index,
            'period_key': period['key'],
            'period_label': period['label'],
            'period_type': period['type'],
            'entity_info': entity_info,
            'doc_period_end_date': doc_period_end_date,
            'fiscal_period': fiscal_period,
            'fiscal_year': fiscal_year,
        }

        # Add date information
        if period['type'] == 'instant':
            # Instant periods carry a single point-in-time date.
            instant_date = parse_date(period['date'])
            enriched['date'] = instant_date
            enriched['display_date'] = format_date(instant_date)
        else:  # duration
            # Duration periods carry a start/end span; display the end date.
            span_start = parse_date(period['start_date'])
            span_end = parse_date(period['end_date'])
            enriched['start_date'] = span_start
            enriched['end_date'] = span_end
            enriched['duration_days'] = period.get('duration_days',
                                                   (span_end - span_start).days)
            enriched['display_date'] = format_date(span_end)

        return enriched
|
||||
|
||||
|
||||
class PeriodDeduplicator:
    """Handles period deduplication and sorting"""

    def deduplicate_periods(self, periods: List[Dict], statement_type: str) -> List[Dict]:
        """Remove duplicate periods using exact date matching.

        Two periods are duplicates when they share the same 'period_type' and
        the same relevant date ('date' for instants, 'end_date' for durations).
        The first occurrence wins and input order is preserved. Replaces the
        previous O(n^2) pairwise comparison with an O(n) seen-set.

        Args:
            periods: Enriched period dicts carrying 'period_type' and dates.
            statement_type: Unused; kept for interface compatibility.

        Returns:
            Periods with later duplicates removed.
        """
        seen = set()  # (period_type, relevant date) keys already emitted
        filtered_periods = []

        for period in periods:
            date_field = 'date' if period['period_type'] == 'instant' else 'end_date'
            key = (period['period_type'], period[date_field])
            if key not in seen:
                seen.add(key)
                filtered_periods.append(period)

        return filtered_periods

    def sort_periods_chronologically(self, periods: List[Dict], statement_type: str) -> List[Dict]:
        """Sort periods by the appropriate date field, most recent first."""
        # Balance sheets use the instant 'date'; other statements end dates.
        sort_field = 'date' if statement_type == 'BalanceSheet' else 'end_date'
        return sorted(periods, key=lambda x: x[sort_field], reverse=True)

    def limit_periods(self, periods: List[Dict], max_periods: int) -> List[Dict]:
        """Limit to at most `max_periods` periods (order preserved)."""
        return periods[:max_periods]
|
||||
|
||||
|
||||
class PeriodOptimizer:
|
||||
"""Main orchestrator for period optimization"""
|
||||
|
||||
def __init__(self, config: Optional[PeriodSelectionConfig] = None):
    # Fall back to the default configuration when none is supplied.
    self.config = config or PeriodSelectionConfig()
    # Collaborators share the same config so thresholds stay consistent.
    self.matcher = PeriodMatcher(self.config)
    self.classifier = FiscalPeriodClassifier(self.config)
    self.selector = StatementTypeSelector(self.matcher, self.classifier)
    self.enricher = PeriodMetadataEnricher()
    self.deduplicator = PeriodDeduplicator()
|
||||
|
||||
def determine_optimal_periods(self, xbrl_list: List[XBRL], statement_type: str,
|
||||
max_periods: Optional[int] = None) -> List[Dict[str, Any]]:
|
||||
"""Main entry point - orchestrates the entire process"""
|
||||
max_periods = max_periods or self.config.max_periods_default
|
||||
|
||||
# Step 1: Extract periods from all XBRLs
|
||||
all_periods = self._extract_all_periods(xbrl_list, statement_type)
|
||||
|
||||
# Step 2: Enrich with metadata
|
||||
enriched_periods = self._enrich_with_metadata(all_periods)
|
||||
|
||||
# Step 3: Deduplicate, sort, and limit
|
||||
final_periods = self._deduplicate_and_limit(enriched_periods, max_periods, statement_type)
|
||||
|
||||
return final_periods
|
||||
|
||||
def _extract_all_periods(self, xbrl_list: List[XBRL], statement_type: str) -> List[Dict[str, Any]]:
|
||||
"""Extract periods from all XBRL objects"""
|
||||
all_periods = []
|
||||
|
||||
for i, xbrl in enumerate(xbrl_list):
|
||||
# Skip None XBRLs (pre-XBRL era filings before 2009)
|
||||
if xbrl is None:
|
||||
continue
|
||||
|
||||
# Skip XBRLs with no reporting periods
|
||||
if not xbrl.reporting_periods:
|
||||
continue
|
||||
|
||||
entity_info = xbrl.entity_info or {}
|
||||
doc_period_end_date = self._parse_document_period_end_date(entity_info)
|
||||
fiscal_period = entity_info.get('fiscal_period')
|
||||
fiscal_year = entity_info.get('fiscal_year')
|
||||
|
||||
# Select appropriate periods based on statement type
|
||||
selected_periods = self._select_periods_for_statement_type(
|
||||
xbrl, statement_type, doc_period_end_date, fiscal_period
|
||||
)
|
||||
|
||||
# Add context information to each period
|
||||
for period in selected_periods:
|
||||
period_with_context = {
|
||||
'period': period,
|
||||
'xbrl_index': i,
|
||||
'entity_info': entity_info,
|
||||
'doc_period_end_date': doc_period_end_date,
|
||||
'fiscal_period': fiscal_period,
|
||||
'fiscal_year': fiscal_year
|
||||
}
|
||||
all_periods.append(period_with_context)
|
||||
|
||||
return all_periods
|
||||
|
||||
def _parse_document_period_end_date(self, entity_info: Dict) -> Optional[date]:
|
||||
"""Parse document_period_end_date from entity_info"""
|
||||
if 'document_period_end_date' not in entity_info:
|
||||
return None
|
||||
|
||||
try:
|
||||
doc_period_end_date = entity_info['document_period_end_date']
|
||||
if not isinstance(doc_period_end_date, date):
|
||||
doc_period_end_date = parse_date(str(doc_period_end_date))
|
||||
return doc_period_end_date
|
||||
except (ValueError, TypeError) as e:
|
||||
logger.warning("Failed to parse document_period_end_date: %s", e)
|
||||
return None
|
||||
|
||||
def _select_periods_for_statement_type(self, xbrl: XBRL, statement_type: str,
|
||||
doc_period_end_date: Optional[date],
|
||||
fiscal_period: str) -> List[Dict]:
|
||||
"""Select periods based on statement type"""
|
||||
if statement_type == 'BalanceSheet':
|
||||
return self.selector.select_balance_sheet_periods(xbrl, doc_period_end_date)
|
||||
elif statement_type in ['IncomeStatement', 'CashFlowStatement']:
|
||||
if statement_type == 'IncomeStatement':
|
||||
return self.selector.select_income_statement_periods(xbrl, doc_period_end_date, fiscal_period)
|
||||
else:
|
||||
return self.selector.select_cash_flow_periods(xbrl, doc_period_end_date, fiscal_period)
|
||||
else:
|
||||
# For other statement types, use income statement logic as default
|
||||
return self.selector.select_income_statement_periods(xbrl, doc_period_end_date, fiscal_period)
|
||||
|
||||
def _enrich_with_metadata(self, all_periods: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
|
||||
"""Enrich periods with comprehensive metadata"""
|
||||
enriched_periods = []
|
||||
|
||||
for period_context in all_periods:
|
||||
period = period_context['period']
|
||||
enriched_metadata = self.enricher.enrich_period_metadata(
|
||||
period,
|
||||
period_context['xbrl_index'],
|
||||
period_context['entity_info'],
|
||||
period_context['doc_period_end_date'],
|
||||
period_context['fiscal_period'],
|
||||
period_context['fiscal_year']
|
||||
)
|
||||
enriched_periods.append(enriched_metadata)
|
||||
|
||||
return enriched_periods
|
||||
|
||||
def _deduplicate_and_limit(self, periods: List[Dict[str, Any]], max_periods: int,
|
||||
statement_type: str) -> List[Dict[str, Any]]:
|
||||
"""Deduplicate, sort, and limit periods"""
|
||||
# Sort periods chronologically
|
||||
sorted_periods = self.deduplicator.sort_periods_chronologically(periods, statement_type)
|
||||
|
||||
# Remove duplicates
|
||||
deduplicated_periods = self.deduplicator.deduplicate_periods(sorted_periods, statement_type)
|
||||
|
||||
# Limit to maximum number of periods
|
||||
final_periods = self.deduplicator.limit_periods(deduplicated_periods, max_periods)
|
||||
|
||||
return final_periods
|
||||
|
||||
|
||||
# Main function that maintains the original API
|
||||
def determine_optimal_periods(xbrl_list: List[XBRL], statement_type: str, max_periods: int = 8) -> List[Dict[str, Any]]:
    """
    Determine the optimal periods to display for stitched statements from a list of XBRL objects.

    Module-level convenience wrapper that preserves the original API: it
    delegates to a freshly constructed PeriodOptimizer, which analyzes
    entity info and reporting periods across the filings to pick a
    consistent set of display periods.

    Args:
        xbrl_list: List of XBRL objects ordered chronologically
        statement_type: Type of statement ('BalanceSheet', 'IncomeStatement', etc.)
        max_periods: Maximum number of periods to return (default is 8)

    Returns:
        List of period metadata dictionaries containing information for display
    """
    return PeriodOptimizer().determine_optimal_periods(xbrl_list, statement_type, max_periods)
||||
@@ -0,0 +1,256 @@
|
||||
"""
|
||||
XBRL Presentation Tree - Virtual presentation tree for multi-period statements
|
||||
|
||||
This module creates a virtual presentation tree that preserves hierarchical
|
||||
relationships while applying semantic ordering within sibling groups.
|
||||
"""
|
||||
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
|
||||
@dataclass
class PresentationNode:
    """A single line item in the virtual presentation tree.

    Dataclass fields hold the concept's identity and ordering hints;
    the tree links (``children``/``parent``) are runtime state created
    in ``__post_init__`` and deliberately excluded from the fields.
    """

    concept: str
    label: str
    level: int
    metadata: Dict[str, Any]
    semantic_order: float = 999.0
    original_index: int = 999

    def __post_init__(self):
        # Tree wiring starts empty; links are made via add_child().
        self.children: List[PresentationNode] = []
        self.parent: Optional[PresentationNode] = None

    def add_child(self, child: 'PresentationNode'):
        """Attach *child* beneath this node, recording the back-link."""
        child.parent = self
        self.children.append(child)

    def sort_children(self):
        """Recursively order every sibling group.

        Siblings sort by semantic order, falling back to their original
        statement index as the tiebreaker.
        """
        self.children.sort(key=lambda node: (node.semantic_order, node.original_index))
        for node in self.children:
            node.sort_children()

    def flatten_to_list(self) -> List['PresentationNode']:
        """Return this subtree as a preorder (self-first) list."""
        flat: List['PresentationNode'] = []
        pending = [self]
        while pending:
            node = pending.pop()
            flat.append(node)
            # Reverse so the leftmost child is popped (visited) first.
            pending.extend(reversed(node.children))
        return flat
|
||||
|
||||
class VirtualPresentationTree:
    """Builds and manages virtual presentation tree for stitched statements.

    Nodes are grouped into a hierarchy from their levels and original
    statement order, sibling groups are then sorted by semantic order,
    and the tree is flattened back into a display list.
    """

    def __init__(self, ordering_manager=None):
        # ordering_manager is stored for callers but not used by the
        # methods defined below.
        self.ordering_manager = ordering_manager
        self.root_nodes: List[PresentationNode] = []
        self.all_nodes: Dict[str, PresentationNode] = {}

    def build_tree(self, concept_metadata: Dict, concept_ordering: Dict,
                   original_statement_order: Optional[List[str]] = None) -> List[PresentationNode]:
        """
        Build presentation tree from concept metadata and ordering.

        Args:
            concept_metadata: Metadata for each concept including level
            concept_ordering: Semantic ordering positions
            original_statement_order: Original order of concepts for context

        Returns:
            Flattened list of nodes in correct presentation order
        """
        # NOTE(review): self.root_nodes is never cleared here (only
        # all_nodes is rebuilt in _create_nodes), so calling build_tree
        # twice on one instance accumulates roots — confirm instances
        # are single-use.
        # Step 1: Create nodes for all concepts
        self._create_nodes(concept_metadata, concept_ordering, original_statement_order)

        # Step 2: Build parent-child relationships based on levels and context
        self._build_hierarchy(original_statement_order or [])

        # Step 3: Apply semantic ordering within sibling groups
        self._apply_semantic_ordering()

        # Step 4: Flatten tree to linear list
        return self._flatten_tree()

    def _create_nodes(self, concept_metadata: Dict, concept_ordering: Dict,
                      original_statement_order: Optional[List[str]] = None):
        """Create a PresentationNode per concept.

        Records each concept's semantic order (looked up by concept then
        by label, defaulting to 999.0) and its best-effort position in
        the original statement.
        """
        self.all_nodes = {}

        for i, (concept, metadata) in enumerate(concept_metadata.items()):
            label = metadata.get('latest_label', concept)
            level = metadata.get('level', 0)
            semantic_order = concept_ordering.get(concept, concept_ordering.get(label, 999.0))

            # Track original index for maintaining some original order context
            original_index = i
            if original_statement_order:
                try:
                    original_index = original_statement_order.index(concept)
                except ValueError:
                    try:
                        # The order list may carry labels rather than concepts.
                        original_index = original_statement_order.index(label)
                    except ValueError:
                        original_index = i + 1000  # Place unknown concepts later

            node = PresentationNode(
                concept=concept,
                label=label,
                level=level,
                metadata=metadata,
                semantic_order=semantic_order,
                original_index=original_index
            )

            self.all_nodes[concept] = node

    def _build_hierarchy(self, original_order: List[str]):
        """Build parent-child relationships based on level progression and context."""

        # Sort nodes by their original order to maintain context for hierarchy detection
        nodes_in_order = []

        # First, try to use original order if available
        if original_order:
            # Map concepts in original order
            concept_to_node = {node.concept: node for node in self.all_nodes.values()}
            label_to_node = {node.label: node for node in self.all_nodes.values()}

            for item in original_order:
                if item in concept_to_node:
                    nodes_in_order.append(concept_to_node[item])
                elif item in label_to_node:
                    nodes_in_order.append(label_to_node[item])

            # Add any remaining nodes not in original order
            # NOTE(review): membership here uses dataclass field equality,
            # so two distinct nodes with identical fields would be treated
            # as already present — unlikely in practice, but worth confirming.
            remaining_nodes = [node for node in self.all_nodes.values()
                               if node not in nodes_in_order]
            remaining_nodes.sort(key=lambda x: x.original_index)
            nodes_in_order.extend(remaining_nodes)
        else:
            # Fall back to sorting by original index
            nodes_in_order = sorted(self.all_nodes.values(),
                                    key=lambda x: x.original_index)

        # Build hierarchy using a parent stack approach
        parent_stack = []  # Stack of potential parents at each level

        for node in nodes_in_order:
            current_level = node.level

            # Pop parents that are at the same level or deeper
            # We're looking for a parent at a level less than current
            while parent_stack and parent_stack[-1].level >= current_level:
                parent_stack.pop()

            if parent_stack:
                # Check if potential parent and child belong to compatible sections
                parent = parent_stack[-1]

                # Prevent cross-section hierarchies for critical sections like per_share
                should_be_child = self._should_be_hierarchical_child(parent, node)

                if should_be_child:
                    # Valid parent-child relationship
                    parent.add_child(node)
                else:
                    # Different sections - make this a root node instead
                    self.root_nodes.append(node)
            else:
                # No parent - this is a root node
                self.root_nodes.append(node)

            # This node could be a parent for subsequent nodes
            parent_stack.append(node)

    def _apply_semantic_ordering(self):
        """Apply semantic ordering within sibling groups."""

        # Sort root nodes by semantic order first, then original index
        self.root_nodes.sort(key=lambda x: (x.semantic_order, x.original_index))

        # Sort children within each parent recursively
        for root in self.root_nodes:
            root.sort_children()

    def _flatten_tree(self) -> List[PresentationNode]:
        """Flatten tree to linear list preserving hierarchy (preorder)."""
        result = []

        for root in self.root_nodes:
            result.extend(root.flatten_to_list())

        return result

    def _should_be_hierarchical_child(self, parent: PresentationNode, child: PresentationNode) -> bool:
        """
        Determine if child should be hierarchically under parent based on semantic ordering.

        Prevents cross-section hierarchies that would break template section groupings.
        The numeric thresholds (200-point gap, 900+ per-share band, 500-599
        non-operating band) encode section boundaries of the ordering templates.
        """
        # Get semantic ordering positions
        parent_order = parent.semantic_order
        child_order = child.semantic_order

        # If both have very specific semantic orders from templates (not defaults),
        # check if they're in similar ranges (same section)
        if parent_order < 900 and child_order < 900:
            # Both are template-positioned, check if they're in similar sections
            # Allow parent-child within 200 points (roughly same section)
            section_gap = abs(parent_order - child_order)
            if section_gap > 200:
                return False

        # Special case: Per-share items (900+) should never be children of early items
        if child_order >= 900 and parent_order < 800:
            return False

        # Special case: Non-operating items (500-599) should not be children of operating items
        if 500 <= child_order < 600 and parent_order < 500:
            return False

        # Special case: Revenue items should not be parents of per-share items
        if parent_order < 100 and child_order >= 900:
            return False

        # Check for semantic incompatibility based on labels
        child_label = child.label.lower()
        parent_label = parent.label.lower()

        # Per-share items should not be children of non-per-share items
        if any(term in child_label for term in ['earnings per share', 'shares outstanding']):
            if not any(term in parent_label for term in ['earnings', 'shares', 'per share']):
                return False

        # Interest expense items should not be children of non-interest items
        if 'interest expense' in child_label:
            if 'interest' not in parent_label and 'nonoperating' not in parent_label:
                return False

        # Otherwise, allow hierarchical relationship
        return True

    def debug_tree(self) -> str:
        """Generate a debug representation of the tree."""
        lines = []

        def _add_node_lines(node: PresentationNode, depth: int = 0):
            # Indent proportionally to depth within the tree.
            indent = " " * depth
            lines.append(f"{indent}├─ {node.label} (level={node.level}, "
                         f"semantic={node.semantic_order:.1f}, orig={node.original_index})")

            for child in node.children:
                _add_node_lines(child, depth + 1)

        lines.append("Virtual Presentation Tree:")
        for root in self.root_nodes:
            _add_node_lines(root)

        return "\n".join(lines)
|
||||
640
venv/lib/python3.10/site-packages/edgar/xbrl/stitching/query.py
Normal file
640
venv/lib/python3.10/site-packages/edgar/xbrl/stitching/query.py
Normal file
@@ -0,0 +1,640 @@
|
||||
"""
|
||||
XBRL Statement Stitching - Query Functionality
|
||||
|
||||
This module provides query functionality for stitched XBRL facts, allowing
|
||||
users to query standardized, multi-period financial data.
|
||||
"""
|
||||
|
||||
import re
|
||||
from collections import defaultdict
|
||||
from typing import TYPE_CHECKING, Any, Dict, List, Optional
|
||||
|
||||
import pandas as pd
|
||||
from rich import box
|
||||
from rich.console import Group
|
||||
from rich.markdown import Markdown
|
||||
from rich.panel import Panel
|
||||
from rich.table import Table
|
||||
from rich.text import Text
|
||||
|
||||
from edgar.richtools import repr_rich
|
||||
from edgar.xbrl.facts import FactQuery
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from edgar.xbrl.stitching.xbrls import XBRLS
|
||||
|
||||
|
||||
class StitchedFactsView:
    """
    A view over stitched facts from multiple XBRL filings.

    This class extracts facts from stitched statements rather than raw XBRL facts,
    ensuring that queries operate on standardized, post-processed data. Results of
    the last extraction are cached per argument combination.
    """

    def __init__(self, xbrls: 'XBRLS'):
        self.xbrls = xbrls
        # Cache of the most recent get_facts() result and the argument
        # tuple it was computed for. None means "never populated".
        self._facts_cache = None
        self._last_cache_key = None

    def __len__(self):
        # Length of the default extraction; may trigger stitching.
        return len(self.get_facts())

    @property
    def entity_name(self):
        """Get entity name from the most recent XBRL filing."""
        if self.xbrls.xbrl_list:
            return getattr(self.xbrls.xbrl_list[0], 'entity_name', 'Unknown Entity')
        return 'Unknown Entity'

    @property
    def document_type(self):
        """Get document type from entity info."""
        return self.xbrls.entity_info.get('document_type', 'Multi-Period Stitched')

    def get_facts(self,
                  max_periods: int = 8,
                  standard: bool = True,
                  statement_types: Optional[List[str]] = None) -> List[Dict[str, Any]]:
        """
        Extract facts from stitched statements.

        Args:
            max_periods: Maximum periods to include
            standard: Whether to use standardized labels
            statement_types: List of statement types to include

        Returns:
            List of fact dictionaries with stitched/standardized data
        """
        # Create cache key
        cache_key = (max_periods, standard, tuple(statement_types or []))
        # Bug fix: compare against None explicitly. A legitimately empty
        # fact list is falsy, so the old truthiness check re-ran the full
        # stitching on every call whenever no facts were found.
        if self._facts_cache is not None and self._last_cache_key == cache_key:
            return self._facts_cache

        statement_types = statement_types or [
            'IncomeStatement', 'BalanceSheet', 'CashFlowStatement',
            'StatementOfEquity', 'ComprehensiveIncome'
        ]

        all_facts = []

        for statement_type in statement_types:
            try:
                # Get stitched statement data (this applies standardization)
                stitched_data = self.xbrls.get_statement(
                    statement_type=statement_type,
                    max_periods=max_periods,
                    standard=standard
                )

                # Extract facts from stitched data
                facts = self._extract_facts_from_stitched_data(
                    stitched_data, statement_type
                )
                all_facts.extend(facts)

            except Exception:
                # Deliberate best-effort: skip statements that can't be
                # stitched rather than failing the whole extraction.
                continue

        # Cache results
        self._facts_cache = all_facts
        self._last_cache_key = cache_key

        return all_facts

    def _extract_facts_from_stitched_data(self,
                                          stitched_data: Dict[str, Any],
                                          statement_type: str) -> List[Dict[str, Any]]:
        """
        Convert stitched statement data back to fact-like records for querying.

        Args:
            stitched_data: Output from StatementStitcher
            statement_type: Type of statement

        Returns:
            List of fact dictionaries (one per concept/period with a value)
        """
        facts = []
        periods = stitched_data.get('periods', [])
        statement_data = stitched_data.get('statement_data', [])

        for item in statement_data:
            # Skip abstract items without values
            if item.get('is_abstract', False) and not item.get('has_values', False):
                continue

            concept = item.get('concept', '')
            label = item.get('label', '')
            original_label = item.get('original_label', label)

            # Create a fact record for each period with data
            for period_id, value in item.get('values', {}).items():
                if value is None:
                    continue

                # Find period metadata
                period_info = self._get_period_info(period_id, periods)

                fact = {
                    # Core identification
                    'concept': concept,
                    'label': label,  # Standardized label
                    'original_label': original_label,  # Original company label
                    'statement_type': statement_type,

                    # Value information
                    'value': value,
                    'numeric_value': self._convert_to_numeric(value),
                    'decimals': item.get('decimals', {}).get(period_id, 0),

                    # Period information
                    'period_key': period_id,
                    'period_type': period_info.get('period_type', 'duration'),
                    'period_start': period_info.get('period_start'),
                    'period_end': period_info.get('period_end'),
                    'period_instant': period_info.get('period_instant'),
                    'period_label': period_info.get('period_label', ''),

                    # Statement context
                    'level': item.get('level', 0),
                    'is_abstract': item.get('is_abstract', False),
                    'is_total': item.get('is_total', False),

                    # Multi-filing context
                    'filing_count': len(self.xbrls.xbrl_list),
                    'standardized': True,  # Mark as coming from standardized data

                    # Source attribution (which XBRL filing this came from)
                    'source_filing_index': self._determine_source_filing(period_id),
                }

                # Add fiscal period info if available
                fiscal_info = self._extract_fiscal_info(period_id)
                fact.update(fiscal_info)

                facts.append(fact)

        return facts

    def _get_period_info(self, period_id: str, periods: List[tuple]) -> Dict[str, Any]:
        """Extract period metadata from period_id and periods list.

        The period_id encodes the dates: 'instant_<date>' or
        'duration_<start>_<end>'. The periods list supplies display labels.
        """
        period_info = {}

        # Find matching period label
        for pid, label in periods:
            if pid == period_id:
                period_info['period_label'] = label
                break

        # Parse period_id to extract dates and type
        if period_id.startswith('instant_'):
            period_info['period_type'] = 'instant'
            date_str = period_id.replace('instant_', '')
            period_info['period_instant'] = date_str
            # Mirror the instant date into period_end for uniform sorting.
            period_info['period_end'] = date_str
        elif period_id.startswith('duration_'):
            period_info['period_type'] = 'duration'
            parts = period_id.replace('duration_', '').split('_')
            if len(parts) >= 2:
                period_info['period_start'] = parts[0]
                period_info['period_end'] = parts[1]

        return period_info

    def _convert_to_numeric(self, value: Any) -> Optional[float]:
        """Convert value to numeric if possible; return None otherwise.

        Strips commas and dollar signs from string values before parsing.
        """
        if value is None:
            return None
        try:
            if isinstance(value, (int, float)):
                return float(value)
            if isinstance(value, str):
                # Remove commas and currency symbols, then try to convert
                cleaned = value.replace(',', '').replace('$', '').strip()
                return float(cleaned)
        except (ValueError, TypeError):
            pass
        return None

    def _determine_source_filing(self, period_id: str) -> Optional[int]:
        """Determine which filing this period came from.

        Not yet implemented: would require enhanced tracking in the
        stitching process; currently always returns None.
        """
        return None

    def _extract_fiscal_info(self, period_id: str) -> Dict[str, Any]:
        """Extract fiscal period and year information.

        Simplified approach: reads fiscal fields from the first XBRL's
        entity_info regardless of which filing the period came from —
        could be enhanced with per-period source tracking.
        """
        fiscal_info = {}

        if self.xbrls.xbrl_list:
            entity_info = self.xbrls.xbrl_list[0].entity_info
            if entity_info:
                fiscal_info['fiscal_period'] = entity_info.get('fiscal_period')
                fiscal_info['fiscal_year'] = entity_info.get('fiscal_year')

        return fiscal_info

    def query(self, **kwargs) -> 'StitchedFactQuery':
        """Create a new query for stitched facts."""
        return StitchedFactQuery(self, **kwargs)
|
||||
class StitchedFactQuery(FactQuery):
|
||||
"""
|
||||
Enhanced fact query for stitched/standardized multi-filing data.
|
||||
|
||||
Extends the base FactQuery with capabilities specific to multi-period,
|
||||
standardized financial data.
|
||||
"""
|
||||
|
||||
def __init__(self, stitched_facts_view: StitchedFactsView, **kwargs):
    """Initialize the query against a StitchedFactsView.

    Recognized keyword args (max_periods, standard, statement_types)
    are captured here and forwarded to get_facts() when execute() runs.
    """
    # Initialize with stitched facts view instead of regular facts view
    self._stitched_facts_view = stitched_facts_view

    # Initialize base FactQuery attributes manually since we're not calling super().__init__
    self._facts_view = stitched_facts_view  # For compatibility with base class
    self._filters = []          # predicates applied to each fact in execute()
    self._transformations = []  # value transforms applied after filtering
    self._aggregations = []     # dimension aggregation specs
    self._include_dimensions = True
    self._include_contexts = True
    self._include_element_info = True
    self._sort_by = None        # field name to sort results by, if any
    self._sort_ascending = True
    self._limit = None          # max number of results, if set
    self._statement_type = None

    # Multi-filing specific options
    self._cross_period_only = False    # set by across_periods()
    self._trend_analysis = False       # set by trend_analysis()
    self._require_all_periods = False  # set by complete_periods_only()

    # Store query-specific parameters for get_facts
    self._max_periods = kwargs.get('max_periods', 8)
    self._standard = kwargs.get('standard', True)
    self._statement_types = kwargs.get('statement_types', None)
||||
def __str__(self):
    """Concise representation showing how many filters are queued."""
    filter_count = len(self._filters)
    return "StitchedFactQuery(filters={})".format(filter_count)
|
||||
# Enhanced filtering methods for multi-filing scenarios
|
||||
|
||||
def by_standardized_concept(self, concept_name: str) -> 'StitchedFactQuery':
    """
    Filter by standardized concept name (e.g., 'Revenue', 'Net Income').

    A fact matches on an exact standardized-label hit, or a
    case-insensitive substring hit against either its label or its
    underlying concept name.

    Args:
        concept_name: Standardized concept name

    Returns:
        Self for method chaining
    """
    needle = concept_name.lower()

    def matches(fact):
        if fact.get('label') == concept_name:
            return True
        if needle in fact.get('label', '').lower():
            return True
        return needle in fact.get('concept', '').lower()

    self._filters.append(matches)
    return self
||||
|
||||
def by_original_label(self, pattern: str, exact: bool = False) -> 'StitchedFactQuery':
    """
    Filter by original company-specific labels before standardization.

    Args:
        pattern: Pattern to match against original labels; treated as a
            case-insensitive regular expression unless exact is True
        exact: Whether to require exact match

    Returns:
        Self for method chaining
    """
    if exact:
        def predicate(fact):
            return fact.get('original_label') == pattern
    else:
        compiled = re.compile(pattern, re.IGNORECASE)

        def predicate(fact):
            original = fact.get('original_label')
            return original and compiled.search(original)

    self._filters.append(predicate)
    return self
||||
|
||||
def across_periods(self, min_periods: int = 2) -> 'StitchedFactQuery':
    """
    Filter to concepts that appear across multiple periods.

    The actual filtering happens during execute(); this only records
    the requirement.

    Args:
        min_periods: Minimum number of periods the concept must appear in

    Returns:
        Self for method chaining
    """
    self._min_periods = min_periods
    self._cross_period_only = True
    return self
||||
def by_fiscal_period(self, fiscal_period: str) -> 'StitchedFactQuery':
    """
    Filter by fiscal period (FY, Q1, Q2, Q3, Q4).

    Args:
        fiscal_period: Fiscal period identifier

    Returns:
        Self for method chaining
    """
    def matches(fact):
        return fact.get('fiscal_period') == fiscal_period

    self._filters.append(matches)
    return self
||||
def by_filing_index(self, filing_index: int) -> 'StitchedFactQuery':
    """
    Filter facts by which filing they originated from.

    Args:
        filing_index: Index of the filing (0 = most recent)

    Returns:
        Self for method chaining
    """
    def matches(fact):
        return fact.get('source_filing_index') == filing_index

    self._filters.append(matches)
    return self
|
||||
def trend_analysis(self, concept: str) -> 'StitchedFactQuery':
    """
    Set up for trend analysis of a specific concept across periods.

    Enables period-ordered output in execute() and narrows results to
    the given standardized concept.

    Args:
        concept: Concept to analyze trends for

    Returns:
        Self for method chaining
    """
    self._trend_analysis = True
    # by_standardized_concept already returns self, so chain directly.
    return self.by_standardized_concept(concept)
|
||||
def complete_periods_only(self) -> 'StitchedFactQuery':
    """
    Only return concepts that have values in all available periods.

    The filtering itself is deferred to execute(); this just flags it.

    Returns:
        Self for method chaining
    """
    self._require_all_periods = True
    return self
|
||||
def execute(self) -> List[Dict[str, Any]]:
    """
    Execute the query with enhanced multi-period processing.

    Pipeline: extract facts -> filters -> transformations ->
    aggregations -> cross-period / complete-period / trend
    post-processing -> sort -> limit.

    Returns:
        List of fact dictionaries
    """
    # Get base results from stitched facts with query parameters
    results = self._stitched_facts_view.get_facts(
        max_periods=self._max_periods,
        standard=self._standard,
        statement_types=self._statement_types
    )

    # Apply standard filters
    for filter_func in self._filters:
        results = [f for f in results if filter_func(f)]

    # Apply transformations
    # NOTE(review): this mutates fact dicts in place, and get_facts()
    # serves results from its cache — transformed values would be
    # visible to later queries on the same view; confirm intended.
    for transform_fn in self._transformations:
        for fact in results:
            if 'value' in fact and fact['value'] is not None:
                fact['value'] = transform_fn(fact['value'])

    # Apply aggregations
    if self._aggregations:
        aggregated_results = {}
        for agg in self._aggregations:
            dimension = agg['dimension']
            func = agg['function']

            # Group facts by dimension
            groups = {}
            for fact in results:
                dim_value = fact.get(f'dim_{dimension}')
                if dim_value and 'value' in fact and fact['value'] is not None:
                    if dim_value not in groups:
                        groups[dim_value] = []
                    groups[dim_value].append(fact['value'])

            # Apply aggregation function ('sum' or 'average'; anything
            # else yields the 0.0 default)
            for dim_value, values in groups.items():
                agg_value = 0.0  # Initialize with default value
                if func == 'sum':
                    agg_value = sum(values)
                elif func == 'average':
                    agg_value = sum(values) / len(values)

                key = (dimension, dim_value)
                if key not in aggregated_results:
                    aggregated_results[key] = {'dimension': dimension, 'value': dim_value, 'values': {}}
                aggregated_results[key]['values'][func] = agg_value

        # Aggregation replaces the fact records entirely; the steps
        # below then operate on aggregate rows, not facts.
        results = list(aggregated_results.values())

    # Apply cross-period filtering if requested
    if self._cross_period_only:
        results = self._filter_cross_period_concepts(results)

    # Apply complete periods filtering if requested
    if self._require_all_periods:
        results = self._filter_complete_periods(results)

    # Apply trend analysis if requested
    if self._trend_analysis:
        results = self._prepare_trend_data(results)

    # Apply sorting if specified (only when the sort key is present on
    # the first record)
    if results and self._sort_by and self._sort_by in results[0]:
        results.sort(key=lambda f: f.get(self._sort_by, ''),
                     reverse=not self._sort_ascending)

    # Apply limit if specified
    if self._limit is not None:
        results = results[:self._limit]

    return results
|
||||
def _filter_cross_period_concepts(self, results: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
|
||||
"""Filter to concepts that appear in multiple periods."""
|
||||
concept_periods = defaultdict(set)
|
||||
for fact in results:
|
||||
concept_key = (fact.get('concept', ''), fact.get('label', ''))
|
||||
concept_periods[concept_key].add(fact.get('period_key', ''))
|
||||
|
||||
# Filter to concepts with minimum period count
|
||||
valid_concepts = {
|
||||
concept for concept, periods in concept_periods.items()
|
||||
if len(periods) >= getattr(self, '_min_periods', 2)
|
||||
}
|
||||
|
||||
return [
|
||||
fact for fact in results
|
||||
if (fact.get('concept', ''), fact.get('label', '')) in valid_concepts
|
||||
]
|
||||
|
||||
def _filter_complete_periods(self, results: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
|
||||
"""Filter to concepts that have values in all periods."""
|
||||
# Get all available periods
|
||||
all_periods = set(fact.get('period_key', '') for fact in results)
|
||||
|
||||
concept_periods = defaultdict(set)
|
||||
for fact in results:
|
||||
concept_key = (fact.get('concept', ''), fact.get('label', ''))
|
||||
concept_periods[concept_key].add(fact.get('period_key', ''))
|
||||
|
||||
# Filter to concepts with complete period coverage
|
||||
complete_concepts = {
|
||||
concept for concept, periods in concept_periods.items()
|
||||
if periods == all_periods
|
||||
}
|
||||
|
||||
return [
|
||||
fact for fact in results
|
||||
if (fact.get('concept', ''), fact.get('label', '')) in complete_concepts
|
||||
]
|
||||
|
||||
def _prepare_trend_data(self, results: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
|
||||
"""Prepare data for trend analysis by sorting periods."""
|
||||
# Sort by period end date for trend analysis
|
||||
return sorted(results, key=lambda f: f.get('period_end', ''))
|
||||
|
||||
def to_trend_dataframe(self) -> pd.DataFrame:
    """
    Create a DataFrame optimized for trend analysis.

    Executes the query and pivots the facts so each (label, concept) pair
    becomes a row and each period end date becomes a column.

    Returns:
        DataFrame with concepts as rows and periods as columns; an empty
        DataFrame when the query yields nothing, or the flat fact frame
        when the columns required for pivoting are absent.
    """
    facts = self.execute()
    if not facts:
        return pd.DataFrame()

    frame = pd.DataFrame(facts)

    # Pivoting needs the concept, period and numeric-value columns.
    required = {'concept', 'period_end', 'numeric_value'}
    if not required.issubset(frame.columns):
        # Cannot pivot without the key columns; hand back the flat frame.
        return frame

    return frame.pivot_table(
        index=['label', 'concept'],
        columns='period_end',
        values='numeric_value',
        aggfunc='first',
    )
|
||||
|
||||
def to_dataframe(self, *columns) -> pd.DataFrame:
    """
    Execute the query and return the matching facts as a DataFrame.

    Args:
        columns: Optional column names to restrict the DataFrame to.

    Returns:
        pandas DataFrame with one row per fact; empty when nothing matches.
    """
    facts = self.execute()
    if not facts:
        return pd.DataFrame()

    frame = pd.DataFrame(facts)
    # Values are rendered as text so mixed numeric/string facts display uniformly.
    frame['value'] = frame['value'].astype(str)

    # Honour the include/exclude flags configured on the query.
    if not self._include_dimensions:
        frame = frame.loc[:, [c for c in frame.columns if not c.startswith('dim_')]]

    if not self._include_contexts:
        excluded = {'context_ref', 'entity_identifier', 'entity_scheme', 'period_type'}
        frame = frame.loc[:, [c for c in frame.columns if c not in excluded]]

    if not self._include_element_info:
        excluded = {'element_id', 'element_name', 'element_type', 'element_period_type',
                    'element_balance', 'element_label'}
        frame = frame.loc[:, [c for c in frame.columns if c not in excluded]]

    # Columns that never carry data for any row are noise; drop them.
    frame = frame.dropna(axis=1, how='all')

    # Restrict to caller-requested columns, if any were given.
    if columns:
        frame = frame[list(columns)]

    # Internal bookkeeping columns are never shown.
    hidden = ['fact_key', 'period_key']

    # Put the most commonly inspected columns first; the rest keep their order.
    preferred = [c for c in
                 ['concept', 'label', 'original_label', 'value', 'numeric_value',
                  'period_start', 'period_end', 'decimals', 'statement_type', 'fiscal_period']
                 if c in frame.columns]
    ordered = preferred + [c for c in frame.columns
                           if c not in preferred and c not in hidden]

    return frame[ordered]
|
||||
|
||||
def __rich__(self):
    """Rich console rendering: a panel with usage hints and a result preview.

    Builds a title/subtitle from the underlying stitched facts view, a
    Markdown help blurb listing the available columns and the enhanced
    multi-period query methods, and a table previewing up to 10 result rows.
    """
    title = Text.assemble(("Stitched Facts Query"),
                          )
    # Subtitle shows which entity and document type the stitched view covers.
    subtitle = Text.assemble((self._stitched_facts_view.entity_name, "bold deep_sky_blue1"),
                             " - ",
                             (self._stitched_facts_view.document_type)
                             )
    # Materialize the query once; NaNs become empty strings for display.
    df = self.to_dataframe().fillna('')
    columns = df.columns.tolist()
    description = Markdown(
        f"""
Use *to_dataframe(columns)* to get a DataFrame of the results.

e.g. `query.to_dataframe('concept', 'value', 'period_end')`

Available columns:
'{', '.join(columns)}'

**Enhanced Multi-Period Methods:**
- `across_periods(min_periods=2)` - Filter to concepts across multiple periods
- `by_standardized_concept('Revenue')` - Filter by standardized labels
- `by_original_label('Net sales')` - Filter by original company labels
- `trend_analysis('Revenue')` - Set up trend analysis
- `to_trend_dataframe()` - Get trend-optimized DataFrame
"""
    )

    # Only show the columns that are both interesting and actually present.
    display_columns = [col for col in ['label', 'concept', 'value', 'period_start', 'period_end', 'statement_type']
                       if col in columns]

    if not df.empty:
        df_display = df[display_columns].head(10)  # Show first 10 rows
        table = Table(*display_columns, show_header=True, header_style="bold", box=box.SIMPLE)
        for t in df_display.itertuples(index=False):
            row = []
            for i in t:
                row.append(str(i)[:50])  # Truncate long values
            table.add_row(*row)
    else:
        table = Table("No results found", box=box.SIMPLE)

    # Compose help text and preview table into a single rounded panel.
    panel = Panel(Group(description, table), title=title, subtitle=subtitle, box=box.ROUNDED)
    return panel
|
||||
|
||||
def __repr__(self):
    """Plain-text representation derived from the rich rendering."""
    return repr_rich(self.__rich__())
|
||||
106
venv/lib/python3.10/site-packages/edgar/xbrl/stitching/utils.py
Normal file
106
venv/lib/python3.10/site-packages/edgar/xbrl/stitching/utils.py
Normal file
@@ -0,0 +1,106 @@
|
||||
"""
|
||||
XBRL Statement Stitching - Utility Functions
|
||||
|
||||
This module contains utility functions for rendering and converting stitched
|
||||
statement data.
|
||||
"""
|
||||
|
||||
from typing import Any, Dict, Optional
|
||||
|
||||
import pandas as pd
|
||||
|
||||
|
||||
def render_stitched_statement(
    stitched_data: Dict[str, Any],
    statement_title: str,
    statement_type: str,
    entity_info: Optional[Dict[str, Any]] = None,
    show_date_range: bool = False,
    xbrl_instance: Optional[Any] = None
):
    """
    Render a stitched statement using the same rendering logic as individual statements.

    Args:
        stitched_data: Stitched statement data with 'periods' and 'statement_data' keys
        statement_title: Title of the statement
        statement_type: Type of statement ('BalanceSheet', 'IncomeStatement', etc.)
        entity_info: Entity information (optional)
        show_date_range: Whether to show full date ranges for duration periods
        xbrl_instance: XBRL instance forwarded to the renderer (optional)

    Returns:
        RichTable: A formatted table representation of the stitched statement
    """
    # Imported lazily to avoid a circular import with the rendering module.
    from edgar.xbrl.rendering import render_statement

    # Extract periods and statement data
    periods_to_display = stitched_data['periods']
    statement_data = stitched_data['statement_data']

    # For multiple periods, annotate the title so the trend view is obvious.
    if len(periods_to_display) > 1:
        period_desc = f" ({len(periods_to_display)}-Period View)"
        statement_title = f"{statement_title}{period_desc}"

    # Delegate to the single-statement renderer, which understands show_date_range.
    return render_statement(
        statement_data=statement_data,
        periods_to_display=periods_to_display,
        statement_title=statement_title,
        statement_type=statement_type,
        entity_info=entity_info,
        show_date_range=show_date_range,
        xbrl_instance=xbrl_instance
    )
|
||||
|
||||
|
||||
def to_pandas(stitched_data: Dict[str, Any]) -> pd.DataFrame:
    """
    Convert stitched statement data to a pandas DataFrame.

    Abstract rows that carry no values (pure section headers) are dropped.

    Args:
        stitched_data: Stitched statement data with 'periods' and
            'statement_data' keys

    Returns:
        DataFrame with 'label' and 'concept' columns followed by one column
        per period (in the stitched order, newest first), named by the
        period's end date in YYYY-MM-DD format
    """
    statement_data = stitched_data['statement_data']

    # Period column names, preserving the stitched ordering (newest first).
    # The last 10 characters of a period id are its end date (YYYY-MM-DD).
    period_columns = [period_id[-10:] for period_id, _period_label in stitched_data['periods']]

    # Column-oriented accumulator: metadata columns first, then one list
    # per period column, all kept in lockstep row order.
    data: Dict[str, list] = {'label': [], 'concept': []}
    for col in period_columns:
        data[col] = []

    for item in statement_data:
        # Skip abstract items without values (headers contribute no data)
        if item['is_abstract'] and not item['has_values']:
            continue

        data['label'].append(item['label'])
        data['concept'].append(item['concept'])

        # Add values for each period in the same order as the columns;
        # missing values become None (NaN in the resulting frame).
        for period_id, _period_label in stitched_data['periods']:
            data[period_id[-10:]].append(item['values'].get(period_id))

    # Assemble the DataFrame with an explicit column order.
    column_order = ['label', 'concept'] + period_columns
    return pd.DataFrame(data, columns=column_order)
|
||||
340
venv/lib/python3.10/site-packages/edgar/xbrl/stitching/xbrls.py
Normal file
340
venv/lib/python3.10/site-packages/edgar/xbrl/stitching/xbrls.py
Normal file
@@ -0,0 +1,340 @@
|
||||
"""
|
||||
XBRL Statement Stitching - XBRLS Class
|
||||
|
||||
This module contains the XBRLS class which represents multiple XBRL filings
|
||||
stitched together for multi-period analysis.
|
||||
"""
|
||||
|
||||
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from edgar.xbrl.stitching.core import StatementStitcher, stitch_statements
|
||||
from edgar.xbrl.stitching.query import StitchedFactQuery, StitchedFactsView
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from edgar._filings import Filings
|
||||
from edgar.xbrl.statements import StitchedStatements
|
||||
|
||||
|
||||
class XBRLS:
    """
    A class representing multiple XBRL filings stitched together.

    This provides a unified view of financial data across multiple time periods,
    automatically handling the complexities of statement stitching.
    """

    def __init__(self, xbrl_list: List[Any]):
        """
        Initialize an XBRLS instance with a list of XBRL objects.

        Args:
            xbrl_list: List of XBRL objects, should be from the same company
                      and ordered from newest to oldest
        """
        # Store the list of XBRL objects
        self.xbrl_list = xbrl_list

        # Entity info comes from the most recent XBRL (first in the list)
        self.entity_info = xbrl_list[0].entity_info if xbrl_list else {}

        # Cache for stitched statements, keyed by statement type + options
        self._statement_cache = {}

        # Lazily-created view over stitched facts
        self._stitched_facts_view = None

    @classmethod
    def from_filings(cls, filings: Union['Filings', List[Any]], filter_amendments: bool = True) -> 'XBRLS':
        """
        Create an XBRLS object from a Filings object or a plain list of Filing objects.
        Each filing should be the same form (e.g., 10-K, 10-Q) and from the same company.

        Args:
            filings: Filings object or list of Filing objects, should be from the same company
            filter_amendments: Whether to exclude amendment filings (e.g. 10-K/A);
                only applied when *filings* supports filtering

        Returns:
            XBRLS object with stitched data
        """
        from edgar.xbrl.xbrl import XBRL

        # Only a Filings object supports .filter(); a plain list of Filing
        # objects is used as-is (previously this raised AttributeError).
        if filter_amendments and hasattr(filings, 'filter'):
            filtered_filings = filings.filter(amendments=False)
        else:
            filtered_filings = filings

        # Sort filings by date (newest first)
        sorted_filings = sorted(filtered_filings, key=lambda f: f.filing_date, reverse=True)

        # Parse each filing; skip any that fail so one bad filing does not
        # prevent stitching the rest (deliberate best-effort behavior).
        xbrl_list = []
        for filing in sorted_filings:
            try:
                xbrl_list.append(XBRL.from_filing(filing))
            except Exception:
                pass

        return cls(xbrl_list)

    @classmethod
    def from_xbrl_objects(cls, xbrl_list: List[Any]) -> 'XBRLS':
        """
        Create an XBRLS object from a list of XBRL objects.

        Args:
            xbrl_list: List of XBRL objects, should be from the same company

        Returns:
            XBRLS object with stitched data
        """
        return cls(xbrl_list)

    @property
    def statements(self) -> 'StitchedStatements':
        """
        Get a user-friendly interface to access stitched financial statements.

        Returns:
            StitchedStatements object
        """
        # Imported lazily to avoid a circular import with the statements module.
        from edgar.xbrl.statements import StitchedStatements
        return StitchedStatements(self)

    @property
    def facts(self) -> 'StitchedFactsView':
        """
        Get a view over stitched facts from all XBRL filings.

        Returns:
            StitchedFactsView for querying standardized, multi-period data
        """
        # Created on first access and reused afterwards.
        if self._stitched_facts_view is None:
            self._stitched_facts_view = StitchedFactsView(self)
        return self._stitched_facts_view

    def query(self,
              max_periods: int = 8,
              standardize: bool = True,
              statement_types: Optional[List[str]] = None,
              **kwargs) -> 'StitchedFactQuery':
        """
        Start a new query for stitched facts across all filings.

        Args:
            max_periods: Maximum periods to include in stitched data
            standardize: Whether to use standardized labels
            statement_types: List of statement types to include
            **kwargs: Additional options passed to StitchedFactQuery

        Returns:
            StitchedFactQuery for building complex queries
        """
        # Fold the explicit parameters into the kwargs handed to the query.
        kwargs.update({
            'max_periods': max_periods,
            'standardize': standardize,
            'statement_types': statement_types
        })
        return self.facts.query(**kwargs)

    def get_statement(self, statement_type: str,
                      max_periods: int = 8,
                      standard: bool = True,
                      use_optimal_periods: bool = True,
                      include_dimensions: bool = False) -> Dict[str, Any]:
        """
        Get a stitched statement of the specified type.

        Args:
            statement_type: Type of statement to stitch ('IncomeStatement', 'BalanceSheet', etc.)
            max_periods: Maximum number of periods to include
            standard: Whether to use standardized concept labels
            use_optimal_periods: Whether to use entity info to determine optimal periods
            include_dimensions: Whether to include dimensional segment data (default: False for stitching)

        Returns:
            Dictionary with stitched statement data
        """
        # Check the cache first; the key encodes every option that affects output.
        cache_key = f"{statement_type}_{max_periods}_{standard}_{use_optimal_periods}_{include_dimensions}"
        if cache_key in self._statement_cache:
            return self._statement_cache[cache_key]

        # Stitch the statement across all filings.
        result = stitch_statements(
            self.xbrl_list,
            statement_type=statement_type,
            period_type=StatementStitcher.PeriodType.ALL_PERIODS,
            max_periods=max_periods,
            standard=standard,
            use_optimal_periods=use_optimal_periods,
            include_dimensions=include_dimensions
        )

        # Cache the result for repeat access.
        self._statement_cache[cache_key] = result
        return result

    def render_statement(self, statement_type: str,
                         max_periods: int = 8,
                         standardize: bool = True,
                         use_optimal_periods: bool = True,
                         show_date_range: bool = False,
                         include_dimensions: bool = False):
        """
        Render a stitched statement in a rich table format.

        Args:
            statement_type: Type of statement to render ('BalanceSheet', 'IncomeStatement', etc.)
            max_periods: Maximum number of periods to include
            standardize: Whether to use standardized concept labels
            use_optimal_periods: Whether to use entity info to determine optimal periods
            show_date_range: Whether to show full date ranges for duration periods
            include_dimensions: Whether to include dimensional segment data (default: False for stitching)

        Returns:
            RichTable: A formatted table representation of the stitched statement
        """
        # Delegate to a StitchedStatement, which owns the rendering logic.
        from edgar.xbrl.statements import StitchedStatement
        statement = StitchedStatement(self, statement_type, max_periods, standardize, use_optimal_periods, include_dimensions)
        return statement.render(show_date_range=show_date_range)

    def to_dataframe(self, statement_type: str,
                     max_periods: int = 8,
                     standardize: bool = True) -> pd.DataFrame:
        """
        Convert a stitched statement to a pandas DataFrame.

        Args:
            statement_type: Type of statement to convert ('BalanceSheet', 'IncomeStatement', etc.)
            max_periods: Maximum number of periods to include
            standardize: Whether to use standardized concept labels

        Returns:
            DataFrame with periods as columns and concepts as index
        """
        # Delegate to a StitchedStatement, which owns the conversion logic.
        from edgar.xbrl.statements import StitchedStatement
        statement = StitchedStatement(self, statement_type, max_periods, standardize)
        return statement.to_dataframe()

    def get_periods(self) -> List[Dict[str, str]]:
        """
        Get all available periods across all XBRL objects.

        Returns:
            List of period information dictionaries, each containing:
            - 'type': 'instant' or 'duration'
            - 'key': period key (e.g., 'instant_2024-09-28', 'duration_2024-01-01_2024-09-28')
            - 'label': human-readable label
            For instant periods:
            - 'date': end date as 'YYYY-MM-DD'
            For duration periods:
            - 'start_date': start date as 'YYYY-MM-DD'
            - 'end_date': end date as 'YYYY-MM-DD'
            - 'days': duration in days
            - 'period_type': classification ('Annual', 'Quarterly', etc.)
        """
        # Collect reporting periods from every filing.
        all_periods = []
        for xbrl in self.xbrl_list:
            all_periods.extend(xbrl.reporting_periods)

        # De-duplicate periods describing the same date(s); the first
        # occurrence (from the newest filing) wins.
        unique_periods = {}
        for period in all_periods:
            key = period['date'] if period['type'] == 'instant' else f"{period['start_date']}_{period['end_date']}"
            if key not in unique_periods:
                unique_periods[key] = period

        return list(unique_periods.values())

    def get_period_end_dates(self) -> List[str]:
        """
        Get end dates for all available periods in YYYY-MM-DD format.

        This is a convenience method that extracts just the end dates from periods,
        handling both instant and duration periods correctly.

        Returns:
            List of end dates as strings in YYYY-MM-DD format, sorted newest first
        """
        end_dates = []
        for period in self.get_periods():
            # Duration periods carry 'end_date'; instants carry 'date'.
            if period.get('type') == 'duration':
                end_date = period.get('end_date')
            elif period.get('type') == 'instant':
                end_date = period.get('date')
            else:
                continue

            if end_date:
                end_dates.append(end_date)

        # Deduplicate via set, then sort newest first.
        return sorted(set(end_dates), reverse=True)

    def __str__(self) -> str:
        """
        String representation of the XBRLS object.

        Returns:
            String representation
        """
        filing_count = len(self.xbrl_list)
        periods = self.get_periods()
        return f"XBRLS with {filing_count} filings covering {len(periods)} unique periods"

    def __rich__(self):
        """
        Rich representation for pretty console output.

        Returns:
            Rich console representation
        """
        # Imported lazily so rich is only required when actually rendering.
        from rich.panel import Panel
        from rich.text import Text

        # Summary counts for the header.
        filing_count = len(self.xbrl_list)
        periods = self.get_periods()

        # Create a panel with the information
        content = Text.from_markup("[bold]XBRLS Object[/bold]\n")
        content.append(f"Filings: {filing_count}\n")
        content.append(f"Unique Periods: {len(periods)}\n")

        # List the statement types available across all filings.
        statement_types = set()
        for xbrl in self.xbrl_list:
            statements = xbrl.get_all_statements()
            for stmt in statements:
                if stmt['type']:
                    statement_types.add(stmt['type'])

        content.append("\n[bold]Available Statement Types:[/bold]\n")
        for stmt_type in sorted(statement_types):
            content.append(f"- {stmt_type}\n")

        # Show how to access statements
        content.append("\n[bold]Example Usage:[/bold]\n")
        content.append("xbrls.statements.income_statement()\n")
        content.append("xbrls.statements.balance_sheet()\n")
        content.append("xbrls.to_dataframe('IncomeStatement')\n")

        return Panel(content, title="XBRLS", expand=False)
|
||||
1732
venv/lib/python3.10/site-packages/edgar/xbrl/xbrl.py
Normal file
1732
venv/lib/python3.10/site-packages/edgar/xbrl/xbrl.py
Normal file
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user