Initial commit

This commit is contained in:
kdusek
2025-12-09 12:13:01 +01:00
commit 8e654ed209
13332 changed files with 2695056 additions and 0 deletions

View File

@@ -0,0 +1,72 @@
"""
XBRL2 Module - Enhanced XBRL Processing for EdgarTools
This module provides enhanced parsing and processing of XBRL data,
with support for statement standardization and multi-period statement stitching.
Example usage:
from edgar import Company
from edgar.xbrl import XBRL, XBRLS
# Parse a single filing
company = Company("AAPL")
filing = company.latest_10k()
xbrl = XBRL.from_filing(filing)
# Access statements from a single filing
balance_sheet = xbrl.statements.balance_sheet()
income_statement = xbrl.statements.income_statement()
# Render the statement or convert to DataFrame
print(balance_sheet.render())
df = income_statement.to_dataframe()
# For multi-period analysis, use XBRLS to stitch statements together
filings = company.latest("10-K", 3) # Get 3 years of 10-K filings
xbrls = XBRLS.from_filings(filings)
# Access stitched statements showing multiple years of data
stitched_income = xbrls.statements.income_statement()
# Render the stitched statement or convert to DataFrame
print(stitched_income.render())
df = stitched_income.to_dataframe()
"""
from edgar.xbrl.facts import FactQuery, FactsView
from edgar.xbrl.rendering import RenderedStatement
from edgar.xbrl.standardization import StandardConcept
from edgar.xbrl.statements import Statement, Statements, StitchedStatement, StitchedStatements
# Export statement stitching functionality
from edgar.xbrl.stitching import (
XBRLS,
StatementStitcher,
StitchedFactQuery,
StitchedFactsView,
render_stitched_statement,
stitch_statements,
to_pandas,
)
from edgar.xbrl.xbrl import XBRL, XBRLFilingWithNoXbrlData
# Public API of the edgar.xbrl package.  Anything not listed here is an
# implementation detail and may change without notice.
__all__ = [
    'XBRL',
    'XBRLFilingWithNoXbrlData',
    'XBRLS',
    'Statements',
    'Statement',
    'StitchedStatements',
    'StitchedStatement',
    'StandardConcept',
    'StatementStitcher',
    'stitch_statements',
    'render_stitched_statement',
    'RenderedStatement',
    'to_pandas',
    'FactsView',
    'FactQuery',
    'StitchedFactsView',
    'StitchedFactQuery'
]

View File

@@ -0,0 +1,180 @@
"""
Abstract concept detection for XBRL elements.
This module provides utilities to determine if an XBRL concept should be marked as abstract,
using multiple fallback strategies when taxonomy schema information is not available.
Background:
-----------
EdgarTools currently only parses company-specific XSD schema files included in SEC filings.
Standard taxonomy schemas (US-GAAP, DEI, etc.) are referenced externally and not parsed.
This means concepts from standard taxonomies are added to the element catalog without their
abstract attribute information, defaulting to abstract=False.
Solution:
---------
This module implements a multi-tier fallback strategy for abstract detection:
1. Trust schema abstract attribute (if available and True)
2. Check known abstract concepts (explicit list)
3. Pattern matching on concept name
4. Structural heuristics (has children but no values)
See: Issue #450 - Statement of Equity rendering problems
"""
import re
from typing import List, Set
# Known abstract concepts from US-GAAP taxonomy
# These are explicitly marked abstract="true" in the US-GAAP taxonomy schemas
KNOWN_ABSTRACT_CONCEPTS: Set[str] = {
    # Statement abstracts
    'us-gaap_StatementOfFinancialPositionAbstract',
    'us-gaap_StatementOfStockholdersEquityAbstract',
    'us-gaap_StatementOfIncomeAndComprehensiveIncomeAbstract',
    'us-gaap_StatementOfCashFlowsAbstract',
    'us-gaap_IncomeStatementAbstract',
    # Roll forward abstracts
    'us-gaap_IncreaseDecreaseInStockholdersEquityRollForward',
    'us-gaap_PropertyPlantAndEquipmentRollForward',
    'us-gaap_IntangibleAssetsRollForward',
    'us-gaap_LongTermDebtRollForward',
    # Reconciliation abstracts
    'us-gaap_AdjustmentsToReconcileNetIncomeLossToCashProvidedByUsedInOperatingActivitiesAbstract',
    # Table and axis abstracts
    'us-gaap_StatementTable',
    'us-gaap_StatementLineItems',
    'us-gaap_StatementEquityComponentsAxis',
    'us-gaap_EquityComponentDomain',
    # Accounting policies
    'us-gaap_AccountingPoliciesAbstract',
    'us-gaap_SignificantAccountingPoliciesTextBlock',
    # Disclosure abstracts
    'us-gaap_DisclosureTextBlockAbstract',
    # Document and entity information (DEI)
    'dei_DocumentInformationAbstract',
    'dei_EntityInformationAbstract',
    'dei_CoverAbstract',
}

# Patterns that indicate a concept is likely abstract
# These are based on XBRL naming conventions
ABSTRACT_CONCEPT_PATTERNS: List[str] = [
    r'.*Abstract$',      # Ends with "Abstract"
    r'.*RollForward$',   # Ends with "RollForward" (roll forward tables)
    r'.*Table$',         # Ends with "Table" (dimensional tables)
    r'.*Axis$',          # Ends with "Axis" (dimensional axes)
    r'.*Domain$',        # Ends with "Domain" (dimension domains)
    r'.*LineItems$',     # Ends with "LineItems" (line item tables)
    r'.*TextBlock$',     # Ends with "TextBlock" (disclosure text blocks)
]


def is_abstract_concept(
    concept_name: str,
    schema_abstract: bool = False,
    has_children: bool = False,
    has_values: bool = False
) -> bool:
    """
    Determine if an XBRL concept should be marked as abstract using multiple fallback strategies.

    Strategy priority:
    1. Trust schema if it explicitly says abstract=True
    2. Check against known abstract concepts list
    3. Apply pattern matching on concept name
    4. Use structural heuristics (has children but no values)

    Args:
        concept_name: The XBRL concept name (e.g., "us-gaap_StatementOfStockholdersEquityAbstract")
        schema_abstract: The abstract attribute from the schema (if available)
        has_children: Whether this concept has children in the presentation tree
        has_values: Whether this concept has fact values in the instance

    Returns:
        True if the concept should be marked as abstract, False otherwise

    Examples:
        >>> is_abstract_concept('us-gaap_StatementOfStockholdersEquityAbstract')
        True
        >>> is_abstract_concept('us-gaap_Revenue')
        False
        >>> is_abstract_concept('us-gaap_SomethingRollForward')
        True
        >>> is_abstract_concept('us-gaap_UnknownConcept', has_children=True, has_values=False)
        True
    """
    # Strategy 1: Trust schema if it says True
    if schema_abstract:
        return True

    # Strategy 2: Check known abstract concepts
    if concept_name in KNOWN_ABSTRACT_CONCEPTS:
        return True

    # Strategy 3: Pattern matching on the concept name.  The globals are read
    # on every call so that runtime additions via add_abstract_pattern() are
    # picked up immediately.
    if any(re.match(pattern, concept_name) for pattern in ABSTRACT_CONCEPT_PATTERNS):
        return True

    # Strategy 4: Structural heuristics.  A concept with children in the
    # presentation tree but no fact values is most likely an abstract header.
    if has_children and not has_values:
        return True

    # Nothing matched.  Strategy 1 already returned for a truthy
    # schema_abstract, so the schema value is necessarily falsy at this
    # point; return False explicitly rather than echoing the schema value
    # back (the original `return schema_abstract` was dead/misleading).
    return False
def add_known_abstract_concept(concept_name: str) -> None:
    """
    Add a concept to the known abstracts list.

    This allows runtime extension of the known abstracts list when new abstract
    concepts are discovered that don't match existing patterns.

    Note: this mutates module-level state, so the addition is visible to every
    subsequent is_abstract_concept() call in the process.

    Args:
        concept_name: The XBRL concept name to add
    """
    KNOWN_ABSTRACT_CONCEPTS.add(concept_name)
def add_abstract_pattern(pattern: str) -> None:
    """
    Add a pattern to the abstract pattern list.

    The pattern is validated eagerly so that a malformed regular expression
    fails here, at registration time, rather than later inside an
    is_abstract_concept() call far from the source of the bad pattern.

    Args:
        pattern: Regular expression pattern to match abstract concepts

    Raises:
        re.error: If the pattern is not a valid regular expression
    """
    re.compile(pattern)  # fail fast on an invalid regex
    ABSTRACT_CONCEPT_PATTERNS.append(pattern)
def get_known_abstract_concepts() -> Set[str]:
    """
    Return a snapshot of the concept names known to be abstract.

    A new set is constructed so callers cannot accidentally mutate the
    module-level registry.

    Returns:
        Set of concept names known to be abstract
    """
    return set(KNOWN_ABSTRACT_CONCEPTS)
def get_abstract_patterns() -> List[str]:
    """
    Return a snapshot of the abstract concept patterns.

    A new list is constructed so callers cannot accidentally mutate the
    module-level registry.

    Returns:
        List of regex patterns used to identify abstract concepts
    """
    return list(ABSTRACT_CONCEPT_PATTERNS)

View File

@@ -0,0 +1,125 @@
"""Financial fraud detection module.
This module provides tools for detecting potential financial fraud and anomalies:
- Benford's Law Analysis for digit distribution anomalies
- Altman Z-Score for bankruptcy risk
- Beneish M-Score for earnings manipulation
- Piotroski F-Score for financial strength
"""
import math
from collections import Counter
from dataclasses import dataclass
from typing import Any, Dict, List, Optional
from ..standardization import StandardConcept
from .metrics import AltmanZScore, BeneishMScore, PiotroskiFScore
@dataclass
class BenfordResult:
    """Results from a Benford's Law first-digit analysis."""
    observed_dist: Dict[int, float]  # Observed first-digit distribution (digit -> frequency)
    expected_dist: Dict[int, float]  # Expected Benford distribution (digit -> probability)
    chi_square: float                # Chi-square goodness-of-fit statistic
    p_value: float                   # P-value for the goodness-of-fit test
    anomalous: bool                  # True when the distribution deviates significantly

    def __repr__(self) -> str:
        status = "Anomalous" if self.anomalous else "Normal"
        return f"{status} (p={self.p_value:.3f})"
class FraudDetector:
"""Detect potential financial fraud using multiple methods."""
def __init__(self, xbrl):
"""Initialize with an XBRL instance."""
self.xbrl = xbrl
self.altman = AltmanZScore(xbrl)
self.beneish = BeneishMScore(xbrl)
self.piotroski = PiotroskiFScore(xbrl)
def analyze_digit_distribution(self, values: List[float], significance: float = 0.05) -> Optional[BenfordResult]:
"""Analyze digit distribution using Benford's Law.
Args:
values: List of numeric values to analyze
significance: P-value threshold for anomaly detection
Returns:
BenfordResult with analysis results, or None if insufficient data
"""
if len(values) < 10: # Need reasonable sample size
return None
# Get first digits
first_digits = [int(str(abs(float(v))).lstrip('0')[0]) for v in values if v != 0]
if not first_digits:
return None
# Calculate observed distribution
digit_counts = Counter(first_digits)
total = len(first_digits)
observed_dist = {d: digit_counts.get(d, 0) / total for d in range(1, 10)}
# Calculate expected Benford distribution
expected_dist = {d: math.log10(1 + 1/d) for d in range(1, 10)}
# Perform chi-square test
chi_square = 0
for d in range(1, 10):
expected = expected_dist[d] * total
observed = digit_counts.get(d, 0)
chi_square += (observed - expected) ** 2 / expected
# Get p-value (8 degrees of freedom for digits 1-9)
from scipy.stats import chi2
p_value = 1 - chi2.cdf(chi_square, 8)
return BenfordResult(
observed_dist=observed_dist,
expected_dist=expected_dist,
chi_square=chi_square,
p_value=p_value,
anomalous=p_value < significance
)
def analyze_all(self) -> Dict[str, Any]:
"""Run all fraud detection analyses.
Returns:
Dict containing:
- altman_z: Altman Z-Score results
- beneish_m: Beneish M-Score results
- piotroski_f: Piotroski F-Score results
- benford: Benford's Law analysis results
"""
# Get financial values for Benford analysis
values = []
for concept in [
StandardConcept.TOTAL_ASSETS,
StandardConcept.TOTAL_LIABILITIES,
StandardConcept.TOTAL_EQUITY,
StandardConcept.REVENUE,
StandardConcept.NET_INCOME,
StandardConcept.OPERATING_INCOME,
StandardConcept.OPERATING_CASH_FLOW
]:
if hasattr(self.xbrl.statements, 'balance_sheet'):
bs_value = self.altman._get_value(concept)
if bs_value:
values.append(bs_value)
if hasattr(self.xbrl.statements, 'income_statement'):
is_value = self.altman._get_value(concept, "IncomeStatement")
if is_value:
values.append(is_value)
if hasattr(self.xbrl.statements, 'cash_flow'):
cf_value = self.altman._get_value(concept, "CashFlow")
if cf_value:
values.append(cf_value)
return {
'altman_z': self.altman.calculate(),
'beneish_m': self.beneish.calculate(),
'piotroski_f': self.piotroski.calculate(),
'benford': self.analyze_digit_distribution(values)
}

View File

@@ -0,0 +1,411 @@
"""Financial metrics and analysis module.
This module provides various financial metrics and analysis tools including:
- Altman Z-Score for bankruptcy prediction
- Beneish M-Score for earnings manipulation detection
- Piotroski F-Score for financial strength assessment
- Montier C-Score for earnings manipulation detection
"""
from dataclasses import dataclass
from typing import Dict, Optional
from ..standardization import MappingStore, StandardConcept
@dataclass
class MetricResult:
    """Container for metric calculation results with metadata."""
    value: float                    # The computed metric value
    components: Dict[str, float]    # Per-component breakdown of the metric
    interpretation: str             # Human-readable reading of the value
    period: str                     # Label of the reporting period used

    def __repr__(self) -> str:
        return "{:.2f} ({})".format(self.value, self.interpretation)
class FinancialMetrics:
    """Base class for financial metrics calculations.

    Eagerly converts the balance sheet, income statement and cash flow
    statement of an XBRL instance into DataFrames so that subclasses can
    look up standardized concept values via _get_value().
    """

    def __init__(self, xbrl):
        """Initialize with an XBRL instance."""
        self.xbrl = xbrl
        # Cached statement DataFrames (None when the statement is unavailable)
        self._balance_sheet_df = None
        self._income_stmt_df = None
        self._cash_flow_df = None
        # Labels of the first period of each statement (used for reporting)
        self._bs_period = None
        self._is_period = None
        self._cf_period = None
        # Initialize concept mappings (standardized concept -> company concepts)
        self._mapping_store = MappingStore()
        # Initialize dataframes if statements exist
        if self.xbrl.statements.balance_sheet:
            bs = self.xbrl.statements.balance_sheet
            self._balance_sheet_df = bs.to_dataframe()
            # assumes periods[0] is the most recent period -- TODO confirm
            self._bs_period = bs.periods[0].label
        if self.xbrl.statements.income_statement:
            is_ = self.xbrl.statements.income_statement
            self._income_stmt_df = is_.to_dataframe()
            self._is_period = is_.periods[0].label
        if self.xbrl.statements.cash_flow:
            cf = self.xbrl.statements.cash_flow
            self._cash_flow_df = cf.to_dataframe()
            self._cf_period = cf.periods[0].label

    def _get_value(self, label: StandardConcept, statement_type: str = "BalanceSheet", period_offset: int = 0) -> Optional[float]:
        """Safely extract a numeric value using the standardized label from the appropriate statement.

        Args:
            label: The standardized concept to retrieve
            statement_type: Type of financial statement ("BalanceSheet", "IncomeStatement", "CashFlow")
            period_offset: Offset from current period (0 for current, -1 for prior, etc.)

        Returns:
            The numeric value if found, None otherwise
        """
        try:
            # Resolve the standardized concept to company-specific concept names
            concepts = self._mapping_store.get_company_concepts(label)
            if not concepts:
                return None
            # Pick the cached DataFrame for the requested statement type
            df = None
            if statement_type == "BalanceSheet" and self._balance_sheet_df is not None:
                df = self._balance_sheet_df
            elif statement_type == "IncomeStatement" and self._income_stmt_df is not None:
                df = self._income_stmt_df
            elif statement_type == "CashFlow" and self._cash_flow_df is not None:
                df = self._cash_flow_df
            if df is None:
                return None
            # Get all available periods
            periods = df.columns.tolist()
            if not periods:
                return None
            # Get target period based on offset.
            # NOTE(review): a negative offset indexes from the END of the
            # column list (normal Python semantics), so period_offset=-1
            # selects the LAST column -- confirm the column ordering actually
            # puts the prior period there.
            try:
                target_period = periods[period_offset]
            except IndexError:
                return None
            # Try each mapped concept until one exists as a row in the frame
            for concept in concepts:
                try:
                    return df.loc[concept, target_period]
                except KeyError:
                    continue
            return None
        except ValueError:
            # Defensive: any malformed data above is treated as "missing"
            return None
class AltmanZScore(FinancialMetrics):
    """Calculate Altman Z-Score for bankruptcy prediction."""

    def calculate(self) -> Optional[MetricResult]:
        """Calculate Altman Z-Score.

        Z-Score = 1.2X1 + 1.4X2 + 3.3X3 + 0.6X4 + 1.0X5
        where:
        X1 = Working Capital / Total Assets
        X2 = Retained Earnings / Total Assets
        X3 = EBIT / Total Assets
        X4 = Market Value of Equity / Total Liabilities
        X5 = Sales / Total Assets

        Returns:
            MetricResult with the Z-Score and its components, or None when
            required inputs are unavailable (or a denominator is zero).
        """
        # Get required values
        working_capital = self._get_working_capital()
        total_assets = self._get_value(StandardConcept.TOTAL_ASSETS)
        retained_earnings = self._get_value(StandardConcept.RETAINED_EARNINGS)
        ebit = self._get_value(StandardConcept.OPERATING_INCOME, "IncomeStatement")
        market_value = self._get_value(StandardConcept.TOTAL_EQUITY)  # Using book value as proxy
        total_liabilities = self._get_value(StandardConcept.TOTAL_LIABILITIES)
        revenue = self._get_value(StandardConcept.REVENUE, "IncomeStatement")

        # Check availability with `is None` rather than truthiness so that a
        # legitimate zero value (e.g. zero retained earnings or break-even
        # EBIT) does not abort the calculation.
        inputs = (working_capital, total_assets, retained_earnings, ebit,
                  market_value, total_liabilities, revenue)
        if any(v is None for v in inputs):
            return None

        working_capital = float(working_capital)
        total_assets = float(total_assets)
        retained_earnings = float(retained_earnings)
        ebit = float(ebit)
        market_value = float(market_value)
        total_liabilities = float(total_liabilities)
        revenue = float(revenue)

        # The ratios are undefined when either denominator is zero
        if total_assets == 0 or total_liabilities == 0:
            return None

        # Calculate ratios
        x1 = working_capital / total_assets
        x2 = retained_earnings / total_assets
        x3 = ebit / total_assets
        x4 = market_value / total_liabilities
        x5 = revenue / total_assets

        # Calculate Z-Score
        z_score = 1.2*x1 + 1.4*x2 + 3.3*x3 + 0.6*x4 + 1.0*x5

        # Interpret score (standard Altman thresholds)
        if z_score > 2.99:
            interpretation = "Safe Zone: Low probability of financial distress"
        elif z_score > 1.81:
            interpretation = "Grey Zone: Moderate risk of financial distress"
        else:
            interpretation = "Distress Zone: High risk of financial distress"

        return MetricResult(
            value=z_score,
            components={
                'working_capital_to_assets': x1,
                'retained_earnings_to_assets': x2,
                'ebit_to_assets': x3,
                'equity_to_liabilities': x4,
                'sales_to_assets': x5
            },
            interpretation=interpretation,
            period=self._bs_period if self._bs_period is not None else ""
        )

    def _get_working_capital(self) -> Optional[float]:
        """Calculate working capital (current assets - current liabilities)."""
        current_assets = self._get_value(StandardConcept.TOTAL_CURRENT_ASSETS)
        current_liab = self._get_value(StandardConcept.TOTAL_CURRENT_LIABILITIES)
        if current_assets is None or current_liab is None:
            return None
        return current_assets - current_liab
class BeneishMScore(FinancialMetrics):
    """Calculate Beneish M-Score for earnings manipulation detection."""

    def calculate(self) -> Optional[MetricResult]:
        """Calculate Beneish M-Score.

        M-Score = -4.84 + 0.92*DSRI + 0.528*GMI + 0.404*AQI + 0.892*SGI + 0.115*DEPI
                  - 0.172*SGAI + 4.679*TATA - 0.327*LVGI

        where:
        DSRI = Days Sales in Receivables Index
        GMI = Gross Margin Index
        AQI = Asset Quality Index
        SGI = Sales Growth Index
        DEPI = Depreciation Index
        SGAI = SG&A Expense Index
        TATA = Total Accruals to Total Assets
        LVGI = Leverage Index

        A score greater than -2.22 indicates a high probability of earnings manipulation.

        Returns:
            MetricResult with the M-Score and components, or None when any
            required current-year or prior-year input is missing (or zero).
        """
        # Get current year values
        receivables = self._get_value(StandardConcept.ACCOUNTS_RECEIVABLE)
        revenue = self._get_value(StandardConcept.REVENUE, "IncomeStatement")
        gross_profit = self._get_value(StandardConcept.GROSS_PROFIT, "IncomeStatement")
        total_assets = self._get_value(StandardConcept.TOTAL_ASSETS)
        ppe = self._get_value(StandardConcept.PROPERTY_PLANT_EQUIPMENT)
        depreciation = self._get_value(StandardConcept.DEPRECIATION_AMORTIZATION, "IncomeStatement")
        sga = self._get_value(StandardConcept.SGA_EXPENSE, "IncomeStatement")
        total_liabilities = self._get_value(StandardConcept.TOTAL_LIABILITIES)
        # Get prior year values (assuming they're available via period_offset=-1)
        prior_receivables = self._get_value(StandardConcept.ACCOUNTS_RECEIVABLE, period_offset=-1)
        prior_revenue = self._get_value(StandardConcept.REVENUE, "IncomeStatement", period_offset=-1)
        prior_gross_profit = self._get_value(StandardConcept.GROSS_PROFIT, "IncomeStatement", period_offset=-1)
        prior_total_assets = self._get_value(StandardConcept.TOTAL_ASSETS, period_offset=-1)
        prior_ppe = self._get_value(StandardConcept.PROPERTY_PLANT_EQUIPMENT, period_offset=-1)
        prior_depreciation = self._get_value(StandardConcept.DEPRECIATION_AMORTIZATION, "IncomeStatement", period_offset=-1)
        prior_sga = self._get_value(StandardConcept.SGA_EXPENSE, "IncomeStatement", period_offset=-1)
        prior_total_liabilities = self._get_value(StandardConcept.TOTAL_LIABILITIES, period_offset=-1)
        # Check if we have all required values.
        # NOTE(review): all() rejects zero values as well as None.  That is
        # overly strict for legitimate zeros, but it also (deliberately or
        # not) shields every ratio below from a zero denominator -- both
        # behaviors must be considered before changing this check.
        if not all([receivables, revenue, gross_profit, total_assets, ppe, depreciation, sga, total_liabilities,
                    prior_receivables, prior_revenue, prior_gross_profit, prior_total_assets, prior_ppe,
                    prior_depreciation, prior_sga, prior_total_liabilities]):
            return None
        # Cast to float to help type checker
        receivables = float(receivables)  # type: ignore
        revenue = float(revenue)  # type: ignore
        gross_profit = float(gross_profit)  # type: ignore
        total_assets = float(total_assets)  # type: ignore
        ppe = float(ppe)  # type: ignore
        depreciation = float(depreciation)  # type: ignore
        sga = float(sga)  # type: ignore
        total_liabilities = float(total_liabilities)  # type: ignore
        prior_receivables = float(prior_receivables)  # type: ignore
        prior_revenue = float(prior_revenue)  # type: ignore
        prior_gross_profit = float(prior_gross_profit)  # type: ignore
        prior_total_assets = float(prior_total_assets)  # type: ignore
        prior_ppe = float(prior_ppe)  # type: ignore
        prior_depreciation = float(prior_depreciation)  # type: ignore
        prior_sga = float(prior_sga)  # type: ignore
        prior_total_liabilities = float(prior_total_liabilities)  # type: ignore
        # Calculate components (each is a current-to-prior index of a ratio)
        dsri = (receivables / revenue) / (prior_receivables / prior_revenue)
        gmi = (prior_gross_profit / prior_revenue) / (gross_profit / revenue)
        aqi = ((total_assets - ppe) / total_assets) / ((prior_total_assets - prior_ppe) / prior_total_assets)
        sgi = revenue / prior_revenue
        depi = (prior_depreciation / prior_ppe) / (depreciation / ppe)
        sgai = (sga / revenue) / (prior_sga / prior_revenue)
        # NOTE(review): this approximates total accruals by the change in
        # total assets; the canonical TATA uses (income - operating cash
        # flow) / total assets -- confirm this simplification is intended.
        tata = (total_assets - prior_total_assets) / total_assets
        lvgi = (total_liabilities / total_assets) / (prior_total_liabilities / prior_total_assets)
        # Calculate M-Score
        m_score = -4.84 + 0.92*dsri + 0.528*gmi + 0.404*aqi + 0.892*sgi + \
                  0.115*depi - 0.172*sgai + 4.679*tata - 0.327*lvgi
        # Interpret score against the standard -2.22 threshold
        if m_score > -2.22:
            interpretation = "High probability of earnings manipulation"
        else:
            interpretation = "Low probability of earnings manipulation"
        return MetricResult(
            value=m_score,
            components={
                'dsri': dsri,
                'gmi': gmi,
                'aqi': aqi,
                'sgi': sgi,
                'depi': depi,
                'sgai': sgai,
                'tata': tata,
                'lvgi': lvgi
            },
            interpretation=interpretation,
            period=self._bs_period if self._bs_period is not None else ""
        )
class PiotroskiFScore(FinancialMetrics):
    """Calculate Piotroski F-Score for financial strength assessment."""

    def calculate(self) -> Optional[MetricResult]:
        """Calculate Piotroski F-Score.

        The F-Score is the sum of 9 binary signals (0 or 1) across three categories:

        Profitability:
        1. Return on Assets (ROA) > 0
        2. Operating Cash Flow > 0
        3. ROA(t) > ROA(t-1)
        4. Cash flow from operations > ROA

        Leverage, Liquidity and Source of Funds:
        5. Long-term debt ratio(t) < Long-term debt ratio(t-1)
        6. Current ratio(t) > Current ratio(t-1)
        7. No new shares issued

        Operating Efficiency:
        8. Gross margin(t) > Gross margin(t-1)
        9. Asset turnover(t) > Asset turnover(t-1)

        A score of 8-9 indicates a strong company, while 0-2 indicates a weak company.
        Signals whose inputs are unavailable are simply skipped (not scored).

        Returns:
            MetricResult with the total score and per-signal components, or
            None when the minimum required inputs are unavailable.
        """
        scores: Dict[str, float] = {}
        total_score = 0

        # Get current year values
        net_income = self._get_value(StandardConcept.NET_INCOME, "IncomeStatement")
        total_assets = self._get_value(StandardConcept.TOTAL_ASSETS)
        operating_cash_flow = self._get_value(StandardConcept.OPERATING_CASH_FLOW, "CashFlow")
        long_term_debt = self._get_value(StandardConcept.LONG_TERM_DEBT)
        current_assets = self._get_value(StandardConcept.TOTAL_CURRENT_ASSETS)
        current_liab = self._get_value(StandardConcept.TOTAL_CURRENT_LIABILITIES)
        shares_outstanding = self._get_value(StandardConcept.SHARES_OUTSTANDING)
        revenue = self._get_value(StandardConcept.REVENUE, "IncomeStatement")
        gross_profit = self._get_value(StandardConcept.GROSS_PROFIT, "IncomeStatement")

        # Get prior year values
        prior_net_income = self._get_value(StandardConcept.NET_INCOME, "IncomeStatement", -1)
        prior_total_assets = self._get_value(StandardConcept.TOTAL_ASSETS, "BalanceSheet", -1)
        prior_long_term_debt = self._get_value(StandardConcept.LONG_TERM_DEBT, "BalanceSheet", -1)
        prior_current_assets = self._get_value(StandardConcept.TOTAL_CURRENT_ASSETS, "BalanceSheet", -1)
        prior_current_liab = self._get_value(StandardConcept.TOTAL_CURRENT_LIABILITIES, "BalanceSheet", -1)
        prior_shares_outstanding = self._get_value(StandardConcept.SHARES_OUTSTANDING, "BalanceSheet", -1)
        prior_revenue = self._get_value(StandardConcept.REVENUE, "IncomeStatement", -1)
        prior_gross_profit = self._get_value(StandardConcept.GROSS_PROFIT, "IncomeStatement", -1)

        # Check minimum required values with `is None` (not truthiness) so a
        # legitimate zero (e.g. break-even net income) doesn't abort scoring.
        if net_income is None or total_assets is None or operating_cash_flow is None:
            return None

        # Cast to float
        net_income = float(net_income)
        total_assets = float(total_assets)
        operating_cash_flow = float(operating_cash_flow)

        # ROA and asset turnover are undefined without assets
        if total_assets == 0:
            return None

        # 1. ROA > 0
        roa = net_income / total_assets
        scores['roa_positive'] = 1 if roa > 0 else 0
        total_score += scores['roa_positive']

        # 2. Operating Cash Flow > 0
        scores['cfoa_positive'] = 1 if operating_cash_flow > 0 else 0
        total_score += scores['cfoa_positive']

        # 3. ROA(t) > ROA(t-1) -- guard the prior-year denominator
        if (prior_net_income is not None and prior_total_assets is not None
                and float(prior_total_assets) != 0):
            prior_roa = float(prior_net_income) / float(prior_total_assets)
            scores['roa_higher'] = 1 if roa > prior_roa else 0
            total_score += scores['roa_higher']

        # 4. Cash flow from operations > ROA
        scores['quality_earnings'] = 1 if operating_cash_flow / total_assets > roa else 0
        total_score += scores['quality_earnings']

        # 5. Long-term debt ratio.  The original guard omitted
        # prior_total_assets, so a missing prior balance sheet raised a
        # TypeError on float(None); it is now checked explicitly.
        if (long_term_debt is not None and prior_long_term_debt is not None
                and prior_total_assets is not None and float(prior_total_assets) != 0):
            ltdr = float(long_term_debt) / total_assets
            prior_ltdr = float(prior_long_term_debt) / float(prior_total_assets)
            scores['leverage_lower'] = 1 if ltdr < prior_ltdr else 0
            total_score += scores['leverage_lower']

        # 6. Current ratio -- liabilities are the denominators, so they must
        # be non-zero; the asset numerators may legitimately be zero.
        if (current_assets is not None and prior_current_assets is not None
                and current_liab is not None and float(current_liab) != 0
                and prior_current_liab is not None and float(prior_current_liab) != 0):
            curr_ratio = float(current_assets) / float(current_liab)
            prior_curr_ratio = float(prior_current_assets) / float(prior_current_liab)
            scores['liquidity_higher'] = 1 if curr_ratio > prior_curr_ratio else 0
            total_score += scores['liquidity_higher']

        # 7. No new shares issued
        if shares_outstanding is not None and prior_shares_outstanding is not None:
            scores['no_dilution'] = 1 if float(shares_outstanding) <= float(prior_shares_outstanding) else 0
            total_score += scores['no_dilution']

        # 8. Gross margin -- revenues are the denominators
        if (gross_profit is not None and prior_gross_profit is not None
                and revenue is not None and float(revenue) != 0
                and prior_revenue is not None and float(prior_revenue) != 0):
            margin = float(gross_profit) / float(revenue)
            prior_margin = float(prior_gross_profit) / float(prior_revenue)
            scores['margin_higher'] = 1 if margin > prior_margin else 0
            total_score += scores['margin_higher']

        # 9. Asset turnover.  The original guard checked only revenue and
        # prior_revenue but then divided by prior_total_assets, raising a
        # TypeError when it was missing; it is now guarded explicitly.
        if (revenue is not None and prior_revenue is not None
                and prior_total_assets is not None and float(prior_total_assets) != 0):
            turnover = float(revenue) / total_assets
            prior_turnover = float(prior_revenue) / float(prior_total_assets)
            scores['turnover_higher'] = 1 if turnover > prior_turnover else 0
            total_score += scores['turnover_higher']

        # Interpret score
        if total_score >= 8:
            interpretation = "Strong financial position"
        elif total_score >= 5:
            interpretation = "Moderate financial position"
        else:
            interpretation = "Weak financial position"

        return MetricResult(
            value=total_score,
            components=scores,
            interpretation=interpretation,
            period=self._bs_period if self._bs_period is not None else ""
        )

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,477 @@
"""
Core utilities for XBRL processing.
This module provides common functions used throughout the XBRL parser.
"""
from datetime import datetime
from typing import Any, Dict, List, Optional, Tuple, Union
# XBRL label linkbase role URIs.  A concept may carry several labels;
# these roles identify which flavor of label a resource is.
STANDARD_LABEL = "http://www.xbrl.org/2003/role/label"
TERSE_LABEL = "http://www.xbrl.org/2003/role/terseLabel"
PERIOD_START_LABEL = "http://www.xbrl.org/2003/role/periodStartLabel"
PERIOD_END_LABEL = "http://www.xbrl.org/2003/role/periodEndLabel"
TOTAL_LABEL = "http://www.xbrl.org/2003/role/totalLabel"

# XML namespace prefixes used when parsing XBRL documents
NAMESPACES = {
    "xlink": "http://www.w3.org/1999/xlink",
    "xsd": "http://www.w3.org/2001/XMLSchema",
    "xbrli": "http://www.xbrl.org/2003/instance",
    "link": "http://www.xbrl.org/2003/linkbase"
}
def parse_date(date_str: str) -> datetime.date:
    """
    Parse an XBRL date string to a date object.

    Args:
        date_str: Date string in YYYY-MM-DD format

    Returns:
        datetime.date object

    Raises:
        ValueError: If the string is empty, malformed, or names an
            impossible calendar date (e.g. 2023-09-31 or Feb 29 in a
            non-leap year)
    """
    if not date_str:
        raise ValueError("Empty date string provided")
    try:
        # strptime already rejects impossible calendar dates (day out of
        # range for the month, Feb 29 outside leap years, etc.), so no
        # additional manual day-of-month validation is needed here.
        return datetime.strptime(date_str, '%Y-%m-%d').date()
    except (ValueError, TypeError) as e:
        # Provide more specific error message
        raise ValueError(f"Invalid date format or value: {date_str} - {str(e)}") from e
def format_date(date_obj: datetime.date) -> str:
    """
    Format a date object to a human-readable string.

    Args:
        date_obj: datetime.date object

    Returns:
        Formatted date string (e.g., "Sep 30, 2023")
    """
    # Abbreviated month name plus the day without a leading zero --
    # date_obj.day is an int, so no zero-stripping is needed.
    return f"{date_obj.strftime('%b')} {date_obj.day}, {date_obj.year}"
def extract_element_id(href: str) -> str:
    """
    Extract the element ID (fragment) from an XLink href.

    When the href has no '#', the whole string is returned unchanged.

    Args:
        href: XLink href attribute value

    Returns:
        Element ID
    """
    _, separator, fragment = href.rpartition('#')
    return fragment if separator else href
def classify_duration(days: int) -> str:
    """
    Classify a duration in days as quarterly, semi-annual, annual, etc.

    Each named bucket allows a small tolerance around the nominal length
    (e.g. 85-95 days for a quarter); anything else is a generic "Period".

    Args:
        days: Duration in days

    Returns:
        Description of the duration (e.g., "Quarterly", "Annual")
    """
    buckets = (
        (85, 95, "Quarterly"),
        (175, 185, "Semi-Annual"),
        (265, 285, "Nine Months"),
        (350, 380, "Annual"),
    )
    for low, high, label in buckets:
        if low <= days <= high:
            return label
    return "Period"
# Keywords whose presence in an item label marks it as a share count,
# per-share amount, or ratio -- values that must not influence the
# monetary scale of a statement.  (Previously this list was duplicated
# inline in both passes of determine_dominant_scale.)
_NON_MONETARY_LABEL_KEYWORDS = (
    'shares', 'share', 'stock', 'eps', 'earnings per share',
    'weighted average', 'number of', 'per common share', 'per share',
    'per basic', 'per diluted', 'outstanding', 'issued',
    'ratio', 'margin', 'percentage', 'rate', 'per cent'
)


def _is_scalable_monetary_item(item: Dict[str, Any]) -> bool:
    """Return True if a statement item should be considered for scale detection."""
    if not item.get('has_values', False) or not item.get('values'):
        return False
    label_lower = item['label'].lower()
    return not any(keyword in label_lower for keyword in _NON_MONETARY_LABEL_KEYWORDS)


def determine_dominant_scale(statement_data: List[Dict[str, Any]],
                             periods_to_display: List[Tuple[str, str]]) -> int:
    """
    Determine the dominant scale (thousands, millions, billions) for a statement.

    This looks at all monetary values in the statement and determines the most appropriate
    scale to use for the "In millions/billions/thousands" note.

    Args:
        statement_data: The statement data with items and values
        periods_to_display: List of period keys and labels to consider

    Returns:
        int: The dominant scale (-3 for thousands, -6 for millions, -9 for billions, 0 for no scaling)
    """
    # Pass 1: prefer the XBRL 'decimals' attributes when present
    all_decimals = []
    for item in statement_data:
        if not _is_scalable_monetary_item(item):
            continue
        for period_key, _ in periods_to_display:
            if period_key in item.get('decimals', {}):
                decimals = item['decimals'][period_key]
                if isinstance(decimals, int):
                    all_decimals.append(decimals)

    if all_decimals:
        # Bucket each decimals value into a scale and count occurrences
        scale_counts = {
            -9: 0,  # billions
            -6: 0,  # millions
            -3: 0,  # thousands
            0: 0    # no scaling
        }
        for decimals in all_decimals:
            if decimals <= -9:
                scale_counts[-9] += 1
            elif decimals <= -6:
                scale_counts[-6] += 1
            elif decimals <= -3:
                scale_counts[-3] += 1
            else:
                scale_counts[0] += 1
        # Find the most common scale, preferring any scaling over none
        most_common_scale = 0
        max_count = 0
        for scale, count in scale_counts.items():
            if scale != 0 and count > max_count:
                max_count = count
                most_common_scale = scale
        return most_common_scale

    # Pass 2: no decimals information -- infer from the magnitude of values
    all_values = []
    for item in statement_data:
        if not _is_scalable_monetary_item(item):
            continue
        for period_key, _ in periods_to_display:
            value = item['values'].get(period_key)
            if isinstance(value, (int, float)) and value != 0:
                all_values.append(abs(value))

    if all_values:
        # Use the median so outliers don't dominate the choice of scale
        all_values.sort()
        median_value = all_values[len(all_values) // 2]
        if median_value >= 1_000_000_000:
            return -9  # billions
        elif median_value >= 1_000_000:
            return -6  # millions
        elif median_value >= 1_000:
            return -3  # thousands

    # Default to millions if we couldn't determine a scale
    return -6
def get_currency_symbol(unit_measure: Optional[str]) -> str:
    """
    Get the appropriate currency symbol from a unit measure string.

    Args:
        unit_measure: Unit measure string (e.g., 'iso4217:USD', 'iso4217:EUR')

    Returns:
        Currency symbol (e.g., '$', '€', '£'). Defaults to '$' when the unit
        is missing or not recognized.
    """
    if not unit_measure:
        return "$"  # Default to USD
    # Map common ISO 4217 currency codes to symbols.
    # NOTE: several entries had been reduced to empty strings by an encoding
    # (mojibake) problem; they are restored here to the standard Unicode
    # currency signs so monetary values are never rendered with no symbol.
    currency_symbols = {
        'iso4217:USD': '$',
        'iso4217:EUR': '€',
        'iso4217:GBP': '£',
        'iso4217:JPY': '¥',
        'iso4217:CAD': 'C$',
        'iso4217:AUD': 'A$',
        'iso4217:CHF': 'CHF',
        'iso4217:CNY': '¥',
        'iso4217:INR': '₹',
        'iso4217:KRW': '₩',
        'iso4217:BRL': 'R$',
        'iso4217:MXN': 'MX$',
        'iso4217:SEK': 'kr',
        'iso4217:NOK': 'kr',
        'iso4217:DKK': 'kr',
        'iso4217:PLN': 'zł',
        'iso4217:CZK': 'Kč',
        'iso4217:HUF': 'Ft',
        'iso4217:RUB': '₽',
        'iso4217:ZAR': 'R',
        'iso4217:SGD': 'S$',
        'iso4217:HKD': 'HK$',
        'iso4217:TWD': 'NT$',
        'iso4217:THB': '฿',
        'iso4217:MYR': 'RM',
        'iso4217:IDR': 'Rp',
        'iso4217:PHP': '₱',
        'iso4217:VND': '₫',
        'iso4217:ILS': '₪',
        'iso4217:TRY': '₺',
        'iso4217:AED': 'AED',
        'iso4217:SAR': 'SR',
        'iso4217:EGP': 'E£',
        'iso4217:NGN': '₦',
    }
    return currency_symbols.get(unit_measure, '$')  # Default to USD if unknown
def format_value(value: Union[int, float, str], is_monetary: bool, scale: int,
                 decimals: Optional[int] = None, currency_symbol: Optional[str] = None) -> str:
    """
    Format a value with appropriate scaling and formatting.

    Args:
        value: The value to format
        is_monetary: Whether the value is monetary
        scale: The scale to apply (-3 for thousands, -6 for millions, -9 for billions)
        decimals: XBRL decimals attribute value (optional)
        currency_symbol: Currency symbol to use for monetary values (default: '$')

    Returns:
        Formatted value string: zeros render as an empty cell, non-numeric
        values pass through as-is, and negatives are wrapped in parentheses.
    """
    # Zero (numeric or anything comparing equal to 0) renders as a blank cell;
    # any other non-numeric value is passed through unchanged.
    if not isinstance(value, (int, float)) or value == 0:
        return "" if value == 0 else str(value)

    # Reduce the value to the display scale (billions/millions/thousands).
    if scale <= -9:
        shown = value / 1_000_000_000
    elif scale <= -6:
        shown = value / 1_000_000
    elif scale <= -3:
        shown = value / 1_000
    else:
        shown = value

    # Work out how many decimal places to display.
    if isinstance(decimals, int):
        if decimals >= 0:
            # Positive precision: cap the display at 2 decimal places.
            places = min(2, decimals)
        elif scale <= -9:
            # Negative decimals shift with the scaling factor (billions).
            places = min(2, max(0, decimals + 9))
        elif scale <= -6:
            places = min(2, max(0, decimals + 6))
        elif scale <= -3:
            places = min(2, max(0, decimals + 3))
        else:
            # Unscaled: mirror the magnitude of the (negative) decimals
            # attribute, e.g. decimals=-2 -> two decimal places.
            places = max(0, -decimals)
    elif is_monetary:
        places = 0  # Whole units are standard for financial statements
    elif abs(round(value) - value) < 0.001:
        places = 0  # Effectively a whole number
    else:
        places = 2  # Genuinely fractional non-monetary value

    spec = f",.{places}f"
    # Monetary values get a currency prefix; negatives use accounting-style
    # parentheses in both cases.
    prefix = (currency_symbol if currency_symbol is not None else '$') if is_monetary else ''
    if value < 0:
        return f"{prefix}({abs(shown):{spec}})"
    return f"{prefix}{shown:{spec}}"
def find_previous_fiscal_year_period(instant_periods: List[Dict[str, Any]],
                                     prev_fiscal_year: int,
                                     fiscal_month: int,
                                     fiscal_day: int) -> Optional[Dict[str, Any]]:
    """
    Locate the instant period corresponding to the prior fiscal year end.

    Args:
        instant_periods: Instant periods sorted by date (most recent first);
            index 0 is assumed to be the current period and is skipped.
        prev_fiscal_year: Previous fiscal year to find.
        fiscal_month: Fiscal year end month.
        fiscal_day: Fiscal year end day.

    Returns:
        The matching period dict, or None when no candidate is close enough.
    """
    for candidate in instant_periods[1:]:  # index 0 is the current period
        try:
            candidate_date = parse_date(candidate['date'])
            # Require the exact prior year and month; tolerate up to a week
            # of day drift (e.g. 52/53-week fiscal calendars).
            if (candidate_date.year == prev_fiscal_year
                    and candidate_date.month == fiscal_month
                    and abs(candidate_date.day - fiscal_day) <= 7):
                return candidate
        except (ValueError, TypeError):
            continue
    return None
def get_unit_display_name(unit_ref: Optional[str]) -> Optional[str]:
    """
    Convert unit_ref to a human-readable unit name.

    Maps XBRL unit references to standard display names:
    - 'U-Monetary' / 'iso4217:USD' -> 'usd'
    - 'U-Shares' / 'shares' -> 'shares'
    - 'U-USD-per-shares' -> 'usdPerShare'
    - anything else -> a simplified lower-cased form of the reference

    Args:
        unit_ref: XBRL unit reference string

    Returns:
        Human-readable unit name, or None if unit_ref is falsy

    Examples:
        >>> get_unit_display_name('U-Monetary')
        'usd'
        >>> get_unit_display_name('U-Shares')
        'shares'
        >>> get_unit_display_name('U-USD-per-shares')
        'usdPerShare'
    """
    if not unit_ref:
        return None

    ref = unit_ref.lower()

    # Per-share ratios must be detected before the plain share/currency
    # checks, otherwise 'U-USD-per-shares' would classify as plain 'shares'.
    if 'per' in ref and 'share' in ref:
        if 'usd' in ref or 'monetary' in ref:
            return 'usdPerShare'
        if 'eur' in ref:
            return 'eurPerShare'
        if 'gbp' in ref:
            return 'gbpPerShare'
        return 'perShare'

    # Plain share counts.
    if 'share' in ref:
        return 'shares'

    # Monetary units, USD first, then other common ISO currencies.
    if 'monetary' in ref or 'iso4217:usd' in ref or ref == 'usd':
        return 'usd'
    for currency in ('eur', 'gbp', 'jpy'):
        if currency in ref or f'iso4217:{currency}' in ref:
            return currency

    # Dimensionless numbers / ratios.
    if 'pure' in ref or 'number' in ref:
        return 'number'

    # Fallback: strip common prefixes and normalize to lower case.
    return unit_ref.replace('U-', '').replace('iso4217:', '').lower()
def is_point_in_time(period_type: Optional[str]) -> Optional[bool]:
    """
    Tell whether an XBRL period type denotes a point-in-time value.

    Args:
        period_type: XBRL period type ('instant' or 'duration'), or None

    Returns:
        True for 'instant', False for any other non-None value
        (e.g. 'duration'), and None when period_type is None

    Examples:
        >>> is_point_in_time('instant')
        True
        >>> is_point_in_time('duration')
        False
        >>> is_point_in_time(None)
    """
    return None if period_type is None else period_type == 'instant'

View File

@@ -0,0 +1,876 @@
"""
Current Period API - Convenient access to current period financial data.
This module provides the CurrentPeriodView class that offers simplified access
to the most recent period's financial data without comparative information,
addressing GitHub issue #425.
Key features:
- Automatic detection of the current (most recent) period
- Direct access to balance sheet, income statement, and cash flow data
- Support for raw XBRL concept names (unprocessed)
- Notes and disclosures access
- Beginner-friendly API design
"""
from datetime import date, datetime
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union
import pandas as pd
from edgar.core import log
from edgar.richtools import repr_rich
from edgar.xbrl.exceptions import StatementNotFound
if TYPE_CHECKING:
from edgar.xbrl.statements import Statement
class CurrentPeriodView:
    """
    Convenient access to current period financial data.

    This class provides simplified access to the most recent period's
    financial data without comparative information. It automatically
    detects the current period and provides easy access to key statements.

    Example usage:
        >>> xbrl = filing.xbrl()
        >>> current = xbrl.current_period
        >>> balance_sheet = current.balance_sheet()
        >>> income_statement = current.income_statement(raw_concepts=True)
    """

    def __init__(self, xbrl):
        """
        Initialize CurrentPeriodView with an XBRL object.

        Args:
            xbrl: XBRL object containing parsed financial data
        """
        self.xbrl = xbrl
        # Lazily-computed caches; both are filled by _detect_current_period().
        self._current_period_key = None
        self._current_period_label = None

    @property
    def period_key(self) -> str:
        """
        Get the current period key (most recent period).

        The current period is determined by:
        1. Document period end date if available
        2. Most recent period in reporting periods
        3. Fallback to any available period

        Returns:
            Period key string (e.g., "instant_2024-12-31" or "duration_2024-01-01_2024-12-31")
        """
        if self._current_period_key is None:
            self._current_period_key = self._detect_current_period()
        return self._current_period_key

    @property
    def period_label(self) -> str:
        """
        Get the human-readable label for the current period.

        Returns:
            Human-readable period label (e.g., "December 31, 2024" or "Year Ended December 31, 2024")
        """
        if self._current_period_label is None:
            self._detect_current_period()  # This sets both key and label
        return self._current_period_label or self.period_key

    def _detect_current_period(self) -> str:
        """
        Detect the current (most recent) period from available data.

        Strategy:
        1. Use document period end date to find matching instant period
        2. If no instant match, find most recent duration period ending on document period end
        3. Fall back to most recent period by end date
        4. Final fallback to first available period

        Returns:
            Period key for the current period
        """
        if not self.xbrl.reporting_periods:
            log.warning("No reporting periods found in XBRL data")
            return ""
        # Try to use document period end date if available
        document_period_end = None
        if hasattr(self.xbrl, 'period_of_report') and self.xbrl.period_of_report:
            try:
                if isinstance(self.xbrl.period_of_report, str):
                    document_period_end = datetime.strptime(self.xbrl.period_of_report, '%Y-%m-%d').date()
                elif isinstance(self.xbrl.period_of_report, (date, datetime)):
                    document_period_end = self.xbrl.period_of_report
                    if isinstance(document_period_end, datetime):
                        document_period_end = document_period_end.date()
            except (ValueError, TypeError):
                log.debug(f"Could not parse document period end date: {self.xbrl.period_of_report}")
        # Sort periods by end date (most recent first)
        periods_by_date = []
        for period in self.xbrl.reporting_periods:
            period_key = period['key']
            period_label = period.get('label', period_key)
            end_date = None
            try:
                if period_key.startswith('instant_'):
                    # Format: "instant_2024-12-31"
                    date_str = period_key.split('_', 1)[1]
                    end_date = datetime.strptime(date_str, '%Y-%m-%d').date()
                elif period_key.startswith('duration_'):
                    # Format: "duration_2024-01-01_2024-12-31"
                    parts = period_key.split('_')
                    if len(parts) >= 3:
                        date_str = parts[2]  # End date
                        end_date = datetime.strptime(date_str, '%Y-%m-%d').date()
                if end_date:
                    periods_by_date.append((end_date, period_key, period_label))
            except (ValueError, IndexError):
                log.debug(f"Could not parse period key: {period_key}")
                continue
        if not periods_by_date:
            # Fallback to first available period if no dates could be parsed
            first_period = self.xbrl.reporting_periods[0]
            self._current_period_key = first_period['key']
            self._current_period_label = first_period.get('label', first_period['key'])
            log.debug(f"Using fallback period: {self._current_period_key}")
            return self._current_period_key
        # Sort by date (most recent first)
        periods_by_date.sort(key=lambda x: x[0], reverse=True)
        # Strategy 1: If we have document period end, look for exact matches.
        # Prefer instant periods over duration periods when both match document end date.
        if document_period_end:
            instant_match = None
            duration_match = None
            for end_date, period_key, period_label in periods_by_date:
                if end_date == document_period_end:
                    if period_key.startswith('instant_'):
                        instant_match = (period_key, period_label)
                    elif period_key.startswith('duration_'):
                        duration_match = (period_key, period_label)
            # Prefer instant match if available
            if instant_match:
                self._current_period_key = instant_match[0]
                self._current_period_label = instant_match[1]
                log.debug(f"Found instant period matching document end date: {instant_match[0]}")
                return self._current_period_key
            elif duration_match:
                self._current_period_key = duration_match[0]
                self._current_period_label = duration_match[1]
                log.debug(f"Found duration period matching document end date: {duration_match[0]}")
                return self._current_period_key
        # Strategy 2: Use most recent period
        most_recent = periods_by_date[0]
        self._current_period_key = most_recent[1]
        self._current_period_label = most_recent[2]
        log.debug(f"Selected most recent period: {self._current_period_key} ({self._current_period_label})")
        return self._current_period_key

    def _get_appropriate_period_for_statement(self, statement_type: str) -> str:
        """
        Get the appropriate period type for the given statement type.

        Balance sheet items are point-in-time (instant periods).
        Income statement and cash flow items represent activities over time (duration periods).

        Args:
            statement_type: Type of statement ('BalanceSheet', 'IncomeStatement', etc.)

        Returns:
            Period key appropriate for the statement type
        """
        # Statements that use instant periods (point in time)
        instant_statements = {
            'BalanceSheet',
            'StatementOfEquity',
            'StatementOfFinancialPosition'
        }
        # Statements that use duration periods (period of time)
        duration_statements = {
            'IncomeStatement',
            'CashFlowStatement',
            'ComprehensiveIncome',
            'StatementOfOperations',
            'StatementOfCashFlows'
        }
        if statement_type in instant_statements:
            # Use the current instant period
            return self.period_key
        elif statement_type in duration_statements:
            # Find the most recent duration period with the same end date
            if not self.xbrl.reporting_periods:
                return self.period_key  # Fallback to current period
            # Get the end date from the current period (which might be instant)
            current_end_date = None
            current_period_key = self.period_key
            if current_period_key.startswith('instant_'):
                # Extract date from instant period
                date_str = current_period_key.split('_', 1)[1]
                try:
                    current_end_date = datetime.strptime(date_str, '%Y-%m-%d').date()
                except (ValueError, IndexError):
                    return self.period_key  # Fallback
            elif current_period_key.startswith('duration_'):
                # Extract end date from duration period
                parts = current_period_key.split('_')
                if len(parts) >= 3:
                    try:
                        current_end_date = datetime.strptime(parts[2], '%Y-%m-%d').date()
                    except (ValueError, IndexError):
                        return self.period_key  # Fallback
            if current_end_date:
                # Look for a duration period ending on the same date.
                # Prefer annual periods, then quarterly, then other durations.
                matching_periods = []
                for period in self.xbrl.reporting_periods:
                    period_key = period['key']
                    if period_key.startswith('duration_'):
                        parts = period_key.split('_')
                        if len(parts) >= 3:
                            try:
                                end_date = datetime.strptime(parts[2], '%Y-%m-%d').date()
                                if end_date == current_end_date:
                                    period_type = period.get('period_type', '')
                                    priority = 1 if period_type == 'Annual' else (2 if period_type == 'Quarterly' else 3)
                                    matching_periods.append((priority, period_key, period.get('label', period_key)))
                            except (ValueError, IndexError):
                                continue
                if matching_periods:
                    # Sort by priority (1=Annual, 2=Quarterly, 3=Other) and return the best match
                    matching_periods.sort(key=lambda x: x[0])
                    selected_period = matching_periods[0][1]
                    log.debug(f"Selected duration period for {statement_type}: {selected_period}")
                    return selected_period
            # Fallback: use current period even if it's not ideal
            return self.period_key
        else:
            # Unknown statement type, use current period
            log.debug(f"Unknown statement type {statement_type}, using current period: {self.period_key}")
            return self.period_key

    def balance_sheet(self, raw_concepts: bool = False, as_statement: bool = True) -> Union[pd.DataFrame, 'Statement']:
        """
        Get current period balance sheet data.

        Args:
            raw_concepts: If True, preserve original XBRL concept names
                          (e.g., "us-gaap:Assets" instead of "Assets")
            as_statement: If True, return a Statement object (default),
                          if False, return DataFrame

        Returns:
            Statement object with rich formatting by default,
            or pandas DataFrame if as_statement=False

        Example:
            >>> stmt = xbrl.current_period.balance_sheet()
            >>> print(stmt)  # Rich formatted table
            >>> df = xbrl.current_period.balance_sheet(as_statement=False)
            >>> assets = df[df['label'].str.contains('Assets', case=False)]['value'].iloc[0]
        """
        if as_statement:
            return self._get_statement_object('BalanceSheet')
        return self._get_statement_dataframe('BalanceSheet', raw_concepts=raw_concepts)

    def income_statement(self, raw_concepts: bool = False, as_statement: bool = True) -> Union[pd.DataFrame, 'Statement']:
        """
        Get current period income statement data.

        Args:
            raw_concepts: If True, preserve original XBRL concept names
                          (e.g., "us-gaap:Revenues" instead of "Revenue")
            as_statement: If True, return a Statement object (default),
                          if False, return DataFrame

        Returns:
            Statement object with rich formatting by default,
            or pandas DataFrame if as_statement=False

        Example:
            >>> stmt = xbrl.current_period.income_statement()
            >>> print(stmt)  # Rich formatted table
            >>> df = xbrl.current_period.income_statement(as_statement=False, raw_concepts=True)
            >>> revenue = df[df['concept'].str.contains('Revenues')]['value'].iloc[0]
        """
        if as_statement:
            return self._get_statement_object('IncomeStatement')
        return self._get_statement_dataframe('IncomeStatement', raw_concepts=raw_concepts)

    def cashflow_statement(self, raw_concepts: bool = False, as_statement: bool = True) -> Union[pd.DataFrame, 'Statement']:
        """
        Get current period cash flow statement data.

        Args:
            raw_concepts: If True, preserve original XBRL concept names
                          (e.g., "us-gaap:NetCashProvidedByUsedInOperatingActivities")
            as_statement: If True, return a Statement object (default),
                          if False, return DataFrame

        Returns:
            Statement object with rich formatting by default,
            or pandas DataFrame if as_statement=False

        Example:
            >>> stmt = xbrl.current_period.cashflow_statement()
            >>> print(stmt)  # Rich formatted table
            >>> df = xbrl.current_period.cashflow_statement(as_statement=False)
            >>> operating_cf = df[df['label'].str.contains('Operating')]['value'].iloc[0]
        """
        if as_statement:
            return self._get_statement_object('CashFlowStatement')
        return self._get_statement_dataframe('CashFlowStatement', raw_concepts=raw_concepts)

    def statement_of_equity(self, raw_concepts: bool = False, as_statement: bool = True) -> Union[pd.DataFrame, 'Statement']:
        """
        Get current period statement of equity data.

        Args:
            raw_concepts: If True, preserve original XBRL concept names
            as_statement: If True, return a Statement object (default),
                          if False, return DataFrame

        Returns:
            Statement object with rich formatting by default,
            or pandas DataFrame if as_statement=False
        """
        if as_statement:
            return self._get_statement_object('StatementOfEquity')
        return self._get_statement_dataframe('StatementOfEquity', raw_concepts=raw_concepts)

    def comprehensive_income(self, raw_concepts: bool = False, as_statement: bool = True) -> Union[pd.DataFrame, 'Statement']:
        """
        Get current period comprehensive income statement data.

        Args:
            raw_concepts: If True, preserve original XBRL concept names
            as_statement: If True, return a Statement object (default),
                          if False, return DataFrame

        Returns:
            Statement object with rich formatting by default,
            or pandas DataFrame if as_statement=False
        """
        if as_statement:
            return self._get_statement_object('ComprehensiveIncome')
        return self._get_statement_dataframe('ComprehensiveIncome', raw_concepts=raw_concepts)

    def _get_statement_dataframe(self, statement_type: str, raw_concepts: bool = False) -> pd.DataFrame:
        """
        Internal method to get statement data as DataFrame for current period.

        Args:
            statement_type: Type of statement ('BalanceSheet', 'IncomeStatement', etc.)
            raw_concepts: Whether to preserve raw XBRL concept names

        Returns:
            pandas DataFrame with statement data filtered to current period

        Raises:
            StatementNotFound: If the requested statement type is not available
        """
        try:
            # Select appropriate period based on statement type
            period_filter = self._get_appropriate_period_for_statement(statement_type)
            # Get raw statement data filtered to current period
            statement_data = self.xbrl.get_statement(statement_type, period_filter=period_filter)
            if not statement_data:
                entity_name = getattr(self.xbrl, 'entity_name', 'Unknown')
                raise StatementNotFound(
                    statement_type=statement_type,
                    confidence=0.0,
                    found_statements=[],
                    entity_name=entity_name,
                    reason=f"No data found for {statement_type} in period {self.period_label}"
                )
            # Convert to DataFrame
            rows = []
            for item in statement_data:
                # Get the value for appropriate period
                values = item.get('values', {})
                current_value = values.get(period_filter)
                if current_value is not None:
                    row = {
                        'concept': self._get_concept_name(item, raw_concepts),
                        'label': item.get('label', ''),
                        'value': current_value,
                        'level': item.get('level', 0),
                        'is_abstract': item.get('is_abstract', False)
                    }
                    # Add original concept name if raw_concepts is requested
                    if raw_concepts:
                        row['standardized_label'] = item.get('label', '')
                        # Try to get original concept names from all_names
                        all_names = item.get('all_names', [])
                        if all_names:
                            row['original_concept'] = all_names[0]  # First is usually original
                    # Add dimension information if present
                    if item.get('is_dimension', False):
                        row['dimension_label'] = item.get('full_dimension_label', '')
                        row['is_dimension'] = True
                    rows.append(row)
            if not rows:
                # Create empty DataFrame with expected structure
                columns = ['concept', 'label', 'value', 'level', 'is_abstract']
                if raw_concepts:
                    columns.extend(['standardized_label', 'original_concept'])
                return pd.DataFrame(columns=columns)
            return pd.DataFrame(rows)
        except StatementNotFound:
            # Already a targeted, informative error - propagate it unchanged
            # instead of letting the broad handler below re-wrap it and
            # obscure the original reason.
            raise
        except Exception as e:
            log.error(f"Error retrieving {statement_type} for current period: {str(e)}")
            entity_name = getattr(self.xbrl, 'entity_name', 'Unknown')
            raise StatementNotFound(
                statement_type=statement_type,
                confidence=0.0,
                found_statements=[],
                entity_name=entity_name,
                reason=f"Failed to retrieve {statement_type}: {str(e)}"
            ) from e

    def _get_statement_object(self, statement_type: str) -> 'Statement':
        """
        Internal method to get statement as a Statement object for current period.

        Args:
            statement_type: Type of statement ('BalanceSheet', 'IncomeStatement', etc.)

        Returns:
            Statement object with current period filtering applied

        Raises:
            StatementNotFound: If the requested statement type is not available
        """
        try:
            # Select appropriate period based on statement type
            period_filter = self._get_appropriate_period_for_statement(statement_type)
            # Find the statement using the unified statement finder
            matching_statements, found_role, actual_statement_type = self.xbrl.find_statement(statement_type)
            if not found_role:
                entity_name = getattr(self.xbrl, 'entity_name', 'Unknown')
                raise StatementNotFound(
                    statement_type=statement_type,
                    confidence=0.0,
                    found_statements=[],
                    entity_name=entity_name,
                    reason=f"No matching {statement_type} found for current period {self.period_label}"
                )
            # Create a Statement object with period filtering.
            # CurrentPeriodStatement applies the period filter on render/access.
            statement = CurrentPeriodStatement(
                self.xbrl,
                found_role,
                canonical_type=statement_type,
                period_filter=period_filter,
                period_label=self.period_label
            )
            return statement
        except StatementNotFound:
            # Propagate the targeted error unchanged (see note above).
            raise
        except Exception as e:
            log.error(f"Error retrieving {statement_type} statement object for current period: {str(e)}")
            entity_name = getattr(self.xbrl, 'entity_name', 'Unknown')
            raise StatementNotFound(
                statement_type=statement_type,
                confidence=0.0,
                found_statements=[],
                entity_name=entity_name,
                reason=f"Failed to retrieve {statement_type} statement: {str(e)}"
            ) from e

    def _get_concept_name(self, item: Dict[str, Any], raw_concepts: bool) -> str:
        """
        Get the appropriate concept name based on raw_concepts flag.

        Args:
            item: Statement line item dictionary
            raw_concepts: Whether to use raw XBRL concept names

        Returns:
            Concept name (raw or processed)
        """
        if raw_concepts:
            # Try to get original concept name
            all_names = item.get('all_names', [])
            if all_names:
                # Return first name, converting underscores back to colons for XBRL format
                original = all_names[0]
                if '_' in original and ':' not in original:
                    # This looks like a normalized name, try to restore colon format
                    parts = original.split('_', 1)
                    if len(parts) == 2 and parts[0] in ['us-gaap', 'dei', 'srt']:
                        return f"{parts[0]}:{parts[1]}"
                return original
            return item.get('concept', '')
        else:
            # Use processed concept name
            return item.get('concept', '')

    def notes(self, section_name: Optional[str] = None) -> List[Dict[str, Any]]:
        """
        Get notes to financial statements for the current period.

        Args:
            section_name: Optional specific note section to retrieve
                          (e.g., "inventory", "revenue recognition")

        Returns:
            List of note sections with their content

        Note:
            This is a placeholder implementation. Full notes access would require
            additional development to parse and structure note content.
        """
        # Get all statements and filter for notes
        all_statements = self.xbrl.get_all_statements()
        note_statements = []
        for stmt in all_statements:
            stmt_type = (stmt.get('type') or '').lower()
            definition = (stmt.get('definition') or '').lower()
            # Check if this looks like a note section
            if ('note' in stmt_type or 'note' in definition or
                'disclosure' in stmt_type or 'disclosure' in definition):
                # If specific section requested, filter by name
                if section_name:
                    if section_name.lower() in definition or section_name.lower() in stmt_type:
                        note_statements.append({
                            'section_name': stmt.get('definition', 'Untitled Note'),
                            'type': stmt.get('type', ''),
                            'role': stmt.get('role', ''),
                            'element_count': stmt.get('element_count', 0)
                        })
                else:
                    # Return all note sections
                    note_statements.append({
                        'section_name': stmt.get('definition', 'Untitled Note'),
                        'type': stmt.get('type', ''),
                        'role': stmt.get('role', ''),
                        'element_count': stmt.get('element_count', 0)
                    })
        return note_statements

    def get_fact(self, concept: str, raw_concept: bool = False) -> Any:
        """
        Get a specific fact value for the current period.

        Args:
            concept: XBRL concept name to look up
            raw_concept: If True, treat concept as raw XBRL name (with colons)

        Returns:
            Fact value if found, None otherwise

        Example:
            >>> revenue = xbrl.current_period.get_fact('Revenues')
            >>> revenue_raw = xbrl.current_period.get_fact('us-gaap:Revenues', raw_concept=True)
        """
        try:
            # Normalize concept name if needed
            if raw_concept and ':' in concept:
                # Convert colon format to underscore for internal lookup
                concept = concept.replace(':', '_')
            # Use XBRL's fact finding method with current period filter
            facts = self.xbrl._find_facts_for_element(concept, period_filter=self.period_key)
            if facts:
                # Return the first matching fact's value
                for _context_id, wrapped_fact in facts.items():
                    fact = wrapped_fact['fact']
                    return fact.numeric_value if fact.numeric_value is not None else fact.value
            return None
        except Exception as e:
            # Best-effort lookup: swallow errors and return None by design.
            log.debug(f"Error retrieving fact {concept}: {str(e)}")
            return None

    def to_dict(self) -> Dict[str, Any]:
        """
        Convert current period data to a dictionary format.

        Returns:
            Dictionary with current period information and key financial data
        """
        result = {
            'period_key': self.period_key,
            'period_label': self.period_label,
            'entity_name': getattr(self.xbrl, 'entity_name', None),
            'document_type': getattr(self.xbrl, 'document_type', None),
            'statements': {}
        }
        # Try to get key statements
        statement_types = ['BalanceSheet', 'IncomeStatement', 'CashFlowStatement']
        for stmt_type in statement_types:
            try:
                df = self._get_statement_dataframe(stmt_type, raw_concepts=False)
                if not df.empty:
                    # Convert DataFrame to list of dicts for JSON serialization
                    result['statements'][stmt_type] = df.to_dict('records')
            except StatementNotFound:
                result['statements'][stmt_type] = None
        return result

    def debug_info(self) -> Dict[str, Any]:
        """
        Get debugging information about the current period and data availability.

        Returns:
            Dictionary with detailed debugging information
        """
        info = {
            'current_period_key': self.period_key,
            'current_period_label': self.period_label,
            'total_reporting_periods': len(self.xbrl.reporting_periods),
            'entity_name': getattr(self.xbrl, 'entity_name', 'Unknown'),
            'document_period_end': getattr(self.xbrl, 'period_of_report', None),
            'periods': [],
            'statements': {}
        }
        # Add all periods with basic info
        for period in self.xbrl.reporting_periods:
            period_info = {
                'key': period['key'],
                'label': period.get('label', 'No label'),
                'type': 'instant' if 'instant_' in period['key'] else 'duration'
            }
            info['periods'].append(period_info)
        # Check statement availability
        statement_types = ['BalanceSheet', 'IncomeStatement', 'CashFlowStatement']
        for stmt_type in statement_types:
            try:
                # Get the period that would be used for this statement
                period_for_stmt = self._get_appropriate_period_for_statement(stmt_type)
                # Get raw statement data
                raw_data = self.xbrl.get_statement(stmt_type, period_filter=period_for_stmt)
                if raw_data:
                    # Count items with values
                    items_with_values = sum(1 for item in raw_data
                                            if period_for_stmt in item.get('values', {}))
                    info['statements'][stmt_type] = {
                        'period_used': period_for_stmt,
                        'raw_data_items': len(raw_data),
                        'items_with_values': items_with_values,
                        'available': items_with_values > 0,
                        'error': None
                    }
                else:
                    info['statements'][stmt_type] = {
                        'period_used': period_for_stmt,
                        'raw_data_items': 0,
                        'items_with_values': 0,
                        'available': False,
                        'error': 'No raw data returned'
                    }
            except Exception as e:
                info['statements'][stmt_type] = {
                    'period_used': None,
                    'raw_data_items': 0,
                    'items_with_values': 0,
                    'available': False,
                    'error': str(e)
                }
        return info

    def __repr__(self) -> str:
        """String representation showing current period info."""
        entity_name = getattr(self.xbrl, 'entity_name', 'Unknown Entity')
        return f"CurrentPeriodView(entity='{entity_name}', period='{self.period_label}')"

    def __str__(self) -> str:
        """User-friendly string representation."""
        entity_name = getattr(self.xbrl, 'entity_name', 'Unknown Entity')
        return f"Current Period Data for {entity_name}\nPeriod: {self.period_label}"
class CurrentPeriodStatement:
"""
A Statement object that applies current period filtering.
This class wraps a regular Statement object and ensures that only
the current period data is shown when rendering or accessing data.
"""
def __init__(self, xbrl, role_or_type: str, canonical_type: Optional[str] = None,
period_filter: Optional[str] = None, period_label: Optional[str] = None):
"""
Initialize with period filtering.
Args:
xbrl: XBRL object containing parsed data
role_or_type: Role URI, statement type, or statement short name
canonical_type: Optional canonical statement type
period_filter: Period key to filter to
period_label: Human-readable period label
"""
self.xbrl = xbrl
self.role_or_type = role_or_type
self.canonical_type = canonical_type
self.period_filter = period_filter
self.period_label = period_label
# Create the underlying Statement object
from edgar.xbrl.statements import Statement
self._statement = Statement(xbrl, role_or_type, canonical_type, skip_concept_check=True)
def render(self, standard: bool = True, show_date_range: bool = False,
include_dimensions: bool = True) -> Any:
"""
Render the statement as a formatted table for current period only.
Args:
standard: Whether to use standardized concept labels
show_date_range: Whether to show full date ranges for duration periods
include_dimensions: Whether to include dimensional segment data
Returns:
Rich Table containing the rendered statement for current period
"""
# Use the canonical type for rendering if available, otherwise use the role
rendering_type = self.canonical_type if self.canonical_type else self.role_or_type
return self.xbrl.render_statement(
rendering_type,
period_filter=self.period_filter,
standard=standard,
show_date_range=show_date_range,
include_dimensions=include_dimensions
)
def get_raw_data(self) -> List[Dict[str, Any]]:
"""
Get the raw statement data filtered to current period.
Returns:
List of line items with values for current period only
"""
return self._statement.get_raw_data(period_filter=self.period_filter)
def get_dataframe(self, raw_concepts: bool = False) -> pd.DataFrame:
    """
    Convert the statement to a DataFrame for the current period.

    Args:
        raw_concepts: If True, report original XBRL concept names
            (e.g. ``us-gaap:Revenues``) instead of normalized ones, and
            include ``standardized_label``/``original_concept`` columns.

    Returns:
        pandas DataFrame with one row per line item that has a value in
        the current period.
    """
    namespaces = ('us-gaap', 'dei', 'srt')

    def _original_name(item):
        # The first entry in 'all_names' is the as-filed concept name.
        names = item.get('all_names', [])
        return names[0] if names else None

    def _raw_concept(item, fallback):
        # Restore a namespaced concept ("us-gaap:Revenues") from the
        # underscore form ("us-gaap_Revenues") used internally; return the
        # fallback when no original name is recorded.
        original = _original_name(item)
        if original is None:
            return fallback
        if '_' in original and ':' not in original:
            prefix, _, local = original.partition('_')
            if prefix in namespaces:
                return f"{prefix}:{local}"
        return original

    rows = []
    for item in self.get_raw_data():
        current_value = item.get('values', {}).get(self.period_filter)
        if current_value is None:
            # Skip line items with no reported value for the current period.
            continue

        concept_name = item.get('concept', '')
        if raw_concepts:
            concept_name = _raw_concept(item, concept_name)

        row = {
            'concept': concept_name,
            'label': item.get('label', ''),
            'value': current_value,
            'level': item.get('level', 0),
            'is_abstract': item.get('is_abstract', False),
        }

        if raw_concepts:
            # Keep the standardized label alongside the raw concept so both
            # naming schemes remain available to the caller.
            row['standardized_label'] = item.get('label', '')
            original = _original_name(item)
            if original is not None:
                row['original_concept'] = original

        # Add dimension information if present.
        if item.get('is_dimension', False):
            row['dimension_label'] = item.get('full_dimension_label', '')
            row['is_dimension'] = True

        rows.append(row)

    return pd.DataFrame(rows)
def calculate_ratios(self) -> Dict[str, float]:
    """Compute common financial ratios by delegating to the wrapped Statement."""
    return self._statement.calculate_ratios()
def __rich__(self) -> Any:
    """Rich console representation: the rendered current-period table."""
    return self.render()
def __repr__(self) -> str:
    """Plain-text representation derived from the rich rendering."""
    return repr_rich(self.__rich__())
def __str__(self) -> str:
    """User-friendly string form (identical to ``repr``)."""
    return repr(self)

View File

@@ -0,0 +1,232 @@
"""
Revenue Deduplication Strategy for Issue #438
This module implements intelligent deduplication for revenue concepts
that may have the same underlying value but different GAAP concept names.
The strategy:
1. Identify groups of items with the same value in the same period
2. Apply hierarchical precedence rules to choose the most appropriate concept
3. Filter out less specific concepts when duplicates exist
Revenue Concept Hierarchy (most to least preferred):
1. us-gaap:RevenueFromContractWithCustomerExcludingAssessedTax (most specific - ASC 606)
2. us-gaap:Revenues (standard general concept)
3. us-gaap:SalesRevenueNet (less common)
4. us-gaap:Revenue (least specific)
"""
import logging
from collections import defaultdict
from typing import Any, Dict, List, Set

log = logging.getLogger(__name__)


class RevenueDeduplicator:
    """
    Handles deduplication of revenue concepts in financial statements.

    Filings sometimes report the same revenue figure under several GAAP
    concept names (for example ``us-gaap:Revenues`` alongside
    ``us-gaap:RevenueFromContractWithCustomerExcludingAssessedTax``).
    This class finds groups of revenue line items that share the same value
    in the same period and keeps only the most specific concept according
    to the precedence table below.
    """

    # Revenue concept precedence (higher number = higher precedence).
    REVENUE_CONCEPT_PRECEDENCE = {
        'us-gaap:RevenueFromContractWithCustomerExcludingAssessedTax': 100,  # Most specific (ASC 606)
        'us-gaap:Revenues': 90,  # Standard concept
        'us-gaap:SalesRevenueNet': 80,  # Alternative concept
        'us-gaap:Revenue': 70,  # Generic concept
        'us-gaap:TotalRevenuesAndGains': 60,  # Broader concept
    }

    # Additional revenue-related concepts that might cause duplicates.
    REVENUE_RELATED_CONCEPTS = {
        'RevenueFromContractWithCustomerExcludingAssessedTax',
        'Revenues',
        'Revenue',
        'SalesRevenueNet',
        'TotalRevenuesAndGains',
        'RevenueFromContractWithCustomer',
        'TotalRevenues'
    }

    @classmethod
    def deduplicate_statement_items(cls, statement_items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """
        Remove duplicate revenue concepts from statement items.

        Args:
            statement_items: List of statement line items.

        Returns:
            Filtered list with duplicates removed; original order is preserved.
        """
        if not statement_items:
            return statement_items

        # Group items by (period, value) pairs to find potential duplicates.
        period_value_groups = cls._group_by_period_value(statement_items)

        # Collect indices of the items that lose the precedence contest.
        items_to_remove: Set[int] = set()
        for (_period, _value), items in period_value_groups.items():
            if len(items) > 1 and cls._are_revenue_duplicates(items):
                # Multiple revenue items carry the same value in the same
                # period: keep only the highest-precedence one.
                items_to_remove.update(cls._select_duplicates_to_remove(items))

        # Rebuild the list without the removed items, preserving order.
        result = []
        for i, item in enumerate(statement_items):
            if i in items_to_remove:
                log.debug("Removed duplicate revenue item: %s = %s",
                          item.get('label', 'Unknown'), item.get('values', {}))
            else:
                result.append(item)

        removed_count = len(statement_items) - len(result)
        if removed_count > 0:
            log.info("Revenue deduplication: removed %d duplicate items", removed_count)
        return result

    @classmethod
    def _group_by_period_value(cls, statement_items: List[Dict[str, Any]]) -> Dict[tuple, List[tuple]]:
        """
        Group statement items by (period, value) pairs.

        Zero and missing values are ignored: they are common and would
        create huge, meaningless duplicate groups.

        Returns:
            Dict mapping (period, value) to list of (index, item) tuples.
        """
        groups = defaultdict(list)
        for i, item in enumerate(statement_items):
            for period, value in item.get('values', {}).items():
                if value is not None and value != 0:
                    groups[(period, value)].append((i, item))
        return groups

    @classmethod
    def _are_revenue_duplicates(cls, indexed_items: List[tuple]) -> bool:
        """
        Check whether a same-value group contains more than one revenue item.

        Args:
            indexed_items: List of (index, item) tuples.

        Returns:
            True if at least two items in the group are revenue concepts.
        """
        revenue_count = sum(1 for _, item in indexed_items if cls._is_revenue_concept(item))
        return revenue_count > 1

    @classmethod
    def _is_revenue_concept(cls, item: Dict[str, Any]) -> bool:
        """
        Check if an item represents a revenue concept.

        Cost/expense-style items are excluded first so that e.g.
        "Cost of revenue" is never treated as revenue.
        """
        concept = item.get('concept', '')
        all_names = item.get('all_names', [])
        label = item.get('label', '').lower()

        # First check for exclusions (costs, expenses, etc.).
        exclusion_terms = ['cost', 'expense', 'loss', 'depreciation', 'amortization']
        for name in [concept] + all_names + [label]:
            if any(excl in name.lower() for excl in exclusion_terms):
                return False

        # Look for revenue-related terms in concept or names (case-sensitive,
        # matching the CamelCase concept fragments).
        for name in [concept] + all_names:
            if any(term in name for term in cls.REVENUE_RELATED_CONCEPTS):
                return True

        # Also check the label for revenue indicators.
        if any(term in label for term in ['revenue', 'sales']):
            return True
        return False

    @classmethod
    def _select_duplicates_to_remove(cls, indexed_items: List[tuple]) -> Set[int]:
        """
        Select which items to remove from a duplicate group.

        Args:
            indexed_items: List of (index, item) tuples.

        Returns:
            Set of indices to remove (everything except the single winner).
        """
        if len(indexed_items) <= 1:
            return set()

        # The winner has the highest precedence score; on a tie the item
        # appearing later in the statement wins (matches prior behavior of
        # sorting (score, index) descending).
        keep_index, _ = max(
            indexed_items,
            key=lambda pair: (cls._get_precedence_score(pair[1]), pair[0]),
        )
        return {index for index, _ in indexed_items if index != keep_index}

    @classmethod
    def _get_precedence_score(cls, item: Dict[str, Any]) -> int:
        """
        Get the precedence score for a revenue concept.

        Higher scores are preferred and will be kept.
        """
        concept = item.get('concept', '')
        all_names = item.get('all_names', [])
        candidates = [concept] + all_names

        # Exact matches in the precedence table win outright.
        for name in candidates:
            if name in cls.REVENUE_CONCEPT_PRECEDENCE:
                return cls.REVENUE_CONCEPT_PRECEDENCE[name]

        # Fall back to substring matching on the local (namespace-free)
        # concept name.  Longer local names are tried first so a short,
        # higher-scored name (e.g. 'Revenues') cannot mis-score a longer
        # concept that merely contains it (e.g. 'TotalRevenuesAndGains').
        by_specificity = sorted(
            cls.REVENUE_CONCEPT_PRECEDENCE.items(),
            key=lambda kv: len(kv[0].split(':')[-1]),
            reverse=True,
        )
        for name in candidates:
            for precedence_concept, score in by_specificity:
                if precedence_concept.split(':')[-1] in name:
                    return score

        # Default score for unrecognized revenue concepts.
        return 50

    @classmethod
    def get_deduplication_stats(cls, original_items: List[Dict[str, Any]],
                                deduplicated_items: List[Dict[str, Any]]) -> Dict[str, Any]:
        """
        Generate statistics about the deduplication process.
        """
        original_count = len(original_items)
        deduplicated_count = len(deduplicated_items)
        removed_count = original_count - deduplicated_count

        # Count how many revenue items existed before and after.
        original_revenue_count = sum(1 for item in original_items if cls._is_revenue_concept(item))
        deduplicated_revenue_count = sum(1 for item in deduplicated_items if cls._is_revenue_concept(item))

        return {
            'original_total_items': original_count,
            'deduplicated_total_items': deduplicated_count,
            'removed_items': removed_count,
            'original_revenue_items': original_revenue_count,
            'deduplicated_revenue_items': deduplicated_revenue_count,
            'removed_revenue_items': original_revenue_count - deduplicated_revenue_count,
            'deduplication_performed': removed_count > 0
        }

View File

@@ -0,0 +1,567 @@
# Statement Class Documentation
## Overview
The `Statement` class represents a single financial statement extracted from XBRL data. It provides methods for viewing, manipulating, and analyzing financial statement data including income statements, balance sheets, cash flow statements, and disclosure notes.
A Statement object contains:
- **Line items** with values across multiple periods
- **Hierarchy** showing the structure and relationships
- **Metadata** including concept names and labels
- **Period information** for time-series analysis
## Getting a Statement
### From XBRL
```python
# Get XBRL data first
xbrl = filing.xbrl()
# Access specific statements
income = xbrl.statements.income_statement()
balance = xbrl.statements.balance_sheet()
cashflow = xbrl.statements.cash_flow_statement()
equity = xbrl.statements.statement_of_equity()
# By name
cover_page = xbrl.statements['CoverPage']
# By index
first_statement = xbrl.statements[0]
```
## Viewing Statements
### Rich Display
```python
# Print statement to see formatted table
print(income)
# Shows:
# - Statement title
# - Line items with hierarchical structure
# - Values for multiple periods
# - Proper number formatting
```
### Text Representation
```python
# Get plain text version
text = str(income)
# Or explicitly
text_output = income.text()
```
## Converting to DataFrame
### Basic Conversion
```python
# Convert statement to pandas DataFrame
df = income.to_dataframe()
# DataFrame structure:
# - Index: Line item labels or concepts
# - Columns: Period dates
# - Values: Financial amounts
```
### With Period Filter
```python
# Filter to specific periods
df = income.to_dataframe(period_filter='2024')
# Only includes periods matching the filter
```
### Accessing Specific Data
```python
# Convert to DataFrame for easy analysis
df = income.to_dataframe()
# Access specific line items
revenue = df.loc['Revenue']
net_income = df.loc['Net Income']
# Access specific periods
current_period = df.iloc[:, 0] # First column (most recent)
prior_period = df.iloc[:, 1] # Second column
# Specific cell
current_revenue = df.loc['Revenue', df.columns[0]]
```
## Statement Properties
### Available Periods
```python
# Get list of periods in the statement
periods = statement.periods
# Each period is a date string (YYYY-MM-DD)
for period in periods:
print(f"Data available for: {period}")
```
### Statement Name and Type
```python
# Get statement information
name = statement.name # Statement display name
concept = statement.concept # XBRL concept identifier
```
### Raw Data Access
```python
# Get underlying statement data structure
raw_data = statement.get_raw_data()
# Returns list of dictionaries with:
# - concept: XBRL concept name
# - label: Display label
# - values: Dict of period -> value
# - level: Hierarchy depth
# - all_names: All concept variations
```
## Rendering and Display
### Custom Rendering
```python
# Render with specific options
rendered = statement.render()
# Rendered statement has rich formatting
print(rendered)
```
### Text Export
```python
# Get markdown-formatted text
markdown_text = statement.text()
# Suitable for:
# - AI/LLM consumption
# - Documentation
# - Text-based analysis
```
## Working with Statement Data
### Calculate Growth Rates
```python
# Convert to DataFrame
df = income.to_dataframe()
# Calculate period-over-period growth
if len(df.columns) >= 2:
current = df.iloc[:, 0]
prior = df.iloc[:, 1]
# Growth rate
growth = ((current - prior) / prior * 100).round(2)
# Create comparison DataFrame
comparison = pd.DataFrame({
'Current': current,
'Prior': prior,
'Growth %': growth
})
print(comparison)
```
### Extract Specific Metrics
```python
# Get income statement metrics
df = income.to_dataframe()
# Extract key metrics from most recent period
current = df.iloc[:, 0]
metrics = {
'Revenue': current.get('Revenue', 0),
'Operating Income': current.get('Operating Income', 0),
'Net Income': current.get('Net Income', 0),
}
# Calculate derived metrics
if metrics['Revenue'] > 0:
metrics['Operating Margin'] = (
metrics['Operating Income'] / metrics['Revenue'] * 100
)
metrics['Net Margin'] = (
metrics['Net Income'] / metrics['Revenue'] * 100
)
```
### Filter Line Items
```python
# Convert to DataFrame
df = balance.to_dataframe()
# Filter for specific items
asset_items = df[df.index.str.contains('Asset', case=False)]
liability_items = df[df.index.str.contains('Liabilit', case=False)]
# Get subtotals
if 'Current Assets' in df.index:
current_assets = df.loc['Current Assets']
```
### Time Series Analysis
```python
# Get multiple periods
df = income.to_dataframe()
# Plot revenue trend
if 'Revenue' in df.index:
revenue_series = df.loc['Revenue']
# Convert to numeric and plot
import matplotlib.pyplot as plt
revenue_series.plot(kind='line', title='Revenue Trend')
plt.show()
```
## Common Workflows
### Compare Current vs Prior Period
```python
# Get income statement
income = xbrl.statements.income_statement()
df = income.to_dataframe()
# Ensure we have at least 2 periods
if len(df.columns) >= 2:
# Create comparison
comparison = pd.DataFrame({
'Current': df.iloc[:, 0],
'Prior': df.iloc[:, 1],
'Change': df.iloc[:, 0] - df.iloc[:, 1],
'Change %': ((df.iloc[:, 0] - df.iloc[:, 1]) / df.iloc[:, 1] * 100).round(2)
})
# Show key metrics
key_items = ['Revenue', 'Operating Income', 'Net Income']
for item in key_items:
if item in comparison.index:
print(f"\n{item}:")
print(comparison.loc[item])
```
### Extract All Periods to CSV
```python
# Get statement
statement = xbrl.statements.income_statement()
# Convert and save
df = statement.to_dataframe()
df.to_csv('income_statement.csv')
print(f"Exported {len(df)} line items across {len(df.columns)} periods")
```
### Build Financial Ratios
```python
# Get both income statement and balance sheet
income = xbrl.statements.income_statement()
balance = xbrl.statements.balance_sheet()
# Convert to DataFrames
income_df = income.to_dataframe()
balance_df = balance.to_dataframe()
# Extract values (most recent period)
revenue = income_df.loc['Revenue', income_df.columns[0]]
net_income = income_df.loc['Net Income', income_df.columns[0]]
total_assets = balance_df.loc['Assets', balance_df.columns[0]]
total_equity = balance_df.loc['Equity', balance_df.columns[0]]
# Calculate ratios
ratios = {
'Net Profit Margin': (net_income / revenue * 100).round(2),
'ROA': (net_income / total_assets * 100).round(2),
'ROE': (net_income / total_equity * 100).round(2),
'Asset Turnover': (revenue / total_assets).round(2),
}
print("Financial Ratios:")
for ratio, value in ratios.items():
print(f" {ratio}: {value}")
```
### Search for Specific Items
```python
# Get statement as DataFrame
df = income.to_dataframe()
# Search for items containing keywords
research_costs = df[df.index.str.contains('Research', case=False)]
tax_items = df[df.index.str.contains('Tax', case=False)]
# Or get raw data with concept names
raw = income.get_raw_data()
research_concepts = [
item for item in raw
if 'research' in item['label'].lower()
]
```
### Aggregate Subcategories
```python
# Get statement
df = balance.to_dataframe()
# Define categories (adjust based on actual labels)
current_asset_categories = [
'Cash and Cash Equivalents',
'Accounts Receivable',
'Inventory',
'Other Current Assets'
]
# Sum categories
current_assets_sum = sum([
df.loc[cat, df.columns[0]]
for cat in current_asset_categories
if cat in df.index
])
# Verify against reported total
if 'Current Assets' in df.index:
reported_total = df.loc['Current Assets', df.columns[0]]
print(f"Calculated: {current_assets_sum}")
print(f"Reported: {reported_total}")
print(f"Difference: {current_assets_sum - reported_total}")
```
## Integration with Analysis Tools
### With Pandas
```python
# Statement integrates seamlessly with pandas
df = statement.to_dataframe()
# Use all pandas functionality
summary = df.describe()
correlations = df.T.corr()
rolling_avg = df.T.rolling(window=4).mean()
```
### With NumPy
```python
import numpy as np
# Convert to numpy array for numerical operations
df = statement.to_dataframe()
values = df.values
# Numerical analysis
mean_values = np.mean(values, axis=1)
std_values = np.std(values, axis=1)
growth_rates = np.diff(values, axis=1) / values[:, :-1]
```
### Export for Visualization
```python
# Prepare data for plotting
df = income.to_dataframe()
# Select key items
plot_items = ['Revenue', 'Operating Income', 'Net Income']
plot_data = df.loc[plot_items].T
# Plot with matplotlib
import matplotlib.pyplot as plt
plot_data.plot(kind='bar', figsize=(12, 6))
plt.title('Income Statement Trends')
plt.xlabel('Period')
plt.ylabel('Amount (USD)')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
```
## Error Handling
### Missing Line Items
```python
# Check if item exists before accessing
df = statement.to_dataframe()
if 'Revenue' in df.index:
revenue = df.loc['Revenue']
else:
print("Revenue not found in statement")
# Try alternative names
for alt in ['Revenues', 'Total Revenue', 'Net Revenue']:
if alt in df.index:
revenue = df.loc[alt]
break
```
### Handling Different Formats
```python
# Companies may use different labels
def find_item(df, possible_names):
"""Find item by trying multiple possible names."""
for name in possible_names:
if name in df.index:
return df.loc[name]
return None
# Usage
revenue_names = ['Revenue', 'Revenues', 'Total Revenue', 'Net Sales']
revenue = find_item(df, revenue_names)
if revenue is not None:
print(f"Found revenue: {revenue}")
else:
print("Revenue not found under common names")
```
### Incomplete Period Data
```python
# Check data availability
df = statement.to_dataframe()
# Check for null values
missing_data = df.isnull().sum()
if missing_data.any():
print("Periods with missing data:")
print(missing_data[missing_data > 0])
# Fill missing with 0 or forward fill
df_filled = df.fillna(0) # Replace NaN with 0
# or
df_filled = df.ffill()  # Forward fill (df.fillna(method='ffill') is removed in pandas 2.x)
```
## Best Practices
1. **Always convert to DataFrame for analysis**:
```python
df = statement.to_dataframe() # Easier to work with
```
2. **Check item names before accessing**:
```python
if 'Revenue' in df.index:
revenue = df.loc['Revenue']
```
3. **Handle multiple naming conventions**:
```python
# Try variations
for name in ['Revenue', 'Revenues', 'Total Revenue']:
if name in df.index:
revenue = df.loc[name]
break
```
4. **Validate calculated values**:
```python
# Check against reported totals
calculated = sum(components)
reported = df.loc['Total']
assert abs(calculated - reported) < 0.01, "Mismatch!"
```
5. **Use period filters appropriately**:
```python
# Filter to specific years
df_2024 = statement.to_dataframe(period_filter='2024')
```
## Performance Tips
### Caching DataFrames
```python
# Cache the DataFrame if using repeatedly
df_cache = statement.to_dataframe()
# Reuse cached version
revenue = df_cache.loc['Revenue']
net_income = df_cache.loc['Net Income']
# ... more operations
```
### Selective Period Loading
```python
# If you only need recent data
current_only = xbrl.current_period.income_statement()
df = current_only.to_dataframe() # Smaller, faster
```
## Troubleshooting
### "KeyError: Line item not found"
**Cause**: Item label doesn't match exactly
**Solution**:
```python
# List all available items
print(df.index.tolist())
# Or search for pattern
matching = df[df.index.str.contains('keyword', case=False)]
```
### "Empty DataFrame"
**Cause**: Statement has no data or wrong period filter
**Solution**:
```python
# Check raw data
raw = statement.get_raw_data()
print(f"Statement has {len(raw)} items")
# Check periods
print(f"Available periods: {statement.periods}")
```
### "Index error when accessing columns"
**Cause**: Fewer periods than expected
**Solution**:
```python
# Check column count first
if len(df.columns) >= 2:
current = df.iloc[:, 0]
prior = df.iloc[:, 1]
else:
print("Insufficient periods for comparison")
```
This guide covers the essential patterns for working with Statement objects in edgartools. For information on accessing statements from XBRL, see the XBRL documentation.

View File

@@ -0,0 +1,587 @@
# XBRL Class Documentation
## Overview
The `XBRL` class is the primary interface for working with XBRL (eXtensible Business Reporting Language) financial data from SEC filings. It provides structured access to financial statements, facts, and related data extracted from filings like 10-K, 10-Q, and 8-K reports.
XBRL documents contain:
- **Financial statements** (Income Statement, Balance Sheet, Cash Flow, etc.)
- **Facts** - Individual data points with values, periods, and dimensions
- **Contexts** - Time periods and dimensional information
- **Presentation** - How facts are organized into statements
## Getting XBRL Data
### From a Filing
```python
# Get XBRL from any filing with financial data
filing = company.get_filings(form="10-K").latest()
xbrl = filing.xbrl()
```
### Quick Check
```python
# Print XBRL to see what's available
print(xbrl)
# Shows: company info, available statements, periods, and usage examples
```
## Accessing Financial Statements
### Core Statement Methods
The XBRL class provides convenient methods for accessing standard financial statements:
```python
# Access core financial statements
income = xbrl.statements.income_statement()
balance = xbrl.statements.balance_sheet()
cashflow = xbrl.statements.cash_flow_statement()
equity = xbrl.statements.statement_of_equity()
comprehensive = xbrl.statements.comprehensive_income()
```
### Access by Name
You can access any statement by its exact name as it appears in the filing:
```python
# List all available statements
print(xbrl.statements)
# Access specific statement by name
cover_page = xbrl.statements['CoverPage']
disclosure = xbrl.statements['CONDENSED CONSOLIDATED BALANCE SHEETS Unaudited']
```
### Access by Index
Statements can also be accessed by their index position:
```python
# Get statement by index (0-based)
first_statement = xbrl.statements[0]
sixth_statement = xbrl.statements[6]
```
## Working with Periods
### Current Period Only
To work with just the most recent period's data:
```python
# Get current period XBRL view
current = xbrl.current_period
# Access statements for current period
current_income = current.income_statement()
current_balance = current.balance_sheet()
```
### Multi-Period Statements
By default, statements include multiple periods for comparison:
```python
# Get income statement with comparative periods
income = xbrl.statements.income_statement()
# Typically includes current year/quarter and prior periods
# Convert to DataFrame to see all periods
df = income.to_dataframe()
print(df.columns) # Shows all available periods
```
### Available Periods
```python
# See what periods are available
for period in xbrl.reporting_periods:
print(f"Period: {period['label']}, Key: {period['key']}")
```
## Querying Facts
The `.facts` property provides a powerful query interface for finding specific data points:
### Basic Fact Queries
```python
# Get all revenue facts
revenue_facts = xbrl.facts.query().by_concept('Revenue').to_dataframe()
# Get net income facts
net_income = xbrl.facts.query().by_concept('NetIncome').to_dataframe()
# Search by label instead of concept name
revenue = xbrl.facts.query().by_label('Revenue').to_dataframe()
```
### Filter by Period
```python
# Get facts for a specific period
period_key = "duration_2024-01-01_2024-12-31"
facts_2024 = xbrl.facts.query().by_period_key(period_key).to_dataframe()
# Filter by fiscal year
facts_fy2024 = xbrl.facts.query().by_fiscal_year(2024).to_dataframe()
# Filter by fiscal period
q1_facts = xbrl.facts.query().by_fiscal_period("Q1").to_dataframe()
```
### Filter by Statement Type
```python
# Get all income statement facts
income_facts = xbrl.facts.query().by_statement_type("IncomeStatement").to_dataframe()
# Get all balance sheet facts
balance_facts = xbrl.facts.query().by_statement_type("BalanceSheet").to_dataframe()
```
### Chaining Filters
```python
# Combine multiple filters
revenue_2024 = (xbrl.facts.query()
.by_concept('Revenue')
.by_fiscal_year(2024)
.by_period_type('duration')
.to_dataframe())
```
### Pattern Matching
```python
# Find all concepts matching a pattern (case-insensitive)
asset_facts = xbrl.facts.query().by_concept('Asset', exact=False).to_dataframe()
# Search labels with pattern
liability_facts = xbrl.facts.query().by_label('liabilities', exact=False).to_dataframe()
```
## Converting to DataFrames
### Statement to DataFrame
```python
# Convert any statement to pandas DataFrame
income = xbrl.statements.income_statement()
df = income.to_dataframe()
# DataFrame has:
# - One row per line item
# - One column per period
# - Index is the concept/label
```
### Facts to DataFrame
```python
# Query returns DataFrame directly
df = xbrl.facts.query().by_concept('Revenue').to_dataframe()
# DataFrame columns:
# - concept: XBRL concept name
# - label: Human-readable label
# - value: Fact value
# - period: Period identifier
# - start: Period start date (for duration)
# - end: Period end date
# - unit: Unit of measure (e.g., USD)
# - dimensions: Dimensional breakdowns (if any)
```
## Advanced Patterns
### Finding Specific Disclosures
```python
# Get statements organized by category
categories = xbrl.statements.get_statements_by_category()
# View all disclosures
disclosures = categories['disclosure']
for disc in disclosures:
print(f"{disc['index']}: {disc['title']}")
# View all notes
notes = categories['note']
for note in notes:
print(f"{note['index']}: {note['title']}")
# Get core financial statements
core_statements = categories['statement']
# Or list all statements to find specific ones
all_statements = xbrl.get_all_statements()
for stmt in all_statements:
print(f"{stmt['type']}: {stmt['title']}")
# Access by exact name or index
risk_factors = xbrl.statements['RiskFactorsDisclosure']
# Or by index from the category list
first_disclosure = xbrl.statements[disclosures[0]['index']]
```
### Cross-Period Analysis
```python
# Get multi-period income statement
income = xbrl.statements.income_statement()
df = income.to_dataframe()
# Calculate year-over-year growth
if len(df.columns) >= 2:
current = df.iloc[:, 0]
prior = df.iloc[:, 1]
growth = ((current - prior) / prior * 100).round(2)
print(f"Revenue growth: {growth.loc['Revenue']}%")
```
### Working with Dimensions
```python
# Query facts with specific dimensional breakdowns
segment_revenue = (xbrl.facts.query()
.by_concept('Revenue')
.by_dimension('Segment', 'ProductSegment')
.to_dataframe())
# Group by dimensions
segment_totals = segment_revenue.groupby('dimensions')['value'].sum()
```
### Custom Fact Filtering
```python
# Use custom filter function
large_amounts = xbrl.facts.query().by_value(lambda v: abs(v) > 1000000).to_dataframe()
# Custom filter with lambda
recent_facts = xbrl.facts.query().by_custom(
lambda fact: fact['end'] >= '2024-01-01'
).to_dataframe()
```
## Common Workflows
### Extract Revenue from Income Statement
```python
# Method 1: Via statement
income = xbrl.statements.income_statement()
df = income.to_dataframe()
revenue = df.loc['Revenue']
# Method 2: Via facts query
revenue_facts = xbrl.facts.query().by_concept('Revenues').to_dataframe()
latest_revenue = revenue_facts.iloc[0]['value']
```
### Compare Current vs Prior Year
```python
# Get current period data
current = xbrl.current_period
current_income = current.income_statement()
current_df = current_income.to_dataframe()
# Get full multi-period data
full_income = xbrl.statements.income_statement()
full_df = full_income.to_dataframe()
# Compare
if len(full_df.columns) >= 2:
comparison = pd.DataFrame({
'Current': full_df.iloc[:, 0],
'Prior': full_df.iloc[:, 1],
'Change': full_df.iloc[:, 0] - full_df.iloc[:, 1]
})
print(comparison)
```
### Extract Specific Disclosure Data
```python
# Find debt-related disclosures
all_statements = xbrl.get_all_statements()
debt_statements = [s for s in all_statements if 'debt' in s['title'].lower()]
# Access first debt disclosure
if debt_statements:
debt_disclosure = xbrl.statements[debt_statements[0]['type']]
debt_df = debt_disclosure.to_dataframe()
```
### Export All Core Statements
```python
# Export all core financial statements to CSV
statements_to_export = {
'income_statement': xbrl.statements.income_statement(),
'balance_sheet': xbrl.statements.balance_sheet(),
'cash_flow': xbrl.statements.cash_flow_statement(),
}
for name, stmt in statements_to_export.items():
if stmt:
df = stmt.to_dataframe()
df.to_csv(f"{name}.csv")
```
### Build Custom Financial Summary
```python
# Extract key metrics from multiple statements
metrics = {}
# Revenue and profit from income statement
income = xbrl.statements.income_statement()
income_df = income.to_dataframe()
metrics['Revenue'] = income_df.loc['Revenue', income_df.columns[0]]
metrics['Net Income'] = income_df.loc['Net Income', income_df.columns[0]]
# Assets from balance sheet
balance = xbrl.statements.balance_sheet()
balance_df = balance.to_dataframe()
metrics['Total Assets'] = balance_df.loc['Assets', balance_df.columns[0]]
# Cash flow from operations
cashflow = xbrl.statements.cash_flow_statement()
cashflow_df = cashflow.to_dataframe()
metrics['Operating Cash Flow'] = cashflow_df.loc['Operating Activities', cashflow_df.columns[0]]
# Create summary DataFrame
summary = pd.DataFrame([metrics])
print(summary)
```
## Entity Information
### Access Filing Metadata
```python
# Get entity and filing information
entity_info = xbrl.entity_info
print(f"Company: {entity_info.get('entity_name')}")
print(f"Ticker: {entity_info.get('trading_symbol')}")
print(f"CIK: {entity_info.get('entity_identifier')}")
print(f"Form: {entity_info.get('document_type')}")
print(f"Fiscal Year: {entity_info.get('document_fiscal_year_focus')}")
print(f"Fiscal Period: {entity_info.get('document_fiscal_period_focus')}")
```
## Error Handling
### Missing Statements
```python
from edgar.xbrl.xbrl import StatementNotFound
try:
equity = xbrl.statements.statement_of_equity()
except StatementNotFound:
print("Statement of equity not available in this filing")
equity = None
```
### Empty Query Results
```python
# Query returns empty DataFrame if no matches
results = xbrl.facts.query().by_concept('NonexistentConcept').to_dataframe()
if results.empty:
print("No facts found matching query")
```
### Handling Multiple Formats
```python
# Some companies use different concept names
revenue_concepts = ['Revenue', 'Revenues', 'SalesRevenue', 'RevenueFromContractWithCustomer']
for concept in revenue_concepts:
revenue = xbrl.facts.query().by_concept(concept).to_dataframe()
if not revenue.empty:
print(f"Found revenue under concept: {concept}")
break
```
## Performance Considerations
### Caching
```python
# Facts are cached after first access
facts = xbrl.facts # First call - loads data
facts2 = xbrl.facts # Subsequent calls use cache
```
### Limiting Results
```python
# Use limit() to reduce memory usage for large result sets
sample_facts = xbrl.facts.query().limit(100).to_dataframe()
```
### Efficient Filtering
```python
# Apply specific filters early in the query chain
# Good: specific filters first
revenue = (xbrl.facts.query()
.by_statement_type("IncomeStatement") # Narrow down first
.by_concept("Revenue") # Then more specific
.to_dataframe())
# Less efficient: broad query then filter
all_facts = xbrl.facts.query().to_dataframe()
revenue = all_facts[all_facts['concept'] == 'Revenue']
```
## Data Structure Reference
### Key Properties
| Property | Type | Description |
|----------|------|-------------|
| `statements` | Statements | Access to financial statements |
| `facts` | FactsView | Query interface for facts |
| `entity_info` | dict | Company and filing metadata |
| `reporting_periods` | list | Available time periods |
| `contexts` | dict | XBRL contexts (periods + dimensions) |
| `units` | dict | Units of measure |
| `current_period` | CurrentPeriodView | Current period only |
### Fact DataFrame Columns
When you convert facts to a DataFrame using `.to_dataframe()`, you get:
- `concept`: XBRL element name (e.g., 'Revenues', 'Assets')
- `label`: Human-readable label
- `value`: Fact value (numeric or text)
- `period`: Period identifier
- `start`: Period start date (for duration periods)
- `end`: Period end date
- `unit`: Unit of measure (e.g., 'USD', 'shares')
- `dimensions`: Dictionary of dimensional breakdowns
- `decimals`: Precision indicator
## Integration with Other Classes
### With Filing
```python
# XBRL comes from filing
filing = company.get_filings(form="10-K").latest()
xbrl = filing.xbrl()
# Access back to filing if needed
# (Store reference if you need it)
```
### With Company
```python
# Get multiple filings and compare XBRL data
filings = company.get_filings(form="10-Q", count=4)
revenue_trend = []
for filing in filings:
xbrl = filing.xbrl()
revenue = xbrl.facts.query().by_concept('Revenue').to_dataframe()
if not revenue.empty:
revenue_trend.append({
'filing_date': filing.filing_date,
'revenue': revenue.iloc[0]['value']
})
trend_df = pd.DataFrame(revenue_trend)
```
## Best Practices
1. **Check statement availability** before accessing:
```python
print(xbrl) # See what's available
```
2. **Use query chaining** for complex filters:
```python
results = (xbrl.facts.query()
.by_statement_type("IncomeStatement")
.by_fiscal_year(2024)
.by_period_type("duration")
.to_dataframe())
```
3. **Handle missing data gracefully**:
```python
try:
stmt = xbrl.statements.equity_statement()
except StatementNotFound:
stmt = None
```
4. **Convert to DataFrame for analysis**:
```python
df = statement.to_dataframe() # Easier to work with
```
5. **Use current_period for latest data**:
```python
current = xbrl.current_period
latest_income = current.income_statement()
```
## Troubleshooting
### "Statement not found"
**Cause**: Statement doesn't exist in this filing or uses non-standard name
**Solution**:
```python
# List all available statements
print(xbrl.statements)
# Or check available types
all_statements = xbrl.get_all_statements()
statement_types = [s['type'] for s in all_statements]
```
### "No facts found"
**Cause**: Concept name doesn't match or no data for period
**Solution**:
```python
# Try pattern matching
results = xbrl.facts.query().by_concept('Revenue', exact=False).to_dataframe()
# Or search by label
results = xbrl.facts.query().by_label('revenue').to_dataframe()
```
### "Empty DataFrame"
**Cause**: Period filter too restrictive or no data available
**Solution**:
```python
# Check available periods
print(xbrl.reporting_periods)
# Query without period filter
all_revenue = xbrl.facts.query().by_concept('Revenue').to_dataframe()
```
This comprehensive guide covers the essential patterns for working with XBRL data in edgartools. For more examples, see the Filing and Statement documentation.

View File

@@ -0,0 +1,311 @@
"""
Examples demonstrating how to use the XBRL2 module.
This module provides multiple examples demonstrating different ways to use the XBRL2 module.
"""
from pathlib import Path
from rich import print
from rich.console import Console
from edgar import Company, Filing
from edgar.xbrl.statements import Statements
from edgar.xbrl.xbrl import XBRL
def render_financial_statements(ticker="AAPL"):
    """
    Demonstrates how to render financial statements in a tabular format.

    Args:
        ticker: Ticker symbol of the company to fetch; its latest 10-K filing
            is parsed and rendered.
    """
    # Use the requested ticker (previously hard-coded to "AAPL", which made
    # the parameter a no-op).
    company = Company(ticker)
    # Get the latest filing
    filing = company.latest("10-K")
    # Create an XBRL object
    xbrl = XBRL.from_filing(filing)
    # Display entity information
    print("\n[bold]Entity Information:[/bold]")
    for key, value in xbrl.entity_info.items():
        print(f"{key}: {value}")
    # Display available reporting periods
    print("\n[bold]Available Reporting Periods:[/bold]")
    for i, period in enumerate(xbrl.reporting_periods):
        if period['type'] == 'instant':
            print(f"{i + 1}. As of {period['date']}")
        else:
            print(f"{i + 1}. {period['start_date']} to {period['end_date']}")
    # Show available period views for each statement
    print("\n[bold]Available Period Views for Balance Sheet:[/bold]")
    bs_views = xbrl.get_period_views("BalanceSheet")
    for view in bs_views:
        print(f"- {view['name']}: {view['description']}")
    print("\n[bold]Available Period Views for Income Statement:[/bold]")
    is_views = xbrl.get_period_views("IncomeStatement")
    for view in is_views:
        print(f"- {view['name']}: {view['description']}")
    # Render Balance Sheet using default view
    print("\n[bold]Balance Sheet (Default View):[/bold]")
    balance_sheet = xbrl.render_statement("BalanceSheet")
    print(balance_sheet)
    # Render Balance Sheet with Current vs. Previous Period view if available
    if bs_views and any(v['name'] == 'Current vs. Previous Period' for v in bs_views):
        print("\n[bold]Balance Sheet (Current vs. Previous Period):[/bold]")
        current_vs_prev_bs = xbrl.render_statement("BalanceSheet", period_view="Current vs. Previous Period")
        print(current_vs_prev_bs)
    # Render Income Statement using default view
    print("\n[bold]Income Statement (Default View):[/bold]")
    income_statement = xbrl.render_statement("IncomeStatement")
    print(income_statement)
    # Render Income Statement with Annual Comparison view if available
    if is_views and any(v['name'] == 'Annual Comparison' for v in is_views):
        print("\n[bold]Income Statement (Annual Comparison):[/bold]")
        annual_is = xbrl.render_statement("IncomeStatement", period_view="Annual Comparison")
        print(annual_is)
    # Render Cash Flow Statement
    print("\n[bold]Cash Flow Statement:[/bold]")
    cash_flow = xbrl.render_statement("CashFlowStatement")
    print(cash_flow)
    # Render single-period views using explicit period filters
    if xbrl.reporting_periods:
        # Use the most recent instant period for Balance Sheet
        instant_periods = [p for p in xbrl.reporting_periods if p['type'] == 'instant']
        if instant_periods:
            period = instant_periods[0]  # Most recent period
            period_key = f"instant_{period['date']}"
            print(f"\n[bold]Balance Sheet (As of {period['date']} only):[/bold]")
            single_period_bs = xbrl.render_statement("BalanceSheet", period_filter=period_key)
            print(single_period_bs)
        # Use most recent duration period for Income Statement
        duration_periods = [p for p in xbrl.reporting_periods if p['type'] == 'duration']
        if duration_periods:
            period = duration_periods[0]  # Most recent period
            period_key = f"duration_{period['start_date']}_{period['end_date']}"
            print(f"\n[bold]Income Statement ({period['start_date']} to {period['end_date']} only):[/bold]")
            single_period_is = xbrl.render_statement("IncomeStatement", period_filter=period_key)
            print(single_period_is)
def using_statements_api(ticker="TSLA"):
    """
    Demonstrates the use of the user-friendly Statements API.
    """
    company = Company(ticker)
    # Parse the XBRL attached to the company's latest 10-K
    filing = company.latest("10-K")
    xbrl = XBRL.from_filing(filing)
    # Wrap the raw XBRL in the friendlier Statements facade
    statements = Statements(xbrl)
    print("\n[bold]Available Statements:[/bold]")
    print(statements)
    # Render the three core statements in turn
    core_statements = [
        ("Balance Sheet", statements.balance_sheet),
        ("Income Statement", statements.income_statement),
        ("Cash Flow Statement", statements.cashflow_statement),
    ]
    for title, fetch in core_statements:
        print(f"\n[bold]{title}:[/bold]")
        print(fetch())
    # Show which period views the income statement supports
    print("\n[bold]Available Period Views for Income Statement:[/bold]")
    period_views = statements.get_period_views("IncomeStatement")
    for view in period_views:
        print(f"- {view['name']}: {view['description']}")
    # Render the income statement with the first available view
    if period_views:
        view_name = period_views[0]['name']
        print(f"\n[bold]Income Statement with {view_name} Period View:[/bold]")
        print(statements.income_statement(period_view=view_name))
    # Look for a three-period balance sheet view
    print("\n[bold]Three-Column Statement View (if available):[/bold]")
    bs_views = statements.get_period_views("BalanceSheet")
    three_year_view = next((v for v in bs_views if "Three" in v['name']), None)
    if three_year_view is None:
        print("[yellow]No three-period view available for this filing.[/yellow]")
    else:
        print(f"\n[bold]Balance Sheet with Three Periods ({three_year_view['name']}):[/bold]")
        print(f"Description: {three_year_view['description']}")
        print(statements.balance_sheet(period_view=three_year_view['name']))
    # Convert the income statement to a pandas DataFrame
    print("\n[bold]Converting to DataFrame:[/bold]")
    df = statements.to_dataframe("IncomeStatement")
    print(f"DataFrame shape: {df.shape}")
    print(df.head(3))
def example_with_real_filing():
    """
    Example using a real filing from SEC.
    Note: This requires internet access.
    """
    print("[bold]Example with Real Filing[/bold]")
    try:
        # Fetch a known Apple 10-K by accession number
        filing = Filing.get('0000320193-23-000077')  # Apple 10-K
        print(f"Retrieved filing: {filing.form} for {filing.company} ({filing.filing_date})")
        # Parse XBRL and wrap it in the Statements facade
        xbrl = XBRL.from_filing(filing)
        statements = Statements(xbrl)
        # Print a selection of entity metadata fields
        print("\n[bold]Entity Information:[/bold]")
        metadata_keys = ('entity_name', 'ticker', 'document_type', 'fiscal_year', 'fiscal_period')
        for key in metadata_keys:
            print(f"{key}: {xbrl.entity_info.get(key)}")
        # Render the balance sheet
        print("\n[bold]Balance Sheet:[/bold]")
        print(statements.balance_sheet())
    except Exception as e:
        print(f"[bold red]Error loading real filing: {str(e)}[/bold red]")
        print("[yellow]Note: This example requires internet access to fetch filings from SEC EDGAR.[/yellow]")
def standardized_statements_example():
    """
    Demonstrates the use of standardized concept labels.
    """
    # Parse the local sample XBRL files shipped next to this script
    sample_dir = Path(__file__).parent / "aapl"
    xbrl = XBRL.from_directory(sample_dir)
    statements = Statements(xbrl)
    # Show each core statement with original, then standardized, labels
    for title, fetch in (
        ("Income Statement", statements.income_statement),
        ("Balance Sheet", statements.balance_sheet),
    ):
        print(f"\n[bold]{title} (Original Labels):[/bold]")
        print(fetch())
        print(f"\n[bold]{title} (Standardized Labels):[/bold]")
        print(fetch(standard=True))
    # Combine standardization with a specific period view, when one exists
    period_views = statements.get_period_views("BalanceSheet")
    if period_views:
        view_name = period_views[0]['name']
        print(f"\n[bold]Balance Sheet ({view_name}) with Standardized Labels:[/bold]")
        print(statements.balance_sheet(period_view=view_name, standard=True))
    # Compare DataFrames built with and without standardization
    print("\n[bold]Converting to DataFrame with Standardized Labels:[/bold]")
    print("\n[bold]Original DataFrame:[/bold]")
    df_orig = statements.to_dataframe("IncomeStatement", standard=False)
    if not df_orig.empty:
        print(f"DataFrame shape: {df_orig.shape}")
        print(df_orig[['concept', 'label']].head(3))
    print("\n[bold]Standardized DataFrame:[/bold]")
    df_std = statements.to_dataframe("IncomeStatement", standard=True)
    if not df_std.empty:
        print(f"DataFrame shape: {df_std.shape}")
        # Standardized frames may carry the pre-standardization label too
        columns = ['concept', 'label', 'original_label'] if 'original_label' in df_std.columns else ['concept', 'label']
        print(df_std[columns].head(3))
if __name__ == "__main__":
    console = Console()
    print("[bold cyan]XBRL2 Module Examples[/bold cyan]")
    print("[yellow]Choose an example to run:[/yellow]")
    print("1. Render Financial Statements (Direct XBRL API)")
    print("2. Using Statements API (User-friendly API)")
    print("3. Example with Real Filing (Requires Internet)")
    print("4. Standardized Statements (Concept Standardization)")
    print("5. Run All Examples")
    # Dispatch table keeps the menu handling flat instead of an elif ladder.
    examples = {
        "1": render_financial_statements,
        "2": using_statements_api,
        "3": example_with_real_filing,
        "4": standardized_statements_example,
    }
    try:
        choice = input("\nEnter your choice (1-5): ")
        if choice in examples:
            examples[choice]()
        elif choice == "5":
            print("\n[bold]Running All Examples[/bold]\n")
            all_examples = [
                ("Example 1: Render Financial Statements", render_financial_statements),
                ("Example 2: Using Statements API", using_statements_api),
                ("Example 3: Example with Real Filing", example_with_real_filing),
                ("Example 4: Standardized Statements", standardized_statements_example),
            ]
            for index, (title, example) in enumerate(all_examples):
                print(f"\n[bold cyan]{title}[/bold cyan]\n")
                example()
                # A separator follows every example except the last one.
                if index < len(all_examples) - 1:
                    print("\n" + "-" * 80 + "\n")
        else:
            print("[bold red]Invalid choice. Please run the script again and select a valid option.[/bold red]")
    except KeyboardInterrupt:
        print("\n[yellow]Examples cancelled by user.[/yellow]")
    except Exception as e:
        print(f"[bold red]Error running examples: {str(e)}[/bold red]")

View File

@@ -0,0 +1,36 @@
"""
XBRL-specific exceptions.
"""
from dataclasses import dataclass
from typing import List
@dataclass
class StatementNotFound(Exception):
    """Raised when a statement cannot be resolved with sufficient confidence."""
    statement_type: str
    confidence: float
    found_statements: List[str]
    entity_name: str = "Unknown"
    cik: str = "Unknown"
    period_of_report: str = "Unknown"
    reason: str = ""

    def __str__(self):
        # Build the message as "."-joined segments: header, confidence,
        # candidate statements, and (optionally) a free-form reason.
        segments = [
            f"Failed to resolve {self.statement_type} for {self.entity_name} "
            f"(CIK: {self.cik}, Period: {self.period_of_report})"
        ]
        if self.confidence > 0:
            segments.append(f"Low confidence match: {self.confidence:.2f}")
        else:
            segments.append("No matching statements found")
        if self.found_statements:
            segments.append(f"Found statements: {self.found_statements}")
        else:
            segments.append("No statements available")
        if self.reason:
            segments.append(self.reason)
        return ". ".join(segments)

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,313 @@
"""
Data models for XBRL parsing.
This module defines the core data structures used throughout the XBRL parser.
"""
from typing import Any, Dict, List, Optional, Union
from pydantic import BaseModel, Field
# Constants for label roles
STANDARD_LABEL = "http://www.xbrl.org/2003/role/label"
TERSE_LABEL = "http://www.xbrl.org/2003/role/terseLabel"
PERIOD_START_LABEL = "http://www.xbrl.org/2003/role/periodStartLabel"
PERIOD_END_LABEL = "http://www.xbrl.org/2003/role/periodEndLabel"
TOTAL_LABEL = "http://www.xbrl.org/2003/role/totalLabel"


def select_display_label(
    labels: Dict[str, str],
    preferred_label: Optional[str] = None,
    standard_label: Optional[str] = None,
    element_id: Optional[str] = None,
    element_name: Optional[str] = None
) -> str:
    """
    Pick the best display label for an element, standardizing it when possible.

    Priority order for the raw label:
      1. The preferred label role from the presentation linkbase, if available.
      2. The terse label (usually the most user-friendly).
      3. The supplied standard label content.
      4. The standard label role looked up directly in *labels*.
      5. Any label present in *labels*.
      6. The element name.
      7. The element ID (or "" when nothing else is available).

    When *element_id* is provided, the chosen label may be replaced by a
    standardized concept label so the same concept reads consistently
    across companies.

    Args:
        labels: Available labels keyed by label role URI.
        preferred_label: Role of the preferred label (from the presentation linkbase).
        standard_label: The standard label content, if already resolved.
        element_id: Element identifier (fallback and standardization key).
        element_name: Element name (alternative fallback).

    Returns:
        The selected label, with standardization applied when available.
    """
    def _raw_label() -> str:
        # Walk the priority ladder; the first rung that applies wins.
        if preferred_label and labels and preferred_label in labels:
            return labels[preferred_label]
        if labels and TERSE_LABEL in labels:
            return labels[TERSE_LABEL]
        if standard_label:
            return standard_label
        if labels and STANDARD_LABEL in labels:
            return labels[STANDARD_LABEL]
        if labels:
            return next(iter(labels.values()), "")
        if element_name:
            return element_name
        return element_id or ""

    chosen = _raw_label()

    # Map the chosen label to a standardized concept label when possible.
    if element_id and chosen:
        try:
            from edgar.xbrl.standardization.core import initialize_default_mappings

            # The mapping store is cached on the function object after first use.
            if not hasattr(select_display_label, '_mapping_store'):
                select_display_label._mapping_store = initialize_default_mappings(read_only=True)
            standardized = select_display_label._mapping_store.get_standard_concept(element_id)
            if standardized:
                return standardized
        except ImportError:
            # Standardization package unavailable; keep the selected label.
            pass
        except Exception:
            # Never let a standardization failure break label selection.
            pass

    return chosen
class ElementCatalog:
    """
    A catalog entry describing a single XBRL element and its metadata.

    This is the base data structure for element metadata as described in
    the design document.

    Attributes:
        name: The element name (e.g., "us-gaap_NetIncome").
        data_type: The element's data type (e.g., "monetary", "string").
        period_type: The element's period type ("instant" or "duration").
        balance: The balance type ("debit", "credit", or None).
        abstract: Whether the element is abstract.
        labels: Labels for the element, keyed by label role URI.
    """

    def __init__(self,
                 name: str,
                 data_type: str,
                 period_type: str,
                 balance: Optional[str] = None,
                 abstract: bool = False,
                 labels: Optional[Dict[str, str]] = None
                 ):
        self.name = name
        self.data_type = data_type
        self.period_type = period_type
        self.balance = balance
        self.abstract = abstract
        # A supplied labels dict is stored as-is (shared, not copied);
        # only a missing value is replaced with a fresh empty dict.
        self.labels = {} if labels is None else labels

    def __str__(self) -> str:
        # The element name doubles as the entry's string form.
        return self.name
class Context(BaseModel):
    """
    An XBRL context defining entity, period, and dimensional information.

    This corresponds to the Context Registry in the design document.
    """
    context_id: str
    entity: Dict[str, Any] = Field(default_factory=dict)
    period: Dict[str, Any] = Field(default_factory=dict)
    dimensions: Dict[str, str] = Field(default_factory=dict)

    @property
    def period_string(self) -> str:
        """Human-readable description of this context's period."""
        kind = self.period.get('type')
        if kind == 'instant':
            return f"As of {self.period.get('instant')}"
        if kind == 'duration':
            return f"From {self.period.get('startDate')} to {self.period.get('endDate')}"
        # Anything else is treated as a "forever" period.
        return "Forever"
class Fact(BaseModel):
    """
    An XBRL fact with value and references to context, unit, and element.
    This corresponds to the Fact Database in the design document.
    The instance_id field is used to differentiate between duplicate facts
    that share the same element_id and context_ref. When a fact has no
    duplicates, instance_id will be None.
    The fact_id field preserves the original id attribute from the XML element,
    enabling linkage with footnotes.
    """
    element_id: str  # Identifier of the element this fact reports
    context_ref: str  # ID of the context giving entity/period/dimensions
    value: str  # Raw value as it appears in the instance document
    unit_ref: Optional[str] = None  # Unit reference, when the fact has a unit
    decimals: Optional[Union[int, str]] = None  # int or "INF"
    numeric_value: Optional[float] = None  # Parsed numeric value, when applicable
    footnotes: List[str] = Field(default_factory=list)  # IDs of linked footnotes
    instance_id: Optional[int] = None  # Disambiguates duplicate element/context pairs
    fact_id: Optional[str] = None  # Original id attribute from the XML
class Footnote(BaseModel):
    """
    Represents an XBRL footnote with its text content and related facts.
    Footnotes are linked to facts via footnoteArc elements that connect
    fact IDs to footnote IDs using xlink:from and xlink:to attributes.
    """
    footnote_id: str  # ID referenced by footnote arcs
    text: str  # Footnote text content
    lang: Optional[str] = "en-US"  # Language code of the footnote text
    role: Optional[str] = None  # Footnote role URI, if any
    related_fact_ids: List[str] = Field(default_factory=list)  # fact_id values this footnote annotates
class PresentationNode(BaseModel):
    """
    A node in the presentation hierarchy.
    This corresponds to the Presentation Node in the design document.
    """
    element_id: str  # Element this node presents
    parent: Optional[str] = None  # Parent element_id (None for root nodes)
    children: List[str] = Field(default_factory=list)  # Child element_ids
    order: float = 0.0  # Sort order among siblings
    preferred_label: Optional[str] = None  # Preferred label role from the presentation linkbase
    depth: int = 0  # Nesting depth within the tree
    # Additional information linked from element catalog
    element_name: Optional[str] = None
    standard_label: Optional[str] = None
    is_abstract: bool = False  # Whether the underlying element is abstract
    labels: Dict[str, str] = Field(default_factory=dict)  # Labels keyed by label role URI

    @property
    def display_label(self) -> str:
        """
        Return the appropriate label for display, prioritizing user-friendly options.
        Delegates to select_display_label; its priority order is:
        1. Preferred label (if specified in presentation linkbase)
        2. Terse label (for more concise display)
        3. Label (standard label)
        4. Element ID (fallback)
        """
        return select_display_label(
            labels=self.labels,
            standard_label=self.standard_label,
            preferred_label=self.preferred_label,
            element_id=self.element_id
        )
class PresentationTree(BaseModel):
    """
    A presentation tree for a specific role.
    This corresponds to the Presentation Hierarchy in the design document.
    """
    role_uri: str  # Extended link role URI this tree belongs to
    definition: str  # Human-readable role definition
    root_element_id: str  # element_id of the tree's root node
    all_nodes: Dict[str, PresentationNode] = Field(default_factory=dict)  # Nodes keyed by element_id
    order: int = 0  # Ordering of this tree relative to other trees
class CalculationNode(BaseModel):
    """
    A node in the calculation hierarchy.
    This corresponds to the Calculation Node in the design document.
    """
    element_id: str  # Element this node represents
    children: List[str] = Field(default_factory=list)  # Contributing child element_ids
    parent: Optional[str] = None  # Parent element_id (None for root nodes)
    weight: float = 1.0  # Calculation weight applied to this node's contribution
    order: float = 0.0  # Sort order among siblings
    # Information linked from schema
    balance_type: Optional[str] = None  # "debit", "credit", or None
    period_type: Optional[str] = None  # "instant" or "duration"
class CalculationTree(BaseModel):
    """
    A calculation tree for a specific role.
    This corresponds to the Calculation Network in the design document.
    """
    role_uri: str  # Extended link role URI this tree belongs to
    definition: str  # Human-readable role definition
    root_element_id: str  # element_id of the tree's root node
    all_nodes: Dict[str, CalculationNode] = Field(default_factory=dict)  # Nodes keyed by element_id
class Axis(BaseModel):
    """
    A dimensional axis (dimension) in XBRL.
    This corresponds to the Axis (Dimension) in the design document.
    """
    element_id: str  # Element identifier of the axis
    label: str  # Display label for the axis
    domain_id: Optional[str] = None  # element_id of the axis's domain
    default_member_id: Optional[str] = None  # Member used when none is specified
    is_typed_dimension: bool = False  # True for typed (rather than explicit) dimensions
    typed_domain_ref: str = ""  # typedDomainRef value for typed dimensions
class Domain(BaseModel):
    """
    A domain in an XBRL dimensional structure.
    This corresponds to the Domain in the design document.
    """
    element_id: str  # Element identifier of the domain
    label: str  # Display label for the domain
    members: List[str] = Field(default_factory=list)  # List of domain member element IDs
    parent: Optional[str] = None  # Parent domain element ID
class Table(BaseModel):
    """
    A dimensional table (hypercube) in XBRL.
    This corresponds to the Table (Hypercube) in the design document.
    """
    element_id: str  # Element identifier of the table
    label: str  # Display label for the table
    role_uri: str  # Extended link role URI the table belongs to
    axes: List[str] = Field(default_factory=list)  # List of axis element IDs
    line_items: List[str] = Field(default_factory=list)  # List of line item element IDs
    closed: bool = False  # Whether the hypercube is closed
    context_element: str = "segment"  # Context element the dimensions appear in ("segment" or "scenario")
class XBRLProcessingError(Exception):
    """
    Exception raised for errors during XBRL processing.

    Raised by parser components when XBRL content or files cannot be parsed.
    """
    pass

View File

@@ -0,0 +1,27 @@
"""
XBRL Parser Components.
This package provides specialized parser components for different aspects
of XBRL document processing. Each parser handles a specific responsibility
in the XBRL parsing workflow.
"""
from .base import BaseParser
from .calculation import CalculationParser
from .coordinator import XBRLParser
from .definition import DefinitionParser
from .instance import InstanceParser
from .labels import LabelsParser
from .presentation import PresentationParser
from .schema import SchemaParser
__all__ = [
'BaseParser',
'XBRLParser',
'SchemaParser',
'LabelsParser',
'PresentationParser',
'CalculationParser',
'DefinitionParser',
'InstanceParser',
]

View File

@@ -0,0 +1,148 @@
"""
Base parser functionality for XBRL parsing components.
This module provides common utilities and base functionality shared across
all XBRL parser components.
"""
from typing import Any, Dict, Optional, Union

from lxml import etree as ET

from edgar.core import log
from edgar.xbrl.core import NAMESPACES
class BaseParser:
    """Base class for XBRL parser components with common functionality."""

    def __init__(self):
        """Initialize base parser with common data structures."""
        # Common namespaces and utilities available to all parsers
        self.namespaces = NAMESPACES

    def _safe_parse_xml(self, content: Union[str, bytes]) -> ET.Element:
        """
        Safely parse XML content with lxml, handling encoding declarations properly.

        Args:
            content: XML content as string or bytes.

        Returns:
            Parsed XML root element.
        """
        # recover=True lets lxml tolerate minor malformations in filings.
        parser = ET.XMLParser(remove_blank_text=True, recover=True)
        # Convert to bytes for safer parsing if needed (lxml rejects str
        # input that carries an XML encoding declaration).
        if isinstance(content, str):
            content_bytes = content.encode('utf-8')
        else:
            content_bytes = content
        # Parse with lxml
        return ET.XML(content_bytes, parser)

    def _parse_order_attribute(self, arc) -> float:
        """
        Parse the order attribute from an arc element.

        Checks the xlink-namespaced order attribute first, then the plain
        order attribute; returns 0.0 when the value is missing or not a
        valid number.
        """
        order_value = arc.get('{http://www.w3.org/1999/xlink}order')
        if order_value is None:
            # Fallback to the unqualified order attribute
            order_value = arc.get('order')
        # Debug logging to understand what's in the XBRL document
        # (lazy %-style args avoid formatting cost when debug is disabled).
        if order_value is not None:
            log.debug("Found order attribute: %s", order_value)
        else:
            all_attrs = dict(arc.attrib) if hasattr(arc, 'attrib') else {}
            log.debug("No order attribute found. Available attributes: %s", all_attrs)
        try:
            return float(order_value) if order_value is not None else 0.0
        except (ValueError, TypeError):
            return 0.0

    def _extract_role_info(self, role_element) -> Dict[str, Any]:
        """
        Extract role information from a role element.

        Args:
            role_element: XML element containing role definition.

        Returns:
            Dictionary with 'uri' and 'definition' keys.
        """
        role_info = {}
        # Get role URI
        role_uri = role_element.get('roleURI', '')
        role_info['uri'] = role_uri
        # Extract role definition/label
        definition_elem = role_element.find('.//{http://www.xbrl.org/2003/linkbase}definition')
        if definition_elem is not None:
            role_info['definition'] = definition_elem.text or ''
        else:
            # Fallback: derive a readable definition from the role URI
            role_info['definition'] = role_uri.split('/')[-1].replace('_', ' ') if role_uri else ''
        return role_info

    def _get_element_namespace_and_name(self, element_id: str) -> tuple[str, str]:
        """
        Extract namespace and local name from an element ID.

        Args:
            element_id: Element identifier (may include namespace prefix).

        Returns:
            Tuple of (namespace, local_name). Unprefixed IDs yield an empty
            namespace string.
        """
        if ':' in element_id:
            prefix, local_name = element_id.split(':', 1)
            # Map common prefixes to namespaces.
            # NOTE(review): these URIs are pinned to specific taxonomy years
            # (2013-2024) and may not match older or newer filings — confirm.
            namespace_map = {
                'us-gaap': 'http://fasb.org/us-gaap/2024',
                'dei': 'http://xbrl.sec.gov/dei/2024',
                'invest': 'http://xbrl.sec.gov/invest/2013-01-31',
                'country': 'http://xbrl.sec.gov/country/2023',
                'currency': 'http://xbrl.sec.gov/currency/2023',
                'exch': 'http://xbrl.sec.gov/exch/2023',
                'naics': 'http://xbrl.sec.gov/naics/2023',
                'sic': 'http://xbrl.sec.gov/sic/2023',
                'stpr': 'http://xbrl.sec.gov/stpr/2023',
            }
            namespace = namespace_map.get(prefix, f'http://unknown.namespace/{prefix}')
            return namespace, local_name
        else:
            return '', element_id

    def _normalize_element_id(self, element_id: str) -> str:
        """
        Normalize element ID to a consistent format.

        Converts "prefix:name" identifiers to the canonical "prefix_name"
        form; IDs without a prefix are returned unchanged.

        Args:
            element_id: Original element identifier.

        Returns:
            Normalized element identifier.
        """
        if ':' in element_id:
            prefix, name = element_id.split(':', 1)
            return f"{prefix}_{name}"
        return element_id

    def _log_parsing_progress(self, component: str, count: int, total: Optional[int] = None):
        """
        Log parsing progress for debugging.

        Args:
            component: Name of component being parsed.
            count: Number of items processed.
            total: Total number of items, when known.
        """
        # 'is not None' (rather than truthiness) so a genuine total of 0
        # still uses the count/total form instead of silently dropping it.
        if total is not None:
            log.debug("Parsed %s/%s %s", count, total, component)
        else:
            log.debug("Parsed %s %s", count, component)

View File

@@ -0,0 +1,223 @@
"""
Calculation parser for XBRL documents.
This module handles parsing of XBRL calculation linkbases and building
calculation trees with weights for validation.
"""
from pathlib import Path
from typing import Any, Dict, List, Optional, Union
from edgar.xbrl.core import NAMESPACES, extract_element_id
from edgar.xbrl.models import CalculationNode, CalculationTree, ElementCatalog, Fact, XBRLProcessingError
from .base import BaseParser
class CalculationParser(BaseParser):
"""Parser for XBRL calculation linkbases."""
    def __init__(self, calculation_roles: Dict[str, Dict[str, Any]],
                 calculation_trees: Dict[str, CalculationTree],
                 element_catalog: Dict[str, ElementCatalog],
                 facts: Dict[str, Fact]):
        """
        Initialize calculation parser with data structure references.

        The dictionaries are shared, mutable references; parsing populates
        them in place rather than copying, so results are visible to the
        caller that owns them.

        Args:
            calculation_roles: Reference to calculation roles dictionary
            calculation_trees: Reference to calculation trees dictionary
            element_catalog: Reference to element catalog dictionary
            facts: Reference to facts dictionary
        """
        super().__init__()
        # Store references to data structures (not copies)
        self.calculation_roles = calculation_roles
        self.calculation_trees = calculation_trees
        self.element_catalog = element_catalog
        self.facts = facts
def parse_calculation(self, file_path: Union[str, Path]) -> None:
"""Parse calculation linkbase file and build calculation trees."""
try:
content = Path(file_path).read_text()
self.parse_calculation_content(content)
except Exception as e:
raise XBRLProcessingError(f"Error parsing calculation file {file_path}: {str(e)}") from e
def parse_calculation_content(self, content: str) -> None:
"""Parse calculation linkbase content and build calculation trees."""
try:
# Use safe XML parsing method
root = self._safe_parse_xml(content)
# Extract calculation links
calculation_links = root.findall('.//{http://www.xbrl.org/2003/linkbase}calculationLink')
for link in calculation_links:
role = link.get('{http://www.w3.org/1999/xlink}role')
if not role:
continue
# Store role information
role_id = role.split('/')[-1] if '/' in role else role
role_def = role_id.replace('_', ' ')
self.calculation_roles[role] = {
'roleUri': role,
'definition': role_def,
'roleId': role_id
}
# Extract arcs
arcs = link.findall('.//{http://www.xbrl.org/2003/linkbase}calculationArc')
# Create relationships list
relationships = []
for arc in arcs:
from_ref = arc.get('{http://www.w3.org/1999/xlink}from')
to_ref = arc.get('{http://www.w3.org/1999/xlink}to')
order = self._parse_order_attribute(arc)
weight = float(arc.get('weight', '1.0'))
if not from_ref or not to_ref:
continue
# Find locators for from/to references
from_loc = link.find(f'.//*[@{{{NAMESPACES["xlink"]}}}label="{from_ref}"]')
to_loc = link.find(f'.//*[@{{{NAMESPACES["xlink"]}}}label="{to_ref}"]')
if from_loc is None or to_loc is None:
continue
from_href = from_loc.get('{http://www.w3.org/1999/xlink}href')
to_href = to_loc.get('{http://www.w3.org/1999/xlink}href')
if not from_href or not to_href:
continue
# Extract element IDs
from_element = extract_element_id(from_href)
to_element = extract_element_id(to_href)
# Add relationship
relationships.append({
'from_element': from_element,
'to_element': to_element,
'order': order,
'weight': weight
})
# Build calculation tree for this role
if relationships:
self._build_calculation_tree(role, relationships)
except Exception as e:
raise XBRLProcessingError(f"Error parsing calculation content: {str(e)}") from e
def _build_calculation_tree(self, role: str, relationships: List[Dict[str, Any]]) -> None:
"""
Build a calculation tree from relationships.
Args:
role: Extended link role URI
relationships: List of relationships (from_element, to_element, order, weight)
"""
# Group relationships by source element
from_map = {}
to_map = {}
for rel in relationships:
from_element = rel['from_element']
to_element = rel['to_element']
if from_element not in from_map:
from_map[from_element] = []
from_map[from_element].append(rel)
if to_element not in to_map:
to_map[to_element] = []
to_map[to_element].append(rel)
# Find root elements (appear as 'from' but not as 'to')
root_elements = set(from_map.keys()) - set(to_map.keys())
if not root_elements:
return # No root elements found
# Create calculation tree
tree = CalculationTree(
role_uri=role,
definition=self.calculation_roles[role]['definition'],
root_element_id=next(iter(root_elements)),
all_nodes={}
)
# Build tree recursively
for root_id in root_elements:
self._build_calculation_subtree(root_id, None, from_map, tree.all_nodes)
# Add tree to collection
self.calculation_trees[role] = tree
def _build_calculation_subtree(self, element_id: str, parent_id: Optional[str],
from_map: Dict[str, List[Dict[str, Any]]],
all_nodes: Dict[str, CalculationNode]) -> None:
"""
Recursively build a calculation subtree.
Args:
element_id: Current element ID
parent_id: Parent element ID
from_map: Map of relationships by source element
all_nodes: Dictionary to store all nodes
"""
# Create node
node = CalculationNode(
element_id=element_id,
parent=parent_id,
children=[]
)
# Add element information if available
elem_info = None
if element_id in self.element_catalog:
elem_info = self.element_catalog[element_id]
else:
# Try alternative element ID formats (colon vs underscore)
alt_element_id = element_id.replace(':', '_') if ':' in element_id else element_id.replace('_', ':')
if alt_element_id in self.element_catalog:
elem_info = self.element_catalog[alt_element_id]
if elem_info:
node.balance_type = elem_info.balance
node.period_type = elem_info.period_type
# Add to collection
all_nodes[element_id] = node
# Process children
if element_id in from_map:
# Sort children by order
children = sorted(from_map[element_id], key=lambda r: r['order'])
for rel in children:
child_id = rel['to_element']
# Add child to parent's children list
node.children.append(child_id)
# Set weight
weight = rel['weight']
# Recursively build child subtree
self._build_calculation_subtree(
child_id, element_id, from_map, all_nodes
)
# Update weight and order after child is built
if child_id in all_nodes:
all_nodes[child_id].weight = weight
all_nodes[child_id].order = rel['order']

View File

@@ -0,0 +1,382 @@
"""
Shared XBRL concept definitions for balance types and deprecated normalization lists.
This module contains balance type mappings for common US-GAAP concepts to support
the balance column in DataFrame exports without parsing full taxonomy schemas.
DEPRECATED: Static normalization concept lists (CONSISTENT_POSITIVE_CONCEPTS,
LEGITIMATE_NEGATIVE_CONCEPTS) are kept for historical reference but no longer used.
Testing confirmed that SEC XBRL instance data is already consistent across companies.
See Issue #463 analysis for details.
"""
# =============================================================================
# DEPRECATED CONCEPT LISTS (No longer used as of Issue #463)
# =============================================================================
# These lists were created to work around perceived inconsistencies in XBRL data.
# Testing revealed that raw SEC instance data is ALREADY consistent across companies.
#
# Historical context:
# - Issues #290, #334, #451 reported negative values for expenses
# - Root cause: EdgarTools was misusing calculation weights for display logic
# - These lists fixed symptoms but not the actual problem
# - Issue #463 removed calculation weight application during parsing
# - Result: Raw values preserved as-is (matching SEC CompanyFacts API)
#
# Kept for historical reference and potential future use cases.
# =============================================================================
# DEPRECATED (Issue #463): no longer consulted at runtime; kept for reference.
# Each concept is listed in three spellings: 'us-gaap_...', 'us_gaap_...',
# and the bare local name, to match the ID formats seen in filings.
CONSISTENT_POSITIVE_CONCEPTS = {
    # Research and Development Expenses
    'us-gaap_ResearchAndDevelopmentExpense',
    'us_gaap_ResearchAndDevelopmentExpense',
    'ResearchAndDevelopmentExpense',
    # Selling, General & Administrative Expenses
    'us-gaap_SellingGeneralAndAdministrativeExpense',
    'us_gaap_SellingGeneralAndAdministrativeExpense',
    'SellingGeneralAndAdministrativeExpense',
    # General and Administrative Expenses (separate from SG&A)
    'us-gaap_GeneralAndAdministrativeExpense',
    'us_gaap_GeneralAndAdministrativeExpense',
    'GeneralAndAdministrativeExpense',
    # Selling Expenses
    'us-gaap_SellingExpense',
    'us_gaap_SellingExpense',
    'SellingExpense',
    # Marketing and Advertising Expenses
    'us-gaap_SellingAndMarketingExpense',
    'us_gaap_SellingAndMarketingExpense',
    'SellingAndMarketingExpense',
    'us-gaap_MarketingExpense',
    'us_gaap_MarketingExpense',
    'MarketingExpense',
    'us-gaap_AdvertisingExpense',
    'us_gaap_AdvertisingExpense',
    'AdvertisingExpense',
    # Share-based Compensation Expenses
    'us-gaap_AllocatedShareBasedCompensationExpense',
    'us_gaap_AllocatedShareBasedCompensationExpense',
    'AllocatedShareBasedCompensationExpense',
    'us-gaap_ShareBasedCompensationArrangementByShareBasedPaymentAwardExpenseRecognized',
    'us_gaap_ShareBasedCompensationArrangementByShareBasedPaymentAwardExpenseRecognized',
    'ShareBasedCompensationArrangementByShareBasedPaymentAwardExpenseRecognized',
    # Operating Expenses (general)
    'us-gaap_OperatingExpenses',
    'us_gaap_OperatingExpenses',
    'OperatingExpenses',
    # Professional Services Expenses
    'us-gaap_ProfessionalServiceFees',
    'us_gaap_ProfessionalServiceFees',
    'ProfessionalServiceFees',
    # Compensation and Benefits
    'us-gaap_LaborAndRelatedExpense',
    'us_gaap_LaborAndRelatedExpense',
    'LaborAndRelatedExpense',
    'us-gaap_EmployeeBenefitsExpense',
    'us_gaap_EmployeeBenefitsExpense',
    'EmployeeBenefitsExpense',
    # Cost of Revenue and Cost of Goods/Services Sold (Issue #290, #451)
    'us-gaap_CostOfRevenue',
    'us_gaap_CostOfRevenue',
    'CostOfRevenue',
    'us-gaap_CostOfGoodsAndServicesSold',
    'us_gaap_CostOfGoodsAndServicesSold',
    'CostOfGoodsAndServicesSold',
    'us-gaap_CostOfGoodsSold',
    'us_gaap_CostOfGoodsSold',
    'CostOfGoodsSold',
    'us-gaap_CostOfServices',
    'us_gaap_CostOfServices',
    'CostOfServices',
    # Income Tax Expense (Issue #451)
    'us-gaap_IncomeTaxExpenseBenefit',
    'us_gaap_IncomeTaxExpenseBenefit',
    'IncomeTaxExpenseBenefit',
    'us-gaap_IncomeTaxRecoveryExpense',
    'us_gaap_IncomeTaxRecoveryExpense',
    'IncomeTaxRecoveryExpense',
    # Cash Flow Statement - Financing Activities (cash outflows)
    # These represent uses of cash that should always be positive
    'us-gaap_PaymentsForRepurchaseOfCommonStock',
    'us_gaap_PaymentsForRepurchaseOfCommonStock',
    'PaymentsForRepurchaseOfCommonStock',
    'us-gaap_PaymentsOfDividends',
    'us_gaap_PaymentsOfDividends',
    'PaymentsOfDividends',
    'us-gaap_PaymentsOfDividendsCommonStock',
    'us_gaap_PaymentsOfDividendsCommonStock',
    'PaymentsOfDividendsCommonStock',
    'us-gaap_PaymentsOfDividendsPreferredStockAndPreferenceStock',
    'us_gaap_PaymentsOfDividendsPreferredStockAndPreferenceStock',
    'PaymentsOfDividendsPreferredStockAndPreferenceStock'
}

# DEPRECATED: Concepts that can legitimately be negative (e.g. net amounts,
# gains/losses, reversals). This list is no longer used but kept for
# historical reference.
LEGITIMATE_NEGATIVE_CONCEPTS = {
    # Interest expense/income that can be net negative
    'us-gaap_InterestIncomeExpenseNet',
    'us_gaap_InterestIncomeExpenseNet',
    'InterestIncomeExpenseNet',
    # Foreign exchange gains/losses
    'us-gaap_ForeignCurrencyTransactionGainLossBeforeTax',
    'us_gaap_ForeignCurrencyTransactionGainLossBeforeTax',
    'ForeignCurrencyTransactionGainLossBeforeTax',
    # Restructuring reversals/credits
    'us-gaap_RestructuringChargesAndReversals',
    'us_gaap_RestructuringChargesAndReversals',
    'RestructuringChargesAndReversals'
}
# US-GAAP Balance Type Mappings (Issue #463)
#
# This mapping provides balance types for common US-GAAP concepts to support
# the balance column in DataFrame exports without requiring full taxonomy parsing.
#
# Balance types:
# - "debit": Assets, Expenses (increase with debits, decrease with credits)
# - "credit": Liabilities, Equity, Revenue (increase with credits, decrease with debits)
#
# TODO: Eventually replace with full US-GAAP taxonomy parser that follows schema imports
#
US_GAAP_BALANCE_TYPES = {
    # ============================================================================
    # ASSETS (Balance: debit)
    # ============================================================================
    # Current Assets
    'us-gaap:Cash': 'debit',
    'Cash': 'debit',  # Short form
    'us-gaap:CashAndCashEquivalentsAtCarryingValue': 'debit',
    'CashAndCashEquivalentsAtCarryingValue': 'debit',  # Short form
    'us-gaap:CashEquivalentsAtCarryingValue': 'debit',
    'us-gaap:RestrictedCashAndCashEquivalents': 'debit',
    'us-gaap:MarketableSecurities': 'debit',
    'us-gaap:AvailableForSaleSecuritiesDebtSecurities': 'debit',
    'us-gaap:ShortTermInvestments': 'debit',
    'us-gaap:AccountsReceivableNetCurrent': 'debit',
    'us-gaap:AccountsReceivableGrossCurrent': 'debit',
    'us-gaap:Inventory': 'debit',
    'us-gaap:InventoryNet': 'debit',
    'us-gaap:PrepaidExpenseAndOtherAssetsCurrent': 'debit',
    'us-gaap:DeferredTaxAssetsNetCurrent': 'debit',
    'us-gaap:OtherAssetsCurrent': 'debit',
    'us-gaap:AssetsCurrent': 'debit',
    # Non-Current Assets
    'us-gaap:PropertyPlantAndEquipmentNet': 'debit',
    'us-gaap:PropertyPlantAndEquipmentGross': 'debit',
    'us-gaap:Land': 'debit',
    'us-gaap:BuildingsAndImprovementsGross': 'debit',
    'us-gaap:MachineryAndEquipmentGross': 'debit',
    'us-gaap:Goodwill': 'debit',
    'us-gaap:IntangibleAssetsNetExcludingGoodwill': 'debit',
    'us-gaap:IntangibleAssetsGrossExcludingGoodwill': 'debit',
    'us-gaap:LongTermInvestments': 'debit',
    'us-gaap:DeferredTaxAssetsNetNoncurrent': 'debit',
    'us-gaap:OtherAssetsNoncurrent': 'debit',
    'us-gaap:AssetsNoncurrent': 'debit',
    'us-gaap:Assets': 'debit',
    'Assets': 'debit',  # Short form
    # ============================================================================
    # LIABILITIES (Balance: credit)
    # ============================================================================
    # Current Liabilities
    'us-gaap:AccountsPayableCurrent': 'credit',
    'us-gaap:AccruedLiabilitiesCurrent': 'credit',
    'us-gaap:DeferredRevenueCurrent': 'credit',
    'us-gaap:ContractWithCustomerLiabilityCurrent': 'credit',
    'us-gaap:ShortTermBorrowings': 'credit',
    'us-gaap:LongTermDebtCurrent': 'credit',
    'us-gaap:CommercialPaper': 'credit',
    'us-gaap:AccruedIncomeTaxesCurrent': 'credit',
    'us-gaap:DividendsPayableCurrent': 'credit',
    'us-gaap:OtherLiabilitiesCurrent': 'credit',
    'us-gaap:LiabilitiesCurrent': 'credit',
    # Non-Current Liabilities
    'us-gaap:LongTermDebtNoncurrent': 'credit',
    'us-gaap:LongTermDebtAndCapitalLeaseObligations': 'credit',
    'us-gaap:DeferredRevenueNoncurrent': 'credit',
    'us-gaap:DeferredTaxLiabilitiesNoncurrent': 'credit',
    'us-gaap:PensionAndOtherPostretirementDefinedBenefitPlansLiabilitiesNoncurrent': 'credit',
    'us-gaap:OtherLiabilitiesNoncurrent': 'credit',
    'us-gaap:LiabilitiesNoncurrent': 'credit',
    'us-gaap:Liabilities': 'credit',
    # ============================================================================
    # EQUITY (Balance: credit)
    # ============================================================================
    'us-gaap:CommonStockValue': 'credit',
    'us-gaap:CommonStockSharesIssued': 'credit',
    'us-gaap:CommonStockSharesOutstanding': 'credit',
    'us-gaap:PreferredStockValue': 'credit',
    'us-gaap:AdditionalPaidInCapital': 'credit',
    'us-gaap:AdditionalPaidInCapitalCommonStock': 'credit',
    'us-gaap:RetainedEarningsAccumulatedDeficit': 'credit',
    'us-gaap:TreasuryStockValue': 'debit',  # Contra-equity (debit balance)
    'us-gaap:AccumulatedOtherComprehensiveIncomeLossNetOfTax': 'credit',
    'us-gaap:StockholdersEquity': 'credit',
    'us-gaap:StockholdersEquityIncludingPortionAttributableToNoncontrollingInterest': 'credit',
    'us-gaap:LiabilitiesAndStockholdersEquity': 'credit',
    # ============================================================================
    # REVENUE (Balance: credit)
    # ============================================================================
    'us-gaap:Revenues': 'credit',
    'Revenues': 'credit',  # Short form
    'Revenue': 'credit',  # Short form (singular)
    'us-gaap:RevenueFromContractWithCustomerExcludingAssessedTax': 'credit',
    'RevenueFromContractWithCustomerExcludingAssessedTax': 'credit',  # Short form
    'us-gaap:RevenueFromContractWithCustomerIncludingAssessedTax': 'credit',
    'RevenueFromContractWithCustomerIncludingAssessedTax': 'credit',  # Short form
    'us-gaap:SalesRevenueNet': 'credit',
    'us-gaap:SalesRevenueGoodsNet': 'credit',
    'us-gaap:SalesRevenueServicesNet': 'credit',
    'us-gaap:InterestAndDividendIncomeOperating': 'credit',
    'us-gaap:InterestIncomeOther': 'credit',
    'us-gaap:InvestmentIncomeInterest': 'credit',
    'us-gaap:GainLossOnSaleOfPropertyPlantEquipment': 'credit',
    'us-gaap:GainLossOnInvestments': 'credit',
    'us-gaap:OtherNonoperatingIncomeExpense': 'credit',
    # ============================================================================
    # EXPENSES & COSTS (Balance: debit)
    # ============================================================================
    # Cost of Revenue
    'us-gaap:CostOfRevenue': 'debit',
    'us-gaap:CostOfGoodsAndServicesSold': 'debit',
    'us-gaap:CostOfGoodsSold': 'debit',
    'us-gaap:CostOfServices': 'debit',
    # Operating Expenses
    'us-gaap:ResearchAndDevelopmentExpense': 'debit',
    'us-gaap:SellingGeneralAndAdministrativeExpense': 'debit',
    'us-gaap:GeneralAndAdministrativeExpense': 'debit',
    'us-gaap:SellingExpense': 'debit',
    'us-gaap:SellingAndMarketingExpense': 'debit',
    'us-gaap:MarketingExpense': 'debit',
    'us-gaap:AdvertisingExpense': 'debit',
    'us-gaap:DepreciationDepletionAndAmortization': 'debit',
    'us-gaap:Depreciation': 'debit',
    'us-gaap:AmortizationOfIntangibleAssets': 'debit',
    'us-gaap:RestructuringCharges': 'debit',
    'us-gaap:AssetImpairmentCharges': 'debit',
    'us-gaap:ShareBasedCompensation': 'debit',
    # Other Expenses
    'us-gaap:InterestExpense': 'debit',
    'us-gaap:InterestExpenseDebt': 'debit',
    'us-gaap:IncomeTaxExpenseBenefit': 'debit',
    'us-gaap:ProvisionForDoubtfulAccounts': 'debit',
    # ============================================================================
    # INCOME & TOTALS (Balance: credit)
    # ============================================================================
    'us-gaap:GrossProfit': 'credit',
    'us-gaap:OperatingIncomeLoss': 'credit',
    'us-gaap:IncomeLossFromContinuingOperationsBeforeIncomeTaxesExtraordinaryItemsNoncontrollingInterest': 'credit',
    'us-gaap:IncomeLossFromContinuingOperations': 'credit',
    'us-gaap:NetIncomeLoss': 'credit',
    'us-gaap:NetIncomeLossAvailableToCommonStockholdersBasic': 'credit',
    'us-gaap:NetIncomeLossAvailableToCommonStockholdersDiluted': 'credit',
    'us-gaap:ComprehensiveIncomeNetOfTax': 'credit',
    # ============================================================================
    # CASH FLOW STATEMENT
    # ============================================================================
    # Operating Activities
    'us-gaap:NetCashProvidedByUsedInOperatingActivities': 'debit',
    'us-gaap:DepreciationAndAmortization': 'debit',
    'us-gaap:ShareBasedCompensationArrangementByShareBasedPaymentAwardExpenseRecognized': 'debit',
    'us-gaap:DeferredIncomeTaxExpenseBenefit': 'debit',
    # Investing Activities
    'us-gaap:NetCashProvidedByUsedInInvestingActivities': 'debit',
    'us-gaap:PaymentsToAcquirePropertyPlantAndEquipment': 'credit',  # Cash outflow
    'us-gaap:PaymentsToAcquireBusinessesNetOfCashAcquired': 'credit',  # Cash outflow
    'us-gaap:PaymentsToAcquireMarketableSecurities': 'credit',  # Cash outflow
    'us-gaap:ProceedsFromSaleOfPropertyPlantAndEquipment': 'debit',  # Cash inflow
    'us-gaap:ProceedsFromSaleOfAvailableForSaleSecuritiesDebt': 'debit',  # Cash inflow
    # Financing Activities
    'us-gaap:NetCashProvidedByUsedInFinancingActivities': 'debit',
    'us-gaap:ProceedsFromIssuanceOfCommonStock': 'debit',  # Cash inflow
    'us-gaap:ProceedsFromIssuanceOfLongTermDebt': 'debit',  # Cash inflow
    'us-gaap:RepaymentsOfLongTermDebt': 'credit',  # Cash outflow
    'us-gaap:PaymentsOfDividends': 'credit',  # Cash outflow
    'us-gaap:PaymentsOfDividendsCommonStock': 'credit',  # Cash outflow
    'us-gaap:PaymentsForRepurchaseOfCommonStock': 'credit',  # Cash outflow
}


def get_balance_type(concept: str) -> "str | None":
    """
    Get the balance type for a concept.

    Looks up the balance type from the static US-GAAP mapping, handling
    both colon and underscore namespace separators.

    Args:
        concept: The concept name (e.g., 'us-gaap:Revenues' or 'us-gaap_Revenues'
            or 'us_gaap_Revenues')

    Returns:
        Balance type ('debit' or 'credit'), or None if the concept is not
        in the mapping.

    Example:
        >>> get_balance_type('us-gaap:Cash')
        'debit'
        >>> get_balance_type('us_gaap_Revenues')
        'credit'
        >>> get_balance_type('UnknownConcept') is None
        True
    """
    # Try direct lookup first (standard form)
    if concept in US_GAAP_BALANCE_TYPES:
        return US_GAAP_BALANCE_TYPES[concept]

    # Normalize to standard form: us-gaap:LocalName
    # Handle common namespace prefix variations:
    #   us_gaap_Cash -> us-gaap:Cash
    #   us-gaap_Cash -> us-gaap:Cash
    normalized = concept
    if 'us_gaap' in normalized:
        normalized = normalized.replace('us_gaap_', 'us-gaap:')
        normalized = normalized.replace('us_gaap:', 'us-gaap:')
    elif 'us-gaap' in normalized:
        normalized = normalized.replace('us-gaap_', 'us-gaap:')

    # Try normalized form
    if normalized in US_GAAP_BALANCE_TYPES:
        return US_GAAP_BALANCE_TYPES[normalized]

    # Try converting all underscores to colons (simple fallback)
    concept_all_colons = concept.replace('_', ':')
    if concept_all_colons in US_GAAP_BALANCE_TYPES:
        return US_GAAP_BALANCE_TYPES[concept_all_colons]

    return None

View File

@@ -0,0 +1,291 @@
"""
XBRL Parser Coordinator.
This module provides the main XBRLParser class that coordinates parsing
workflow across all specialized parser components while maintaining
API compatibility with the original monolithic parser.
"""
from pathlib import Path
from typing import Any, Dict, List, Optional, Union
from edgar.core import log
from edgar.xbrl.models import (
Axis,
CalculationTree,
Context,
Domain,
ElementCatalog,
Fact,
PresentationTree,
Table,
XBRLProcessingError,
)
from .calculation import CalculationParser
from .definition import DefinitionParser
from .instance import InstanceParser
from .labels import LabelsParser
from .presentation import PresentationParser
from .schema import SchemaParser
class XBRLParser:
    """
    Coordinated XBRL parser that delegates to specialized component parsers.

    This class maintains full API compatibility with the original monolithic
    XBRLParser while providing improved maintainability through component
    separation. All component parsers share this object's data structure
    dictionaries and populate them in place.
    """

    def __init__(self):
        """Initialize the coordinated XBRL parser with all data structures."""
        # Core data structures
        self.element_catalog: Dict[str, ElementCatalog] = {}
        self.contexts: Dict[str, Context] = {}
        self.facts: Dict[str, Fact] = {}
        self.units: Dict[str, Any] = {}
        self.footnotes: Dict[str, Any] = {}

        # Presentation structures
        self.presentation_roles: Dict[str, Dict[str, Any]] = {}
        self.presentation_trees: Dict[str, PresentationTree] = {}

        # Calculation structures
        self.calculation_roles: Dict[str, Dict[str, Any]] = {}
        self.calculation_trees: Dict[str, CalculationTree] = {}

        # Definition (dimensional) structures
        self.definition_roles: Dict[str, Dict[str, Any]] = {}
        self.tables: Dict[str, List[Table]] = {}
        self.axes: Dict[str, Axis] = {}
        self.domains: Dict[str, Domain] = {}

        # Entity information
        self.entity_info: Dict[str, Any] = {}
        self.dei_facts: Dict[str, Fact] = {}

        # Reporting periods
        self.reporting_periods: List[Dict[str, Any]] = []

        # Mapping of context IDs to period identifiers for easy lookup
        self.context_period_map: Dict[str, str] = {}

        # Initialize component parsers
        self._init_parsers()

    def _init_parsers(self):
        """Initialize all component parsers with shared data structures.

        Each component parser receives references to this object's
        dictionaries, so everything they parse is visible here directly.
        """
        self.schema_parser = SchemaParser(
            element_catalog=self.element_catalog
        )

        self.labels_parser = LabelsParser(
            element_catalog=self.element_catalog
        )

        self.presentation_parser = PresentationParser(
            presentation_roles=self.presentation_roles,
            presentation_trees=self.presentation_trees,
            element_catalog=self.element_catalog
        )

        self.calculation_parser = CalculationParser(
            calculation_roles=self.calculation_roles,
            calculation_trees=self.calculation_trees,
            element_catalog=self.element_catalog,
            facts=self.facts
        )

        self.definition_parser = DefinitionParser(
            definition_roles=self.definition_roles,
            tables=self.tables,
            axes=self.axes,
            domains=self.domains,
            element_catalog=self.element_catalog
        )

        self.instance_parser = InstanceParser(
            contexts=self.contexts,
            facts=self.facts,
            units=self.units,
            footnotes=self.footnotes,
            calculation_trees=self.calculation_trees,
            entity_info=self.entity_info,
            reporting_periods=self.reporting_periods,
            context_period_map=self.context_period_map
        )

        # Set up cross-references so the schema parser can dispatch
        # embedded linkbases to the appropriate component parser
        self.schema_parser.set_linkbase_parsers(
            labels_parser=self.labels_parser,
            presentation_parser=self.presentation_parser,
            calculation_parser=self.calculation_parser,
            definition_parser=self.definition_parser
        )

    def _create_normalized_fact_key(self, element_id: str, context_ref: str, instance_id: Optional[int] = None) -> str:
        """
        Create a normalized fact key using underscore format.

        Args:
            element_id: The element ID
            context_ref: The context reference
            instance_id: Optional instance ID for duplicate facts

        Returns:
            Normalized key in format: element_id_context_ref[_instance_id]
        """
        return self.instance_parser._create_normalized_fact_key(element_id, context_ref, instance_id)

    def get_facts_by_key(self, element_id: str, context_ref: str) -> List[Fact]:
        """Get all facts matching the given element ID and context reference.

        This method handles both single facts and duplicate facts using the
        hybrid storage approach. For single facts, it returns a list with one
        fact. For duplicates, it returns all instances.

        Args:
            element_id: The element ID to look up
            context_ref: The context reference

        Returns:
            List of matching facts (empty if none found)
        """
        # Create base key for lookup
        base_key = self._create_normalized_fact_key(element_id, context_ref)

        # Check if single fact exists
        if base_key in self.facts:
            return [self.facts[base_key]]

        # Check for duplicate facts: instance IDs are assigned consecutively
        # from 0, so scan until the first missing key.
        matching_facts = []
        instance_id = 0
        while True:
            instance_key = self._create_normalized_fact_key(element_id, context_ref, instance_id)
            if instance_key in self.facts:
                matching_facts.append(self.facts[instance_key])
                instance_id += 1
            else:
                break
        return matching_facts

    def get_fact(self, element_id: str, context_ref: str) -> Optional[Fact]:
        """Get a single fact by element ID and context reference.

        Returns the first fact if multiple instances exist.

        Args:
            element_id: The element ID to look up
            context_ref: The context reference

        Returns:
            The fact if found, None otherwise
        """
        facts = self.get_facts_by_key(element_id, context_ref)
        return facts[0] if facts else None

    def parse_directory(self, directory_path: Union[str, Path]) -> None:
        """
        Parse all XBRL files in a directory.

        Files are parsed in dependency order: schemas first (element catalog),
        then linkbases, then instance documents.

        Args:
            directory_path: Path to directory containing XBRL files

        Raises:
            XBRLProcessingError: If the directory does not exist or a file
                fails to parse.
        """
        directory = Path(directory_path)
        # Validate before entering the try block so a missing directory raises
        # a clean "Directory not found" error instead of being caught below
        # and re-wrapped in a second, noisier XBRLProcessingError.
        if not directory.is_dir():
            raise XBRLProcessingError(f"Directory not found: {directory_path}")

        try:
            log.debug(f"Parsing XBRL directory: {directory}")

            # Parse schema files first to build element catalog
            schema_files = list(directory.glob('*.xsd'))
            for schema_file in schema_files:
                log.debug(f"Parsing schema: {schema_file}")
                self.schema_parser.parse_schema(schema_file)

            # Parse linkbase files
            linkbase_patterns = [
                ('*_lab.xml', self.labels_parser.parse_labels),
                ('*_pre.xml', self.presentation_parser.parse_presentation),
                ('*_cal.xml', self.calculation_parser.parse_calculation),
                ('*_def.xml', self.definition_parser.parse_definition),
            ]

            for pattern, parser_method in linkbase_patterns:
                linkbase_files = list(directory.glob(pattern))
                for linkbase_file in linkbase_files:
                    log.debug(f"Parsing linkbase: {linkbase_file}")
                    parser_method(linkbase_file)

            # Parse instance files last (they depend on schemas and linkbases)
            instance_files = list(directory.glob('*.xml'))
            # Filter out linkbase files
            instance_files = [f for f in instance_files if not any(
                f.name.endswith(suffix) for suffix in ['_lab.xml', '_pre.xml', '_cal.xml', '_def.xml']
            )]

            for instance_file in instance_files:
                log.debug(f"Parsing instance: {instance_file}")
                self.instance_parser.parse_instance(instance_file)

            log.info(f"Successfully parsed XBRL directory with {len(self.facts)} facts")
        except Exception as e:
            raise XBRLProcessingError(f"Error parsing directory {directory_path}: {str(e)}") from e

    # Delegate methods to component parsers for API compatibility

    def parse_schema(self, file_path: Union[str, Path]) -> None:
        """Parse schema file and extract element information."""
        return self.schema_parser.parse_schema(file_path)

    def parse_schema_content(self, content: str) -> None:
        """Parse schema content and extract element information."""
        return self.schema_parser.parse_schema_content(content)

    def parse_labels(self, file_path: Union[str, Path]) -> None:
        """Parse label linkbase file and extract label information."""
        return self.labels_parser.parse_labels(file_path)

    def parse_labels_content(self, content: str) -> None:
        """Parse label linkbase content and extract label information."""
        return self.labels_parser.parse_labels_content(content)

    def parse_presentation(self, file_path: Union[str, Path]) -> None:
        """Parse presentation linkbase file and build presentation trees."""
        return self.presentation_parser.parse_presentation(file_path)

    def parse_presentation_content(self, content: str) -> None:
        """Parse presentation linkbase content and build presentation trees."""
        return self.presentation_parser.parse_presentation_content(content)

    def parse_calculation(self, file_path: Union[str, Path]) -> None:
        """Parse calculation linkbase file and build calculation trees."""
        return self.calculation_parser.parse_calculation(file_path)

    def parse_calculation_content(self, content: str) -> None:
        """Parse calculation linkbase content and build calculation trees."""
        return self.calculation_parser.parse_calculation_content(content)

    def parse_definition(self, file_path: Union[str, Path]) -> None:
        """Parse definition linkbase file and build dimensional structures."""
        return self.definition_parser.parse_definition(file_path)

    def parse_definition_content(self, content: str) -> None:
        """Parse definition linkbase content and build dimensional structures."""
        return self.definition_parser.parse_definition_content(content)

    def parse_instance(self, file_path: Union[str, Path]) -> None:
        """Parse instance document file and extract contexts, facts, and units."""
        return self.instance_parser.parse_instance(file_path)

    def parse_instance_content(self, content: str) -> None:
        """Parse instance document content and extract contexts, facts, and units."""
        return self.instance_parser.parse_instance_content(content)

    def count_facts(self, content: str) -> tuple:
        """Count the number of facts in the instance document."""
        return self.instance_parser.count_facts(content)

View File

@@ -0,0 +1,235 @@
"""
Definition parser for XBRL documents.
This module handles parsing of XBRL definition linkbases and building
dimensional structures like tables, axes, and domains.
"""
from pathlib import Path
from typing import Any, Dict, List, Union
from edgar.xbrl.core import NAMESPACES, STANDARD_LABEL, extract_element_id
from edgar.xbrl.models import Axis, Domain, ElementCatalog, Table, XBRLProcessingError
from .base import BaseParser
class DefinitionParser(BaseParser):
"""Parser for XBRL definition linkbases."""
def __init__(self, definition_roles: Dict[str, Dict[str, Any]],
             tables: Dict[str, List[Table]],
             axes: Dict[str, Axis],
             domains: Dict[str, Domain],
             element_catalog: Dict[str, ElementCatalog]):
    """
    Initialize definition parser with data structure references.

    The dictionaries are shared with the coordinating parser and are
    populated in place as definition linkbases are parsed.

    Args:
        definition_roles: Reference to definition roles dictionary
        tables: Reference to tables dictionary
        axes: Reference to axes dictionary
        domains: Reference to domains dictionary
        element_catalog: Reference to element catalog dictionary
    """
    super().__init__()

    # Store references to shared data structures (mutated in place)
    self.definition_roles = definition_roles
    self.tables = tables
    self.axes = axes
    self.domains = domains
    self.element_catalog = element_catalog
def parse_definition(self, file_path: Union[str, Path]) -> None:
    """Parse definition linkbase file and build dimensional structures.

    Args:
        file_path: Path to the *_def.xml linkbase file.

    Raises:
        XBRLProcessingError: If the file cannot be read or parsed.
    """
    try:
        content = Path(file_path).read_text()
        self.parse_definition_content(content)
    except Exception as e:
        raise XBRLProcessingError(f"Error parsing definition file {file_path}: {str(e)}") from e
def parse_definition_content(self, content: str) -> None:
    """Parse definition linkbase content and build dimensional structures.

    For each extended link role, extracts definitionArc relationships
    (keeping their arcrole, which distinguishes hypercube/dimension/domain
    links) and hands them to _process_dimensional_relationships.

    Raises:
        XBRLProcessingError: If the XML cannot be parsed.
    """
    try:
        root = self._safe_parse_xml(content)

        # Extract definition links
        definition_links = root.findall('.//{http://www.xbrl.org/2003/linkbase}definitionLink')

        for link in definition_links:
            role = link.get('{http://www.w3.org/1999/xlink}role')
            if not role:
                continue

            # Store role information; the human-readable definition is
            # derived from the last URI path segment.
            role_id = role.split('/')[-1] if '/' in role else role
            role_def = role_id.replace('_', ' ')
            self.definition_roles[role] = {
                'roleUri': role,
                'definition': role_def,
                'roleId': role_id
            }

            # Extract arcs
            arcs = link.findall('.//{http://www.xbrl.org/2003/linkbase}definitionArc')

            # Create relationships list
            relationships = []

            for arc in arcs:
                from_ref = arc.get('{http://www.w3.org/1999/xlink}from')
                to_ref = arc.get('{http://www.w3.org/1999/xlink}to')
                order = self._parse_order_attribute(arc)

                # Get the arcrole - this is important for identifying
                # dimensional relationships (hypercube/dimension/domain/member)
                arcrole = arc.get('{http://www.w3.org/1999/xlink}arcrole')

                if not from_ref or not to_ref or not arcrole:
                    continue

                # Find locators for from/to references by xlink:label
                from_loc = link.find(f'.//*[@{{{NAMESPACES["xlink"]}}}label="{from_ref}"]')
                to_loc = link.find(f'.//*[@{{{NAMESPACES["xlink"]}}}label="{to_ref}"]')

                if from_loc is None or to_loc is None:
                    continue

                from_href = from_loc.get('{http://www.w3.org/1999/xlink}href')
                to_href = to_loc.get('{http://www.w3.org/1999/xlink}href')

                if not from_href or not to_href:
                    continue

                # Extract element IDs from the href fragments
                from_element = extract_element_id(from_href)
                to_element = extract_element_id(to_href)

                # Add relationship with arcrole
                relationships.append({
                    'from_element': from_element,
                    'to_element': to_element,
                    'order': order,
                    'arcrole': arcrole
                })

            # Process dimensional structures from relationships
            self._process_dimensional_relationships(role, relationships)
    except Exception as e:
        raise XBRLProcessingError(f"Error parsing definition content: {str(e)}") from e
def _process_dimensional_relationships(self, role: str, relationships: List[Dict[str, Any]]) -> None:
    """
    Build tables, axes, and domains for one extended link role.

    Args:
        role: Extended link role URI
        relationships: Dimensional relationships, each carrying
            'from_element', 'to_element', 'order', and 'arcrole'
    """
    # Arcrole URIs from the XBRL Dimensions 1.0 specification
    ARC_HYPERCUBE_DIMENSION = "http://xbrl.org/int/dim/arcrole/hypercube-dimension"
    ARC_DIMENSION_DOMAIN = "http://xbrl.org/int/dim/arcrole/dimension-domain"
    ARC_DOMAIN_MEMBER = "http://xbrl.org/int/dim/arcrole/domain-member"
    ARC_ALL = "http://xbrl.org/int/dim/arcrole/all"
    # Bucket relationships by their arcrole
    by_arcrole = {}
    for rel in relationships:
        by_arcrole.setdefault(rel['arcrole'], []).append(rel)
    # hypercube-dimension arcs associate tables (hypercubes) with axes
    hypercube_axes = {}
    for rel in by_arcrole.get(ARC_HYPERCUBE_DIMENSION, []):
        table_id = rel['from_element']
        axis_id = rel['to_element']
        hypercube_axes.setdefault(table_id, []).append(axis_id)
        # Register the axis the first time it is seen
        if axis_id not in self.axes:
            self.axes[axis_id] = Axis(
                element_id=axis_id,
                label=self._get_element_label(axis_id)
            )
    # dimension-domain arcs link each axis to its domain
    for rel in by_arcrole.get(ARC_DIMENSION_DOMAIN, []):
        axis_id = rel['from_element']
        domain_id = rel['to_element']
        if axis_id in self.axes:
            self.axes[axis_id].domain_id = domain_id
        if domain_id not in self.domains:
            self.domains[domain_id] = Domain(
                element_id=domain_id,
                label=self._get_element_label(domain_id)
            )
    # domain-member arcs build each domain's member list
    members_by_domain = {}
    for rel in by_arcrole.get(ARC_DOMAIN_MEMBER, []):
        domain_id = rel['from_element']
        member_id = rel['to_element']
        members_by_domain.setdefault(domain_id, []).append(member_id)
        # The parent may not have been registered as a domain yet
        if domain_id not in self.domains:
            self.domains[domain_id] = Domain(
                element_id=domain_id,
                label=self._get_element_label(domain_id)
            )
    for domain_id, members in members_by_domain.items():
        if domain_id in self.domains:
            self.domains[domain_id].members = members
    # 'all' arcs connect line items with hypercubes; materialize the tables
    role_tables = []
    for rel in by_arcrole.get(ARC_ALL, []):
        table_id = rel['from_element']
        line_items_id = rel['to_element']
        # Only tables with known axes are usable
        if table_id in hypercube_axes:
            role_tables.append(Table(
                element_id=table_id,
                label=self._get_element_label(table_id),
                role_uri=role,
                axes=hypercube_axes[table_id],
                line_items=[line_items_id],
                closed=False  # Default
            ))
    if role_tables:
        self.tables[role] = role_tables
def _get_element_label(self, element_id: str) -> str:
    """Return the standard label for *element_id*, or the id itself if none exists."""
    entry = self.element_catalog.get(element_id)
    if entry is not None and entry.labels:
        # Prefer the standard label role when present
        standard = entry.labels.get(STANDARD_LABEL)
        if standard:
            return standard
    # Fallback to element ID
    return element_id

View File

@@ -0,0 +1,768 @@
"""
Instance parser for XBRL documents.
This module handles parsing of XBRL instance documents including facts, contexts,
units, footnotes, and entity information extraction.
"""
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List, Union
from lxml import etree as ET
from edgar.core import log
from edgar.xbrl.core import NAMESPACES, classify_duration
from edgar.xbrl.models import Context, Fact, XBRLProcessingError
from .base import BaseParser
class InstanceParser(BaseParser):
    """Parser for XBRL instance documents.

    Extracts contexts, facts, units, and footnotes from an instance
    document, then derives entity information and reporting periods.
    Results are written into the dictionaries/lists supplied by the
    caller, so the parser shares state with its owner.
    """
    def __init__(self, contexts: Dict[str, Context], facts: Dict[str, Fact],
                 units: Dict[str, Any], footnotes: Dict[str, Any],
                 calculation_trees: Dict[str, Any], entity_info: Dict[str, Any],
                 reporting_periods: List[Dict[str, Any]], context_period_map: Dict[str, str]):
        """
        Initialize instance parser with data structure references.

        Args:
            contexts: Reference to contexts dictionary
            facts: Reference to facts dictionary
            units: Reference to units dictionary
            footnotes: Reference to footnotes dictionary
            calculation_trees: Reference to calculation trees dictionary
            entity_info: Reference to entity info dictionary
            reporting_periods: Reference to reporting periods list
            context_period_map: Reference to context period map
        """
        super().__init__()
        # Store references to data structures (shared with the caller, not copies)
        self.contexts = contexts
        self.facts = facts
        self.units = units
        self.footnotes = footnotes
        self.calculation_trees = calculation_trees
        self.entity_info = entity_info
        self.reporting_periods = reporting_periods
        self.context_period_map = context_period_map
        # DEI facts extracted during entity info processing (concept -> Fact)
        self.dei_facts: Dict[str, Fact] = {}
def _create_normalized_fact_key(self, element_id: str, context_ref: str, instance_id: int = None) -> str:
    """
    Create a normalized fact key using underscore format.

    Args:
        element_id: The element ID (e.g. 'us-gaap:Revenue' or 'us-gaap_Revenue')
        context_ref: The context reference
        instance_id: Optional instance ID for duplicate facts

    Returns:
        Normalized key in format: element_id_context_ref[_instance_id]
    """
    # Normalize "prefix:name" to "prefix_name" so both id styles share one key space
    normalized = element_id.replace(':', '_', 1) if ':' in element_id else element_id
    parts = [normalized, context_ref]
    if instance_id is not None:
        parts.append(str(instance_id))
    return '_'.join(parts)
def parse_instance(self, file_path: Union[str, Path]) -> None:
    """Read an instance document from disk and parse it.

    Delegates all real work to ``parse_instance_content``.
    """
    try:
        self.parse_instance_content(Path(file_path).read_text())
    except Exception as e:
        raise XBRLProcessingError(f"Error parsing instance file {file_path}: {str(e)}") from e
def parse_instance_content(self, content: str) -> None:
    """Parse instance document content and extract contexts, facts, and units."""
    try:
        # recover/huge_tree let lxml cope with very large or slightly invalid filings
        xml_parser = ET.XMLParser(remove_blank_text=True, recover=True, huge_tree=True)
        # lxml parses bytes; encode once when given a str
        raw = content.encode('utf-8') if isinstance(content, str) else content
        root = ET.XML(raw, xml_parser)
        # Contexts and units are extracted before facts so that the
        # references facts carry are already resolvable
        self._extract_contexts(root)
        self._extract_units(root)
        self._extract_facts(root)
        self._extract_footnotes(root)
        # Derived data is built once all raw pieces are in place
        self._extract_entity_info()
        self._build_reporting_periods()
    except Exception as e:
        raise XBRLProcessingError(f"Error parsing instance content: {str(e)}") from e
def count_facts(self, content: str) -> tuple:
    """Count the number of facts in the instance document.

    Counts both unique facts (distinct element/context combinations) and
    total fact instances, which differ when the same fact is reported
    more than once.

    Args:
        content: XML text (or bytes) of the instance document.

    Returns:
        tuple: (unique_facts_count, total_fact_instances)
    """
    # Use lxml's optimized parser with recovery mode for slightly invalid content
    parser = ET.XMLParser(remove_blank_text=True, recover=True, huge_tree=True)
    # Convert to bytes for faster parsing if not already
    if isinstance(content, str):
        content_bytes = content.encode('utf-8')
    else:
        content_bytes = content
    root = ET.XML(content_bytes, parser)
    # Fast path to identify non-fact elements to skip
    skip_tag_endings = {'}context', '}unit', '}schemaRef'}
    total_fact_instances = 0  # Total number of fact references in the document
    unique_facts = set()      # Unique element_id + context_ref combinations
    create_key = self._create_normalized_fact_key

    def count_element(element):
        """Process a single element as a potential fact."""
        nonlocal total_fact_instances
        # Skip known non-fact elements
        tag = element.tag
        for ending in skip_tag_endings:
            if tag.endswith(ending):
                return
        # Only elements carrying a contextRef are facts
        context_ref = element.get('contextRef')
        if context_ref is None:
            return
        # Split the Clark-notation tag into namespace URI and local name
        if '}' in tag:
            namespace, element_name = tag.split('}', 1)
            namespace = namespace[1:]  # Faster than strip('{')
        else:
            element_name = tag
            namespace = None
        # Resolve a prefix for the namespace. Guarding on `namespace` fixes an
        # AttributeError for tags without a namespace (namespace is None here)
        # and matches the handling in _extract_facts.
        prefix = None
        if namespace:
            for std_prefix, std_uri_base in NAMESPACES.items():
                if namespace.startswith(std_uri_base):
                    prefix = std_prefix
                    break
            if not prefix:
                # Fall back to the last path segment of the namespace URI
                parts = namespace.split('/')
                prefix = parts[-1] if parts else ''
        element_id = f"{prefix}:{element_name}" if prefix else element_name
        # The normalized key identifies the fact across duplicate instances
        unique_facts.add(create_key(element_id, context_ref))
        total_fact_instances += 1

    # Prefer lxml's optimized traversal when available
    if hasattr(root, 'iterchildren'):
        for child in root.iterchildren():
            count_element(child)
            for descendant in child.iterdescendants():
                count_element(descendant)
    else:
        # Fallback for plain ElementTree
        for child in root:
            count_element(child)
            for descendant in child.findall('.//*'):
                count_element(descendant)
    return len(unique_facts), total_fact_instances
def _extract_contexts(self, root: ET.Element) -> None:
    """Extract contexts from instance document.

    Populates ``self.contexts`` with one Context per <xbrli:context>,
    capturing the entity identifier, any segment dimensions (explicit
    and typed), and the period (instant, duration, or forever).

    Raises:
        XBRLProcessingError: If context extraction fails.
    """
    try:
        # Find all context elements
        for context_elem in root.findall('.//{http://www.xbrl.org/2003/instance}context'):
            context_id = context_elem.get('id')
            if not context_id:
                # A context without an id cannot be referenced by facts; skip it
                continue
            # Create context object
            context = Context(context_id=context_id)
            # Extract entity information
            entity_elem = context_elem.find('.//{http://www.xbrl.org/2003/instance}entity')
            if entity_elem is not None:
                # Get identifier (e.g. the CIK, per the scheme attribute)
                identifier_elem = entity_elem.find('.//{http://www.xbrl.org/2003/instance}identifier')
                if identifier_elem is not None:
                    scheme = identifier_elem.get('scheme', '')
                    identifier = identifier_elem.text
                    context.entity = {
                        'scheme': scheme,
                        'identifier': identifier
                    }
                # Get segment dimensions if present
                segment_elem = entity_elem.find('.//{http://www.xbrl.org/2003/instance}segment')
                if segment_elem is not None:
                    # Extract explicit dimensions (dimension -> member QName)
                    for dim_elem in segment_elem.findall('.//{http://xbrl.org/2006/xbrldi}explicitMember'):
                        dimension = dim_elem.get('dimension')
                        value = dim_elem.text
                        if dimension and value:
                            context.dimensions[dimension] = value
                    # Extract typed dimensions
                    for dim_elem in segment_elem.findall('.//{http://xbrl.org/2006/xbrldi}typedMember'):
                        dimension = dim_elem.get('dimension')
                        if dimension:
                            # The typed dimension value is the text content of the first child element
                            for child in dim_elem:
                                # Extract the text content, which contains the actual typed member value
                                if child.text and child.text.strip():
                                    context.dimensions[dimension] = child.text.strip()
                                else:
                                    # Fallback to tag if no text content
                                    context.dimensions[dimension] = child.tag
                                break
            # Extract period information; later checks overwrite earlier ones,
            # so a context with multiple period children keeps the last match
            period_elem = context_elem.find('.//{http://www.xbrl.org/2003/instance}period')
            if period_elem is not None:
                # Check for instant period
                instant_elem = period_elem.find('.//{http://www.xbrl.org/2003/instance}instant')
                if instant_elem is not None and instant_elem.text:
                    context.period = {
                        'type': 'instant',
                        'instant': instant_elem.text
                    }
                # Check for duration period
                start_elem = period_elem.find('.//{http://www.xbrl.org/2003/instance}startDate')
                end_elem = period_elem.find('.//{http://www.xbrl.org/2003/instance}endDate')
                if start_elem is not None and end_elem is not None and start_elem.text and end_elem.text:
                    context.period = {
                        'type': 'duration',
                        'startDate': start_elem.text,
                        'endDate': end_elem.text
                    }
                # Check for forever period
                forever_elem = period_elem.find('.//{http://www.xbrl.org/2003/instance}forever')
                if forever_elem is not None:
                    context.period = {
                        'type': 'forever'
                    }
            # Add context to registry
            self.contexts[context_id] = context
    except Exception as e:
        raise XBRLProcessingError(f"Error extracting contexts: {str(e)}") from e
def _extract_units(self, root: ET.Element) -> None:
    """Extract units from instance document.

    Populates ``self.units`` with either a simple unit
    ({'type': 'simple', 'measure': ...}) or a divide (ratio) unit
    ({'type': 'divide', 'numerator': [...], 'denominator': [...]}).

    Raises:
        XBRLProcessingError: If unit extraction fails.
    """
    try:
        # Find all unit elements
        for unit_elem in root.findall('.//{http://www.xbrl.org/2003/instance}unit'):
            unit_id = unit_elem.get('id')
            if not unit_id:
                continue
            # Check for a simple single-measure unit first
            measure_elem = unit_elem.find('.//{http://www.xbrl.org/2003/instance}measure')
            if measure_elem is not None and measure_elem.text:
                self.units[unit_id] = {
                    'type': 'simple',
                    'measure': measure_elem.text
                }
                continue
            # Check for divide (numerator/denominator) units
            divide_elem = unit_elem.find('.//{http://www.xbrl.org/2003/instance}divide')
            if divide_elem is not None:
                # Get numerator and denominator containers
                numerator_elem = divide_elem.find('.//{http://www.xbrl.org/2003/instance}unitNumerator')
                denominator_elem = divide_elem.find('.//{http://www.xbrl.org/2003/instance}unitDenominator')
                if numerator_elem is not None and denominator_elem is not None:
                    # Collect all measures on each side
                    numerator_measures = [elem.text for elem in numerator_elem.findall('.//{http://www.xbrl.org/2003/instance}measure') if elem.text]
                    denominator_measures = [elem.text for elem in denominator_elem.findall('.//{http://www.xbrl.org/2003/instance}measure') if elem.text]
                    self.units[unit_id] = {
                        'type': 'divide',
                        'numerator': numerator_measures,
                        'denominator': denominator_measures
                    }
    except Exception as e:
        raise XBRLProcessingError(f"Error extracting units: {str(e)}") from e
def _extract_facts(self, root: ET.Element) -> None:
    """Extract facts from instance document.

    Any element carrying a ``contextRef`` attribute is treated as a fact.
    Duplicate facts (same element and context) are disambiguated by
    appending an ``instance_id`` to the storage key; when the first
    duplicate appears, the already-stored fact is re-keyed with
    instance_id=0.

    Raises:
        XBRLProcessingError: If fact extraction fails.
    """
    try:
        # Get direct access to nsmap if using lxml (much faster than regex extraction)
        if hasattr(root, 'nsmap'):
            # Leverage lxml's native nsmap functionality; invert to URI -> prefix
            prefix_map = {uri: prefix for prefix, uri in root.nsmap.items() if prefix is not None}
        else:
            # Fallback for ElementTree - read namespace declarations off the root
            xmlns_pattern = '{http://www.w3.org/2000/xmlns/}'
            prefix_map = {}
            # Extract namespace declarations from root
            for attr_name, attr_value in root.attrib.items():
                if attr_name.startswith(xmlns_pattern) or attr_name.startswith('xmlns:'):
                    # Extract the prefix from either declaration form
                    if attr_name.startswith(xmlns_pattern):
                        prefix = attr_name[len(xmlns_pattern):]
                    else:
                        prefix = attr_name.split(':', 1)[1]
                    prefix_map[attr_value] = prefix
        # Initialize counters and tracking
        fact_count = 0
        facts_dict = {}
        base_keys = {}
        # Fast path to identify non-fact elements to skip - set for O(1) lookup
        skip_tag_endings = {
            'schemaRef',
            'roleRef',
            'arcroleRef',
            'linkbaseRef',
            'context',
            'unit'
        }
        def process_element(element):
            """Process a single element as a potential fact."""
            nonlocal fact_count
            # Skip annotation nodes and other non element nodes
            if not ET.iselement(element):
                return
            # Skip known non-fact elements - faster check with set membership
            # If the tag is not a string, try calling () to get the string value (in rare cases)
            if callable(element.tag):
                if isinstance(element, ET._Comment):
                    return
                if not element.values():
                    return
            tag = element.tag
            for ending in skip_tag_endings:
                if tag.endswith(ending):
                    return
            # Get context reference - key check to identify facts
            context_ref = element.get('contextRef')
            if not context_ref:
                return
            # Get fact ID if present (for footnote linkage)
            fact_id = element.get('id')
            # Extract element namespace and name - optimized split
            if '}' in tag:
                namespace, element_name = tag.split('}', 1)
                namespace = namespace[1:]  # Faster than strip('{')
                # Try to extract prefix from the namespace
                prefix = prefix_map.get(namespace)
                if not prefix:
                    # Fall back to the last path segment of the namespace URI
                    parts = namespace.split('/')
                    prefix = parts[-1] if parts else ''
            else:
                element_name = tag
                prefix = ''
            # Construct element ID with optimized string concatenation
            element_id = f"{prefix}:{element_name}" if prefix else element_name
            # Get unit reference
            unit_ref = element.get('unitRef')
            # Get value - optimize string handling
            value = element.text
            if not value or not value.strip():
                # Only check children if text is empty - use direct iteration for speed
                for sub_elem in element:
                    sub_text = sub_elem.text
                    if sub_text and sub_text.strip():
                        value = sub_text
                        break
            # Optimize string handling - inline conditional
            value = value.strip() if value else ""
            # Get decimals attribute - direct access
            decimals = element.get('decimals')
            # Attempt numeric conversion; non-numeric values stay None
            numeric_value = None
            if value:
                try:
                    numeric_value = float(value)
                except (ValueError, TypeError):
                    pass
            # Create base key for duplicate detection
            base_key = self._create_normalized_fact_key(element_id, context_ref)
            # Handle duplicates
            instance_id = None
            if base_key in base_keys:
                # This is a duplicate - convert existing fact to use instance_id if needed
                if base_key in facts_dict:
                    existing_fact = facts_dict[base_key]
                    # Move existing fact to new key with instance_id=0
                    del facts_dict[base_key]
                    existing_fact.instance_id = 0
                    facts_dict[self._create_normalized_fact_key(element_id, context_ref, 0)] = existing_fact
                # Add new fact with next instance_id
                instance_id = len(base_keys[base_key])
                base_keys[base_key].append(True)
            else:
                # First instance of this fact
                base_keys[base_key] = [True]
            # Create fact object
            fact = Fact(
                element_id=element_id,
                context_ref=context_ref,
                value=value,
                unit_ref=unit_ref,
                decimals=decimals,
                numeric_value=numeric_value,
                instance_id=instance_id,
                fact_id=fact_id
            )
            # Store fact with appropriate key
            key = self._create_normalized_fact_key(element_id, context_ref, instance_id)
            facts_dict[key] = fact
            fact_count += 1
        # Use lxml's optimized traversal methods
        if hasattr(root, 'iterchildren'):
            # Use lxml's optimized traversal methods
            for child in root.iterchildren():
                process_element(child)
                # Process nested elements with optimized iteration
                for descendant in child.iterdescendants():
                    process_element(descendant)
        else:
            # Fallback for ElementTree
            for child in root:
                process_element(child)
                for descendant in child.findall('.//*'):
                    process_element(descendant)
        # Update instance facts (shared dict, merged not replaced)
        self.facts.update(facts_dict)
        log.debug(f"Extracted {fact_count} facts ({len(base_keys)} unique fact identifiers)")
    except Exception as e:
        raise XBRLProcessingError(f"Error extracting facts: {str(e)}") from e
def _extract_footnotes(self, root: ET.Element) -> None:
    """Extract footnotes from instance document.

    Footnotes in XBRL are linked to facts via footnoteLink elements that contain:
    1. footnote elements with the actual text content
    2. footnoteArc elements that connect fact IDs to footnote IDs

    Failures are logged as warnings rather than raised, because
    footnotes are optional.
    """
    try:
        # Footnote model imported locally (only needed in this method)
        from edgar.xbrl.models import Footnote
        # Find all footnoteLink elements
        for footnote_link in root.findall('.//{http://www.xbrl.org/2003/linkbase}footnoteLink'):
            # First, extract all footnote definitions
            for footnote_elem in footnote_link.findall('{http://www.xbrl.org/2003/linkbase}footnote'):
                # Try both 'id' and 'xlink:label' attributes
                footnote_id = footnote_elem.get('id') or footnote_elem.get('{http://www.w3.org/1999/xlink}label')
                if not footnote_id:
                    continue
                # Get footnote attributes
                lang = footnote_elem.get('{http://www.w3.org/XML/1998/namespace}lang', 'en-US')
                role = footnote_elem.get('{http://www.w3.org/1999/xlink}role')
                # Extract text content, handling XHTML formatting
                footnote_text = ""
                # Check for XHTML content
                xhtml_divs = footnote_elem.findall('.//{http://www.w3.org/1999/xhtml}div')
                if xhtml_divs:
                    # Concatenate all text within XHTML elements
                    for div in xhtml_divs:
                        footnote_text += "".join(div.itertext()).strip()
                else:
                    # Fall back to direct text content
                    footnote_text = "".join(footnote_elem.itertext()).strip()
                # Create Footnote object
                footnote = Footnote(
                    footnote_id=footnote_id,
                    text=footnote_text,
                    lang=lang,
                    role=role,
                    related_fact_ids=[]
                )
                self.footnotes[footnote_id] = footnote
            # Second, process footnoteArc elements to link facts to footnotes
            for arc_elem in footnote_link.findall('{http://www.xbrl.org/2003/linkbase}footnoteArc'):
                fact_id = arc_elem.get('{http://www.w3.org/1999/xlink}from')
                footnote_id = arc_elem.get('{http://www.w3.org/1999/xlink}to')
                if fact_id and footnote_id:
                    # Add fact ID to footnote's related facts
                    if footnote_id in self.footnotes:
                        self.footnotes[footnote_id].related_fact_ids.append(fact_id)
                    else:
                        log.warning(f"Footnote arc references undefined footnote: {footnote_id}")
                    # Also update the fact's footnotes list if we can find it.
                    # NOTE(review): this is a linear scan over all facts per arc
                    for fact in self.facts.values():
                        if fact.fact_id == fact_id:
                            if footnote_id not in fact.footnotes:
                                fact.footnotes.append(footnote_id)
                            break
        log.debug(f"Extracted {len(self.footnotes)} footnotes")
    except Exception as e:
        # Log the error but don't fail - footnotes are optional
        log.warning(f"Error extracting footnotes: {str(e)}")
def _extract_entity_info(self) -> None:
    """Extract entity information from contexts and DEI facts.

    Fills ``self.entity_info`` with registrant name, ticker, identifier,
    document type, fiscal metadata, and report-type flags, and caches all
    DEI facts in ``self.dei_facts``. Errors are logged, not raised, so
    parsing can continue.
    """
    try:
        # Extract CIK/identifier from first context
        identifier = None
        if self.contexts:
            first = next(iter(self.contexts.values()))
            ident = first.entity.get('identifier')
            if ident and ident.isdigit():
                # Strip leading zeros from the numeric identifier
                identifier = ident.lstrip('0')
        # Collect all DEI facts into a dict: concept -> Fact
        self.dei_facts: Dict[str, Fact] = {}
        for fact in self.facts.values():
            eid = fact.element_id
            # Accept both 'dei:Concept' and normalized 'dei_Concept' ids
            if eid.startswith('dei:'):
                concept = eid.split(':', 1)[1]
            elif eid.startswith('dei_'):
                concept = eid.split('_', 1)[1]
            else:
                continue
            self.dei_facts[concept] = fact
        # Helper: get the first available DEI fact value among alternatives
        def get_dei(*names):
            for n in names:
                f = self.dei_facts.get(n)
                if f:
                    return f.value
            return None
        # Build entity_info preserving existing keys
        self.entity_info.update({
            'entity_name': get_dei('EntityRegistrantName'),
            'ticker': get_dei('TradingSymbol'),
            'identifier': identifier,
            'document_type': get_dei('DocumentType'),
            'reporting_end_date': None,
            'document_period_end_date': get_dei('DocumentPeriodEndDate'),
            'fiscal_year': get_dei('DocumentFiscalYearFocus', 'FiscalYearFocus', 'FiscalYear'),
            'fiscal_period': get_dei('DocumentFiscalPeriodFocus', 'FiscalPeriodFocus'),
            'fiscal_year_end_month': None,
            'fiscal_year_end_day': None,
            'annual_report': False,
            'quarterly_report': False,
            'amendment': False,
        })
        # Determine reporting_end_date as the latest instant date across contexts
        for ctx in self.contexts.values():
            period = getattr(ctx, 'period', {})
            if period.get('type') == 'instant':
                ds = period.get('instant')
                if ds:
                    try:
                        dt_obj = datetime.strptime(ds, '%Y-%m-%d').date()
                        curr = self.entity_info['reporting_end_date']
                        if curr is None or dt_obj > curr:
                            self.entity_info['reporting_end_date'] = dt_obj
                    except Exception:
                        # Ignore unparseable dates
                        pass
        # Parse fiscal year end date (handles a leading '--', e.g. '--12-31') into month/day
        fye = get_dei('CurrentFiscalYearEndDate', 'FiscalYearEnd')
        if fye:
            try:
                s = fye
                if s.startswith('--'):
                    s = s[2:]
                if '-' in s:
                    m, d = s.split('-', 1)
                    if m.isdigit() and d.isdigit():
                        self.entity_info['fiscal_year_end_month'] = int(m)
                        self.entity_info['fiscal_year_end_day'] = int(d)
            except Exception:
                pass
        # Flags based on document_type; note the exact-match checks mean an
        # amended '10-K/A' sets only the amendment flag, not annual_report
        dt_val = self.entity_info['document_type'] or ''
        self.entity_info['annual_report'] = (dt_val == '10-K')
        self.entity_info['quarterly_report'] = (dt_val == '10-Q')
        self.entity_info['amendment'] = ('/A' in dt_val)
        log.debug(f"Entity info: {self.entity_info}")
    except Exception as e:
        log.warning(f"Warning: Error extracting entity info: {str(e)}")
def _build_reporting_periods(self) -> None:
    """Build reporting periods from contexts.

    Rebuilds ``self.reporting_periods`` (sorted most-recent first) and
    ``self.context_period_map`` (context id -> period key). Instant
    periods use keys like ``instant_YYYY-MM-DD``; duration periods use
    ``duration_START_END``. Errors are logged, not raised.
    """
    try:
        # Clear existing periods in place (the structures are shared with the caller)
        self.reporting_periods.clear()
        self.context_period_map.clear()
        # Collect unique periods from contexts
        instant_periods = {}
        duration_periods = {}
        for context_id, context in self.contexts.items():
            if 'period' in context.model_dump() and 'type' in context.period:
                period_type = context.period.get('type')
                if period_type == 'instant':
                    date_str = context.period.get('instant')
                    if date_str:
                        if date_str not in instant_periods:
                            instant_periods[date_str] = []
                        # Add context ID to this period
                        instant_periods[date_str].append(context_id)
                        # Map context to period key
                        period_key = f"instant_{date_str}"
                        self.context_period_map[context_id] = period_key
                elif period_type == 'duration':
                    start_date = context.period.get('startDate')
                    end_date = context.period.get('endDate')
                    if start_date and end_date:
                        duration_key = f"{start_date}_{end_date}"
                        if duration_key not in duration_periods:
                            duration_periods[duration_key] = []
                        # Add context ID to this period
                        duration_periods[duration_key].append(context_id)
                        # Map context to period key
                        period_key = f"duration_{start_date}_{end_date}"
                        self.context_period_map[context_id] = period_key
        # Process instant periods
        for date_str, context_ids in instant_periods.items():
            try:
                date_obj = datetime.strptime(date_str, '%Y-%m-%d').date()
                formatted_date = date_obj.strftime('%B %d, %Y')
                period = {
                    'type': 'instant',
                    'date': date_str,
                    'date_obj': date_obj,
                    'label': formatted_date,
                    'context_ids': context_ids,
                    'key': f"instant_{date_str}"
                }
                self.reporting_periods.append(period)
            except (ValueError, TypeError):
                # Skip invalid dates
                continue
        # Process duration periods
        for period_key, context_ids in duration_periods.items():
            # Keys are 'START_END'; ISO dates contain no underscores, so this split is safe
            start_date, end_date = period_key.split('_')
            try:
                start_obj = datetime.strptime(start_date, '%Y-%m-%d').date()
                end_obj = datetime.strptime(end_date, '%Y-%m-%d').date()
                formatted_start = start_obj.strftime('%B %d, %Y')
                formatted_end = end_obj.strftime('%B %d, %Y')
                # Calculate duration in days
                days = (end_obj - start_obj).days
                # Determine a human-readable period type from the duration
                period_description = classify_duration(days)
                period = {
                    'type': 'duration',
                    'start_date': start_date,
                    'end_date': end_date,
                    'start_obj': start_obj,
                    'end_obj': end_obj,
                    'days': days,
                    'period_type': period_description,
                    'label': f"{period_description}: {formatted_start} to {formatted_end}",
                    'context_ids': context_ids,
                    'key': f"duration_{start_date}_{end_date}"
                }
                self.reporting_periods.append(period)
            except (ValueError, TypeError):
                # Skip invalid dates
                continue
        # Sort periods by date (most recent first); durations sort by end date
        self.reporting_periods.sort(key=lambda p: p['date_obj'] if p['type'] == 'instant' else p['end_obj'], reverse=True)
        # Debug printout to verify periods are extracted
        if len(self.reporting_periods) > 0:
            log.debug(f"Found {len(self.reporting_periods)} reporting periods.")
            log.debug(f"First period: {self.reporting_periods[0]['label']}")
        else:
            log.debug("Warning: No reporting periods found!")
        # Debug context period map
        log.debug(f"Context period map has {len(self.context_period_map)} entries.")
    except Exception as e:
        # Log error but don't fail; leave the period list empty on failure
        log.debug(f"Warning: Error building reporting periods: {str(e)}")
        self.reporting_periods.clear()

View File

@@ -0,0 +1,149 @@
"""
Labels parser for XBRL documents.
This module handles parsing of XBRL label linkbases and extracting
element labels for display purposes.
"""
from pathlib import Path
from typing import Dict, Union
from lxml import etree as ET
from edgar.xbrl.core import STANDARD_LABEL, extract_element_id
from edgar.xbrl.models import ElementCatalog, XBRLProcessingError
from .base import BaseParser
class LabelsParser(BaseParser):
    """Parser for XBRL label linkbases.

    Populates the shared element catalog with human-readable labels,
    keyed by label role, for display purposes.
    """
    def __init__(self, element_catalog: Dict[str, ElementCatalog]):
        """
        Initialize labels parser with data structure references.

        Args:
            element_catalog: Reference to element catalog dictionary
        """
        super().__init__()
        # Store reference to the shared element catalog (not a copy)
        self.element_catalog = element_catalog
def parse_labels(self, file_path: Union[str, Path]) -> None:
    """Read a label linkbase file from disk and parse it.

    Delegates all real work to ``parse_labels_content``.
    """
    try:
        self.parse_labels_content(Path(file_path).read_text())
    except Exception as e:
        raise XBRLProcessingError(f"Error parsing label file {file_path}: {str(e)}") from e
def parse_labels_content(self, content: str) -> None:
    """Parse label linkbase content and extract label information.

    Builds a lookup of <link:label> texts keyed by label id, language, and
    role, resolves all <link:loc> locators once up front, then walks the
    <link:labelArc> arcs to attach labels to elements in
    ``self.element_catalog``. Only 'en-US' labels are applied.

    Raises:
        XBRLProcessingError: If the content cannot be parsed.
    """
    try:
        # Register namespaces for faster XPath lookups
        nsmap = {
            'link': 'http://www.xbrl.org/2003/linkbase',
            'xlink': 'http://www.w3.org/1999/xlink',
            'xml': 'http://www.w3.org/XML/1998/namespace'
        }
        # Use lxml parser with recovery mode for slightly invalid content
        parser = ET.XMLParser(remove_blank_text=True, recover=True)
        root = ET.XML(content.encode('utf-8'), parser)
        # Use specific XPath expressions with namespaces for faster lookups
        # This is much faster than using findall with '//' in element tree
        label_arcs = root.xpath('//link:labelArc', namespaces=nsmap)
        labels = root.xpath('//link:label', namespaces=nsmap)
        # Nested lookup: label_id -> lang -> role -> text
        label_lookup = {}
        # Cache qualified attribute names to avoid repeated string building
        xlink_label = '{http://www.w3.org/1999/xlink}label'
        xlink_role = '{http://www.w3.org/1999/xlink}role'
        xml_lang = '{http://www.w3.org/XML/1998/namespace}lang'
        default_role = 'http://www.xbrl.org/2003/role/label'
        # Process labels in a single pass with direct attribute access
        for label in labels:
            label_id = label.get(xlink_label)
            if not label_id:
                continue
            # Get text first - if empty, skip further processing
            text = label.text
            if text is None:
                continue
            # Get attributes - direct lookup is faster than method calls
            role = label.get(xlink_role, default_role)
            lang = label.get(xml_lang, 'en-US')
            # Create nested dictionaries only when needed
            if label_id not in label_lookup:
                label_lookup[label_id] = {}
            if lang not in label_lookup[label_id]:
                label_lookup[label_id][lang] = {}
            label_lookup[label_id][lang][role] = text
        # Cache qualified attribute names for arcs
        xlink_from = '{http://www.w3.org/1999/xlink}from'
        xlink_to = '{http://www.w3.org/1999/xlink}to'
        xlink_href = '{http://www.w3.org/1999/xlink}href'
        # Create a lookup table for locators by label for faster access
        loc_by_label = {}
        for loc in root.xpath('//link:loc', namespaces=nsmap):
            loc_label = loc.get(xlink_label)
            if loc_label:
                loc_by_label[loc_label] = loc.get(xlink_href)
        # Connect labels to elements using arcs - with optimized lookups
        for arc in label_arcs:
            from_ref = arc.get(xlink_from)
            to_ref = arc.get(xlink_to)
            if not from_ref or not to_ref or to_ref not in label_lookup:
                continue
            # Use cached locator lookup instead of expensive XPath
            href = loc_by_label.get(from_ref)
            if not href:
                continue
            # Extract element ID from href
            element_id = extract_element_id(href)
            # Find labels for this element - only en-US labels are applied
            if 'en-US' in label_lookup[to_ref]:
                element_labels = label_lookup[to_ref]['en-US']
                # Update catalog with minimal overhead
                catalog_entry = self.element_catalog.get(element_id)
                if catalog_entry:
                    catalog_entry.labels.update(element_labels)
                else:
                    # Create placeholder in catalog; data_type is not known
                    # from the label linkbase alone
                    self.element_catalog[element_id] = ElementCatalog(
                        name=element_id,
                        data_type="",
                        period_type="duration",
                        labels=element_labels
                    )
    except Exception as e:
        raise XBRLProcessingError(f"Error parsing label content: {str(e)}") from e
def get_element_label(self, element_id: str) -> str:
    """Return the standard label for *element_id*, or the id itself if none exists."""
    entry = self.element_catalog.get(element_id)
    if entry is not None and entry.labels:
        # Prefer the standard label role when present
        standard = entry.labels.get(STANDARD_LABEL)
        if standard:
            return standard
    # Fallback to element ID
    return element_id

View File

@@ -0,0 +1,249 @@
"""
Presentation parser for XBRL documents.
This module handles parsing of XBRL presentation linkbases and building
presentation trees for financial statement structure.
"""
from pathlib import Path
from typing import Any, Dict, List, Optional, Union
from lxml import etree as ET
from edgar.xbrl.core import extract_element_id
from edgar.xbrl.models import ElementCatalog, PresentationNode, PresentationTree, XBRLProcessingError
from .base import BaseParser
class PresentationParser(BaseParser):
    """Parser for XBRL presentation linkbases.

    Parses ``presentationLink`` elements into per-role ``PresentationTree``
    structures describing the display order and nesting of financial
    statement line items. Results are written into the shared dictionaries
    supplied at construction time.
    """

    def __init__(self, presentation_roles: Dict[str, Dict[str, Any]],
                 presentation_trees: Dict[str, PresentationTree],
                 element_catalog: Dict[str, ElementCatalog]):
        """
        Initialize presentation parser with data structure references.

        Args:
            presentation_roles: Reference to presentation roles dictionary
            presentation_trees: Reference to presentation trees dictionary
            element_catalog: Reference to element catalog dictionary
        """
        super().__init__()
        # Shared structures, mutated in place so the coordinator sees results.
        self.presentation_roles = presentation_roles
        self.presentation_trees = presentation_trees
        self.element_catalog = element_catalog

    def parse_presentation(self, file_path: Union[str, Path]) -> None:
        """Parse presentation linkbase file and build presentation trees.

        Raises:
            XBRLProcessingError: If the file cannot be read or parsed.
        """
        try:
            content = Path(file_path).read_text()
            self.parse_presentation_content(content)
        except Exception as e:
            raise XBRLProcessingError(f"Error parsing presentation file {file_path}: {str(e)}") from e

    def parse_presentation_content(self, content: str) -> None:
        """Parse presentation linkbase content and build presentation trees.

        Raises:
            XBRLProcessingError: If the XML content cannot be processed.
        """
        try:
            # Register namespaces once for faster XPath lookups.
            nsmap = {
                'link': 'http://www.xbrl.org/2003/linkbase',
                'xlink': 'http://www.w3.org/1999/xlink'
            }
            # recover=True tolerates minor XML defects found in filed documents.
            parser = ET.XMLParser(remove_blank_text=True, recover=True)
            root = ET.XML(content.encode('utf-8'), parser)
            presentation_links = root.xpath('//link:presentationLink', namespaces=nsmap)
            # Pre-resolve Clark-notation attribute names outside the loops.
            xlink_role = '{http://www.w3.org/1999/xlink}role'
            xlink_from = '{http://www.w3.org/1999/xlink}from'
            xlink_to = '{http://www.w3.org/1999/xlink}to'
            xlink_label = '{http://www.w3.org/1999/xlink}label'
            xlink_href = '{http://www.w3.org/1999/xlink}href'
            for link in presentation_links:
                role = link.get(xlink_role)
                if not role:
                    continue
                # Store role information (last path segment doubles as the ID).
                role_id = role.split('/')[-1] if '/' in role else role
                role_def = role_id.replace('_', ' ')
                self.presentation_roles[role] = {
                    'roleUri': role,
                    'definition': role_def,
                    'roleId': role_id
                }
                # Pre-build locator map (xlink:label -> href) to avoid
                # repeated per-arc XPath lookups.
                loc_map = {}
                for loc in link.xpath('.//link:loc', namespaces=nsmap):
                    label = loc.get(xlink_label)
                    if label:
                        loc_map[label] = loc.get(xlink_href)
                arcs = link.xpath('.//link:presentationArc', namespaces=nsmap)
                relationships = []
                relationships_append = relationships.append  # Local ref for speed
                for arc in arcs:
                    from_ref = arc.get(xlink_from)
                    to_ref = arc.get(xlink_to)
                    if not from_ref or not to_ref:
                        continue
                    # Resolve both endpoints through the cached locator map.
                    from_href = loc_map.get(from_ref)
                    to_href = loc_map.get(to_ref)
                    if not from_href or not to_href:
                        continue
                    # Arc order controls sibling display sequence in the statement.
                    order = self._parse_order_attribute(arc)
                    preferred_label = arc.get('preferredLabel')
                    from_element = extract_element_id(from_href)
                    to_element = extract_element_id(to_href)
                    relationships_append({
                        'from_element': from_element,
                        'to_element': to_element,
                        'order': order,
                        'preferred_label': preferred_label
                    })
                # Only build a tree when the role actually has relationships.
                if relationships:
                    self._build_presentation_tree(role, relationships)
        except Exception as e:
            raise XBRLProcessingError(f"Error parsing presentation content: {str(e)}") from e

    def _build_presentation_tree(self, role: str, relationships: List[Dict[str, Any]]) -> None:
        """
        Build a presentation tree from relationships.

        Args:
            role: Extended link role URI
            relationships: List of relationships (from_element, to_element, order, preferred_label)
        """
        # Group relationships by source and target element.
        from_map = {}
        to_map = {}
        for rel in relationships:
            from_map.setdefault(rel['from_element'], []).append(rel)
            to_map.setdefault(rel['to_element'], []).append(rel)
        # Roots appear as 'from' but never as 'to'.
        root_elements = set(from_map.keys()) - set(to_map.keys())
        if not root_elements:
            # Either an empty link or a fully cyclic arc set: nothing to build.
            return
        tree = PresentationTree(
            role_uri=role,
            definition=self.presentation_roles[role]['definition'],
            root_element_id=next(iter(root_elements)),
            all_nodes={}
        )
        # Build each root's subtree into the shared node dictionary.
        for root_id in root_elements:
            self._build_presentation_subtree(root_id, None, 0, from_map, tree.all_nodes)
        self.presentation_trees[role] = tree

    def _build_presentation_subtree(self, element_id: str, parent_id: Optional[str], depth: int,
                                    from_map: Dict[str, List[Dict[str, Any]]],
                                    all_nodes: Dict[str, PresentationNode],
                                    _ancestors: Optional[set] = None) -> None:
        """
        Recursively build a presentation subtree.

        Args:
            element_id: Current element ID
            parent_id: Parent element ID
            depth: Current depth in tree
            from_map: Map of relationships by source element
            all_nodes: Dictionary to store all nodes
            _ancestors: Element IDs on the current recursion path. Used as a
                cycle guard; callers normally omit it (defaults to empty).
        """
        if _ancestors is None:
            _ancestors = set()
        if element_id in _ancestors:
            # Cycle guard: a malformed linkbase can contain circular
            # parent/child arcs, which previously recursed without bound.
            # Skip re-expanding an element already on the ancestor path.
            return
        node = PresentationNode(
            element_id=element_id,
            parent=parent_id,
            children=[],
            depth=depth
        )
        # Enrich the node with catalog information when the element is known.
        if element_id in self.element_catalog:
            elem_info = self.element_catalog[element_id]
            node.element_name = elem_info.name
            node.standard_label = elem_info.labels.get('http://www.xbrl.org/2003/role/label', elem_info.name)
            # Use enhanced abstract detection (Issue #450 fix).
            # The element catalog may not have correct abstract info for
            # standard taxonomy concepts.
            from edgar.xbrl.abstract_detection import is_abstract_concept
            node.is_abstract = is_abstract_concept(
                concept_name=elem_info.name,
                schema_abstract=elem_info.abstract,
                has_children=False,  # Will be updated after children are processed
                has_values=False  # Will be determined later when facts are loaded
            )
            node.labels = elem_info.labels
        all_nodes[element_id] = node
        if element_id in from_map:
            # Children are displayed in ascending arc order.
            child_path = _ancestors | {element_id}
            for rel in sorted(from_map[element_id], key=lambda r: r['order']):
                child_id = rel['to_element']
                node.children.append(child_id)
                preferred_label = rel['preferred_label']
                self._build_presentation_subtree(
                    child_id, element_id, depth + 1, from_map, all_nodes, child_path
                )
                # Apply the arc's preferred label and order to the child node
                # after its subtree has been built.
                if child_id in all_nodes:
                    if preferred_label:
                        all_nodes[child_id].preferred_label = preferred_label
                    all_nodes[child_id].order = rel['order']

View File

@@ -0,0 +1,210 @@
"""
Schema parser for XBRL documents.
This module handles parsing of XBRL taxonomy schemas and element catalog
creation with element definitions and properties.
"""
from pathlib import Path
from typing import Dict, Union
from lxml import etree as ET
from edgar.core import log
from edgar.xbrl.models import ElementCatalog, XBRLProcessingError
from .base import BaseParser
class SchemaParser(BaseParser):
    """Parser for XBRL taxonomy schemas.

    Populates the element catalog from XSD element declarations and delegates
    any embedded linkbases (label/presentation/calculation/definition) to the
    corresponding linkbase parsers, which are wired in via
    :meth:`set_linkbase_parsers`.
    """

    def __init__(self, element_catalog: Dict[str, ElementCatalog]):
        """
        Initialize schema parser with data structure references.

        Args:
            element_catalog: Reference to element catalog dictionary
        """
        super().__init__()
        self.element_catalog = element_catalog
        # Linkbase-content callbacks; set later by the coordinator via
        # set_linkbase_parsers() when embedded linkbases must be processed.
        self.parse_labels_content = None
        self.parse_presentation_content = None
        self.parse_calculation_content = None
        self.parse_definition_content = None

    def set_linkbase_parsers(self, labels_parser, presentation_parser, calculation_parser, definition_parser):
        """
        Set references to other parsers for embedded linkbase processing.

        Args:
            labels_parser: LabelsParser instance
            presentation_parser: PresentationParser instance
            calculation_parser: CalculationParser instance
            definition_parser: DefinitionParser instance
        """
        self.parse_labels_content = labels_parser.parse_labels_content
        self.parse_presentation_content = presentation_parser.parse_presentation_content
        self.parse_calculation_content = calculation_parser.parse_calculation_content
        self.parse_definition_content = definition_parser.parse_definition_content

    def parse_schema(self, file_path: Union[str, Path]) -> None:
        """Parse schema file and extract element information.

        Raises:
            XBRLProcessingError: If the file cannot be read or parsed.
        """
        try:
            content = Path(file_path).read_text()
            self.parse_schema_content(content)
        except Exception as e:
            raise XBRLProcessingError(f"Error parsing schema file {file_path}: {str(e)}") from e

    def parse_schema_content(self, content: str) -> None:
        """Parse schema content and extract element information.

        Raises:
            XBRLProcessingError: If the schema content cannot be processed.
        """
        try:
            # Use the safe XML parsing helper
            root = self._safe_parse_xml(content)
            # Extract element declarations
            for element in root.findall('.//{http://www.w3.org/2001/XMLSchema}element'):
                element_id = element.get('id') or element.get('name')
                if not element_id:
                    continue
                data_type = element.get('type', '')
                # Balance and period type: modern XBRL puts these as
                # attributes directly on the element declaration.
                balance_type = element.get('{http://www.xbrl.org/2003/instance}balance')
                period_type = element.get('{http://www.xbrl.org/2003/instance}periodType')
                # XML Schema booleans allow 'true'/'1' as true values, so
                # accept both (the previous check matched only 'true').
                abstract = element.get('abstract', 'false').lower() in ('true', '1')
                # Legacy schemas nest balance/period inside annotation/appinfo.
                if not balance_type or not period_type:
                    annotation = element.find('.//{http://www.w3.org/2001/XMLSchema}annotation')
                    if annotation is not None:
                        for appinfo in annotation.findall('.//{http://www.w3.org/2001/XMLSchema}appinfo'):
                            if not balance_type:
                                balance_element = appinfo.find('.//{http://www.xbrl.org/2003/instance}balance')
                                if balance_element is not None:
                                    balance_type = balance_element.text
                            if not period_type:
                                period_element = appinfo.find('.//{http://www.xbrl.org/2003/instance}periodType')
                                if period_element is not None:
                                    period_type = period_element.text
                # Create element catalog entry
                self.element_catalog[element_id] = ElementCatalog(
                    name=element_id,
                    data_type=data_type,
                    period_type=period_type or "duration",  # Default to duration
                    balance=balance_type,
                    abstract=abstract,
                    labels={}
                )
            # Parse any linkbases embedded directly in the schema document.
            embedded_linkbases = self._extract_embedded_linkbases(content)
            if embedded_linkbases and 'linkbases' in embedded_linkbases:
                linkbases = embedded_linkbases['linkbases']
                if 'label' in linkbases and self.parse_labels_content:
                    self.parse_labels_content(linkbases['label'])
                if 'presentation' in linkbases and self.parse_presentation_content:
                    self.parse_presentation_content(linkbases['presentation'])
                if 'calculation' in linkbases and self.parse_calculation_content:
                    self.parse_calculation_content(linkbases['calculation'])
                if 'definition' in linkbases and self.parse_definition_content:
                    self.parse_definition_content(linkbases['definition'])
        except Exception as e:
            raise XBRLProcessingError(f"Error parsing schema content: {str(e)}") from e

    def _extract_embedded_linkbases(self, schema_content: str) -> Dict[str, Dict[str, str]]:
        """
        Extract embedded linkbases and role types from the schema file.

        Args:
            schema_content: XML content of the schema file

        Returns:
            Dictionary containing embedded linkbases and role type information.
            On extraction failure the partial result is returned and the error
            is logged rather than raised (best-effort behavior).
        """
        embedded_data = {
            'linkbases': {},
            'role_types': {}
        }
        try:
            # Use the safe XML parsing helper
            root = self._safe_parse_xml(schema_content)
            nsmap = {
                'xsd': 'http://www.w3.org/2001/XMLSchema',
                'link': 'http://www.xbrl.org/2003/linkbase'
            }
            for appinfo in root.xpath('.//xsd:appinfo', namespaces=nsmap):
                # Role types declare the statement roles used elsewhere.
                for role_type in appinfo.xpath('./link:roleType', namespaces=nsmap):
                    role_uri = role_type.get('roleURI')
                    role_id = role_type.get('id')
                    definition = role_type.find('./link:definition', nsmap)
                    definition_text = definition.text if definition is not None else ""
                    used_on = [elem.text for elem in role_type.xpath('./link:usedOn', namespaces=nsmap) if elem.text]
                    if role_uri:
                        embedded_data['role_types'][role_uri] = {
                            'id': role_id,
                            'definition': definition_text,
                            'used_on': used_on
                        }
                linkbase = appinfo.find('./link:linkbase', nsmap)
                if linkbase is not None:
                    # Serialize once; the opening tag is reused below as a
                    # wrapper for each extracted linkbase type.
                    linkbase_string = ET.tostring(linkbase, encoding='unicode', method='xml')
                    for linkbase_type in ['presentation', 'label', 'calculation', 'definition']:
                        # Use direct child XPath for better performance
                        xpath_expr = f'./link:{linkbase_type}Link'
                        linkbase_elements = linkbase.xpath(xpath_expr, namespaces=nsmap)
                        if linkbase_elements:
                            linkbase_strings = [
                                ET.tostring(elem, encoding='unicode', method='xml')
                                for elem in linkbase_elements
                            ]
                            # Rebuild a standalone <link:linkbase> document per type.
                            linkbase_header = linkbase_string.split('>', 1)[0] + '>'
                            embedded_data['linkbases'][linkbase_type] = (
                                f"{linkbase_header}\n" +
                                '\n'.join(linkbase_strings) +
                                "\n</link:linkbase>"
                            )
            return embedded_data
        except Exception as e:
            # Log the error but don't fail - just return empty embedded data
            log.warning(f"Warning: Error extracting embedded linkbases: {str(e)}")
            return embedded_data

View File

@@ -0,0 +1,312 @@
"""
Enhanced period selection with data availability checking.
This module provides functions to verify that selected periods have sufficient
data before displaying them to investors.
"""
from typing import Dict, List, Optional, Set, Tuple
def count_facts_for_period(xbrl_instance, period_key: str, statement_type: Optional[str] = None) -> int:
    """
    Count the number of facts available for a specific period.

    Args:
        xbrl_instance: XBRL instance with ``_facts`` and ``contexts``
        period_key: Period key to check (e.g. 'instant_2024-09-28' or
            'duration_2024-01-01_2024-03-31')
        statement_type: Optional statement type. Currently unused; retained
            for interface compatibility with existing callers.

    Returns:
        Number of facts whose context matches this period (0 for malformed
        or unrecognized keys).
    """
    # Parse the period key into context-matching criteria.
    # Both branches use startswith so a key merely *containing* 'duration_'
    # is not misparsed (the original used a substring test here).
    if period_key.startswith('instant_'):
        period_type = 'instant'
        period_date = period_key[len('instant_'):]
        start_date = end_date = None
    elif period_key.startswith('duration_'):
        period_type = 'duration'
        parts = period_key.split('_')
        if len(parts) < 3:
            return 0
        start_date, end_date = parts[1], parts[2]
        period_date = None
    else:
        return 0

    fact_count = 0
    for _fact_key, fact in xbrl_instance._facts.items():
        # Resolve the fact's context; skip facts with unknown contexts.
        context = xbrl_instance.contexts.get(fact.context_ref)
        if not context:
            continue
        period_data = context.model_dump().get('period', {})
        if period_type == 'instant':
            if period_data.get('type') == 'instant' and period_data.get('instant') == period_date:
                fact_count += 1
        else:
            if (period_data.get('type') == 'duration' and
                    period_data.get('startDate') == start_date and
                    period_data.get('endDate') == end_date):
                fact_count += 1
    return fact_count
# Module-level constant: the concept sets are immutable reference data, so
# build them once at import time instead of reconstructing on every call.
_ESSENTIAL_CONCEPTS: Dict[str, Set[str]] = {
    'BalanceSheet': {
        # Core balance sheet items
        'Assets', 'AssetsCurrent',
        'Liabilities', 'LiabilitiesCurrent',
        'StockholdersEquity', 'StockholdersEquityIncludingPortionAttributableToNoncontrollingInterest',
        # Common important items
        'CashAndCashEquivalentsAtCarryingValue', 'Cash',
        'AccountsReceivableNetCurrent', 'AccountsReceivable',
        'Inventory', 'InventoryNet',
        'PropertyPlantAndEquipmentNet',
        'AccountsPayableCurrent', 'AccountsPayable',
        'LongTermDebt', 'LongTermDebtNoncurrent'
    },
    'IncomeStatement': {
        # Core income items
        'Revenues', 'RevenueFromContractWithCustomerExcludingAssessedTax', 'SalesRevenueNet',
        'CostOfRevenue', 'CostOfGoodsAndServicesSold', 'CostOfGoodsSold',
        'GrossProfit',
        'OperatingExpenses', 'OperatingCostsAndExpenses',
        'OperatingIncomeLoss', 'IncomeLossFromContinuingOperationsBeforeIncomeTaxesExtraordinaryItemsNoncontrollingInterest',
        'NetIncomeLoss', 'ProfitLoss',
        # Common important items
        'ResearchAndDevelopmentExpense',
        'SellingGeneralAndAdministrativeExpense',
        'EarningsPerShareBasic', 'EarningsPerShareDiluted'
    },
    'CashFlowStatement': {
        # Core cash flow items
        'NetCashProvidedByUsedInOperatingActivities',
        'NetCashProvidedByUsedInInvestingActivities',
        'NetCashProvidedByUsedInFinancingActivities',
        'CashAndCashEquivalentsPeriodIncreaseDecrease',
        # Common important items
        'NetIncomeLoss',
        'DepreciationDepletionAndAmortization', 'DepreciationAndAmortization',
        'PaymentsToAcquirePropertyPlantAndEquipment',
        'PaymentsOfDividends', 'PaymentsOfDividendsCommonStock'
    }
}


def get_essential_concepts_for_statement(statement_type: str) -> Set[str]:
    """
    Get the essential concepts that should be present for a statement type.

    These are the minimum concepts investors expect to see.

    Args:
        statement_type: e.g. 'BalanceSheet', 'IncomeStatement', 'CashFlowStatement'

    Returns:
        Set of essential concept names; empty set for unknown statement types.
    """
    return _ESSENTIAL_CONCEPTS.get(statement_type, set())
def check_period_data_quality(xbrl_instance, period_key: str, statement_type: str) -> Dict[str, object]:
    """
    Check the data quality for a specific period.

    Args:
        xbrl_instance: XBRL instance with ``_facts``, ``contexts`` and
            ``element_catalog``
        period_key: Period key (e.g. 'instant_2024-09-28' or
            'duration_2024-01-01_2024-03-31')
        statement_type: Statement type used to select essential concepts

    Returns:
        Dictionary with quality metrics:
        - fact_count: Total number of facts
        - meaningful_fact_count: Number of facts with meaningful (non-empty) values
        - essential_coverage: Fraction of essential concepts found
        - has_sufficient_data: Boolean indicating if period should be displayed
        - has_meaningful_data: Boolean indicating if period has meaningful values (fixes Issue #408)
        - missing_essentials: List of missing essential concepts
        - found_essentials: List of essential concepts that were found
    """
    import pandas as pd  # hoisted: was previously imported inside the per-fact loop

    # Count total facts
    fact_count = count_facts_for_period(xbrl_instance, period_key, statement_type)
    # Get essential concepts
    essential_concepts = get_essential_concepts_for_statement(statement_type)

    # Parse period key for context matching.
    if period_key.startswith('instant_'):
        period_type = 'instant'
        period_date = period_key.replace('instant_', '')
        start_date = end_date = None
    else:
        period_type = 'duration'
        parts = period_key.split('_')
        if len(parts) >= 3:
            start_date, end_date = parts[1], parts[2]
            period_date = None
        else:
            # Malformed duration key: return a result with the COMPLETE key
            # set so callers can index it safely.  The previous early return
            # omitted meaningful_fact_count / has_meaningful_data /
            # found_essentials, which raised KeyError in
            # filter_periods_with_data.
            return {
                'fact_count': fact_count,
                'meaningful_fact_count': 0,
                'essential_coverage': 0.0,
                'has_sufficient_data': False,
                'has_meaningful_data': False,
                'missing_essentials': list(essential_concepts),
                'found_essentials': [],
            }

    def _period_matches(period_data: Dict) -> bool:
        # True when a context's period equals the requested period.
        if period_type == 'instant':
            return (period_data.get('type') == 'instant'
                    and period_data.get('instant') == period_date)
        return (period_data.get('type') == 'duration'
                and period_data.get('startDate') == start_date
                and period_data.get('endDate') == end_date)

    # Check which essential concepts are present in this period.
    found_essentials = set()
    missing_essentials = set()
    for concept in essential_concepts:
        concept_found = False
        for _fact_key, fact in xbrl_instance._facts.items():
            # Match by substring against the catalog element name, as before.
            element = xbrl_instance.element_catalog.get(fact.element_id)
            if not element or concept not in element.name:
                continue
            context = xbrl_instance.contexts.get(fact.context_ref)
            if context and _period_matches(context.model_dump().get('period', {})):
                found_essentials.add(concept)
                concept_found = True
                break
        if not concept_found:
            missing_essentials.add(concept)

    # Count meaningful facts (non-empty values) - Fix for Issue #408
    meaningful_fact_count = 0
    for _fact_key, fact in xbrl_instance._facts.items():
        context = xbrl_instance.contexts.get(fact.context_ref)
        if not context or not _period_matches(context.model_dump().get('period', {})):
            continue
        fact_value = getattr(fact, 'value', None)
        if fact_value is None:
            continue
        str_value = str(fact_value).strip()
        if not str_value or str_value.lower() in ['nan', 'none']:
            continue
        try:
            # Numeric values count as meaningful when they coerce cleanly.
            numeric_value = pd.to_numeric(str_value, errors='coerce')
            if not pd.isna(numeric_value):
                meaningful_fact_count += 1
        except Exception:
            # If not numeric but not empty, might still be meaningful
            meaningful_fact_count += 1

    # Calculate coverage
    essential_coverage = len(found_essentials) / len(essential_concepts) if essential_concepts else 0.0
    # Require at least 50% essential coverage or 20+ facts
    has_sufficient_data = essential_coverage >= 0.5 or fact_count >= 20
    # A period has meaningful data if at least one fact has a non-empty value
    # (fixes Issue #408).
    has_meaningful_data = meaningful_fact_count > 0
    return {
        'fact_count': fact_count,
        'meaningful_fact_count': meaningful_fact_count,
        'essential_coverage': essential_coverage,
        'has_sufficient_data': has_sufficient_data,
        'has_meaningful_data': has_meaningful_data,
        'missing_essentials': list(missing_essentials),
        'found_essentials': list(found_essentials)
    }
def filter_periods_with_data(xbrl_instance, periods: List[Tuple[str, str]],
                             statement_type: str,
                             min_fact_count: int = 10) -> List[Tuple[str, str]]:
    """
    Filter periods to only include those with sufficient data.

    Args:
        xbrl_instance: XBRL instance
        periods: List of (period_key, label) tuples
        statement_type: Type of statement
        min_fact_count: Minimum number of facts required

    Returns:
        Filtered list of periods with sufficient data
    """
    kept = []
    for key, label in periods:
        quality = check_period_data_quality(xbrl_instance, key, statement_type)
        # A period must clear all three bars: sufficient data, enough facts,
        # and meaningful (non-empty) values (fixes Issue #408).
        meets_bar = (quality['has_sufficient_data']
                     and quality['fact_count'] >= min_fact_count
                     and quality['has_meaningful_data'])
        if meets_bar:
            kept.append((key, label))
        # Periods failing the bar are silently dropped.
    return kept
def determine_investor_preferred_periods(xbrl_instance, statement_type: str) -> List[Tuple[str, str]]:
    """
    Enhanced period selection that prioritizes what investors want to see.

    For Annual Reports:
    1. Current fiscal year
    2. Prior fiscal year (YoY comparison)
    3. Two years ago (3-year trend)

    For Quarterly Reports:
    1. Current quarter
    2. Same quarter prior year (YoY)
    3. Current YTD
    4. Prior year YTD

    Only includes periods with sufficient data.
    """
    from edgar.xbrl.period_selector import select_periods

    # Start from the unified period selection.
    base_periods = select_periods(xbrl_instance, statement_type)

    # Filter for data availability; if the strict threshold drops too many
    # periods, relax it once and try again.
    periods_with_data = base_periods
    for threshold in (10, 5):
        periods_with_data = filter_periods_with_data(
            xbrl_instance,
            base_periods,
            statement_type,
            min_fact_count=threshold,
        )
        if len(periods_with_data) >= 2 or len(base_periods) < 2:
            break
    return periods_with_data

View File

@@ -0,0 +1,622 @@
"""
Unified Period Selection System
A streamlined, single-responsibility approach to XBRL period selection that:
- Consolidates logic from legacy periods.py and smart_periods.py
- Always applies document date filtering to prevent future period bugs
- Preserves essential fiscal intelligence while eliminating complexity
- Provides a single, clear entry point for all period selection
This replaces 1,275 lines of dual-system complexity with ~200 lines of focused logic.
"""
import logging
from datetime import date, datetime
from typing import Any, Dict, List, Optional, Tuple
logger = logging.getLogger(__name__)
def select_periods(xbrl, statement_type: str, max_periods: int = 4) -> List[Tuple[str, str]]:
    """
    Single entry point for period selection.

    Args:
        xbrl: XBRL instance with reporting_periods and entity_info
        statement_type: 'BalanceSheet', 'IncomeStatement', 'CashFlowStatement', etc.
        max_periods: Maximum number of periods to return

    Returns:
        List of (period_key, period_label) tuples, most recent first
    """
    all_periods = xbrl.reporting_periods
    document_end_date = xbrl.period_of_report

    if not all_periods:
        logger.warning("No reporting periods available for %s", xbrl.entity_name)
        return []

    # Always drop periods ending after the document date first; this is what
    # prevents future-dated periods from leaking into a filing's statements.
    filtered_periods = _filter_by_document_date(all_periods, document_end_date)
    if not filtered_periods:
        logger.warning("No valid periods found after document date filtering for %s", xbrl.entity_name)
        # Fallback: unfiltered periods, capped at max_periods.
        return [(p['key'], p['label']) for p in all_periods[:max_periods]]

    try:
        # Statement-specific candidate selection.
        if statement_type == 'BalanceSheet':
            candidates = _select_balance_sheet_periods(filtered_periods, max_periods)
        else:  # Income / Cash Flow statements
            candidates = _select_duration_periods(filtered_periods, xbrl.entity_info, max_periods)

        # Keep only candidates that actually carry data.
        with_data = _filter_periods_with_sufficient_data(xbrl, candidates, statement_type)
        if with_data:
            return with_data
        logger.warning("No periods with sufficient data found for %s %s, returning all candidates", xbrl.entity_name, statement_type)
        return candidates
    except Exception as e:
        logger.error("Period selection failed for %s %s: %s", xbrl.entity_name, statement_type, e)
        # Final fallback: filtered periods (document date filter already applied).
        return [(p['key'], p['label']) for p in filtered_periods[:max_periods]]
def _filter_by_document_date(periods: List[Dict], document_end_date: Optional[str]) -> List[Dict]:
"""
Filter periods to only include those that end on or before the document date.
This prevents the future date bug where periods from 2026-2029 were selected
for a 2024 filing.
"""
if not document_end_date:
return periods
try:
doc_end_date = datetime.strptime(document_end_date, '%Y-%m-%d').date()
except (ValueError, TypeError):
logger.debug("Could not parse document end date: %s", document_end_date)
return periods
filtered_periods = []
for period in periods:
try:
if period['type'] == 'instant':
period_date = datetime.strptime(period['date'], '%Y-%m-%d').date()
if period_date <= doc_end_date:
filtered_periods.append(period)
else: # duration
period_end_date = datetime.strptime(period['end_date'], '%Y-%m-%d').date()
if period_end_date <= doc_end_date:
filtered_periods.append(period)
except (ValueError, TypeError):
# If we can't parse the period date, include it to be safe
filtered_periods.append(period)
return filtered_periods
def _select_balance_sheet_periods(periods: List[Dict], max_periods: int) -> List[Tuple[str, str]]:
    """
    Select instant periods for balance sheet statements.

    Balance sheets are point-in-time snapshots, so only instant periods
    qualify. A wide net of recent instants is returned so downstream data
    quality filtering can pick out fiscal-year-end snapshots (Issue #464:
    checking only the first 4 periods missed prior fiscal year ends).
    """
    instants = [p for p in periods if p['type'] == 'instant']
    if not instants:
        logger.warning("No instant periods found for balance sheet")
        return []

    # Most recent first.
    instants = _sort_periods_by_date(instants, 'instant')

    # Consider up to 10 candidates: quarterly/mid-year instants often carry
    # little data, so the wider net lets filtering find fiscal year ends.
    candidate_count = min(10, len(instants))
    cap = max_periods * 3  # Check up to 3x max_periods

    selected = []
    for period in instants[:candidate_count]:
        selected.append((period['key'], period['label']))
        if len(selected) >= cap:
            break
    return selected
def _select_duration_periods(periods: List[Dict], entity_info: Dict[str, Any], max_periods: int) -> List[Tuple[str, str]]:
    """
    Select duration periods for income/cash flow statements with fiscal intelligence.

    Consolidates the fiscal year logic from the legacy system: annual filings
    prefer truly annual periods ranked by fiscal-year-end alignment; quarterly
    filings (or annual filings lacking annual periods) fall through to the
    quarterly selection logic.
    """
    durations = [p for p in periods if p['type'] == 'duration']
    if not durations:
        logger.warning("No duration periods found for income/cash flow statement")
        return []

    fiscal_period = entity_info.get('fiscal_period', 'FY')

    # Annual reports: rank annual periods by fiscal alignment.
    if fiscal_period == 'FY':
        annual = _get_annual_periods(durations)
        if annual:
            ranked = _score_fiscal_alignment(
                annual,
                entity_info.get('fiscal_year_end_month'),
                entity_info.get('fiscal_year_end_day'),
            )
            return [(p['key'], p['label']) for p in ranked[:max_periods]]

    # Quarterly reports, or annual filings without annual periods.
    return _select_quarterly_periods(durations, max_periods)
def _select_quarterly_periods(duration_periods: List[Dict], max_periods: int) -> List[Tuple[str, str]]:
    """
    Select quarterly periods with intelligent investor-focused logic.

    For quarterly filings, investors typically want:
    1. Current quarter (most recent quarterly period)
    2. Same quarter from prior year (YoY comparison)
    3. Year-to-date current year (6-month, 9-month YTD)
    4. Year-to-date prior year (comparative YTD)

    Issue #464 Fix: Cast wider net by checking more quarterly periods and returning
    more candidates (max_periods * 3) to let data quality filtering select the best ones.
    This mirrors the successful Balance Sheet fix from v4.20.1.
    """
    if not duration_periods:
        return []
    # Categorize periods by duration to identify types
    quarterly_periods = []  # ~90 days (80-100 inclusive)
    ytd_periods = []  # 150-285 days (semi-annual through 9-month YTD)
    for period in duration_periods:
        try:
            start_date = datetime.strptime(period['start_date'], '%Y-%m-%d').date()
            end_date = datetime.strptime(period['end_date'], '%Y-%m-%d').date()
            duration_days = (end_date - start_date).days
            if 80 <= duration_days <= 100:  # Quarterly
                quarterly_periods.append(period)
            elif 150 <= duration_days <= 285:  # YTD (semi-annual to 9-month)
                ytd_periods.append(period)
            # Skip periods that are too short (<80 days) or in the
            # uncategorized gaps (101-149 days, >285 days)
        except (ValueError, TypeError, KeyError):
            # Malformed periods are silently dropped from both buckets
            continue
    # Sort periods by end date (most recent first)
    quarterly_periods = _sort_periods_by_date(quarterly_periods, 'duration')
    ytd_periods = _sort_periods_by_date(ytd_periods, 'duration')
    selected_periods = []
    # 1. Add current quarter (most recent quarterly period)
    if quarterly_periods:
        current_quarter = quarterly_periods[0]
        selected_periods.append((current_quarter['key'], current_quarter['label']))
        # 2. Find same quarter from prior year for YoY comparison
        # Issue #464: Check more quarterly periods to find prior year matches
        try:
            current_end = datetime.strptime(current_quarter['end_date'], '%Y-%m-%d').date()
            target_year = current_end.year - 1
            # Check up to 12 quarterly periods instead of just a few
            check_count = min(12, len(quarterly_periods) - 1)
            for period in quarterly_periods[1:check_count + 1]:
                period_end = datetime.strptime(period['end_date'], '%Y-%m-%d').date()
                # Same quarter if same month and within 15 days, previous year
                if (period_end.year == target_year and
                        period_end.month == current_end.month and
                        abs(period_end.day - current_end.day) <= 15):
                    selected_periods.append((period['key'], period['label']))
                    break
        except (ValueError, TypeError, KeyError):
            # Unparseable dates: skip the YoY match rather than fail
            pass
    # 3. Add current year YTD (most recent YTD period)
    if ytd_periods:
        current_ytd = ytd_periods[0]
        # Avoid duplicates - check if this YTD period is already selected as quarterly
        if not any(current_ytd['key'] == key for key, _ in selected_periods):
            selected_periods.append((current_ytd['key'], current_ytd['label']))
    # 4. Add additional YTD candidates for data quality filtering to choose from
    # Issue #464: Cast wider net instead of strict matching to handle fiscal year differences
    # Example: AAPL current YTD ends June 29, prior YTD ends July 1 (different months)
    # Let data quality filtering choose the best periods based on fact counts
    if len(selected_periods) < max_periods * 3:
        added_keys = {key for key, _ in selected_periods}
        check_count = min(8, len(ytd_periods) - 1)
        for period in ytd_periods[1:check_count + 1]:  # Skip first (already added as current_ytd)
            if period['key'] not in added_keys and len(selected_periods) < max_periods * 3:
                selected_periods.append((period['key'], period['label']))
                added_keys.add(period['key'])
    # If we still don't have enough periods, add other quarterly periods
    # Issue #464: Check more periods and return more candidates
    if len(selected_periods) < max_periods * 3:
        added_keys = {key for key, _ in selected_periods}
        check_count = min(12, len(quarterly_periods))
        for period in quarterly_periods[:check_count]:
            if period['key'] not in added_keys and len(selected_periods) < max_periods * 3:
                selected_periods.append((period['key'], period['label']))
                added_keys.add(period['key'])
    # Issue #464: Return max_periods * 3 candidates instead of just max_periods
    # Let data quality filtering in _filter_periods_with_sufficient_data choose the best ones
    # This mirrors the successful Balance Sheet fix from v4.20.1 (line 128)
    return selected_periods[:max_periods * 3]
def _get_annual_periods(duration_periods: List[Dict]) -> List[Dict]:
    """
    Filter duration periods to only include truly annual periods (>300 days).

    This consolidates the 300-day logic that was duplicated across both systems.

    Args:
        duration_periods: Duration period dicts carrying 'start_date'/'end_date' keys.

    Returns:
        The subset judged annual by _is_annual_period, original order preserved.
    """
    # Idiomatic comprehension replaces the manual filter-and-append loop.
    return [period for period in duration_periods if _is_annual_period(period)]
def _is_annual_period(period: Dict) -> bool:
"""
Determine if a period is truly annual (300-400 days).
Annual periods should be approximately one year, allowing for:
- Leap years (366 days)
- Slight variations in fiscal year end dates
- But rejecting multi-year cumulative periods
"""
try:
start_date = datetime.strptime(period['start_date'], '%Y-%m-%d').date()
end_date = datetime.strptime(period['end_date'], '%Y-%m-%d').date()
duration_days = (end_date - start_date).days
# Annual periods should be between 300-400 days
# This rejects quarterly (~90 days) and multi-year (>400 days) periods
return 300 < duration_days <= 400
except (ValueError, TypeError, KeyError):
return False
def _score_fiscal_alignment(periods: List[Dict], fiscal_month: Optional[int], fiscal_day: Optional[int]) -> List[Dict]:
    """
    Score and sort periods based on fiscal year alignment.

    This preserves the sophisticated fiscal intelligence from the legacy system.
    Each returned period is a copy carrying a 'fiscal_score' key.
    """
    if fiscal_month is None or fiscal_day is None:
        # Without fiscal-year-end info, fall back to plain recency ordering.
        return _sort_periods_by_date(periods, 'duration')

    ranked = []
    for entry in periods:
        candidate = entry.copy()
        try:
            ending = datetime.strptime(entry['end_date'], '%Y-%m-%d').date()
            candidate['fiscal_score'] = _calculate_fiscal_alignment_score(ending, fiscal_month, fiscal_day)
        except (ValueError, TypeError, KeyError):
            # Unscorable periods sink toward the bottom with a zero score.
            candidate['fiscal_score'] = 0
        ranked.append(candidate)

    # Highest fiscal score first; ties broken by most recent end date.
    ranked.sort(key=lambda p: (p.get('fiscal_score', 0), p.get('end_date', '')), reverse=True)
    return ranked
def _calculate_fiscal_alignment_score(end_date: date, fiscal_month: int, fiscal_day: int) -> int:
"""
Calculate fiscal year alignment score (0-100).
Consolidated from the legacy system's fiscal alignment logic.
"""
if end_date.month == fiscal_month and end_date.day == fiscal_day:
return 100 # Perfect fiscal year end match
elif end_date.month == fiscal_month and abs(end_date.day - fiscal_day) <= 15:
return 75 # Same month, within 15 days
elif abs(end_date.month - fiscal_month) <= 1:
return 50 # Adjacent month
else:
return 25 # Different quarter
def _sort_periods_by_date(periods: List[Dict], period_type: str) -> List[Dict]:
"""
Sort periods by date (most recent first).
Handles both instant and duration periods correctly.
"""
def get_sort_key(period):
try:
if period_type == 'instant':
return datetime.strptime(period['date'], '%Y-%m-%d').date()
else: # duration
return datetime.strptime(period['end_date'], '%Y-%m-%d').date()
except (ValueError, TypeError, KeyError):
return date.min # Sort problematic periods to the end
return sorted(periods, key=get_sort_key, reverse=True)
def _calculate_dynamic_thresholds(facts_by_period: Dict, statement_type: str) -> int:
"""
Calculate minimum fact threshold based on actual data distribution.
This adapts to company size - small companies get lower thresholds,
large companies maintain high standards.
Args:
facts_by_period: Pre-grouped facts by period key
statement_type: Statement type to analyze
Returns:
Minimum fact count threshold for this company/statement
"""
# Collect fact counts for this statement type across all periods
statement_fact_counts = []
for period_key, period_facts in facts_by_period.items():
statement_facts = [
f for f in period_facts
if f.get('statement_type') == statement_type
]
if statement_facts:
statement_fact_counts.append(len(statement_facts))
if not statement_fact_counts:
# No data for this statement type - use conservative default
return 10
# Sort to find the richest periods
statement_fact_counts.sort(reverse=True)
# Strategy: Use 40% of the richest period's fact count as minimum
# This adapts to company size while still filtering sparse periods
richest_period_facts = statement_fact_counts[0]
# Calculate adaptive threshold
adaptive_threshold = int(richest_period_facts * 0.4)
# Apply floor and ceiling
MIN_FLOOR = 10 # Never go below 10 facts
MAX_CEILING = {
'BalanceSheet': 40,
'IncomeStatement': 25,
'CashFlowStatement': 20
}
threshold = max(MIN_FLOOR, min(adaptive_threshold, MAX_CEILING.get(statement_type, 30)))
logger.debug("Dynamic threshold for %s: %d (richest period: %d facts, 40%% = %d)",
statement_type, threshold, richest_period_facts, adaptive_threshold)
return threshold
def _calculate_dynamic_concept_diversity(facts_by_period: Dict, statement_type: str) -> int:
"""
Calculate minimum concept diversity based on actual data.
Returns:
Minimum unique concept count for this company/statement
"""
if statement_type != 'BalanceSheet':
return 0 # Only apply to Balance Sheets for now
# Find maximum concept diversity across periods
max_concepts = 0
for period_facts in facts_by_period.values():
statement_facts = [
f for f in period_facts
if f.get('statement_type') == statement_type
]
unique_concepts = len(set(f.get('concept') for f in statement_facts if f.get('concept')))
max_concepts = max(max_concepts, unique_concepts)
# Require 30% of maximum concept diversity, but at least 5
diversity_threshold = max(5, int(max_concepts * 0.3))
logger.debug("Dynamic concept diversity for %s: %d (max concepts: %d)",
statement_type, diversity_threshold, max_concepts)
return diversity_threshold
# Enhanced essential concept patterns with multiple variations.
#
# Maps statement type -> list of pattern groups. Each inner list represents one
# essential concept (e.g. Assets, Liabilities, Equity) spelled several ways as
# US-GAAP tag names; matching ANY pattern within a group counts the whole group
# as present. Matching is case-insensitive substring matching (performed by
# _check_essential_concepts_flexible).
ESSENTIAL_CONCEPT_PATTERNS: Dict[str, List[List[str]]] = {
    'BalanceSheet': [
        # Pattern groups - any match in group counts as finding that concept
        ['Assets', 'AssetsCurrent', 'AssetsNoncurrent', 'AssetsFairValueDisclosure'],
        ['Liabilities', 'LiabilitiesCurrent', 'LiabilitiesNoncurrent', 'LiabilitiesAndStockholdersEquity'],
        ['Equity', 'StockholdersEquity', 'ShareholdersEquity', 'PartnersCapital',
         'MembersEquity', 'ShareholdersEquityIncludingPortionAttributableToNoncontrollingInterest']
    ],
    'IncomeStatement': [
        ['Revenue', 'Revenues', 'SalesRevenue', 'SalesRevenueNet', 'RevenueFromContractWithCustomer'],
        ['NetIncome', 'NetIncomeLoss', 'ProfitLoss', 'NetIncomeLossAvailableToCommonStockholdersBasic'],
        ['OperatingIncome', 'OperatingIncomeLoss', 'IncomeLossFromOperations']
    ],
    'CashFlowStatement': [
        ['OperatingCashFlow', 'NetCashProvidedByUsedInOperatingActivities',
         'CashProvidedByUsedInOperatingActivities'],
        ['InvestingCashFlow', 'NetCashProvidedByUsedInInvestingActivities',
         'CashProvidedByUsedInInvestingActivities'],
        ['FinancingCashFlow', 'NetCashProvidedByUsedInFinancingActivities',
         'CashProvidedByUsedInFinancingActivities']
    ]
}
def _check_essential_concepts_flexible(statement_facts: List[Dict], statement_type: str) -> int:
    """
    Check for essential concepts using flexible pattern matching.

    Returns count of essential concept groups found (not individual patterns).
    """
    concept_groups = ESSENTIAL_CONCEPT_PATTERNS.get(statement_type, [])
    if not concept_groups:
        return 0

    # Lower-case every fact concept once so each pattern test is a cheap substring scan.
    fact_concepts = [f.get('concept', '').lower() for f in statement_facts if f.get('concept')]

    groups_found = 0
    # A group counts as found when ANY of its pattern spellings matches.
    for pattern_group in concept_groups:
        for pattern in pattern_group:
            needle = pattern.lower()
            if any(needle in concept for concept in fact_concepts):
                logger.debug("Essential concept matched: %s (from group %s)",
                             pattern, pattern_group[0])
                groups_found += 1
                break

    return groups_found
def _filter_periods_with_sufficient_data(xbrl, candidate_periods: List[Tuple[str, str]], statement_type: str) -> List[Tuple[str, str]]:
    """
    Filter periods to only include those with sufficient financial data.

    This prevents selection of periods that exist in the taxonomy but have
    no meaningful financial facts (like the Alphabet 2019 case).

    Issue #464: Added statement-specific fact count checks and concept diversity
    requirements to prevent showing sparse historical periods with only 1-2 concepts.

    Performance optimization: Retrieves all facts once and works with in-memory data
    instead of creating 40+ DataFrames per statement rendering.
    """
    MIN_FACTS_THRESHOLD = 10  # Minimum facts needed for a period to be considered viable

    # PERFORMANCE FIX: Get all facts once at the start (single operation)
    all_facts = xbrl.facts.get_facts()  # Returns List[Dict] - fast!

    # Pre-group facts by period_key (O(n) operation, done once)
    facts_by_period = {}
    for fact in all_facts:
        period_key = fact.get('period_key')
        if period_key:
            facts_by_period.setdefault(period_key, []).append(fact)

    # Pre-group facts by statement type within each period
    statement_facts_by_period = {
        period_key: [f for f in period_facts if f.get('statement_type') == statement_type]
        for period_key, period_facts in facts_by_period.items()
    }

    # DYNAMIC THRESHOLDS: Calculate based on this company's data distribution
    statement_min_facts = _calculate_dynamic_thresholds(facts_by_period, statement_type)
    min_concept_diversity = _calculate_dynamic_concept_diversity(facts_by_period, statement_type)

    # Get essential concept groups for this statement type
    required_concept_groups = len(ESSENTIAL_CONCEPT_PATTERNS.get(statement_type, []))

    periods_with_data = []
    # Loop through candidates using pre-computed groups (no DataFrame conversions!)
    for period_key, period_label in candidate_periods:
        try:
            # Get pre-grouped facts (fast list access, not DataFrame query)
            statement_facts = statement_facts_by_period.get(period_key, [])
            period_facts = facts_by_period.get(period_key, [])
            statement_fact_count = len(statement_facts)
            total_fact_count = len(period_facts)

            # Check statement-specific threshold
            if statement_fact_count < statement_min_facts:
                logger.debug("Period %s has insufficient %s facts (%d < %d)",
                             period_label, statement_type, statement_fact_count, statement_min_facts)
                continue

            # Fallback check for total facts
            if total_fact_count < MIN_FACTS_THRESHOLD:
                logger.debug("Period %s has insufficient facts (%d < %d)",
                             period_label, total_fact_count, MIN_FACTS_THRESHOLD)
                continue

            # Compute concept diversity once; previously this set was built twice
            # (once for the Balance Sheet check, once again for success logging).
            unique_concepts = len({f.get('concept') for f in statement_facts if f.get('concept')})

            # Check concept diversity (Issue #464)
            if statement_type == 'BalanceSheet' and unique_concepts < min_concept_diversity:
                logger.debug("Period %s lacks concept diversity (%d < %d unique concepts)",
                             period_label, unique_concepts, min_concept_diversity)
                continue

            # FLEXIBLE CONCEPT MATCHING: Check essential concepts using pattern groups
            essential_concept_count = _check_essential_concepts_flexible(statement_facts, statement_type)

            # Require at least half the essential concept groups
            min_essential_required = max(1, required_concept_groups // 2)
            if essential_concept_count >= min_essential_required:
                periods_with_data.append((period_key, period_label))
                logger.debug("Period %s has sufficient data: %d %s facts, %d unique concepts, %d/%d essential concepts",
                             period_label, statement_fact_count, statement_type,
                             unique_concepts,
                             essential_concept_count, required_concept_groups)
            else:
                logger.debug("Period %s lacks essential concepts: %d/%d present",
                             period_label, essential_concept_count, required_concept_groups)

        except Exception as e:
            logger.warning("Error checking data for period %s: %s", period_label, e)
            # Be more conservative - don't include if we can't verify
            continue

    return periods_with_data
# Legacy compatibility functions - to be removed after migration
def determine_periods_to_display(xbrl_instance, statement_type: str) -> List[Tuple[str, str]]:
    """Legacy compatibility wrapper.

    Deprecated: delegates directly to select_periods(); migrate callers.
    """
    logger.warning("Using legacy compatibility wrapper - update to use select_periods() directly")
    return select_periods(xbrl_instance, statement_type)
def select_smart_periods(xbrl, statement_type: str, max_periods: int = 4) -> List[Tuple[str, str]]:
    """Legacy compatibility wrapper.

    Deprecated: delegates directly to select_periods(); migrate callers.
    """
    logger.warning("Using legacy compatibility wrapper - update to use select_periods() directly")
    return select_periods(xbrl, statement_type, max_periods)

View File

@@ -0,0 +1,693 @@
"""
Period handling functionality for XBRL statements.
This module provides functions for handling periods in XBRL statements, including:
- Determining available period views for different statement types
- Selecting appropriate periods for display
- Handling fiscal year and quarter information
"""
from datetime import date, datetime
from typing import Any, Dict, List, Optional, Tuple
# Configuration for different statement types.
#
# Schema per statement type:
#   period_type:  'instant' (point-in-time, e.g. balance sheets) or 'duration'
#   max_periods:  default maximum number of periods to display
#   allow_annual_comparison: enables fiscal-year-aligned views on annual reports
#   views:        predefined period-view definitions, each with:
#       name / description:   user-facing labels
#       max_periods:          cap on periods included in the view
#       requires_min_periods: minimum available periods for the view to exist
#       annual_only:          view only offered for annual reports (optional)
#       mixed_view:           view mixes YTD and quarterly periods (optional)
STATEMENT_TYPE_CONFIG: Dict[str, Any] = {
    'BalanceSheet': {
        'period_type': 'instant',
        'max_periods': 3,
        'allow_annual_comparison': True,
        'views': [
            {
                'name': 'Three Recent Periods',
                'description': 'Shows three most recent reporting periods',
                'max_periods': 3,
                'requires_min_periods': 3
            },
            {
                'name': 'Current vs. Previous Period',
                'description': 'Shows the current period and the previous period',
                'max_periods': 2,
                'requires_min_periods': 1
            },
            {
                'name': 'Three-Year Annual Comparison',
                'description': 'Shows three fiscal years for comparison',
                'max_periods': 3,
                'requires_min_periods': 3,
                'annual_only': True
            }
        ]
    },
    'IncomeStatement': {
        'period_type': 'duration',
        'max_periods': 3,
        'allow_annual_comparison': True,
        'views': [
            {
                'name': 'Three Recent Periods',
                'description': 'Shows three most recent reporting periods',
                'max_periods': 3,
                'requires_min_periods': 3
            },
            {
                'name': 'YTD and Quarterly Breakdown',
                'description': 'Shows YTD figures and quarterly breakdown',
                'max_periods': 5,
                'requires_min_periods': 2,
                'mixed_view': True
            }
        ]
    },
    'StatementOfEquity': {
        'period_type': 'duration',
        'max_periods': 3,
        'views': [
            {
                'name': 'Three Recent Periods',
                'description': 'Shows three most recent reporting periods',
                'max_periods': 3,
                'requires_min_periods': 1
            }
        ]
    },
    'ComprehensiveIncome': {
        'period_type': 'duration',
        'max_periods': 3,
        'views': [
            {
                'name': 'Three Recent Periods',
                'description': 'Shows three most recent reporting periods',
                'max_periods': 3,
                'requires_min_periods': 1
            }
        ]
    },
    'CoverPage': {
        'period_type': 'instant',
        'max_periods': 1,
        'views': [
            {
                'name': 'Current Period',
                'description': 'Shows the current reporting period',
                'max_periods': 1,
                'requires_min_periods': 1
            }
        ]
    },
    'Notes': {
        'period_type': 'instant',
        'max_periods': 1,
        'views': [
            {
                'name': 'Current Period',
                'description': 'Shows the current reporting period',
                'max_periods': 1,
                'requires_min_periods': 1
            }
        ]
    }
}
def sort_periods(periods: List[Dict], period_type: str) -> List[Dict]:
    """Sort periods by date, with most recent first.

    Instant periods sort on 'date'; duration periods sort on the pair
    ('end_date', 'start_date'). ISO date strings compare correctly as text.
    """
    def sort_key(period):
        if period_type == 'instant':
            return period['date']
        return (period['end_date'], period['start_date'])

    return sorted(periods, key=sort_key, reverse=True)
def filter_periods_by_document_end_date(periods: List[Dict], document_period_end_date: str, period_type: str) -> List[Dict]:
    """Filter periods to only include those that end on or before the document period end date.

    Args:
        periods: Period dicts; instant periods carry 'date', duration periods 'end_date'.
        document_period_end_date: 'YYYY-MM-DD' string (may be None or empty).
        period_type: 'instant' or 'duration' - selects which date field to compare.

    Returns:
        Periods ending on/before the document date. If the document date (or a
        period's own date) cannot be parsed, the affected periods are kept to be safe.
    """
    if not document_period_end_date:
        return periods

    try:
        doc_end_date = datetime.strptime(document_period_end_date, '%Y-%m-%d').date()
    except (ValueError, TypeError):
        # If we can't parse the document end date, return all periods
        return periods

    date_field = 'date' if period_type == 'instant' else 'end_date'
    filtered_periods = []
    for period in periods:
        try:
            period_date = datetime.strptime(period[date_field], '%Y-%m-%d').date()
            if period_date <= doc_end_date:
                filtered_periods.append(period)
        except (ValueError, TypeError, KeyError):
            # Robustness fix: also catch KeyError so a period missing its date
            # field is included "to be safe" instead of crashing the whole filter.
            filtered_periods.append(period)
    return filtered_periods
def filter_periods_by_type(periods: List[Dict], period_type: str) -> List[Dict]:
    """Return only the periods whose 'type' field equals *period_type*.

    period_type is either 'instant' or 'duration'.
    """
    return [period for period in periods if period['type'] == period_type]
def calculate_fiscal_alignment_score(end_date: date, fiscal_month: int, fiscal_day: int) -> int:
    """Calculate how well a date aligns with fiscal year end.

    Args:
        end_date: Period end date to evaluate.
        fiscal_month: Fiscal-year-end month (1-12).
        fiscal_day: Fiscal-year-end day of month.

    Returns:
        100 for an exact month/day match, 75 for the same month within 15
        days, 50 for an adjacent month within 15 days, otherwise 0.
    """
    # Annotation fix: the parameter was hinted as `datetime.date`, which (with
    # `from datetime import datetime`) is the unbound method, not the date type.
    if end_date.month == fiscal_month and end_date.day == fiscal_day:
        return 100
    if end_date.month == fiscal_month and abs(end_date.day - fiscal_day) <= 15:
        return 75
    if abs(end_date.month - fiscal_month) <= 1 and abs(end_date.day - fiscal_day) <= 15:
        return 50
    return 0
def generate_period_view(view_config: Dict[str, Any], periods: List[Dict], is_annual: bool = False) -> Optional[Dict[str, Any]]:
    """Generate a period view based on configuration and available periods.

    Args:
        view_config: Configuration for the view (from STATEMENT_TYPE_CONFIG)
        periods: List of periods to choose from
        is_annual: Whether this is an annual report

    Returns:
        Dictionary with view name, description, and period keys if view is valid,
        None if view cannot be generated with available periods
    """
    # Guard clauses: too few periods, or an annual-only view on a non-annual report.
    if len(periods) < view_config['requires_min_periods']:
        return None
    if view_config.get('annual_only', False) and not is_annual:
        return None

    # Slicing clamps automatically when fewer periods are available than the cap.
    selected = periods[:view_config['max_periods']]
    return {
        'name': view_config['name'],
        'description': view_config['description'],
        'period_keys': [period['key'] for period in selected]
    }
def generate_mixed_view(view_config: Dict[str, Any], ytd_periods: List[Dict],
                        quarterly_periods: List[Dict]) -> Optional[Dict[str, Any]]:
    """Generate a mixed view combining YTD and quarterly periods.

    Args:
        view_config: Configuration for the view
        ytd_periods: List of year-to-date periods
        quarterly_periods: List of quarterly periods

    Returns:
        Dictionary with view configuration if valid, None otherwise
    """
    if not ytd_periods or not quarterly_periods:
        return None

    # Lead with the current YTD period, then up to four recent quarters (no duplicates).
    combined_keys = [ytd_periods[0]['key']]
    for quarter in quarterly_periods[:4]:
        key = quarter['key']
        if key not in combined_keys:
            combined_keys.append(key)

    if len(combined_keys) < view_config['requires_min_periods']:
        return None
    return {
        'name': view_config['name'],
        'description': view_config['description'],
        'period_keys': combined_keys[:view_config['max_periods']]
    }
def get_period_views(xbrl_instance, statement_type: str) -> List[Dict[str, Any]]:
    """
    Get available period views for a statement type.

    Args:
        xbrl_instance: XBRL instance with context and entity information
        statement_type: Type of statement to get period views for

    Returns:
        List of period view options with name, description, and period keys
    """
    config = STATEMENT_TYPE_CONFIG.get(statement_type)
    if not config:
        return []

    # Entity info drives annual vs. quarterly handling.
    entity_info = xbrl_instance.entity_info
    is_annual = entity_info.get('fiscal_period') == 'FY'

    # Narrow to the right period type, drop periods after the report date, newest first.
    period_type = config['period_type']
    candidates = filter_periods_by_type(xbrl_instance.reporting_periods, period_type)
    candidates = filter_periods_by_document_end_date(
        candidates, xbrl_instance.period_of_report, period_type)
    candidates = sort_periods(candidates, period_type)

    # For annual reports, collect periods aligned with the fiscal year end
    # (consumed by 'annual_only' views below).
    fiscal_aligned = []
    if config.get('allow_annual_comparison') and is_annual:
        fiscal_month = entity_info.get('fiscal_year_end_month')
        fiscal_day = entity_info.get('fiscal_year_end_day')
        if fiscal_month is not None and fiscal_day is not None:
            field = 'date' if period_type == 'instant' else 'end_date'
            for candidate in candidates:
                try:
                    end_date = datetime.strptime(candidate[field], '%Y-%m-%d').date()
                    score = calculate_fiscal_alignment_score(end_date, fiscal_month, fiscal_day)
                except (ValueError, TypeError):
                    continue
                # Any positive alignment score is good enough for a view.
                if score > 0:
                    fiscal_aligned.append(candidate)

    views = []
    for view_config in config.get('views', []):
        if view_config.get('mixed_view'):
            # Mixed views combine YTD and quarterly periods.
            view = generate_mixed_view(
                view_config,
                [p for p in candidates if p.get('ytd')],
                [p for p in candidates if p.get('quarterly')])
        elif view_config.get('annual_only'):
            # Views restricted to fiscal-year-aligned annual periods.
            view = generate_period_view(view_config, fiscal_aligned, is_annual)
        else:
            # Standard views using all periods.
            view = generate_period_view(view_config, candidates, is_annual)
        if view:
            views.append(view)
    return views
def determine_periods_to_display(
xbrl_instance,
statement_type: str,
period_filter: Optional[str] = None,
period_view: Optional[str] = None
) -> List[Tuple[str, str]]:
"""
Determine which periods should be displayed for a statement.
Uses smart period selection, which balances investor needs
with data availability for optimal financial analysis.
Args:
xbrl_instance: XBRL instance with context and entity information
statement_type: Type of statement ('BalanceSheet', 'IncomeStatement', etc.)
period_filter: Optional period key to filter by specific reporting period
period_view: Optional name of a predefined period view
Returns:
List of tuples with period keys and labels to display
"""
periods_to_display = []
# If a specific period is requested, use only that
if period_filter:
for period in xbrl_instance.reporting_periods:
if period['key'] == period_filter:
periods_to_display.append((period_filter, period['label']))
break
return periods_to_display
# If a period view is specified, use that
if period_view:
available_views = get_period_views(xbrl_instance, statement_type)
matching_view = next((view for view in available_views if view['name'] == period_view), None)
if matching_view:
for period_key in matching_view['period_keys']:
for period in xbrl_instance.reporting_periods:
if period['key'] == period_key:
periods_to_display.append((period_key, period['label']))
break
return periods_to_display
# Use unified period selection system with fallback to legacy logic
try:
from edgar.xbrl.period_selector import select_periods
return select_periods(xbrl_instance, statement_type)
except Exception as e:
# Log the error and fall back to legacy logic
import logging
logging.warning("Unified period selection failed, using legacy logic: %s", e)
# Continue to legacy logic below
# If no specific periods requested, use default logic based on statement type
all_periods = xbrl_instance.reporting_periods
entity_info = xbrl_instance.entity_info
fiscal_period_focus = entity_info.get('fiscal_period')
document_period_end_date = xbrl_instance.period_of_report
# Filter periods by statement type
if statement_type == 'BalanceSheet':
instant_periods = filter_periods_by_type(all_periods, 'instant')
# Filter by document period end date to exclude periods after the reporting period
instant_periods = filter_periods_by_document_end_date(instant_periods, document_period_end_date, 'instant')
instant_periods = sort_periods(instant_periods, 'instant')
# Get fiscal information for better period matching
fiscal_period_focus = entity_info.get('fiscal_period')
fiscal_year_focus = entity_info.get('fiscal_year')
fiscal_year_end_month = entity_info.get('fiscal_year_end_month')
fiscal_year_end_day = entity_info.get('fiscal_year_end_day')
if instant_periods:
# Take latest instant period that is not later than document_period_end_date
current_period = instant_periods[0] # Most recent
period_key = current_period['key']
periods_to_display.append((period_key, current_period['label']))
# Try to find appropriate comparison period
try:
current_date = datetime.strptime(current_period['date'], '%Y-%m-%d').date()
# Use fiscal information if available for better matching
if fiscal_year_end_month is not None and fiscal_year_end_day is not None:
# Check if this is a fiscal year end report
is_fiscal_year_end = False
if fiscal_period_focus == 'FY' or (
current_date.month == fiscal_year_end_month and
abs(current_date.day - fiscal_year_end_day) <= 7):
is_fiscal_year_end = True
if is_fiscal_year_end and fiscal_year_focus:
# For fiscal year end, find the previous fiscal year end period
prev_fiscal_year = int(fiscal_year_focus) - 1 if isinstance(fiscal_year_focus,
(int, str)) and str(
fiscal_year_focus).isdigit() else current_date.year - 1
# Look for a comparable period from previous fiscal year
for period in instant_periods[1:]: # Skip the current one
try:
period_date = datetime.strptime(period['date'], '%Y-%m-%d').date()
# Check if this period is from the previous fiscal year and around fiscal year end
if (period_date.year == prev_fiscal_year and
period_date.month == fiscal_year_end_month and
abs(period_date.day - fiscal_year_end_day) <= 15):
periods_to_display.append((period['key'], period['label']))
break
except (ValueError, TypeError):
continue
# If no appropriate period found yet, try generic date-based comparison
if len(periods_to_display) == 1:
# Look for a period from previous year with similar date pattern
prev_year = current_date.year - 1
for period in instant_periods[1:]: # Skip the current one
try:
period_date = datetime.strptime(period['date'], '%Y-%m-%d').date()
# If from previous year with similar month/day
if period_date.year == prev_year:
periods_to_display.append((period['key'], period['label']))
break
except (ValueError, TypeError):
continue
# Only add additional comparable periods (up to a total of 3)
# For annual reports, only add periods that are also fiscal year ends
is_annual_report = (fiscal_period_focus == 'FY')
added_period_keys = [key for key, _ in periods_to_display]
for period in instant_periods[1:]: # Skip current period
if len(periods_to_display) >= 3:
break # Stop when we have 3 periods
# For annual reports, only add periods that are fiscal year ends
# ENHANCED: Ensure we're selecting true annual period ends, not quarterly
if is_annual_report and fiscal_year_end_month is not None and fiscal_year_end_day is not None:
try:
# Check if this period is close to the fiscal year end
period_date = datetime.strptime(period['date'], '%Y-%m-%d').date()
# STRICT CHECK: For annual reports, be more selective
# The period should be within a reasonable range of fiscal year end
is_fiscal_year_end = (
period_date.month == fiscal_year_end_month and
abs(period_date.day - fiscal_year_end_day) <= 15 # Allow some flexibility
)
# Additional check: Ensure this is approximately 1 year before previous periods
if is_fiscal_year_end and len(periods_to_display) > 0:
prev_date_str = periods_to_display[-1][0].split('_')[-1] if '_' in periods_to_display[-1][0] else None
if prev_date_str:
try:
prev_date = datetime.strptime(prev_date_str, '%Y-%m-%d').date()
year_diff = abs((prev_date - period_date).days)
# Should be approximately 365 days apart (allow 350-380 range)
if not (350 <= year_diff <= 380):
is_fiscal_year_end = False
except (ValueError, TypeError):
pass
# Only include this period if it's a fiscal year end
if not is_fiscal_year_end:
continue # Skip non-fiscal-year-end periods
except (ValueError, TypeError):
continue # Skip periods with invalid dates
# Don't add periods we've already added
period_key = period['key']
if period_key not in added_period_keys:
periods_to_display.append((period_key, period['label']))
except (ValueError, TypeError):
# If date parsing failed, still try to select appropriate periods
# For annual reports, we should only show fiscal year end periods
is_annual_report = (fiscal_period_focus == 'FY')
added_count = 0
for i, period in enumerate(instant_periods):
if i == 0:
continue # Skip first period which should already be added
if added_count >= 2: # Already added 2 more (for a total of 3)
break
# For annual reports, only add periods that are close to fiscal year end
if (is_annual_report and fiscal_year_end_month is not None and
fiscal_year_end_day is not None):
try:
period_date = datetime.strptime(period['date'], '%Y-%m-%d').date()
# Only add periods close to fiscal year end
if (period_date.month != fiscal_year_end_month or
abs(period_date.day - fiscal_year_end_day) > 15):
continue # Skip periods that aren't fiscal year ends
except (ValueError, TypeError):
continue # Skip periods with invalid dates
periods_to_display.append((period['key'], period['label']))
added_count += 1
elif statement_type in ['IncomeStatement', 'CashFlowStatement']:
duration_periods = filter_periods_by_type(all_periods, 'duration')
# Filter by document period end date to exclude periods after the reporting period
duration_periods = filter_periods_by_document_end_date(duration_periods, document_period_end_date, 'duration')
duration_periods = sort_periods(duration_periods, 'duration')
if duration_periods:
# For annual reports, prioritize annual periods
if fiscal_period_focus == 'FY':
# Get fiscal year end information if available
fiscal_year_end_month = entity_info.get('fiscal_year_end_month')
fiscal_year_end_day = entity_info.get('fiscal_year_end_day')
# First pass: Find all periods that are approximately a year long
# CRITICAL FIX: Apply strict duration filtering to ensure we only get annual periods
# Some facts are marked as FY but are actually quarterly (90 days vs 363+ days)
candidate_annual_periods = []
for period in duration_periods:
try:
start_date = datetime.strptime(period['start_date'], '%Y-%m-%d').date()
end_date = datetime.strptime(period['end_date'], '%Y-%m-%d').date()
days = (end_date - start_date).days
# STRICT CHECK: Annual periods must be > 300 days
# This filters out quarterly periods incorrectly marked as FY
if days > 300: # Truly annual period (not quarterly)
# Add a score to each period for later sorting
# Default score is 0 (will be increased for fiscal year matches)
period_with_score = period.copy()
period_with_score['fiscal_alignment_score'] = 0
period_with_score['duration_days'] = days # Store for debugging
candidate_annual_periods.append(period_with_score)
except (ValueError, TypeError):
continue
# Second pass: Score periods based on alignment with fiscal year pattern
if fiscal_year_end_month is not None and fiscal_year_end_day is not None:
for period in candidate_annual_periods:
try:
# Check how closely the end date aligns with fiscal year end
end_date = datetime.strptime(period['end_date'], '%Y-%m-%d').date()
# Perfect match: Same month and day as fiscal year end
if end_date.month == fiscal_year_end_month and end_date.day == fiscal_year_end_day:
period['fiscal_alignment_score'] = 100
# Strong match: Same month and within 15 days
elif end_date.month == fiscal_year_end_month and abs(end_date.day - fiscal_year_end_day) <= 15:
period['fiscal_alignment_score'] = 75
# Moderate match: Month before/after and close to the day
elif abs(end_date.month - fiscal_year_end_month) <= 1 and abs(end_date.day - fiscal_year_end_day) <= 15:
period['fiscal_alignment_score'] = 50
except (ValueError, TypeError):
continue
# Sort periods by fiscal alignment (higher score first) and then by recency (end date)
annual_periods = sorted(
candidate_annual_periods,
key=lambda x: (x['fiscal_alignment_score'], x['end_date']),
reverse=True # Highest score and most recent first
)
if annual_periods:
# Take up to 3 best matching annual periods (prioritizing fiscal year alignment)
for period in annual_periods[:3]:
periods_to_display.append((period['key'], period['label']))
return periods_to_display
# For quarterly reports, apply intelligent period selection
else:
# First, categorize periods by duration to identify meaningful financial periods
quarterly_periods = [] # 85-95 days (one quarter)
ytd_periods = [] # 175-185 days (two quarters), 265-275 days (three quarters)
annual_periods = [] # 350-380 days (full year for comparisons)
current_year = None
if document_period_end_date:
try:
current_year = datetime.strptime(document_period_end_date, '%Y-%m-%d').year
except (ValueError, TypeError):
pass
# Categorize all duration periods by their length
# ENHANCED: More strict duration checking to avoid misclassification
for period in duration_periods:
try:
start_date = datetime.strptime(period['start_date'], '%Y-%m-%d').date()
end_date = datetime.strptime(period['end_date'], '%Y-%m-%d').date()
days = (end_date - start_date).days
# Skip single-day or very short periods (less than 30 days)
if days < 30:
continue
# Categorize by duration with stricter checks
if 80 <= days <= 100: # Quarterly period (~90 days), slightly wider range
period['period_type'] = 'quarterly'
period['days'] = days
quarterly_periods.append(period)
elif 170 <= days <= 190: # Semi-annual/YTD for Q2 (~180 days)
period['period_type'] = 'semi-annual'
period['days'] = days
ytd_periods.append(period)
elif 260 <= days <= 280: # YTD for Q3 (~270 days)
period['period_type'] = 'three-quarters'
period['days'] = days
ytd_periods.append(period)
elif days > 300: # Annual period for comparisons (strict check)
period['period_type'] = 'annual'
period['days'] = days
annual_periods.append(period)
except (ValueError, TypeError):
continue
# Build the optimal set of periods for quarterly reporting
selected_periods = []
# 1. Add the most recent quarterly period (current quarter)
if quarterly_periods:
# Find the most recent quarterly period
recent_quarterly = quarterly_periods[0] # Already sorted by end date
selected_periods.append(recent_quarterly)
# Try to find the same quarter from previous year for comparison
if current_year:
for qp in quarterly_periods[1:]:
try:
qp_end = datetime.strptime(qp['end_date'], '%Y-%m-%d').date()
recent_end = datetime.strptime(recent_quarterly['end_date'], '%Y-%m-%d').date()
# Same quarter, previous year (within 15 days tolerance)
if (qp_end.year == current_year - 1 and
qp_end.month == recent_end.month and
abs(qp_end.day - recent_end.day) <= 15):
selected_periods.append(qp)
break
except (ValueError, TypeError):
continue
# 2. Add the most recent YTD period if available
if ytd_periods:
# Find the YTD period that ends closest to the document period end
selected_periods.append(ytd_periods[0])
# 3. If we don't have enough periods yet, add more quarterly periods
if len(selected_periods) < 3:
for period in quarterly_periods:
if period not in selected_periods and len(selected_periods) < 3:
selected_periods.append(period)
# 4. If still not enough, consider annual periods for year-over-year comparison
if len(selected_periods) < 3 and annual_periods:
for period in annual_periods:
if len(selected_periods) < 3:
selected_periods.append(period)
# Convert selected periods to display format
for period in selected_periods[:3]: # Limit to 3 periods
periods_to_display.append((period['key'], period['label']))
# For other statement types (not covered by specific logic above)
else:
# Get configuration for this statement type, or use defaults
statement_info = STATEMENT_TYPE_CONFIG.get(statement_type, {})
if not statement_info:
# For unknown statement types, use heuristics based on available periods
# For unknown statement types, determine preferences based on fiscal period
if fiscal_period_focus == 'FY':
# For annual reports, prefer duration periods and show comparisons
statement_info = {
'period_type': 'duration',
'max_periods': 3,
'allow_annual_comparison': True
}
else:
# For interim reports, accept either type but limit to current period
statement_info = {
'period_type': 'either',
'max_periods': 1,
'allow_annual_comparison': False
}
# Select periods based on determined preferences
period_type = statement_info.get('period_type', 'either')
max_periods = statement_info.get('max_periods', 1)
if period_type == 'instant' or period_type == 'either':
instant_periods = filter_periods_by_type(all_periods, 'instant')
instant_periods = filter_periods_by_document_end_date(instant_periods, document_period_end_date, 'instant')
instant_periods = sort_periods(instant_periods, 'instant')
if instant_periods:
for period in instant_periods[:max_periods]:
periods_to_display.append((period['key'], period['label']))
if (period_type == 'duration' or (period_type == 'either' and not periods_to_display)):
duration_periods = filter_periods_by_type(all_periods, 'duration')
duration_periods = filter_periods_by_document_end_date(duration_periods, document_period_end_date, 'duration')
duration_periods = sort_periods(duration_periods, 'duration')
if duration_periods:
for period in duration_periods[:max_periods]:
periods_to_display.append((period['key'], period['label']))
return periods_to_display

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,44 @@
# XBRL2 Standardization
This package provides functionality for standardizing XBRL concepts across different company filings.
## Overview
The standardization module maps company-specific XBRL concepts to standardized concept names,
enabling consistent presentation of financial statements regardless of the filing entity.
This is particularly useful for:
- Comparing financial data across different companies
- Building standardized reports and visualizations
- Creating consistent financial datasets for analysis
## Components
- `StandardConcept`: An enumeration of standard financial statement concepts
- `MappingStore`: Storage for mappings between company-specific and standard concepts
- `ConceptMapper`: Maps company-specific concepts to standard concepts using various techniques
- `standardize_statement`: Function to standardize a statement's labels
## Usage
```python
from edgar.xbrl.standardization import (
    StandardConcept,
    initialize_default_mappings,
    ConceptMapper,
    standardize_statement,
)
# Get the default mappings
store = initialize_default_mappings()
# Create a mapper
mapper = ConceptMapper(store)
# Standardize a statement
standardized_data = standardize_statement(statement_data, mapper)
```
## Concept Mappings
The standardized concept mappings are stored in the `concept_mappings.json` file included
in this package. This file maps standard concept names to lists of company-specific concept IDs.
The file is automatically loaded when initializing the `MappingStore` and can be extended
with new mappings as needed.

View File

@@ -0,0 +1,17 @@
"""
XBRL concept standardization package.
This package provides functionality to map company-specific XBRL concepts
to standardized concept names, enabling consistent presentation of financial
statements regardless of the filing entity.
"""
from edgar.xbrl.standardization.core import ConceptMapper, MappingStore, StandardConcept, initialize_default_mappings, standardize_statement
__all__ = [
'StandardConcept',
'MappingStore',
'ConceptMapper',
'standardize_statement',
'initialize_default_mappings'
]

View File

@@ -0,0 +1,21 @@
{
"concept_mappings": {
"Sales and Service Revenue": [
"brka_SalesAndServiceRevenue"
]
},
"hierarchy_rules": {
"Revenue": {
"components": [
"Sales and Service Revenue",
"Operating Lease Revenue"
],
"description": "Total revenue comprises sales/service revenue and operating lease income for holding company"
}
},
"business_context": {
"entity_type": "holding_company",
"industry": "diversified_conglomerate",
"description": "Berkshire Hathaway operates diverse businesses including insurance, utilities, railroads, and manufacturing"
}
}

View File

@@ -0,0 +1,64 @@
{
"entity_info": {
"name": "Microsoft Corporation",
"cik": "0000789019",
"ticker": "MSFT",
"description": "Microsoft-specific concept mappings for unique business terminology"
},
"concept_mappings": {
"_comment_msft_revenue": "Microsoft uses specific revenue categorization that differs from standard tech companies",
"Product Revenue": [
"msft_ProductRevenue",
"msft_WindowsCommercialRevenue",
"msft_WindowsConsumerRevenue",
"msft_OfficeCommercialRevenue"
],
"Service Revenue": [
"msft_ServiceRevenue",
"msft_CloudServicesRevenue",
"msft_ConsultingServicesRevenue"
],
"Subscription Revenue": [
"msft_Office365CommercialRevenue",
"msft_Office365ConsumerRevenue",
"msft_DynamicsRevenue"
],
"Platform Revenue": [
"msft_AzureRevenue",
"msft_XboxContentAndServicesRevenue"
],
"_comment_msft_expenses": "Microsoft has unique expense categorizations for sales and marketing vs G&A",
"Sales and Marketing Expense": [
"msft_SalesAndMarketingExpense",
"msft_AdvertisingAndPromotionExpense"
],
"Technical Support Expense": [
"msft_TechnicalSupportExpense",
"msft_CustomerSupportExpense"
]
},
"hierarchy_rules": {
"_comment": "Rules for handling Microsoft-specific hierarchical relationships",
"revenue_hierarchy": {
"parent": "Revenue",
"children": ["Product Revenue", "Service Revenue", "Subscription Revenue", "Platform Revenue"],
"calculation_rule": "sum"
},
"expense_hierarchy": {
"parent": "Operating Expenses",
"children": ["Sales and Marketing Expense", "Technical Support Expense"],
"calculation_rule": "sum"
}
}
}

View File

@@ -0,0 +1,54 @@
{
"metadata": {
"entity_identifier": "tsla",
"company_name": "Tesla, Inc.",
"cik": "1318605",
"priority": "high",
"created_date": "2024-06-25",
"last_updated": "2024-06-25",
"description": "Tesla-specific concept mappings to handle automotive, energy, and service revenue streams"
},
"concept_mappings": {
"Automotive Revenue": [
"tsla_AutomotiveRevenue",
"tsla_AutomotiveSales",
"tsla_VehicleRevenue"
],
"Automotive Leasing Revenue": [
"tsla_AutomotiveLeasing",
"tsla_AutomotiveLeasingRevenue",
"tsla_VehicleLeasingRevenue"
],
"Energy Revenue": [
"tsla_EnergyGenerationAndStorageRevenue",
"tsla_EnergyRevenue",
"tsla_SolarRevenue",
"tsla_EnergyStorageRevenue"
],
"Service Revenue": [
"tsla_ServicesAndOtherRevenue",
"tsla_ServiceRevenue",
"tsla_SuperchargerRevenue"
]
},
"hierarchy_rules": {
"Revenue": {
"children": [
"Automotive Revenue",
"Energy Revenue",
"Service Revenue"
]
},
"Automotive Revenue": {
"children": [
"Automotive Leasing Revenue"
]
}
},
"business_context": {
"primary_revenue_streams": ["automotive", "energy", "services"],
"revenue_model": "product_and_service",
"key_metrics": ["vehicle_deliveries", "energy_deployments"],
"industry": "automotive_technology"
}
}

View File

@@ -0,0 +1,353 @@
{
"_comment_revenue_hierarchy": "REVENUE HIERARCHY FIX: Separated total revenue from component revenue types to prevent duplicate labels. Contract and product revenue are components that should have distinct labels from total revenue.",
"Revenue": [
"us-gaap_Revenue",
"us-gaap_Revenues",
"us-gaap_SalesRevenueNet",
"us-gaap_OperatingRevenue"
],
"Contract Revenue": [
"us-gaap_RevenueFromContractWithCustomerExcludingAssessedTax",
"us-gaap_RevenueFromContractWithCustomerIncludingAssessedTax"
],
"Product Revenue": [
"us-gaap_SalesRevenueGoodsNet",
"us-gaap_ProductSales"
],
"Operating Lease Revenue": [
"us-gaap_OperatingLeaseLeaseIncome"
],
"_comment_cost_of_revenue_hierarchy": "COST OF REVENUE HIERARCHY FIX: Separated different cost types to prevent duplicate labels. Different business models (manufacturing, service, mixed) use different cost concepts that should have distinct labels for clarity.",
"Cost of Revenue": [
"us-gaap_CostOfRevenueAbstract"
],
"Total Cost of Revenue": [
"us-gaap_CostOfRevenue"
],
"Cost of Goods Sold": [
"us-gaap_CostOfGoodsSold"
],
"Cost of Goods and Services Sold": [
"us-gaap_CostOfGoodsAndServicesSold"
],
"Cost of Sales": [
"us-gaap_CostOfSales"
],
"Cost of Goods and Services Excluding Depreciation": [
"us-gaap_CostOfGoodsAndServiceExcludingDepreciationDepletionAndAmortization"
],
"Direct Operating Costs": [
"us-gaap_DirectOperatingCosts"
],
"Costs and Expenses": [
"us-gaap_CostsAndExpenses"
],
"Gross Profit": [
"us-gaap_GrossProfit"
],
"Operating Expenses": [
"us-gaap_NoninterestExpense",
"us-gaap_OperatingCostsAndExpenses",
"us-gaap_OperatingExpenses"
],
"Research and Development Expense": [
"us-gaap_ResearchAndDevelopmentCosts",
"us-gaap_ResearchAndDevelopmentExpense"
],
"_comment_sga_hierarchy": "SG&A HIERARCHY FIX: Separated total SG&A from components to prevent duplicate labels. Previously all three concepts below mapped to 'Selling, General and Administrative Expense' causing confusion when companies report both total and components.",
"Selling, General and Administrative Expense": [
"us-gaap_SellingGeneralAndAdministrativeExpense"
],
"General and Administrative Expense": [
"us-gaap_GeneralAndAdministrativeExpense",
"us-gaap_AdministrativeExpense"
],
"Selling Expense": [
"us-gaap_SellingAndMarketingExpense",
"us-gaap_SellingExpense"
],
"Marketing Expense": [
"us-gaap_MarketingExpense",
"us-gaap_AdvertisingExpense"
],
"Operating Income": [
"us-gaap_OperatingIncomeLoss",
"us-gaap_OperatingIncome",
"us-gaap_IncomeLossFromContinuingOperationsBeforeInterestAndTaxes"
],
"Nonoperating Income/Expense": [
"orcl_NonoperatingIncomeExpenseIncludingEliminationOfNetIncomeLossAttributableToNoncontrollingInterests",
"us-gaap_NonoperatingIncomeExpense"
],
"Interest Expense": [
"us-gaap_InterestAndDebtExpense",
"us-gaap_InterestExpense",
"us-gaap_InterestIncomeExpenseNet"
],
"Interest Expense (operating)": [
"us-gaap_InterestExpenseOperating"
],
"Interest Expense (non-operating)": [
"us-gaap_InterestExpenseNonoperating"
],
"_comment_income_before_tax_hierarchy": "INCOME BEFORE TAX HIERARCHY FIX: Separated total income before tax from component types to prevent duplicate labels. Continuing operations and extraordinary items are components that should have distinct labels.",
"Income Before Tax": [
"us-gaap_IncomeLossBeforeIncomeTaxes"
],
"Income Before Tax from Continuing Operations": [
"us-gaap_IncomeLossFromContinuingOperationsBeforeIncomeTaxes",
"us-gaap_IncomeLossFromContinuingOperationsBeforeIncomeTaxesExtraordinaryItemsNoncontrollingInterest",
"orcl_IncomeLossFromContinuingOperationsIncludingNoncontrollingInterestBeforeIncomeTaxesExtraordinaryItems"
],
"Income Tax Expense": [
"us-gaap_IncomeTaxesPaidNet",
"us-gaap_IncomeTaxExpenseBenefit"
],
"_comment_net_income_hierarchy": "NET INCOME HIERARCHY FIX: Separated total net income from component income types to prevent duplicate labels. Continuing operations income and profit/loss are components that should have distinct labels from total net income.",
"Net Income": [
"us-gaap_NetIncome",
"us-gaap_NetIncomeLoss"
],
"Net Income from Continuing Operations": [
"us-gaap_IncomeLossFromContinuingOperationsIncludingPortionAttributableToNoncontrollingInterest",
"us-gaap_IncomeLossFromContinuingOperations"
],
"Profit or Loss": [
"us-gaap_ProfitLoss"
],
"Net Income Attributable to Noncontrolling Interest": [
"us-gaap_NetIncomeLossAttributableToNonredeemableNoncontrollingInterest",
"us-gaap_NetIncomeLossAttributableToNoncontrollingInterest"
],
"Basic Net Income Available to Common Shareholders": [
"us-gaap_NetIncomeLossAvailableToCommonStockholdersBasic"
],
"Diluted Net Income Available to Common Shareholders": [
"us-gaap_NetIncomeLossAvailableToCommonStockholdersDiluted"
],
"Accumulated Other Comprehensive Income/Loss": [
"us-gaap_AccumulatedOtherComprehensiveIncomeLossNetOfTax"
],
"Earnings Per Share": [
"us-gaap_EarningsPerShareAbstract"
],
"Earnings Per Share (Basic)": [
"us-gaap_EarningsPerShareBasic"
],
"Earnings Per Share (Diluted)": [
"us-gaap_EarningsPerShareDiluted"
],
"Shares Outstanding": [
"us-gaap_WeightedAverageNumberOfSharesOutstandingAbstract"
],
"Shares Outstanding (Basic)": [
"us-gaap_WeightedAverageNumberOfSharesOutstandingBasic"
],
"Shares Outstanding (Diluted)": [
"us-gaap_WeightedAverageNumberOfDilutedSharesOutstanding"
],
"Cash and Cash Equivalents": [
"us-gaap_CashEquivalentsAtCarryingValue",
"us-gaap_Cash",
"us-gaap_CashAndCashEquivalentsAtCarryingValue",
"us-gaap_CashCashEquivalentsAndShortTermInvestments"
],
"Accounts Receivable": [
"us-gaap_AccountsReceivableNet",
"us-gaap_ReceivablesNetCurrent",
"us-gaap_AccountsReceivableNetCurrent",
"us-gaap_AccountsReceivableGross"
],
"Inventory": [
"us-gaap_InventoryGross",
"us-gaap_InventoryFinishedGoods",
"us-gaap_InventoryNet"
],
"Prepaid Expenses": [
"us-gaap_PrepaidExpenseAndOtherAssetsCurrent",
"us-gaap_PrepaidExpenseCurrent"
],
"Current Marketable Securities": [
"us-gaap_AvailableForSaleSecuritiesDebtSecuritiesCurrent",
"us-gaap_MarketableSecuritiesCurrent"
],
"Non Current Marketable Securities": [
"us-gaap_MarketableSecuritiesNoncurrent"
],
"Total Current Assets": [
"us-gaap_AssetsCurrent"
],
"Total Non Current Assets": [
"us-gaap_AssetsNoncurrent"
],
"Property, Plant and Equipment": [
"us-gaap_PropertyPlantAndEquipmentGross",
"us-gaap_PropertyPlantAndEquipmentNet",
"us-gaap_FixedAssets"
],
"Goodwill": [
"us-gaap_Goodwill"
],
"Intangible Assets": [
"us-gaap_IntangibleAssetsNetIncludingGoodwill",
"us-gaap_IntangibleAssetsNetExcludingGoodwill",
"us-gaap_FiniteLivedIntangibleAssetsNet"
],
"Total Assets": [
"us-gaap_Assets",
"us-gaap_AssetsTotal"
],
"Long-Term Investments": [
"us-gaap_LongTermInvestments"
],
"Accounts Payable": [
"us-gaap_AccountsPayableCurrent",
"us-gaap_AccountsPayableTradeCurrent"
],
"Accrued Liabilities": [
"us-gaap_OtherAccruedLiabilitiesCurrent",
"us-gaap_AccruedLiabilitiesCurrent",
"us-gaap_EmployeeRelatedLiabilitiesCurrent"
],
"Short Term Debt": [
"us-gaap_DebtCurrent",
"us-gaap_ShortTermBorrowings",
"us-gaap_LongTermDebtCurrent"
],
"Total Current Liabilities": [
"us-gaap_LiabilitiesCurrent"
],
"Total Non Current Liabilities": [
"us-gaap_LiabilitiesNoncurrent"
],
"Long Term Debt": [
"us-gaap_LongTermDebtAndCapitalLeaseObligations",
"us-gaap_LongTermDebt",
"us-gaap_LongTermBorrowings",
"us-gaap_LongTermDebtNoncurrent"
],
"Notes Payable, Current": [
"us-gaap_NotesPayableCurrent"
],
"Notes Payable, Non Current": [
"us-gaap_LongTermNotesAndLoans"
],
"Deferred Revenue": [
"us-gaap_DeferredRevenueNoncurrent",
"us-gaap_DeferredRevenueCurrent",
"us-gaap_DeferredRevenue"
],
"Total Liabilities": [
"us-gaap_LiabilitiesTotal",
"us-gaap_Liabilities"
],
"Common Stock Shares Outstanding": [
"us-gaap_CommonStockSharesOutstanding"
],
"Common Stock Shares Issued": [
"us-gaap_CommonStockSharesIssued"
],
"Common Stock": [
"us-gaap_CommonStocksIncludingAdditionalPaidInCapital",
"us-gaap_StockholdersEquityCommonStock",
"us-gaap_CommonStockValue"
],
"Preferred Stock": [
"us-gaap_PreferredStockValue"
],
"Treasury Stock Common Value": [
"us-gaap_TreasuryStockCommonValue",
"us-gaap_TreasuryStockValue"
],
"Retained Earnings": [
"us-gaap_RetainedEarnings",
"us-gaap_RetainedEarningsAccumulatedDeficit"
],
"Minority Interest": [
"us-gaap_MinorityInterest",
"us-gaap_NoncontrollingInterest"
],
"Total Stockholders' Equity": [
"us-gaap_EquityAttributableToParent",
"us-gaap_StockholdersEquity",
"us-gaap_StockholdersEquityIncludingPortionAttributableToNoncontrollingInterest",
"us-gaap_StockholdersEquityAttributableToParent"
],
"Total Liabilities and Stockholders' Equity": [
"us-gaap_LiabilitiesAndStockholdersEquity"
],
"Net Cash from Operating Activities": [
"us-gaap_NetCashProvidedByUsedInOperatingActivities",
"us-gaap_NetCashProvidedByUsedInOperatingActivitiesContinuingOperations"
],
"Net Cash from Investing Activities": [
"us-gaap_NetCashProvidedByUsedInInvestingActivities",
"us-gaap_NetCashProvidedByUsedInInvestingActivitiesContinuingOperations"
],
"Net Cash from Financing Activities": [
"us-gaap_NetCashProvidedByUsedInFinancingActivitiesContinuingOperations",
"us-gaap_NetCashProvidedByUsedInFinancingActivities"
],
"Net Change in Cash": [
"us-gaap_IncreaseDecreaseInCashAndCashEquivalents",
"us-gaap_CashAndCashEquivalentsPeriodIncreaseDecrease",
"us-gaap_CashCashEquivalentsRestrictedCashAndRestrictedCashEquivalentsPeriodIncreaseDecreaseIncludingExchangeRateEffect"
],
"Payments for Property, Plant and Equipment": [
"us-gaap_PaymentsToAcquirePropertyPlantAndEquipment"
],
"Payments of Dividends": [
"us-gaap_PaymentsOfDividends"
],
"Tax Withholding for Share-Based Compensation": [
"us-gaap_PaymentsRelatedToTaxWithholdingForShareBasedCompensation"
],
"Payments to Acquire Businesses": [
"us-gaap_PaymentsToAcquireBusinessesNetOfCashAcquired"
],
"Proceeds from Issuance of Common Stock": [
"us-gaap_ProceedsFromIssuanceOfCommonStock"
],
"Proceeds from Issuance of Long-Term Debt": [
"us-gaap_ProceedsFromIssuanceOfLongTermDebt"
],
"Proceeds from Maturities, Prepayments and Calls of Securities": [
"us-gaap_ProceedsFromMaturitiesPrepaymentsAndCallsOfAvailableForSaleSecurities"
],
"Proceeds from Sale and Maturity of Other Investments": [
"us-gaap_ProceedsFromSaleAndMaturityOfOtherInvestments"
],
  "Proceeds from Sale of Debt Securities": [
"us-gaap_ProceedsFromSaleOfAvailableForSaleSecuritiesDebt"
],
"Proceeds from (Repayments of) Commercial Paper": [
"us-gaap_ProceedsFromRepaymentsOfCommercialPaper"
],
"Other Assets": [
"us-gaap_OtherAssets"
],
"Other Current Assets": [
"us-gaap_OtherAssetsCurrent"
],
"Other Non Current Assets": [
"us-gaap_OtherAssetsNoncurrent"
],
"Deferred Tax Assets": [
"us-gaap_DeferredIncomeTaxAssetsNet"
],
"Other Liabilities": [
"us-gaap_OtherLiabilities"
],
"Other Current Liabilities": [
"us-gaap_OtherLiabilitiesCurrent"
],
"Other Non Current Liabilities": [
"us-gaap_OtherLiabilitiesNoncurrent"
],
"Depreciation and Amortization": [
"us-gaap_AmortizationOfIntangibleAssets",
"us-gaap_Depreciation",
"us-gaap_DepreciationAndAmortization"
]
}

View File

@@ -0,0 +1,817 @@
"""
Module for standardizing XBRL concepts across different company filings.
This module provides functionality to map company-specific XBRL concepts
to standardized concept names, enabling consistent presentation of financial
statements regardless of the filing entity.
"""
import json
import os
from difflib import SequenceMatcher
from enum import Enum
from typing import Any, Dict, List, Optional, Set, Tuple
import pandas as pd
class StandardConcept(str, Enum):
    """
    Standardized concept names for financial statements.

    The enum value (string) is the display label used for presentation.
    These labels should match keys in concept_mappings.json.
    """
    # Balance Sheet - Assets
    CASH_AND_EQUIVALENTS = "Cash and Cash Equivalents"
    ACCOUNTS_RECEIVABLE = "Accounts Receivable"
    INVENTORY = "Inventory"
    PREPAID_EXPENSES = "Prepaid Expenses"
    TOTAL_CURRENT_ASSETS = "Total Current Assets"
    PROPERTY_PLANT_EQUIPMENT = "Property, Plant and Equipment"
    GOODWILL = "Goodwill"
    INTANGIBLE_ASSETS = "Intangible Assets"
    TOTAL_ASSETS = "Total Assets"
    # Balance Sheet - Liabilities
    ACCOUNTS_PAYABLE = "Accounts Payable"
    ACCRUED_LIABILITIES = "Accrued Liabilities"
    SHORT_TERM_DEBT = "Short Term Debt"
    TOTAL_CURRENT_LIABILITIES = "Total Current Liabilities"
    LONG_TERM_DEBT = "Long Term Debt"
    DEFERRED_REVENUE = "Deferred Revenue"
    TOTAL_LIABILITIES = "Total Liabilities"
    # Balance Sheet - Equity
    COMMON_STOCK = "Common Stock"
    RETAINED_EARNINGS = "Retained Earnings"
    TOTAL_EQUITY = "Total Stockholders' Equity"
    # Income Statement - Revenue Hierarchy
    REVENUE = "Revenue"
    CONTRACT_REVENUE = "Contract Revenue"
    PRODUCT_REVENUE = "Product Revenue"
    SERVICE_REVENUE = "Service Revenue"
    SUBSCRIPTION_REVENUE = "Subscription Revenue"
    LEASING_REVENUE = "Leasing Revenue"
    # Industry-Specific Revenue Concepts
    AUTOMOTIVE_REVENUE = "Automotive Revenue"
    AUTOMOTIVE_LEASING_REVENUE = "Automotive Leasing Revenue"
    ENERGY_REVENUE = "Energy Revenue"
    SOFTWARE_REVENUE = "Software Revenue"
    HARDWARE_REVENUE = "Hardware Revenue"
    PLATFORM_REVENUE = "Platform Revenue"
    # Income Statement - Expenses
    COST_OF_REVENUE = "Cost of Revenue"
    COST_OF_GOODS_SOLD = "Cost of Goods Sold"
    COST_OF_GOODS_AND_SERVICES_SOLD = "Cost of Goods and Services Sold"
    COST_OF_SALES = "Cost of Sales"
    COSTS_AND_EXPENSES = "Costs and Expenses"
    DIRECT_OPERATING_COSTS = "Direct Operating Costs"
    GROSS_PROFIT = "Gross Profit"
    OPERATING_EXPENSES = "Operating Expenses"
    RESEARCH_AND_DEVELOPMENT = "Research and Development Expense"
    # Enhanced Expense Hierarchy
    SELLING_GENERAL_ADMIN = "Selling, General and Administrative Expense"
    SELLING_EXPENSE = "Selling Expense"
    GENERAL_ADMIN_EXPENSE = "General and Administrative Expense"
    MARKETING_EXPENSE = "Marketing Expense"
    SALES_EXPENSE = "Sales Expense"
    # Other Income Statement
    OPERATING_INCOME = "Operating Income"
    INTEREST_EXPENSE = "Interest Expense"
    INCOME_BEFORE_TAX = "Income Before Tax"
    INCOME_BEFORE_TAX_CONTINUING_OPS = "Income Before Tax from Continuing Operations"
    INCOME_TAX_EXPENSE = "Income Tax Expense"
    NET_INCOME = "Net Income"
    NET_INCOME_CONTINUING_OPS = "Net Income from Continuing Operations"
    NET_INCOME_NONCONTROLLING = "Net Income Attributable to Noncontrolling Interest"
    PROFIT_OR_LOSS = "Profit or Loss"
    # Cash Flow Statement
    CASH_FROM_OPERATIONS = "Net Cash from Operating Activities"
    CASH_FROM_INVESTING = "Net Cash from Investing Activities"
    CASH_FROM_FINANCING = "Net Cash from Financing Activities"
    NET_CHANGE_IN_CASH = "Net Change in Cash"

    @classmethod
    def get_from_label(cls, label: str) -> Optional['StandardConcept']:
        """
        Get a StandardConcept enum member by its label value.

        Args:
            label: The label string to look up

        Returns:
            The corresponding StandardConcept, or None if no member has that label
        """
        try:
            # Enum value lookup uses the enum's internal value map (O(1)),
            # rather than scanning every member.
            return cls(label)
        except ValueError:
            return None

    @classmethod
    def get_all_values(cls) -> Set[str]:
        """
        Get all label values defined in the enum.

        Returns:
            Set of all label strings
        """
        return {concept.value for concept in cls}
class MappingStore:
"""
Storage for mappings between company-specific concepts and standard concepts.
Attributes:
source (str): Path to the JSON file storing the mappings
mappings (Dict[str, Set[str]]): Dictionary mapping standard concepts to sets of company concepts
company_mappings (Dict[str, Dict]): Company-specific mappings loaded from company_mappings/
merged_mappings (Dict[str, List[Tuple]]): Merged mappings with priority scoring
"""
    def __init__(self, source: Optional[str] = None, validate_with_enum: bool = False, read_only: bool = False):
        """
        Initialize the mapping store.

        Resolves the location of concept_mappings.json (development checkout or
        installed package), loads the core mappings, layers in any company-specific
        mapping files, and pre-computes the merged/priority view.

        Args:
            source: Path to the JSON file storing the mappings. If None, uses default location.
            validate_with_enum: Whether to validate JSON keys against StandardConcept enum
            read_only: If True, never save changes back to the file (used in testing)
        """
        self.read_only = read_only
        if source is None:
            # Try a few different ways to locate the file, handling both development
            # and installed package scenarios
            self.source = None
            # Default to a file in the same directory as this module (development mode)
            module_dir = os.path.dirname(os.path.abspath(__file__))
            potential_path = os.path.join(module_dir, "concept_mappings.json")
            if os.path.exists(potential_path):
                self.source = potential_path
            # If not found, try to load from package data (installed package)
            if self.source is None:
                try:
                    import importlib.resources as pkg_resources
                    try:
                        # For Python 3.9+
                        with pkg_resources.files('edgar.xbrl.standardization').joinpath('concept_mappings.json').open('r') as f:
                            # Just read the file to see if it exists, we'll load it properly later
                            f.read(1)
                        self.source = potential_path  # Use the same path as before
                    except (ImportError, FileNotFoundError, AttributeError):
                        # Fallback for older Python versions
                        try:
                            import pkg_resources as legacy_resources
                            if legacy_resources.resource_exists('edgar.xbrl.standardization', 'concept_mappings.json'):
                                self.source = potential_path  # Use the same path as before
                        except (ImportError, FileNotFoundError):
                            pass
                except ImportError:
                    pass
            # If we still haven't found the file, use the default path anyway
            # (it will fail gracefully in _load_mappings)
            if self.source is None:
                self.source = potential_path
        else:
            self.source = source
        # Core mappings first; company overlays depend on self.source being set.
        self.mappings = self._load_mappings()
        # Load company-specific mappings (always enabled)
        self.company_mappings = self._load_all_company_mappings()
        # Pre-compute the priority-scored merge of core + company mappings.
        self.merged_mappings = self._create_merged_mappings()
        self.hierarchy_rules = self._load_hierarchy_rules()
        # Validate the loaded mappings against StandardConcept enum
        # (logs discrepancies; does not raise).
        if validate_with_enum:
            self.validate_against_enum()
def validate_against_enum(self) -> Tuple[bool, List[str]]:
"""
Validate that all keys in the mappings exist in StandardConcept enum.
Returns:
Tuple of (is_valid, list_of_missing_keys)
"""
standard_values = StandardConcept.get_all_values()
json_keys = set(self.mappings.keys())
# Find keys in JSON that aren't in enum
missing_in_enum = json_keys - standard_values
# Find enum values not in JSON (just for information)
missing_in_json = standard_values - json_keys
import logging
logger = logging.getLogger(__name__)
if missing_in_enum:
logger.warning("Found %d keys in concept_mappings.json that don't exist in StandardConcept enum: %s", len(missing_in_enum), sorted(missing_in_enum))
if missing_in_json:
logger.info("Found %d StandardConcept values without mappings in concept_mappings.json: %s", len(missing_in_json), sorted(missing_in_json))
return len(missing_in_enum) == 0, list(missing_in_enum)
def to_dataframe(self) -> pd.DataFrame:
"""
Convert mappings to a pandas DataFrame for analysis and visualization.
Returns:
DataFrame with columns for standard_concept and company_concept
"""
try:
import pandas as pd
except ImportError:
raise ImportError("pandas is required for to_dataframe() but is not installed") from None
rows = []
for standard_concept, company_concepts in self.mappings.items():
for company_concept in company_concepts:
rows.append({
'standard_concept': standard_concept,
'company_concept': company_concept
})
return pd.DataFrame(rows)
def _load_all_company_mappings(self) -> Dict[str, Dict]:
"""Load all company-specific mapping files from company_mappings/ directory."""
mappings = {}
company_dir = os.path.join(os.path.dirname(self.source or __file__), "company_mappings")
if os.path.exists(company_dir):
for file in os.listdir(company_dir):
if file.endswith("_mappings.json"):
entity_id = file.replace("_mappings.json", "")
try:
with open(os.path.join(company_dir, file), 'r') as f:
company_data = json.load(f)
mappings[entity_id] = company_data
except (FileNotFoundError, json.JSONDecodeError) as e:
import logging
logger = logging.getLogger(__name__)
logger.warning("Failed to load %s: %s", file, e)
return mappings
def _create_merged_mappings(self) -> Dict[str, List[Tuple[str, str, int]]]:
"""Create merged mappings with priority scoring.
Priority levels:
1. Core mappings (lowest)
2. Company mappings (higher)
3. Company-specific matches (highest when company detected)
Returns:
Dict mapping standard concepts to list of (company_concept, source, priority) tuples
"""
merged = {}
# Add core mappings (priority 1 - lowest)
for std_concept, company_concepts in self.mappings.items():
merged[std_concept] = []
for concept in company_concepts:
merged[std_concept].append((concept, "core", 1))
# Add company mappings (priority 2 - higher)
for entity_id, company_data in self.company_mappings.items():
concept_mappings = company_data.get("concept_mappings", {})
priority_level = 2
for std_concept, company_concepts in concept_mappings.items():
if std_concept not in merged:
merged[std_concept] = []
for concept in company_concepts:
merged[std_concept].append((concept, entity_id, priority_level))
return merged
def _load_hierarchy_rules(self) -> Dict[str, Dict]:
"""Load hierarchy rules from company mappings."""
all_rules = {}
# Add company hierarchy rules
for _entity_id, company_data in self.company_mappings.items():
hierarchy_rules = company_data.get("hierarchy_rules", {})
all_rules.update(hierarchy_rules)
return all_rules
def _detect_entity_from_concept(self, concept: str) -> Optional[str]:
"""Detect entity identifier from concept name prefix."""
if '_' in concept:
prefix = concept.split('_')[0].lower()
# Check if this prefix corresponds to a known company
if prefix in self.company_mappings:
return prefix
return None
    def _load_mappings(self) -> Dict[str, Set[str]]:
        """
        Load mappings from the JSON file.

        Tries, in order: direct read of self.source, importlib.resources for an
        installed package, then legacy pkg_resources. On total failure a warning
        is logged and empty mappings are returned.

        Returns:
            Dictionary mapping standard concepts to sets of company concepts
        """
        data = None
        # First try direct file access
        try:
            with open(self.source, 'r') as f:
                data = json.load(f)
        except (FileNotFoundError, IOError, PermissionError):
            # If direct file access fails, try package resources
            try:
                try:
                    # Modern importlib.resources approach (Python 3.9+)
                    import importlib.resources as pkg_resources
                    try:
                        # For Python 3.9+
                        with pkg_resources.files('edgar.xbrl.standardization').joinpath('concept_mappings.json').open('r') as f:
                            data = json.load(f)
                    except (ImportError, FileNotFoundError, AttributeError):
                        # Fallback to legacy pkg_resources
                        import pkg_resources as legacy_resources
                        resource_string = legacy_resources.resource_string('edgar.xbrl.standardization', 'concept_mappings.json')
                        data = json.loads(resource_string)
                except ImportError:
                    pass
            except Exception:
                # If all attempts fail, log a warning (best-effort load; do not raise)
                import logging
                logger = logging.getLogger(__name__)
                logger.warning("Could not load concept_mappings.json. Standardization will be limited.")
        # If we have data, process it based on its structure
        if data:
            # Check if the structure is flat or nested (any dict value implies
            # concepts are grouped under statement-type sections)
            if any(isinstance(value, dict) for value in data.values()):
                # Nested structure by statement type: flatten to one level,
                # discarding the statement-type grouping
                flattened = {}
                for _statement_type, concepts in data.items():
                    for standard_concept, company_concepts in concepts.items():
                        flattened[standard_concept] = set(company_concepts)
                return flattened
            else:
                # Flat structure: standard concept -> list of company concepts
                return {k: set(v) for k, v in data.items()}
        # If all methods fail, return empty mappings
        # The initialize_default_mappings function will create a file if needed
        return {}
def _save_mappings(self) -> None:
"""Save mappings to the JSON file, unless in read_only mode."""
# Skip saving if in read_only mode
if self.read_only:
return
# Ensure directory exists
directory = os.path.dirname(self.source)
if directory and not os.path.exists(directory):
os.makedirs(directory, exist_ok=True)
# Convert sets to lists for JSON serialization
serializable_mappings = {k: list(v) for k, v in self.mappings.items()}
with open(self.source, 'w') as f:
json.dump(serializable_mappings, f, indent=2)
def add(self, company_concept: str, standard_concept: str) -> None:
"""
Add a mapping from a company concept to a standard concept.
Args:
company_concept: The company-specific concept
standard_concept: The standard concept
"""
if standard_concept not in self.mappings:
self.mappings[standard_concept] = set()
self.mappings[standard_concept].add(company_concept)
self._save_mappings()
def get_standard_concept(self, company_concept: str, context: Dict = None) -> Optional[str]:
"""
Get the standard concept for a given company concept with priority-based resolution.
Args:
company_concept: The company-specific concept
context: Optional context information (not used in current implementation)
Returns:
The standard concept or None if not found
"""
# Use merged mappings with priority-based resolution
if self.merged_mappings:
# Detect company from concept prefix (e.g., 'tsla:Revenue' -> 'tsla')
detected_entity = self._detect_entity_from_concept(company_concept)
# Search through merged mappings with priority
candidates = []
for std_concept, mapping_list in self.merged_mappings.items():
for concept, source, priority in mapping_list:
if concept == company_concept:
# Boost priority if it matches detected entity
effective_priority = priority
if detected_entity and source == detected_entity:
effective_priority = 4 # Highest priority for exact company match
candidates.append((std_concept, effective_priority, source))
# Return highest priority match
if candidates:
best_match = max(candidates, key=lambda x: x[1])
import logging
logger = logging.getLogger(__name__)
logger.debug("Mapping applied: %s -> %s (source: %s, priority: %s)", company_concept, best_match[0], best_match[2], best_match[1])
return best_match[0]
# Fallback to core mappings
for standard_concept, company_concepts in self.mappings.items():
if company_concept in company_concepts:
return standard_concept
return None
def get_company_concepts(self, standard_concept: str) -> Set[str]:
"""
Get all company concepts mapped to a standard concept.
Args:
standard_concept: The standard concept
Returns:
Set of company concepts mapped to the standard concept
"""
return self.mappings.get(standard_concept, set())
class ConceptMapper:
    """
    Maps company-specific concepts to standard concepts using various techniques.

    Attributes:
        mapping_store (MappingStore): Storage for concept mappings
        pending_mappings (Dict): Low-confidence mappings pending review
        _cache (Dict): In-memory cache of mapped concepts
    """

    def __init__(self, mapping_store: MappingStore):
        """
        Initialize the concept mapper.

        Args:
            mapping_store: Storage for concept mappings
        """
        self.mapping_store = mapping_store
        self.pending_mappings = {}
        # Cache for faster lookups of previously mapped concepts
        self._cache = {}
        # Precompute lowercased standard concept values for faster comparison
        self._std_concept_values = [(concept, concept.value.lower()) for concept in StandardConcept]
        # Statement-specific keyword sets for faster contextual matching
        self._bs_keywords = {'assets', 'liabilities', 'equity', 'cash', 'debt', 'inventory', 'receivable', 'payable'}
        self._is_keywords = {'revenue', 'sales', 'income', 'expense', 'profit', 'loss', 'tax', 'earnings'}
        self._cf_keywords = {'cash', 'operating', 'investing', 'financing', 'activities'}

    def map_concept(self, company_concept: str, label: str, context: Dict[str, Any]) -> Optional[str]:
        """
        Map a company concept to a standard concept.

        Only consults the mapping store (through an in-memory cache); no
        inference is performed here. Negative results are cached too so
        repeated lookups of unmapped concepts stay cheap.

        Args:
            company_concept: The company-specific concept
            label: The label for the concept (currently unused; kept for API compatibility)
            context: Additional context information (statement type, calculation relationships, etc.)

        Returns:
            The standard concept or None if no mapping found
        """
        # Use cache for faster lookups
        cache_key = (company_concept, context.get('statement_type', ''))
        if cache_key in self._cache:
            return self._cache[cache_key]
        # Check if we already have a mapping in the store
        standard_concept = self.mapping_store.get_standard_concept(company_concept)
        if standard_concept:
            self._cache[cache_key] = standard_concept
            return standard_concept
        # Cache negative results too to avoid repeated inference
        self._cache[cache_key] = None
        return None

    def _infer_mapping(self, company_concept: str, label: str, context: Dict[str, Any]) -> Tuple[Optional[str], float]:
        """
        Infer a mapping between a company concept and a standard concept.

        Strategy: fast label heuristics first, then exact label matches,
        then SequenceMatcher similarity (optionally restricted to the
        current statement type's concepts) with contextual score boosts.

        Args:
            company_concept: The company-specific concept
            label: The label for the concept
            context: Additional context information

        Returns:
            Tuple of (standard_concept, confidence); (None, 0.0) when the
            best score falls below the 0.5 confidence threshold.
        """
        # Fast path for common patterns
        label_lower = label.lower()
        # Quick matching for common concepts without full sequence matching
        if "total assets" in label_lower:
            return StandardConcept.TOTAL_ASSETS.value, 0.95
        elif "revenue" in label_lower and len(label_lower) < 30:  # Only match short labels to avoid false positives
            return StandardConcept.REVENUE.value, 0.9
        elif "net income" in label_lower and "parent" not in label_lower:
            return StandardConcept.NET_INCOME.value, 0.9
        # Faster direct match checking with precomputed lowercase values
        for std_concept, std_value_lower in self._std_concept_values:
            if std_value_lower == label_lower:
                return std_concept.value, 1.0  # Perfect match
        # Fall back to sequence matching for similarity
        best_match = None
        best_score = 0
        statement_type = context.get("statement_type", "")
        # Statement type based filtering to reduce unnecessary comparisons:
        # only compare against concepts relevant to the current statement
        # when the label contains that statement's keywords.
        limited_concepts = []
        if statement_type == "BalanceSheet":
            if any(kw in label_lower for kw in self._bs_keywords):
                limited_concepts = [c for c, v in self._std_concept_values
                                    if any(kw in v for kw in self._bs_keywords)]
        elif statement_type == "IncomeStatement":
            if any(kw in label_lower for kw in self._is_keywords):
                limited_concepts = [c for c, v in self._std_concept_values
                                    if any(kw in v for kw in self._is_keywords)]
        elif statement_type == "CashFlowStatement":
            if any(kw in label_lower for kw in self._cf_keywords):
                limited_concepts = [c for c, v in self._std_concept_values
                                    if any(kw in v for kw in self._cf_keywords)]
        # Use limited concepts if available, otherwise use all
        concepts_to_check = limited_concepts if limited_concepts else [c for c, _ in self._std_concept_values]
        # Calculate similarities for candidate concepts
        for std_concept in concepts_to_check:
            similarity = SequenceMatcher(None, label_lower, std_concept.value.lower()).ratio()
            if similarity > best_score:
                best_score = similarity
                best_match = std_concept.value
        # Contextual boosts: nudge the score when the label strongly
        # corroborates the current best match for this statement type.
        if statement_type == "BalanceSheet":
            if "assets" in label_lower and "total" in label_lower:
                if best_match == StandardConcept.TOTAL_ASSETS.value:
                    best_score = min(1.0, best_score + 0.2)
            elif "liabilities" in label_lower and "total" in label_lower:
                if best_match == StandardConcept.TOTAL_LIABILITIES.value:
                    best_score = min(1.0, best_score + 0.2)
            elif "equity" in label_lower and ("total" in label_lower or "stockholders" in label_lower):
                if best_match == StandardConcept.TOTAL_EQUITY.value:
                    best_score = min(1.0, best_score + 0.2)
        elif statement_type == "IncomeStatement":
            if any(term in label_lower for term in ["revenue", "sales"]):
                if best_match == StandardConcept.REVENUE.value:
                    best_score = min(1.0, best_score + 0.2)
            elif "net income" in label_lower:
                if best_match == StandardConcept.NET_INCOME.value:
                    best_score = min(1.0, best_score + 0.2)
        # Promote to 0.5 confidence if score close enough to help match
        # more items that are almost at threshold
        if 0.45 <= best_score < 0.5:
            best_score = 0.5
        # If confidence is too low, return None
        if best_score < 0.5:
            return None, 0.0
        return best_match, best_score

    def learn_mappings(self, filings: List[Dict[str, Any]]) -> None:
        """
        Learn mappings from a list of filings.

        High-confidence inferences (>= 0.9) are added to the mapping store
        and cached; medium-confidence ones (>= 0.5) are queued in
        ``pending_mappings`` for human review.

        Args:
            filings: List of dicts with XBRL data (keys: concept, label, and
                optionally statement_type, calculation_parent, position)
        """
        # Pre-filter to only process unmapped concepts
        mapped_concepts = set()
        for company_concepts in self.mapping_store.mappings.values():
            mapped_concepts.update(company_concepts)
        unmapped_filings = [f for f in filings if f.get("concept") not in mapped_concepts]

        # Collect high-confidence mappings, remembering each concept's own
        # statement type so the cache can be keyed correctly later.
        # (Bug fix: the previous implementation keyed every cache entry on
        # the statement type of whatever filing happened to be processed
        # last, producing wrong cache keys.)
        mappings_to_add: Dict[str, Dict[str, str]] = {}
        for filing in unmapped_filings:
            concept = filing["concept"]
            label = filing["label"]
            statement_type = filing.get("statement_type", "")
            context = {
                "statement_type": statement_type,
                "calculation_parent": filing.get("calculation_parent", ""),
                "position": filing.get("position", "")
            }
            # Infer mapping and confidence
            standard_concept, confidence = self._infer_mapping(concept, label, context)
            # Handle based on confidence
            if standard_concept and confidence >= 0.9:
                mappings_to_add.setdefault(standard_concept, {})[concept] = statement_type
            elif standard_concept and confidence >= 0.5:
                self.pending_mappings.setdefault(standard_concept, []).append((concept, confidence, label))

        # Batch add all mappings at once
        for std_concept, concepts in mappings_to_add.items():
            for concept, statement_type in concepts.items():
                self.mapping_store.add(concept, std_concept)
                # Cache under the statement type of the filing that
                # produced this concept.
                self._cache[(concept, statement_type)] = std_concept

    def save_pending_mappings(self, destination: str) -> None:
        """
        Save pending mappings to a file.

        Args:
            destination: Path to save the pending mappings
        """
        # Convert the (concept, confidence, label) tuples to JSON-friendly dicts.
        serializable_mappings = {}
        for std_concept, mappings in self.pending_mappings.items():
            serializable_mappings[std_concept] = [
                {"concept": c, "confidence": conf, "label": lbl}
                for c, conf, lbl in mappings
            ]
        with open(destination, 'w') as f:
            json.dump(serializable_mappings, f, indent=2)
def standardize_statement(statement_data: List[Dict[str, Any]], mapper: ConceptMapper) -> List[Dict[str, Any]]:
    """
    Standardize labels in a statement using the concept mapper.

    Args:
        statement_data: List of statement line items
        mapper: ConceptMapper instance

    Returns:
        Statement data with standardized labels where possible. Items whose
        label changes are shallow-copied with the previous label preserved
        under 'original_label'; all other items are returned unchanged.
    """
    statement_type = statement_data[0].get("statement_type", "") if statement_data else ""

    # First pass - identify which items need standardization and build each
    # item's lookup context once. Abstract/dimension items and items without
    # a concept or label are left untouched. Keyed by index for O(1) lookup
    # in the second pass (the previous implementation re-scanned the list
    # per item, which was O(n^2)).
    to_standardize: Dict[int, Tuple[str, str, Dict[str, Any]]] = {}
    for i, item in enumerate(statement_data):
        # Skip abstract elements and dimensions as they don't need standardization
        if item.get("is_abstract", False) or item.get("is_dimension", False):
            continue
        concept = item.get("concept", "")
        if not concept:
            continue
        label = item.get("label", "")
        if not label:
            continue
        context = {
            "statement_type": item.get("statement_type", "") or statement_type,
            "level": item.get("level", 0),
            "is_total": "total" in label.lower() or item.get("is_total", False)
        }
        to_standardize[i] = (concept, label, context)

    # If no items need standardization, return early with unchanged data
    if not to_standardize:
        return statement_data

    # Second pass - build the result, copying only items whose label changes.
    result = []
    for i, item in enumerate(statement_data):
        prepared = to_standardize.get(i)
        if prepared is None:
            # Items that don't need standardization are used as-is
            result.append(item)
            continue
        concept, label, context = prepared
        # Try to map the concept
        standard_label = mapper.map_concept(concept, label, context)
        if standard_label:
            # Shallow copy only when needed, so the caller's item is not mutated.
            standardized_item = item.copy()
            standardized_item["label"] = standard_label
            standardized_item["original_label"] = label
            result.append(standardized_item)
        else:
            # No mapping found, use original item
            result.append(item)
    return result
def create_default_mappings_file(file_path: str) -> None:
    """
    Create the initial concept_mappings.json file with default mappings.

    This can be called during package installation or initialization.
    An existing file is never overwritten.

    Args:
        file_path: Path where to create the file
    """
    # Never overwrite an existing mappings file.
    if os.path.exists(file_path):
        return

    # Make sure the parent directory exists.
    directory = os.path.dirname(file_path)
    if directory:
        os.makedirs(directory, exist_ok=True)

    # Seed the file with a minimal set of mappings to get started.
    minimal_mappings = {
        StandardConcept.REVENUE.value: [
            "us-gaap_Revenue",
            "us-gaap_SalesRevenueNet",
            "us-gaap_Revenues"
        ],
        StandardConcept.NET_INCOME.value: [
            "us-gaap_NetIncome",
            "us-gaap_NetIncomeLoss",
            "us-gaap_ProfitLoss"
        ],
        StandardConcept.TOTAL_ASSETS.value: [
            "us-gaap_Assets",
            "us-gaap_AssetsTotal"
        ]
    }
    with open(file_path, 'w') as f:
        json.dump(minimal_mappings, f, indent=2)
# Initialize MappingStore - only loads from JSON
def initialize_default_mappings(read_only: bool = False) -> MappingStore:
    """
    Initialize a MappingStore with mappings from the concept_mappings.json file.

    Args:
        read_only: If True, prevent writing changes back to the file (used in testing)

    Returns:
        MappingStore initialized with mappings from JSON file
    """
    store = MappingStore(read_only=read_only)
    if not read_only:
        # Seed the mappings file with minimal defaults on first use.
        # Skipped in read_only mode so tests never create files on disk.
        if not os.path.exists(store.source):
            create_default_mappings_file(store.source)
    return store

View File

@@ -0,0 +1,872 @@
"""
Statement Resolution for XBRL data.
This module provides a robust system for identifying and matching XBRL financial statements,
notes, and disclosures regardless of taxonomy variations and company-specific customizations.
"""
import re
from dataclasses import dataclass, field
from enum import Enum
from typing import Any, Dict, List, Optional, Tuple
from edgar.core import log
from edgar.xbrl.exceptions import StatementNotFound
from edgar.xbrl.statements import statement_to_concepts
class StatementCategory(Enum):
    """Categories of XBRL presentation sections."""
    FINANCIAL_STATEMENT = "statement"  # Primary statements (balance sheet, income, cash flow, ...)
    NOTE = "note"                      # Notes to the financial statements
    DISCLOSURE = "disclosure"          # Standalone disclosures (e.g. segment information)
    DOCUMENT = "document"  # For cover page, signatures, etc.
    OTHER = "other"                    # Anything that fits none of the categories above
@dataclass
class ConceptPattern:
    """Pattern for matching statement concepts across different taxonomies."""
    # Regex pattern applied to concept names.
    pattern: str
    # NOTE(review): presumably a relative match weight used when scoring
    # candidates; not referenced in this module's visible code - confirm at use sites.
    weight: float = 1.0
@dataclass
class StatementType:
    """Detailed information about a statement type for matching."""
    name: str
    primary_concepts: List[str]
    category: StatementCategory = StatementCategory.FINANCIAL_STATEMENT  # Default to financial statement
    alternative_concepts: List[str] = field(default_factory=list)
    concept_patterns: List[str] = field(default_factory=list)
    key_concepts: List[str] = field(default_factory=list)
    role_patterns: List[str] = field(default_factory=list)
    title: str = ""
    supports_parenthetical: bool = False
    weight_map: Dict[str, float] = field(default_factory=dict)

    def match_concept(self, concept_name: str) -> bool:
        """Check if a concept name matches this statement type's concepts."""
        # Exact matches against the known concept lists come first.
        if concept_name in self.primary_concepts or concept_name in self.alternative_concepts:
            return True
        # Otherwise fall back to the configured regex patterns.
        return any(re.match(pattern, concept_name) for pattern in self.concept_patterns)

    def match_role(self, role_uri: str, role_name: str = "", role_def: str = "") -> bool:
        """Check if role information matches this statement type."""
        # Substring match of this type's name in any of the role identifiers.
        target = self.name.lower()
        for candidate in (role_uri, role_name, role_def):
            if candidate and target in candidate.lower():
                return True
        # Otherwise try the configured regex patterns against URI and name.
        return any(
            re.match(pattern, role_uri) or (role_name and re.match(pattern, role_name))
            for pattern in self.role_patterns
        )
# Registry of statement types with matching information
#
# Each entry bundles everything needed to recognize one kind of presentation
# section: exact concept names (primary + alternative), regex fallbacks for
# company-specific namespaces, key concepts whose presence corroborates a
# match, role URI/name patterns, a display title, whether a "(Parenthetical)"
# variant exists, and optional per-concept weights used for content scoring.
statement_registry = {
    # --- Primary financial statements ---
    "BalanceSheet": StatementType(
        name="BalanceSheet",
        category=StatementCategory.FINANCIAL_STATEMENT,
        primary_concepts=["us-gaap_StatementOfFinancialPositionAbstract"],
        alternative_concepts=[
            "us-gaap_BalanceSheetAbstract",
            "ifrs-full_StatementOfFinancialPositionAbstract"  # IFRS equivalent
        ],
        concept_patterns=[
            r".*_StatementOfFinancialPositionAbstract$",
            r".*_BalanceSheetAbstract$",
            r".*_ConsolidatedBalanceSheetsAbstract$",
            r".*_CondensedConsolidatedBalanceSheetsUnauditedAbstract$"
        ],
        key_concepts=[
            "us-gaap_Assets", "us-gaap_Liabilities", "us-gaap_StockholdersEquity",
            "ifrs-full_Assets", "ifrs-full_Liabilities", "ifrs-full_Equity"  # IFRS equivalents
        ],
        role_patterns=[
            r".*[Bb]alance[Ss]heet.*",
            r".*[Ss]tatement[Oo]f[Ff]inancial[Pp]osition.*",
            r".*StatementConsolidatedBalanceSheets.*"
        ],
        title="Consolidated Balance Sheets",
        supports_parenthetical=True,
        weight_map={"assets": 0.3, "liabilities": 0.3, "equity": 0.4}
    ),
    "IncomeStatement": StatementType(
        name="IncomeStatement",
        category=StatementCategory.FINANCIAL_STATEMENT,
        primary_concepts=["us-gaap_IncomeStatementAbstract"],
        alternative_concepts=[
            "us-gaap_StatementOfIncomeAbstract",
            "ifrs-full_IncomeStatementAbstract"  # IFRS equivalent
        ],
        concept_patterns=[
            r".*_IncomeStatementAbstract$",
            r".*_StatementOfIncomeAbstract$",
            r".*_ConsolidatedStatementsOfIncomeAbstract$",
            r".*_CondensedConsolidatedStatementsOfIncomeUnauditedAbstract$"
        ],
        key_concepts=[
            "us-gaap_Revenues", "us-gaap_NetIncomeLoss",
            "ifrs-full_Revenue", "ifrs-full_ProfitLoss"  # IFRS equivalents
        ],
        role_patterns=[
            r".*[Ii]ncome[Ss]tatement.*",
            r".*[Ss]tatement[Oo]f[Ii]ncome.*",
            r".*[Oo]perations.*",
            r".*StatementConsolidatedStatementsOfIncome.*"
        ],
        title="Consolidated Statement of Income",
        supports_parenthetical=True,
        weight_map={"revenues": 0.4, "netIncomeLoss": 0.6}
    ),
    "CashFlowStatement": StatementType(
        name="CashFlowStatement",
        category=StatementCategory.FINANCIAL_STATEMENT,
        primary_concepts=["us-gaap_StatementOfCashFlowsAbstract"],
        alternative_concepts=["ifrs-full_StatementOfCashFlowsAbstract"],  # IFRS equivalent
        concept_patterns=[
            r".*_StatementOfCashFlowsAbstract$",
            r".*_CashFlowsAbstract$",
            r".*_ConsolidatedStatementsOfCashFlowsAbstract$",
            r".*_CondensedConsolidatedStatementsOfCashFlowsUnauditedAbstract$"
        ],
        key_concepts=[
            "us-gaap_NetCashProvidedByUsedInOperatingActivities",
            "us-gaap_CashAndCashEquivalentsPeriodIncreaseDecrease",
            "ifrs-full_CashFlowsFromUsedInOperatingActivities",  # IFRS equivalents
            "ifrs-full_IncreaseDecreaseInCashAndCashEquivalents"
        ],
        role_patterns=[
            r".*[Cc]ash[Ff]low.*",
            r".*[Ss]tatement[Oo]f[Cc]ash[Ff]lows.*",
            r".*StatementConsolidatedStatementsOfCashFlows.*"
        ],
        title="Consolidated Statement of Cash Flows",
        supports_parenthetical=False
    ),
    "StatementOfEquity": StatementType(
        name="StatementOfEquity",
        category=StatementCategory.FINANCIAL_STATEMENT,
        primary_concepts=["us-gaap_StatementOfStockholdersEquityAbstract"],
        alternative_concepts=[
            "us-gaap_StatementOfShareholdersEquityAbstract",
            "us-gaap_StatementOfPartnersCapitalAbstract"
        ],
        concept_patterns=[
            r".*_StatementOfStockholdersEquityAbstract$",
            r".*_StatementOfShareholdersEquityAbstract$",
            r".*_StatementOfChangesInEquityAbstract$",
            r".*_ConsolidatedStatementsOfShareholdersEquityAbstract$"
        ],
        key_concepts=["us-gaap_StockholdersEquity", "us-gaap_CommonStock", "us-gaap_RetainedEarnings"],
        role_patterns=[
            r".*[Ee]quity.*",
            r".*[Ss]tockholders.*",
            r".*[Ss]hareholders.*",
            r".*[Cc]hanges[Ii]n[Ee]quity.*",
            r".*StatementConsolidatedStatementsOfStockholdersEquity.*"
        ],
        title="Consolidated Statement of Equity",
        supports_parenthetical=False
    ),
    "ComprehensiveIncome": StatementType(
        name="ComprehensiveIncome",
        category=StatementCategory.FINANCIAL_STATEMENT,
        primary_concepts=["us-gaap_StatementOfIncomeAndComprehensiveIncomeAbstract"],
        alternative_concepts=["us-gaap_StatementOfComprehensiveIncomeAbstract"],
        concept_patterns=[
            r".*_ComprehensiveIncomeAbstract$",
            r".*_StatementOfComprehensiveIncomeAbstract$",
            r".*_ConsolidatedStatementsOfComprehensiveIncomeAbstract$"
        ],
        key_concepts=["us-gaap_ComprehensiveIncomeNetOfTax"],
        role_patterns=[
            r".*[Cc]omprehensive[Ii]ncome.*",
            r".*[Oo]ther[Cc]omprehensive.*",
            r".*StatementConsolidatedStatementsOfComprehensiveIncome.*"
        ],
        title="Consolidated Statement of Comprehensive Income",
        supports_parenthetical=True
    ),
    # --- Notes ---
    "Notes": StatementType(
        name="Notes",
        category=StatementCategory.NOTE,
        primary_concepts=["us-gaap_NotesToFinancialStatementsAbstract"],
        alternative_concepts=[],
        concept_patterns=[
            r".*_NotesToFinancialStatementsAbstract$",
            r".*_NotesAbstract$"
        ],
        key_concepts=[],
        role_patterns=[
            r".*[Nn]otes[Tt]o[Ff]inancial[Ss]tatements.*",
            r".*[Nn]ote\s+\d+.*",
            r".*[Nn]otes.*"
        ],
        title="Notes to Financial Statements",
        supports_parenthetical=False
    ),
    "AccountingPolicies": StatementType(
        name="AccountingPolicies",
        category=StatementCategory.NOTE,
        primary_concepts=["us-gaap_AccountingPoliciesAbstract"],
        alternative_concepts=[],
        concept_patterns=[
            r".*_AccountingPoliciesAbstract$",
            r".*_SignificantAccountingPoliciesAbstract$"
        ],
        key_concepts=["us-gaap_SignificantAccountingPoliciesTextBlock"],
        role_patterns=[
            r".*[Aa]ccounting[Pp]olicies.*",
            r".*[Ss]ignificant[Aa]ccounting[Pp]olicies.*"
        ],
        title="Significant Accounting Policies",
        supports_parenthetical=False
    ),
    # --- Disclosures ---
    "Disclosures": StatementType(
        name="Disclosures",
        category=StatementCategory.DISCLOSURE,
        primary_concepts=["us-gaap_DisclosuresAbstract"],
        alternative_concepts=[],
        concept_patterns=[
            r".*_DisclosuresAbstract$",
            r".*_DisclosureAbstract$"
        ],
        key_concepts=[],
        role_patterns=[
            r".*[Dd]isclosure.*"
        ],
        title="Disclosures",
        supports_parenthetical=False
    ),
    "SegmentDisclosure": StatementType(
        name="SegmentDisclosure",
        category=StatementCategory.DISCLOSURE,
        primary_concepts=["us-gaap_SegmentDisclosureAbstract"],
        alternative_concepts=[],
        concept_patterns=[
            r".*_SegmentDisclosureAbstract$",
            r".*_SegmentReportingDisclosureAbstract$"
        ],
        key_concepts=["us-gaap_SegmentReportingDisclosureTextBlock"],
        role_patterns=[
            r".*[Ss]egment.*",
            r".*[Ss]egment[Rr]eporting.*",
            r".*[Ss]egment[Ii]nformation.*"
        ],
        title="Segment Information",
        supports_parenthetical=False
    ),
    # --- Document sections ---
    "CoverPage": StatementType(
        name="CoverPage",
        category=StatementCategory.DOCUMENT,
        primary_concepts=["dei_CoverAbstract"],
        concept_patterns=[r".*_CoverAbstract$"],
        key_concepts=["dei_EntityRegistrantName", "dei_DocumentType"],
        role_patterns=[r".*[Cc]over.*"],
        title="Cover Page",
        supports_parenthetical=False
    )
}
class StatementResolver:
"""
Resolves statement identifiers to actual XBRL statement roles.
This class provides a multi-layered approach to statement matching,
handling taxonomy variations and company-specific customizations.
"""
def __init__(self, xbrl):
    """
    Initialize with an XBRL object.

    Args:
        xbrl: XBRL object containing parsed data
    """
    self.xbrl = xbrl
    self._cache = {}
    # Lookup indices for fast statement retrieval; populated below.
    self._statement_by_role_uri = {}
    self._statement_by_role_name = {}
    self._statement_by_primary_concept = {}
    self._statement_by_type = {}
    self._statement_by_role_def = {}
    # Translate legacy statement-type names into registry keys.
    self._legacy_to_registry = {}
    for legacy_type, legacy_info in statement_to_concepts.items():
        if legacy_type in statement_registry:
            # Same name exists in the registry: identity mapping.
            self._legacy_to_registry[legacy_type] = legacy_type
            continue
        # Otherwise look for a registry entry that knows this concept.
        for registry_name, registry_entry in statement_registry.items():
            known_concepts = registry_entry.primary_concepts + registry_entry.alternative_concepts
            if legacy_info.concept in known_concepts:
                self._legacy_to_registry[legacy_type] = registry_name
                break
    # Build the lookup indices immediately.
    self._initialize_indices()
def _initialize_indices(self):
"""Build lookup indices for fast statement retrieval."""
# Get all statements
statements = self.xbrl.get_all_statements()
# Reset indices
self._statement_by_role_uri = {}
self._statement_by_role_name = {}
self._statement_by_primary_concept = {}
self._statement_by_type = {}
self._statement_by_role_def = {}
# Build indices
for stmt in statements:
role = stmt.get('role', '')
role_name = stmt.get('role_name', '').lower() if stmt.get('role_name') else ''
primary_concept = stmt.get('primary_concept', '')
stmt_type = stmt.get('type', '')
role_def = stmt.get('definition', '').lower() if stmt.get('definition') else ''
# By role URI
self._statement_by_role_uri[role] = stmt
# By role name
if role_name:
if role_name not in self._statement_by_role_name:
self._statement_by_role_name[role_name] = []
self._statement_by_role_name[role_name].append(stmt)
# By primary concept
if primary_concept:
if primary_concept not in self._statement_by_primary_concept:
self._statement_by_primary_concept[primary_concept] = []
self._statement_by_primary_concept[primary_concept].append(stmt)
# By statement type
if stmt_type:
if stmt_type not in self._statement_by_type:
self._statement_by_type[stmt_type] = []
self._statement_by_type[stmt_type].append(stmt)
# By role definition (without spaces, lowercase)
if role_def:
def_key = role_def.replace(' ', '')
if def_key not in self._statement_by_role_def:
self._statement_by_role_def[def_key] = []
self._statement_by_role_def[def_key].append(stmt)
def _match_by_primary_concept(self, statement_type: str, is_parenthetical: bool = False) -> Tuple[List[Dict[str, Any]], Optional[str], float]:
    """
    Match statements using primary concept names.

    Args:
        statement_type: Statement type to match
        is_parenthetical: Whether to look for a parenthetical statement

    Returns:
        Tuple of (matching statements, found role, confidence score)
    """
    # Normalize legacy type names to their registry equivalents.
    registry_type = self._legacy_to_registry.get(statement_type, statement_type)
    registry_entry = statement_registry.get(registry_type)
    if registry_entry is None:
        # Unknown statement type: nothing to match.
        return [], None, 0.0

    matched_statements = []
    for concept in registry_entry.primary_concepts + registry_entry.alternative_concepts:
        for stmt in self._statement_by_primary_concept.get(concept, []):
            # When this type distinguishes parenthetical variants, keep
            # only statements whose parenthetical status matches.
            if registry_entry.supports_parenthetical:
                is_role_parenthetical = 'parenthetical' in stmt.get('definition', '').lower()
                if is_parenthetical != is_role_parenthetical:
                    continue
            matched_statements.append(stmt)

    if matched_statements:
        # Primary-concept hits are strong evidence: high confidence.
        return matched_statements, matched_statements[0]['role'], 0.9
    return [], None, 0.0
def _match_by_concept_pattern(self, statement_type: str, is_parenthetical: bool = False) -> Tuple[List[Dict[str, Any]], Optional[str], float]:
    """
    Match statements using regex patterns on concept names to handle
    custom company namespaces.

    Args:
        statement_type: Statement type to match
        is_parenthetical: Whether to look for a parenthetical statement

    Returns:
        Tuple of (matching statements, found role, confidence score)
    """
    # Normalize legacy type names to their registry equivalents.
    registry_type = self._legacy_to_registry.get(statement_type, statement_type)
    registry_entry = statement_registry.get(registry_type)
    if registry_entry is None or not registry_entry.concept_patterns:
        return [], None, 0.0

    matched_statements = []
    for stmt in self.xbrl.get_all_statements():
        primary_concept = stmt.get('primary_concept', '')
        if not primary_concept:
            continue
        for pattern in registry_entry.concept_patterns:
            if not re.match(pattern, primary_concept):
                continue
            # When this type distinguishes parenthetical variants, keep
            # only statements whose parenthetical status matches.
            if registry_entry.supports_parenthetical:
                is_role_parenthetical = 'parenthetical' in stmt.get('definition', '').lower()
                if is_parenthetical != is_role_parenthetical:
                    continue
            matched_statements.append(stmt)
            break  # Found a match, no need to check other patterns

    if matched_statements:
        # Pattern hits are slightly weaker evidence than exact concepts.
        return matched_statements, matched_statements[0]['role'], 0.85
    return [], None, 0.0
def _match_by_role_pattern(self, statement_type: str, is_parenthetical: bool = False) -> Tuple[List[Dict[str, Any]], Optional[str], float]:
    """
    Match statements using role URI or role name patterns.

    Args:
        statement_type: Statement type to match
        is_parenthetical: Whether to look for a parenthetical statement

    Returns:
        Tuple of (matching statements, found role, confidence score)
    """
    # Normalize legacy type names to their registry equivalents.
    registry_type = self._legacy_to_registry.get(statement_type, statement_type)
    registry_entry = statement_registry.get(registry_type)
    if registry_entry is None or not registry_entry.role_patterns:
        return [], None, 0.0

    matched_statements = []
    for stmt in self.xbrl.get_all_statements():
        role = stmt.get('role', '')
        role_name = stmt.get('role_name', '')
        for pattern in registry_entry.role_patterns:
            uri_hit = re.search(pattern, role, re.IGNORECASE)
            name_hit = role_name and re.search(pattern, role_name, re.IGNORECASE)
            if not (uri_hit or name_hit):
                continue
            # When this type distinguishes parenthetical variants, keep
            # only statements whose parenthetical status matches.
            if registry_entry.supports_parenthetical:
                is_role_parenthetical = 'parenthetical' in stmt.get('definition', '').lower()
                if is_parenthetical != is_role_parenthetical:
                    continue
            matched_statements.append(stmt)
            break  # Found a match, no need to check other patterns

    if matched_statements:
        # Role-pattern matches are decent but weaker than concept matches.
        return matched_statements, matched_statements[0]['role'], 0.75
    return [], None, 0.0
def _match_by_content(self, statement_type: str) -> Tuple[List[Dict[str, Any]], Optional[str], float]:
    """
    Match statements by analyzing their content against key concepts.

    Each candidate statement's presentation tree is scored on how many of
    the registry's key concepts it contains (weighted when a weight map is
    configured), and the best scorer above a 0.4 threshold is returned.

    Args:
        statement_type: Statement type to match

    Returns:
        Tuple of (matching statements, found role, confidence score)
    """
    # Convert legacy types to registry types if needed
    registry_type = self._legacy_to_registry.get(statement_type, statement_type)

    # Check if this is a known statement type
    if registry_type not in statement_registry:
        return [], None, 0.0

    # Get registry information
    registry_entry = statement_registry[registry_type]
    key_concepts = registry_entry.key_concepts
    if not key_concepts:
        return [], None, 0.0

    # Total configured weight; may be 0.0 when no weight map is defined.
    # Hoisted out of the loop and guarded below: the old code divided by
    # this sum whenever total_weight > 0, which raised ZeroDivisionError
    # when the weight map was empty but default weights (1.0) had
    # accumulated in total_weight.
    weight_sum = sum(registry_entry.weight_map.values())

    # Score each statement based on presence of key concepts
    statement_scores = []
    for stmt in self.xbrl.get_all_statements():
        role = stmt.get('role', '')
        if role not in self.xbrl.presentation_trees:
            continue

        # Concepts present in this statement's presentation tree
        tree = self.xbrl.presentation_trees[role]
        all_nodes = set(tree.all_nodes.keys())

        # Count matching key concepts, accumulating weights where configured
        matches = 0
        total_weight = 0.0
        for concept in key_concepts:
            # Normalize concept name (e.g. 'us-gaap:Assets' -> 'us-gaap_Assets')
            normalized = concept.replace(':', '_')
            if concept in all_nodes or normalized in all_nodes:
                matches += 1
                # Add weighting if available (default weight 1.0)
                concept_key = concept.split('_')[-1].lower()
                total_weight += registry_entry.weight_map.get(concept_key, 1.0)

        # Base confidence: fraction of key concepts found. Prefer the
        # weighted score when an explicit weight map exists (guard against
        # a zero denominator).
        confidence = matches / len(key_concepts)
        if total_weight > 0 and weight_sum > 0:
            confidence = min(total_weight / weight_sum, 1.0)

        if confidence > 0:
            statement_scores.append((stmt, confidence))

    # Sort by confidence score, best match first
    statement_scores.sort(key=lambda x: x[1], reverse=True)

    # Return best match if above threshold, boosting confidence but capping at 0.85
    if statement_scores and statement_scores[0][1] >= 0.4:
        best_match, confidence = statement_scores[0]
        return [best_match], best_match['role'], min(confidence + 0.2, 0.85)
    return [], None, 0.0
def _match_by_standard_name(self, statement_type: str) -> Tuple[List[Dict[str, Any]], Optional[str], float]:
    """
    Match statements by standard statement type name.

    Args:
        statement_type: Statement type to match

    Returns:
        Tuple of (matching statements, found role, confidence score)
    """
    # An exact hit on the classified type index is nearly authoritative.
    candidates = self._statement_by_type.get(statement_type) or []
    if candidates:
        return candidates, candidates[0]['role'], 0.95
    return [], None, 0.0
def _match_by_role_definition(self, statement_type: str) -> Tuple[List[Dict[str, Any]], Optional[str], float]:
    """
    Match statements by role definition text.

    Args:
        statement_type: Statement type or definition text to match

    Returns:
        Tuple of (matching statements, found role, confidence score)
    """
    # Normalize the request the same way role-definition keys are stored.
    clean_type = statement_type.lower().replace(' ', '')

    # Exact match on the normalized definition text
    exact = self._statement_by_role_def.get(clean_type)
    if exact:
        return exact, exact[0]['role'], 0.85

    # Fall back to substring matches in either direction, at lower confidence.
    for def_key, candidates in self._statement_by_role_def.items():
        if clean_type in def_key:
            return candidates, candidates[0]['role'], 0.65
        if def_key in clean_type:
            return candidates, candidates[0]['role'], 0.55
    return [], None, 0.0
def _get_best_guess(self, statement_type: str) -> Tuple[List[Dict[str, Any]], Optional[str], float]:
    """
    Make a best guess when all other methods fail.

    Args:
        statement_type: Statement type to guess

    Returns:
        Tuple of (matching statements, found role, confidence score)
    """
    clean_type = statement_type.lower()

    # Loose substring matching against role names, in either direction.
    for role_name, candidates in self._statement_by_role_name.items():
        if clean_type in role_name or role_name in clean_type:
            return candidates, candidates[0]['role'], 0.4

    all_statements = self.xbrl.get_all_statements()
    if not all_statements:
        return [], None, 0.0

    # Prefer one of the primary financial statements if any were classified.
    for primary in ('BalanceSheet', 'IncomeStatement', 'CashFlowStatement'):
        candidates = self._statement_by_type.get(primary) or []
        if candidates:
            return candidates, candidates[0]['role'], 0.2

    # Last resort: the first statement of any kind, with minimal confidence.
    first = all_statements[0]
    return [first], first['role'], 0.1
def find_statement(self, statement_type: str, is_parenthetical: bool = False,
                   category_filter: Optional[StatementCategory] = None) -> Tuple[List[Dict[str, Any]], Optional[str], str, float]:
    """
    Find a statement by type, with multi-layered fallback approach.

    Matching strategies are tried in decreasing order of reliability
    (standard name, primary concept, concept pattern, role pattern,
    content analysis, role definition); the first result whose confidence
    clears that strategy's threshold wins. Successful lookups are cached.

    Args:
        statement_type: Statement type or identifier (may be a role URI)
        is_parenthetical: Whether to look for parenthetical version
        category_filter: Optional filter to only match statements of a specific category

    Returns:
        Tuple of (matching_statements, found_role, canonical_statement_type, confidence_score)

    Raises:
        StatementNotFound: When no statements exist at all, or the best
            guess falls below the 0.3 confidence floor.

    Note:
        For standard statement types like "BalanceSheet", "IncomeStatement", etc., the
        canonical_statement_type will be the input statement_type, allowing downstream
        code to still recognize and apply type-specific logic.
    """
    # Check cache first
    category_key = str(category_filter.value) if category_filter else "None"
    cache_key = f"{statement_type}_{is_parenthetical}_{category_key}"
    if cache_key in self._cache:
        return self._cache[cache_key]

    def cache_and_return(result):
        # Memoize the outcome so repeated lookups are O(1).
        self._cache[cache_key] = result
        return result

    # If this is a role URI we already know, return immediately
    if statement_type in self._statement_by_role_uri:
        stmt = self._statement_by_role_uri[statement_type]
        if category_filter:
            # Determine the statement's category, falling back to the registry
            stmt_category = None
            if stmt.get('category'):
                stmt_category = stmt['category']
            elif stmt['type'] in statement_registry:
                stmt_category = statement_registry[stmt['type']].category.value
            # Reject the cached role if the category doesn't match
            if stmt_category != category_filter.value:
                return cache_and_return(([], None, statement_type, 0.0))
        return cache_and_return(([stmt], statement_type, stmt.get('type', statement_type), 1.0))

    # Check if this is a canonical statement type from the registry
    is_canonical_type = statement_type in statement_registry

    def canonical(statements: List[Dict[str, Any]]) -> str:
        # Preserve canonical registry types so downstream type-specific
        # logic (e.g. rendering rules) still applies.
        return statement_type if is_canonical_type else statements[0].get('type', statement_type)

    # Strategies in decreasing reliability, each paired with the minimum
    # confidence required (exclusive) to accept its result.
    strategies = (
        (lambda: self._match_by_standard_name(statement_type), 0.9),
        (lambda: self._match_by_primary_concept(statement_type, is_parenthetical), 0.8),
        (lambda: self._match_by_concept_pattern(statement_type, is_parenthetical), 0.8),
        (lambda: self._match_by_role_pattern(statement_type, is_parenthetical), 0.7),
        (lambda: self._match_by_content(statement_type), 0.6),
        (lambda: self._match_by_role_definition(statement_type), 0.5),
    )
    for matcher, threshold in strategies:
        statements, role, conf = matcher()
        if statements and conf > threshold:
            return cache_and_return((statements, role, canonical(statements), conf))

    # No good match found; fall back to a best guess with low confidence
    statements, role, conf = self._get_best_guess(statement_type)
    if conf < 0.4:
        # Gather entity context for detailed error reporting
        entity_name = getattr(self.xbrl, 'entity_name', 'Unknown')
        cik = getattr(self.xbrl, 'cik', 'Unknown')
        period_of_report = getattr(self.xbrl, 'period_of_report', 'Unknown')
        if len(statements) == 0:
            raise StatementNotFound(
                statement_type=statement_type,
                confidence=conf,
                found_statements=[],
                entity_name=entity_name,
                cik=cik,
                period_of_report=period_of_report,
                reason="No statements available in XBRL data"
            )
        elif conf < 0.3:
            raise StatementNotFound(
                statement_type=statement_type,
                confidence=conf,
                found_statements=[s['definition'] for s in statements],
                entity_name=entity_name,
                cik=cik,
                period_of_report=period_of_report,
                reason="Confidence threshold not met"
            )
        else:
            # Logger.warn is a deprecated alias of Logger.warning
            log.warning(
                f"No good match found for statement type '{statement_type}'. The best guess has low confidence: {conf:.2f}")

    if statements:
        return cache_and_return((statements, role, canonical(statements), conf))
    return cache_and_return(([], None, statement_type, 0.0))

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,27 @@
"""
XBRL Statement Stitching Package
This package provides functionality to combine multiple XBRL statements
across different time periods into a unified view, handling concept
consistency issues and normalizing data representation.
"""
# Import standardize_statement for backwards compatibility with tests
from edgar.xbrl.standardization import standardize_statement
from edgar.xbrl.stitching.core import StatementStitcher, stitch_statements
from edgar.xbrl.stitching.periods import determine_optimal_periods
from edgar.xbrl.stitching.query import StitchedFactQuery, StitchedFactsView
from edgar.xbrl.stitching.utils import render_stitched_statement, to_pandas
from edgar.xbrl.stitching.xbrls import XBRLS
__all__ = [
'XBRLS',
'StatementStitcher',
'stitch_statements',
'determine_optimal_periods',
'render_stitched_statement',
'to_pandas',
'standardize_statement',
'StitchedFactsView',
'StitchedFactQuery'
]

View File

@@ -0,0 +1,621 @@
"""
XBRL Statement Stitching - Core Functionality
This module contains the core StatementStitcher class and related functionality
for combining multiple XBRL statements across different time periods.
"""
from collections import defaultdict
from datetime import datetime
from enum import Enum
from typing import Any, Dict, List, Optional, Set, Tuple, Union
from edgar.xbrl.core import format_date, parse_date
from edgar.xbrl.standardization import ConceptMapper, initialize_default_mappings, standardize_statement
from edgar.xbrl.stitching.ordering import StatementOrderingManager
from edgar.xbrl.stitching.periods import determine_optimal_periods
from edgar.xbrl.stitching.presentation import VirtualPresentationTree
class StatementStitcher:
    """
    Combines multiple statements across time periods into a unified view.

    This class handles the complexities of combining financial statements
    from different periods, including:
    - Normalizing concepts that change over time
    - Aligning periods correctly
    - Handling missing data points
    - Providing both standardized and company-specific views
    """

    class PeriodType(str, Enum):
        """Types of period views available for stitched statements.

        Each member's value doubles as the human-readable view name.
        """
        RECENT_PERIODS = "Most Recent Periods"
        RECENT_YEARS = "Recent Years"
        THREE_YEAR_COMPARISON = "Three-Year Comparison"
        THREE_QUARTERS = "Three Recent Quarters"
        ANNUAL_COMPARISON = "Annual Comparison"
        QUARTERLY_TREND = "Quarterly Trend"
        ALL_PERIODS = "All Available Periods"
def __init__(self, concept_mapper: Optional[ConceptMapper] = None):
    """
    Initialize a StatementStitcher instance.

    Args:
        concept_mapper: Optional ConceptMapper for standardizing concepts.
                        If None, a default mapper is created.
    """
    if concept_mapper is not None:
        self.concept_mapper = concept_mapper
        self.mapping_store = concept_mapper.mapping_store
    else:
        # No mapper supplied: build one backed by the default mapping store.
        self.mapping_store = initialize_default_mappings()
        self.concept_mapper = ConceptMapper(self.mapping_store)

    # Stitching state, reset at the start of every stitch_statements() call.
    self.periods = []                    # ordered list of period identifiers
    self.period_dates = {}               # period ID -> display date label
    self.data = defaultdict(dict)        # {concept: {period: value}}
    self.concept_metadata = {}           # per-concept metadata (level, etc.)
    self.ordering_manager = None         # initialized during stitching
    self.original_statement_order = []   # original order for hierarchy context
def stitch_statements(
    self,
    statements: List[Dict[str, Any]],
    period_type: Union[PeriodType, str] = PeriodType.RECENT_PERIODS,
    max_periods: Optional[int] = None,
    standard: bool = True
) -> Dict[str, Any]:
    """
    Stitch multiple statements into a unified view.

    Args:
        statements: List of statement data from different filings; index 0
            is treated as the most recent filing (it supplies the
            reference ordering of concepts)
        period_type: Type of period view to generate
        max_periods: Maximum number of periods to include; defaults to
            len(statements) + 2
        standard: Whether to use standardized concept labels

    Returns:
        Dictionary with stitched statement data ('periods' and
        'statement_data' keys)
    """
    # Reset state so the stitcher instance can be reused across calls
    self.periods = []
    self.period_dates = {}
    self.data = defaultdict(dict)
    self.concept_metadata = {}
    self.original_statement_order = []

    # Initialize ordering manager for this statement type
    statement_type = statements[0].get('statement_type', 'IncomeStatement') if statements else 'IncomeStatement'
    self.ordering_manager = StatementOrderingManager(statement_type)

    # Capture original statement order from the most recent (first) statement
    # for hierarchy context; both concepts and labels are recorded because
    # stitched rows may be keyed by either.
    if statements:
        reference_statement = statements[0]
        self.original_statement_order = []
        for item in reference_statement.get('data', []):
            concept = item.get('concept')
            label = item.get('label')
            if concept:
                self.original_statement_order.append(concept)
            if label and label not in self.original_statement_order:
                self.original_statement_order.append(label)

    # Extract and sort all periods (newest first, de-duplicated by date)
    all_periods = self._extract_periods(statements)

    # Set max_periods if not provided
    max_periods = max_periods or len(statements) + 2  # Allow for the last statement to have 3 periods

    # Select appropriate periods based on period_type
    selected_periods = self._select_periods(all_periods, period_type, max_periods)
    self.periods = selected_periods

    # Process each statement
    for _i, statement in enumerate(statements):
        # Only process statements that have periods in our selection
        statement_periods = set(statement['periods'].keys())
        relevant_periods = statement_periods.intersection(set(selected_periods))
        if not relevant_periods:
            continue

        # Standardize the statement if needed
        if standard:
            processed_data = self._standardize_statement_data(statement)
        else:
            processed_data = statement['data']

        # Store data for each item
        self._integrate_statement_data(processed_data, statement['periods'], relevant_periods)

    # Format the stitched data
    return self._format_output_with_ordering(statements)
def _extract_periods(self, statements: List[Dict[str, Any]]) -> List[Tuple[str, datetime]]:
    """
    Extract and sort all periods from the statements, de-duplicating periods with the same date.

    Period IDs come in two shapes: 'instant_<date>' and
    'duration_<start>_<end>'.  Periods from different filings covering the
    same dates are collapsed to one entry, preferring the most recent
    filing (lowest statement index).  As a side effect, self.period_dates
    is populated with a display label for each kept period ID.

    Args:
        statements: List of statement data, most recent filing first

    Returns:
        List of (period_id, end_date) tuples, sorted by date (newest first)
    """
    # Track unique periods by a normalized key (period type + dates); this
    # handles cases where different period_ids reference the same date
    unique_periods = {}  # key: normalized period key, value: (period_id, datetime, statement_index)

    for i, statement in enumerate(statements):
        # Use statement index (i) to prioritize more recent filings
        # Lower index = more recent filing
        for period_id, period_info in statement['periods'].items():
            # Extract end date for sorting
            try:
                # Initialize normalized_key to silence the type checker
                normalized_key = ""
                if period_id.startswith('instant_'):
                    date_str = period_id.split('_')[1]
                    # Format the date consistently with single statements
                    try:
                        date_obj = parse_date(date_str)
                        display_date = format_date(date_obj)
                    except ValueError:
                        # Fall back to original label if parsing fails
                        display_date = period_info['label']
                    period_type = 'instant'
                    # For instant periods, create a normalized key with just the date
                    normalized_key = f"{period_type}_{date_str}"
                else:  # duration
                    # For durations, extract both start and end dates
                    parts = period_id.split('_')
                    if len(parts) >= 3:
                        start_date_str = parts[1]
                        end_date_str = parts[2]
                        start_date = parse_date(start_date_str)
                        end_date = parse_date(end_date_str)
                        date_str = end_date_str  # Use end date for sorting
                        # Format end date consistently - for stitched statements,
                        # we only need the end date for duration periods as that's what users compare
                        display_date = format_date(end_date)
                        period_type = 'duration'
                        # Create a normalized key that combines period type, start date, and end date
                        normalized_key = f"{period_type}_{format_date(start_date)}_{format_date(end_date)}"
                    else:
                        # Skip malformed period IDs
                        continue

                # Parse the end date for sorting
                end_date = parse_date(date_str)

                # Check if we already have this period (by normalized key)
                if normalized_key in unique_periods:
                    existing_idx = unique_periods[normalized_key][2]
                    # Only replace if this statement is from a more recent filing
                    if i < existing_idx:
                        unique_periods[normalized_key] = (period_id, end_date, i)
                        self.period_dates[period_id] = display_date
                else:
                    # Add new period
                    unique_periods[normalized_key] = (period_id, end_date, i)
                    self.period_dates[period_id] = display_date
            except (ValueError, TypeError, IndexError):
                # Skip periods with invalid dates
                continue

    # Extract and sort the unique periods
    all_periods = [(period_id, end_date) for period_id, end_date, _ in unique_periods.values()]

    # Sort by date, newest first
    return sorted(all_periods, key=lambda x: x[1], reverse=True)
def _select_periods(
    self,
    all_periods: List[Tuple[str, Union[str, datetime]]],
    period_type: Union[PeriodType, str],
    max_periods: int
) -> List[str]:
    """
    Select appropriate periods based on period_type.

    Note: RECENT_YEARS and QUARTERLY_TREND have no dedicated branch here
    and fall through to the default most-recent selection.

    Args:
        all_periods: List of (period_id, end_date) tuples, newest first
        period_type: Type of period view to generate
        max_periods: Maximum number of periods to include

    Returns:
        List of selected period IDs
    """
    if isinstance(period_type, str):
        try:
            period_type = StatementStitcher.PeriodType(period_type)
        except ValueError:
            # Default to recent periods if string doesn't match enum
            period_type = StatementStitcher.PeriodType.RECENT_PERIODS

    # Extract period types (instant vs duration)
    instants = [(pid, date) for pid, date in all_periods if pid.startswith('instant_')]
    durations = [(pid, date) for pid, date in all_periods if not pid.startswith('instant_')]

    # Apply different selection logic based on period_type
    if period_type == StatementStitcher.PeriodType.RECENT_PERIODS:
        # Just take the most recent periods up to max_periods
        return [pid for pid, _ in all_periods[:max_periods]]

    elif period_type == StatementStitcher.PeriodType.THREE_YEAR_COMPARISON:
        # For balance sheets, find year-end instants (at most one per year)
        year_ends = []
        years_seen = set()
        for pid, date in instants:
            # NOTE(review): `date` comes from _extract_periods as a datetime;
            # this relies on parse_date accepting that input — confirm.
            year = parse_date(date).year
            if year not in years_seen and len(year_ends) < max_periods:
                year_ends.append(pid)
                years_seen.add(year)
        return year_ends

    elif period_type == StatementStitcher.PeriodType.THREE_QUARTERS:
        # Find the most recent quarters (for income statements)
        quarterly_periods = []
        for pid, _date in durations:
            # Check if this appears to be a quarterly period
            if not pid.startswith('duration_'):
                continue
            start_date_str = pid.split('_')[1]
            end_date_str = pid.split('_')[2]
            try:
                start_date = parse_date(start_date_str)
                end_date = parse_date(end_date_str)
                days = (end_date - start_date).days
                # Assuming quarterly is around 90 days
                if 80 <= days <= 95:
                    quarterly_periods.append(pid)
                    if len(quarterly_periods) >= max_periods:
                        break
            except (ValueError, TypeError, IndexError):
                continue
        return quarterly_periods

    elif period_type == StatementStitcher.PeriodType.ANNUAL_COMPARISON:
        # Find annual periods (for income statements)
        annual_periods = []
        for pid, _date in durations:
            # Check if this appears to be an annual period
            if not pid.startswith('duration_'):
                continue
            start_date_str = pid.split('_')[1]
            end_date_str = pid.split('_')[2]
            try:
                start_date = parse_date(start_date_str)
                end_date = parse_date(end_date_str)
                days = (end_date - start_date).days
                # Assuming annual is around 365 days
                if 350 <= days <= 380:
                    annual_periods.append(pid)
                    if len(annual_periods) >= max_periods:
                        break
            except (ValueError, TypeError, IndexError):
                continue
        return annual_periods

    elif period_type == StatementStitcher.PeriodType.ALL_PERIODS:
        # Return all periods, newest first, up to max_periods
        return [pid for pid, _ in all_periods[:max_periods]]

    # Default to recent periods
    return [pid for pid, _ in all_periods[:max_periods]]
def _standardize_statement_data(self, statement: Dict[str, Any]) -> List[Dict[str, Any]]:
    """
    Standardize the statement data using the concept mapper.

    Args:
        statement: Statement data

    Returns:
        Standardized statement data
    """
    # Tag every line item with the statement type so the mapper has
    # context for better concept mapping.
    stmt_type = statement.get('statement_type', '')
    line_items = statement['data']
    for line_item in line_items:
        line_item['statement_type'] = stmt_type

    # Apply standardization using the concept mapper
    return standardize_statement(line_items, self.concept_mapper)
def _integrate_statement_data(
    self,
    statement_data: List[Dict[str, Any]],
    period_map: Dict[str, Dict[str, str]],
    relevant_periods: Set[str]
) -> None:
    """
    Integrate statement data from one statement into the stitched view.

    Populates self.data and self.concept_metadata in place.  Rows are
    keyed by label, but a concept-to-label map ensures that rows from
    different filings representing the same underlying concept are merged
    even when their labels differ; when a more recent filing supplies a
    different label, the row is migrated to that label.

    Args:
        statement_data: Statement data
        period_map: Map of period IDs to period information
        relevant_periods: Set of periods from this statement to include
    """
    # Map to track concepts by their underlying concept ID, not just label
    # This helps merge rows that represent the same concept but have different labels
    concept_to_label_map = {}

    for item in statement_data:
        concept = item.get('concept')
        label = item.get('label')

        # Skip items without concept or label
        if not concept or not label:
            continue

        # Skip abstract items with no children (headers without data)
        if item.get('is_abstract', False) and not item.get('children'):
            continue

        # Skip dimension items
        if any(bracket in label for bracket in ['[Axis]', '[Domain]', '[Member]', '[Line Items]', '[Table]', '[Abstract]']):
            continue

        # Use concept as the primary key for identifying the same financial line item
        # This is more reliable than labels which may vary across filings
        # If we've already seen this concept, use the existing label as the key
        # This ensures we merge rows that represent the same concept
        if concept in concept_to_label_map:
            concept_key = concept_to_label_map[concept]
        else:
            # For a new concept, use the current label as the key
            concept_key = label
            # Remember this mapping for future occurrences
            concept_to_label_map[concept] = concept_key

        # Store metadata about the concept (level, abstract status, etc.)
        # If we've already seen this concept, only update metadata if it's from a more recent period
        # This ensures we use labels from the most recent filing when merging rows
        if concept_key not in self.concept_metadata:
            self.concept_metadata[concept_key] = {
                'level': item.get('level', 0),
                'is_abstract': item.get('is_abstract', False),
                'is_total': item.get('is_total', False) or 'total' in label.lower(),
                'original_concept': concept,
                'latest_label': label  # Store the original label too
            }
        else:
            # For existing concepts, update the label to use the most recent one
            # We determine which periods are most recent based on position in self.periods
            # (earlier indices are more recent periods)

            # Find the periods in this statement
            statement_periods = [p for p in relevant_periods if p in self.periods]
            if statement_periods:
                # Get the most recent period in this statement
                most_recent_period = min(statement_periods, key=lambda p: self.periods.index(p))
                most_recent_idx = self.periods.index(most_recent_period)

                # Find the earliest period where we have data for this concept
                existing_periods = [p for p in self.data[concept_key].keys() if p in self.periods]
                if existing_periods:
                    earliest_existing_idx = min(self.periods.index(p) for p in existing_periods)

                    # If this statement has more recent data, update the label
                    if most_recent_idx < earliest_existing_idx:
                        # Update the concept key label for display
                        new_concept_key = label

                        # If we're changing the label, we need to migrate existing data
                        if new_concept_key != concept_key:
                            # Copy existing data to the new key
                            if new_concept_key not in self.data:
                                self.data[new_concept_key] = self.data[concept_key].copy()

                            # Update metadata
                            self.concept_metadata[new_concept_key] = self.concept_metadata[concept_key].copy()
                            self.concept_metadata[new_concept_key]['latest_label'] = label

                            # Update the concept mapping
                            concept_to_label_map[concept] = new_concept_key
                            concept_key = new_concept_key
                        else:
                            # Just update the latest label
                            self.concept_metadata[concept_key]['latest_label'] = label

        # Store values for relevant periods
        for period_id in relevant_periods:
            if period_id in self.periods:  # Only include selected periods
                value = item.get('values', {}).get(period_id)
                if value is not None:
                    self.data[concept_key][period_id] = {
                        'value': value,
                        'decimals': item.get('decimals', {}).get(period_id, 0)
                    }
def _format_output_with_ordering(self, statements: List[Dict[str, Any]]) -> Dict[str, Any]:
    """
    Format the stitched data for rendering with intelligent ordering using virtual presentation tree.

    Args:
        statements: Original statements for ordering reference

    Returns:
        Stitched statement data in the expected format: a dict with
        'periods' (list of (period_id, display_label) tuples) and
        'statement_data' (ordered list of line-item dicts)
    """
    # Get unified ordering for all concepts using the ordering manager
    concept_ordering = {}
    if self.ordering_manager:
        concept_ordering = self.ordering_manager.determine_ordering(statements)

    # Build virtual presentation tree to preserve hierarchy while applying semantic ordering
    presentation_tree = VirtualPresentationTree(self.ordering_manager)
    ordered_nodes = presentation_tree.build_tree(
        concept_metadata=self.concept_metadata,
        concept_ordering=concept_ordering,
        original_statement_order=self.original_statement_order
    )

    # Convert nodes back to the expected (concept_key, metadata) format
    ordered_concepts = [(node.concept, node.metadata) for node in ordered_nodes]

    # Build the output structure
    result = {
        'periods': [(pid, self.period_dates.get(pid, pid)) for pid in self.periods],
        'statement_data': []
    }

    for concept, metadata in ordered_concepts:
        # Create an item for each concept
        item = {
            # Use the latest label if available, otherwise fall back to the concept key
            'label': metadata.get('latest_label', concept),
            'level': metadata['level'],
            'is_abstract': metadata['is_abstract'],
            'is_total': metadata['is_total'],
            'concept': metadata['original_concept'],
            'values': {},
            'decimals': {}
        }

        # Add values for each period
        for period_id in self.periods:
            if period_id in self.data[concept]:
                item['values'][period_id] = self.data[concept][period_id]['value']
                item['decimals'][period_id] = self.data[concept][period_id]['decimals']

        # Set has_values flag based on whether there are any values
        item['has_values'] = len(item['values']) > 0

        # Only include items with values or abstract items (section headers)
        if item['has_values'] or item['is_abstract']:
            result['statement_data'].append(item)

    return result
def _format_output(self) -> Dict[str, Any]:
    """
    Backward compatibility method - calls the new ordering-aware method.

    Returns:
        Stitched statement data in the expected format
    """
    # For backward compatibility, call the new method with empty statements.
    # With no statements, the ordering manager has nothing to reference and
    # the legacy ordering behavior applies.
    return self._format_output_with_ordering([])
def stitch_statements(
    xbrl_list: List[Any],
    statement_type: str = 'IncomeStatement',
    period_type: Union[StatementStitcher.PeriodType, str] = StatementStitcher.PeriodType.RECENT_PERIODS,
    max_periods: int = 3,
    standard: bool = True,
    use_optimal_periods: bool = True,
    include_dimensions: bool = False
) -> Dict[str, Any]:
    """
    Stitch together statements from multiple XBRL objects.

    Args:
        xbrl_list: List of XBRL objects, should be from the same company and ordered by date
        statement_type: Type of statement to stitch ('IncomeStatement', 'BalanceSheet', etc.)
        period_type: Type of period view to generate
        max_periods: Maximum number of periods to include (default: 3)
        standard: Whether to use standardized concept labels (default: True)
        use_optimal_periods: Whether to use the entity info to determine optimal periods (default: True)
        include_dimensions: Whether to include dimensional segment data (default: False for stitching)

    Returns:
        Stitched statement data
    """
    # Initialize the stitcher
    stitcher = StatementStitcher()

    # Collect statements of the specified type from each XBRL object
    statements = []

    if use_optimal_periods:
        # Use our utility function to determine the best periods
        optimal_periods = determine_optimal_periods(xbrl_list, statement_type, max_periods=max_periods)

        # Limit to max_periods if needed
        if len(optimal_periods) > max_periods:
            optimal_periods = optimal_periods[:max_periods]

        # Extract the XBRL objects that contain our optimal periods
        for period_metadata in optimal_periods:
            xbrl = xbrl_list[period_metadata['xbrl_index']]

            # Get the statement and period info
            statement = xbrl.get_statement_by_type(statement_type, include_dimensions=include_dimensions)
            if not statement:
                continue

            # Only include the specific period from this statement
            period_key = period_metadata['period_key']
            if period_key not in statement['periods']:
                continue

            # Create a filtered version of the statement with just this period
            filtered_statement = {
                'role': statement['role'],
                'definition': statement['definition'],
                'statement_type': statement['statement_type'],
                'periods': {period_key: statement['periods'][period_key]},
                'data': statement['data']
            }

            # Build a more informative label from entity info
            # (e.g. "FY 2023-12-31" or "Q2 2023-06-30").
            # BUG FIX: the original code assigned period_metadata['period_type']
            # to a local named `period_type`, rebinding the function parameter
            # and clobbering the caller's requested period view before it was
            # passed to the stitcher below. Use a distinct local name.
            display_date = period_metadata['display_date']
            span_kind = period_metadata['period_type']  # 'instant' or 'duration'
            fiscal_period = period_metadata.get('fiscal_period')

            if span_kind == 'instant':
                period_label = f"FY {display_date}" if fiscal_period == 'FY' else display_date
            else:  # duration: add fiscal quarter/year info if available
                if fiscal_period == 'FY':
                    period_label = f"FY {display_date}"
                elif fiscal_period in ['Q1', 'Q2', 'Q3', 'Q4']:
                    period_label = f"{fiscal_period} {display_date}"
                else:
                    period_label = display_date

            # Update the period label, keeping the original for reference
            filtered_statement['periods'][period_key] = {
                'label': period_label,
                'original_label': statement['periods'][period_key]['label']
            }

            statements.append(filtered_statement)
    else:
        # Traditional approach without using entity info
        for xbrl in xbrl_list:
            # Get statement data for the specified type
            statement = xbrl.find_statement(statement_type)
            if statement:
                statements.append(statement)

    # Stitch the statements using the caller's requested period view
    return stitcher.stitch_statements(statements, period_type, max_periods, standard)

View File

@@ -0,0 +1,833 @@
"""
XBRL Statement Ordering - Intelligent Ordering for Multi-Period Statements
This module provides consistent ordering for financial statements across multiple periods
by combining template-based, reference-based, and semantic positioning strategies.
"""
import re
from enum import Enum
from typing import Dict, List, Optional, Tuple
try:
    from rapidfuzz import fuzz
except ImportError:
    # rapidfuzz is an optional speedup; degrade gracefully to the stdlib
    # matcher while keeping rapidfuzz's 0-100 percentage scale.
    from difflib import SequenceMatcher

    class fuzz:  # mirrors the rapidfuzz module-level name used below
        """Minimal stand-in exposing rapidfuzz's ``fuzz.ratio`` interface."""

        @staticmethod
        def ratio(s1: str, s2: str) -> float:
            """Similarity of two strings as a percentage in [0, 100]."""
            return 100.0 * SequenceMatcher(None, s1, s2).ratio()
class StatementType(str, Enum):
    """Supported statement types for ordering"""
    # The str mixin lets members compare equal to the plain strings
    # ("IncomeStatement", "BalanceSheet", ...) used as statement_type
    # arguments throughout this module.
    INCOME_STATEMENT = "IncomeStatement"
    BALANCE_SHEET = "BalanceSheet"
    CASH_FLOW = "CashFlowStatement"
    EQUITY = "StatementOfEquity"
class FinancialStatementTemplates:
    """Canonical ordering templates for financial statements based on XBRL concepts.

    Each template is a list of ``(base_position, section_name, items)`` tuples.
    ``base_position`` reserves a 100-wide band per section; an item's position
    is ``base_position + index`` within its section list, so earlier entries in
    a section sort first.

    NOTE(review): INCOME_STATEMENT_TEMPLATE holds namespaced XBRL concept IDs,
    while BALANCE_SHEET_TEMPLATE holds human-readable labels — for the balance
    sheet only the label-based fallback in get_template_position can match;
    confirm this asymmetry is intentional.
    """
    INCOME_STATEMENT_TEMPLATE = [
        # Revenue Section (0-99)
        (0, "revenue_section", [
            # Product/Service Revenue Components
            "us-gaap:SalesRevenueGoodsNet",
            "us-gaap:ProductSales",
            "us-gaap:SalesRevenueServicesNet",
            "us-gaap:SubscriptionRevenue",
            # Contract Revenue
            "us-gaap:RevenueFromContractWithCustomerExcludingAssessedTax",
            "us-gaap:RevenueFromContractWithCustomerIncludingAssessedTax",
            # Total Revenue
            "us-gaap:Revenue",
            "us-gaap:Revenues",
            "us-gaap:SalesRevenueNet",
            "us-gaap:OperatingRevenue"
        ]),
        # Cost Section (100-199)
        (100, "cost_section", [
            "us-gaap:CostOfRevenueAbstract",  # Abstract
            "us-gaap:CostOfRevenue",  # Total
            "us-gaap:CostOfGoodsSold",
            "us-gaap:CostOfGoodsAndServicesSold",
            "us-gaap:CostOfSales",
            "us-gaap:DirectOperatingCosts",
            "us-gaap:CostsAndExpenses"
        ]),
        # Gross Profit (200-299)
        (200, "gross_profit", [
            "us-gaap:GrossProfit"
        ]),
        # Operating Expenses (300-399)
        (300, "operating_expenses", [
            # R&D Expenses
            "us-gaap:ResearchAndDevelopmentCosts",
            "us-gaap:ResearchAndDevelopmentExpense",
            # SG&A Expenses
            "us-gaap:SellingGeneralAndAdministrativeExpense",
            "us-gaap:GeneralAndAdministrativeExpense",
            "us-gaap:AdministrativeExpense",
            "us-gaap:SellingAndMarketingExpense",
            "us-gaap:SellingExpense",
            "us-gaap:MarketingExpense",
            "us-gaap:AdvertisingExpense",
            # Total Operating Expenses
            "us-gaap:NoninterestExpense",
            "us-gaap:OperatingCostsAndExpenses",
            "us-gaap:OperatingExpenses"
        ]),
        # Operating Income (400-499)
        (400, "operating_income", [
            "us-gaap:OperatingIncomeLoss",
            "us-gaap:OperatingIncome",
            "us-gaap:IncomeLossFromContinuingOperationsBeforeInterestAndTaxes"
        ]),
        # Non-Operating (500-599)
        (500, "non_operating", [
            "us-gaap:InterestIncomeExpenseNet",
            "us-gaap:InterestAndDebtExpense",
            "us-gaap:InterestExpense",
            "us-gaap:InterestExpenseNonoperating",  # ADBE uses this for non-operating interest expense
            "us-gaap:InterestIncome",
            "us-gaap:InvestmentIncomeInterest",  # NVIDIA uses this variant
            "us-gaap:OtherNonoperatingIncomeExpense",
            "us-gaap:NonoperatingIncomeExpense",
            "orcl:NonoperatingIncomeExpenseIncludingEliminationOfNetIncomeLossAttributableToNoncontrollingInterests"
        ]),
        # Pre-Tax Income (600-699)
        (600, "pretax_income", [
            "us-gaap:IncomeLossBeforeIncomeTaxes",
            "us-gaap:IncomeLossFromContinuingOperationsBeforeIncomeTaxes",
            "us-gaap:IncomeLossFromContinuingOperationsBeforeIncomeTaxesExtraordinaryItemsNoncontrollingInterest",
            "orcl:IncomeLossFromContinuingOperationsIncludingNoncontrollingInterestBeforeIncomeTaxesExtraordinaryItems"
        ]),
        # Tax (700-799)
        (700, "tax", [
            "us-gaap:IncomeTaxesPaidNet",
            "us-gaap:IncomeTaxExpenseBenefit"
        ]),
        # Net Income (800-899)
        (800, "net_income", [
            "us-gaap:IncomeLossFromContinuingOperationsIncludingPortionAttributableToNoncontrollingInterest",
            "us-gaap:IncomeLossFromContinuingOperations",
            "us-gaap:NetIncome",
            "us-gaap:NetIncomeLoss",
            "us-gaap:ProfitLoss",
            "us-gaap:NetIncomeLossAttributableToNonredeemableNoncontrollingInterest",
            "us-gaap:NetIncomeLossAttributableToNoncontrollingInterest"
        ]),
        # Per Share Data (900-999)
        (900, "per_share", [
            "us-gaap:EarningsPerShareAbstract",
            "us-gaap:EarningsPerShareBasic",
            "us-gaap:EarningsPerShareDiluted",
            "us-gaap:WeightedAverageNumberOfSharesOutstandingAbstract",
            "us-gaap:WeightedAverageNumberOfSharesOutstandingBasic",
            "us-gaap:WeightedAverageNumberOfDilutedSharesOutstanding"
        ])
    ]
    # NOTE(review): entries here are display labels, not XBRL concepts, so only
    # the fuzzy label-matching path applies to balance sheets.
    BALANCE_SHEET_TEMPLATE = [
        # Current Assets (0-199)
        (0, "current_assets", [
            "Cash and Cash Equivalents",
            "Cash",
            "Short-term Investments",
            "Marketable Securities",
            "Accounts Receivable",
            "Trade Receivables",
            "Inventory",
            "Prepaid Expenses",
            "Other Current Assets",
            "Total Current Assets"
        ]),
        # Non-Current Assets (200-399)
        (200, "noncurrent_assets", [
            "Property, Plant and Equipment",
            "Property and Equipment",
            "Long-term Investments",
            "Goodwill",
            "Intangible Assets",
            "Other Non-current Assets",
            "Total Non-current Assets",
            "Total Assets"
        ]),
        # Current Liabilities (400-599)
        (400, "current_liabilities", [
            "Accounts Payable",
            "Trade Payables",
            "Accrued Liabilities",
            "Accrued Expenses",
            "Short-term Debt",
            "Current Portion of Long-term Debt",
            "Other Current Liabilities",
            "Total Current Liabilities"
        ]),
        # Non-Current Liabilities (600-799)
        (600, "noncurrent_liabilities", [
            "Long-term Debt",
            "Deferred Revenue",
            "Deferred Tax Liabilities",
            "Other Non-current Liabilities",
            "Total Non-current Liabilities",
            "Total Liabilities"
        ]),
        # Equity (800-999)
        (800, "equity", [
            "Common Stock",
            "Additional Paid-in Capital",
            "Retained Earnings",
            "Accumulated Other Comprehensive Income",
            "Treasury Stock",
            "Total Stockholders' Equity",
            "Total Shareholders' Equity",
            "Total Equity"
        ])
    ]
    def get_template_position(self, item_concept: str, item_label: str, statement_type: str) -> Optional[float]:
        """
        Get template position for an item, prioritizing concept-based matching over label matching.
        Args:
            item_concept: The XBRL concept (e.g., "us-gaap:Revenue")
            item_label: The display label (e.g., "Contract Revenue")
            statement_type: Type of statement ("IncomeStatement", "BalanceSheet", etc.)
        Returns:
            Float position in template, or None if no match found
        """
        # Handle different statement type formats
        if statement_type == "IncomeStatement":
            template_name = "INCOME_STATEMENT_TEMPLATE"
        elif statement_type == "BalanceSheet":
            template_name = "BALANCE_SHEET_TEMPLATE"
        else:
            # e.g. "CashFlowStatement" -> "CASHFLOWSTATEMENT_TEMPLATE"; no such
            # attribute is defined in this class, so those types return None.
            template_name = f"{statement_type.upper()}_TEMPLATE"
        template = getattr(self, template_name, None)
        if not template:
            return None
        # Strategy 1: Direct concept matching (highest priority)
        if item_concept:
            normalized_concept = self._normalize_xbrl_concept(item_concept)
            for base_pos, _section_name, template_concepts in template:
                for i, template_concept in enumerate(template_concepts):
                    template_normalized = self._normalize_xbrl_concept(template_concept)
                    if normalized_concept == template_normalized:
                        return float(base_pos + i)
        # Strategy 2: Label-based matching as fallback (for compatibility)
        if item_label:
            for base_pos, _section_name, template_concepts in template:
                for i, template_concept in enumerate(template_concepts):
                    if self._labels_match(item_label, template_concept):
                        return float(base_pos + i)
        return None
    def _normalize_xbrl_concept(self, concept: str) -> str:
        """
        Normalize XBRL concept for matching.
        Handles variations in concept format:
        - "us-gaap:Revenue" vs "us-gaap_Revenue"
        - Case sensitivity
        - Namespace prefixes
        """
        if not concept:
            return ""
        # Normalize separators (: vs _)
        normalized = concept.lower()
        normalized = normalized.replace(':', '_')
        # Handle common namespace variations
        # us-gaap, usgaap, gaap all should match
        # NOTE(review): the 'us-gaap_' half of this test rebuilds the same
        # string (a no-op); only 'usgaap_' and 'gaap_' are actually rewritten.
        if normalized.startswith('us-gaap_') or normalized.startswith('usgaap_'):
            normalized = 'us-gaap_' + normalized.split('_', 1)[1]
        elif normalized.startswith('gaap_'):
            normalized = 'us-gaap_' + normalized.split('_', 1)[1]
        return normalized
    def _labels_match(self, label1: str, label2: str) -> bool:
        """Check if two labels represent the same financial item (fallback for non-concept matching)"""
        if not label1 or not label2:
            return False
        # For XBRL concepts in templates, don't try to match against labels
        if ':' in label2 or '_gaap_' in label2.lower():
            return False
        # Use existing normalization logic for label matching
        norm1 = self._normalize_concept(label1)
        norm2 = self._normalize_concept(label2)
        # Exact match
        if norm1 == norm2:
            return True
        # Fuzzy matching for similar concepts
        # 0.7 threshold: tolerate wording variations without conflating
        # genuinely different line items.
        similarity = fuzz.ratio(norm1, norm2) / 100.0
        return similarity > 0.7
    def _concepts_match(self, concept1: str, concept2: str) -> bool:
        """Check if two concepts represent the same financial item"""
        # NOTE(review): not referenced anywhere in this module's visible code —
        # possibly dead; confirm before removing.
        # Normalize for comparison
        norm1 = self._normalize_concept(concept1)
        norm2 = self._normalize_concept(concept2)
        # Exact match
        if norm1 == norm2:
            return True
        # Fuzzy matching for similar concepts
        similarity = fuzz.ratio(norm1, norm2) / 100.0
        return similarity > 0.7  # Lowered threshold for better matching
    def _normalize_concept(self, concept: str) -> str:
        """Normalize concept for comparison"""
        if not concept:
            return ""
        # Remove common variations
        normalized = concept.lower()
        normalized = re.sub(r'\s+', ' ', normalized)  # Normalize whitespace
        normalized = re.sub(r'[,\.]', '', normalized)  # Remove punctuation
        normalized = re.sub(r'\(.*?\)', '', normalized)  # Remove parenthetical
        normalized = re.sub(r'\bexpense\b', '', normalized)  # Remove 'expense' suffix
        normalized = re.sub(r'\bincome\b', '', normalized)  # Remove 'income' suffix for matching
        return normalized.strip()
class ReferenceOrderingStrategy:
    """Derive a concept/label -> position map from a reference statement."""
    def establish_reference_order(self, statements: List[Dict]) -> Dict[str, float]:
        """Build the reference ordering from the best available statement.

        Statements are expected newest-first, so the first one serves as the
        reference. Each data row's concept (and, when present, its label) is
        mapped to the row's index as a float position.
        """
        if not statements:
            return {}
        ordering: Dict[str, float] = {}
        for position, row in enumerate(statements[0].get('data', [])):
            concept_id = row.get('concept')
            if not concept_id:
                continue
            # Record both keys so later lookups work by concept or by label.
            ordering[concept_id] = float(position)
            display_label = row.get('label')
            if display_label:
                ordering[display_label] = float(position)
        return ordering
class SemanticPositioning:
    """Position concepts based on financial statement semantics.

    Used as the last-resort strategy for concepts that matched neither the
    canonical templates nor the reference statement: classifies a concept into
    a statement section by keyword heuristics, then slots it into that section.
    """
    def __init__(self, statement_type: str):
        # statement_type: "IncomeStatement", "BalanceSheet", etc.
        self.statement_type = statement_type
        self.section_defaults = self._get_section_defaults()
    def _get_section_defaults(self) -> Dict[str, float]:
        """Default positions for each section when no other guidance available"""
        # Midpoints of the 100-wide section bands used by the templates.
        if self.statement_type == "IncomeStatement":
            return {
                "revenue": 50.0,
                "cost": 150.0,
                "gross_profit": 250.0,
                "expense": 350.0,
                "operating_income": 450.0,
                "non_operating": 550.0,
                "pretax_income": 650.0,
                "tax": 750.0,
                "net_income": 850.0,
                "per_share": 950.0
            }
        elif self.statement_type == "BalanceSheet":
            return {
                "current_assets": 100.0,
                "noncurrent_assets": 300.0,
                "current_liabilities": 500.0,
                "noncurrent_liabilities": 700.0,
                "equity": 900.0
            }
        return {}
    def infer_position(self, concept: str, existing_order: Dict[str, float]) -> float:
        """Infer semantic position for a new concept.

        Tries, in order: section classification, parent-concept adjacency,
        fuzzy similarity to an already-positioned concept; falls back to the
        end of the statement (999.0).
        """
        # Rule-based positioning
        section = self._classify_concept_section(concept)
        if section:
            return self._position_in_section(concept, section, existing_order)
        # Parent-child relationship positioning
        parent = self._find_parent_concept(concept, existing_order)
        if parent:
            return existing_order[parent] + 0.1  # Just after parent
        # Similarity-based positioning
        similar_concept = self._find_most_similar_concept(concept, existing_order)
        if similar_concept:
            return existing_order[similar_concept] + 0.1
        # Default to end
        return 999.0
    def _classify_concept_section(self, concept: str) -> Optional[str]:
        """Classify concept into financial statement section"""
        # Keyword heuristics over the lowercased name; branch order matters —
        # the first matching rule wins, so more specific checks come first.
        if not concept:
            return None
        concept_lower = concept.lower()
        if self.statement_type == "IncomeStatement":
            # Revenue indicators
            if any(term in concept_lower for term in ['revenue', 'sales']) and not any(term in concept_lower for term in ['cost', 'expense']):
                return "revenue"
            # Cost indicators
            elif any(term in concept_lower for term in ['cost of', 'cogs']):
                return "cost"
            # Gross profit
            elif 'gross profit' in concept_lower or 'gross margin' in concept_lower:
                return "gross_profit"
            # Operating expenses
            elif any(term in concept_lower for term in ['r&d', 'research', 'selling', 'administrative', 'marketing']) or ('expense' in concept_lower and 'tax' not in concept_lower):
                return "expense"
            # Operating income
            elif 'operating income' in concept_lower or 'operating profit' in concept_lower:
                return "operating_income"
            # Non-operating
            elif any(term in concept_lower for term in ['interest', 'other income', 'nonoperating']):
                return "non_operating"
            # Pre-tax income
            elif 'before tax' in concept_lower or 'pretax' in concept_lower:
                return "pretax_income"
            # Tax
            elif 'tax' in concept_lower and 'expense' in concept_lower:
                return "tax"
            # Net income
            elif 'net income' in concept_lower or 'net earnings' in concept_lower:
                return "net_income"
            # Per share
            elif any(term in concept_lower for term in ['per share', 'earnings per', 'shares outstanding']):
                return "per_share"
        elif self.statement_type == "BalanceSheet":
            if any(term in concept_lower for term in ['cash', 'receivable', 'inventory', 'prepaid']) or ('current' in concept_lower and 'asset' in concept_lower):
                return "current_assets"
            elif any(term in concept_lower for term in ['property', 'equipment', 'goodwill', 'intangible']) or ('asset' in concept_lower and 'current' not in concept_lower):
                return "noncurrent_assets"
            elif any(term in concept_lower for term in ['payable', 'accrued']) or ('current' in concept_lower and 'liabilit' in concept_lower):
                return "current_liabilities"
            elif 'debt' in concept_lower or ('liabilit' in concept_lower and 'current' not in concept_lower):
                return "noncurrent_liabilities"
            elif any(term in concept_lower for term in ['equity', 'stock', 'retained earnings', 'capital']):
                return "equity"
        return None
    def _position_in_section(self, concept: str, section: str, existing_order: Dict[str, float]) -> float:
        """Position concept within its identified section"""
        # NOTE(review): the `concept` parameter is unused here — placement
        # depends only on the section's existing members.
        section_concepts = [
            (label, pos) for label, pos in existing_order.items()
            if self._classify_concept_section(label) == section
        ]
        if not section_concepts:
            # Section doesn't exist yet - use template defaults
            return self.section_defaults.get(section, 999.0)
        # Find best position within section
        section_concepts.sort(key=lambda x: x[1])  # Sort by position
        # Simple strategy: place at end of section
        last_pos = section_concepts[-1][1]
        return last_pos + 0.1
    def _find_parent_concept(self, concept: str, existing_order: Dict[str, float]) -> Optional[str]:
        """Find parent concept in hierarchy"""
        if not concept:
            return None
        # Look for hierarchical relationships
        # e.g., "Software Revenue" -> "Revenue"
        concept_words = set(concept.lower().split())
        candidates = []
        for existing_concept in existing_order.keys():
            if not existing_concept:
                continue
            existing_words = set(existing_concept.lower().split())
            # Check if existing concept is a parent (subset of words)
            # Also check for common patterns like "expense" being a parent of "X expense"
            if (existing_words.issubset(concept_words) and len(existing_words) < len(concept_words)) or \
               (existing_concept.lower() in concept.lower() and existing_concept.lower() != concept.lower()):
                candidates.append((existing_concept, len(existing_words)))
        if candidates:
            # Return the most specific parent (most words in common)
            return max(candidates, key=lambda x: x[1])[0]
        return None
    def _find_most_similar_concept(self, concept: str, existing_order: Dict[str, float]) -> Optional[str]:
        """Find most similar existing concept"""
        if not concept:
            return None
        best_match = None
        best_similarity = 0.0
        for existing_concept in existing_order.keys():
            if not existing_concept:
                continue
            similarity = fuzz.ratio(concept.lower(), existing_concept.lower()) / 100.0
            if similarity > best_similarity and similarity > 0.5:  # Minimum threshold
                best_similarity = similarity
                best_match = existing_concept
        return best_match
class StatementOrderingManager:
    """Manages consistent ordering across multi-period statements.

    Combines four strategies, in decreasing priority: canonical templates,
    the reference (most recent) statement's own ordering, semantic keyword
    positioning, and a final section-aware consolidation pass that keeps
    template sections (e.g. per-share data) grouped together.
    """
    def __init__(self, statement_type: str):
        self.statement_type = statement_type
        self.templates = FinancialStatementTemplates()
        self.reference_strategy = ReferenceOrderingStrategy()
        self.semantic_positioning = SemanticPositioning(statement_type)
    def determine_ordering(self, statements: List[Dict]) -> Dict[str, float]:
        """
        Determine unified ordering for all concepts across statements.
        Returns:
            Dict mapping concept -> sort_key (float for interpolation)
        """
        if not statements:
            return {}
        all_concepts = self._extract_all_concepts(statements)
        # Strategy 1: Template-based ordering (highest priority)
        template_positioned = self._apply_template_ordering(all_concepts, statements)
        # Strategy 2: Reference statement ordering for non-template items
        reference_positioned = self._apply_reference_ordering(
            all_concepts, statements, template_positioned
        )
        # Strategy 3: Semantic positioning for orphan concepts
        semantic_positioned = self._apply_semantic_positioning(
            all_concepts, template_positioned, reference_positioned
        )
        # Strategy 4: Section-aware consolidation to maintain template groupings
        final_ordering = self._consolidate_section_ordering(
            semantic_positioned, template_positioned, statements
        )
        return final_ordering
    def _extract_all_concepts(self, statements: List[Dict]) -> set:
        """Extract all unique concepts from statements"""
        # Both concepts and labels are collected: downstream stitching may key
        # rows by either one.
        all_concepts = set()
        for statement in statements:
            for item in statement.get('data', []):
                concept = item.get('concept')
                label = item.get('label')
                if concept:
                    all_concepts.add(concept)
                if label:
                    all_concepts.add(label)
        return all_concepts
    def _apply_template_ordering(self, concepts: set, statements: List[Dict]) -> Dict[str, float]:
        """Apply template-based ordering for known concepts using concept-first matching"""
        template_order = {}
        # Build a mapping of concepts/labels to their actual XBRL concepts for better matching
        concept_to_xbrl = {}
        label_to_xbrl = {}
        for statement in statements:
            for item in statement.get('data', []):
                concept = item.get('concept')
                label = item.get('label')
                if concept and label:
                    concept_to_xbrl[concept] = concept
                    label_to_xbrl[label] = concept
                elif concept:
                    concept_to_xbrl[concept] = concept
        # Apply template ordering with concept priority
        for concept_or_label in concepts:
            # Determine if this is a concept or label
            is_concept = concept_or_label in concept_to_xbrl
            is_label = concept_or_label in label_to_xbrl
            # Get the actual XBRL concept and label for this item
            if is_concept:
                xbrl_concept = concept_or_label
                # Try to find the corresponding label
                corresponding_label = None
                for stmt in statements:
                    for item in stmt.get('data', []):
                        if item.get('concept') == concept_or_label:
                            corresponding_label = item.get('label')
                            break
                    if corresponding_label:
                        break
            elif is_label:
                xbrl_concept = label_to_xbrl.get(concept_or_label)
                corresponding_label = concept_or_label
            else:
                # Neither concept nor label found in mappings
                xbrl_concept = None
                corresponding_label = concept_or_label
            # Try concept-based matching first, then label-based
            template_pos = self.templates.get_template_position(
                item_concept=xbrl_concept,
                item_label=corresponding_label,
                statement_type=self.statement_type
            )
            if template_pos is not None:
                template_order[concept_or_label] = template_pos
                # IMPORTANT: If we found a template position for a concept,
                # also apply it to the corresponding label (and vice versa)
                # This ensures consistent ordering regardless of whether the
                # stitcher uses concept or label as the key
                if is_concept and corresponding_label and corresponding_label in concepts:
                    template_order[corresponding_label] = template_pos
                elif is_label and xbrl_concept and xbrl_concept in concepts:
                    template_order[xbrl_concept] = template_pos
        return template_order
    def _apply_reference_ordering(self, concepts: set, statements: List[Dict],
                                 template_positioned: Dict[str, float]) -> Dict[str, float]:
        """Apply reference statement ordering for remaining concepts"""
        # Template positions win: only fill in concepts the templates missed.
        reference_order = self.reference_strategy.establish_reference_order(statements)
        combined_order = template_positioned.copy()
        for concept in concepts:
            if concept not in combined_order and concept in reference_order:
                combined_order[concept] = reference_order[concept]
        return combined_order
    def _apply_semantic_positioning(self, concepts: set, template_positioned: Dict[str, float],
                                   reference_positioned: Dict[str, float]) -> Dict[str, float]:
        """Apply semantic positioning for orphan concepts"""
        final_order = reference_positioned.copy()
        # Position remaining concepts using semantic rules
        for concept in concepts:
            if concept not in final_order:
                semantic_pos = self.semantic_positioning.infer_position(concept, final_order)
                final_order[concept] = semantic_pos
        return final_order
    def _consolidate_section_ordering(self, semantic_positioned: Dict[str, float],
                                     template_positioned: Dict[str, float],
                                     statements: List[Dict]) -> Dict[str, float]:
        """
        Consolidate ordering to maintain template section groupings.
        This prevents reference ordering from breaking up logical template sections
        like per-share data (EPS + Shares Outstanding).
        """
        # Identify template sections and their concepts
        template_sections = self._identify_template_sections(template_positioned)
        # Separate template-positioned from non-template items
        template_items = {}
        non_template_items = {}
        for concept, position in semantic_positioned.items():
            if concept in template_positioned:
                template_items[concept] = position
            else:
                non_template_items[concept] = position
        # Re-organize to ensure section integrity
        final_ordering = {}
        # Process template sections in order
        for section_name, section_concepts in template_sections.items():
            # Find all template items (concepts and labels) that belong to this section
            section_template_items = []
            for concept in section_concepts:
                if concept in template_items:
                    section_template_items.append(concept)
            # Also find labels that correspond to concepts in this section
            # by checking if any template_items have the same template position
            section_template_positions = set()
            for concept in section_concepts:
                if concept in template_positioned:
                    section_template_positions.add(template_positioned[concept])
            # Find labels that have the same template positions as section concepts
            for item, pos in template_items.items():
                if pos in section_template_positions and item not in section_template_items:
                    section_template_items.append(item)
            if section_template_items:
                # Use the template base position for this section to ensure strong grouping
                section_base_pos = self._get_section_base_position(section_name)
                # For critical sections like per_share, use an even stronger override
                if section_name == "per_share":
                    # Force per-share items to be at the very end, regardless of hierarchy
                    section_base_pos = 950.0
                # Ensure all items in this section stay grouped together
                # Items keep their intra-section relative order but are packed
                # at 0.1 spacing from the section base.
                for i, item in enumerate(sorted(section_template_items,
                                               key=lambda x: template_items.get(x, 999.0))):
                    final_ordering[item] = section_base_pos + i * 0.1
        # Add non-template items, adjusting positions to avoid breaking template sections
        section_ranges = self._get_section_ranges(final_ordering, template_sections)
        for concept, position in non_template_items.items():
            # Find appropriate insertion point that doesn't break template sections
            adjusted_position = self._find_insertion_point(position, section_ranges)
            final_ordering[concept] = adjusted_position
        return final_ordering
    def _get_section_base_position(self, section_name: str) -> float:
        """Get the base position for a template section"""
        if self.statement_type == "IncomeStatement":
            template = self.templates.INCOME_STATEMENT_TEMPLATE
        elif self.statement_type == "BalanceSheet":
            template = self.templates.BALANCE_SHEET_TEMPLATE
        else:
            return 999.0
        for base_pos, name, _concepts in template:
            if name == section_name:
                return float(base_pos)
        return 999.0
    def _identify_template_sections(self, template_positioned: Dict[str, float]) -> Dict[str, List[str]]:
        """Identify which concepts belong to which template sections"""
        sections = {}
        # Get the template for this statement type
        if self.statement_type == "IncomeStatement":
            template = self.templates.INCOME_STATEMENT_TEMPLATE
        elif self.statement_type == "BalanceSheet":
            template = self.templates.BALANCE_SHEET_TEMPLATE
        else:
            return {}
        # Build mapping of concepts to sections
        for _base_pos, section_name, template_concepts in template:
            section_concepts = []
            for concept in template_positioned.keys():
                # Check if this concept matches any template concept in this section
                for template_concept in template_concepts:
                    if self._concept_matches_template(concept, template_concept):
                        section_concepts.append(concept)
                        break
            if section_concepts:
                sections[section_name] = section_concepts
        return sections
    def _concept_matches_template(self, concept: str, template_concept: str) -> bool:
        """Check if a concept matches a template concept"""
        # For XBRL concepts, do direct comparison
        if ':' in template_concept or '_gaap_' in template_concept.lower():
            return self._normalize_xbrl_concept(concept) == self._normalize_xbrl_concept(template_concept)
        # For labels, use fuzzy matching
        return self._labels_match(concept, template_concept)
    def _get_section_ranges(self, final_ordering: Dict[str, float],
                           template_sections: Dict[str, List[str]]) -> List[Tuple[float, float, str]]:
        """Get the position ranges occupied by each template section"""
        ranges = []
        for section_name, concepts in template_sections.items():
            section_positions = [final_ordering[c] for c in concepts if c in final_ordering]
            if section_positions:
                min_pos = min(section_positions)
                max_pos = max(section_positions)
                ranges.append((min_pos, max_pos, section_name))
        return sorted(ranges)
    def _find_insertion_point(self, desired_position: float,
                             section_ranges: List[Tuple[float, float, str]]) -> float:
        """Find appropriate insertion point that doesn't break template sections"""
        # Check if desired position conflicts with any template section
        for min_pos, max_pos, section_name in section_ranges:
            if min_pos <= desired_position <= max_pos:
                # Position conflicts with a template section
                # Place it just before the section (unless it should logically be after)
                # Special handling for per-share section
                # NOTE(review): the branch below is unreachable — the guard
                # above guarantees min_pos <= desired_position, so
                # `desired_position < min_pos` can never hold and conflicting
                # items are always placed after the section. Confirm intent.
                if section_name == "per_share" and desired_position < min_pos:
                    # Items that should come before per-share data
                    return min_pos - 1.0
                else:
                    # Place after the section
                    return max_pos + 1.0
        # No conflicts, use desired position
        return desired_position
    def _normalize_xbrl_concept(self, concept: str) -> str:
        """Delegate to templates class for concept normalization"""
        return self.templates._normalize_xbrl_concept(concept)
    def _labels_match(self, label1: str, label2: str) -> bool:
        """Delegate to templates class for label matching"""
        return self.templates._labels_match(label1, label2)

View File

@@ -0,0 +1,547 @@
"""
XBRL Statement Stitching - Period Optimization (Refactored)
This module provides functionality to determine optimal periods for stitching
statements across multiple XBRL filings, handling period selection and
fiscal period matching.
Refactored to use a clean class-based architecture for better maintainability,
testability, and extensibility.
"""
import logging
from dataclasses import dataclass
from datetime import date
from typing import Any, Dict, List, Optional, Tuple
from edgar.xbrl.core import format_date, parse_date
from edgar.xbrl.xbrl import XBRL
logger = logging.getLogger(__name__)
@dataclass
class PeriodSelectionConfig:
    """Configuration for period selection behavior"""
    # Duration ranges for different period types
    # Each range is an inclusive (min_days, max_days) window used to decide
    # whether a reporting period counts as annual, quarterly, or YTD.
    annual_duration_range: Tuple[int, int] = (350, 380)
    quarterly_duration_range: Tuple[int, int] = (80, 100)
    q2_ytd_range: Tuple[int, int] = (175, 190)  # first-half year-to-date
    q3_ytd_range: Tuple[int, int] = (260, 285)  # nine-month year-to-date
    q4_annual_range: Tuple[int, int] = (350, 380)  # Q4 YTD equals the full year
    # Target durations for optimization
    # Ideal day counts; candidate periods are ranked by proximity to these.
    target_annual_days: int = 365
    target_quarterly_days: int = 90
    target_q2_ytd_days: int = 180
    target_q3_ytd_days: int = 270
    # Behavior flags
    require_exact_matches: bool = True
    allow_fallback_when_no_doc_date: bool = True
    max_periods_default: int = 8
class PeriodMatcher:
    """Exact-match helpers for locating reporting periods by date."""
    def __init__(self, config: PeriodSelectionConfig):
        self.config = config
    def find_exact_instant_match(self, periods: List[Dict], target_date: date) -> Optional[Dict]:
        """Return the first instant period whose 'date' equals target_date, or None."""
        for candidate in periods:
            try:
                parsed = parse_date(candidate['date'])
            except (ValueError, TypeError) as err:
                # Unparseable date: log and keep scanning.
                logger.warning("Failed to parse period date '%s': %s", candidate.get('date'), err)
                continue
            if parsed == target_date:
                return candidate
        return None
    def find_exact_duration_match(self, periods: List[Dict], target_date: date) -> Optional[Dict]:
        """Return the first duration period whose 'end_date' equals target_date, or None."""
        for candidate in periods:
            try:
                parsed_end = parse_date(candidate['end_date'])
            except (ValueError, TypeError) as err:
                logger.warning("Failed to parse period end date '%s': %s", candidate.get('end_date'), err)
                continue
            if parsed_end == target_date:
                return candidate
        return None
    def filter_by_duration_range(self, periods: List[Dict], min_days: int, max_days: int, target_days: int) -> List[Dict]:
        """Keep periods whose duration is within [min_days, max_days], sorted by
        proximity to target_days.

        A period missing 'duration_days' has it computed from its start/end
        dates; the computed value is stored on a copy so callers' dicts are
        never mutated. Periods whose dates cannot be parsed are dropped.
        """
        selected = []
        for entry in periods:
            days = entry.get('duration_days')
            if days is None:
                try:
                    begin = parse_date(entry['start_date'])
                    finish = parse_date(entry['end_date'])
                except (ValueError, TypeError) as err:
                    logger.warning("Failed to calculate duration for period: %s", err)
                    continue
                days = (finish - begin).days
                entry = {**entry, 'duration_days': days}
            if min_days <= days <= max_days:
                selected.append(entry)
        selected.sort(key=lambda item: abs(item['duration_days'] - target_days))
        return selected
class FiscalPeriodClassifier:
"""Classifies and filters periods based on fiscal information"""
def __init__(self, config: PeriodSelectionConfig):
self.config = config
def classify_annual_periods(self, periods: List[Dict]) -> List[Dict]:
"""Identify annual periods (350-380 days)"""
min_days, max_days = self.config.annual_duration_range
target_days = self.config.target_annual_days
annual_periods = []
for period in periods:
duration_days = period.get('duration_days', 0)
if min_days <= duration_days <= max_days:
annual_periods.append(period)
# Sort by proximity to target annual duration
annual_periods.sort(key=lambda x: abs(x.get('duration_days', 0) - target_days))
return annual_periods
def classify_quarterly_periods(self, periods: List[Dict]) -> List[Dict]:
"""Identify quarterly periods (80-100 days)"""
min_days, max_days = self.config.quarterly_duration_range
target_days = self.config.target_quarterly_days
quarterly_periods = []
for period in periods:
duration_days = period.get('duration_days', 0)
if min_days <= duration_days <= max_days:
quarterly_periods.append(period)
# Sort by proximity to target quarterly duration
quarterly_periods.sort(key=lambda x: abs(x.get('duration_days', 0) - target_days))
return quarterly_periods
def classify_ytd_periods(self, periods: List[Dict], fiscal_period: str) -> List[Dict]:
"""Identify YTD periods based on fiscal quarter"""
if fiscal_period not in ['Q2', 'Q3', 'Q4']:
return []
# Get expected duration range for this fiscal period
duration_ranges = {
'Q2': self.config.q2_ytd_range,
'Q3': self.config.q3_ytd_range,
'Q4': self.config.q4_annual_range
}
target_durations = {
'Q2': self.config.target_q2_ytd_days,
'Q3': self.config.target_q3_ytd_days,
'Q4': self.config.target_annual_days
}
min_days, max_days = duration_ranges[fiscal_period]
target_days = target_durations[fiscal_period]
ytd_periods = []
for period in periods:
duration_days = period.get('duration_days', 0)
if min_days <= duration_days <= max_days:
ytd_periods.append(period)
# Sort by proximity to target duration
ytd_periods.sort(key=lambda x: abs(x.get('duration_days', 0) - target_days))
return ytd_periods
def get_expected_durations(self, fiscal_period: str) -> Dict[str, Tuple[int, int]]:
    """Map *fiscal_period* to the duration ranges expected on its statements.

    FY yields only an 'annual' range; Q1-Q4 yield a 'quarterly' range plus,
    for Q2-Q4, a 'ytd' range. Unknown fiscal periods yield an empty dict.
    """
    cfg = self.config
    if fiscal_period == 'FY':
        return {'annual': cfg.annual_duration_range}
    ytd_ranges = {
        'Q2': cfg.q2_ytd_range,
        'Q3': cfg.q3_ytd_range,
        'Q4': cfg.q4_annual_range,
    }
    if fiscal_period in ('Q1', 'Q2', 'Q3', 'Q4'):
        expected = {'quarterly': cfg.quarterly_duration_range}
        if fiscal_period in ytd_ranges:
            expected['ytd'] = ytd_ranges[fiscal_period]
        return expected
    return {}
class StatementTypeSelector:
    """Handles statement-specific period selection logic"""

    def __init__(self, matcher: PeriodMatcher, classifier: FiscalPeriodClassifier):
        # matcher: exact-date matching; classifier: duration bucketing
        # into annual / quarterly / YTD groups
        self.matcher = matcher
        self.classifier = classifier

    def select_balance_sheet_periods(self, xbrl: XBRL, doc_period_end_date: Optional[date]) -> List[Dict]:
        """Select instant periods for balance sheets.

        Returns at most one period: the exact match for the document period
        end date when known, otherwise the most recent instant period.
        """
        # Filter for instant periods only
        instant_periods = [p for p in xbrl.reporting_periods if p['type'] == 'instant']
        if not instant_periods:
            return []
        # If we have document_period_end_date, find exact match
        if doc_period_end_date:
            exact_match = self.matcher.find_exact_instant_match(instant_periods, doc_period_end_date)
            if exact_match:
                return [exact_match]
            else:
                # No exact match found - don't use fallback to prevent fiscal year boundary issues
                logger.info("No exact instant period match found for %s", doc_period_end_date)
                return []
        # No document_period_end_date available - use most recent period
        instant_periods.sort(key=lambda x: x['date'], reverse=True)
        return [instant_periods[0]]

    def select_income_statement_periods(self, xbrl: XBRL, doc_period_end_date: Optional[date],
                                        fiscal_period: str) -> List[Dict]:
        """Select duration periods for income statements"""
        return self._select_duration_periods(xbrl, doc_period_end_date, fiscal_period)

    def select_cash_flow_periods(self, xbrl: XBRL, doc_period_end_date: Optional[date],
                                 fiscal_period: str) -> List[Dict]:
        """Select duration periods for cash flow statements"""
        return self._select_duration_periods(xbrl, doc_period_end_date, fiscal_period)

    def _select_duration_periods(self, xbrl: XBRL, doc_period_end_date: Optional[date],
                                 fiscal_period: str) -> List[Dict]:
        """Common logic for selecting duration periods.

        Enriches each duration period with 'duration_days', then either
        matches strictly on the document period end date (no fallback) or,
        when that date is unknown, applies heuristic fallback selection.
        """
        # Filter for duration periods only
        duration_periods = [p for p in xbrl.reporting_periods if p['type'] == 'duration']
        if not duration_periods:
            return []
        # Add duration_days to all periods
        enriched_periods = []
        for period in duration_periods:
            try:
                start_date = parse_date(period['start_date'])
                end_date = parse_date(period['end_date'])
                # Work on a copy so the XBRL's own period dicts stay unmodified
                period_copy = period.copy()
                period_copy['duration_days'] = (end_date - start_date).days
                enriched_periods.append(period_copy)
            except (ValueError, TypeError) as e:
                # Skip unparseable periods rather than failing the whole selection
                logger.warning("Failed to parse period dates: %s", e)
                continue
        if not enriched_periods:
            return []
        # If we have document_period_end_date, find periods that end exactly on that date
        if doc_period_end_date:
            matching_periods = []
            for period in enriched_periods:
                try:
                    end_date = parse_date(period['end_date'])
                    if end_date == doc_period_end_date:
                        matching_periods.append(period)
                except (ValueError, TypeError):
                    continue
            if matching_periods:
                return self._select_appropriate_durations(matching_periods, fiscal_period)
            else:
                # No exact match found - don't use fallback
                logger.info("No exact duration period match found for %s", doc_period_end_date)
                return []
        # No document_period_end_date - use fallback logic
        return self._select_fallback_periods(enriched_periods, fiscal_period)

    def _select_appropriate_durations(self, periods: List[Dict], fiscal_period: str) -> List[Dict]:
        """Select appropriate duration periods based on fiscal period.

        Annual reports yield the best annual period; quarterly reports yield
        the best quarterly period plus a YTD period when one applies (Q2-Q4).
        """
        selected_periods = []
        is_annual = fiscal_period == 'FY'
        if is_annual:
            # For annual reports, select annual periods
            annual_periods = self.classifier.classify_annual_periods(periods)
            if annual_periods:
                selected_periods.append(annual_periods[0])
        else:
            # For quarterly reports, select quarterly period
            quarterly_periods = self.classifier.classify_quarterly_periods(periods)
            if quarterly_periods:
                selected_periods.append(quarterly_periods[0])
            # Also select YTD period if appropriate
            ytd_periods = self.classifier.classify_ytd_periods(periods, fiscal_period)
            if ytd_periods:
                selected_periods.append(ytd_periods[0])
        return selected_periods

    def _select_fallback_periods(self, periods: List[Dict], fiscal_period: str) -> List[Dict]:
        """Fallback period selection when no document_period_end_date is available"""
        is_annual = fiscal_period == 'FY'
        if is_annual:
            # For annual reports, prefer periods closest to 365 days
            annual_periods = self.classifier.classify_annual_periods(periods)
            if annual_periods:
                # Sort by end date and take the most recent
                annual_periods.sort(key=lambda x: x['end_date'], reverse=True)
                return [annual_periods[0]]
        else:
            # For quarterly reports, prefer quarterly duration
            quarterly_periods = self.classifier.classify_quarterly_periods(periods)
            selected_periods = []
            if quarterly_periods:
                quarterly_periods.sort(key=lambda x: x['end_date'], reverse=True)
                selected_periods.append(quarterly_periods[0])
            # Add YTD period if available
            ytd_periods = self.classifier.classify_ytd_periods(periods, fiscal_period)
            if ytd_periods:
                ytd_periods.sort(key=lambda x: x['end_date'], reverse=True)
                selected_periods.append(ytd_periods[0])
            return selected_periods
        # Reached only for annual filings with no classified annual period:
        # return the most recent period as a last resort
        periods.sort(key=lambda x: x['end_date'], reverse=True)
        return [periods[0]]
class PeriodMetadataEnricher:
    """Attaches display and provenance metadata to selected periods."""

    def enrich_period_metadata(self, period: Dict, xbrl_index: int, entity_info: Dict,
                               doc_period_end_date: Optional[date], fiscal_period: str,
                               fiscal_year: str) -> Dict[str, Any]:
        """Return a new dict combining the period's identity, the source
        filing context, and parsed/display dates.

        Instant periods gain 'date'; duration periods gain 'start_date',
        'end_date' and 'duration_days'. Both gain a formatted 'display_date'.
        """
        enriched = {
            'xbrl_index': xbrl_index,
            'period_key': period['key'],
            'period_label': period['label'],
            'period_type': period['type'],
            'entity_info': entity_info,
            'doc_period_end_date': doc_period_end_date,
            'fiscal_period': fiscal_period,
            'fiscal_year': fiscal_year
        }
        if period['type'] == 'instant':
            # Instant periods carry a single point-in-time date
            as_of = parse_date(period['date'])
            enriched['date'] = as_of
            enriched['display_date'] = format_date(as_of)
        else:
            # Duration periods carry a start/end pair; reuse an already
            # computed duration_days when the caller supplied one
            begins = parse_date(period['start_date'])
            ends = parse_date(period['end_date'])
            enriched['start_date'] = begins
            enriched['end_date'] = ends
            enriched['duration_days'] = period.get('duration_days', (ends - begins).days)
            enriched['display_date'] = format_date(ends)
        return enriched
class PeriodDeduplicator:
    """Handles period deduplication, chronological sorting, and limiting."""

    def deduplicate_periods(self, periods: List[Dict], statement_type: str) -> List[Dict]:
        """Remove duplicate periods using exact date matching.

        Two periods are duplicates when they share the same 'period_type' and
        the same key date ('date' for instants, 'end_date' for durations).
        The first occurrence wins, so callers should sort before deduplicating.
        Runs in O(n) via a seen-set (dates are date objects or strings, both
        hashable) instead of the previous O(n^2) pairwise scan.

        Args:
            periods: Enriched period metadata dicts.
            statement_type: Unused; kept for interface compatibility.

        Returns:
            Periods in original order with exact-date duplicates removed.
        """
        seen = set()
        unique = []
        for period in periods:
            key_field = 'date' if period['period_type'] == 'instant' else 'end_date'
            key = (period['period_type'], period[key_field])
            if key in seen:
                continue
            seen.add(key)
            unique.append(period)
        return unique

    def sort_periods_chronologically(self, periods: List[Dict], statement_type: str) -> List[Dict]:
        """Sort newest-first by 'date' for balance sheets, else by 'end_date'."""
        date_field = 'date' if statement_type == 'BalanceSheet' else 'end_date'
        return sorted(periods, key=lambda p: p[date_field], reverse=True)

    def limit_periods(self, periods: List[Dict], max_periods: int) -> List[Dict]:
        """Truncate to at most *max_periods* entries (slicing already handles
        lists shorter than the limit, so no length check is needed)."""
        return periods[:max_periods]
class PeriodOptimizer:
    """Main orchestrator for period optimization"""

    def __init__(self, config: Optional[PeriodSelectionConfig] = None):
        # Build the collaborator pipeline around one shared configuration
        self.config = config or PeriodSelectionConfig()
        self.matcher = PeriodMatcher(self.config)
        self.classifier = FiscalPeriodClassifier(self.config)
        self.selector = StatementTypeSelector(self.matcher, self.classifier)
        self.enricher = PeriodMetadataEnricher()
        self.deduplicator = PeriodDeduplicator()

    def determine_optimal_periods(self, xbrl_list: List[XBRL], statement_type: str,
                                  max_periods: Optional[int] = None) -> List[Dict[str, Any]]:
        """Main entry point - orchestrates the entire process.

        Pipeline: extract candidate periods from each filing, enrich them
        with display/provenance metadata, then sort, deduplicate and cap the
        result at max_periods (falling back to the configured default).
        """
        max_periods = max_periods or self.config.max_periods_default
        # Step 1: Extract periods from all XBRLs
        all_periods = self._extract_all_periods(xbrl_list, statement_type)
        # Step 2: Enrich with metadata
        enriched_periods = self._enrich_with_metadata(all_periods)
        # Step 3: Deduplicate, sort, and limit
        final_periods = self._deduplicate_and_limit(enriched_periods, max_periods, statement_type)
        return final_periods

    def _extract_all_periods(self, xbrl_list: List[XBRL], statement_type: str) -> List[Dict[str, Any]]:
        """Extract periods from all XBRL objects"""
        all_periods = []
        for i, xbrl in enumerate(xbrl_list):
            # Skip None XBRLs (pre-XBRL era filings before 2009)
            if xbrl is None:
                continue
            # Skip XBRLs with no reporting periods
            if not xbrl.reporting_periods:
                continue
            entity_info = xbrl.entity_info or {}
            doc_period_end_date = self._parse_document_period_end_date(entity_info)
            # NOTE: these may be None when absent from entity_info
            fiscal_period = entity_info.get('fiscal_period')
            fiscal_year = entity_info.get('fiscal_year')
            # Select appropriate periods based on statement type
            selected_periods = self._select_periods_for_statement_type(
                xbrl, statement_type, doc_period_end_date, fiscal_period
            )
            # Add context information to each period
            for period in selected_periods:
                period_with_context = {
                    'period': period,
                    'xbrl_index': i,
                    'entity_info': entity_info,
                    'doc_period_end_date': doc_period_end_date,
                    'fiscal_period': fiscal_period,
                    'fiscal_year': fiscal_year
                }
                all_periods.append(period_with_context)
        return all_periods

    def _parse_document_period_end_date(self, entity_info: Dict) -> Optional[date]:
        """Parse document_period_end_date from entity_info.

        Returns None when the key is absent or the value cannot be parsed.
        """
        if 'document_period_end_date' not in entity_info:
            return None
        try:
            doc_period_end_date = entity_info['document_period_end_date']
            # Coerce non-date representations (typically strings) to a date
            if not isinstance(doc_period_end_date, date):
                doc_period_end_date = parse_date(str(doc_period_end_date))
            return doc_period_end_date
        except (ValueError, TypeError) as e:
            logger.warning("Failed to parse document_period_end_date: %s", e)
            return None

    def _select_periods_for_statement_type(self, xbrl: XBRL, statement_type: str,
                                           doc_period_end_date: Optional[date],
                                           fiscal_period: str) -> List[Dict]:
        """Select periods based on statement type"""
        if statement_type == 'BalanceSheet':
            return self.selector.select_balance_sheet_periods(xbrl, doc_period_end_date)
        elif statement_type in ['IncomeStatement', 'CashFlowStatement']:
            if statement_type == 'IncomeStatement':
                return self.selector.select_income_statement_periods(xbrl, doc_period_end_date, fiscal_period)
            else:
                return self.selector.select_cash_flow_periods(xbrl, doc_period_end_date, fiscal_period)
        else:
            # For other statement types, use income statement logic as default
            return self.selector.select_income_statement_periods(xbrl, doc_period_end_date, fiscal_period)

    def _enrich_with_metadata(self, all_periods: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Enrich periods with comprehensive metadata"""
        enriched_periods = []
        for period_context in all_periods:
            period = period_context['period']
            enriched_metadata = self.enricher.enrich_period_metadata(
                period,
                period_context['xbrl_index'],
                period_context['entity_info'],
                period_context['doc_period_end_date'],
                period_context['fiscal_period'],
                period_context['fiscal_year']
            )
            enriched_periods.append(enriched_metadata)
        return enriched_periods

    def _deduplicate_and_limit(self, periods: List[Dict[str, Any]], max_periods: int,
                               statement_type: str) -> List[Dict[str, Any]]:
        """Deduplicate, sort, and limit periods"""
        # Sort periods chronologically (newest first) before deduplicating so
        # the retained periods are the most recent ones
        sorted_periods = self.deduplicator.sort_periods_chronologically(periods, statement_type)
        # Remove duplicates
        deduplicated_periods = self.deduplicator.deduplicate_periods(sorted_periods, statement_type)
        # Limit to maximum number of periods
        final_periods = self.deduplicator.limit_periods(deduplicated_periods, max_periods)
        return final_periods
# Main function that maintains the original API
def determine_optimal_periods(xbrl_list: List[XBRL], statement_type: str, max_periods: int = 8) -> List[Dict[str, Any]]:
    """
    Determine the optimal periods to display for stitched statements.

    Analyzes entity info and reporting periods across multiple XBRL instances
    to pick the most appropriate display periods, keeping period selection
    consistent when statements are stitched together.

    Args:
        xbrl_list: List of XBRL objects ordered chronologically
        statement_type: Type of statement ('BalanceSheet', 'IncomeStatement', etc.)
        max_periods: Maximum number of periods to return (default is 8)

    Returns:
        List of period metadata dictionaries containing information for display
    """
    return PeriodOptimizer().determine_optimal_periods(xbrl_list, statement_type, max_periods)

View File

@@ -0,0 +1,256 @@
"""
XBRL Presentation Tree - Virtual presentation tree for multi-period statements
This module creates a virtual presentation tree that preserves hierarchical
relationships while applying semantic ordering within sibling groups.
"""
from dataclasses import dataclass
from typing import Any, Dict, List, Optional
@dataclass
class PresentationNode:
    """A single node of the virtual presentation tree.

    Dataclass fields describe the concept; the tree links (children/parent)
    are runtime state created in __post_init__ and excluded from equality.
    """
    concept: str
    label: str
    level: int
    metadata: Dict[str, Any]
    semantic_order: float = 999.0
    original_index: int = 999

    def __post_init__(self):
        # Tree links are not dataclass fields - they are mutable wiring state
        self.children: List[PresentationNode] = []
        self.parent: Optional[PresentationNode] = None

    def add_child(self, child: 'PresentationNode'):
        """Attach *child* under this node and record the back-reference."""
        self.children.append(child)
        child.parent = self

    def sort_children(self):
        """Recursively order every sibling group by (semantic_order, original_index)."""
        self.children = sorted(self.children,
                               key=lambda c: (c.semantic_order, c.original_index))
        for node in self.children:
            node.sort_children()

    def flatten_to_list(self) -> List['PresentationNode']:
        """Return this subtree in pre-order (node first, then each child subtree)."""
        ordered: List['PresentationNode'] = []
        stack: List['PresentationNode'] = [self]
        while stack:
            node = stack.pop()
            ordered.append(node)
            # Reverse so the first child is processed first (pre-order)
            stack.extend(reversed(node.children))
        return ordered
class VirtualPresentationTree:
    """Builds and manages the virtual presentation tree for stitched statements.

    The tree preserves the hierarchy implied by concept levels while applying
    semantic ordering within each sibling group, then flattens back to a list.
    """

    def __init__(self, ordering_manager=None):
        self.ordering_manager = ordering_manager
        self.root_nodes: List[PresentationNode] = []
        self.all_nodes: Dict[str, PresentationNode] = {}

    def build_tree(self, concept_metadata: Dict, concept_ordering: Dict,
                   original_statement_order: List[str] = None) -> List[PresentationNode]:
        """
        Build presentation tree from concept metadata and ordering.

        Args:
            concept_metadata: Metadata for each concept including level
            concept_ordering: Semantic ordering positions
            original_statement_order: Original order of concepts for context

        Returns:
            Flattened list of nodes in correct presentation order
        """
        # Reset per-build state so repeated build_tree() calls on the same
        # instance do not accumulate roots from a previous build (previously
        # only all_nodes was reset, so root_nodes grew on every call).
        self.root_nodes = []
        # Step 1: Create nodes for all concepts
        self._create_nodes(concept_metadata, concept_ordering, original_statement_order)
        # Step 2: Build parent-child relationships based on levels and context
        self._build_hierarchy(original_statement_order or [])
        # Step 3: Apply semantic ordering within sibling groups
        self._apply_semantic_ordering()
        # Step 4: Flatten tree to linear list
        return self._flatten_tree()

    def _create_nodes(self, concept_metadata: Dict, concept_ordering: Dict,
                      original_statement_order: List[str] = None):
        """Create a PresentationNode for every concept in the metadata."""
        self.all_nodes = {}
        for i, (concept, metadata) in enumerate(concept_metadata.items()):
            label = metadata.get('latest_label', concept)
            level = metadata.get('level', 0)
            # Ordering may be keyed by concept or by label; default 999.0 (last)
            semantic_order = concept_ordering.get(concept, concept_ordering.get(label, 999.0))
            # Track original index for maintaining some original order context
            original_index = i
            if original_statement_order:
                try:
                    original_index = original_statement_order.index(concept)
                except ValueError:
                    try:
                        original_index = original_statement_order.index(label)
                    except ValueError:
                        original_index = i + 1000  # Place unknown concepts later
            node = PresentationNode(
                concept=concept,
                label=label,
                level=level,
                metadata=metadata,
                semantic_order=semantic_order,
                original_index=original_index
            )
            self.all_nodes[concept] = node

    def _build_hierarchy(self, original_order: List[str]):
        """Build parent-child relationships based on level progression and context."""
        # Arrange nodes in their original statement order so level changes
        # between neighbours can be read as parent/child transitions
        nodes_in_order = []
        if original_order:
            # Map concepts (and labels, as fallback) to their nodes
            concept_to_node = {node.concept: node for node in self.all_nodes.values()}
            label_to_node = {node.label: node for node in self.all_nodes.values()}
            for item in original_order:
                if item in concept_to_node:
                    nodes_in_order.append(concept_to_node[item])
                elif item in label_to_node:
                    nodes_in_order.append(label_to_node[item])
            # Add any remaining nodes not in original order. Membership is
            # checked by identity: PresentationNode is a dataclass, so `in`
            # would use field equality and could silently drop a distinct node
            # that compares equal to an already-placed one (it is also O(n)
            # per check).
            placed = {id(node) for node in nodes_in_order}
            remaining_nodes = [node for node in self.all_nodes.values()
                               if id(node) not in placed]
            remaining_nodes.sort(key=lambda x: x.original_index)
            nodes_in_order.extend(remaining_nodes)
        else:
            # Fall back to sorting by original index
            nodes_in_order = sorted(self.all_nodes.values(),
                                    key=lambda x: x.original_index)
        # Build hierarchy using a parent stack approach
        parent_stack = []  # Stack of open ancestors, shallowest first
        for node in nodes_in_order:
            current_level = node.level
            # Pop parents at the same level or deeper: a parent must sit at a
            # strictly shallower level than its child
            while parent_stack and parent_stack[-1].level >= current_level:
                parent_stack.pop()
            if parent_stack:
                parent = parent_stack[-1]
                # Prevent cross-section hierarchies for critical sections
                # like per-share items
                if self._should_be_hierarchical_child(parent, node):
                    parent.add_child(node)
                else:
                    # Different sections - make this a root node instead
                    self.root_nodes.append(node)
            else:
                # No parent - this is a root node
                self.root_nodes.append(node)
            # This node could be a parent for subsequent nodes
            parent_stack.append(node)

    def _apply_semantic_ordering(self):
        """Apply semantic ordering within sibling groups."""
        # Sort root nodes by semantic order first, then original index
        self.root_nodes.sort(key=lambda x: (x.semantic_order, x.original_index))
        # Sort children within each parent recursively
        for root in self.root_nodes:
            root.sort_children()

    def _flatten_tree(self) -> List[PresentationNode]:
        """Flatten the tree to a linear pre-order list, preserving hierarchy."""
        result = []
        for root in self.root_nodes:
            result.extend(root.flatten_to_list())
        return result

    def _should_be_hierarchical_child(self, parent: PresentationNode, child: PresentationNode) -> bool:
        """
        Determine if child should be hierarchically under parent based on
        semantic ordering. Prevents cross-section hierarchies that would break
        template section groupings.
        """
        parent_order = parent.semantic_order
        child_order = child.semantic_order
        # If both have very specific semantic orders from templates (not the
        # 999.0 defaults), require them to sit in similar ranges (same section)
        if parent_order < 900 and child_order < 900:
            # Allow parent-child within 200 points (roughly same section)
            if abs(parent_order - child_order) > 200:
                return False
        # Special case: Per-share items (900+) should never be children of early items
        if child_order >= 900 and parent_order < 800:
            return False
        # Special case: Non-operating items (500-599) should not be children of operating items
        if 500 <= child_order < 600 and parent_order < 500:
            return False
        # Special case: Revenue items should not be parents of per-share items
        if parent_order < 100 and child_order >= 900:
            return False
        # Check for semantic incompatibility based on labels
        child_label = child.label.lower()
        parent_label = parent.label.lower()
        # Per-share items should not be children of non-per-share items
        if any(term in child_label for term in ['earnings per share', 'shares outstanding']):
            if not any(term in parent_label for term in ['earnings', 'shares', 'per share']):
                return False
        # Interest expense items should not be children of non-interest items
        if 'interest expense' in child_label:
            if 'interest' not in parent_label and 'nonoperating' not in parent_label:
                return False
        # Otherwise, allow hierarchical relationship
        return True

    def debug_tree(self) -> str:
        """Generate a human-readable, indented dump of the tree for debugging."""
        lines = []

        def _add_node_lines(node: PresentationNode, depth: int = 0):
            indent = "  " * depth
            lines.append(f"{indent}├─ {node.label} (level={node.level}, "
                         f"semantic={node.semantic_order:.1f}, orig={node.original_index})")
            for child in node.children:
                _add_node_lines(child, depth + 1)

        lines.append("Virtual Presentation Tree:")
        for root in self.root_nodes:
            _add_node_lines(root)
        return "\n".join(lines)

View File

@@ -0,0 +1,640 @@
"""
XBRL Statement Stitching - Query Functionality
This module provides query functionality for stitched XBRL facts, allowing
users to query standardized, multi-period financial data.
"""
import re
from collections import defaultdict
from typing import TYPE_CHECKING, Any, Dict, List, Optional
import pandas as pd
from rich import box
from rich.console import Group
from rich.markdown import Markdown
from rich.panel import Panel
from rich.table import Table
from rich.text import Text
from edgar.richtools import repr_rich
from edgar.xbrl.facts import FactQuery
if TYPE_CHECKING:
from edgar.xbrl.stitching.xbrls import XBRLS
class StitchedFactsView:
    """
    A view over stitched facts from multiple XBRL filings.

    This class extracts facts from stitched statements rather than raw XBRL facts,
    ensuring that queries operate on standardized, post-processed data.
    """

    def __init__(self, xbrls: 'XBRLS'):
        self.xbrls = xbrls
        # Cache of the last get_facts() result and the parameters it was built with
        self._facts_cache = None
        self._last_cache_key = None

    def __len__(self):
        return len(self.get_facts())

    @property
    def entity_name(self):
        """Get entity name from the most recent XBRL filing."""
        if self.xbrls.xbrl_list:
            return getattr(self.xbrls.xbrl_list[0], 'entity_name', 'Unknown Entity')
        return 'Unknown Entity'

    @property
    def document_type(self):
        """Get document type from entity info."""
        return self.xbrls.entity_info.get('document_type', 'Multi-Period Stitched')

    def get_facts(self,
                  max_periods: int = 8,
                  standard: bool = True,
                  statement_types: Optional[List[str]] = None) -> List[Dict[str, Any]]:
        """
        Extract facts from stitched statements.

        Args:
            max_periods: Maximum periods to include
            standard: Whether to use standardized labels
            statement_types: List of statement types to include

        Returns:
            List of fact dictionaries with stitched/standardized data
        """
        # Create cache key
        cache_key = (max_periods, standard, tuple(statement_types or []))
        # NOTE(review): an empty cached result is falsy, so [] is recomputed on
        # every call - confirm whether caching empty results is desirable
        if self._facts_cache and self._last_cache_key == cache_key:
            return self._facts_cache
        statement_types = statement_types or [
            'IncomeStatement', 'BalanceSheet', 'CashFlowStatement',
            'StatementOfEquity', 'ComprehensiveIncome'
        ]
        all_facts = []
        for statement_type in statement_types:
            try:
                # Get stitched statement data (this applies standardization)
                stitched_data = self.xbrls.get_statement(
                    statement_type=statement_type,
                    max_periods=max_periods,
                    standard=standard
                )
                # Extract facts from stitched data
                facts = self._extract_facts_from_stitched_data(
                    stitched_data, statement_type
                )
                all_facts.extend(facts)
            except Exception:
                # Skip statements that can't be stitched
                continue
        # Cache results
        self._facts_cache = all_facts
        self._last_cache_key = cache_key
        return all_facts

    def _extract_facts_from_stitched_data(self,
                                          stitched_data: Dict[str, Any],
                                          statement_type: str) -> List[Dict[str, Any]]:
        """
        Convert stitched statement data back to fact-like records for querying.

        Args:
            stitched_data: Output from StatementStitcher
            statement_type: Type of statement

        Returns:
            List of fact dictionaries
        """
        facts = []
        periods = stitched_data.get('periods', [])
        statement_data = stitched_data.get('statement_data', [])
        for item in statement_data:
            # Skip abstract items without values
            if item.get('is_abstract', False) and not item.get('has_values', False):
                continue
            concept = item.get('concept', '')
            label = item.get('label', '')
            original_label = item.get('original_label', label)
            # Create a fact record for each period with data
            for period_id, value in item.get('values', {}).items():
                if value is None:
                    continue
                # Find period metadata
                period_info = self._get_period_info(period_id, periods)
                fact = {
                    # Core identification
                    'concept': concept,
                    'label': label,  # Standardized label
                    'original_label': original_label,  # Original company label
                    'statement_type': statement_type,
                    # Value information
                    'value': value,
                    'numeric_value': self._convert_to_numeric(value),
                    'decimals': item.get('decimals', {}).get(period_id, 0),
                    # Period information
                    'period_key': period_id,
                    'period_type': period_info.get('period_type', 'duration'),
                    'period_start': period_info.get('period_start'),
                    'period_end': period_info.get('period_end'),
                    'period_instant': period_info.get('period_instant'),
                    'period_label': period_info.get('period_label', ''),
                    # Statement context
                    'level': item.get('level', 0),
                    'is_abstract': item.get('is_abstract', False),
                    'is_total': item.get('is_total', False),
                    # Multi-filing context
                    'filing_count': len(self.xbrls.xbrl_list),
                    'standardized': True,  # Mark as coming from standardized data
                    # Source attribution (which XBRL filing this came from)
                    'source_filing_index': self._determine_source_filing(period_id),
                }
                # Add fiscal period info if available
                fiscal_info = self._extract_fiscal_info(period_id)
                fact.update(fiscal_info)
                facts.append(fact)
        return facts

    def _get_period_info(self, period_id: str, periods: List[tuple]) -> Dict[str, Any]:
        """Extract period metadata from period_id and periods list.

        period_id is expected to look like 'instant_<date>' or
        'duration_<start>_<end>'; unmatched ids yield a partial dict.
        """
        period_info = {}
        # Find matching period label
        for pid, label in periods:
            if pid == period_id:
                period_info['period_label'] = label
                break
        # Parse period_id to extract dates and type
        if period_id.startswith('instant_'):
            period_info['period_type'] = 'instant'
            date_str = period_id.replace('instant_', '')
            period_info['period_instant'] = date_str
            period_info['period_end'] = date_str
        elif period_id.startswith('duration_'):
            period_info['period_type'] = 'duration'
            parts = period_id.replace('duration_', '').split('_')
            if len(parts) >= 2:
                period_info['period_start'] = parts[0]
                period_info['period_end'] = parts[1]
        return period_info

    def _convert_to_numeric(self, value: Any) -> Optional[float]:
        """Convert value to numeric if possible; returns None on failure."""
        if value is None:
            return None
        try:
            if isinstance(value, (int, float)):
                return float(value)
            if isinstance(value, str):
                # Remove commas and currency symbols, then try to convert
                cleaned = value.replace(',', '').replace('$', '').strip()
                return float(cleaned)
        except (ValueError, TypeError):
            pass
        return None

    def _determine_source_filing(self, period_id: str) -> Optional[int]:
        """Determine which filing this period came from."""
        # This would require enhanced tracking in the stitching process
        # For now, return None but this could be enhanced
        return None

    def _extract_fiscal_info(self, period_id: str) -> Dict[str, Any]:
        """Extract fiscal period and year information.

        NOTE(review): always reads the first XBRL's entity_info regardless of
        period_id, so fiscal metadata may be wrong for facts sourced from
        older filings - simplified approach, could be enhanced with tracking.
        """
        fiscal_info = {}
        if self.xbrls.xbrl_list:
            entity_info = self.xbrls.xbrl_list[0].entity_info
            if entity_info:
                fiscal_info['fiscal_period'] = entity_info.get('fiscal_period')
                fiscal_info['fiscal_year'] = entity_info.get('fiscal_year')
        return fiscal_info

    def query(self, **kwargs) -> 'StitchedFactQuery':
        """Create a new query for stitched facts."""
        return StitchedFactQuery(self, **kwargs)
class StitchedFactQuery(FactQuery):
"""
Enhanced fact query for stitched/standardized multi-filing data.
Extends the base FactQuery with capabilities specific to multi-period,
standardized financial data.
"""
def __init__(self, stitched_facts_view: StitchedFactsView, **kwargs):
    """Build a query over stitched facts.

    Recognized kwargs: max_periods (int, default 8), standard (bool, default
    True), statement_types (list[str] or None). They are forwarded to
    StitchedFactsView.get_facts() when execute() runs.
    """
    # Initialize with stitched facts view instead of regular facts view
    self._stitched_facts_view = stitched_facts_view
    # Initialize base FactQuery attributes manually since we're not calling super().__init__
    self._facts_view = stitched_facts_view  # For compatibility with base class
    self._filters = []
    self._transformations = []
    self._aggregations = []
    self._include_dimensions = True
    self._include_contexts = True
    self._include_element_info = True
    self._sort_by = None
    self._sort_ascending = True
    self._limit = None
    self._statement_type = None
    # Multi-filing specific options
    self._cross_period_only = False
    self._trend_analysis = False
    self._require_all_periods = False
    # Store query-specific parameters for get_facts
    self._max_periods = kwargs.get('max_periods', 8)
    self._standard = kwargs.get('standard', True)
    self._statement_types = kwargs.get('statement_types', None)
def __str__(self):
    """Short human-readable summary of the query state."""
    return "StitchedFactQuery(filters={})".format(len(self._filters))
# Enhanced filtering methods for multi-filing scenarios
def by_standardized_concept(self, concept_name: str) -> 'StitchedFactQuery':
    """
    Filter by standardized concept name (e.g., 'Revenue', 'Net Income').

    Matches an exact standardized label, or a case-insensitive substring of
    either the label or the underlying concept name.

    Args:
        concept_name: Standardized concept name

    Returns:
        Self for method chaining
    """
    needle = concept_name.lower()

    def _matches(fact):
        label = fact.get('label', '')
        if label == concept_name:
            return True
        return needle in label.lower() or needle in fact.get('concept', '').lower()

    self._filters.append(_matches)
    return self
def by_original_label(self, pattern: str, exact: bool = False) -> 'StitchedFactQuery':
    """
    Filter by the company's original (pre-standardization) labels.

    Args:
        pattern: Exact string or case-insensitive regex pattern to match
        exact: Whether to require an exact label match

    Returns:
        Self for method chaining
    """
    if exact:
        self._filters.append(lambda f: f.get('original_label') == pattern)
        return self
    matcher = re.compile(pattern, re.IGNORECASE)

    def _matches(fact):
        original = fact.get('original_label')
        return bool(original and matcher.search(original))

    self._filters.append(_matches)
    return self
def across_periods(self, min_periods: int = 2) -> 'StitchedFactQuery':
    """
    Restrict results to concepts that appear in multiple periods.

    Args:
        min_periods: Minimum number of periods the concept must appear in

    Returns:
        Self for method chaining
    """
    self._min_periods = min_periods
    self._cross_period_only = True
    return self
def by_fiscal_period(self, fiscal_period: str) -> 'StitchedFactQuery':
    """
    Restrict results to facts tagged with a fiscal period (FY, Q1, Q2, Q3, Q4).

    Args:
        fiscal_period: Fiscal period identifier

    Returns:
        Self for method chaining
    """
    def _matches(fact):
        return fact.get('fiscal_period') == fiscal_period

    self._filters.append(_matches)
    return self
def by_filing_index(self, filing_index: int) -> 'StitchedFactQuery':
    """
    Restrict results to facts originating from one filing.

    Args:
        filing_index: Index of the filing (0 = most recent)

    Returns:
        Self for method chaining
    """
    def _matches(fact):
        return fact.get('source_filing_index') == filing_index

    self._filters.append(_matches)
    return self
def trend_analysis(self, concept: str) -> 'StitchedFactQuery':
    """
    Prepare a cross-period trend query for one concept.

    Args:
        concept: Concept to analyze trends for

    Returns:
        Self for method chaining
    """
    self._trend_analysis = True
    # by_standardized_concept already returns self for chaining
    return self.by_standardized_concept(concept)
def complete_periods_only(self) -> 'StitchedFactQuery':
    """
    Restrict results to concepts with a value in every available period.

    Returns:
        Self for method chaining
    """
    self._require_all_periods = True
    return self
def execute(self) -> List[Dict[str, Any]]:
"""
Execute the query with enhanced multi-period processing.
Returns:
List of fact dictionaries
"""
# Get base results from stitched facts with query parameters
results = self._stitched_facts_view.get_facts(
max_periods=self._max_periods,
standard=self._standard,
statement_types=self._statement_types
)
# Apply standard filters
for filter_func in self._filters:
results = [f for f in results if filter_func(f)]
# Apply transformations
for transform_fn in self._transformations:
for fact in results:
if 'value' in fact and fact['value'] is not None:
fact['value'] = transform_fn(fact['value'])
# Apply aggregations
if self._aggregations:
aggregated_results = {}
for agg in self._aggregations:
dimension = agg['dimension']
func = agg['function']
# Group facts by dimension
groups = {}
for fact in results:
dim_value = fact.get(f'dim_{dimension}')
if dim_value and 'value' in fact and fact['value'] is not None:
if dim_value not in groups:
groups[dim_value] = []
groups[dim_value].append(fact['value'])
# Apply aggregation function
for dim_value, values in groups.items():
agg_value = 0.0 # Initialize with default value
if func == 'sum':
agg_value = sum(values)
elif func == 'average':
agg_value = sum(values) / len(values)
key = (dimension, dim_value)
if key not in aggregated_results:
aggregated_results[key] = {'dimension': dimension, 'value': dim_value, 'values': {}}
aggregated_results[key]['values'][func] = agg_value
results = list(aggregated_results.values())
# Apply cross-period filtering if requested
if self._cross_period_only:
results = self._filter_cross_period_concepts(results)
# Apply complete periods filtering if requested
if self._require_all_periods:
results = self._filter_complete_periods(results)
# Apply trend analysis if requested
if self._trend_analysis:
results = self._prepare_trend_data(results)
# Apply sorting if specified
if results and self._sort_by and self._sort_by in results[0]:
results.sort(key=lambda f: f.get(self._sort_by, ''),
reverse=not self._sort_ascending)
# Apply limit if specified
if self._limit is not None:
results = results[:self._limit]
return results
def _filter_cross_period_concepts(self, results: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
"""Filter to concepts that appear in multiple periods."""
concept_periods = defaultdict(set)
for fact in results:
concept_key = (fact.get('concept', ''), fact.get('label', ''))
concept_periods[concept_key].add(fact.get('period_key', ''))
# Filter to concepts with minimum period count
valid_concepts = {
concept for concept, periods in concept_periods.items()
if len(periods) >= getattr(self, '_min_periods', 2)
}
return [
fact for fact in results
if (fact.get('concept', ''), fact.get('label', '')) in valid_concepts
]
def _filter_complete_periods(self, results: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
"""Filter to concepts that have values in all periods."""
# Get all available periods
all_periods = set(fact.get('period_key', '') for fact in results)
concept_periods = defaultdict(set)
for fact in results:
concept_key = (fact.get('concept', ''), fact.get('label', ''))
concept_periods[concept_key].add(fact.get('period_key', ''))
# Filter to concepts with complete period coverage
complete_concepts = {
concept for concept, periods in concept_periods.items()
if periods == all_periods
}
return [
fact for fact in results
if (fact.get('concept', ''), fact.get('label', '')) in complete_concepts
]
def _prepare_trend_data(self, results: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
"""Prepare data for trend analysis by sorting periods."""
# Sort by period end date for trend analysis
return sorted(results, key=lambda f: f.get('period_end', ''))
def to_trend_dataframe(self) -> pd.DataFrame:
"""
Create a DataFrame optimized for trend analysis.
Returns:
DataFrame with concepts as rows and periods as columns
"""
results = self.execute()
if not results:
return pd.DataFrame()
# Pivot data for trend analysis
df = pd.DataFrame(results)
# Create pivot table with concepts as rows and periods as columns
if 'concept' in df.columns and 'period_end' in df.columns and 'numeric_value' in df.columns:
trend_df = df.pivot_table(
index=['label', 'concept'],
columns='period_end',
values='numeric_value',
aggfunc='first'
)
return trend_df
return df
def to_dataframe(self, *columns) -> pd.DataFrame:
"""
Execute the query and return results as a DataFrame.
Args:
columns: List of columns to include in the DataFrame
Returns:
pandas DataFrame with query results
"""
results = self.execute()
if not results:
return pd.DataFrame()
df = pd.DataFrame(results)
df['value'] = df['value'].astype(str) # Ensure value is string for display
# Filter columns based on inclusion flags
if not self._include_dimensions:
df = df.loc[:, [col for col in df.columns if not col.startswith('dim_')]]
if not self._include_contexts:
context_cols = ['context_ref', 'entity_identifier', 'entity_scheme',
'period_type']
df = df.loc[:, [col for col in df.columns if col not in context_cols]]
if not self._include_element_info:
element_cols = ['element_id', 'element_name', 'element_type', 'element_period_type',
'element_balance', 'element_label']
df = df.loc[:, [col for col in df.columns if col not in element_cols]]
# Drop empty columns
df = df.dropna(axis=1, how='all')
# Filter columns if specified
if columns:
df = df[list(columns)]
# Skip these columns
skip_columns = ['fact_key', 'period_key']
# Order columns
first_columns = [col for col in
['concept', 'label', 'original_label', 'value', 'numeric_value',
'period_start', 'period_end', 'decimals', 'statement_type', 'fiscal_period']
if col in df.columns]
columns = first_columns + [col for col in df.columns
if col not in first_columns
and col not in skip_columns]
return df[columns]
    def __rich__(self):
        """Build a rich renderable: a usage guide plus a preview of query results."""
        title = Text.assemble(("Stitched Facts Query"),
        )
        # Subtitle shows entity and document type from the backing facts view.
        subtitle = Text.assemble((self._stitched_facts_view.entity_name, "bold deep_sky_blue1"),
        " - ",
        (self._stitched_facts_view.document_type)
        )
        # Materialize the query; NaNs are blanked out purely for display.
        df = self.to_dataframe().fillna('')
        columns = df.columns.tolist()
        description = Markdown(
        f"""
Use *to_dataframe(columns)* to get a DataFrame of the results.
e.g. `query.to_dataframe('concept', 'value', 'period_end')`
Available columns:
'{', '.join(columns)}'
**Enhanced Multi-Period Methods:**
- `across_periods(min_periods=2)` - Filter to concepts across multiple periods
- `by_standardized_concept('Revenue')` - Filter by standardized labels
- `by_original_label('Net sales')` - Filter by original company labels
- `trend_analysis('Revenue')` - Set up trend analysis
- `to_trend_dataframe()` - Get trend-optimized DataFrame
"""
        )
        # Preview only the standard columns that are actually present.
        display_columns = [col for col in ['label', 'concept', 'value', 'period_start', 'period_end', 'statement_type']
        if col in columns]
        if not df.empty:
            df_display = df[display_columns].head(10)  # Show first 10 rows
            table = Table(*display_columns, show_header=True, header_style="bold", box=box.SIMPLE)
            for t in df_display.itertuples(index=False):
                row = []
                for i in t:
                    row.append(str(i)[:50])  # Truncate long values
                table.add_row(*row)
        else:
            table = Table("No results found", box=box.SIMPLE)
        panel = Panel(Group(description, table), title=title, subtitle=subtitle, box=box.ROUNDED)
        return panel
def __repr__(self):
return repr_rich(self.__rich__())

View File

@@ -0,0 +1,106 @@
"""
XBRL Statement Stitching - Utility Functions
This module contains utility functions for rendering and converting stitched
statement data.
"""
from typing import Any, Dict, Optional
import pandas as pd
def render_stitched_statement(
    stitched_data: Dict[str, Any],
    statement_title: str,
    statement_type: str,
    entity_info: Optional[Dict[str, Any]] = None,
    show_date_range: bool = False,
    xbrl_instance: Optional[Any] = None
):
    """
    Render a stitched statement using the same rendering logic as individual statements.

    Args:
        stitched_data: Stitched statement data (expects 'periods' and 'statement_data' keys)
        statement_title: Title of the statement
        statement_type: Type of statement ('BalanceSheet', 'IncomeStatement', etc.)
        entity_info: Entity information (optional)
        show_date_range: Whether to show full date ranges for duration periods
        xbrl_instance: Optional XBRL instance passed through to the renderer

    Returns:
        RichTable: A formatted table representation of the stitched statement
    """
    # Imported lazily to avoid a circular import at module load time.
    from edgar.xbrl.rendering import render_statement

    periods_to_display = stitched_data['periods']
    statement_data = stitched_data['statement_data']

    # Multi-period views get an explicit "(N-Period View)" suffix in the title.
    if len(periods_to_display) > 1:
        period_desc = f" ({len(periods_to_display)}-Period View)"
        statement_title = f"{statement_title}{period_desc}"

    return render_statement(
        statement_data=statement_data,
        periods_to_display=periods_to_display,
        statement_title=statement_title,
        statement_type=statement_type,
        entity_info=entity_info,
        show_date_range=show_date_range,
        xbrl_instance=xbrl_instance
    )
def to_pandas(stitched_data: Dict[str, Any]) -> pd.DataFrame:
    """
    Convert stitched statement data to a pandas DataFrame.

    Args:
        stitched_data: Stitched statement data with 'periods' and 'statement_data'

    Returns:
        DataFrame with 'label' and 'concept' columns followed by one column per
        period (original ordering preserved, newest first), named by the
        period's end date in YYYY-MM-DD format
    """
    statement_data = stitched_data['statement_data']
    periods = stitched_data['periods']

    # Column name for a period is the trailing YYYY-MM-DD of its period id.
    # NOTE(review): two periods sharing an end date would collide on the same
    # column name — confirm upstream period ids are unique per end date.
    period_columns = [period_id[-10:] for period_id, _period_label in periods]

    # Metadata columns first, then one (initially empty) list per period.
    table: Dict[str, list] = {'label': [], 'concept': []}
    for column in period_columns:
        table[column] = []

    for item in statement_data:
        # Pure abstract headers carry no values and are dropped from the frame.
        if item['is_abstract'] and not item['has_values']:
            continue
        table['label'].append(item['label'])
        table['concept'].append(item['concept'])
        for period_id, _period_label in periods:
            table[period_id[-10:]].append(item['values'].get(period_id))

    return pd.DataFrame(table, columns=['label', 'concept'] + period_columns)

View File

@@ -0,0 +1,340 @@
"""
XBRL Statement Stitching - XBRLS Class
This module contains the XBRLS class which represents multiple XBRL filings
stitched together for multi-period analysis.
"""
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union
import pandas as pd
from edgar.xbrl.stitching.core import StatementStitcher, stitch_statements
from edgar.xbrl.stitching.query import StitchedFactQuery, StitchedFactsView
if TYPE_CHECKING:
from edgar._filings import Filings
from edgar.xbrl.statements import StitchedStatements
class XBRLS:
    """
    A class representing multiple XBRL filings stitched together.

    This provides a unified view of financial data across multiple time periods,
    automatically handling the complexities of statement stitching.
    """

    def __init__(self, xbrl_list: List[Any]):
        """
        Initialize an XBRLS instance with a list of XBRL objects.

        Args:
            xbrl_list: List of XBRL objects, should be from the same company
                and ordered from newest to oldest
        """
        self.xbrl_list = xbrl_list
        # Entity info is taken from the most recent filing (first in the list).
        self.entity_info = xbrl_list[0].entity_info if xbrl_list else {}
        # Cache of stitched statements keyed by the stitching parameters.
        self._statement_cache = {}
        # Lazily created facts view (see the `facts` property).
        self._stitched_facts_view = None

    @classmethod
    def from_filings(cls, filings: Union['Filings', List[Any]], filter_amendments: bool = True) -> 'XBRLS':
        """
        Create an XBRLS object from a Filings object or a list of Filing objects.

        Each filing should be the same form (e.g., 10-K, 10-Q) and from the same company.

        Args:
            filings: Filings object or list of Filing objects from the same company
            filter_amendments: Whether to exclude amended filings. Applied only when
                `filings` supports `.filter()` (i.e. is a Filings object); a plain
                list is used as-is.

        Returns:
            XBRLS object with stitched data
        """
        from edgar.xbrl.xbrl import XBRL

        # A plain list has no .filter(); only Filings objects can drop amendments.
        if filter_amendments and hasattr(filings, 'filter'):
            filtered_filings = filings.filter(amendments=False)
        else:
            filtered_filings = filings

        # Sort filings by date (newest first).
        sorted_filings = sorted(filtered_filings, key=lambda f: f.filing_date, reverse=True)

        # Parse each filing's XBRL; filings that fail to parse are skipped
        # (best-effort) rather than failing the whole multi-period view.
        xbrl_list = []
        for filing in sorted_filings:
            try:
                xbrl_list.append(XBRL.from_filing(filing))
            except Exception:
                pass
        return cls(xbrl_list)

    @classmethod
    def from_xbrl_objects(cls, xbrl_list: List[Any]) -> 'XBRLS':
        """
        Create an XBRLS object from a list of XBRL objects.

        Args:
            xbrl_list: List of XBRL objects, should be from the same company

        Returns:
            XBRLS object with stitched data
        """
        return cls(xbrl_list)

    @property
    def statements(self) -> 'StitchedStatements':
        """
        Get a user-friendly interface to access stitched financial statements.

        Returns:
            StitchedStatements object
        """
        from edgar.xbrl.statements import StitchedStatements
        return StitchedStatements(self)

    @property
    def facts(self) -> 'StitchedFactsView':
        """
        Get a view over stitched facts from all XBRL filings.

        Returns:
            StitchedFactsView for querying standardized, multi-period data
        """
        if self._stitched_facts_view is None:
            self._stitched_facts_view = StitchedFactsView(self)
        return self._stitched_facts_view

    def query(self,
              max_periods: int = 8,
              standardize: bool = True,
              statement_types: Optional[List[str]] = None,
              **kwargs) -> 'StitchedFactQuery':
        """
        Start a new query for stitched facts across all filings.

        Args:
            max_periods: Maximum periods to include in stitched data
            standardize: Whether to use standardized labels
            statement_types: List of statement types to include
            **kwargs: Additional options passed to StitchedFactQuery

        Returns:
            StitchedFactQuery for building complex queries
        """
        kwargs.update({
            'max_periods': max_periods,
            'standardize': standardize,
            'statement_types': statement_types
        })
        return self.facts.query(**kwargs)

    def get_statement(self, statement_type: str,
                      max_periods: int = 8,
                      standard: bool = True,
                      use_optimal_periods: bool = True,
                      include_dimensions: bool = False) -> Dict[str, Any]:
        """
        Get a stitched statement of the specified type.

        Args:
            statement_type: Type of statement to stitch ('IncomeStatement', 'BalanceSheet', etc.)
            max_periods: Maximum number of periods to include
            standard: Whether to use standardized concept labels
            use_optimal_periods: Whether to use entity info to determine optimal periods
            include_dimensions: Whether to include dimensional segment data (default: False for stitching)

        Returns:
            Dictionary with stitched statement data
        """
        # Results are cached per full parameter combination.
        cache_key = f"{statement_type}_{max_periods}_{standard}_{use_optimal_periods}_{include_dimensions}"
        if cache_key in self._statement_cache:
            return self._statement_cache[cache_key]

        result = stitch_statements(
            self.xbrl_list,
            statement_type=statement_type,
            period_type=StatementStitcher.PeriodType.ALL_PERIODS,
            max_periods=max_periods,
            standard=standard,
            use_optimal_periods=use_optimal_periods,
            include_dimensions=include_dimensions
        )
        self._statement_cache[cache_key] = result
        return result

    def render_statement(self, statement_type: str,
                         max_periods: int = 8,
                         standardize: bool = True,
                         use_optimal_periods: bool = True,
                         show_date_range: bool = False,
                         include_dimensions: bool = False):
        """
        Render a stitched statement in a rich table format.

        Args:
            statement_type: Type of statement to render ('BalanceSheet', 'IncomeStatement', etc.)
            max_periods: Maximum number of periods to include
            standardize: Whether to use standardized concept labels
            use_optimal_periods: Whether to use entity info to determine optimal periods
            show_date_range: Whether to show full date ranges for duration periods
            include_dimensions: Whether to include dimensional segment data (default: False for stitching)

        Returns:
            RichTable: A formatted table representation of the stitched statement
        """
        from edgar.xbrl.statements import StitchedStatement
        statement = StitchedStatement(self, statement_type, max_periods, standardize,
                                      use_optimal_periods, include_dimensions)
        return statement.render(show_date_range=show_date_range)

    def to_dataframe(self, statement_type: str,
                     max_periods: int = 8,
                     standardize: bool = True) -> pd.DataFrame:
        """
        Convert a stitched statement to a pandas DataFrame.

        Args:
            statement_type: Type of statement to convert ('BalanceSheet', 'IncomeStatement', etc.)
            max_periods: Maximum number of periods to include
            standardize: Whether to use standardized concept labels

        Returns:
            DataFrame with periods as columns and concepts as index
        """
        from edgar.xbrl.statements import StitchedStatement
        statement = StitchedStatement(self, statement_type, max_periods, standardize)
        return statement.to_dataframe()

    def get_periods(self) -> List[Dict[str, str]]:
        """
        Get all available periods across all XBRL objects.

        Returns:
            List of period information dictionaries, each containing:
            - 'type': 'instant' or 'duration'
            - 'key': period key (e.g., 'instant_2024-09-28', 'duration_2024-01-01_2024-09-28')
            - 'label': human-readable label
            For instant periods:
            - 'date': end date as 'YYYY-MM-DD'
            For duration periods:
            - 'start_date': start date as 'YYYY-MM-DD'
            - 'end_date': end date as 'YYYY-MM-DD'
            - 'days': duration in days
            - 'period_type': classification ('Annual', 'Quarterly', etc.)
        """
        all_periods = []
        for xbrl in self.xbrl_list:
            all_periods.extend(xbrl.reporting_periods)

        # De-duplicate by date signature; first (newest filing) occurrence wins.
        unique_periods = {}
        for period in all_periods:
            key = period['date'] if period['type'] == 'instant' else f"{period['start_date']}_{period['end_date']}"
            if key not in unique_periods:
                unique_periods[key] = period
        return list(unique_periods.values())

    def get_period_end_dates(self) -> List[str]:
        """
        Get end dates for all available periods in YYYY-MM-DD format.

        This is a convenience method that extracts just the end dates from periods,
        handling both instant and duration periods correctly.

        Returns:
            List of unique end dates as strings in YYYY-MM-DD format, sorted newest first
        """
        end_dates = []
        for period in self.get_periods():
            if period.get('type') == 'duration':
                end_date = period.get('end_date')
            elif period.get('type') == 'instant':
                end_date = period.get('date')
            else:
                continue
            if end_date:
                end_dates.append(end_date)
        # set() de-duplicates; ISO dates sort lexicographically, newest first.
        return sorted(set(end_dates), reverse=True)

    def __str__(self) -> str:
        """
        String representation of the XBRLS object.

        Returns:
            String representation
        """
        filing_count = len(self.xbrl_list)
        periods = self.get_periods()
        return f"XBRLS with {filing_count} filings covering {len(periods)} unique periods"

    def __rich__(self):
        """
        Rich representation for pretty console output.

        Returns:
            Rich console representation
        """
        from rich.panel import Panel
        from rich.text import Text

        filing_count = len(self.xbrl_list)
        periods = self.get_periods()

        content = Text.from_markup("[bold]XBRLS Object[/bold]\n")
        content.append(f"Filings: {filing_count}\n")
        content.append(f"Unique Periods: {len(periods)}\n")

        # Collect the distinct statement types available across all filings.
        statement_types = set()
        for xbrl in self.xbrl_list:
            for stmt in xbrl.get_all_statements():
                if stmt['type']:
                    statement_types.add(stmt['type'])

        # Text.append() does not interpret console markup, so markup-bearing
        # fragments must be built with Text.from_markup() before appending.
        content.append(Text.from_markup("\n[bold]Available Statement Types:[/bold]\n"))
        for stmt_type in sorted(statement_types):
            content.append(f"- {stmt_type}\n")

        content.append(Text.from_markup("\n[bold]Example Usage:[/bold]\n"))
        content.append("xbrls.statements.income_statement()\n")
        content.append("xbrls.statements.balance_sheet()\n")
        content.append("xbrls.to_dataframe('IncomeStatement')\n")
        return Panel(content, title="XBRLS", expand=False)

File diff suppressed because it is too large Load Diff