Initial commit

This commit is contained in:
kdusek
2025-12-09 12:13:01 +01:00
commit 8e654ed209
13332 changed files with 2695056 additions and 0 deletions

View File

@@ -0,0 +1,27 @@
"""
XBRL Statement Stitching Package
This package provides functionality to combine multiple XBRL statements
across different time periods into a unified view, handling concept
consistency issues and normalizing data representation.
"""
# Import standardize_statement for backwards compatibility with tests
from edgar.xbrl.standardization import standardize_statement
from edgar.xbrl.stitching.core import StatementStitcher, stitch_statements
from edgar.xbrl.stitching.periods import determine_optimal_periods
from edgar.xbrl.stitching.query import StitchedFactQuery, StitchedFactsView
from edgar.xbrl.stitching.utils import render_stitched_statement, to_pandas
from edgar.xbrl.stitching.xbrls import XBRLS
# Explicit public API of the stitching package; names re-exported above.
__all__ = [
    'XBRLS',
    'StatementStitcher',
    'stitch_statements',
    'determine_optimal_periods',
    'render_stitched_statement',
    'to_pandas',
    'standardize_statement',  # re-exported for backwards compatibility with tests
    'StitchedFactsView',
    'StitchedFactQuery'
]

View File

@@ -0,0 +1,621 @@
"""
XBRL Statement Stitching - Core Functionality
This module contains the core StatementStitcher class and related functionality
for combining multiple XBRL statements across different time periods.
"""
from collections import defaultdict
from datetime import datetime
from enum import Enum
from typing import Any, Dict, List, Optional, Set, Tuple, Union
from edgar.xbrl.core import format_date, parse_date
from edgar.xbrl.standardization import ConceptMapper, initialize_default_mappings, standardize_statement
from edgar.xbrl.stitching.ordering import StatementOrderingManager
from edgar.xbrl.stitching.periods import determine_optimal_periods
from edgar.xbrl.stitching.presentation import VirtualPresentationTree
class StatementStitcher:
    """
    Combines multiple statements across time periods into a unified view.

    This class handles the complexities of combining financial statements
    from different periods, including:
    - Normalizing concepts that change over time
    - Aligning periods correctly
    - Handling missing data points
    - Providing both standardized and company-specific views
    """

    class PeriodType(str, Enum):
        """Types of period views available for stitched statements"""
        RECENT_PERIODS = "Most Recent Periods"
        RECENT_YEARS = "Recent Years"
        THREE_YEAR_COMPARISON = "Three-Year Comparison"
        THREE_QUARTERS = "Three Recent Quarters"
        ANNUAL_COMPARISON = "Annual Comparison"
        QUARTERLY_TREND = "Quarterly Trend"
        ALL_PERIODS = "All Available Periods"

    def __init__(self, concept_mapper: Optional[ConceptMapper] = None):
        """
        Initialize a StatementStitcher instance.

        Args:
            concept_mapper: Optional ConceptMapper for standardizing concepts.
                If None, a default mapper is created.
        """
        if concept_mapper is None:
            self.mapping_store = initialize_default_mappings()
            self.concept_mapper = ConceptMapper(self.mapping_store)
        else:
            self.concept_mapper = concept_mapper
            self.mapping_store = concept_mapper.mapping_store
        # Initialize data structures
        self.periods = []  # Ordered list of period identifiers
        self.period_dates = {}  # Maps period ID to display dates
        self.data = defaultdict(dict)  # {concept: {period: value}}
        self.concept_metadata = {}  # Metadata for each concept (level, etc.)
        self.ordering_manager = None  # Will be initialized during stitching
        self.original_statement_order = []  # Track original order for hierarchy context

    def stitch_statements(
        self,
        statements: List[Dict[str, Any]],
        period_type: Union[PeriodType, str] = PeriodType.RECENT_PERIODS,
        max_periods: Optional[int] = None,
        standard: bool = True
    ) -> Dict[str, Any]:
        """
        Stitch multiple statements into a unified view.

        Args:
            statements: List of statement data from different filings,
                ordered newest-first (index 0 is treated as the most recent).
            period_type: Type of period view to generate
            max_periods: Maximum number of periods to include. If None,
                defaults to len(statements) + 2.
            standard: Whether to use standardized concept labels

        Returns:
            Dictionary with stitched statement data
        """
        # Reset state so the same instance can be reused across calls
        self.periods = []
        self.period_dates = {}
        self.data = defaultdict(dict)
        self.concept_metadata = {}
        self.original_statement_order = []
        # Initialize ordering manager for this statement type
        statement_type = statements[0].get('statement_type', 'IncomeStatement') if statements else 'IncomeStatement'
        self.ordering_manager = StatementOrderingManager(statement_type)
        # Capture original statement order from the most recent (first) statement for hierarchy context
        if statements:
            reference_statement = statements[0]
            self.original_statement_order = []
            for item in reference_statement.get('data', []):
                concept = item.get('concept')
                label = item.get('label')
                if concept:
                    self.original_statement_order.append(concept)
                # Labels are tracked alongside concepts so ordering can match either key
                if label and label not in self.original_statement_order:
                    self.original_statement_order.append(label)
        # Extract and sort all periods
        all_periods = self._extract_periods(statements)
        # Set max_periods if not provided
        max_periods = max_periods or len(statements) + 2  # Allow for the last statement to have 3 periods
        # Select appropriate periods based on period_type
        selected_periods = self._select_periods(all_periods, period_type, max_periods)
        self.periods = selected_periods
        # Process each statement
        for _i, statement in enumerate(statements):
            # Only process statements that have periods in our selection
            statement_periods = set(statement['periods'].keys())
            relevant_periods = statement_periods.intersection(set(selected_periods))
            if not relevant_periods:
                continue
            # Standardize the statement if needed
            if standard:
                processed_data = self._standardize_statement_data(statement)
            else:
                processed_data = statement['data']
            # Store data for each item
            self._integrate_statement_data(processed_data, statement['periods'], relevant_periods)
        # Format the stitched data
        return self._format_output_with_ordering(statements)

    def _extract_periods(self, statements: List[Dict[str, Any]]) -> List[Tuple[str, datetime]]:
        """
        Extract and sort all periods from the statements, de-duplicating periods with the same date.

        Args:
            statements: List of statement data

        Returns:
            List of (period_id, end_date) tuples, sorted by date (newest first)
        """
        # Use a dictionary to track unique periods by their end date
        # This will handle cases where different period_ids reference the same date
        unique_periods = {}  # key: date string, value: (period_id, datetime, statement_index)
        for i, statement in enumerate(statements):
            # Use statement index (i) to prioritize more recent filings
            # Lower index = more recent filing
            for period_id, period_info in statement['periods'].items():
                # Extract end date for sorting
                try:
                    # Initialize normalized_key to silence the type checker
                    normalized_key = ""
                    if period_id.startswith('instant_'):
                        date_str = period_id.split('_')[1]
                        # Format the date consistently with single statements
                        try:
                            date_obj = parse_date(date_str)
                            display_date = format_date(date_obj)
                        except ValueError:
                            # Fall back to original label if parsing fails
                            display_date = period_info['label']
                        period_type = 'instant'
                        # For instant periods, create a normalized key with just the date
                        normalized_key = f"{period_type}_{date_str}"
                    else:  # duration
                        # For durations, extract both start and end dates
                        parts = period_id.split('_')
                        if len(parts) >= 3:
                            start_date_str = parts[1]
                            end_date_str = parts[2]
                            start_date = parse_date(start_date_str)
                            end_date = parse_date(end_date_str)
                            date_str = end_date_str  # Use end date for sorting
                            # Format end date consistently - for stitched statements,
                            # we only need the end date for duration periods as that's what users compare
                            display_date = format_date(end_date)
                            period_type = 'duration'
                            # Create a normalized key that combines period type, start date, and end date
                            normalized_key = f"{period_type}_{format_date(start_date)}_{format_date(end_date)}"
                        else:
                            # Skip malformed period IDs
                            continue
                    # Parse the end date for sorting
                    end_date = parse_date(date_str)
                    # Check if we already have this period (by normalized key)
                    if normalized_key in unique_periods:
                        existing_idx = unique_periods[normalized_key][2]
                        # Only replace if this statement is from a more recent filing
                        # NOTE: the superseded period_id's entry remains in
                        # self.period_dates; harmless because output only looks
                        # up the period_ids that were ultimately selected.
                        if i < existing_idx:
                            unique_periods[normalized_key] = (period_id, end_date, i)
                            self.period_dates[period_id] = display_date
                    else:
                        # Add new period
                        unique_periods[normalized_key] = (period_id, end_date, i)
                        self.period_dates[period_id] = display_date
                except (ValueError, TypeError, IndexError):
                    # Skip periods with invalid dates
                    continue
        # Extract and sort the unique periods
        all_periods = [(period_id, end_date) for period_id, end_date, _ in unique_periods.values()]
        # Sort by date, newest first
        return sorted(all_periods, key=lambda x: x[1], reverse=True)

    def _select_periods(
        self,
        all_periods: List[Tuple[str, Union[str, datetime]]],
        period_type: Union[PeriodType, str],
        max_periods: int
    ) -> List[str]:
        """
        Select appropriate periods based on period_type.

        Args:
            all_periods: List of (period_id, end_date) tuples
            period_type: Type of period view to generate
            max_periods: Maximum number of periods to include

        Returns:
            List of selected period IDs
        """
        if isinstance(period_type, str):
            try:
                period_type = StatementStitcher.PeriodType(period_type)
            except ValueError:
                # Default to recent periods if string doesn't match enum
                period_type = StatementStitcher.PeriodType.RECENT_PERIODS
        # Extract period types (instant vs duration)
        instants = [(pid, date) for pid, date in all_periods if pid.startswith('instant_')]
        durations = [(pid, date) for pid, date in all_periods if not pid.startswith('instant_')]
        # Apply different selection logic based on period_type
        # NOTE: RECENT_YEARS and QUARTERLY_TREND have no dedicated branch and
        # fall through to the default "most recent periods" behavior below.
        if period_type == StatementStitcher.PeriodType.RECENT_PERIODS:
            # Just take the most recent periods up to max_periods
            return [pid for pid, _ in all_periods[:max_periods]]
        elif period_type == StatementStitcher.PeriodType.THREE_YEAR_COMPARISON:
            # For balance sheets, find year-end instants
            year_ends = []
            years_seen = set()
            for pid, date in instants:
                # NOTE(review): `date` originates from _extract_periods as a
                # datetime, yet parse_date() is called on it here — confirm
                # parse_date accepts datetime inputs (otherwise `date.year`
                # would be needed).
                year = parse_date(date).year
                if year not in years_seen and len(year_ends) < max_periods:
                    year_ends.append(pid)
                    years_seen.add(year)
            return year_ends
        elif period_type == StatementStitcher.PeriodType.THREE_QUARTERS:
            # Find the most recent quarters (for income statements)
            quarterly_periods = []
            for pid, _date in durations:
                # Check if this appears to be a quarterly period
                if not pid.startswith('duration_'):
                    continue
                start_date_str = pid.split('_')[1]
                end_date_str = pid.split('_')[2]
                try:
                    start_date = parse_date(start_date_str)
                    end_date = parse_date(end_date_str)
                    days = (end_date - start_date).days
                    # Assuming quarterly is around 90 days
                    if 80 <= days <= 95:
                        quarterly_periods.append(pid)
                        if len(quarterly_periods) >= max_periods:
                            break
                except (ValueError, TypeError, IndexError):
                    continue
            return quarterly_periods
        elif period_type == StatementStitcher.PeriodType.ANNUAL_COMPARISON:
            # Find annual periods (for income statements)
            annual_periods = []
            for pid, _date in durations:
                # Check if this appears to be an annual period
                if not pid.startswith('duration_'):
                    continue
                start_date_str = pid.split('_')[1]
                end_date_str = pid.split('_')[2]
                try:
                    start_date = parse_date(start_date_str)
                    end_date = parse_date(end_date_str)
                    days = (end_date - start_date).days
                    # Assuming annual is around 365 days
                    if 350 <= days <= 380:
                        annual_periods.append(pid)
                        if len(annual_periods) >= max_periods:
                            break
                except (ValueError, TypeError, IndexError):
                    continue
            return annual_periods
        elif period_type == StatementStitcher.PeriodType.ALL_PERIODS:
            # Return all periods, newest first, up to max_periods
            return [pid for pid, _ in all_periods[:max_periods]]
        # Default to recent periods
        return [pid for pid, _ in all_periods[:max_periods]]

    def _standardize_statement_data(self, statement: Dict[str, Any]) -> List[Dict[str, Any]]:
        """
        Standardize the statement data using the concept mapper.

        Args:
            statement: Statement data

        Returns:
            Standardized statement data
        """
        # Add statement type to context for better mapping
        statement_type = statement.get('statement_type', '')
        statement_data = statement['data']
        # Mutates the items in place before handing them to the mapper
        for item in statement_data:
            item['statement_type'] = statement_type
        # Apply standardization using the concept mapper
        return standardize_statement(statement_data, self.concept_mapper)

    def _integrate_statement_data(
        self,
        statement_data: List[Dict[str, Any]],
        period_map: Dict[str, Dict[str, str]],
        relevant_periods: Set[str]
    ) -> None:
        """
        Integrate statement data from one statement into the stitched view.

        Args:
            statement_data: Statement data
            period_map: Map of period IDs to period information.
                NOTE(review): not referenced in this body — kept for
                interface stability; confirm before removing.
            relevant_periods: Set of periods from this statement to include
        """
        # Map to track concepts by their underlying concept ID, not just label
        # This helps merge rows that represent the same concept but have different labels
        concept_to_label_map = {}
        for item in statement_data:
            concept = item.get('concept')
            label = item.get('label')
            # Skip items without concept or label
            if not concept or not label:
                continue
            # Skip abstract items with no children (headers without data)
            if item.get('is_abstract', False) and not item.get('children'):
                continue
            # Skip dimension items
            if any(bracket in label for bracket in ['[Axis]', '[Domain]', '[Member]', '[Line Items]', '[Table]', '[Abstract]']):
                continue
            # Use concept as the primary key for identifying the same financial line item
            # This is more reliable than labels which may vary across filings
            # If we've already seen this concept, use the existing label as the key
            # This ensures we merge rows that represent the same concept
            if concept in concept_to_label_map:
                concept_key = concept_to_label_map[concept]
            else:
                # For a new concept, use the current label as the key
                concept_key = label
                # Remember this mapping for future occurrences
                concept_to_label_map[concept] = concept_key
            # Store metadata about the concept (level, abstract status, etc.)
            # If we've already seen this concept, only update metadata if it's from a more recent period
            # This ensures we use labels from the most recent filing when merging rows
            if concept_key not in self.concept_metadata:
                self.concept_metadata[concept_key] = {
                    'level': item.get('level', 0),
                    'is_abstract': item.get('is_abstract', False),
                    'is_total': item.get('is_total', False) or 'total' in label.lower(),
                    'original_concept': concept,
                    'latest_label': label  # Store the original label too
                }
            else:
                # For existing concepts, update the label to use the most recent one
                # We determine which periods are most recent based on position in self.periods
                # (earlier indices are more recent periods)
                # Find the periods in this statement
                statement_periods = [p for p in relevant_periods if p in self.periods]
                if statement_periods:
                    # Get the most recent period in this statement
                    most_recent_period = min(statement_periods, key=lambda p: self.periods.index(p))
                    most_recent_idx = self.periods.index(most_recent_period)
                    # Find the earliest period where we have data for this concept
                    existing_periods = [p for p in self.data[concept_key].keys() if p in self.periods]
                    if existing_periods:
                        earliest_existing_idx = min(self.periods.index(p) for p in existing_periods)
                        # If this statement has more recent data, update the label
                        if most_recent_idx < earliest_existing_idx:
                            # Update the concept key label for display
                            new_concept_key = label
                            # If we're changing the label, we need to migrate existing data
                            if new_concept_key != concept_key:
                                # Copy existing data to the new key
                                if new_concept_key not in self.data:
                                    self.data[new_concept_key] = self.data[concept_key].copy()
                                # Update metadata
                                self.concept_metadata[new_concept_key] = self.concept_metadata[concept_key].copy()
                                self.concept_metadata[new_concept_key]['latest_label'] = label
                                # Update the concept mapping
                                concept_to_label_map[concept] = new_concept_key
                                concept_key = new_concept_key
                            else:
                                # Just update the latest label
                                self.concept_metadata[concept_key]['latest_label'] = label
            # Store values for relevant periods
            for period_id in relevant_periods:
                if period_id in self.periods:  # Only include selected periods
                    value = item.get('values', {}).get(period_id)
                    if value is not None:
                        self.data[concept_key][period_id] = {
                            'value': value,
                            'decimals': item.get('decimals', {}).get(period_id, 0)
                        }

    def _format_output_with_ordering(self, statements: List[Dict[str, Any]]) -> Dict[str, Any]:
        """
        Format the stitched data for rendering with intelligent ordering using virtual presentation tree.

        Args:
            statements: Original statements for ordering reference

        Returns:
            Stitched statement data in the expected format
        """
        # Get unified ordering for all concepts using the ordering manager
        concept_ordering = {}
        if self.ordering_manager:
            concept_ordering = self.ordering_manager.determine_ordering(statements)
        # Build virtual presentation tree to preserve hierarchy while applying semantic ordering
        presentation_tree = VirtualPresentationTree(self.ordering_manager)
        ordered_nodes = presentation_tree.build_tree(
            concept_metadata=self.concept_metadata,
            concept_ordering=concept_ordering,
            original_statement_order=self.original_statement_order
        )
        # Convert nodes back to the expected format
        ordered_concepts = [(node.concept, node.metadata) for node in ordered_nodes]
        # Build the output structure
        result = {
            'periods': [(pid, self.period_dates.get(pid, pid)) for pid in self.periods],
            'statement_data': []
        }
        for concept, metadata in ordered_concepts:
            # Create an item for each concept
            item = {
                # Use the latest label if available, otherwise fall back to the concept key
                'label': metadata.get('latest_label', concept),
                'level': metadata['level'],
                'is_abstract': metadata['is_abstract'],
                'is_total': metadata['is_total'],
                'concept': metadata['original_concept'],
                'values': {},
                'decimals': {}
            }
            # Add values for each period
            for period_id in self.periods:
                if period_id in self.data[concept]:
                    item['values'][period_id] = self.data[concept][period_id]['value']
                    item['decimals'][period_id] = self.data[concept][period_id]['decimals']
            # Set has_values flag based on whether there are any values
            item['has_values'] = len(item['values']) > 0
            # Only include items with values or abstract items
            if item['has_values'] or item['is_abstract']:
                result['statement_data'].append(item)
        return result

    def _format_output(self) -> Dict[str, Any]:
        """
        Backward compatibility method - calls the new ordering-aware method.

        Returns:
            Stitched statement data in the expected format
        """
        # For backward compatibility, call the new method with empty statements
        # This will use alphabetical ordering as before
        return self._format_output_with_ordering([])
def stitch_statements(
    xbrl_list: List[Any],
    statement_type: str = 'IncomeStatement',
    period_type: Union[StatementStitcher.PeriodType, str] = StatementStitcher.PeriodType.RECENT_PERIODS,
    max_periods: int = 3,
    standard: bool = True,
    use_optimal_periods: bool = True,
    include_dimensions: bool = False
) -> Dict[str, Any]:
    """
    Stitch together statements from multiple XBRL objects.

    Args:
        xbrl_list: List of XBRL objects, should be from the same company and ordered by date
        statement_type: Type of statement to stitch ('IncomeStatement', 'BalanceSheet', etc.)
        period_type: Type of period view to generate
        max_periods: Maximum number of periods to include (default: 3)
        standard: Whether to use standardized concept labels (default: True)
        use_optimal_periods: Whether to use the entity info to determine optimal periods (default: True)
        include_dimensions: Whether to include dimensional segment data (default: False for stitching)

    Returns:
        Stitched statement data
    """
    # Initialize the stitcher
    stitcher = StatementStitcher()
    # Collect statements of the specified type from each XBRL object
    statements = []
    # If using optimal periods based on entity info
    if use_optimal_periods:
        # Use our utility function to determine the best periods
        optimal_periods = determine_optimal_periods(xbrl_list, statement_type, max_periods=max_periods)
        # Limit to max_periods if needed
        if len(optimal_periods) > max_periods:
            optimal_periods = optimal_periods[:max_periods]
        # Extract the XBRL objects that contain our optimal periods
        for period_metadata in optimal_periods:
            xbrl_index = period_metadata['xbrl_index']
            xbrl = xbrl_list[xbrl_index]
            # Get the statement and period info
            statement = xbrl.get_statement_by_type(statement_type, include_dimensions=include_dimensions)
            if not statement:
                continue
            # Only include the specific period from this statement
            period_key = period_metadata['period_key']
            # Check if this period exists in the statement
            if period_key not in statement['periods']:
                continue
            # Create a filtered version of the statement with just this period
            filtered_statement = {
                'role': statement['role'],
                'definition': statement['definition'],
                'statement_type': statement['statement_type'],
                'periods': {period_key: statement['periods'][period_key]},
                'data': statement['data']
            }
            # Update the period label to include information from entity_info
            display_date = period_metadata['display_date']
            # BUG FIX: this value was previously stored in a local named
            # `period_type`, shadowing the function parameter of the same
            # name. The shadowed value ('instant'/'duration') was then passed
            # to stitcher.stitch_statements below, silently discarding the
            # caller's requested period view.
            duration_kind = period_metadata['period_type']
            fiscal_period = period_metadata.get('fiscal_period')
            # Create a more informative label
            if duration_kind == 'instant':
                if fiscal_period == 'FY':
                    period_label = f"FY {display_date}"
                else:
                    period_label = display_date
            else:  # duration
                # For duration periods, add fiscal quarter/year info if available
                if fiscal_period == 'FY':
                    period_label = f"FY {display_date}"
                elif fiscal_period in ['Q1', 'Q2', 'Q3', 'Q4']:
                    period_label = f"{fiscal_period} {display_date}"
                else:
                    period_label = display_date
            # Update the period label
            filtered_statement['periods'][period_key] = {
                'label': period_label,
                'original_label': statement['periods'][period_key]['label']
            }
            statements.append(filtered_statement)
    # Traditional approach without using entity info
    else:
        for xbrl in xbrl_list:
            # Get statement data for the specified type
            statement = xbrl.find_statement(statement_type)
            if statement:
                statements.append(statement)
    # Stitch the statements, honoring the caller's requested period_type
    return stitcher.stitch_statements(statements, period_type, max_periods, standard)

View File

@@ -0,0 +1,833 @@
"""
XBRL Statement Ordering - Intelligent Ordering for Multi-Period Statements
This module provides consistent ordering for financial statements across multiple periods
by combining template-based, reference-based, and semantic positioning strategies.
"""
import re
from enum import Enum
from typing import Dict, List, Optional, Tuple
try:
    from rapidfuzz import fuzz
except ImportError:
    # rapidfuzz is an optional dependency; emulate the single method this
    # module needs (fuzz.ratio) on top of the stdlib's difflib.
    from difflib import SequenceMatcher

    class fuzz:
        """Drop-in stand-in for rapidfuzz.fuzz providing ratio() only."""

        @staticmethod
        def ratio(s1: str, s2: str) -> float:
            # SequenceMatcher reports similarity in [0, 1]; rapidfuzz
            # reports it in [0, 100], so scale to match.
            matcher = SequenceMatcher(None, s1, s2)
            return matcher.ratio() * 100
class StatementType(str, Enum):
    """
    Supported statement types for ordering.

    Inherits from str so members compare equal to their plain string values
    (e.g. StatementType.INCOME_STATEMENT == "IncomeStatement").
    """
    INCOME_STATEMENT = "IncomeStatement"
    BALANCE_SHEET = "BalanceSheet"
    CASH_FLOW = "CashFlowStatement"
    EQUITY = "StatementOfEquity"
class FinancialStatementTemplates:
    """
    Canonical ordering templates for financial statements based on XBRL concepts.

    Each template is a list of (base_position, section_name, entries) tuples.
    A matched entry's position is base_position + its index within the section,
    so sections occupy disjoint numeric bands (e.g. revenue 0-99, costs 100-199).
    """

    # Entries here are XBRL concept identifiers, matched by
    # _normalize_xbrl_concept in get_template_position.
    INCOME_STATEMENT_TEMPLATE = [
        # Revenue Section (0-99)
        (0, "revenue_section", [
            # Product/Service Revenue Components
            "us-gaap:SalesRevenueGoodsNet",
            "us-gaap:ProductSales",
            "us-gaap:SalesRevenueServicesNet",
            "us-gaap:SubscriptionRevenue",
            # Contract Revenue
            "us-gaap:RevenueFromContractWithCustomerExcludingAssessedTax",
            "us-gaap:RevenueFromContractWithCustomerIncludingAssessedTax",
            # Total Revenue
            "us-gaap:Revenue",
            "us-gaap:Revenues",
            "us-gaap:SalesRevenueNet",
            "us-gaap:OperatingRevenue"
        ]),
        # Cost Section (100-199)
        (100, "cost_section", [
            "us-gaap:CostOfRevenueAbstract",  # Abstract
            "us-gaap:CostOfRevenue",  # Total
            "us-gaap:CostOfGoodsSold",
            "us-gaap:CostOfGoodsAndServicesSold",
            "us-gaap:CostOfSales",
            "us-gaap:DirectOperatingCosts",
            "us-gaap:CostsAndExpenses"
        ]),
        # Gross Profit (200-299)
        (200, "gross_profit", [
            "us-gaap:GrossProfit"
        ]),
        # Operating Expenses (300-399)
        (300, "operating_expenses", [
            # R&D Expenses
            "us-gaap:ResearchAndDevelopmentCosts",
            "us-gaap:ResearchAndDevelopmentExpense",
            # SG&A Expenses
            "us-gaap:SellingGeneralAndAdministrativeExpense",
            "us-gaap:GeneralAndAdministrativeExpense",
            "us-gaap:AdministrativeExpense",
            "us-gaap:SellingAndMarketingExpense",
            "us-gaap:SellingExpense",
            "us-gaap:MarketingExpense",
            "us-gaap:AdvertisingExpense",
            # Total Operating Expenses
            "us-gaap:NoninterestExpense",
            "us-gaap:OperatingCostsAndExpenses",
            "us-gaap:OperatingExpenses"
        ]),
        # Operating Income (400-499)
        (400, "operating_income", [
            "us-gaap:OperatingIncomeLoss",
            "us-gaap:OperatingIncome",
            "us-gaap:IncomeLossFromContinuingOperationsBeforeInterestAndTaxes"
        ]),
        # Non-Operating (500-599)
        (500, "non_operating", [
            "us-gaap:InterestIncomeExpenseNet",
            "us-gaap:InterestAndDebtExpense",
            "us-gaap:InterestExpense",
            "us-gaap:InterestExpenseNonoperating",  # ADBE uses this for non-operating interest expense
            "us-gaap:InterestIncome",
            "us-gaap:InvestmentIncomeInterest",  # NVIDIA uses this variant
            "us-gaap:OtherNonoperatingIncomeExpense",
            "us-gaap:NonoperatingIncomeExpense",
            "orcl:NonoperatingIncomeExpenseIncludingEliminationOfNetIncomeLossAttributableToNoncontrollingInterests"
        ]),
        # Pre-Tax Income (600-699)
        (600, "pretax_income", [
            "us-gaap:IncomeLossBeforeIncomeTaxes",
            "us-gaap:IncomeLossFromContinuingOperationsBeforeIncomeTaxes",
            "us-gaap:IncomeLossFromContinuingOperationsBeforeIncomeTaxesExtraordinaryItemsNoncontrollingInterest",
            "orcl:IncomeLossFromContinuingOperationsIncludingNoncontrollingInterestBeforeIncomeTaxesExtraordinaryItems"
        ]),
        # Tax (700-799)
        (700, "tax", [
            "us-gaap:IncomeTaxesPaidNet",
            "us-gaap:IncomeTaxExpenseBenefit"
        ]),
        # Net Income (800-899)
        (800, "net_income", [
            "us-gaap:IncomeLossFromContinuingOperationsIncludingPortionAttributableToNoncontrollingInterest",
            "us-gaap:IncomeLossFromContinuingOperations",
            "us-gaap:NetIncome",
            "us-gaap:NetIncomeLoss",
            "us-gaap:ProfitLoss",
            "us-gaap:NetIncomeLossAttributableToNonredeemableNoncontrollingInterest",
            "us-gaap:NetIncomeLossAttributableToNoncontrollingInterest"
        ]),
        # Per Share Data (900-999)
        (900, "per_share", [
            "us-gaap:EarningsPerShareAbstract",
            "us-gaap:EarningsPerShareBasic",
            "us-gaap:EarningsPerShareDiluted",
            "us-gaap:WeightedAverageNumberOfSharesOutstandingAbstract",
            "us-gaap:WeightedAverageNumberOfSharesOutstandingBasic",
            "us-gaap:WeightedAverageNumberOfDilutedSharesOutstanding"
        ])
    ]

    # NOTE: unlike the income-statement template above, these entries are
    # human-readable labels, not XBRL concepts — they match only via the
    # label-based fallback (_labels_match) in get_template_position.
    BALANCE_SHEET_TEMPLATE = [
        # Current Assets (0-199)
        (0, "current_assets", [
            "Cash and Cash Equivalents",
            "Cash",
            "Short-term Investments",
            "Marketable Securities",
            "Accounts Receivable",
            "Trade Receivables",
            "Inventory",
            "Prepaid Expenses",
            "Other Current Assets",
            "Total Current Assets"
        ]),
        # Non-Current Assets (200-399)
        (200, "noncurrent_assets", [
            "Property, Plant and Equipment",
            "Property and Equipment",
            "Long-term Investments",
            "Goodwill",
            "Intangible Assets",
            "Other Non-current Assets",
            "Total Non-current Assets",
            "Total Assets"
        ]),
        # Current Liabilities (400-599)
        (400, "current_liabilities", [
            "Accounts Payable",
            "Trade Payables",
            "Accrued Liabilities",
            "Accrued Expenses",
            "Short-term Debt",
            "Current Portion of Long-term Debt",
            "Other Current Liabilities",
            "Total Current Liabilities"
        ]),
        # Non-Current Liabilities (600-799)
        (600, "noncurrent_liabilities", [
            "Long-term Debt",
            "Deferred Revenue",
            "Deferred Tax Liabilities",
            "Other Non-current Liabilities",
            "Total Non-current Liabilities",
            "Total Liabilities"
        ]),
        # Equity (800-999)
        (800, "equity", [
            "Common Stock",
            "Additional Paid-in Capital",
            "Retained Earnings",
            "Accumulated Other Comprehensive Income",
            "Treasury Stock",
            "Total Stockholders' Equity",
            "Total Shareholders' Equity",
            "Total Equity"
        ])
    ]

    def get_template_position(self, item_concept: str, item_label: str, statement_type: str) -> Optional[float]:
        """
        Get template position for an item, prioritizing concept-based matching over label matching.

        Args:
            item_concept: The XBRL concept (e.g., "us-gaap:Revenue")
            item_label: The display label (e.g., "Contract Revenue")
            statement_type: Type of statement ("IncomeStatement", "BalanceSheet", etc.)

        Returns:
            Float position in template, or None if no match found
        """
        # Handle different statement type formats
        if statement_type == "IncomeStatement":
            template_name = "INCOME_STATEMENT_TEMPLATE"
        elif statement_type == "BalanceSheet":
            template_name = "BALANCE_SHEET_TEMPLATE"
        else:
            # e.g. "CashFlowStatement" -> "CASHFLOWSTATEMENT_TEMPLATE";
            # no such attribute exists for other types, so getattr yields None
            template_name = f"{statement_type.upper()}_TEMPLATE"
        template = getattr(self, template_name, None)
        if not template:
            return None
        # Strategy 1: Direct concept matching (highest priority)
        if item_concept:
            normalized_concept = self._normalize_xbrl_concept(item_concept)
            for base_pos, _section_name, template_concepts in template:
                for i, template_concept in enumerate(template_concepts):
                    template_normalized = self._normalize_xbrl_concept(template_concept)
                    if normalized_concept == template_normalized:
                        return float(base_pos + i)
        # Strategy 2: Label-based matching as fallback (for compatibility)
        if item_label:
            for base_pos, _section_name, template_concepts in template:
                for i, template_concept in enumerate(template_concepts):
                    if self._labels_match(item_label, template_concept):
                        return float(base_pos + i)
        return None

    def _normalize_xbrl_concept(self, concept: str) -> str:
        """
        Normalize XBRL concept for matching.

        Handles variations in concept format:
        - "us-gaap:Revenue" vs "us-gaap_Revenue"
        - Case sensitivity
        - Namespace prefixes
        """
        if not concept:
            return ""
        # Normalize separators (: vs _)
        normalized = concept.lower()
        normalized = normalized.replace(':', '_')
        # Handle common namespace variations
        # us-gaap, usgaap, gaap all should match
        if normalized.startswith('us-gaap_') or normalized.startswith('usgaap_'):
            normalized = 'us-gaap_' + normalized.split('_', 1)[1]
        elif normalized.startswith('gaap_'):
            normalized = 'us-gaap_' + normalized.split('_', 1)[1]
        return normalized

    def _labels_match(self, label1: str, label2: str) -> bool:
        """Check if two labels represent the same financial item (fallback for non-concept matching)"""
        if not label1 or not label2:
            return False
        # For XBRL concepts in templates, don't try to match against labels
        if ':' in label2 or '_gaap_' in label2.lower():
            return False
        # Use existing normalization logic for label matching
        norm1 = self._normalize_concept(label1)
        norm2 = self._normalize_concept(label2)
        # Exact match
        if norm1 == norm2:
            return True
        # Fuzzy matching for similar concepts
        similarity = fuzz.ratio(norm1, norm2) / 100.0
        return similarity > 0.7

    def _concepts_match(self, concept1: str, concept2: str) -> bool:
        """
        Check if two concepts represent the same financial item.

        NOTE(review): not called anywhere in the visible code of this class —
        appears to duplicate _labels_match; confirm external callers before
        removing.
        """
        # Normalize for comparison
        norm1 = self._normalize_concept(concept1)
        norm2 = self._normalize_concept(concept2)
        # Exact match
        if norm1 == norm2:
            return True
        # Fuzzy matching for similar concepts
        similarity = fuzz.ratio(norm1, norm2) / 100.0
        return similarity > 0.7  # Lowered threshold for better matching

    def _normalize_concept(self, concept: str) -> str:
        """Normalize concept for comparison"""
        if not concept:
            return ""
        # Remove common variations
        normalized = concept.lower()
        normalized = re.sub(r'\s+', ' ', normalized)  # Normalize whitespace
        normalized = re.sub(r'[,\.]', '', normalized)  # Remove punctuation
        normalized = re.sub(r'\(.*?\)', '', normalized)  # Remove parenthetical
        normalized = re.sub(r'\bexpense\b', '', normalized)  # Remove 'expense' suffix
        normalized = re.sub(r'\bincome\b', '', normalized)  # Remove 'income' suffix for matching
        return normalized.strip()
class ReferenceOrderingStrategy:
    """Derive a concept ordering from a reference statement."""

    def establish_reference_order(self, statements: List[Dict]) -> Dict[str, float]:
        """
        Build an ordering map from the best available statement.

        The first statement is taken as the reference (statements are ordered
        newest first). Each item's index becomes its position, keyed by both
        its concept ID and its display label so either can be looked up.
        """
        if not statements:
            return {}
        ordering: Dict[str, float] = {}
        reference_items = statements[0].get('data', [])
        for position, entry in enumerate(reference_items):
            concept_id = entry.get('concept')
            if not concept_id:
                # Items lacking a concept contribute nothing (labels included).
                continue
            ordering[concept_id] = float(position)
            display_label = entry.get('label')
            if display_label:
                ordering[display_label] = float(position)
        return ordering
class SemanticPositioning:
"""Position concepts based on financial statement semantics"""
    def __init__(self, statement_type: str):
        """
        Args:
            statement_type: Statement type this positioner serves
                (e.g. "IncomeStatement", "BalanceSheet").
        """
        self.statement_type = statement_type
        # Pre-computed per-section fallback positions for this statement type.
        self.section_defaults = self._get_section_defaults()
def _get_section_defaults(self) -> Dict[str, float]:
"""Default positions for each section when no other guidance available"""
if self.statement_type == "IncomeStatement":
return {
"revenue": 50.0,
"cost": 150.0,
"gross_profit": 250.0,
"expense": 350.0,
"operating_income": 450.0,
"non_operating": 550.0,
"pretax_income": 650.0,
"tax": 750.0,
"net_income": 850.0,
"per_share": 950.0
}
elif self.statement_type == "BalanceSheet":
return {
"current_assets": 100.0,
"noncurrent_assets": 300.0,
"current_liabilities": 500.0,
"noncurrent_liabilities": 700.0,
"equity": 900.0
}
return {}
def infer_position(self, concept: str, existing_order: Dict[str, float]) -> float:
"""Infer semantic position for a new concept"""
# Rule-based positioning
section = self._classify_concept_section(concept)
if section:
return self._position_in_section(concept, section, existing_order)
# Parent-child relationship positioning
parent = self._find_parent_concept(concept, existing_order)
if parent:
return existing_order[parent] + 0.1 # Just after parent
# Similarity-based positioning
similar_concept = self._find_most_similar_concept(concept, existing_order)
if similar_concept:
return existing_order[similar_concept] + 0.1
# Default to end
return 999.0
def _classify_concept_section(self, concept: str) -> Optional[str]:
"""Classify concept into financial statement section"""
if not concept:
return None
concept_lower = concept.lower()
if self.statement_type == "IncomeStatement":
# Revenue indicators
if any(term in concept_lower for term in ['revenue', 'sales']) and not any(term in concept_lower for term in ['cost', 'expense']):
return "revenue"
# Cost indicators
elif any(term in concept_lower for term in ['cost of', 'cogs']):
return "cost"
# Gross profit
elif 'gross profit' in concept_lower or 'gross margin' in concept_lower:
return "gross_profit"
# Operating expenses
elif any(term in concept_lower for term in ['r&d', 'research', 'selling', 'administrative', 'marketing']) or ('expense' in concept_lower and 'tax' not in concept_lower):
return "expense"
# Operating income
elif 'operating income' in concept_lower or 'operating profit' in concept_lower:
return "operating_income"
# Non-operating
elif any(term in concept_lower for term in ['interest', 'other income', 'nonoperating']):
return "non_operating"
# Pre-tax income
elif 'before tax' in concept_lower or 'pretax' in concept_lower:
return "pretax_income"
# Tax
elif 'tax' in concept_lower and 'expense' in concept_lower:
return "tax"
# Net income
elif 'net income' in concept_lower or 'net earnings' in concept_lower:
return "net_income"
# Per share
elif any(term in concept_lower for term in ['per share', 'earnings per', 'shares outstanding']):
return "per_share"
elif self.statement_type == "BalanceSheet":
if any(term in concept_lower for term in ['cash', 'receivable', 'inventory', 'prepaid']) or ('current' in concept_lower and 'asset' in concept_lower):
return "current_assets"
elif any(term in concept_lower for term in ['property', 'equipment', 'goodwill', 'intangible']) or ('asset' in concept_lower and 'current' not in concept_lower):
return "noncurrent_assets"
elif any(term in concept_lower for term in ['payable', 'accrued']) or ('current' in concept_lower and 'liabilit' in concept_lower):
return "current_liabilities"
elif 'debt' in concept_lower or ('liabilit' in concept_lower and 'current' not in concept_lower):
return "noncurrent_liabilities"
elif any(term in concept_lower for term in ['equity', 'stock', 'retained earnings', 'capital']):
return "equity"
return None
def _position_in_section(self, concept: str, section: str, existing_order: Dict[str, float]) -> float:
"""Position concept within its identified section"""
section_concepts = [
(label, pos) for label, pos in existing_order.items()
if self._classify_concept_section(label) == section
]
if not section_concepts:
# Section doesn't exist yet - use template defaults
return self.section_defaults.get(section, 999.0)
# Find best position within section
section_concepts.sort(key=lambda x: x[1]) # Sort by position
# Simple strategy: place at end of section
last_pos = section_concepts[-1][1]
return last_pos + 0.1
def _find_parent_concept(self, concept: str, existing_order: Dict[str, float]) -> Optional[str]:
"""Find parent concept in hierarchy"""
if not concept:
return None
# Look for hierarchical relationships
# e.g., "Software Revenue" -> "Revenue"
concept_words = set(concept.lower().split())
candidates = []
for existing_concept in existing_order.keys():
if not existing_concept:
continue
existing_words = set(existing_concept.lower().split())
# Check if existing concept is a parent (subset of words)
# Also check for common patterns like "expense" being a parent of "X expense"
if (existing_words.issubset(concept_words) and len(existing_words) < len(concept_words)) or \
(existing_concept.lower() in concept.lower() and existing_concept.lower() != concept.lower()):
candidates.append((existing_concept, len(existing_words)))
if candidates:
# Return the most specific parent (most words in common)
return max(candidates, key=lambda x: x[1])[0]
return None
def _find_most_similar_concept(self, concept: str, existing_order: Dict[str, float]) -> Optional[str]:
"""Find most similar existing concept"""
if not concept:
return None
best_match = None
best_similarity = 0.0
for existing_concept in existing_order.keys():
if not existing_concept:
continue
similarity = fuzz.ratio(concept.lower(), existing_concept.lower()) / 100.0
if similarity > best_similarity and similarity > 0.5: # Minimum threshold
best_similarity = similarity
best_match = existing_concept
return best_match
class StatementOrderingManager:
    """Manages consistent ordering across multi-period statements.

    Combines four strategies in decreasing priority: canonical template
    positions, the row order of the most recent (reference) statement,
    semantic inference for orphan concepts, and a final consolidation pass
    that keeps template sections contiguous.
    """

    def __init__(self, statement_type: str):
        # statement_type selects the template and semantics, e.g. "IncomeStatement".
        self.statement_type = statement_type
        self.templates = FinancialStatementTemplates()
        self.reference_strategy = ReferenceOrderingStrategy()
        self.semantic_positioning = SemanticPositioning(statement_type)

    def determine_ordering(self, statements: List[Dict]) -> Dict[str, float]:
        """
        Determine unified ordering for all concepts across statements.
        Returns:
            Dict mapping concept -> sort_key (float for interpolation)
        """
        if not statements:
            return {}
        all_concepts = self._extract_all_concepts(statements)
        # Strategy 1: Template-based ordering (highest priority)
        template_positioned = self._apply_template_ordering(all_concepts, statements)
        # Strategy 2: Reference statement ordering for non-template items
        reference_positioned = self._apply_reference_ordering(
            all_concepts, statements, template_positioned
        )
        # Strategy 3: Semantic positioning for orphan concepts
        semantic_positioned = self._apply_semantic_positioning(
            all_concepts, template_positioned, reference_positioned
        )
        # Strategy 4: Section-aware consolidation to maintain template groupings
        final_ordering = self._consolidate_section_ordering(
            semantic_positioned, template_positioned, statements
        )
        return final_ordering

    def _extract_all_concepts(self, statements: List[Dict]) -> set:
        """Extract all unique concepts from statements.

        Both concept IDs and display labels are collected because downstream
        stitching may key rows by either one.
        """
        all_concepts = set()
        for statement in statements:
            for item in statement.get('data', []):
                concept = item.get('concept')
                label = item.get('label')
                if concept:
                    all_concepts.add(concept)
                if label:
                    all_concepts.add(label)
        return all_concepts

    def _apply_template_ordering(self, concepts: set, statements: List[Dict]) -> Dict[str, float]:
        """Apply template-based ordering for known concepts using concept-first matching"""
        template_order = {}
        # Build a mapping of concepts/labels to their actual XBRL concepts for better matching
        concept_to_xbrl = {}
        label_to_xbrl = {}
        for statement in statements:
            for item in statement.get('data', []):
                concept = item.get('concept')
                label = item.get('label')
                if concept and label:
                    concept_to_xbrl[concept] = concept
                    label_to_xbrl[label] = concept
                elif concept:
                    concept_to_xbrl[concept] = concept
        # Apply template ordering with concept priority
        for concept_or_label in concepts:
            # Determine if this is a concept or label
            is_concept = concept_or_label in concept_to_xbrl
            is_label = concept_or_label in label_to_xbrl
            # Get the actual XBRL concept and label for this item
            if is_concept:
                xbrl_concept = concept_or_label
                # Try to find the corresponding label
                corresponding_label = None
                for stmt in statements:
                    for item in stmt.get('data', []):
                        if item.get('concept') == concept_or_label:
                            corresponding_label = item.get('label')
                            break
                    if corresponding_label:
                        break
            elif is_label:
                xbrl_concept = label_to_xbrl.get(concept_or_label)
                corresponding_label = concept_or_label
            else:
                # Neither concept nor label found in mappings
                xbrl_concept = None
                corresponding_label = concept_or_label
            # Try concept-based matching first, then label-based
            template_pos = self.templates.get_template_position(
                item_concept=xbrl_concept,
                item_label=corresponding_label,
                statement_type=self.statement_type
            )
            if template_pos is not None:
                template_order[concept_or_label] = template_pos
                # IMPORTANT: If we found a template position for a concept,
                # also apply it to the corresponding label (and vice versa)
                # This ensures consistent ordering regardless of whether the
                # stitcher uses concept or label as the key
                if is_concept and corresponding_label and corresponding_label in concepts:
                    template_order[corresponding_label] = template_pos
                elif is_label and xbrl_concept and xbrl_concept in concepts:
                    template_order[xbrl_concept] = template_pos
        return template_order

    def _apply_reference_ordering(self, concepts: set, statements: List[Dict],
                                  template_positioned: Dict[str, float]) -> Dict[str, float]:
        """Apply reference statement ordering for remaining concepts.

        Template positions win; the reference statement only fills gaps.
        """
        reference_order = self.reference_strategy.establish_reference_order(statements)
        combined_order = template_positioned.copy()
        for concept in concepts:
            if concept not in combined_order and concept in reference_order:
                combined_order[concept] = reference_order[concept]
        return combined_order

    def _apply_semantic_positioning(self, concepts: set, template_positioned: Dict[str, float],
                                    reference_positioned: Dict[str, float]) -> Dict[str, float]:
        """Apply semantic positioning for orphan concepts"""
        final_order = reference_positioned.copy()
        # Position remaining concepts using semantic rules
        for concept in concepts:
            if concept not in final_order:
                semantic_pos = self.semantic_positioning.infer_position(concept, final_order)
                final_order[concept] = semantic_pos
        return final_order

    def _consolidate_section_ordering(self, semantic_positioned: Dict[str, float],
                                      template_positioned: Dict[str, float],
                                      statements: List[Dict]) -> Dict[str, float]:
        """
        Consolidate ordering to maintain template section groupings.
        This prevents reference ordering from breaking up logical template sections
        like per-share data (EPS + Shares Outstanding).
        """
        # Identify template sections and their concepts
        template_sections = self._identify_template_sections(template_positioned)
        # Separate template-positioned from non-template items
        template_items = {}
        non_template_items = {}
        for concept, position in semantic_positioned.items():
            if concept in template_positioned:
                template_items[concept] = position
            else:
                non_template_items[concept] = position
        # Re-organize to ensure section integrity
        final_ordering = {}
        # Process template sections in order
        for section_name, section_concepts in template_sections.items():
            # Find all template items (concepts and labels) that belong to this section
            section_template_items = []
            for concept in section_concepts:
                if concept in template_items:
                    section_template_items.append(concept)
            # Also find labels that correspond to concepts in this section
            # by checking if any template_items have the same template position
            section_template_positions = set()
            for concept in section_concepts:
                if concept in template_positioned:
                    section_template_positions.add(template_positioned[concept])
            # Find labels that have the same template positions as section concepts
            for item, pos in template_items.items():
                if pos in section_template_positions and item not in section_template_items:
                    section_template_items.append(item)
            if section_template_items:
                # Use the template base position for this section to ensure strong grouping
                section_base_pos = self._get_section_base_position(section_name)
                # For critical sections like per_share, use an even stronger override
                if section_name == "per_share":
                    # Force per-share items to be at the very end, regardless of hierarchy
                    section_base_pos = 950.0
                # Ensure all items in this section stay grouped together
                # (0.1 increments preserve their relative order inside the section).
                for i, item in enumerate(sorted(section_template_items,
                                                key=lambda x: template_items.get(x, 999.0))):
                    final_ordering[item] = section_base_pos + i * 0.1
        # Add non-template items, adjusting positions to avoid breaking template sections
        section_ranges = self._get_section_ranges(final_ordering, template_sections)
        for concept, position in non_template_items.items():
            # Find appropriate insertion point that doesn't break template sections
            adjusted_position = self._find_insertion_point(position, section_ranges)
            final_ordering[concept] = adjusted_position
        return final_ordering

    def _get_section_base_position(self, section_name: str) -> float:
        """Get the base position for a template section"""
        if self.statement_type == "IncomeStatement":
            template = self.templates.INCOME_STATEMENT_TEMPLATE
        elif self.statement_type == "BalanceSheet":
            template = self.templates.BALANCE_SHEET_TEMPLATE
        else:
            return 999.0
        # Template rows are (base_position, section_name, concepts) tuples.
        for base_pos, name, _concepts in template:
            if name == section_name:
                return float(base_pos)
        return 999.0

    def _identify_template_sections(self, template_positioned: Dict[str, float]) -> Dict[str, List[str]]:
        """Identify which concepts belong to which template sections"""
        sections = {}
        # Get the template for this statement type
        if self.statement_type == "IncomeStatement":
            template = self.templates.INCOME_STATEMENT_TEMPLATE
        elif self.statement_type == "BalanceSheet":
            template = self.templates.BALANCE_SHEET_TEMPLATE
        else:
            return {}
        # Build mapping of concepts to sections
        for _base_pos, section_name, template_concepts in template:
            section_concepts = []
            for concept in template_positioned.keys():
                # Check if this concept matches any template concept in this section
                for template_concept in template_concepts:
                    if self._concept_matches_template(concept, template_concept):
                        section_concepts.append(concept)
                        break
            if section_concepts:
                sections[section_name] = section_concepts
        return sections

    def _concept_matches_template(self, concept: str, template_concept: str) -> bool:
        """Check if a concept matches a template concept"""
        # For XBRL concepts, do direct comparison
        if ':' in template_concept or '_gaap_' in template_concept.lower():
            return self._normalize_xbrl_concept(concept) == self._normalize_xbrl_concept(template_concept)
        # For labels, use fuzzy matching
        return self._labels_match(concept, template_concept)

    def _get_section_ranges(self, final_ordering: Dict[str, float],
                            template_sections: Dict[str, List[str]]) -> List[Tuple[float, float, str]]:
        """Get the position ranges occupied by each template section"""
        ranges = []
        for section_name, concepts in template_sections.items():
            section_positions = [final_ordering[c] for c in concepts if c in final_ordering]
            if section_positions:
                min_pos = min(section_positions)
                max_pos = max(section_positions)
                ranges.append((min_pos, max_pos, section_name))
        return sorted(ranges)

    def _find_insertion_point(self, desired_position: float,
                              section_ranges: List[Tuple[float, float, str]]) -> float:
        """Find appropriate insertion point that doesn't break template sections"""
        # Check if desired position conflicts with any template section
        for min_pos, max_pos, section_name in section_ranges:
            if min_pos <= desired_position <= max_pos:
                # Position conflicts with a template section
                # Place it just before the section (unless it should logically be after)
                # Special handling for per-share section
                # NOTE(review): the per_share branch below is unreachable —
                # the outer check guarantees desired_position >= min_pos, so
                # conflicting items always land after the section. Confirm
                # whether "before per-share" placement was intended.
                if section_name == "per_share" and desired_position < min_pos:
                    # Items that should come before per-share data
                    return min_pos - 1.0
                else:
                    # Place after the section
                    return max_pos + 1.0
        # No conflicts, use desired position
        return desired_position

    def _normalize_xbrl_concept(self, concept: str) -> str:
        """Delegate to templates class for concept normalization"""
        return self.templates._normalize_xbrl_concept(concept)

    def _labels_match(self, label1: str, label2: str) -> bool:
        """Delegate to templates class for label matching"""
        return self.templates._labels_match(label1, label2)

View File

@@ -0,0 +1,547 @@
"""
XBRL Statement Stitching - Period Optimization (Refactored)
This module provides functionality to determine optimal periods for stitching
statements across multiple XBRL filings, handling period selection and
fiscal period matching.
Refactored to use a clean class-based architecture for better maintainability,
testability, and extensibility.
"""
import logging
from dataclasses import dataclass
from datetime import date
from typing import Any, Dict, List, Optional, Tuple
from edgar.xbrl.core import format_date, parse_date
from edgar.xbrl.xbrl import XBRL
logger = logging.getLogger(__name__)
@dataclass
class PeriodSelectionConfig:
    """Configuration for period selection behavior"""
    # Duration ranges for different period types, expressed as
    # (min_days, max_days) windows used to classify a period's length.
    annual_duration_range: Tuple[int, int] = (350, 380)
    quarterly_duration_range: Tuple[int, int] = (80, 100)
    q2_ytd_range: Tuple[int, int] = (175, 190)
    q3_ytd_range: Tuple[int, int] = (260, 285)
    q4_annual_range: Tuple[int, int] = (350, 380)
    # Target durations for optimization - candidates are ranked by how close
    # their length (in days) is to these ideals.
    target_annual_days: int = 365
    target_quarterly_days: int = 90
    target_q2_ytd_days: int = 180
    target_q3_ytd_days: int = 270
    # Behavior flags
    # NOTE(review): require_exact_matches and allow_fallback_when_no_doc_date
    # are not read by the classes in this module — presumably consumed
    # elsewhere; verify before relying on them.
    require_exact_matches: bool = True
    allow_fallback_when_no_doc_date: bool = True
    # Default cap on the number of periods returned by the optimizer.
    max_periods_default: int = 8
class PeriodMatcher:
    """Exact-match and duration-window period matching."""

    def __init__(self, config: PeriodSelectionConfig):
        self.config = config

    def find_exact_instant_match(self, periods: List[Dict], target_date: date) -> Optional[Dict]:
        """Return the first instant period whose date equals target_date, else None.

        Periods whose 'date' cannot be parsed are logged and skipped.
        """
        for candidate in periods:
            try:
                if parse_date(candidate['date']) == target_date:
                    return candidate
            except (ValueError, TypeError) as e:
                logger.warning("Failed to parse period date '%s': %s", candidate.get('date'), e)
        return None

    def find_exact_duration_match(self, periods: List[Dict], target_date: date) -> Optional[Dict]:
        """Return the first duration period ending exactly on target_date, else None.

        Periods whose 'end_date' cannot be parsed are logged and skipped.
        """
        for candidate in periods:
            try:
                if parse_date(candidate['end_date']) == target_date:
                    return candidate
            except (ValueError, TypeError) as e:
                logger.warning("Failed to parse period end date '%s': %s", candidate.get('end_date'), e)
        return None

    def filter_by_duration_range(self, periods: List[Dict], min_days: int, max_days: int, target_days: int) -> List[Dict]:
        """Keep periods whose duration lies in [min_days, max_days].

        Periods missing 'duration_days' get it computed from their
        start/end dates (on a copy, leaving the input untouched); periods
        whose dates cannot be parsed are dropped. The result is sorted by
        proximity to target_days.
        """
        in_range: List[Dict] = []
        for candidate in periods:
            days = candidate.get('duration_days')
            if days is None:
                try:
                    begin = parse_date(candidate['start_date'])
                    finish = parse_date(candidate['end_date'])
                    days = (finish - begin).days
                    candidate = candidate.copy()
                    candidate['duration_days'] = days
                except (ValueError, TypeError) as e:
                    logger.warning("Failed to calculate duration for period: %s", e)
                    continue
            if min_days <= days <= max_days:
                in_range.append(candidate)
        in_range.sort(key=lambda p: abs(p['duration_days'] - target_days))
        return in_range
class FiscalPeriodClassifier:
    """Classifies periods (annual / quarterly / YTD) from their duration in days."""

    def __init__(self, config: PeriodSelectionConfig):
        self.config = config

    def _filter_and_rank(self, periods: List[Dict], bounds: Tuple[int, int], target_days: int) -> List[Dict]:
        """Keep periods whose duration_days fall within bounds, ranked by closeness to target_days."""
        low, high = bounds
        matches = [p for p in periods if low <= p.get('duration_days', 0) <= high]
        matches.sort(key=lambda p: abs(p.get('duration_days', 0) - target_days))
        return matches

    def classify_annual_periods(self, periods: List[Dict]) -> List[Dict]:
        """Identify annual periods (roughly a full fiscal year long)."""
        return self._filter_and_rank(
            periods, self.config.annual_duration_range, self.config.target_annual_days
        )

    def classify_quarterly_periods(self, periods: List[Dict]) -> List[Dict]:
        """Identify quarterly periods (roughly one quarter long)."""
        return self._filter_and_rank(
            periods, self.config.quarterly_duration_range, self.config.target_quarterly_days
        )

    def classify_ytd_periods(self, periods: List[Dict], fiscal_period: str) -> List[Dict]:
        """Identify year-to-date periods appropriate for the given fiscal quarter.

        Only Q2/Q3/Q4 have meaningful YTD periods (Q4 YTD being the full
        year); any other fiscal period yields an empty list.
        """
        ytd_specs = {
            'Q2': (self.config.q2_ytd_range, self.config.target_q2_ytd_days),
            'Q3': (self.config.q3_ytd_range, self.config.target_q3_ytd_days),
            'Q4': (self.config.q4_annual_range, self.config.target_annual_days),
        }
        spec = ytd_specs.get(fiscal_period)
        if spec is None:
            return []
        bounds, target_days = spec
        return self._filter_and_rank(periods, bounds, target_days)

    def get_expected_durations(self, fiscal_period: str) -> Dict[str, Tuple[int, int]]:
        """Return the duration range(s) expected for the given fiscal period.

        FY maps to an annual range only; Q1-Q4 map to a quarterly range,
        with Q2/Q3/Q4 additionally carrying a YTD range. Unknown fiscal
        periods yield an empty dict.
        """
        if fiscal_period == 'FY':
            return {'annual': self.config.annual_duration_range}
        if fiscal_period not in ('Q1', 'Q2', 'Q3', 'Q4'):
            return {}
        expected = {'quarterly': self.config.quarterly_duration_range}
        ytd_ranges = {
            'Q2': self.config.q2_ytd_range,
            'Q3': self.config.q3_ytd_range,
            'Q4': self.config.q4_annual_range,
        }
        if fiscal_period in ytd_ranges:
            expected['ytd'] = ytd_ranges[fiscal_period]
        return expected
class StatementTypeSelector:
    """Handles statement-specific period selection logic.

    Balance sheets use instant periods; income statements and cash flow
    statements use duration periods, optionally paired with a YTD period
    for quarterly filings.
    """

    def __init__(self, matcher: PeriodMatcher, classifier: FiscalPeriodClassifier):
        self.matcher = matcher
        self.classifier = classifier

    def select_balance_sheet_periods(self, xbrl: XBRL, doc_period_end_date: Optional[date]) -> List[Dict]:
        """Select instant periods for balance sheets.

        When the filing declares a document_period_end_date, only an exact
        match is accepted (empty list otherwise) so a prior fiscal year's
        balance date is never picked up by accident.
        """
        # Filter for instant periods only
        instant_periods = [p for p in xbrl.reporting_periods if p['type'] == 'instant']
        if not instant_periods:
            return []
        # If we have document_period_end_date, find exact match
        if doc_period_end_date:
            exact_match = self.matcher.find_exact_instant_match(instant_periods, doc_period_end_date)
            if exact_match:
                return [exact_match]
            else:
                # No exact match found - don't use fallback to prevent fiscal year boundary issues
                logger.info("No exact instant period match found for %s", doc_period_end_date)
                return []
        # No document_period_end_date available - use most recent period
        # (string sort; period dates are ISO-formatted so this orders correctly)
        instant_periods.sort(key=lambda x: x['date'], reverse=True)
        return [instant_periods[0]]

    def select_income_statement_periods(self, xbrl: XBRL, doc_period_end_date: Optional[date],
                                        fiscal_period: str) -> List[Dict]:
        """Select duration periods for income statements"""
        return self._select_duration_periods(xbrl, doc_period_end_date, fiscal_period)

    def select_cash_flow_periods(self, xbrl: XBRL, doc_period_end_date: Optional[date],
                                 fiscal_period: str) -> List[Dict]:
        """Select duration periods for cash flow statements"""
        return self._select_duration_periods(xbrl, doc_period_end_date, fiscal_period)

    def _select_duration_periods(self, xbrl: XBRL, doc_period_end_date: Optional[date],
                                 fiscal_period: str) -> List[Dict]:
        """Common logic for selecting duration periods.

        Each duration period is enriched (on a copy) with 'duration_days';
        then either an exact end-date match against document_period_end_date
        is required, or heuristics are used when the filing omits that date.
        """
        # Filter for duration periods only
        duration_periods = [p for p in xbrl.reporting_periods if p['type'] == 'duration']
        if not duration_periods:
            return []
        # Add duration_days to all periods
        enriched_periods = []
        for period in duration_periods:
            try:
                start_date = parse_date(period['start_date'])
                end_date = parse_date(period['end_date'])
                period_copy = period.copy()
                period_copy['duration_days'] = (end_date - start_date).days
                enriched_periods.append(period_copy)
            except (ValueError, TypeError) as e:
                # Unparseable periods are dropped rather than guessed at.
                logger.warning("Failed to parse period dates: %s", e)
                continue
        if not enriched_periods:
            return []
        # If we have document_period_end_date, find periods that end exactly on that date
        if doc_period_end_date:
            matching_periods = []
            for period in enriched_periods:
                try:
                    end_date = parse_date(period['end_date'])
                    if end_date == doc_period_end_date:
                        matching_periods.append(period)
                except (ValueError, TypeError):
                    continue
            if matching_periods:
                return self._select_appropriate_durations(matching_periods, fiscal_period)
            else:
                # No exact match found - don't use fallback
                logger.info("No exact duration period match found for %s", doc_period_end_date)
                return []
        # No document_period_end_date - use fallback logic
        return self._select_fallback_periods(enriched_periods, fiscal_period)

    def _select_appropriate_durations(self, periods: List[Dict], fiscal_period: str) -> List[Dict]:
        """Select appropriate duration periods based on fiscal period.

        Annual filings yield a single annual period; quarterly filings yield
        the best quarter plus, for Q2-Q4, the matching YTD period.
        """
        selected_periods = []
        is_annual = fiscal_period == 'FY'
        if is_annual:
            # For annual reports, select annual periods
            annual_periods = self.classifier.classify_annual_periods(periods)
            if annual_periods:
                selected_periods.append(annual_periods[0])
        else:
            # For quarterly reports, select quarterly period
            quarterly_periods = self.classifier.classify_quarterly_periods(periods)
            if quarterly_periods:
                selected_periods.append(quarterly_periods[0])
            # Also select YTD period if appropriate
            ytd_periods = self.classifier.classify_ytd_periods(periods, fiscal_period)
            if ytd_periods:
                selected_periods.append(ytd_periods[0])
        return selected_periods

    def _select_fallback_periods(self, periods: List[Dict], fiscal_period: str) -> List[Dict]:
        """Fallback period selection when no document_period_end_date is available"""
        is_annual = fiscal_period == 'FY'
        if is_annual:
            # For annual reports, prefer periods closest to 365 days
            annual_periods = self.classifier.classify_annual_periods(periods)
            if annual_periods:
                # Sort by end date and take the most recent
                annual_periods.sort(key=lambda x: x['end_date'], reverse=True)
                return [annual_periods[0]]
        else:
            # For quarterly reports, prefer quarterly duration
            quarterly_periods = self.classifier.classify_quarterly_periods(periods)
            selected_periods = []
            if quarterly_periods:
                quarterly_periods.sort(key=lambda x: x['end_date'], reverse=True)
                selected_periods.append(quarterly_periods[0])
            # Add YTD period if available
            ytd_periods = self.classifier.classify_ytd_periods(periods, fiscal_period)
            if ytd_periods:
                ytd_periods.sort(key=lambda x: x['end_date'], reverse=True)
                selected_periods.append(ytd_periods[0])
            return selected_periods
        # If no appropriate periods found, return the most recent period
        # (only reached for annual filings with no annual-length period;
        # note this sorts the caller's list in place).
        periods.sort(key=lambda x: x['end_date'], reverse=True)
        return [periods[0]]
class PeriodMetadataEnricher:
    """Attaches filing context and parsed date fields to selected periods."""

    def enrich_period_metadata(self, period: Dict, xbrl_index: int, entity_info: Dict,
                               doc_period_end_date: Optional[date], fiscal_period: str,
                               fiscal_year: str) -> Dict[str, Any]:
        """Build an enriched metadata dict for one period.

        Instant periods gain parsed 'date' and 'display_date' fields;
        duration periods gain 'start_date', 'end_date', 'duration_days'
        (computed when the period lacks it) and 'display_date'.
        """
        enriched: Dict[str, Any] = {
            'xbrl_index': xbrl_index,
            'period_key': period['key'],
            'period_label': period['label'],
            'period_type': period['type'],
            'entity_info': entity_info,
            'doc_period_end_date': doc_period_end_date,
            'fiscal_period': fiscal_period,
            'fiscal_year': fiscal_year,
        }
        if period['type'] == 'instant':
            instant = parse_date(period['date'])
            enriched['date'] = instant
            enriched['display_date'] = format_date(instant)
        else:
            # Duration period: parse both endpoints.
            start = parse_date(period['start_date'])
            end = parse_date(period['end_date'])
            enriched['start_date'] = start
            enriched['end_date'] = end
            enriched['duration_days'] = period.get('duration_days', (end - start).days)
            enriched['display_date'] = format_date(end)
        return enriched
class PeriodDeduplicator:
    """Deduplicates, sorts, and caps enriched period lists."""

    def deduplicate_periods(self, periods: List[Dict], statement_type: str) -> List[Dict]:
        """Drop periods sharing a period_type and exact date with an earlier-kept period.

        Earlier entries win, so callers should sort newest-first before
        deduplicating.
        """
        kept: List[Dict] = []
        for candidate in periods:
            if not self._is_duplicate(candidate, kept):
                kept.append(candidate)
        return kept

    def _is_duplicate(self, candidate: Dict, kept: List[Dict]) -> bool:
        """True when candidate matches the type and date of any already-kept period."""
        # Instant periods compare on 'date'; duration periods on 'end_date'.
        date_field = 'date' if candidate['period_type'] == 'instant' else 'end_date'
        for existing in kept:
            if existing['period_type'] != candidate['period_type']:
                continue
            if candidate[date_field] == existing[date_field]:
                return True
        return False

    def sort_periods_chronologically(self, periods: List[Dict], statement_type: str) -> List[Dict]:
        """Return periods newest-first, by 'date' for balance sheets, else 'end_date'."""
        sort_field = 'date' if statement_type == 'BalanceSheet' else 'end_date'
        return sorted(periods, key=lambda p: p[sort_field], reverse=True)

    def limit_periods(self, periods: List[Dict], max_periods: int) -> List[Dict]:
        """Truncate the list to at most max_periods entries."""
        return periods[:max_periods] if len(periods) > max_periods else periods
class PeriodOptimizer:
"""Main orchestrator for period optimization"""
def __init__(self, config: Optional[PeriodSelectionConfig] = None):
self.config = config or PeriodSelectionConfig()
self.matcher = PeriodMatcher(self.config)
self.classifier = FiscalPeriodClassifier(self.config)
self.selector = StatementTypeSelector(self.matcher, self.classifier)
self.enricher = PeriodMetadataEnricher()
self.deduplicator = PeriodDeduplicator()
def determine_optimal_periods(self, xbrl_list: List[XBRL], statement_type: str,
max_periods: Optional[int] = None) -> List[Dict[str, Any]]:
"""Main entry point - orchestrates the entire process"""
max_periods = max_periods or self.config.max_periods_default
# Step 1: Extract periods from all XBRLs
all_periods = self._extract_all_periods(xbrl_list, statement_type)
# Step 2: Enrich with metadata
enriched_periods = self._enrich_with_metadata(all_periods)
# Step 3: Deduplicate, sort, and limit
final_periods = self._deduplicate_and_limit(enriched_periods, max_periods, statement_type)
return final_periods
def _extract_all_periods(self, xbrl_list: List[XBRL], statement_type: str) -> List[Dict[str, Any]]:
"""Extract periods from all XBRL objects"""
all_periods = []
for i, xbrl in enumerate(xbrl_list):
# Skip None XBRLs (pre-XBRL era filings before 2009)
if xbrl is None:
continue
# Skip XBRLs with no reporting periods
if not xbrl.reporting_periods:
continue
entity_info = xbrl.entity_info or {}
doc_period_end_date = self._parse_document_period_end_date(entity_info)
fiscal_period = entity_info.get('fiscal_period')
fiscal_year = entity_info.get('fiscal_year')
# Select appropriate periods based on statement type
selected_periods = self._select_periods_for_statement_type(
xbrl, statement_type, doc_period_end_date, fiscal_period
)
# Add context information to each period
for period in selected_periods:
period_with_context = {
'period': period,
'xbrl_index': i,
'entity_info': entity_info,
'doc_period_end_date': doc_period_end_date,
'fiscal_period': fiscal_period,
'fiscal_year': fiscal_year
}
all_periods.append(period_with_context)
return all_periods
def _parse_document_period_end_date(self, entity_info: Dict) -> Optional[date]:
"""Parse document_period_end_date from entity_info"""
if 'document_period_end_date' not in entity_info:
return None
try:
doc_period_end_date = entity_info['document_period_end_date']
if not isinstance(doc_period_end_date, date):
doc_period_end_date = parse_date(str(doc_period_end_date))
return doc_period_end_date
except (ValueError, TypeError) as e:
logger.warning("Failed to parse document_period_end_date: %s", e)
return None
def _select_periods_for_statement_type(self, xbrl: XBRL, statement_type: str,
                                       doc_period_end_date: Optional[date],
                                       fiscal_period: str) -> List[Dict]:
    """Route period selection to the selector method matching the statement type.

    Balance sheets have their own (instant-based) selection, cash flow
    statements their own duration logic; income statements — and any
    unrecognized statement type — use the income-statement selection rules.
    """
    if statement_type == 'BalanceSheet':
        return self.selector.select_balance_sheet_periods(xbrl, doc_period_end_date)
    if statement_type == 'CashFlowStatement':
        return self.selector.select_cash_flow_periods(xbrl, doc_period_end_date, fiscal_period)
    # IncomeStatement and all other statement types share the same default logic.
    return self.selector.select_income_statement_periods(xbrl, doc_period_end_date, fiscal_period)
def _enrich_with_metadata(self, all_periods: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Run every period context through the enricher and return the enriched metadata."""
    return [
        self.enricher.enrich_period_metadata(
            ctx['period'],
            ctx['xbrl_index'],
            ctx['entity_info'],
            ctx['doc_period_end_date'],
            ctx['fiscal_period'],
            ctx['fiscal_year'],
        )
        for ctx in all_periods
    ]
def _deduplicate_and_limit(self, periods: List[Dict[str, Any]], max_periods: int,
                           statement_type: str) -> List[Dict[str, Any]]:
    """Sort periods chronologically, drop duplicates, and cap the count.

    Order matters: sorting first means deduplication keeps the preferred
    occurrence, and limiting last keeps the most relevant periods.
    """
    ordered = self.deduplicator.sort_periods_chronologically(periods, statement_type)
    unique = self.deduplicator.deduplicate_periods(ordered, statement_type)
    return self.deduplicator.limit_periods(unique, max_periods)
# Main function that maintains the original API
def determine_optimal_periods(xbrl_list: List[XBRL], statement_type: str, max_periods: int = 8) -> List[Dict[str, Any]]:
    """
    Determine the optimal periods to display for stitched statements from a list of XBRL objects.

    Thin wrapper preserving the original module-level API: the actual analysis
    (period extraction, metadata enrichment, deduplication and limiting) is
    delegated to PeriodOptimizer.

    Args:
        xbrl_list: List of XBRL objects ordered chronologically
        statement_type: Type of statement ('BalanceSheet', 'IncomeStatement', etc.)
        max_periods: Maximum number of periods to return (default is 8)

    Returns:
        List of period metadata dictionaries containing information for display
    """
    return PeriodOptimizer().determine_optimal_periods(xbrl_list, statement_type, max_periods)

View File

@@ -0,0 +1,256 @@
"""
XBRL Presentation Tree - Virtual presentation tree for multi-period statements
This module creates a virtual presentation tree that preserves hierarchical
relationships while applying semantic ordering within sibling groups.
"""
from dataclasses import dataclass
from typing import Any, Dict, List, Optional
@dataclass
class PresentationNode:
    """A single line item in the virtual presentation tree.

    The dataclass fields capture concept identity and ordering hints; the
    tree links (children/parent) are created in __post_init__ so they stay
    out of the generated __init__/__eq__/__repr__.
    """
    concept: str
    label: str
    level: int
    metadata: Dict[str, Any]
    semantic_order: float = 999.0
    original_index: int = 999

    def __post_init__(self):
        # Tree structure is mutable runtime state, not part of the dataclass value.
        self.children: List[PresentationNode] = []
        self.parent: Optional[PresentationNode] = None

    def add_child(self, child: 'PresentationNode'):
        """Attach *child* beneath this node and wire its parent pointer."""
        child.parent = self
        self.children.append(child)

    def sort_children(self):
        """Recursively order each sibling group by (semantic_order, original_index)."""
        self.children.sort(key=lambda node: (node.semantic_order, node.original_index))
        for child in self.children:
            child.sort_children()

    def flatten_to_list(self) -> List['PresentationNode']:
        """Return this node followed by all descendants in depth-first order."""
        flattened: List['PresentationNode'] = [self]
        for child in self.children:
            flattened.extend(child.flatten_to_list())
        return flattened
class VirtualPresentationTree:
    """Builds and manages virtual presentation tree for stitched statements.

    The tree preserves the hierarchical (parent/child) structure implied by
    each concept's indentation level while re-ordering sibling groups by
    semantic position, so stitched multi-period statements render in a
    stable, template-like order.
    """

    def __init__(self, ordering_manager=None):
        # ordering_manager is kept for API compatibility; the tree consumes
        # the pre-computed concept_ordering passed to build_tree.
        self.ordering_manager = ordering_manager
        self.root_nodes: List[PresentationNode] = []
        self.all_nodes: Dict[str, PresentationNode] = {}

    def build_tree(self, concept_metadata: Dict, concept_ordering: Dict,
                   original_statement_order: List[str] = None) -> List[PresentationNode]:
        """
        Build presentation tree from concept metadata and ordering.

        Args:
            concept_metadata: Metadata for each concept including level
            concept_ordering: Semantic ordering positions
            original_statement_order: Original order of concepts for context

        Returns:
            Flattened list of nodes in correct presentation order
        """
        # Step 1: Create nodes for all concepts
        self._create_nodes(concept_metadata, concept_ordering, original_statement_order)
        # Step 2: Build parent-child relationships based on levels and context
        self._build_hierarchy(original_statement_order or [])
        # Step 3: Apply semantic ordering within sibling groups
        self._apply_semantic_ordering()
        # Step 4: Flatten tree to linear list
        return self._flatten_tree()

    def _create_nodes(self, concept_metadata: Dict, concept_ordering: Dict,
                      original_statement_order: List[str] = None):
        """Create a PresentationNode for every concept in the metadata."""
        self.all_nodes = {}
        for i, (concept, metadata) in enumerate(concept_metadata.items()):
            label = metadata.get('latest_label', concept)
            level = metadata.get('level', 0)
            # Ordering may be keyed by concept or by label; 999.0 pushes
            # concepts with no template position to the end.
            semantic_order = concept_ordering.get(concept, concept_ordering.get(label, 999.0))
            # Track original index for maintaining some original order context
            original_index = i
            if original_statement_order:
                try:
                    original_index = original_statement_order.index(concept)
                except ValueError:
                    try:
                        original_index = original_statement_order.index(label)
                    except ValueError:
                        original_index = i + 1000  # Place unknown concepts later
            node = PresentationNode(
                concept=concept,
                label=label,
                level=level,
                metadata=metadata,
                semantic_order=semantic_order,
                original_index=original_index
            )
            self.all_nodes[concept] = node

    def _build_hierarchy(self, original_order: List[str]):
        """Build parent-child relationships based on level progression and context."""
        # Reset roots so a repeated build_tree() on the same instance does not
        # accumulate stale nodes (mirrors the all_nodes reset in _create_nodes).
        self.root_nodes = []
        # Sort nodes by their original order to maintain context for hierarchy detection
        nodes_in_order = []
        # First, try to use original order if available
        if original_order:
            # Map concepts in original order (by concept id, falling back to label)
            concept_to_node = {node.concept: node for node in self.all_nodes.values()}
            label_to_node = {node.label: node for node in self.all_nodes.values()}
            for item in original_order:
                if item in concept_to_node:
                    nodes_in_order.append(concept_to_node[item])
                elif item in label_to_node:
                    nodes_in_order.append(label_to_node[item])
            # Add any remaining nodes not in original order
            remaining_nodes = [node for node in self.all_nodes.values()
                               if node not in nodes_in_order]
            remaining_nodes.sort(key=lambda x: x.original_index)
            nodes_in_order.extend(remaining_nodes)
        else:
            # Fall back to sorting by original index
            nodes_in_order = sorted(self.all_nodes.values(),
                                    key=lambda x: x.original_index)
        # Build hierarchy using a parent stack approach
        parent_stack = []  # Stack of potential parents at each level
        for node in nodes_in_order:
            current_level = node.level
            # Pop parents that are at the same level or deeper
            # We're looking for a parent at a level less than current
            while parent_stack and parent_stack[-1].level >= current_level:
                parent_stack.pop()
            if parent_stack:
                # Check if potential parent and child belong to compatible sections
                parent = parent_stack[-1]
                # Prevent cross-section hierarchies for critical sections like per_share
                should_be_child = self._should_be_hierarchical_child(parent, node)
                if should_be_child:
                    # Valid parent-child relationship
                    parent.add_child(node)
                else:
                    # Different sections - make this a root node instead
                    self.root_nodes.append(node)
            else:
                # No parent - this is a root node
                self.root_nodes.append(node)
            # This node could be a parent for subsequent nodes
            parent_stack.append(node)

    def _apply_semantic_ordering(self):
        """Apply semantic ordering within sibling groups."""
        # Sort root nodes by semantic order first, then original index
        self.root_nodes.sort(key=lambda x: (x.semantic_order, x.original_index))
        # Sort children within each parent recursively
        for root in self.root_nodes:
            root.sort_children()

    def _flatten_tree(self) -> List[PresentationNode]:
        """Flatten tree to linear list preserving hierarchy."""
        result = []
        for root in self.root_nodes:
            result.extend(root.flatten_to_list())
        return result

    def _should_be_hierarchical_child(self, parent: PresentationNode, child: PresentationNode) -> bool:
        """
        Determine if child should be hierarchically under parent based on semantic ordering.

        Prevents cross-section hierarchies that would break template section groupings.
        """
        # Get semantic ordering positions
        parent_order = parent.semantic_order
        child_order = child.semantic_order
        # If both have very specific semantic orders from templates (not defaults),
        # check if they're in similar ranges (same section)
        if parent_order < 900 and child_order < 900:
            # Both are template-positioned, check if they're in similar sections
            # Allow parent-child within 200 points (roughly same section)
            section_gap = abs(parent_order - child_order)
            if section_gap > 200:
                return False
        # Special case: Per-share items (900+) should never be children of early items
        if child_order >= 900 and parent_order < 800:
            return False
        # Special case: Non-operating items (500-599) should not be children of operating items
        if 500 <= child_order < 600 and parent_order < 500:
            return False
        # Special case: Revenue items should not be parents of per-share items
        if parent_order < 100 and child_order >= 900:
            return False
        # Check for semantic incompatibility based on labels
        child_label = child.label.lower()
        parent_label = parent.label.lower()
        # Per-share items should not be children of non-per-share items
        if any(term in child_label for term in ['earnings per share', 'shares outstanding']):
            if not any(term in parent_label for term in ['earnings', 'shares', 'per share']):
                return False
        # Interest expense items should not be children of non-interest items
        if 'interest expense' in child_label:
            if 'interest' not in parent_label and 'nonoperating' not in parent_label:
                return False
        # Otherwise, allow hierarchical relationship
        return True

    def debug_tree(self) -> str:
        """Generate a debug representation of the tree."""
        lines = []

        def _add_node_lines(node: PresentationNode, depth: int = 0):
            indent = "  " * depth
            lines.append(f"{indent}├─ {node.label} (level={node.level}, "
                         f"semantic={node.semantic_order:.1f}, orig={node.original_index})")
            for child in node.children:
                _add_node_lines(child, depth + 1)

        lines.append("Virtual Presentation Tree:")
        for root in self.root_nodes:
            _add_node_lines(root)
        return "\n".join(lines)

View File

@@ -0,0 +1,640 @@
"""
XBRL Statement Stitching - Query Functionality
This module provides query functionality for stitched XBRL facts, allowing
users to query standardized, multi-period financial data.
"""
import re
from collections import defaultdict
from typing import TYPE_CHECKING, Any, Dict, List, Optional
import pandas as pd
from rich import box
from rich.console import Group
from rich.markdown import Markdown
from rich.panel import Panel
from rich.table import Table
from rich.text import Text
from edgar.richtools import repr_rich
from edgar.xbrl.facts import FactQuery
if TYPE_CHECKING:
from edgar.xbrl.stitching.xbrls import XBRLS
class StitchedFactsView:
    """
    A view over stitched facts from multiple XBRL filings.

    This class extracts facts from stitched statements rather than raw XBRL facts,
    ensuring that queries operate on standardized, post-processed data.
    """

    def __init__(self, xbrls: 'XBRLS'):
        self.xbrls = xbrls
        # Cache of the last get_facts() result and the parameters that produced it.
        self._facts_cache = None
        self._last_cache_key = None

    def __len__(self):
        return len(self.get_facts())

    @property
    def entity_name(self):
        """Get entity name from the most recent XBRL filing."""
        if self.xbrls.xbrl_list:
            return getattr(self.xbrls.xbrl_list[0], 'entity_name', 'Unknown Entity')
        return 'Unknown Entity'

    @property
    def document_type(self):
        """Get document type from entity info."""
        return self.xbrls.entity_info.get('document_type', 'Multi-Period Stitched')

    def get_facts(self,
                  max_periods: int = 8,
                  standard: bool = True,
                  statement_types: Optional[List[str]] = None) -> List[Dict[str, Any]]:
        """
        Extract facts from stitched statements.

        Args:
            max_periods: Maximum periods to include
            standard: Whether to use standardized labels
            statement_types: List of statement types to include

        Returns:
            List of fact dictionaries with stitched/standardized data
        """
        # Create cache key from the parameters that affect the result
        cache_key = (max_periods, standard, tuple(statement_types or []))
        # Compare against None (not truthiness): a legitimately empty result
        # set must also be served from cache instead of being recomputed.
        if self._facts_cache is not None and self._last_cache_key == cache_key:
            return self._facts_cache
        statement_types = statement_types or [
            'IncomeStatement', 'BalanceSheet', 'CashFlowStatement',
            'StatementOfEquity', 'ComprehensiveIncome'
        ]
        all_facts = []
        for statement_type in statement_types:
            try:
                # Get stitched statement data (this applies standardization)
                stitched_data = self.xbrls.get_statement(
                    statement_type=statement_type,
                    max_periods=max_periods,
                    standard=standard
                )
                # Extract facts from stitched data
                facts = self._extract_facts_from_stitched_data(
                    stitched_data, statement_type
                )
                all_facts.extend(facts)
            except Exception:
                # Best effort: skip statements that can't be stitched
                continue
        # Cache results
        self._facts_cache = all_facts
        self._last_cache_key = cache_key
        return all_facts

    def _extract_facts_from_stitched_data(self,
                                          stitched_data: Dict[str, Any],
                                          statement_type: str) -> List[Dict[str, Any]]:
        """
        Convert stitched statement data back to fact-like records for querying.

        Args:
            stitched_data: Output from StatementStitcher
            statement_type: Type of statement

        Returns:
            List of fact dictionaries (one per concept/period pair with a value)
        """
        facts = []
        periods = stitched_data.get('periods', [])
        statement_data = stitched_data.get('statement_data', [])
        for item in statement_data:
            # Skip abstract items without values (headers/sections)
            if item.get('is_abstract', False) and not item.get('has_values', False):
                continue
            concept = item.get('concept', '')
            label = item.get('label', '')
            original_label = item.get('original_label', label)
            # Create a fact record for each period with data
            for period_id, value in item.get('values', {}).items():
                if value is None:
                    continue
                # Find period metadata
                period_info = self._get_period_info(period_id, periods)
                fact = {
                    # Core identification
                    'concept': concept,
                    'label': label,  # Standardized label
                    'original_label': original_label,  # Original company label
                    'statement_type': statement_type,
                    # Value information
                    'value': value,
                    'numeric_value': self._convert_to_numeric(value),
                    'decimals': item.get('decimals', {}).get(period_id, 0),
                    # Period information
                    'period_key': period_id,
                    'period_type': period_info.get('period_type', 'duration'),
                    'period_start': period_info.get('period_start'),
                    'period_end': period_info.get('period_end'),
                    'period_instant': period_info.get('period_instant'),
                    'period_label': period_info.get('period_label', ''),
                    # Statement context
                    'level': item.get('level', 0),
                    'is_abstract': item.get('is_abstract', False),
                    'is_total': item.get('is_total', False),
                    # Multi-filing context
                    'filing_count': len(self.xbrls.xbrl_list),
                    'standardized': True,  # Mark as coming from standardized data
                    # Source attribution (which XBRL filing this came from)
                    'source_filing_index': self._determine_source_filing(period_id),
                }
                # Add fiscal period info if available
                fiscal_info = self._extract_fiscal_info(period_id)
                fact.update(fiscal_info)
                facts.append(fact)
        return facts

    def _get_period_info(self, period_id: str, periods: List[tuple]) -> Dict[str, Any]:
        """Extract period metadata from period_id and periods list."""
        period_info = {}
        # Find matching period label
        for pid, label in periods:
            if pid == period_id:
                period_info['period_label'] = label
                break
        # Parse period_id to extract dates and type; ids look like
        # 'instant_YYYY-MM-DD' or 'duration_YYYY-MM-DD_YYYY-MM-DD'.
        if period_id.startswith('instant_'):
            period_info['period_type'] = 'instant'
            date_str = period_id.replace('instant_', '')
            period_info['period_instant'] = date_str
            period_info['period_end'] = date_str
        elif period_id.startswith('duration_'):
            period_info['period_type'] = 'duration'
            parts = period_id.replace('duration_', '').split('_')
            if len(parts) >= 2:
                period_info['period_start'] = parts[0]
                period_info['period_end'] = parts[1]
        return period_info

    def _convert_to_numeric(self, value: Any) -> Optional[float]:
        """Convert value to numeric if possible; returns None for non-numeric input."""
        if value is None:
            return None
        try:
            if isinstance(value, (int, float)):
                return float(value)
            if isinstance(value, str):
                # Remove commas and currency symbols, then try to convert
                cleaned = value.replace(',', '').replace('$', '').strip()
                return float(cleaned)
        except (ValueError, TypeError):
            pass
        return None

    def _determine_source_filing(self, period_id: str) -> Optional[int]:
        """Determine which filing this period came from."""
        # This would require enhanced tracking in the stitching process
        # For now, return None but this could be enhanced
        return None

    def _extract_fiscal_info(self, period_id: str) -> Dict[str, Any]:
        """Extract fiscal period and year information."""
        fiscal_info = {}
        # Try to extract fiscal info from entity_info of the relevant XBRL.
        # NOTE(review): uses the first (most recent) filing's entity info for
        # every period — a simplified approach that could be enhanced with
        # per-period source tracking.
        if self.xbrls.xbrl_list:
            entity_info = self.xbrls.xbrl_list[0].entity_info
            if entity_info:
                fiscal_info['fiscal_period'] = entity_info.get('fiscal_period')
                fiscal_info['fiscal_year'] = entity_info.get('fiscal_year')
        return fiscal_info

    def query(self, **kwargs) -> 'StitchedFactQuery':
        """Create a new query for stitched facts."""
        return StitchedFactQuery(self, **kwargs)
class StitchedFactQuery(FactQuery):
    """
    Enhanced fact query for stitched/standardized multi-filing data.

    Extends the base FactQuery with capabilities specific to multi-period,
    standardized financial data.
    """

    def __init__(self, stitched_facts_view: StitchedFactsView, **kwargs):
        # Initialize with stitched facts view instead of regular facts view
        self._stitched_facts_view = stitched_facts_view
        # Initialize base FactQuery attributes manually since we're not calling super().__init__
        self._facts_view = stitched_facts_view  # For compatibility with base class
        self._filters = []
        self._transformations = []
        self._aggregations = []
        self._include_dimensions = True
        self._include_contexts = True
        self._include_element_info = True
        self._sort_by = None
        self._sort_ascending = True
        self._limit = None
        self._statement_type = None
        # Multi-filing specific options
        self._cross_period_only = False
        self._trend_analysis = False
        self._require_all_periods = False
        # Store query-specific parameters for get_facts
        self._max_periods = kwargs.get('max_periods', 8)
        self._standard = kwargs.get('standard', True)
        self._statement_types = kwargs.get('statement_types', None)

    def __str__(self):
        return f"StitchedFactQuery(filters={len(self._filters)})"

    # Enhanced filtering methods for multi-filing scenarios
    def by_standardized_concept(self, concept_name: str) -> 'StitchedFactQuery':
        """
        Filter by standardized concept name (e.g., 'Revenue', 'Net Income').

        Matches an exact standardized label, or a case-insensitive substring
        of either the label or the underlying concept id.

        Args:
            concept_name: Standardized concept name

        Returns:
            Self for method chaining
        """
        # Query both the standardized label and original concept
        self._filters.append(
            lambda f: (f.get('label') == concept_name or
                       concept_name.lower() in f.get('label', '').lower() or
                       concept_name.lower() in f.get('concept', '').lower())
        )
        return self

    def by_original_label(self, pattern: str, exact: bool = False) -> 'StitchedFactQuery':
        """
        Filter by original company-specific labels before standardization.

        Args:
            pattern: Pattern to match against original labels
            exact: Whether to require exact match

        Returns:
            Self for method chaining
        """
        if exact:
            self._filters.append(lambda f: f.get('original_label') == pattern)
        else:
            # Case-insensitive regex search over the original label
            regex = re.compile(pattern, re.IGNORECASE)
            self._filters.append(
                lambda f: f.get('original_label') and regex.search(f['original_label'])
            )
        return self

    def across_periods(self, min_periods: int = 2) -> 'StitchedFactQuery':
        """
        Filter to concepts that appear across multiple periods.

        Args:
            min_periods: Minimum number of periods the concept must appear in

        Returns:
            Self for method chaining
        """
        self._cross_period_only = True
        # _min_periods is created here only; _filter_cross_period_concepts
        # reads it with getattr(..., 2) as a fallback.
        self._min_periods = min_periods
        return self

    def by_fiscal_period(self, fiscal_period: str) -> 'StitchedFactQuery':
        """
        Filter by fiscal period (FY, Q1, Q2, Q3, Q4).

        Args:
            fiscal_period: Fiscal period identifier

        Returns:
            Self for method chaining
        """
        self._filters.append(
            lambda f: f.get('fiscal_period') == fiscal_period
        )
        return self

    def by_filing_index(self, filing_index: int) -> 'StitchedFactQuery':
        """
        Filter facts by which filing they originated from.

        Args:
            filing_index: Index of the filing (0 = most recent)

        Returns:
            Self for method chaining
        """
        self._filters.append(
            lambda f: f.get('source_filing_index') == filing_index
        )
        return self

    def trend_analysis(self, concept: str) -> 'StitchedFactQuery':
        """
        Set up for trend analysis of a specific concept across periods.

        Args:
            concept: Concept to analyze trends for

        Returns:
            Self for method chaining
        """
        self._trend_analysis = True
        self.by_standardized_concept(concept)
        return self

    def complete_periods_only(self) -> 'StitchedFactQuery':
        """
        Only return concepts that have values in all available periods.

        Returns:
            Self for method chaining
        """
        self._require_all_periods = True
        return self

    def execute(self) -> List[Dict[str, Any]]:
        """
        Execute the query with enhanced multi-period processing.

        Processing order: base fact extraction, filters, transformations,
        aggregations, cross-period/complete-period filtering, trend
        preparation, sorting, then limiting.

        Returns:
            List of fact dictionaries
        """
        # Get base results from stitched facts with query parameters
        results = self._stitched_facts_view.get_facts(
            max_periods=self._max_periods,
            standard=self._standard,
            statement_types=self._statement_types
        )
        # Apply standard filters
        for filter_func in self._filters:
            results = [f for f in results if filter_func(f)]
        # Apply transformations (mutate 'value' in place)
        for transform_fn in self._transformations:
            for fact in results:
                if 'value' in fact and fact['value'] is not None:
                    fact['value'] = transform_fn(fact['value'])
        # Apply aggregations
        if self._aggregations:
            aggregated_results = {}
            for agg in self._aggregations:
                dimension = agg['dimension']
                func = agg['function']
                # Group facts by dimension
                groups = {}
                for fact in results:
                    dim_value = fact.get(f'dim_{dimension}')
                    if dim_value and 'value' in fact and fact['value'] is not None:
                        if dim_value not in groups:
                            groups[dim_value] = []
                        groups[dim_value].append(fact['value'])
                # Apply aggregation function
                for dim_value, values in groups.items():
                    agg_value = 0.0  # Initialize with default value
                    if func == 'sum':
                        agg_value = sum(values)
                    elif func == 'average':
                        agg_value = sum(values) / len(values)
                    key = (dimension, dim_value)
                    if key not in aggregated_results:
                        aggregated_results[key] = {'dimension': dimension, 'value': dim_value, 'values': {}}
                    aggregated_results[key]['values'][func] = agg_value
            # Aggregation replaces the fact records with grouped summaries
            results = list(aggregated_results.values())
        # Apply cross-period filtering if requested
        if self._cross_period_only:
            results = self._filter_cross_period_concepts(results)
        # Apply complete periods filtering if requested
        if self._require_all_periods:
            results = self._filter_complete_periods(results)
        # Apply trend analysis if requested
        if self._trend_analysis:
            results = self._prepare_trend_data(results)
        # Apply sorting if specified (only when the key exists in the records)
        if results and self._sort_by and self._sort_by in results[0]:
            results.sort(key=lambda f: f.get(self._sort_by, ''),
                         reverse=not self._sort_ascending)
        # Apply limit if specified
        if self._limit is not None:
            results = results[:self._limit]
        return results

    def _filter_cross_period_concepts(self, results: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Filter to concepts that appear in multiple periods."""
        # Count the distinct periods each (concept, label) pair occurs in
        concept_periods = defaultdict(set)
        for fact in results:
            concept_key = (fact.get('concept', ''), fact.get('label', ''))
            concept_periods[concept_key].add(fact.get('period_key', ''))
        # Filter to concepts with minimum period count
        valid_concepts = {
            concept for concept, periods in concept_periods.items()
            if len(periods) >= getattr(self, '_min_periods', 2)
        }
        return [
            fact for fact in results
            if (fact.get('concept', ''), fact.get('label', '')) in valid_concepts
        ]

    def _filter_complete_periods(self, results: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Filter to concepts that have values in all periods."""
        # Get all available periods
        all_periods = set(fact.get('period_key', '') for fact in results)
        concept_periods = defaultdict(set)
        for fact in results:
            concept_key = (fact.get('concept', ''), fact.get('label', ''))
            concept_periods[concept_key].add(fact.get('period_key', ''))
        # Filter to concepts with complete period coverage
        complete_concepts = {
            concept for concept, periods in concept_periods.items()
            if periods == all_periods
        }
        return [
            fact for fact in results
            if (fact.get('concept', ''), fact.get('label', '')) in complete_concepts
        ]

    def _prepare_trend_data(self, results: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Prepare data for trend analysis by sorting periods."""
        # Sort by period end date (ISO strings sort chronologically)
        return sorted(results, key=lambda f: f.get('period_end', ''))

    def to_trend_dataframe(self) -> pd.DataFrame:
        """
        Create a DataFrame optimized for trend analysis.

        Returns:
            DataFrame with concepts as rows and periods as columns
        """
        results = self.execute()
        if not results:
            return pd.DataFrame()
        # Pivot data for trend analysis
        df = pd.DataFrame(results)
        # Create pivot table with concepts as rows and periods as columns
        if 'concept' in df.columns and 'period_end' in df.columns and 'numeric_value' in df.columns:
            trend_df = df.pivot_table(
                index=['label', 'concept'],
                columns='period_end',
                values='numeric_value',
                aggfunc='first'
            )
            return trend_df
        return df

    def to_dataframe(self, *columns) -> pd.DataFrame:
        """
        Execute the query and return results as a DataFrame.

        Args:
            columns: List of columns to include in the DataFrame

        Returns:
            pandas DataFrame with query results
        """
        results = self.execute()
        if not results:
            return pd.DataFrame()
        df = pd.DataFrame(results)
        df['value'] = df['value'].astype(str)  # Ensure value is string for display
        # Filter columns based on inclusion flags
        if not self._include_dimensions:
            df = df.loc[:, [col for col in df.columns if not col.startswith('dim_')]]
        if not self._include_contexts:
            context_cols = ['context_ref', 'entity_identifier', 'entity_scheme',
                            'period_type']
            df = df.loc[:, [col for col in df.columns if col not in context_cols]]
        if not self._include_element_info:
            element_cols = ['element_id', 'element_name', 'element_type', 'element_period_type',
                            'element_balance', 'element_label']
            df = df.loc[:, [col for col in df.columns if col not in element_cols]]
        # Drop empty columns
        df = df.dropna(axis=1, how='all')
        # Filter columns if specified
        if columns:
            df = df[list(columns)]
        # Skip these columns
        skip_columns = ['fact_key', 'period_key']
        # Order columns
        first_columns = [col for col in
                         ['concept', 'label', 'original_label', 'value', 'numeric_value',
                          'period_start', 'period_end', 'decimals', 'statement_type', 'fiscal_period']
                         if col in df.columns]
        # NOTE: rebinding intentionally shadows the *columns parameter from
        # here on; the final order is preferred columns first, rest after.
        columns = first_columns + [col for col in df.columns
                                   if col not in first_columns
                                   and col not in skip_columns]
        return df[columns]

    def __rich__(self):
        # Rich console rendering: panel with usage help plus a 10-row preview
        title = Text.assemble(("Stitched Facts Query"),
                              )
        subtitle = Text.assemble((self._stitched_facts_view.entity_name, "bold deep_sky_blue1"),
                                 " - ",
                                 (self._stitched_facts_view.document_type)
                                 )
        df = self.to_dataframe().fillna('')
        columns = df.columns.tolist()
        description = Markdown(
            f"""
Use *to_dataframe(columns)* to get a DataFrame of the results.
e.g. `query.to_dataframe('concept', 'value', 'period_end')`
Available columns:
'{', '.join(columns)}'
**Enhanced Multi-Period Methods:**
- `across_periods(min_periods=2)` - Filter to concepts across multiple periods
- `by_standardized_concept('Revenue')` - Filter by standardized labels
- `by_original_label('Net sales')` - Filter by original company labels
- `trend_analysis('Revenue')` - Set up trend analysis
- `to_trend_dataframe()` - Get trend-optimized DataFrame
"""
        )
        display_columns = [col for col in ['label', 'concept', 'value', 'period_start', 'period_end', 'statement_type']
                           if col in columns]
        if not df.empty:
            df_display = df[display_columns].head(10)  # Show first 10 rows
            table = Table(*display_columns, show_header=True, header_style="bold", box=box.SIMPLE)
            for t in df_display.itertuples(index=False):
                row = []
                for i in t:
                    row.append(str(i)[:50])  # Truncate long values
                table.add_row(*row)
        else:
            table = Table("No results found", box=box.SIMPLE)
        panel = Panel(Group(description, table), title=title, subtitle=subtitle, box=box.ROUNDED)
        return panel

    def __repr__(self):
        return repr_rich(self.__rich__())

View File

@@ -0,0 +1,106 @@
"""
XBRL Statement Stitching - Utility Functions
This module contains utility functions for rendering and converting stitched
statement data.
"""
from typing import Any, Dict, Optional
import pandas as pd
def render_stitched_statement(
    stitched_data: Dict[str, Any],
    statement_title: str,
    statement_type: str,
    entity_info: Optional[Dict[str, Any]] = None,
    show_date_range: bool = False,
    xbrl_instance: Optional[Any] = None
):
    """
    Render a stitched statement using the same rendering logic as individual statements.

    Args:
        stitched_data: Stitched statement data
        statement_title: Title of the statement
        statement_type: Type of statement ('BalanceSheet', 'IncomeStatement', etc.)
        entity_info: Entity information (optional)
        show_date_range: Whether to show full date ranges for duration periods
        xbrl_instance: Optional XBRL instance passed through to the renderer
            (see edgar.xbrl.rendering.render_statement)

    Returns:
        RichTable: A formatted table representation of the stitched statement
    """
    # Imported lazily to avoid a circular import with the rendering module.
    from edgar.xbrl.rendering import render_statement

    # Extract periods and statement data
    periods_to_display = stitched_data['periods']
    statement_data = stitched_data['statement_data']

    # For multiple periods, annotate the title to indicate the trend view
    if len(periods_to_display) > 1:
        statement_title = f"{statement_title} ({len(periods_to_display)}-Period View)"

    # Use the existing rendering function with the show_date_range parameter
    return render_statement(
        statement_data=statement_data,
        periods_to_display=periods_to_display,
        statement_title=statement_title,
        statement_type=statement_type,
        entity_info=entity_info,
        show_date_range=show_date_range,
        xbrl_instance=xbrl_instance
    )
def to_pandas(stitched_data: Dict[str, Any]) -> pd.DataFrame:
    """
    Convert stitched statement data to a pandas DataFrame.

    Args:
        stitched_data: Stitched statement data

    Returns:
        DataFrame with 'label' and 'concept' columns followed by one column
        per period (original ordering preserved, newest first), each named
        by the period's end date in YYYY-MM-DD form
    """
    # The trailing 10 characters of each period id are its end date
    # (YYYY-MM-DD), which becomes the column name.
    period_ids = [period_id for period_id, _label in stitched_data['periods']]
    period_columns = [period_id[-10:] for period_id in period_ids]

    # Column-major accumulation: metadata columns first, then one list per period.
    data: Dict[str, list] = {'label': [], 'concept': []}
    for column in period_columns:
        data[column] = []

    for item in stitched_data['statement_data']:
        # Abstract rows with no values are section headers — carry no data.
        if item['is_abstract'] and not item['has_values']:
            continue
        data['label'].append(item['label'])
        data['concept'].append(item['concept'])
        for period_id, column in zip(period_ids, period_columns):
            data[column].append(item['values'].get(period_id))

    return pd.DataFrame(data, columns=['label', 'concept'] + period_columns)

View File

@@ -0,0 +1,340 @@
"""
XBRL Statement Stitching - XBRLS Class
This module contains the XBRLS class which represents multiple XBRL filings
stitched together for multi-period analysis.
"""
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union
import pandas as pd
from edgar.xbrl.stitching.core import StatementStitcher, stitch_statements
from edgar.xbrl.stitching.query import StitchedFactQuery, StitchedFactsView
if TYPE_CHECKING:
from edgar._filings import Filings
from edgar.xbrl.statements import StitchedStatements
class XBRLS:
    """
    A class representing multiple XBRL filings stitched together.

    This provides a unified view of financial data across multiple time periods,
    automatically handling the complexities of statement stitching.
    """

    def __init__(self, xbrl_list: List[Any]):
        """
        Initialize an XBRLS instance with a list of XBRL objects.

        Args:
            xbrl_list: List of XBRL objects, should be from the same company
                and ordered from newest to oldest
        """
        # Store the list of XBRL objects
        self.xbrl_list = xbrl_list
        # Extract entity info from the most recent XBRL (empty dict if none)
        self.entity_info = xbrl_list[0].entity_info if xbrl_list else {}
        # Cache for stitched statements, keyed by the statement parameters
        self._statement_cache = {}
        # Lazily-created cache for the stitched facts view
        self._stitched_facts_view = None

    @classmethod
    def from_filings(cls, filings: Union['Filings', List[Any]], filter_amendments: bool = True) -> 'XBRLS':
        """
        Create an XBRLS object from a Filings object or a plain list of Filing objects.

        Each filing should be the same form (e.g., 10-K, 10-Q) and from the same company.

        Args:
            filings: Filings object or list of Filing objects, should be from
                the same company
            filter_amendments: If True, exclude amended filings when the input
                supports filtering (i.e. is a Filings object)

        Returns:
            XBRLS object with stitched data
        """
        import warnings

        from edgar.xbrl.xbrl import XBRL

        # Only Filings objects expose .filter(); a plain list is used as-is.
        # (Calling .filter() unconditionally crashed with AttributeError when
        # a plain list was passed, contrary to the documented contract.)
        if filter_amendments and hasattr(filings, 'filter'):
            filtered_filings = filings.filter(amendments=False)
        else:
            filtered_filings = filings

        # Sort filings by date (newest first)
        sorted_filings = sorted(filtered_filings, key=lambda f: f.filing_date, reverse=True)

        # Create XBRL objects from filings; skip (with a warning rather than
        # silently) any filing whose XBRL cannot be parsed so one bad filing
        # does not fail the whole batch.
        xbrl_list = []
        for filing in sorted_filings:
            try:
                xbrl_list.append(XBRL.from_filing(filing))
            except Exception as exc:
                warnings.warn(f"Skipping filing {filing}: could not parse XBRL ({exc})")
        return cls(xbrl_list)

    @classmethod
    def from_xbrl_objects(cls, xbrl_list: List[Any]) -> 'XBRLS':
        """
        Create an XBRLS object from a list of XBRL objects.

        Args:
            xbrl_list: List of XBRL objects, should be from the same company

        Returns:
            XBRLS object with stitched data
        """
        return cls(xbrl_list)

    @property
    def statements(self) -> 'StitchedStatements':
        """
        Get a user-friendly interface to access stitched financial statements.

        Returns:
            StitchedStatements object
        """
        from edgar.xbrl.statements import StitchedStatements
        return StitchedStatements(self)

    @property
    def facts(self) -> StitchedFactsView:
        """
        Get a view over stitched facts from all XBRL filings.

        Returns:
            StitchedFactsView for querying standardized, multi-period data
        """
        # Created lazily and cached for the lifetime of this XBRLS instance.
        if self._stitched_facts_view is None:
            self._stitched_facts_view = StitchedFactsView(self)
        return self._stitched_facts_view

    def query(self,
              max_periods: int = 8,
              standardize: bool = True,
              statement_types: Optional[List[str]] = None,
              **kwargs) -> StitchedFactQuery:
        """
        Start a new query for stitched facts across all filings.

        Args:
            max_periods: Maximum periods to include in stitched data
            standardize: Whether to use standardized labels
            statement_types: List of statement types to include
            **kwargs: Additional options passed to StitchedFactQuery

        Returns:
            StitchedFactQuery for building complex queries
        """
        # Fold the explicit parameters into the kwargs passed downstream
        kwargs.update({
            'max_periods': max_periods,
            'standardize': standardize,
            'statement_types': statement_types
        })
        return self.facts.query(**kwargs)

    def get_statement(self, statement_type: str,
                      max_periods: int = 8,
                      standard: bool = True,
                      use_optimal_periods: bool = True,
                      include_dimensions: bool = False) -> Dict[str, Any]:
        """
        Get a stitched statement of the specified type.

        Args:
            statement_type: Type of statement to stitch ('IncomeStatement', 'BalanceSheet', etc.)
            max_periods: Maximum number of periods to include
            standard: Whether to use standardized concept labels
            use_optimal_periods: Whether to use entity info to determine optimal periods
            include_dimensions: Whether to include dimensional segment data
                (default: False for stitching)

        Returns:
            Dictionary with stitched statement data
        """
        # Results are cached per unique combination of arguments.
        cache_key = f"{statement_type}_{max_periods}_{standard}_{use_optimal_periods}_{include_dimensions}"
        if cache_key in self._statement_cache:
            return self._statement_cache[cache_key]

        # Stitch the statement across all filings
        result = stitch_statements(
            self.xbrl_list,
            statement_type=statement_type,
            period_type=StatementStitcher.PeriodType.ALL_PERIODS,
            max_periods=max_periods,
            standard=standard,
            use_optimal_periods=use_optimal_periods,
            include_dimensions=include_dimensions
        )

        self._statement_cache[cache_key] = result
        return result

    def render_statement(self, statement_type: str,
                         max_periods: int = 8,
                         standardize: bool = True,
                         use_optimal_periods: bool = True,
                         show_date_range: bool = False,
                         include_dimensions: bool = False):
        """
        Render a stitched statement in a rich table format.

        Args:
            statement_type: Type of statement to render ('BalanceSheet', 'IncomeStatement', etc.)
            max_periods: Maximum number of periods to include
            standardize: Whether to use standardized concept labels
            use_optimal_periods: Whether to use entity info to determine optimal periods
            show_date_range: Whether to show full date ranges for duration periods
            include_dimensions: Whether to include dimensional segment data
                (default: False for stitching)

        Returns:
            RichTable: A formatted table representation of the stitched statement
        """
        # Delegate to StitchedStatement, which owns the rendering logic
        from edgar.xbrl.statements import StitchedStatement
        statement = StitchedStatement(self, statement_type, max_periods, standardize,
                                      use_optimal_periods, include_dimensions)
        return statement.render(show_date_range=show_date_range)

    def to_dataframe(self, statement_type: str,
                     max_periods: int = 8,
                     standardize: bool = True) -> pd.DataFrame:
        """
        Convert a stitched statement to a pandas DataFrame.

        Args:
            statement_type: Type of statement to convert ('BalanceSheet', 'IncomeStatement', etc.)
            max_periods: Maximum number of periods to include
            standardize: Whether to use standardized concept labels

        Returns:
            DataFrame with periods as columns and concepts as index
        """
        from edgar.xbrl.statements import StitchedStatement
        statement = StitchedStatement(self, statement_type, max_periods, standardize)
        return statement.to_dataframe()

    def get_periods(self) -> List[Dict[str, str]]:
        """
        Get all available periods across all XBRL objects.

        Returns:
            List of period information dictionaries, each containing:
            - 'type': 'instant' or 'duration'
            - 'key': period key (e.g., 'instant_2024-09-28', 'duration_2024-01-01_2024-09-28')
            - 'label': human-readable label
            For instant periods:
            - 'date': end date as 'YYYY-MM-DD'
            For duration periods:
            - 'start_date': start date as 'YYYY-MM-DD'
            - 'end_date': end date as 'YYYY-MM-DD'
            - 'days': duration in days
            - 'period_type': classification ('Annual', 'Quarterly', etc.)
        """
        # Collect periods from every underlying XBRL object
        all_periods = []
        for xbrl in self.xbrl_list:
            all_periods.extend(xbrl.reporting_periods)

        # De-duplicate by date key, keeping the first occurrence (which comes
        # from the newest filing, given the xbrl_list ordering contract).
        unique_periods = {}
        for period in all_periods:
            key = period['date'] if period['type'] == 'instant' else f"{period['start_date']}_{period['end_date']}"
            if key not in unique_periods:
                unique_periods[key] = period

        return list(unique_periods.values())

    def get_period_end_dates(self) -> List[str]:
        """
        Get end dates for all available periods in YYYY-MM-DD format.

        This is a convenience method that extracts just the end dates from periods,
        handling both instant and duration periods correctly.

        Returns:
            List of end dates as strings in YYYY-MM-DD format, sorted newest first
        """
        end_dates = set()
        for period in self.get_periods():
            if period.get('type') == 'duration':
                end_date = period.get('end_date')
            elif period.get('type') == 'instant':
                end_date = period.get('date')
            else:
                continue
            if end_date:
                end_dates.add(end_date)

        # The set already de-duplicates; lexicographic descending sort of
        # ISO dates is newest-first. (Replaces a redundant second dedup pass.)
        return sorted(end_dates, reverse=True)

    def __str__(self) -> str:
        """
        String representation of the XBRLS object.

        Returns:
            String representation
        """
        filing_count = len(self.xbrl_list)
        periods = self.get_periods()
        return f"XBRLS with {filing_count} filings covering {len(periods)} unique periods"

    def __rich__(self):
        """
        Rich representation for pretty console output.

        Returns:
            Rich Panel summarizing the filings, periods and statement types
        """
        from rich.panel import Panel
        from rich.text import Text

        filing_count = len(self.xbrl_list)
        periods = self.get_periods()

        # Use explicit styles rather than markup strings: Text.append() does
        # not parse "[bold]" markup, so the original printed the tags
        # literally in the console output.
        content = Text()
        content.append("XBRLS Object\n", style="bold")
        content.append(f"Filings: {filing_count}\n")
        content.append(f"Unique Periods: {len(periods)}\n")

        # Collect the statement types available across all filings
        statement_types = set()
        for xbrl in self.xbrl_list:
            for stmt in xbrl.get_all_statements():
                if stmt['type']:
                    statement_types.add(stmt['type'])

        content.append("\nAvailable Statement Types:\n", style="bold")
        for stmt_type in sorted(statement_types):
            content.append(f"- {stmt_type}\n")

        # Show how to access statements
        content.append("\nExample Usage:\n", style="bold")
        content.append("xbrls.statements.income_statement()\n")
        content.append("xbrls.statements.balance_sheet()\n")
        content.append("xbrls.to_dataframe('IncomeStatement')\n")

        return Panel(content, title="XBRLS", expand=False)