Initial commit
This commit is contained in:
@@ -0,0 +1,27 @@
|
||||
"""
|
||||
XBRL Statement Stitching Package
|
||||
|
||||
This package provides functionality to combine multiple XBRL statements
|
||||
across different time periods into a unified view, handling concept
|
||||
consistency issues and normalizing data representation.
|
||||
"""
|
||||
|
||||
# Import standardize_statement for backwards compatibility with tests
|
||||
from edgar.xbrl.standardization import standardize_statement
|
||||
from edgar.xbrl.stitching.core import StatementStitcher, stitch_statements
|
||||
from edgar.xbrl.stitching.periods import determine_optimal_periods
|
||||
from edgar.xbrl.stitching.query import StitchedFactQuery, StitchedFactsView
|
||||
from edgar.xbrl.stitching.utils import render_stitched_statement, to_pandas
|
||||
from edgar.xbrl.stitching.xbrls import XBRLS
|
||||
|
||||
# Public API of the stitching package; kept explicit so `from ... import *`
# and documentation tools expose only these names.
__all__ = [
    'XBRLS',
    'StatementStitcher',
    'stitch_statements',
    'determine_optimal_periods',
    'render_stitched_statement',
    'to_pandas',
    'standardize_statement',
    'StitchedFactsView',
    'StitchedFactQuery'
]
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
621
venv/lib/python3.10/site-packages/edgar/xbrl/stitching/core.py
Normal file
621
venv/lib/python3.10/site-packages/edgar/xbrl/stitching/core.py
Normal file
@@ -0,0 +1,621 @@
|
||||
"""
|
||||
XBRL Statement Stitching - Core Functionality
|
||||
|
||||
This module contains the core StatementStitcher class and related functionality
|
||||
for combining multiple XBRL statements across different time periods.
|
||||
"""
|
||||
|
||||
from collections import defaultdict
|
||||
from datetime import datetime
|
||||
from enum import Enum
|
||||
from typing import Any, Dict, List, Optional, Set, Tuple, Union
|
||||
|
||||
from edgar.xbrl.core import format_date, parse_date
|
||||
from edgar.xbrl.standardization import ConceptMapper, initialize_default_mappings, standardize_statement
|
||||
from edgar.xbrl.stitching.ordering import StatementOrderingManager
|
||||
from edgar.xbrl.stitching.periods import determine_optimal_periods
|
||||
from edgar.xbrl.stitching.presentation import VirtualPresentationTree
|
||||
|
||||
|
||||
class StatementStitcher:
    """
    Combines multiple statements across time periods into a unified view.

    This class handles the complexities of combining financial statements
    from different periods, including:
    - Normalizing concepts that change over time
    - Aligning periods correctly
    - Handling missing data points
    - Providing both standardized and company-specific views
    """

    class PeriodType(str, Enum):
        """Types of period views available for stitched statements"""
        RECENT_PERIODS = "Most Recent Periods"
        RECENT_YEARS = "Recent Years"
        THREE_YEAR_COMPARISON = "Three-Year Comparison"
        THREE_QUARTERS = "Three Recent Quarters"
        ANNUAL_COMPARISON = "Annual Comparison"
        QUARTERLY_TREND = "Quarterly Trend"
        ALL_PERIODS = "All Available Periods"

    def __init__(self, concept_mapper: Optional[ConceptMapper] = None):
        """
        Initialize a StatementStitcher instance.

        Args:
            concept_mapper: Optional ConceptMapper for standardizing concepts.
                If None, a default mapper is created.
        """
        if concept_mapper is None:
            self.mapping_store = initialize_default_mappings()
            self.concept_mapper = ConceptMapper(self.mapping_store)
        else:
            self.concept_mapper = concept_mapper
            self.mapping_store = concept_mapper.mapping_store

        # Initialize data structures
        self.periods = []  # Ordered list of period identifiers (newest first)
        self.period_dates = {}  # Maps period ID to display dates
        self.data = defaultdict(dict)  # {concept: {period: value}}
        self.concept_metadata = {}  # Metadata for each concept (level, etc.)
        self.ordering_manager = None  # Will be initialized during stitching
        self.original_statement_order = []  # Track original order for hierarchy context

    def stitch_statements(
        self,
        statements: List[Dict[str, Any]],
        period_type: Union[PeriodType, str] = PeriodType.RECENT_PERIODS,
        max_periods: Optional[int] = None,
        standard: bool = True
    ) -> Dict[str, Any]:
        """
        Stitch multiple statements into a unified view.

        Args:
            statements: List of statement data from different filings
            period_type: Type of period view to generate
            max_periods: Maximum number of periods to include.
                If None, defaults to ``len(statements) + 2``.
            standard: Whether to use standardized concept labels

        Returns:
            Dictionary with stitched statement data
        """
        # Reset state so the same stitcher instance can be reused safely
        self.periods = []
        self.period_dates = {}
        self.data = defaultdict(dict)
        self.concept_metadata = {}
        self.original_statement_order = []

        # Initialize ordering manager for this statement type
        statement_type = statements[0].get('statement_type', 'IncomeStatement') if statements else 'IncomeStatement'
        self.ordering_manager = StatementOrderingManager(statement_type)

        # Capture original statement order from the most recent (first) statement for hierarchy context
        if statements:
            reference_statement = statements[0]
            self.original_statement_order = []
            for item in reference_statement.get('data', []):
                concept = item.get('concept')
                label = item.get('label')
                if concept:
                    self.original_statement_order.append(concept)
                # Labels are appended too so label-keyed rows can also be ordered
                if label and label not in self.original_statement_order:
                    self.original_statement_order.append(label)

        # Extract and sort all periods
        all_periods = self._extract_periods(statements)

        # Set max_periods if not provided
        max_periods = max_periods or len(statements) + 2  # Allow for the last statement to have 3 periods

        # Select appropriate periods based on period_type
        selected_periods = self._select_periods(all_periods, period_type, max_periods)
        self.periods = selected_periods

        # Process each statement
        for _i, statement in enumerate(statements):
            # Only process statements that have periods in our selection
            statement_periods = set(statement['periods'].keys())
            relevant_periods = statement_periods.intersection(set(selected_periods))

            if not relevant_periods:
                continue

            # Standardize the statement if needed
            if standard:
                processed_data = self._standardize_statement_data(statement)
            else:
                processed_data = statement['data']

            # Store data for each item
            self._integrate_statement_data(processed_data, statement['periods'], relevant_periods)

        # Format the stitched data
        return self._format_output_with_ordering(statements)

    def _extract_periods(self, statements: List[Dict[str, Any]]) -> List[Tuple[str, datetime]]:
        """
        Extract and sort all periods from the statements, de-duplicating periods with the same date.

        Args:
            statements: List of statement data

        Returns:
            List of (period_id, end_date) tuples, sorted by date (newest first)
        """
        # Use a dictionary to track unique periods by their end date
        # This will handle cases where different period_ids reference the same date
        unique_periods = {}  # key: date string, value: (period_id, datetime, statement_index)

        for i, statement in enumerate(statements):
            # Use statement index (i) to prioritize more recent filings
            # Lower index = more recent filing
            for period_id, period_info in statement['periods'].items():
                # Extract end date for sorting
                try:
                    # Initialize normalized_key to silence the type checker
                    normalized_key = ""

                    if period_id.startswith('instant_'):
                        date_str = period_id.split('_')[1]
                        # Format the date consistently with single statements
                        try:
                            date_obj = parse_date(date_str)
                            display_date = format_date(date_obj)
                        except ValueError:
                            # Fall back to original label if parsing fails
                            display_date = period_info['label']
                        period_type = 'instant'
                        # For instant periods, create a normalized key with just the date
                        normalized_key = f"{period_type}_{date_str}"
                    else:  # duration
                        # For durations, extract both start and end dates
                        parts = period_id.split('_')
                        if len(parts) >= 3:
                            start_date_str = parts[1]
                            end_date_str = parts[2]
                            start_date = parse_date(start_date_str)
                            end_date = parse_date(end_date_str)
                            date_str = end_date_str  # Use end date for sorting

                            # Format end date consistently - for stitched statements,
                            # we only need the end date for duration periods as that's what users compare
                            display_date = format_date(end_date)
                            period_type = 'duration'
                            # Create a normalized key that combines period type, start date, and end date
                            normalized_key = f"{period_type}_{format_date(start_date)}_{format_date(end_date)}"
                        else:
                            # Skip malformed period IDs
                            continue

                    # Parse the end date for sorting
                    end_date = parse_date(date_str)

                    # Check if we already have this period (by normalized key)
                    if normalized_key in unique_periods:
                        existing_idx = unique_periods[normalized_key][2]
                        # Only replace if this statement is from a more recent filing
                        if i < existing_idx:
                            unique_periods[normalized_key] = (period_id, end_date, i)
                            self.period_dates[period_id] = display_date
                    else:
                        # Add new period
                        unique_periods[normalized_key] = (period_id, end_date, i)
                        self.period_dates[period_id] = display_date

                except (ValueError, TypeError, IndexError):
                    # Skip periods with invalid dates
                    continue

        # Extract and sort the unique periods
        all_periods = [(period_id, end_date) for period_id, end_date, _ in unique_periods.values()]

        # Sort by date, newest first
        return sorted(all_periods, key=lambda x: x[1], reverse=True)

    def _select_periods(
        self,
        all_periods: List[Tuple[str, Union[str, datetime]]],
        period_type: Union[PeriodType, str],
        max_periods: int
    ) -> List[str]:
        """
        Select appropriate periods based on period_type.

        NOTE(review): RECENT_YEARS and QUARTERLY_TREND have no dedicated branch
        below, so they currently fall through to the RECENT_PERIODS default.

        Args:
            all_periods: List of (period_id, end_date) tuples
            period_type: Type of period view to generate
            max_periods: Maximum number of periods to include

        Returns:
            List of selected period IDs
        """
        if isinstance(period_type, str):
            try:
                period_type = StatementStitcher.PeriodType(period_type)
            except ValueError:
                # Default to recent periods if string doesn't match enum
                period_type = StatementStitcher.PeriodType.RECENT_PERIODS

        # Extract period types (instant vs duration)
        instants = [(pid, date) for pid, date in all_periods if pid.startswith('instant_')]
        durations = [(pid, date) for pid, date in all_periods if not pid.startswith('instant_')]

        # Apply different selection logic based on period_type
        if period_type == StatementStitcher.PeriodType.RECENT_PERIODS:
            # Just take the most recent periods up to max_periods
            return [pid for pid, _ in all_periods[:max_periods]]

        elif period_type == StatementStitcher.PeriodType.THREE_YEAR_COMPARISON:
            # For balance sheets, find year-end instants
            year_ends = []
            years_seen = set()

            for pid, date in instants:
                # NOTE(review): `date` comes from _extract_periods as a datetime;
                # assumes parse_date accepts it (or a str) — confirm against parse_date.
                year = parse_date(date).year
                if year not in years_seen and len(year_ends) < max_periods:
                    year_ends.append(pid)
                    years_seen.add(year)

            return year_ends

        elif period_type == StatementStitcher.PeriodType.THREE_QUARTERS:
            # Find the most recent quarters (for income statements)
            quarterly_periods = []

            for pid, _date in durations:
                # Check if this appears to be a quarterly period
                if not pid.startswith('duration_'):
                    continue

                start_date_str = pid.split('_')[1]
                end_date_str = pid.split('_')[2]

                try:
                    start_date = parse_date(start_date_str)
                    end_date = parse_date(end_date_str)
                    days = (end_date - start_date).days

                    # Assuming quarterly is around 90 days
                    if 80 <= days <= 95:
                        quarterly_periods.append(pid)
                        if len(quarterly_periods) >= max_periods:
                            break
                except (ValueError, TypeError, IndexError):
                    continue

            return quarterly_periods

        elif period_type == StatementStitcher.PeriodType.ANNUAL_COMPARISON:
            # Find annual periods (for income statements)
            annual_periods = []

            for pid, _date in durations:
                # Check if this appears to be an annual period
                if not pid.startswith('duration_'):
                    continue

                start_date_str = pid.split('_')[1]
                end_date_str = pid.split('_')[2]

                try:
                    start_date = parse_date(start_date_str)
                    end_date = parse_date(end_date_str)
                    days = (end_date - start_date).days

                    # Assuming annual is around 365 days
                    if 350 <= days <= 380:
                        annual_periods.append(pid)
                        if len(annual_periods) >= max_periods:
                            break
                except (ValueError, TypeError, IndexError):
                    continue

            return annual_periods

        elif period_type == StatementStitcher.PeriodType.ALL_PERIODS:
            # Return all periods, newest first, up to max_periods
            return [pid for pid, _ in all_periods[:max_periods]]

        # Default to recent periods
        return [pid for pid, _ in all_periods[:max_periods]]

    def _standardize_statement_data(self, statement: Dict[str, Any]) -> List[Dict[str, Any]]:
        """
        Standardize the statement data using the concept mapper.

        Args:
            statement: Statement data

        Returns:
            Standardized statement data
        """
        # Add statement type to context for better mapping
        statement_type = statement.get('statement_type', '')
        statement_data = statement['data']

        # NOTE: mutates the items in place before delegating to the mapper
        for item in statement_data:
            item['statement_type'] = statement_type

        # Apply standardization using the concept mapper
        return standardize_statement(statement_data, self.concept_mapper)

    def _integrate_statement_data(
        self,
        statement_data: List[Dict[str, Any]],
        period_map: Dict[str, Dict[str, str]],
        relevant_periods: Set[str]
    ) -> None:
        """
        Integrate statement data from one statement into the stitched view.

        Args:
            statement_data: Statement data
            period_map: Map of period IDs to period information
            relevant_periods: Set of periods from this statement to include
        """
        # Map to track concepts by their underlying concept ID, not just label
        # This helps merge rows that represent the same concept but have different labels
        concept_to_label_map = {}

        for item in statement_data:
            concept = item.get('concept')
            label = item.get('label')

            # Skip items without concept or label
            if not concept or not label:
                continue

            # Skip abstract items with no children (headers without data)
            if item.get('is_abstract', False) and not item.get('children'):
                continue

            # Skip dimension items
            if any(bracket in label for bracket in ['[Axis]', '[Domain]', '[Member]', '[Line Items]', '[Table]', '[Abstract]']):
                continue

            # Use concept as the primary key for identifying the same financial line item
            # This is more reliable than labels which may vary across filings

            # If we've already seen this concept, use the existing label as the key
            # This ensures we merge rows that represent the same concept
            if concept in concept_to_label_map:
                concept_key = concept_to_label_map[concept]
            else:
                # For a new concept, use the current label as the key
                concept_key = label
                # Remember this mapping for future occurrences
                concept_to_label_map[concept] = concept_key

            # Store metadata about the concept (level, abstract status, etc.)
            # If we've already seen this concept, only update metadata if it's from a more recent period
            # This ensures we use labels from the most recent filing when merging rows
            if concept_key not in self.concept_metadata:
                self.concept_metadata[concept_key] = {
                    'level': item.get('level', 0),
                    'is_abstract': item.get('is_abstract', False),
                    'is_total': item.get('is_total', False) or 'total' in label.lower(),
                    'original_concept': concept,
                    'latest_label': label  # Store the original label too
                }
            else:
                # For existing concepts, update the label to use the most recent one
                # We determine which periods are most recent based on position in self.periods
                # (earlier indices are more recent periods)

                # Find the periods in this statement
                statement_periods = [p for p in relevant_periods if p in self.periods]
                if statement_periods:
                    # Get the most recent period in this statement
                    most_recent_period = min(statement_periods, key=lambda p: self.periods.index(p))
                    most_recent_idx = self.periods.index(most_recent_period)

                    # Find the earliest period where we have data for this concept
                    existing_periods = [p for p in self.data[concept_key].keys() if p in self.periods]
                    if existing_periods:
                        earliest_existing_idx = min(self.periods.index(p) for p in existing_periods)

                        # If this statement has more recent data, update the label
                        if most_recent_idx < earliest_existing_idx:
                            # Update the concept key label for display
                            new_concept_key = label

                            # If we're changing the label, we need to migrate existing data
                            if new_concept_key != concept_key:
                                # Copy existing data to the new key
                                if new_concept_key not in self.data:
                                    self.data[new_concept_key] = self.data[concept_key].copy()

                                # Update metadata
                                self.concept_metadata[new_concept_key] = self.concept_metadata[concept_key].copy()
                                self.concept_metadata[new_concept_key]['latest_label'] = label

                                # Update the concept mapping
                                concept_to_label_map[concept] = new_concept_key
                                concept_key = new_concept_key
                            else:
                                # Just update the latest label
                                self.concept_metadata[concept_key]['latest_label'] = label

            # Store values for relevant periods
            for period_id in relevant_periods:
                if period_id in self.periods:  # Only include selected periods
                    value = item.get('values', {}).get(period_id)
                    if value is not None:
                        self.data[concept_key][period_id] = {
                            'value': value,
                            'decimals': item.get('decimals', {}).get(period_id, 0)
                        }

    def _format_output_with_ordering(self, statements: List[Dict[str, Any]]) -> Dict[str, Any]:
        """
        Format the stitched data for rendering with intelligent ordering using virtual presentation tree.

        Args:
            statements: Original statements for ordering reference

        Returns:
            Stitched statement data in the expected format
        """
        # Get unified ordering for all concepts using the ordering manager
        concept_ordering = {}
        if self.ordering_manager:
            concept_ordering = self.ordering_manager.determine_ordering(statements)

        # Build virtual presentation tree to preserve hierarchy while applying semantic ordering
        presentation_tree = VirtualPresentationTree(self.ordering_manager)
        ordered_nodes = presentation_tree.build_tree(
            concept_metadata=self.concept_metadata,
            concept_ordering=concept_ordering,
            original_statement_order=self.original_statement_order
        )

        # Convert nodes back to the expected format
        ordered_concepts = [(node.concept, node.metadata) for node in ordered_nodes]

        # Build the output structure
        result = {
            'periods': [(pid, self.period_dates.get(pid, pid)) for pid in self.periods],
            'statement_data': []
        }

        for concept, metadata in ordered_concepts:
            # Create an item for each concept
            item = {
                # Use the latest label if available, otherwise fall back to the concept key
                'label': metadata.get('latest_label', concept),
                'level': metadata['level'],
                'is_abstract': metadata['is_abstract'],
                'is_total': metadata['is_total'],
                'concept': metadata['original_concept'],
                'values': {},
                'decimals': {}
            }

            # Add values for each period
            for period_id in self.periods:
                if period_id in self.data[concept]:
                    item['values'][period_id] = self.data[concept][period_id]['value']
                    item['decimals'][period_id] = self.data[concept][period_id]['decimals']

            # Set has_values flag based on whether there are any values
            item['has_values'] = len(item['values']) > 0

            # Only include items with values or abstract items
            if item['has_values'] or item['is_abstract']:
                result['statement_data'].append(item)

        return result

    def _format_output(self) -> Dict[str, Any]:
        """
        Backward compatibility method - calls the new ordering-aware method.

        Returns:
            Stitched statement data in the expected format
        """
        # For backward compatibility, call the new method with empty statements
        # This will use alphabetical ordering as before
        return self._format_output_with_ordering([])
|
||||
|
||||
|
||||
def stitch_statements(
    xbrl_list: List[Any],
    statement_type: str = 'IncomeStatement',
    period_type: Union[StatementStitcher.PeriodType, str] = StatementStitcher.PeriodType.RECENT_PERIODS,
    max_periods: int = 3,
    standard: bool = True,
    use_optimal_periods: bool = True,
    include_dimensions: bool = False
) -> Dict[str, Any]:
    """
    Stitch together statements from multiple XBRL objects.

    Args:
        xbrl_list: List of XBRL objects, should be from the same company and ordered by date
        statement_type: Type of statement to stitch ('IncomeStatement', 'BalanceSheet', etc.)
        period_type: Type of period view to generate
        max_periods: Maximum number of periods to include (default: 3)
        standard: Whether to use standardized concept labels (default: True)
        use_optimal_periods: Whether to use the entity info to determine optimal periods (default: True)
        include_dimensions: Whether to include dimensional segment data (default: False for stitching)

    Returns:
        Stitched statement data
    """
    # Initialize the stitcher
    stitcher = StatementStitcher()

    # Collect statements of the specified type from each XBRL object
    statements = []

    # If using optimal periods based on entity info
    if use_optimal_periods:
        # Use our utility function to determine the best periods
        optimal_periods = determine_optimal_periods(xbrl_list, statement_type, max_periods=max_periods)

        # Limit to max_periods if needed
        if len(optimal_periods) > max_periods:
            optimal_periods = optimal_periods[:max_periods]

        # Extract the XBRL objects that contain our optimal periods
        for period_metadata in optimal_periods:
            xbrl_index = period_metadata['xbrl_index']
            xbrl = xbrl_list[xbrl_index]

            # Get the statement and period info
            statement = xbrl.get_statement_by_type(statement_type, include_dimensions=include_dimensions)
            if statement:
                # Only include the specific period from this statement
                period_key = period_metadata['period_key']

                # Check if this period exists in the statement
                if period_key in statement['periods']:
                    # Create a filtered version of the statement with just this period
                    filtered_statement = {
                        'role': statement['role'],
                        'definition': statement['definition'],
                        'statement_type': statement['statement_type'],
                        'periods': {period_key: statement['periods'][period_key]},
                        'data': statement['data']
                    }

                    # Update the period label to include information from entity_info
                    display_date = period_metadata['display_date']
                    # BUG FIX: this value was previously assigned to `period_type`,
                    # clobbering the function parameter with 'instant'/'duration' and
                    # silently discarding the caller's requested period view when it
                    # was later passed to stitcher.stitch_statements(). Use a
                    # distinct local name instead.
                    metadata_period_type = period_metadata['period_type']
                    fiscal_period = period_metadata.get('fiscal_period')

                    # Create a more informative label
                    if metadata_period_type == 'instant':
                        if fiscal_period == 'FY':
                            period_label = f"FY {display_date}"
                        else:
                            period_label = display_date
                    else:  # duration
                        # For duration periods, add fiscal quarter/year info if available
                        if fiscal_period == 'FY':
                            period_label = f"FY {display_date}"
                        elif fiscal_period in ['Q1', 'Q2', 'Q3', 'Q4']:
                            period_label = f"{fiscal_period} {display_date}"
                        else:
                            period_label = display_date

                    # Update the period label
                    filtered_statement['periods'][period_key] = {
                        'label': period_label,
                        'original_label': statement['periods'][period_key]['label']
                    }

                    statements.append(filtered_statement)
    # Traditional approach without using entity info
    else:
        for xbrl in xbrl_list:
            # Get statement data for the specified type
            statement = xbrl.find_statement(statement_type)
            if statement:
                statements.append(statement)

    # Stitch the statements, honoring the caller's requested period_type
    return stitcher.stitch_statements(statements, period_type, max_periods, standard)
|
||||
@@ -0,0 +1,833 @@
|
||||
"""
|
||||
XBRL Statement Ordering - Intelligent Ordering for Multi-Period Statements
|
||||
|
||||
This module provides consistent ordering for financial statements across multiple periods
|
||||
by combining template-based, reference-based, and semantic positioning strategies.
|
||||
"""
|
||||
|
||||
import re
|
||||
from enum import Enum
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
|
||||
try:
    from rapidfuzz import fuzz
except ImportError:
    # rapidfuzz is an optional dependency; emulate the one piece of its API
    # that this module uses with the standard library's difflib.
    from difflib import SequenceMatcher

    class fuzz:
        """Minimal stand-in exposing the ``fuzz.ratio`` interface of rapidfuzz."""

        @staticmethod
        def ratio(s1: str, s2: str) -> float:
            """Return a similarity score between 0 and 100 for the two strings."""
            matcher = SequenceMatcher(None, s1, s2)
            # SequenceMatcher yields 0..1; rapidfuzz reports 0..100.
            return matcher.ratio() * 100
|
||||
|
||||
|
||||
class StatementType(str, Enum):
    """Supported statement types for ordering"""
    # Values mirror the `statement_type` strings used elsewhere in the
    # xbrl package (e.g. statement dicts and stitching entry points).
    INCOME_STATEMENT = "IncomeStatement"
    BALANCE_SHEET = "BalanceSheet"
    CASH_FLOW = "CashFlowStatement"
    EQUITY = "StatementOfEquity"
|
||||
|
||||
|
||||
class FinancialStatementTemplates:
|
||||
"""Canonical ordering templates for financial statements based on XBRL concepts"""
|
||||
|
||||
INCOME_STATEMENT_TEMPLATE = [
|
||||
# Revenue Section (0-99)
|
||||
(0, "revenue_section", [
|
||||
# Product/Service Revenue Components
|
||||
"us-gaap:SalesRevenueGoodsNet",
|
||||
"us-gaap:ProductSales",
|
||||
"us-gaap:SalesRevenueServicesNet",
|
||||
"us-gaap:SubscriptionRevenue",
|
||||
# Contract Revenue
|
||||
"us-gaap:RevenueFromContractWithCustomerExcludingAssessedTax",
|
||||
"us-gaap:RevenueFromContractWithCustomerIncludingAssessedTax",
|
||||
# Total Revenue
|
||||
"us-gaap:Revenue",
|
||||
"us-gaap:Revenues",
|
||||
"us-gaap:SalesRevenueNet",
|
||||
"us-gaap:OperatingRevenue"
|
||||
]),
|
||||
|
||||
# Cost Section (100-199)
|
||||
(100, "cost_section", [
|
||||
"us-gaap:CostOfRevenueAbstract", # Abstract
|
||||
"us-gaap:CostOfRevenue", # Total
|
||||
"us-gaap:CostOfGoodsSold",
|
||||
"us-gaap:CostOfGoodsAndServicesSold",
|
||||
"us-gaap:CostOfSales",
|
||||
"us-gaap:DirectOperatingCosts",
|
||||
"us-gaap:CostsAndExpenses"
|
||||
]),
|
||||
|
||||
# Gross Profit (200-299)
|
||||
(200, "gross_profit", [
|
||||
"us-gaap:GrossProfit"
|
||||
]),
|
||||
|
||||
# Operating Expenses (300-399)
|
||||
(300, "operating_expenses", [
|
||||
# R&D Expenses
|
||||
"us-gaap:ResearchAndDevelopmentCosts",
|
||||
"us-gaap:ResearchAndDevelopmentExpense",
|
||||
# SG&A Expenses
|
||||
"us-gaap:SellingGeneralAndAdministrativeExpense",
|
||||
"us-gaap:GeneralAndAdministrativeExpense",
|
||||
"us-gaap:AdministrativeExpense",
|
||||
"us-gaap:SellingAndMarketingExpense",
|
||||
"us-gaap:SellingExpense",
|
||||
"us-gaap:MarketingExpense",
|
||||
"us-gaap:AdvertisingExpense",
|
||||
# Total Operating Expenses
|
||||
"us-gaap:NoninterestExpense",
|
||||
"us-gaap:OperatingCostsAndExpenses",
|
||||
"us-gaap:OperatingExpenses"
|
||||
]),
|
||||
|
||||
# Operating Income (400-499)
|
||||
(400, "operating_income", [
|
||||
"us-gaap:OperatingIncomeLoss",
|
||||
"us-gaap:OperatingIncome",
|
||||
"us-gaap:IncomeLossFromContinuingOperationsBeforeInterestAndTaxes"
|
||||
]),
|
||||
|
||||
# Non-Operating (500-599)
|
||||
(500, "non_operating", [
|
||||
"us-gaap:InterestIncomeExpenseNet",
|
||||
"us-gaap:InterestAndDebtExpense",
|
||||
"us-gaap:InterestExpense",
|
||||
"us-gaap:InterestExpenseNonoperating", # ADBE uses this for non-operating interest expense
|
||||
"us-gaap:InterestIncome",
|
||||
"us-gaap:InvestmentIncomeInterest", # NVIDIA uses this variant
|
||||
"us-gaap:OtherNonoperatingIncomeExpense",
|
||||
"us-gaap:NonoperatingIncomeExpense",
|
||||
"orcl:NonoperatingIncomeExpenseIncludingEliminationOfNetIncomeLossAttributableToNoncontrollingInterests"
|
||||
]),
|
||||
|
||||
# Pre-Tax Income (600-699)
|
||||
(600, "pretax_income", [
|
||||
"us-gaap:IncomeLossBeforeIncomeTaxes",
|
||||
"us-gaap:IncomeLossFromContinuingOperationsBeforeIncomeTaxes",
|
||||
"us-gaap:IncomeLossFromContinuingOperationsBeforeIncomeTaxesExtraordinaryItemsNoncontrollingInterest",
|
||||
"orcl:IncomeLossFromContinuingOperationsIncludingNoncontrollingInterestBeforeIncomeTaxesExtraordinaryItems"
|
||||
]),
|
||||
|
||||
# Tax (700-799)
|
||||
(700, "tax", [
|
||||
"us-gaap:IncomeTaxesPaidNet",
|
||||
"us-gaap:IncomeTaxExpenseBenefit"
|
||||
]),
|
||||
|
||||
# Net Income (800-899)
|
||||
(800, "net_income", [
|
||||
"us-gaap:IncomeLossFromContinuingOperationsIncludingPortionAttributableToNoncontrollingInterest",
|
||||
"us-gaap:IncomeLossFromContinuingOperations",
|
||||
"us-gaap:NetIncome",
|
||||
"us-gaap:NetIncomeLoss",
|
||||
"us-gaap:ProfitLoss",
|
||||
"us-gaap:NetIncomeLossAttributableToNonredeemableNoncontrollingInterest",
|
||||
"us-gaap:NetIncomeLossAttributableToNoncontrollingInterest"
|
||||
]),
|
||||
|
||||
# Per Share Data (900-999)
|
||||
(900, "per_share", [
|
||||
"us-gaap:EarningsPerShareAbstract",
|
||||
"us-gaap:EarningsPerShareBasic",
|
||||
"us-gaap:EarningsPerShareDiluted",
|
||||
"us-gaap:WeightedAverageNumberOfSharesOutstandingAbstract",
|
||||
"us-gaap:WeightedAverageNumberOfSharesOutstandingBasic",
|
||||
"us-gaap:WeightedAverageNumberOfDilutedSharesOutstanding"
|
||||
])
|
||||
]
|
||||
|
||||
BALANCE_SHEET_TEMPLATE = [
|
||||
# Current Assets (0-199)
|
||||
(0, "current_assets", [
|
||||
"Cash and Cash Equivalents",
|
||||
"Cash",
|
||||
"Short-term Investments",
|
||||
"Marketable Securities",
|
||||
"Accounts Receivable",
|
||||
"Trade Receivables",
|
||||
"Inventory",
|
||||
"Prepaid Expenses",
|
||||
"Other Current Assets",
|
||||
"Total Current Assets"
|
||||
]),
|
||||
|
||||
# Non-Current Assets (200-399)
|
||||
(200, "noncurrent_assets", [
|
||||
"Property, Plant and Equipment",
|
||||
"Property and Equipment",
|
||||
"Long-term Investments",
|
||||
"Goodwill",
|
||||
"Intangible Assets",
|
||||
"Other Non-current Assets",
|
||||
"Total Non-current Assets",
|
||||
"Total Assets"
|
||||
]),
|
||||
|
||||
# Current Liabilities (400-599)
|
||||
(400, "current_liabilities", [
|
||||
"Accounts Payable",
|
||||
"Trade Payables",
|
||||
"Accrued Liabilities",
|
||||
"Accrued Expenses",
|
||||
"Short-term Debt",
|
||||
"Current Portion of Long-term Debt",
|
||||
"Other Current Liabilities",
|
||||
"Total Current Liabilities"
|
||||
]),
|
||||
|
||||
# Non-Current Liabilities (600-799)
|
||||
(600, "noncurrent_liabilities", [
|
||||
"Long-term Debt",
|
||||
"Deferred Revenue",
|
||||
"Deferred Tax Liabilities",
|
||||
"Other Non-current Liabilities",
|
||||
"Total Non-current Liabilities",
|
||||
"Total Liabilities"
|
||||
]),
|
||||
|
||||
# Equity (800-999)
|
||||
(800, "equity", [
|
||||
"Common Stock",
|
||||
"Additional Paid-in Capital",
|
||||
"Retained Earnings",
|
||||
"Accumulated Other Comprehensive Income",
|
||||
"Treasury Stock",
|
||||
"Total Stockholders' Equity",
|
||||
"Total Shareholders' Equity",
|
||||
"Total Equity"
|
||||
])
|
||||
]
|
||||
|
||||
def get_template_position(self, item_concept: str, item_label: str, statement_type: str) -> Optional[float]:
    """
    Get template position for an item, prioritizing concept-based matching over label matching.

    Args:
        item_concept: The XBRL concept (e.g., "us-gaap:Revenue")
        item_label: The display label (e.g., "Contract Revenue")
        statement_type: Type of statement ("IncomeStatement", "BalanceSheet", etc.)

    Returns:
        Float position in template, or None if no match found
    """
    # Resolve the template attribute name for this statement type; unknown
    # types fall back to the "<TYPE>_TEMPLATE" naming convention.
    known_templates = {
        "IncomeStatement": "INCOME_STATEMENT_TEMPLATE",
        "BalanceSheet": "BALANCE_SHEET_TEMPLATE",
    }
    attr_name = known_templates.get(statement_type, f"{statement_type.upper()}_TEMPLATE")

    template = getattr(self, attr_name, None)
    if not template:
        return None

    # Strategy 1: direct concept matching (highest priority).
    if item_concept:
        wanted = self._normalize_xbrl_concept(item_concept)
        for base_pos, _section, section_entries in template:
            for offset, entry in enumerate(section_entries):
                if wanted == self._normalize_xbrl_concept(entry):
                    return float(base_pos + offset)

    # Strategy 2: label-based matching as a fallback (for compatibility).
    if item_label:
        for base_pos, _section, section_entries in template:
            for offset, entry in enumerate(section_entries):
                if self._labels_match(item_label, entry):
                    return float(base_pos + offset)

    return None
|
||||
|
||||
def _normalize_xbrl_concept(self, concept: str) -> str:
|
||||
"""
|
||||
Normalize XBRL concept for matching.
|
||||
|
||||
Handles variations in concept format:
|
||||
- "us-gaap:Revenue" vs "us-gaap_Revenue"
|
||||
- Case sensitivity
|
||||
- Namespace prefixes
|
||||
"""
|
||||
if not concept:
|
||||
return ""
|
||||
|
||||
# Normalize separators (: vs _)
|
||||
normalized = concept.lower()
|
||||
normalized = normalized.replace(':', '_')
|
||||
|
||||
# Handle common namespace variations
|
||||
# us-gaap, usgaap, gaap all should match
|
||||
if normalized.startswith('us-gaap_') or normalized.startswith('usgaap_'):
|
||||
normalized = 'us-gaap_' + normalized.split('_', 1)[1]
|
||||
elif normalized.startswith('gaap_'):
|
||||
normalized = 'us-gaap_' + normalized.split('_', 1)[1]
|
||||
|
||||
return normalized
|
||||
|
||||
def _labels_match(self, label1: str, label2: str) -> bool:
|
||||
"""Check if two labels represent the same financial item (fallback for non-concept matching)"""
|
||||
if not label1 or not label2:
|
||||
return False
|
||||
|
||||
# For XBRL concepts in templates, don't try to match against labels
|
||||
if ':' in label2 or '_gaap_' in label2.lower():
|
||||
return False
|
||||
|
||||
# Use existing normalization logic for label matching
|
||||
norm1 = self._normalize_concept(label1)
|
||||
norm2 = self._normalize_concept(label2)
|
||||
|
||||
# Exact match
|
||||
if norm1 == norm2:
|
||||
return True
|
||||
|
||||
# Fuzzy matching for similar concepts
|
||||
similarity = fuzz.ratio(norm1, norm2) / 100.0
|
||||
return similarity > 0.7
|
||||
|
||||
def _concepts_match(self, concept1: str, concept2: str) -> bool:
    """Check if two concepts represent the same financial item"""
    # Compare normalized forms so case/punctuation differences don't matter.
    left = self._normalize_concept(concept1)
    right = self._normalize_concept(concept2)

    if left == right:
        return True

    # Fuzzy matching for similar concepts; threshold deliberately lowered
    # for better recall across filings.
    score = fuzz.ratio(left, right) / 100.0
    return score > 0.7
|
||||
|
||||
def _normalize_concept(self, concept: str) -> str:
|
||||
"""Normalize concept for comparison"""
|
||||
if not concept:
|
||||
return ""
|
||||
|
||||
# Remove common variations
|
||||
normalized = concept.lower()
|
||||
normalized = re.sub(r'\s+', ' ', normalized) # Normalize whitespace
|
||||
normalized = re.sub(r'[,\.]', '', normalized) # Remove punctuation
|
||||
normalized = re.sub(r'\(.*?\)', '', normalized) # Remove parenthetical
|
||||
normalized = re.sub(r'\bexpense\b', '', normalized) # Remove 'expense' suffix
|
||||
normalized = re.sub(r'\bincome\b', '', normalized) # Remove 'income' suffix for matching
|
||||
return normalized.strip()
|
||||
|
||||
|
||||
class ReferenceOrderingStrategy:
    """Extract ordering from reference statement"""

    def establish_reference_order(self, statements: List[Dict]) -> Dict[str, float]:
        """Establish reference ordering from best available statement"""
        if not statements:
            return {}

        # Statements arrive ordered newest-first, so the first one serves
        # as the reference for concept ordering.
        newest = statements[0]

        order: Dict[str, float] = {}
        for position, row in enumerate(newest.get('data', [])):
            concept = row.get('concept')
            if not concept:
                continue
            # Index by concept, and by label too when present, so lookups
            # succeed whichever key the stitcher happens to use.
            order[concept] = float(position)
            label = row.get('label')
            if label:
                order[label] = float(position)

        return order
|
||||
|
||||
|
||||
class SemanticPositioning:
    """Position concepts based on financial statement semantics.

    Fallback ordering strategy: when a concept appears in neither the
    statement template nor the reference statement, a sort position is
    inferred from what the concept appears to mean (revenue vs. expense
    vs. equity, ...), from parent/child label relationships, or from
    textual similarity to already-positioned concepts.
    """

    def __init__(self, statement_type: str):
        # statement_type is e.g. "IncomeStatement" or "BalanceSheet";
        # any other value yields empty section defaults below.
        self.statement_type = statement_type
        self.section_defaults = self._get_section_defaults()

    def _get_section_defaults(self) -> Dict[str, float]:
        """Default positions for each section when no other guidance available"""
        # Values deliberately sit between the template section bases
        # (0, 200, 400, ...) used elsewhere, so defaults slot new items
        # inside rather than on top of template sections.
        if self.statement_type == "IncomeStatement":
            return {
                "revenue": 50.0,
                "cost": 150.0,
                "gross_profit": 250.0,
                "expense": 350.0,
                "operating_income": 450.0,
                "non_operating": 550.0,
                "pretax_income": 650.0,
                "tax": 750.0,
                "net_income": 850.0,
                "per_share": 950.0
            }
        elif self.statement_type == "BalanceSheet":
            return {
                "current_assets": 100.0,
                "noncurrent_assets": 300.0,
                "current_liabilities": 500.0,
                "noncurrent_liabilities": 700.0,
                "equity": 900.0
            }
        return {}

    def infer_position(self, concept: str, existing_order: Dict[str, float]) -> float:
        """Infer semantic position for a new concept.

        Tries three strategies in priority order and returns the first hit:
        section classification, parent-concept adjacency, then fuzzy
        similarity; otherwise parks the concept at the end (999.0).
        """
        # Rule-based positioning: classify into a statement section.
        section = self._classify_concept_section(concept)
        if section:
            return self._position_in_section(concept, section, existing_order)

        # Parent-child relationship positioning (e.g. "Software Revenue"
        # goes just after "Revenue").
        parent = self._find_parent_concept(concept, existing_order)
        if parent:
            return existing_order[parent] + 0.1  # Just after parent

        # Similarity-based positioning: ride along with the closest match.
        similar_concept = self._find_most_similar_concept(concept, existing_order)
        if similar_concept:
            return existing_order[similar_concept] + 0.1

        # Default to end of the statement.
        return 999.0

    def _classify_concept_section(self, concept: str) -> Optional[str]:
        """Classify concept into financial statement section.

        Keyword-based classification over the lowercased concept/label text.
        NOTE: branch order encodes precedence — e.g. "cost of revenue" must
        hit the cost branch, not the revenue branch — so the elif chain must
        not be reordered.
        """
        if not concept:
            return None

        concept_lower = concept.lower()

        if self.statement_type == "IncomeStatement":
            # Revenue indicators (excluding cost/expense phrases)
            if any(term in concept_lower for term in ['revenue', 'sales']) and not any(term in concept_lower for term in ['cost', 'expense']):
                return "revenue"
            # Cost indicators
            elif any(term in concept_lower for term in ['cost of', 'cogs']):
                return "cost"
            # Gross profit
            elif 'gross profit' in concept_lower or 'gross margin' in concept_lower:
                return "gross_profit"
            # Operating expenses
            elif any(term in concept_lower for term in ['r&d', 'research', 'selling', 'administrative', 'marketing']) or ('expense' in concept_lower and 'tax' not in concept_lower):
                return "expense"
            # Operating income
            elif 'operating income' in concept_lower or 'operating profit' in concept_lower:
                return "operating_income"
            # Non-operating items
            elif any(term in concept_lower for term in ['interest', 'other income', 'nonoperating']):
                return "non_operating"
            # Pre-tax income
            elif 'before tax' in concept_lower or 'pretax' in concept_lower:
                return "pretax_income"
            # Tax
            elif 'tax' in concept_lower and 'expense' in concept_lower:
                return "tax"
            # Net income
            elif 'net income' in concept_lower or 'net earnings' in concept_lower:
                return "net_income"
            # Per share data
            elif any(term in concept_lower for term in ['per share', 'earnings per', 'shares outstanding']):
                return "per_share"

        elif self.statement_type == "BalanceSheet":
            if any(term in concept_lower for term in ['cash', 'receivable', 'inventory', 'prepaid']) or ('current' in concept_lower and 'asset' in concept_lower):
                return "current_assets"
            elif any(term in concept_lower for term in ['property', 'equipment', 'goodwill', 'intangible']) or ('asset' in concept_lower and 'current' not in concept_lower):
                return "noncurrent_assets"
            elif any(term in concept_lower for term in ['payable', 'accrued']) or ('current' in concept_lower and 'liabilit' in concept_lower):
                return "current_liabilities"
            elif 'debt' in concept_lower or ('liabilit' in concept_lower and 'current' not in concept_lower):
                return "noncurrent_liabilities"
            elif any(term in concept_lower for term in ['equity', 'stock', 'retained earnings', 'capital']):
                return "equity"

        return None

    def _position_in_section(self, concept: str, section: str, existing_order: Dict[str, float]) -> float:
        """Position concept within its identified section.

        Re-classifies every already-positioned key to find the section's
        members, then appends the new concept just after the last of them.
        """
        section_concepts = [
            (label, pos) for label, pos in existing_order.items()
            if self._classify_concept_section(label) == section
        ]

        if not section_concepts:
            # Section doesn't exist yet - use template defaults
            return self.section_defaults.get(section, 999.0)

        # Find best position within section
        section_concepts.sort(key=lambda x: x[1])  # Sort by position

        # Simple strategy: place at end of section
        last_pos = section_concepts[-1][1]
        return last_pos + 0.1

    def _find_parent_concept(self, concept: str, existing_order: Dict[str, float]) -> Optional[str]:
        """Find parent concept in hierarchy.

        A parent is an existing concept whose words are a strict subset of
        the new concept's words, or whose full text is contained in the new
        concept (e.g. "Revenue" is a parent of "Software Revenue").
        """
        if not concept:
            return None

        # Look for hierarchical relationships
        # e.g., "Software Revenue" -> "Revenue"
        concept_words = set(concept.lower().split())

        candidates = []
        for existing_concept in existing_order.keys():
            if not existing_concept:
                continue

            existing_words = set(existing_concept.lower().split())

            # Check if existing concept is a parent (subset of words)
            # Also check for common patterns like "expense" being a parent of "X expense"
            if (existing_words.issubset(concept_words) and len(existing_words) < len(concept_words)) or \
               (existing_concept.lower() in concept.lower() and existing_concept.lower() != concept.lower()):
                candidates.append((existing_concept, len(existing_words)))

        if candidates:
            # Return the most specific parent (most words in common)
            return max(candidates, key=lambda x: x[1])[0]

        return None

    def _find_most_similar_concept(self, concept: str, existing_order: Dict[str, float]) -> Optional[str]:
        """Find most similar existing concept.

        Linear fuzzy-ratio scan over all positioned concepts; only matches
        above a 0.5 similarity floor are considered.
        """
        if not concept:
            return None

        best_match = None
        best_similarity = 0.0

        for existing_concept in existing_order.keys():
            if not existing_concept:
                continue

            similarity = fuzz.ratio(concept.lower(), existing_concept.lower()) / 100.0
            if similarity > best_similarity and similarity > 0.5:  # Minimum threshold
                best_similarity = similarity
                best_match = existing_concept

        return best_match
|
||||
|
||||
|
||||
class StatementOrderingManager:
    """Manages consistent ordering across multi-period statements.

    Combines four ordering strategies, in decreasing priority:
      1. template positions (FinancialStatementTemplates),
      2. the reference (newest) statement's own ordering,
      3. semantic inference for orphan concepts, and
      4. a consolidation pass that keeps template sections contiguous.
    The result maps every concept/label to a float sort key.
    """

    def __init__(self, statement_type: str):
        # statement_type drives template and semantic-rule selection.
        self.statement_type = statement_type
        self.templates = FinancialStatementTemplates()
        self.reference_strategy = ReferenceOrderingStrategy()
        self.semantic_positioning = SemanticPositioning(statement_type)

    def determine_ordering(self, statements: List[Dict]) -> Dict[str, float]:
        """
        Determine unified ordering for all concepts across statements.

        Returns:
            Dict mapping concept -> sort_key (float for interpolation)
        """
        if not statements:
            return {}

        all_concepts = self._extract_all_concepts(statements)

        # Strategy 1: Template-based ordering (highest priority)
        template_positioned = self._apply_template_ordering(all_concepts, statements)

        # Strategy 2: Reference statement ordering for non-template items
        reference_positioned = self._apply_reference_ordering(
            all_concepts, statements, template_positioned
        )

        # Strategy 3: Semantic positioning for orphan concepts
        semantic_positioned = self._apply_semantic_positioning(
            all_concepts, template_positioned, reference_positioned
        )

        # Strategy 4: Section-aware consolidation to maintain template groupings
        final_ordering = self._consolidate_section_ordering(
            semantic_positioned, template_positioned, statements
        )

        return final_ordering

    def _extract_all_concepts(self, statements: List[Dict]) -> set:
        """Extract all unique concepts from statements.

        Both concept IDs and display labels are collected, because the
        stitcher may key rows by either.
        """
        all_concepts = set()

        for statement in statements:
            for item in statement.get('data', []):
                concept = item.get('concept')
                label = item.get('label')
                if concept:
                    all_concepts.add(concept)
                if label:
                    all_concepts.add(label)

        return all_concepts

    def _apply_template_ordering(self, concepts: set, statements: List[Dict]) -> Dict[str, float]:
        """Apply template-based ordering for known concepts using concept-first matching.

        When a position is found for a concept, the same position is mirrored
        onto its display label (and vice versa) so the result is stable no
        matter which key the stitcher uses.
        """
        template_order = {}

        # Build a mapping of concepts/labels to their actual XBRL concepts for better matching
        concept_to_xbrl = {}
        label_to_xbrl = {}

        for statement in statements:
            for item in statement.get('data', []):
                concept = item.get('concept')
                label = item.get('label')

                if concept and label:
                    concept_to_xbrl[concept] = concept
                    label_to_xbrl[label] = concept
                elif concept:
                    concept_to_xbrl[concept] = concept

        # Apply template ordering with concept priority
        for concept_or_label in concepts:
            # Determine if this is a concept or label
            is_concept = concept_or_label in concept_to_xbrl
            is_label = concept_or_label in label_to_xbrl

            # Get the actual XBRL concept and label for this item
            if is_concept:
                xbrl_concept = concept_or_label
                # Try to find the corresponding label
                # NOTE(review): linear scan over all statement rows for each
                # concept — O(concepts * rows); fine for statement-sized data.
                corresponding_label = None
                for stmt in statements:
                    for item in stmt.get('data', []):
                        if item.get('concept') == concept_or_label:
                            corresponding_label = item.get('label')
                            break
                    if corresponding_label:
                        break
            elif is_label:
                xbrl_concept = label_to_xbrl.get(concept_or_label)
                corresponding_label = concept_or_label
            else:
                # Neither concept nor label found in mappings
                xbrl_concept = None
                corresponding_label = concept_or_label

            # Try concept-based matching first, then label-based
            template_pos = self.templates.get_template_position(
                item_concept=xbrl_concept,
                item_label=corresponding_label,
                statement_type=self.statement_type
            )

            if template_pos is not None:
                template_order[concept_or_label] = template_pos

                # IMPORTANT: If we found a template position for a concept,
                # also apply it to the corresponding label (and vice versa)
                # This ensures consistent ordering regardless of whether the
                # stitcher uses concept or label as the key
                if is_concept and corresponding_label and corresponding_label in concepts:
                    template_order[corresponding_label] = template_pos
                elif is_label and xbrl_concept and xbrl_concept in concepts:
                    template_order[xbrl_concept] = template_pos

        return template_order

    def _apply_reference_ordering(self, concepts: set, statements: List[Dict],
                                  template_positioned: Dict[str, float]) -> Dict[str, float]:
        """Apply reference statement ordering for remaining concepts.

        Template positions win: reference positions only fill gaps.
        """
        reference_order = self.reference_strategy.establish_reference_order(statements)

        combined_order = template_positioned.copy()

        for concept in concepts:
            if concept not in combined_order and concept in reference_order:
                combined_order[concept] = reference_order[concept]

        return combined_order

    def _apply_semantic_positioning(self, concepts: set, template_positioned: Dict[str, float],
                                    reference_positioned: Dict[str, float]) -> Dict[str, float]:
        """Apply semantic positioning for orphan concepts.

        Anything not placed by the template or reference passes is positioned
        by SemanticPositioning heuristics.
        """
        final_order = reference_positioned.copy()

        # Position remaining concepts using semantic rules
        for concept in concepts:
            if concept not in final_order:
                semantic_pos = self.semantic_positioning.infer_position(concept, final_order)
                final_order[concept] = semantic_pos

        return final_order

    def _consolidate_section_ordering(self, semantic_positioned: Dict[str, float],
                                      template_positioned: Dict[str, float],
                                      statements: List[Dict]) -> Dict[str, float]:
        """
        Consolidate ordering to maintain template section groupings.

        This prevents reference ordering from breaking up logical template sections
        like per-share data (EPS + Shares Outstanding).
        """
        # Identify template sections and their concepts
        template_sections = self._identify_template_sections(template_positioned)

        # Separate template-positioned from non-template items
        template_items = {}
        non_template_items = {}

        for concept, position in semantic_positioned.items():
            if concept in template_positioned:
                template_items[concept] = position
            else:
                non_template_items[concept] = position

        # Re-organize to ensure section integrity
        final_ordering = {}

        # Process template sections in order
        for section_name, section_concepts in template_sections.items():
            # Find all template items (concepts and labels) that belong to this section
            section_template_items = []

            for concept in section_concepts:
                if concept in template_items:
                    section_template_items.append(concept)

            # Also find labels that correspond to concepts in this section
            # by checking if any template_items have the same template position
            section_template_positions = set()
            for concept in section_concepts:
                if concept in template_positioned:
                    section_template_positions.add(template_positioned[concept])

            # Find labels that have the same template positions as section concepts
            for item, pos in template_items.items():
                if pos in section_template_positions and item not in section_template_items:
                    section_template_items.append(item)

            if section_template_items:
                # Use the template base position for this section to ensure strong grouping
                section_base_pos = self._get_section_base_position(section_name)

                # For critical sections like per_share, use an even stronger override
                if section_name == "per_share":
                    # Force per-share items to be at the very end, regardless of hierarchy
                    section_base_pos = 950.0

                # Ensure all items in this section stay grouped together
                # (0.1 spacing keeps relative order from template_items).
                for i, item in enumerate(sorted(section_template_items,
                                                key=lambda x: template_items.get(x, 999.0))):
                    final_ordering[item] = section_base_pos + i * 0.1

        # Add non-template items, adjusting positions to avoid breaking template sections
        section_ranges = self._get_section_ranges(final_ordering, template_sections)

        for concept, position in non_template_items.items():
            # Find appropriate insertion point that doesn't break template sections
            adjusted_position = self._find_insertion_point(position, section_ranges)
            final_ordering[concept] = adjusted_position

        return final_ordering

    def _get_section_base_position(self, section_name: str) -> float:
        """Get the base position for a template section.

        Returns 999.0 (end of statement) for unknown sections or
        unsupported statement types.
        """
        if self.statement_type == "IncomeStatement":
            template = self.templates.INCOME_STATEMENT_TEMPLATE
        elif self.statement_type == "BalanceSheet":
            template = self.templates.BALANCE_SHEET_TEMPLATE
        else:
            return 999.0

        for base_pos, name, _concepts in template:
            if name == section_name:
                return float(base_pos)

        return 999.0

    def _identify_template_sections(self, template_positioned: Dict[str, float]) -> Dict[str, List[str]]:
        """Identify which concepts belong to which template sections.

        Returns a mapping of section name -> list of positioned concepts
        that matched a template entry in that section.
        """
        sections = {}

        # Get the template for this statement type
        if self.statement_type == "IncomeStatement":
            template = self.templates.INCOME_STATEMENT_TEMPLATE
        elif self.statement_type == "BalanceSheet":
            template = self.templates.BALANCE_SHEET_TEMPLATE
        else:
            return {}

        # Build mapping of concepts to sections
        for _base_pos, section_name, template_concepts in template:
            section_concepts = []

            for concept in template_positioned.keys():
                # Check if this concept matches any template concept in this section
                for template_concept in template_concepts:
                    if self._concept_matches_template(concept, template_concept):
                        section_concepts.append(concept)
                        break

            if section_concepts:
                sections[section_name] = section_concepts

        return sections

    def _concept_matches_template(self, concept: str, template_concept: str) -> bool:
        """Check if a concept matches a template concept.

        XBRL-style template entries use exact normalized comparison;
        plain-label entries use fuzzy label matching.
        """
        # For XBRL concepts, do direct comparison
        if ':' in template_concept or '_gaap_' in template_concept.lower():
            return self._normalize_xbrl_concept(concept) == self._normalize_xbrl_concept(template_concept)

        # For labels, use fuzzy matching
        return self._labels_match(concept, template_concept)

    def _get_section_ranges(self, final_ordering: Dict[str, float],
                            template_sections: Dict[str, List[str]]) -> List[Tuple[float, float, str]]:
        """Get the position ranges occupied by each template section.

        Returns (min_pos, max_pos, section_name) tuples sorted by position.
        """
        ranges = []

        for section_name, concepts in template_sections.items():
            section_positions = [final_ordering[c] for c in concepts if c in final_ordering]

            if section_positions:
                min_pos = min(section_positions)
                max_pos = max(section_positions)
                ranges.append((min_pos, max_pos, section_name))

        return sorted(ranges)

    def _find_insertion_point(self, desired_position: float,
                              section_ranges: List[Tuple[float, float, str]]) -> float:
        """Find appropriate insertion point that doesn't break template sections.

        If the desired position falls inside a section's range, the item is
        nudged just outside the section instead.
        """
        # Check if desired position conflicts with any template section
        for min_pos, max_pos, section_name in section_ranges:
            if min_pos <= desired_position <= max_pos:
                # Position conflicts with a template section
                # Place it just before the section (unless it should logically be after)

                # Special handling for per-share section
                if section_name == "per_share" and desired_position < min_pos:
                    # Items that should come before per-share data
                    # NOTE(review): this branch is unreachable as written
                    # (desired_position < min_pos contradicts the enclosing
                    # range check) — confirm intended condition.
                    return min_pos - 1.0
                else:
                    # Place after the section
                    return max_pos + 1.0

        # No conflicts, use desired position
        return desired_position

    def _normalize_xbrl_concept(self, concept: str) -> str:
        """Delegate to templates class for concept normalization"""
        return self.templates._normalize_xbrl_concept(concept)

    def _labels_match(self, label1: str, label2: str) -> bool:
        """Delegate to templates class for label matching"""
        return self.templates._labels_match(label1, label2)
|
||||
@@ -0,0 +1,547 @@
|
||||
"""
|
||||
XBRL Statement Stitching - Period Optimization (Refactored)
|
||||
|
||||
This module provides functionality to determine optimal periods for stitching
|
||||
statements across multiple XBRL filings, handling period selection and
|
||||
fiscal period matching.
|
||||
|
||||
Refactored to use a clean class-based architecture for better maintainability,
|
||||
testability, and extensibility.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from dataclasses import dataclass
|
||||
from datetime import date
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
|
||||
from edgar.xbrl.core import format_date, parse_date
|
||||
from edgar.xbrl.xbrl import XBRL
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
class PeriodSelectionConfig:
    """Configuration for period selection behavior.

    Duration ranges are inclusive (min_days, max_days) windows used to
    classify reporting periods; target durations rank candidates within a
    window (closest to target wins).
    """

    # Duration ranges for different period types
    annual_duration_range: Tuple[int, int] = (350, 380)
    quarterly_duration_range: Tuple[int, int] = (80, 100)
    q2_ytd_range: Tuple[int, int] = (175, 190)   # ~6 months year-to-date
    q3_ytd_range: Tuple[int, int] = (260, 285)   # ~9 months year-to-date
    q4_annual_range: Tuple[int, int] = (350, 380)  # Q4 YTD == full fiscal year

    # Target durations for optimization
    target_annual_days: int = 365
    target_quarterly_days: int = 90
    target_q2_ytd_days: int = 180
    target_q3_ytd_days: int = 270

    # Behavior flags
    # NOTE(review): the two boolean flags are not read anywhere in this
    # module's visible code — presumably consumed by the period selectors;
    # confirm before relying on them.
    require_exact_matches: bool = True
    allow_fallback_when_no_doc_date: bool = True
    max_periods_default: int = 8
|
||||
|
||||
|
||||
class PeriodMatcher:
    """Handles exact period matching logic"""

    def __init__(self, config: PeriodSelectionConfig):
        self.config = config

    def find_exact_instant_match(self, periods: List[Dict], target_date: date) -> Optional[Dict]:
        """Find instant period that exactly matches target date"""
        for candidate in periods:
            try:
                if parse_date(candidate['date']) == target_date:
                    return candidate
            except (ValueError, TypeError) as e:
                # Skip malformed entries rather than aborting the search.
                logger.warning("Failed to parse period date '%s': %s", candidate.get('date'), e)
        return None

    def find_exact_duration_match(self, periods: List[Dict], target_date: date) -> Optional[Dict]:
        """Find duration period that ends exactly on target date"""
        for candidate in periods:
            try:
                if parse_date(candidate['end_date']) == target_date:
                    return candidate
            except (ValueError, TypeError) as e:
                # Skip malformed entries rather than aborting the search.
                logger.warning("Failed to parse period end date '%s': %s", candidate.get('end_date'), e)
        return None

    def filter_by_duration_range(self, periods: List[Dict], min_days: int, max_days: int, target_days: int) -> List[Dict]:
        """Filter periods by duration and sort by proximity to target"""
        in_range: List[Dict] = []

        for candidate in periods:
            duration = candidate.get('duration_days')
            if duration is None:
                # Duration not precomputed: derive it from start/end dates,
                # writing the result onto a copy so the caller's dict is
                # left untouched.
                try:
                    begin = parse_date(candidate['start_date'])
                    finish = parse_date(candidate['end_date'])
                    duration = (finish - begin).days
                    candidate = candidate.copy()
                    candidate['duration_days'] = duration
                except (ValueError, TypeError) as e:
                    logger.warning("Failed to calculate duration for period: %s", e)
                    continue

            if min_days <= duration <= max_days:
                in_range.append(candidate)

        # Closest-to-target durations come first.
        in_range.sort(key=lambda entry: abs(entry['duration_days'] - target_days))
        return in_range
|
||||
|
||||
|
||||
class FiscalPeriodClassifier:
|
||||
"""Classifies and filters periods based on fiscal information"""
|
||||
|
||||
def __init__(self, config: PeriodSelectionConfig):
|
||||
self.config = config
|
||||
|
||||
def classify_annual_periods(self, periods: List[Dict]) -> List[Dict]:
|
||||
"""Identify annual periods (350-380 days)"""
|
||||
min_days, max_days = self.config.annual_duration_range
|
||||
target_days = self.config.target_annual_days
|
||||
|
||||
annual_periods = []
|
||||
for period in periods:
|
||||
duration_days = period.get('duration_days', 0)
|
||||
if min_days <= duration_days <= max_days:
|
||||
annual_periods.append(period)
|
||||
|
||||
# Sort by proximity to target annual duration
|
||||
annual_periods.sort(key=lambda x: abs(x.get('duration_days', 0) - target_days))
|
||||
return annual_periods
|
||||
|
||||
def classify_quarterly_periods(self, periods: List[Dict]) -> List[Dict]:
|
||||
"""Identify quarterly periods (80-100 days)"""
|
||||
min_days, max_days = self.config.quarterly_duration_range
|
||||
target_days = self.config.target_quarterly_days
|
||||
|
||||
quarterly_periods = []
|
||||
for period in periods:
|
||||
duration_days = period.get('duration_days', 0)
|
||||
if min_days <= duration_days <= max_days:
|
||||
quarterly_periods.append(period)
|
||||
|
||||
# Sort by proximity to target quarterly duration
|
||||
quarterly_periods.sort(key=lambda x: abs(x.get('duration_days', 0) - target_days))
|
||||
return quarterly_periods
|
||||
|
||||
def classify_ytd_periods(self, periods: List[Dict], fiscal_period: str) -> List[Dict]:
|
||||
"""Identify YTD periods based on fiscal quarter"""
|
||||
if fiscal_period not in ['Q2', 'Q3', 'Q4']:
|
||||
return []
|
||||
|
||||
# Get expected duration range for this fiscal period
|
||||
duration_ranges = {
|
||||
'Q2': self.config.q2_ytd_range,
|
||||
'Q3': self.config.q3_ytd_range,
|
||||
'Q4': self.config.q4_annual_range
|
||||
}
|
||||
|
||||
target_durations = {
|
||||
'Q2': self.config.target_q2_ytd_days,
|
||||
'Q3': self.config.target_q3_ytd_days,
|
||||
'Q4': self.config.target_annual_days
|
||||
}
|
||||
|
||||
min_days, max_days = duration_ranges[fiscal_period]
|
||||
target_days = target_durations[fiscal_period]
|
||||
|
||||
ytd_periods = []
|
||||
for period in periods:
|
||||
duration_days = period.get('duration_days', 0)
|
||||
if min_days <= duration_days <= max_days:
|
||||
ytd_periods.append(period)
|
||||
|
||||
# Sort by proximity to target duration
|
||||
ytd_periods.sort(key=lambda x: abs(x.get('duration_days', 0) - target_days))
|
||||
return ytd_periods
|
||||
|
||||
def get_expected_durations(self, fiscal_period: str) -> Dict[str, Tuple[int, int]]:
    """Map a fiscal period code to its expected duration range(s) in days.

    'FY' yields only the annual range; quarterly codes yield a quarterly
    range plus, for Q2-Q4, the matching year-to-date range. Unrecognized
    codes yield an empty dict.
    """
    if fiscal_period == 'FY':
        return {'annual': self.config.annual_duration_range}
    if fiscal_period not in ('Q1', 'Q2', 'Q3', 'Q4'):
        return {}

    result = {'quarterly': self.config.quarterly_duration_range}
    ytd_ranges = {
        'Q2': self.config.q2_ytd_range,
        'Q3': self.config.q3_ytd_range,
        'Q4': self.config.q4_annual_range,
    }
    if fiscal_period in ytd_ranges:
        result['ytd'] = ytd_ranges[fiscal_period]
    return result
|
||||
|
||||
|
||||
class StatementTypeSelector:
    """Handles statement-specific period selection logic.

    Balance sheets use instant periods; income and cash flow statements
    use duration periods classified by the injected FiscalPeriodClassifier.
    """

    def __init__(self, matcher: PeriodMatcher, classifier: FiscalPeriodClassifier):
        # matcher: finds exact-date matches against document_period_end_date.
        # classifier: buckets durations into annual / quarterly / YTD groups.
        self.matcher = matcher
        self.classifier = classifier

    def select_balance_sheet_periods(self, xbrl: XBRL, doc_period_end_date: Optional[date]) -> List[Dict]:
        """Select instant periods for balance sheets.

        Returns at most one period: the exact match for the document period
        end date when it is known, otherwise the most recent instant period.
        """
        # Filter for instant periods only
        instant_periods = [p for p in xbrl.reporting_periods if p['type'] == 'instant']

        if not instant_periods:
            return []

        # If we have document_period_end_date, find exact match
        if doc_period_end_date:
            exact_match = self.matcher.find_exact_instant_match(instant_periods, doc_period_end_date)
            if exact_match:
                return [exact_match]
            else:
                # No exact match found - don't use fallback to prevent fiscal year boundary issues
                logger.info("No exact instant period match found for %s", doc_period_end_date)
                return []

        # No document_period_end_date available - use most recent period
        instant_periods.sort(key=lambda x: x['date'], reverse=True)
        return [instant_periods[0]]

    def select_income_statement_periods(self, xbrl: XBRL, doc_period_end_date: Optional[date],
                                        fiscal_period: str) -> List[Dict]:
        """Select duration periods for income statements."""
        return self._select_duration_periods(xbrl, doc_period_end_date, fiscal_period)

    def select_cash_flow_periods(self, xbrl: XBRL, doc_period_end_date: Optional[date],
                                 fiscal_period: str) -> List[Dict]:
        """Select duration periods for cash flow statements."""
        return self._select_duration_periods(xbrl, doc_period_end_date, fiscal_period)

    def _select_duration_periods(self, xbrl: XBRL, doc_period_end_date: Optional[date],
                                 fiscal_period: str) -> List[Dict]:
        """Common logic for selecting duration periods.

        Enriches each duration period with 'duration_days', then either
        matches periods ending exactly on the document period end date or,
        when that date is unknown, falls back to heuristic selection.
        """
        # Filter for duration periods only
        duration_periods = [p for p in xbrl.reporting_periods if p['type'] == 'duration']

        if not duration_periods:
            return []

        # Add duration_days to all periods (copies, so originals are untouched)
        enriched_periods = []
        for period in duration_periods:
            try:
                start_date = parse_date(period['start_date'])
                end_date = parse_date(period['end_date'])
                period_copy = period.copy()
                period_copy['duration_days'] = (end_date - start_date).days
                enriched_periods.append(period_copy)
            except (ValueError, TypeError) as e:
                # Unparseable periods are dropped rather than guessed at.
                logger.warning("Failed to parse period dates: %s", e)
                continue

        if not enriched_periods:
            return []

        # If we have document_period_end_date, find periods that end exactly on that date
        if doc_period_end_date:
            matching_periods = []
            for period in enriched_periods:
                try:
                    end_date = parse_date(period['end_date'])
                    if end_date == doc_period_end_date:
                        matching_periods.append(period)
                except (ValueError, TypeError):
                    continue

            if matching_periods:
                return self._select_appropriate_durations(matching_periods, fiscal_period)
            else:
                # No exact match found - don't use fallback
                logger.info("No exact duration period match found for %s", doc_period_end_date)
                return []

        # No document_period_end_date - use fallback logic
        return self._select_fallback_periods(enriched_periods, fiscal_period)

    def _select_appropriate_durations(self, periods: List[Dict], fiscal_period: str) -> List[Dict]:
        """Select appropriate duration periods based on fiscal period.

        Annual filings contribute one annual-length period; quarterly
        filings contribute the best quarterly period plus, for Q2-Q4,
        the best year-to-date period.
        """
        selected_periods = []

        is_annual = fiscal_period == 'FY'

        if is_annual:
            # For annual reports, select annual periods
            annual_periods = self.classifier.classify_annual_periods(periods)
            if annual_periods:
                selected_periods.append(annual_periods[0])
        else:
            # For quarterly reports, select quarterly period
            quarterly_periods = self.classifier.classify_quarterly_periods(periods)
            if quarterly_periods:
                selected_periods.append(quarterly_periods[0])

            # Also select YTD period if appropriate
            ytd_periods = self.classifier.classify_ytd_periods(periods, fiscal_period)
            if ytd_periods:
                selected_periods.append(ytd_periods[0])

        return selected_periods

    def _select_fallback_periods(self, periods: List[Dict], fiscal_period: str) -> List[Dict]:
        """Fallback period selection when no document_period_end_date is available.

        Prefers the most recent period in the expected duration class; if an
        annual filing has no annual-length period, falls back to the most
        recent duration of any length.
        """
        is_annual = fiscal_period == 'FY'

        if is_annual:
            # For annual reports, prefer periods closest to 365 days
            annual_periods = self.classifier.classify_annual_periods(periods)
            if annual_periods:
                # Sort by end date and take the most recent
                annual_periods.sort(key=lambda x: x['end_date'], reverse=True)
                return [annual_periods[0]]
        else:
            # For quarterly reports, prefer quarterly duration
            quarterly_periods = self.classifier.classify_quarterly_periods(periods)
            selected_periods = []

            if quarterly_periods:
                quarterly_periods.sort(key=lambda x: x['end_date'], reverse=True)
                selected_periods.append(quarterly_periods[0])

            # Add YTD period if available
            ytd_periods = self.classifier.classify_ytd_periods(periods, fiscal_period)
            if ytd_periods:
                ytd_periods.sort(key=lambda x: x['end_date'], reverse=True)
                selected_periods.append(ytd_periods[0])

            return selected_periods

        # If no appropriate periods found, return the most recent period
        # NOTE: sorts the caller-supplied list in place (enriched copies upstream).
        periods.sort(key=lambda x: x['end_date'], reverse=True)
        return [periods[0]]
|
||||
|
||||
|
||||
class PeriodMetadataEnricher:
    """Builds enriched metadata records for reporting periods."""

    def enrich_period_metadata(self, period: Dict, xbrl_index: int, entity_info: Dict,
                               doc_period_end_date: Optional[date], fiscal_period: str,
                               fiscal_year: str) -> Dict[str, Any]:
        """Combine a reporting period with its filing context into one record.

        Instant periods gain a parsed 'date'; duration periods gain parsed
        'start_date'/'end_date' plus 'duration_days' (computed when the
        period does not already carry it). 'display_date' is always the
        formatted instant/end date.
        """
        enriched = {
            'xbrl_index': xbrl_index,
            'period_key': period['key'],
            'period_label': period['label'],
            'period_type': period['type'],
            'entity_info': entity_info,
            'doc_period_end_date': doc_period_end_date,
            'fiscal_period': fiscal_period,
            'fiscal_year': fiscal_year,
        }

        if period['type'] == 'instant':
            instant = parse_date(period['date'])
            enriched['date'] = instant
            enriched['display_date'] = format_date(instant)
        else:
            # Duration period: derive the span length when it is missing.
            start = parse_date(period['start_date'])
            end = parse_date(period['end_date'])
            enriched['start_date'] = start
            enriched['end_date'] = end
            enriched['duration_days'] = period.get('duration_days', (end - start).days)
            enriched['display_date'] = format_date(end)

        return enriched
|
||||
|
||||
|
||||
class PeriodDeduplicator:
    """Deduplicates, orders, and truncates lists of period metadata dicts."""

    @staticmethod
    def _is_duplicate(candidate: Dict, existing: Dict) -> bool:
        """True when two periods share a period_type and the same key date."""
        if candidate['period_type'] != existing['period_type']:
            return False
        field = 'date' if candidate['period_type'] == 'instant' else 'end_date'
        return candidate[field] == existing[field]

    def deduplicate_periods(self, periods: List[Dict], statement_type: str) -> List[Dict]:
        """Keep only the first period seen for each (type, exact date) pair.

        Instants compare on 'date', durations on 'end_date'; periods of
        different types never collide.
        """
        kept: List[Dict] = []
        for candidate in periods:
            if not any(self._is_duplicate(candidate, existing) for existing in kept):
                kept.append(candidate)
        return kept

    def sort_periods_chronologically(self, periods: List[Dict], statement_type: str) -> List[Dict]:
        """Return periods newest-first by 'date' (balance sheets) or 'end_date'."""
        sort_field = 'date' if statement_type == 'BalanceSheet' else 'end_date'
        return sorted(periods, key=lambda p: p[sort_field], reverse=True)

    def limit_periods(self, periods: List[Dict], max_periods: int) -> List[Dict]:
        """Truncate to at most max_periods entries, preserving input order."""
        if len(periods) <= max_periods:
            return periods
        return periods[:max_periods]
|
||||
|
||||
|
||||
class PeriodOptimizer:
    """Main orchestrator for period optimization.

    Wires together the matcher, classifier, selector, enricher and
    deduplicator, all sharing a single PeriodSelectionConfig so duration
    thresholds stay consistent across stages.
    """

    def __init__(self, config: Optional[PeriodSelectionConfig] = None):
        self.config = config or PeriodSelectionConfig()
        self.matcher = PeriodMatcher(self.config)
        self.classifier = FiscalPeriodClassifier(self.config)
        self.selector = StatementTypeSelector(self.matcher, self.classifier)
        self.enricher = PeriodMetadataEnricher()
        self.deduplicator = PeriodDeduplicator()

    def determine_optimal_periods(self, xbrl_list: List[XBRL], statement_type: str,
                                  max_periods: Optional[int] = None) -> List[Dict[str, Any]]:
        """Main entry point - orchestrates the entire process.

        Pipeline: extract candidate periods from every XBRL, enrich them
        with filing metadata, then sort, deduplicate and truncate.
        """
        max_periods = max_periods or self.config.max_periods_default

        # Step 1: Extract periods from all XBRLs
        all_periods = self._extract_all_periods(xbrl_list, statement_type)

        # Step 2: Enrich with metadata
        enriched_periods = self._enrich_with_metadata(all_periods)

        # Step 3: Deduplicate, sort, and limit
        final_periods = self._deduplicate_and_limit(enriched_periods, max_periods, statement_type)

        return final_periods

    def _extract_all_periods(self, xbrl_list: List[XBRL], statement_type: str) -> List[Dict[str, Any]]:
        """Extract periods from all XBRL objects.

        Each selected period is wrapped with the filing context (index,
        entity info, fiscal period/year) needed by the enrichment step.
        """
        all_periods = []

        for i, xbrl in enumerate(xbrl_list):
            # Skip None XBRLs (pre-XBRL era filings before 2009)
            if xbrl is None:
                continue

            # Skip XBRLs with no reporting periods
            if not xbrl.reporting_periods:
                continue

            entity_info = xbrl.entity_info or {}
            doc_period_end_date = self._parse_document_period_end_date(entity_info)
            fiscal_period = entity_info.get('fiscal_period')
            fiscal_year = entity_info.get('fiscal_year')

            # Select appropriate periods based on statement type
            selected_periods = self._select_periods_for_statement_type(
                xbrl, statement_type, doc_period_end_date, fiscal_period
            )

            # Add context information to each period
            for period in selected_periods:
                period_with_context = {
                    'period': period,
                    'xbrl_index': i,
                    'entity_info': entity_info,
                    'doc_period_end_date': doc_period_end_date,
                    'fiscal_period': fiscal_period,
                    'fiscal_year': fiscal_year
                }
                all_periods.append(period_with_context)

        return all_periods

    def _parse_document_period_end_date(self, entity_info: Dict) -> Optional[date]:
        """Parse document_period_end_date from entity_info.

        Returns None when the key is missing or the value cannot be parsed.
        """
        if 'document_period_end_date' not in entity_info:
            return None

        try:
            doc_period_end_date = entity_info['document_period_end_date']
            # Accept either a date object or anything whose str() parses as a date.
            if not isinstance(doc_period_end_date, date):
                doc_period_end_date = parse_date(str(doc_period_end_date))
            return doc_period_end_date
        except (ValueError, TypeError) as e:
            logger.warning("Failed to parse document_period_end_date: %s", e)
            return None

    def _select_periods_for_statement_type(self, xbrl: XBRL, statement_type: str,
                                           doc_period_end_date: Optional[date],
                                           fiscal_period: str) -> List[Dict]:
        """Select periods based on statement type."""
        if statement_type == 'BalanceSheet':
            return self.selector.select_balance_sheet_periods(xbrl, doc_period_end_date)
        elif statement_type in ['IncomeStatement', 'CashFlowStatement']:
            if statement_type == 'IncomeStatement':
                return self.selector.select_income_statement_periods(xbrl, doc_period_end_date, fiscal_period)
            else:
                return self.selector.select_cash_flow_periods(xbrl, doc_period_end_date, fiscal_period)
        else:
            # For other statement types, use income statement logic as default
            return self.selector.select_income_statement_periods(xbrl, doc_period_end_date, fiscal_period)

    def _enrich_with_metadata(self, all_periods: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Enrich periods with comprehensive metadata."""
        enriched_periods = []

        for period_context in all_periods:
            period = period_context['period']
            enriched_metadata = self.enricher.enrich_period_metadata(
                period,
                period_context['xbrl_index'],
                period_context['entity_info'],
                period_context['doc_period_end_date'],
                period_context['fiscal_period'],
                period_context['fiscal_year']
            )
            enriched_periods.append(enriched_metadata)

        return enriched_periods

    def _deduplicate_and_limit(self, periods: List[Dict[str, Any]], max_periods: int,
                               statement_type: str) -> List[Dict[str, Any]]:
        """Deduplicate, sort, and limit periods.

        Sorting happens first so that deduplication (which keeps the first
        occurrence of each exact date) resolves in favor of the entry that
        sorts newest.
        """
        # Sort periods chronologically
        sorted_periods = self.deduplicator.sort_periods_chronologically(periods, statement_type)

        # Remove duplicates
        deduplicated_periods = self.deduplicator.deduplicate_periods(sorted_periods, statement_type)

        # Limit to maximum number of periods
        final_periods = self.deduplicator.limit_periods(deduplicated_periods, max_periods)

        return final_periods
|
||||
|
||||
|
||||
# Main function that maintains the original API
|
||||
def determine_optimal_periods(xbrl_list: List[XBRL], statement_type: str, max_periods: int = 8) -> List[Dict[str, Any]]:
    """
    Determine the optimal periods to display for stitched statements from a list of XBRL objects.

    Analyzes entity info and reporting periods across multiple XBRL instances
    to choose the most appropriate display periods, keeping period selection
    consistent when building stitched statements. Delegates to a fresh
    PeriodOptimizer with default configuration.

    Args:
        xbrl_list: List of XBRL objects ordered chronologically
        statement_type: Type of statement ('BalanceSheet', 'IncomeStatement', etc.)
        max_periods: Maximum number of periods to return (default is 8)

    Returns:
        List of period metadata dictionaries containing information for display
    """
    return PeriodOptimizer().determine_optimal_periods(xbrl_list, statement_type, max_periods)
|
||||
@@ -0,0 +1,256 @@
|
||||
"""
|
||||
XBRL Presentation Tree - Virtual presentation tree for multi-period statements
|
||||
|
||||
This module creates a virtual presentation tree that preserves hierarchical
|
||||
relationships while applying semantic ordering within sibling groups.
|
||||
"""
|
||||
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
|
||||
@dataclass
class PresentationNode:
    """A single line item in the virtual presentation tree.

    The dataclass fields describe one concept from the stitched statement;
    `children` and `parent` are attached after construction to express the
    hierarchy.
    """

    concept: str
    label: str
    level: int
    metadata: Dict[str, Any]
    semantic_order: float = 999.0
    original_index: int = 999

    def __post_init__(self):
        # Hierarchy links are runtime state, deliberately not dataclass fields.
        self.children: List[PresentationNode] = []
        self.parent: Optional[PresentationNode] = None

    def add_child(self, child: 'PresentationNode'):
        """Attach *child* beneath this node and record the back-reference."""
        self.children.append(child)
        child.parent = self

    def sort_children(self):
        """Recursively order every sibling group by (semantic_order, original_index)."""
        self.children.sort(key=lambda node: (node.semantic_order, node.original_index))
        for node in self.children:
            node.sort_children()

    def flatten_to_list(self) -> List['PresentationNode']:
        """Return this node followed by all descendants, depth-first."""
        flattened = [self]
        for node in self.children:
            flattened.extend(node.flatten_to_list())
        return flattened
|
||||
|
||||
|
||||
class VirtualPresentationTree:
    """Builds and manages virtual presentation tree for stitched statements.

    Preserves hierarchical (level-based) relationships between concepts
    while applying semantic ordering within each sibling group. Instances
    are safe to reuse: build_tree() resets all internal state.
    """

    def __init__(self, ordering_manager=None):
        # Optional collaborator that produced the semantic ordering; kept for
        # callers that want to introspect it (not used directly here).
        self.ordering_manager = ordering_manager
        self.root_nodes: List[PresentationNode] = []
        self.all_nodes: Dict[str, PresentationNode] = {}

    def build_tree(self, concept_metadata: Dict, concept_ordering: Dict,
                   original_statement_order: Optional[List[str]] = None) -> List[PresentationNode]:
        """
        Build presentation tree from concept metadata and ordering.

        Args:
            concept_metadata: Metadata for each concept including level
            concept_ordering: Semantic ordering positions
            original_statement_order: Original order of concepts for context

        Returns:
            Flattened list of nodes in correct presentation order
        """
        # Bug fix: reset root_nodes so a reused instance does not accumulate
        # stale roots from a previous build (all_nodes was already reset in
        # _create_nodes, but root_nodes was not, so a second build_tree call
        # would flatten old and new roots together).
        self.root_nodes = []

        # Step 1: Create nodes for all concepts
        self._create_nodes(concept_metadata, concept_ordering, original_statement_order)

        # Step 2: Build parent-child relationships based on levels and context
        self._build_hierarchy(original_statement_order or [])

        # Step 3: Apply semantic ordering within sibling groups
        self._apply_semantic_ordering()

        # Step 4: Flatten tree to linear list
        return self._flatten_tree()

    def _create_nodes(self, concept_metadata: Dict, concept_ordering: Dict,
                      original_statement_order: Optional[List[str]] = None):
        """Create a PresentationNode for every concept in the metadata map."""
        self.all_nodes = {}

        for i, (concept, metadata) in enumerate(concept_metadata.items()):
            label = metadata.get('latest_label', concept)
            level = metadata.get('level', 0)
            # Ordering may be keyed by concept or by label; default far right.
            semantic_order = concept_ordering.get(concept, concept_ordering.get(label, 999.0))

            # Track original index for maintaining some original order context
            original_index = i
            if original_statement_order:
                try:
                    original_index = original_statement_order.index(concept)
                except ValueError:
                    try:
                        original_index = original_statement_order.index(label)
                    except ValueError:
                        original_index = i + 1000  # Place unknown concepts later

            node = PresentationNode(
                concept=concept,
                label=label,
                level=level,
                metadata=metadata,
                semantic_order=semantic_order,
                original_index=original_index
            )

            self.all_nodes[concept] = node

    def _build_hierarchy(self, original_order: List[str]):
        """Build parent-child relationships based on level progression and context.

        Walks the nodes in (approximately) original statement order and uses a
        stack of candidate parents: a node becomes a child of the nearest
        preceding node with a smaller level, unless section rules forbid it.
        """
        # Sort nodes by their original order to maintain context for hierarchy detection
        nodes_in_order = []

        # First, try to use original order if available
        if original_order:
            # Map concepts in original order (labels serve as a fallback key)
            concept_to_node = {node.concept: node for node in self.all_nodes.values()}
            label_to_node = {node.label: node for node in self.all_nodes.values()}

            for item in original_order:
                if item in concept_to_node:
                    nodes_in_order.append(concept_to_node[item])
                elif item in label_to_node:
                    nodes_in_order.append(label_to_node[item])

            # Add any remaining nodes not in original order
            remaining_nodes = [node for node in self.all_nodes.values()
                               if node not in nodes_in_order]
            remaining_nodes.sort(key=lambda x: x.original_index)
            nodes_in_order.extend(remaining_nodes)
        else:
            # Fall back to sorting by original index
            nodes_in_order = sorted(self.all_nodes.values(),
                                    key=lambda x: x.original_index)

        # Build hierarchy using a parent stack approach
        parent_stack = []  # Stack of potential parents, one per open level

        for node in nodes_in_order:
            current_level = node.level

            # Pop parents that are at the same level or deeper;
            # we're looking for a parent at a level less than current.
            while parent_stack and parent_stack[-1].level >= current_level:
                parent_stack.pop()

            if parent_stack:
                # Check if potential parent and child belong to compatible sections
                parent = parent_stack[-1]

                # Prevent cross-section hierarchies for critical sections like per_share
                should_be_child = self._should_be_hierarchical_child(parent, node)

                if should_be_child:
                    # Valid parent-child relationship
                    parent.add_child(node)
                else:
                    # Different sections - make this a root node instead
                    self.root_nodes.append(node)
            else:
                # No parent - this is a root node
                self.root_nodes.append(node)

            # This node could be a parent for subsequent nodes
            parent_stack.append(node)

    def _apply_semantic_ordering(self):
        """Apply semantic ordering within sibling groups."""
        # Sort root nodes by semantic order first, then original index
        self.root_nodes.sort(key=lambda x: (x.semantic_order, x.original_index))

        # Sort children within each parent recursively
        for root in self.root_nodes:
            root.sort_children()

    def _flatten_tree(self) -> List[PresentationNode]:
        """Flatten tree to a linear, depth-first list preserving hierarchy."""
        result = []

        for root in self.root_nodes:
            result.extend(root.flatten_to_list())

        return result

    def _should_be_hierarchical_child(self, parent: PresentationNode, child: PresentationNode) -> bool:
        """
        Determine if child should be hierarchically under parent based on semantic ordering.

        Prevents cross-section hierarchies that would break template section groupings.
        """
        # Get semantic ordering positions
        parent_order = parent.semantic_order
        child_order = child.semantic_order

        # If both have very specific semantic orders from templates (not defaults),
        # check if they're in similar ranges (same section)
        if parent_order < 900 and child_order < 900:
            # Both are template-positioned; allow parent-child within 200
            # points (roughly the same section)
            section_gap = abs(parent_order - child_order)
            if section_gap > 200:
                return False

        # Special case: Per-share items (900+) should never be children of early items
        if child_order >= 900 and parent_order < 800:
            return False

        # Special case: Non-operating items (500-599) should not be children of operating items
        if 500 <= child_order < 600 and parent_order < 500:
            return False

        # Special case: Revenue items should not be parents of per-share items
        if parent_order < 100 and child_order >= 900:
            return False

        # Check for semantic incompatibility based on labels
        child_label = child.label.lower()
        parent_label = parent.label.lower()

        # Per-share items should not be children of non-per-share items
        if any(term in child_label for term in ['earnings per share', 'shares outstanding']):
            if not any(term in parent_label for term in ['earnings', 'shares', 'per share']):
                return False

        # Interest expense items should not be children of non-interest items
        if 'interest expense' in child_label:
            if 'interest' not in parent_label and 'nonoperating' not in parent_label:
                return False

        # Otherwise, allow hierarchical relationship
        return True

    def debug_tree(self) -> str:
        """Generate a human-readable debug representation of the tree."""
        lines = []

        def _add_node_lines(node: PresentationNode, depth: int = 0):
            indent = "  " * depth
            lines.append(f"{indent}├─ {node.label} (level={node.level}, "
                         f"semantic={node.semantic_order:.1f}, orig={node.original_index})")

            for child in node.children:
                _add_node_lines(child, depth + 1)

        lines.append("Virtual Presentation Tree:")
        for root in self.root_nodes:
            _add_node_lines(root)

        return "\n".join(lines)
|
||||
640
venv/lib/python3.10/site-packages/edgar/xbrl/stitching/query.py
Normal file
640
venv/lib/python3.10/site-packages/edgar/xbrl/stitching/query.py
Normal file
@@ -0,0 +1,640 @@
|
||||
"""
|
||||
XBRL Statement Stitching - Query Functionality
|
||||
|
||||
This module provides query functionality for stitched XBRL facts, allowing
|
||||
users to query standardized, multi-period financial data.
|
||||
"""
|
||||
|
||||
import re
|
||||
from collections import defaultdict
|
||||
from typing import TYPE_CHECKING, Any, Dict, List, Optional
|
||||
|
||||
import pandas as pd
|
||||
from rich import box
|
||||
from rich.console import Group
|
||||
from rich.markdown import Markdown
|
||||
from rich.panel import Panel
|
||||
from rich.table import Table
|
||||
from rich.text import Text
|
||||
|
||||
from edgar.richtools import repr_rich
|
||||
from edgar.xbrl.facts import FactQuery
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from edgar.xbrl.stitching.xbrls import XBRLS
|
||||
|
||||
|
||||
class StitchedFactsView:
|
||||
"""
|
||||
A view over stitched facts from multiple XBRL filings.
|
||||
|
||||
This class extracts facts from stitched statements rather than raw XBRL facts,
|
||||
ensuring that queries operate on standardized, post-processed data.
|
||||
"""
|
||||
|
||||
def __init__(self, xbrls: 'XBRLS'):
    # The XBRLS aggregate whose stitched statements back this view.
    self.xbrls = xbrls
    # Cache of the last get_facts() result and the argument tuple that
    # produced it, so repeated identical queries skip re-extraction.
    self._facts_cache = None
    self._last_cache_key = None
|
||||
|
||||
def __len__(self):
    # Number of facts under default extraction settings.
    # NOTE(review): first call triggers a full extraction (cached thereafter).
    return len(self.get_facts())
|
||||
|
||||
@property
def entity_name(self):
    """Entity name taken from the first XBRL filing, if any."""
    filings = self.xbrls.xbrl_list
    if not filings:
        return 'Unknown Entity'
    return getattr(filings[0], 'entity_name', 'Unknown Entity')
|
||||
|
||||
@property
def document_type(self):
    """Get document type from entity info."""
    # Falls back to a synthetic label, since a stitched view spans filings.
    return self.xbrls.entity_info.get('document_type', 'Multi-Period Stitched')
|
||||
|
||||
def get_facts(self,
              max_periods: int = 8,
              standard: bool = True,
              statement_types: Optional[List[str]] = None) -> List[Dict[str, Any]]:
    """
    Extract facts from stitched statements.

    Results are cached keyed on (max_periods, standard, statement_types);
    calling again with identical arguments returns the cached list.
    Statement types that fail to stitch are skipped silently.

    Args:
        max_periods: Maximum periods to include
        standard: Whether to use standardized labels
        statement_types: List of statement types to include

    Returns:
        List of fact dictionaries with stitched/standardized data
    """
    # Cache key mirrors the exact arguments passed in.
    cache_key = (max_periods, standard, tuple(statement_types or []))
    if self._facts_cache and self._last_cache_key == cache_key:
        return self._facts_cache

    requested_types = statement_types or [
        'IncomeStatement', 'BalanceSheet', 'CashFlowStatement',
        'StatementOfEquity', 'ComprehensiveIncome'
    ]

    collected: List[Dict[str, Any]] = []
    for stmt_type in requested_types:
        try:
            # Stitching applies standardization across filings.
            stitched = self.xbrls.get_statement(
                statement_type=stmt_type,
                max_periods=max_periods,
                standard=standard
            )
            collected.extend(
                self._extract_facts_from_stitched_data(stitched, stmt_type)
            )
        except Exception:
            # Skip statements that can't be stitched
            continue

    # Cache results for subsequent identical queries.
    self._facts_cache = collected
    self._last_cache_key = cache_key

    return collected
|
||||
|
||||
def _extract_facts_from_stitched_data(self,
                                      stitched_data: Dict[str, Any],
                                      statement_type: str) -> List[Dict[str, Any]]:
    """
    Convert stitched statement data back to fact-like records for querying.

    One fact record is emitted per (line item, period) pair that has a
    non-None value; abstract headings without values are skipped.

    Args:
        stitched_data: Output from StatementStitcher
        statement_type: Type of statement

    Returns:
        List of fact dictionaries
    """
    facts = []
    periods = stitched_data.get('periods', [])
    statement_data = stitched_data.get('statement_data', [])

    for item in statement_data:
        # Skip abstract items without values
        if item.get('is_abstract', False) and not item.get('has_values', False):
            continue

        concept = item.get('concept', '')
        label = item.get('label', '')
        # Fall back to the (possibly standardized) label when no original is kept.
        original_label = item.get('original_label', label)

        # Create a fact record for each period with data
        for period_id, value in item.get('values', {}).items():
            if value is None:
                continue

            # Find period metadata (label, dates, type) for this period id
            period_info = self._get_period_info(period_id, periods)

            fact = {
                # Core identification
                'concept': concept,
                'label': label,  # Standardized label
                'original_label': original_label,  # Original company label
                'statement_type': statement_type,

                # Value information
                'value': value,
                'numeric_value': self._convert_to_numeric(value),
                'decimals': item.get('decimals', {}).get(period_id, 0),

                # Period information
                'period_key': period_id,
                'period_type': period_info.get('period_type', 'duration'),
                'period_start': period_info.get('period_start'),
                'period_end': period_info.get('period_end'),
                'period_instant': period_info.get('period_instant'),
                'period_label': period_info.get('period_label', ''),

                # Statement context
                'level': item.get('level', 0),
                'is_abstract': item.get('is_abstract', False),
                'is_total': item.get('is_total', False),

                # Multi-filing context
                'filing_count': len(self.xbrls.xbrl_list),
                'standardized': True,  # Mark as coming from standardized data

                # Source attribution (which XBRL filing this came from)
                'source_filing_index': self._determine_source_filing(period_id),
            }

            # Add fiscal period info if available
            fiscal_info = self._extract_fiscal_info(period_id)
            fact.update(fiscal_info)

            facts.append(fact)

    return facts
|
||||
|
||||
def _get_period_info(self, period_id: str, periods: List[tuple]) -> Dict[str, Any]:
|
||||
"""Extract period metadata from period_id and periods list."""
|
||||
period_info = {}
|
||||
|
||||
# Find matching period
|
||||
for pid, label in periods:
|
||||
if pid == period_id:
|
||||
period_info['period_label'] = label
|
||||
break
|
||||
|
||||
# Parse period_id to extract dates and type
|
||||
if period_id.startswith('instant_'):
|
||||
period_info['period_type'] = 'instant'
|
||||
date_str = period_id.replace('instant_', '')
|
||||
period_info['period_instant'] = date_str
|
||||
period_info['period_end'] = date_str
|
||||
elif period_id.startswith('duration_'):
|
||||
period_info['period_type'] = 'duration'
|
||||
parts = period_id.replace('duration_', '').split('_')
|
||||
if len(parts) >= 2:
|
||||
period_info['period_start'] = parts[0]
|
||||
period_info['period_end'] = parts[1]
|
||||
|
||||
return period_info
|
||||
|
||||
def _convert_to_numeric(self, value: Any) -> Optional[float]:
|
||||
"""Convert value to numeric if possible."""
|
||||
if value is None:
|
||||
return None
|
||||
try:
|
||||
if isinstance(value, (int, float)):
|
||||
return float(value)
|
||||
if isinstance(value, str):
|
||||
# Remove commas and try to convert
|
||||
cleaned = value.replace(',', '').replace('$', '').strip()
|
||||
return float(cleaned)
|
||||
except (ValueError, TypeError):
|
||||
pass
|
||||
return None
|
||||
|
||||
def _determine_source_filing(self, period_id: str) -> Optional[int]:
|
||||
"""Determine which filing this period came from."""
|
||||
# This would require enhanced tracking in the stitching process
|
||||
# For now, return None but this could be enhanced
|
||||
return None
|
||||
|
||||
def _extract_fiscal_info(self, period_id: str) -> Dict[str, Any]:
|
||||
"""Extract fiscal period and year information."""
|
||||
fiscal_info = {}
|
||||
|
||||
# Try to extract fiscal info from entity_info of the relevant XBRL
|
||||
# This is a simplified approach - could be enhanced with better tracking
|
||||
if self.xbrls.xbrl_list:
|
||||
entity_info = self.xbrls.xbrl_list[0].entity_info
|
||||
if entity_info:
|
||||
fiscal_info['fiscal_period'] = entity_info.get('fiscal_period')
|
||||
fiscal_info['fiscal_year'] = entity_info.get('fiscal_year')
|
||||
|
||||
return fiscal_info
|
||||
|
||||
def query(self, **kwargs) -> 'StitchedFactQuery':
    """Create a new query over this view's stitched facts.

    All keyword options (max_periods, standard, statement_types, ...) are
    forwarded to the StitchedFactQuery constructor.
    """
    fact_query = StitchedFactQuery(self, **kwargs)
    return fact_query
|
||||
|
||||
|
||||
class StitchedFactQuery(FactQuery):
    """
    Enhanced fact query for stitched/standardized multi-filing data.

    Extends the base FactQuery with capabilities specific to multi-period,
    standardized financial data.
    """

    def __init__(self, stitched_facts_view: StitchedFactsView, **kwargs):
        """
        Initialize a query over a StitchedFactsView.

        Args:
            stitched_facts_view: View over stitched facts from multiple filings
            **kwargs: Query options. Recognized keys:
                max_periods (int): Maximum periods to include (default 8)
                standard (bool): Whether to use standardized labels (default
                    True). 'standardize' is accepted as an alias because
                    XBRLS.query() passes that spelling.
                statement_types (list): Statement types to include (default None)
        """
        # Initialize with stitched facts view instead of regular facts view
        self._stitched_facts_view = stitched_facts_view

        # Initialize base FactQuery attributes manually since we're not calling super().__init__
        self._facts_view = stitched_facts_view  # For compatibility with base class
        self._filters = []
        self._transformations = []
        self._aggregations = []
        self._include_dimensions = True
        self._include_contexts = True
        self._include_element_info = True
        self._sort_by = None
        self._sort_ascending = True
        self._limit = None
        self._statement_type = None

        # Multi-filing specific options
        self._cross_period_only = False
        self._trend_analysis = False
        self._require_all_periods = False

        # Store query-specific parameters for get_facts.
        # BUG FIX: XBRLS.query() passes 'standardize' while this class
        # historically only read 'standard', so standardize=False was
        # silently ignored. Accept both spellings ('standard' wins).
        self._max_periods = kwargs.get('max_periods', 8)
        self._standard = kwargs.get('standard', kwargs.get('standardize', True))
        self._statement_types = kwargs.get('statement_types', None)

    def __str__(self):
        return f"StitchedFactQuery(filters={len(self._filters)})"

    # Enhanced filtering methods for multi-filing scenarios

    def by_standardized_concept(self, concept_name: str) -> 'StitchedFactQuery':
        """
        Filter by standardized concept name (e.g., 'Revenue', 'Net Income').

        Args:
            concept_name: Standardized concept name

        Returns:
            Self for method chaining
        """
        # Query both the standardized label and original concept:
        # exact label match, or case-insensitive substring of label/concept
        self._filters.append(
            lambda f: (f.get('label') == concept_name or
                      concept_name.lower() in f.get('label', '').lower() or
                      concept_name.lower() in f.get('concept', '').lower())
        )
        return self

    def by_original_label(self, pattern: str, exact: bool = False) -> 'StitchedFactQuery':
        """
        Filter by original company-specific labels before standardization.

        Args:
            pattern: Pattern to match against original labels
            exact: Whether to require exact match

        Returns:
            Self for method chaining
        """
        if exact:
            self._filters.append(lambda f: f.get('original_label') == pattern)
        else:
            regex = re.compile(pattern, re.IGNORECASE)
            self._filters.append(
                lambda f: f.get('original_label') and regex.search(f['original_label'])
            )
        return self

    def across_periods(self, min_periods: int = 2) -> 'StitchedFactQuery':
        """
        Filter to concepts that appear across multiple periods.

        Args:
            min_periods: Minimum number of periods the concept must appear in

        Returns:
            Self for method chaining
        """
        # Deferred: applied in execute() after standard filters run
        self._cross_period_only = True
        self._min_periods = min_periods
        return self

    def by_fiscal_period(self, fiscal_period: str) -> 'StitchedFactQuery':
        """
        Filter by fiscal period (FY, Q1, Q2, Q3, Q4).

        Args:
            fiscal_period: Fiscal period identifier

        Returns:
            Self for method chaining
        """
        self._filters.append(
            lambda f: f.get('fiscal_period') == fiscal_period
        )
        return self

    def by_filing_index(self, filing_index: int) -> 'StitchedFactQuery':
        """
        Filter facts by which filing they originated from.

        Args:
            filing_index: Index of the filing (0 = most recent)

        Returns:
            Self for method chaining
        """
        self._filters.append(
            lambda f: f.get('source_filing_index') == filing_index
        )
        return self

    def trend_analysis(self, concept: str) -> 'StitchedFactQuery':
        """
        Set up for trend analysis of a specific concept across periods.

        Args:
            concept: Concept to analyze trends for

        Returns:
            Self for method chaining
        """
        self._trend_analysis = True
        self.by_standardized_concept(concept)
        return self

    def complete_periods_only(self) -> 'StitchedFactQuery':
        """
        Only return concepts that have values in all available periods.

        Returns:
            Self for method chaining
        """
        self._require_all_periods = True
        return self

    def execute(self) -> List[Dict[str, Any]]:
        """
        Execute the query with enhanced multi-period processing.

        Pipeline: fetch stitched facts -> filters -> transformations ->
        aggregations -> cross-period/complete-period filtering -> trend
        preparation -> sort -> limit.

        Returns:
            List of fact dictionaries
        """
        # Get base results from stitched facts with query parameters
        results = self._stitched_facts_view.get_facts(
            max_periods=self._max_periods,
            standard=self._standard,
            statement_types=self._statement_types
        )

        # Apply standard filters
        for filter_func in self._filters:
            results = [f for f in results if filter_func(f)]

        # Apply transformations (in place, on non-null values only)
        for transform_fn in self._transformations:
            for fact in results:
                if 'value' in fact and fact['value'] is not None:
                    fact['value'] = transform_fn(fact['value'])

        # Apply aggregations: group by a dimension and reduce values
        if self._aggregations:
            aggregated_results = {}
            for agg in self._aggregations:
                dimension = agg['dimension']
                func = agg['function']

                # Group facts by dimension
                groups = {}
                for fact in results:
                    dim_value = fact.get(f'dim_{dimension}')
                    if dim_value and 'value' in fact and fact['value'] is not None:
                        if dim_value not in groups:
                            groups[dim_value] = []
                        groups[dim_value].append(fact['value'])

                # Apply aggregation function
                for dim_value, values in groups.items():
                    agg_value = 0.0  # Initialize with default value
                    if func == 'sum':
                        agg_value = sum(values)
                    elif func == 'average':
                        agg_value = sum(values) / len(values)

                    key = (dimension, dim_value)
                    if key not in aggregated_results:
                        aggregated_results[key] = {'dimension': dimension, 'value': dim_value, 'values': {}}
                    aggregated_results[key]['values'][func] = agg_value

            results = list(aggregated_results.values())

        # Apply cross-period filtering if requested
        if self._cross_period_only:
            results = self._filter_cross_period_concepts(results)

        # Apply complete periods filtering if requested
        if self._require_all_periods:
            results = self._filter_complete_periods(results)

        # Apply trend analysis if requested
        if self._trend_analysis:
            results = self._prepare_trend_data(results)

        # Apply sorting if specified (only when the key exists in the data)
        if results and self._sort_by and self._sort_by in results[0]:
            results.sort(key=lambda f: f.get(self._sort_by, ''),
                        reverse=not self._sort_ascending)

        # Apply limit if specified
        if self._limit is not None:
            results = results[:self._limit]

        return results

    def _filter_cross_period_concepts(self, results: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Filter to concepts that appear in at least self._min_periods periods."""
        concept_periods = defaultdict(set)
        for fact in results:
            concept_key = (fact.get('concept', ''), fact.get('label', ''))
            concept_periods[concept_key].add(fact.get('period_key', ''))

        # Filter to concepts with minimum period count
        valid_concepts = {
            concept for concept, periods in concept_periods.items()
            if len(periods) >= getattr(self, '_min_periods', 2)
        }

        return [
            fact for fact in results
            if (fact.get('concept', ''), fact.get('label', '')) in valid_concepts
        ]

    def _filter_complete_periods(self, results: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Filter to concepts that have values in every period seen in results."""
        # Get all available periods
        all_periods = set(fact.get('period_key', '') for fact in results)

        concept_periods = defaultdict(set)
        for fact in results:
            concept_key = (fact.get('concept', ''), fact.get('label', ''))
            concept_periods[concept_key].add(fact.get('period_key', ''))

        # Filter to concepts with complete period coverage
        complete_concepts = {
            concept for concept, periods in concept_periods.items()
            if periods == all_periods
        }

        return [
            fact for fact in results
            if (fact.get('concept', ''), fact.get('label', '')) in complete_concepts
        ]

    def _prepare_trend_data(self, results: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Prepare data for trend analysis by sorting periods chronologically."""
        # 'or ""' also guards against an explicit None period_end, which would
        # otherwise make the str comparison in sorted() raise TypeError
        return sorted(results, key=lambda f: f.get('period_end') or '')

    def to_trend_dataframe(self) -> pd.DataFrame:
        """
        Create a DataFrame optimized for trend analysis.

        Returns:
            DataFrame with (label, concept) as index and periods as columns;
            falls back to a flat DataFrame when the pivot columns are missing.
        """
        results = self.execute()

        if not results:
            return pd.DataFrame()

        # Pivot data for trend analysis
        df = pd.DataFrame(results)

        # Create pivot table with concepts as rows and periods as columns
        if 'concept' in df.columns and 'period_end' in df.columns and 'numeric_value' in df.columns:
            trend_df = df.pivot_table(
                index=['label', 'concept'],
                columns='period_end',
                values='numeric_value',
                aggfunc='first'
            )
            return trend_df

        return df

    def to_dataframe(self, *columns) -> pd.DataFrame:
        """
        Execute the query and return results as a DataFrame.

        Args:
            columns: Optional column names to restrict the DataFrame to

        Returns:
            pandas DataFrame with query results
        """
        results = self.execute()

        if not results:
            return pd.DataFrame()

        df = pd.DataFrame(results)
        # Ensure value is a string for display; guard because aggregated
        # results are reshaped and may not carry a raw 'value' column
        if 'value' in df.columns:
            df['value'] = df['value'].astype(str)

        # Filter columns based on inclusion flags
        if not self._include_dimensions:
            df = df.loc[:, [col for col in df.columns if not col.startswith('dim_')]]

        if not self._include_contexts:
            context_cols = ['context_ref', 'entity_identifier', 'entity_scheme',
                           'period_type']
            df = df.loc[:, [col for col in df.columns if col not in context_cols]]

        if not self._include_element_info:
            element_cols = ['element_id', 'element_name', 'element_type', 'element_period_type',
                           'element_balance', 'element_label']
            df = df.loc[:, [col for col in df.columns if col not in element_cols]]

        # Drop empty columns
        df = df.dropna(axis=1, how='all')

        # Filter columns if specified
        if columns:
            df = df[list(columns)]

        # Skip these columns
        skip_columns = ['fact_key', 'period_key']

        # Order columns: well-known columns first, then everything else
        first_columns = [col for col in
                         ['concept', 'label', 'original_label', 'value', 'numeric_value',
                          'period_start', 'period_end', 'decimals', 'statement_type', 'fiscal_period']
                         if col in df.columns]
        ordered_columns = first_columns + [col for col in df.columns
                                           if col not in first_columns
                                           and col not in skip_columns]

        return df[ordered_columns]

    def __rich__(self):
        """Rich console rendering: description panel plus a preview table."""
        title = Text.assemble(("Stitched Facts Query"),
                              )
        subtitle = Text.assemble((self._stitched_facts_view.entity_name, "bold deep_sky_blue1"),
                                 " - ",
                                 (self._stitched_facts_view.document_type)
                                 )
        df = self.to_dataframe().fillna('')
        columns = df.columns.tolist()
        description = Markdown(
            f"""
Use *to_dataframe(columns)* to get a DataFrame of the results.

e.g. `query.to_dataframe('concept', 'value', 'period_end')`

Available columns:
'{', '.join(columns)}'

**Enhanced Multi-Period Methods:**
- `across_periods(min_periods=2)` - Filter to concepts across multiple periods
- `by_standardized_concept('Revenue')` - Filter by standardized labels
- `by_original_label('Net sales')` - Filter by original company labels
- `trend_analysis('Revenue')` - Set up trend analysis
- `to_trend_dataframe()` - Get trend-optimized DataFrame
"""
        )

        display_columns = [col for col in ['label', 'concept', 'value', 'period_start', 'period_end', 'statement_type']
                           if col in columns]

        if not df.empty:
            df_display = df[display_columns].head(10)  # Show first 10 rows
            table = Table(*display_columns, show_header=True, header_style="bold", box=box.SIMPLE)
            for t in df_display.itertuples(index=False):
                row = []
                for i in t:
                    row.append(str(i)[:50])  # Truncate long values
                table.add_row(*row)
        else:
            table = Table("No results found", box=box.SIMPLE)

        panel = Panel(Group(description, table), title=title, subtitle=subtitle, box=box.ROUNDED)
        return panel

    def __repr__(self):
        return repr_rich(self.__rich__())
|
||||
106
venv/lib/python3.10/site-packages/edgar/xbrl/stitching/utils.py
Normal file
106
venv/lib/python3.10/site-packages/edgar/xbrl/stitching/utils.py
Normal file
@@ -0,0 +1,106 @@
|
||||
"""
|
||||
XBRL Statement Stitching - Utility Functions
|
||||
|
||||
This module contains utility functions for rendering and converting stitched
|
||||
statement data.
|
||||
"""
|
||||
|
||||
from typing import Any, Dict, Optional
|
||||
|
||||
import pandas as pd
|
||||
|
||||
|
||||
def render_stitched_statement(
    stitched_data: Dict[str, Any],
    statement_title: str,
    statement_type: str,
    entity_info: Optional[Dict[str, Any]] = None,
    show_date_range: bool = False,
    xbrl_instance: Optional[Any] = None
):
    """
    Render a stitched statement using the same rendering logic as individual statements.

    Args:
        stitched_data: Stitched statement data with 'periods' and 'statement_data' keys
        statement_title: Title of the statement
        statement_type: Type of statement ('BalanceSheet', 'IncomeStatement', etc.)
        entity_info: Entity information (optional)
        show_date_range: Whether to show full date ranges for duration periods
        xbrl_instance: Optional XBRL instance forwarded to the renderer for context

    Returns:
        RichTable: A formatted table representation of the stitched statement
    """
    from edgar.xbrl.rendering import render_statement

    # Extract periods and statement data
    periods_to_display = stitched_data['periods']
    statement_data = stitched_data['statement_data']

    # Apply special title formatting for stitched statements:
    # for multiple periods, annotate the title to indicate the trend view
    if len(periods_to_display) > 1:
        period_desc = f" ({len(periods_to_display)}-Period View)"
        statement_title = f"{statement_title}{period_desc}"

    # Use the existing rendering function with the show_date_range parameter
    return render_statement(
        statement_data=statement_data,
        periods_to_display=periods_to_display,
        statement_title=statement_title,
        statement_type=statement_type,
        entity_info=entity_info,
        show_date_range=show_date_range,
        xbrl_instance=xbrl_instance
    )
|
||||
|
||||
|
||||
def to_pandas(stitched_data: Dict[str, Any]) -> pd.DataFrame:
    """
    Convert stitched statement data to a pandas DataFrame.

    Args:
        stitched_data: Stitched statement data with 'periods' (ordered list of
            (period_id, label) tuples, newest first) and 'statement_data'
            (list of line-item dicts)

    Returns:
        DataFrame with one row per concept and columns
        ['label', 'concept', <one column per period end date YYYY-MM-DD>]
    """
    statement_data = stitched_data['statement_data']

    # Create ordered list of period column names (preserving the original
    # ordering, newest first). Period ids end with the period end date, so
    # the last 10 characters are always YYYY-MM-DD.
    period_columns = [period_id[-10:] for period_id, _period_label in stitched_data['periods']]

    # Build the column data: metadata columns first, then period columns
    data: Dict[str, list] = {'label': [], 'concept': []}
    for col in period_columns:
        data[col] = []

    for item in statement_data:
        # Skip abstract items without values. Use .get() for robustness
        # against partially-populated items, consistent with how the
        # stitched-facts extraction elsewhere in this package reads them.
        if item.get('is_abstract', False) and not item.get('has_values', False):
            continue

        data['label'].append(item.get('label', ''))
        data['concept'].append(item.get('concept', ''))

        # Add values for each period in the correct order
        values = item.get('values', {})
        for period_id, _period_label in stitched_data['periods']:
            data[period_id[-10:]].append(values.get(period_id))

    # Create the DataFrame with columns in the correct order
    column_order = ['label', 'concept'] + period_columns
    return pd.DataFrame(data, columns=column_order)
|
||||
340
venv/lib/python3.10/site-packages/edgar/xbrl/stitching/xbrls.py
Normal file
340
venv/lib/python3.10/site-packages/edgar/xbrl/stitching/xbrls.py
Normal file
@@ -0,0 +1,340 @@
|
||||
"""
|
||||
XBRL Statement Stitching - XBRLS Class
|
||||
|
||||
This module contains the XBRLS class which represents multiple XBRL filings
|
||||
stitched together for multi-period analysis.
|
||||
"""
|
||||
|
||||
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from edgar.xbrl.stitching.core import StatementStitcher, stitch_statements
|
||||
from edgar.xbrl.stitching.query import StitchedFactQuery, StitchedFactsView
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from edgar._filings import Filings
|
||||
from edgar.xbrl.statements import StitchedStatements
|
||||
|
||||
|
||||
class XBRLS:
    """
    A class representing multiple XBRL filings stitched together.

    This provides a unified view of financial data across multiple time periods,
    automatically handling the complexities of statement stitching.
    """

    def __init__(self, xbrl_list: List[Any]):
        """
        Initialize an XBRLS instance with a list of XBRL objects.

        Args:
            xbrl_list: List of XBRL objects, should be from the same company
                      and ordered from newest to oldest
        """
        # Store the list of XBRL objects
        self.xbrl_list = xbrl_list

        # Extract entity info from the most recent XBRL
        self.entity_info = xbrl_list[0].entity_info if xbrl_list else {}

        # Cache for stitched statements, keyed by statement parameters
        self._statement_cache = {}

        # Lazily-built cache for the stitched facts view
        self._stitched_facts_view = None

    @classmethod
    def from_filings(cls, filings: Union['Filings', List[Any]], filter_amendments: bool = True) -> 'XBRLS':
        """
        Create an XBRLS object from a list of Filing objects or a Filings object containing multiple filings.
        Each filing should be the same form (e.g., 10-K, 10-Q) and from the same company.

        Args:
            filings: List of Filing objects or a Filings collection, should be
                from the same company
            filter_amendments: Whether to drop amended filings (only applied
                when the input supports .filter(); a plain list is used as-is)

        Returns:
            XBRLS object with stitched data
        """
        from edgar.xbrl.xbrl import XBRL

        # A plain list has no .filter() method, so only filter when the
        # input actually supports it (the annotated Union allows both)
        if filter_amendments and hasattr(filings, 'filter'):
            filtered_filings = filings.filter(amendments=False)
        else:
            filtered_filings = filings

        # Sort filings by date (newest first)
        sorted_filings = sorted(filtered_filings, key=lambda f: f.filing_date, reverse=True)

        # Create XBRL objects from filings; best-effort — a filing whose
        # XBRL cannot be parsed is skipped rather than failing the batch
        xbrl_list = []
        for filing in sorted_filings:
            try:
                xbrl = XBRL.from_filing(filing)
                xbrl_list.append(xbrl)
            except Exception:
                pass

        return cls(xbrl_list)

    @classmethod
    def from_xbrl_objects(cls, xbrl_list: List[Any]) -> 'XBRLS':
        """
        Create an XBRLS object from a list of XBRL objects.

        Args:
            xbrl_list: List of XBRL objects, should be from the same company

        Returns:
            XBRLS object with stitched data
        """
        return cls(xbrl_list)

    @property
    def statements(self) -> 'StitchedStatements':
        """
        Get a user-friendly interface to access stitched financial statements.

        Returns:
            StitchedStatements object
        """
        from edgar.xbrl.statements import StitchedStatements
        return StitchedStatements(self)

    @property
    def facts(self) -> StitchedFactsView:
        """
        Get a view over stitched facts from all XBRL filings.

        Returns:
            StitchedFactsView for querying standardized, multi-period data
        """
        if self._stitched_facts_view is None:
            self._stitched_facts_view = StitchedFactsView(self)
        return self._stitched_facts_view

    def query(self,
              max_periods: int = 8,
              standardize: bool = True,
              statement_types: Optional[List[str]] = None,
              **kwargs) -> StitchedFactQuery:
        """
        Start a new query for stitched facts across all filings.

        Args:
            max_periods: Maximum periods to include in stitched data
            standardize: Whether to use standardized labels
            statement_types: List of statement types to include
            **kwargs: Additional options passed to StitchedFactQuery

        Returns:
            StitchedFactQuery for building complex queries
        """
        # Pass query parameters to the StitchedFactQuery.
        # BUG FIX: StitchedFactQuery reads the 'standard' key, so pass that
        # spelling; 'standardize' is kept as well for any consumer that
        # expects the historical key name.
        kwargs.update({
            'max_periods': max_periods,
            'standard': standardize,
            'standardize': standardize,
            'statement_types': statement_types
        })
        return self.facts.query(**kwargs)

    def get_statement(self, statement_type: str,
                      max_periods: int = 8,
                      standard: bool = True,
                      use_optimal_periods: bool = True,
                      include_dimensions: bool = False) -> Dict[str, Any]:
        """
        Get a stitched statement of the specified type.

        Args:
            statement_type: Type of statement to stitch ('IncomeStatement', 'BalanceSheet', etc.)
            max_periods: Maximum number of periods to include
            standard: Whether to use standardized concept labels
            use_optimal_periods: Whether to use entity info to determine optimal periods
            include_dimensions: Whether to include dimensional segment data (default: False for stitching)

        Returns:
            Dictionary with stitched statement data
        """
        # Check cache first; the key captures every parameter affecting output
        cache_key = f"{statement_type}_{max_periods}_{standard}_{use_optimal_periods}_{include_dimensions}"
        if cache_key in self._statement_cache:
            return self._statement_cache[cache_key]

        # Stitch the statement
        result = stitch_statements(
            self.xbrl_list,
            statement_type=statement_type,
            period_type=StatementStitcher.PeriodType.ALL_PERIODS,
            max_periods=max_periods,
            standard=standard,
            use_optimal_periods=use_optimal_periods,
            include_dimensions=include_dimensions
        )

        # Cache the result
        self._statement_cache[cache_key] = result

        return result

    def render_statement(self, statement_type: str,
                         max_periods: int = 8,
                         standardize: bool = True,
                         use_optimal_periods: bool = True,
                         show_date_range: bool = False,
                         include_dimensions: bool = False):
        """
        Render a stitched statement in a rich table format.

        Args:
            statement_type: Type of statement to render ('BalanceSheet', 'IncomeStatement', etc.)
            max_periods: Maximum number of periods to include
            standardize: Whether to use standardized concept labels
            use_optimal_periods: Whether to use entity info to determine optimal periods
            show_date_range: Whether to show full date ranges for duration periods
            include_dimensions: Whether to include dimensional segment data (default: False for stitching)

        Returns:
            RichTable: A formatted table representation of the stitched statement
        """
        # Create a StitchedStatement object and use its render method
        from edgar.xbrl.statements import StitchedStatement
        statement = StitchedStatement(self, statement_type, max_periods, standardize, use_optimal_periods, include_dimensions)
        return statement.render(show_date_range=show_date_range)

    def to_dataframe(self, statement_type: str,
                     max_periods: int = 8,
                     standardize: bool = True) -> pd.DataFrame:
        """
        Convert a stitched statement to a pandas DataFrame.

        Args:
            statement_type: Type of statement to convert ('BalanceSheet', 'IncomeStatement', etc.)
            max_periods: Maximum number of periods to include
            standardize: Whether to use standardized concept labels

        Returns:
            DataFrame with periods as columns and concepts as index
        """
        # Create a StitchedStatement object and use its to_dataframe method
        from edgar.xbrl.statements import StitchedStatement
        statement = StitchedStatement(self, statement_type, max_periods, standardize)
        return statement.to_dataframe()

    def get_periods(self) -> List[Dict[str, str]]:
        """
        Get all available periods across all XBRL objects.

        Returns:
            List of period information dictionaries, each containing:
            - 'type': 'instant' or 'duration'
            - 'key': period key (e.g., 'instant_2024-09-28', 'duration_2024-01-01_2024-09-28')
            - 'label': human-readable label
            For instant periods:
            - 'date': end date as 'YYYY-MM-DD'
            For duration periods:
            - 'start_date': start date as 'YYYY-MM-DD'
            - 'end_date': end date as 'YYYY-MM-DD'
            - 'days': duration in days
            - 'period_type': classification ('Annual', 'Quarterly', etc.)
        """
        all_periods = []

        # Go through all XBRL objects to collect periods
        for xbrl in self.xbrl_list:
            all_periods.extend(xbrl.reporting_periods)

        # De-duplicate periods that describe the same dates; the first
        # occurrence (from the most recent filing) wins
        unique_periods = {}
        for period in all_periods:
            # Use the date string(s) as the unique key
            key = period['date'] if period['type'] == 'instant' else f"{period['start_date']}_{period['end_date']}"
            if key not in unique_periods:
                unique_periods[key] = period

        return list(unique_periods.values())

    def get_period_end_dates(self) -> List[str]:
        """
        Get end dates for all available periods in YYYY-MM-DD format.

        This is a convenience method that extracts just the end dates from periods,
        handling both instant and duration periods correctly.

        Returns:
            List of end dates as strings in YYYY-MM-DD format, sorted newest first
        """
        periods = self.get_periods()
        end_dates = []

        for period in periods:
            if period.get('type') == 'duration':
                end_date = period.get('end_date')
            elif period.get('type') == 'instant':
                end_date = period.get('date')
            else:
                continue

            if end_date:
                end_dates.append(end_date)

        # De-duplicate and sort newest first (set() already removed
        # duplicates, so no extra seen-tracking is needed)
        return sorted(set(end_dates), reverse=True)

    def __str__(self) -> str:
        """
        String representation of the XBRLS object.

        Returns:
            String representation
        """
        filing_count = len(self.xbrl_list)
        periods = self.get_periods()
        return f"XBRLS with {filing_count} filings covering {len(periods)} unique periods"

    def __rich__(self):
        """
        Rich representation for pretty console output.

        Returns:
            Rich console representation
        """
        from rich.panel import Panel
        from rich.text import Text

        # Get information about the XBRLS object
        filing_count = len(self.xbrl_list)
        periods = self.get_periods()

        # Create a panel with the information
        content = Text.from_markup("[bold]XBRLS Object[/bold]\n")
        content.append(f"Filings: {filing_count}\n")
        content.append(f"Unique Periods: {len(periods)}\n")

        # List available statement types
        statement_types = set()
        for xbrl in self.xbrl_list:
            statements = xbrl.get_all_statements()
            for stmt in statements:
                if stmt['type']:
                    statement_types.add(stmt['type'])

        content.append("\n[bold]Available Statement Types:[/bold]\n")
        for stmt_type in sorted(statement_types):
            content.append(f"- {stmt_type}\n")

        # Show how to access statements
        content.append("\n[bold]Example Usage:[/bold]\n")
        content.append("xbrls.statements.income_statement()\n")
        content.append("xbrls.statements.balance_sheet()\n")
        content.append("xbrls.to_dataframe('IncomeStatement')\n")

        return Panel(content, title="XBRLS", expand=False)
|
||||
Reference in New Issue
Block a user