Files
edgartools/venv/lib/python3.10/site-packages/edgar/xbrl/facts.py
2025-12-09 12:13:01 +01:00

1514 lines
55 KiB
Python

"""
Facts module for querying XBRL facts.
This module provides a powerful interface for querying XBRL facts based on various
attributes including concept, value, dimension, dates, statement, and more.
It enables convenient retrieval of facts as pandas DataFrames for analysis.
"""
from __future__ import annotations
import re
from decimal import Decimal
from functools import lru_cache
from textwrap import dedent
from typing import Any, Callable, Dict, List, Optional, Set, Union
import pandas as pd
from rich import box
from rich.console import Group
from rich.markdown import Markdown
from rich.panel import Panel
from rich.table import Column, Table
from rich.text import Text
from edgar.richtools import repr_rich
from edgar.xbrl.core import STANDARD_LABEL, parse_date
from edgar.xbrl.models import select_display_label
class FactQuery:
"""
A query builder for XBRL facts that enables filtering by various attributes.
This class provides a fluent interface for building queries against XBRL facts,
allowing filtering by concept, value, period, dimensions, and other attributes.
"""
def __init__(self, facts_view: FactsView):
"""
Initialize a new fact query.
Args:
facts_view: The FactsView instance to query against
"""
self._facts_view = facts_view
self._filters = []
self._transformations = []
self._aggregations = []
self._include_dimensions = True
self._include_contexts = True
self._include_element_info = True
self._sort_by = None
self._sort_ascending = True
self._limit = None
self._statement_type = None
def by_concept(self, pattern: str, exact: bool = False) -> FactQuery:
    """
    Restrict the query to facts whose concept name matches.

    Every underscore in the pattern is replaced by a colon before matching,
    so "us-gaap_Revenues" is treated as "us-gaap:Revenues".

    Args:
        pattern: Concept name, or a regular expression when exact is False
        exact: If True, require equality; otherwise do a case-insensitive regex search

    Returns:
        Self for method chaining
    """
    normalized = pattern.replace('_', ':')

    if exact:
        def concept_filter(f):
            return f['concept'] == normalized
    else:
        matcher = re.compile(normalized, re.IGNORECASE)

        def concept_filter(f):
            return bool(matcher.search(f['concept']))

    self._filters.append(concept_filter)
    return self
def by_label(self, pattern: str, exact: bool = False) -> FactQuery:
    """
    Restrict the query to facts whose label matches.

    Three label fields are checked in turn - 'label', 'element_label' and
    'original_label' - so a fact can be found by either its standardized
    label or the label it originally carried. A fact matches as soon as any
    one of them matches.

    Args:
        pattern: Label text, or a regular expression when exact is False
        exact: If True, require equality; otherwise do a case-insensitive regex search

    Returns:
        Self for method chaining
    """
    label_fields = ('label', 'element_label', 'original_label')

    if exact:
        def label_filter(f):
            return any(field in f and f[field] == pattern for field in label_fields)
    else:
        matcher = re.compile(pattern, re.IGNORECASE)

        def label_filter(f):
            # None labels are skipped; anything else is stringified before searching
            return any(
                field in f and f[field] is not None and bool(matcher.search(str(f[field])))
                for field in label_fields
            )

    self._filters.append(label_filter)
    return self
def by_value(self, value_filter: Union[Callable, str, int, float, list, tuple]) -> FactQuery:
    """
    Restrict the query by each fact's 'numeric_value'.

    Facts without a 'numeric_value', or whose 'numeric_value' is None, never match.

    Args:
        value_filter: One of:
            - A callable predicate that receives the numeric value
            - A two-element list or tuple (min, max) for an inclusive range
            - Any other value, compared for equality

    Returns:
        Self for method chaining
    """
    if callable(value_filter):
        accepts = value_filter
    elif isinstance(value_filter, (list, tuple)) and len(value_filter) == 2:
        lower, upper = value_filter

        def accepts(number):
            return lower <= number <= upper
    else:
        def accepts(number):
            return number == value_filter

    def numeric_filter(f):
        return ('numeric_value' in f and
                f['numeric_value'] is not None and
                accepts(f['numeric_value']))

    self._filters.append(numeric_filter)
    return self
def by_period_type(self, period_type: str) -> FactQuery:
    """
    Restrict the query to facts of one period type.

    Args:
        period_type: Period type to keep ('instant' or 'duration')

    Returns:
        Self for method chaining
    """
    self._filters.append(
        lambda f: 'period_type' in f and f['period_type'] == period_type
    )
    return self
def by_period_key(self, period_key: str) -> FactQuery:
    """
    Restrict the query to facts carrying one specific period key.

    Args:
        period_key: Period key to keep (e.g., "instant_2023-12-31")

    Returns:
        Self for method chaining
    """
    def period_key_filter(f):
        return 'period_key' in f and f['period_key'] == period_key

    self._filters.append(period_key_filter)
    return self
def by_period_keys(self, period_keys: List[str]) -> FactQuery:
    """
    Filter facts by a list of period keys.

    The keys are snapshotted when this method is called, so the filter is
    unaffected if the caller later mutates the list, and a one-shot iterable
    (e.g. a generator) is not exhausted after the first fact is tested.

    Args:
        period_keys: Period keys to keep; a fact matches if its 'period_key'
            is any one of them

    Returns:
        Self for method chaining
    """
    snapshot = tuple(period_keys)
    try:
        # Hash-based membership instead of a linear scan for every fact
        wanted = frozenset(snapshot)
    except TypeError:
        # Unhashable entries: keep the original sequence-membership semantics
        wanted = snapshot

    def period_keys_filter(f):
        return 'period_key' in f and f['period_key'] in wanted

    self._filters.append(period_keys_filter)
    return self
def by_instant_date(self, date_str: str, exact: bool = True) -> FactQuery:
    """
    Restrict the query by the instant date of each fact.

    Facts without a 'period_instant' key never match.

    Args:
        date_str: Date string in YYYY-MM-DD format
        exact: If True, the stored date string must equal date_str; if False,
            keep facts whose parsed instant date is on or before date_str

    Returns:
        Self for method chaining
    """
    if exact:
        def instant_filter(f):
            return 'period_instant' in f and f['period_instant'] == date_str
    else:
        # Parse the cutoff once, up front, rather than per fact
        cutoff = parse_date(date_str)

        def instant_filter(f):
            return 'period_instant' in f and parse_date(f['period_instant']) <= cutoff

    self._filters.append(instant_filter)
    return self
def by_date_range(self, start_date: Optional[str] = None,
                  end_date: Optional[str] = None) -> FactQuery:
    """
    Restrict the query to duration facts lying inside a date range.

    With both bounds, a fact must have 'period_start' and 'period_end' and
    satisfy both. With one bound, only the corresponding field is required
    and checked. With neither bound, no filter is added.

    Args:
        start_date: Optional start date string in YYYY-MM-DD format
        end_date: Optional end date string in YYYY-MM-DD format

    Returns:
        Self for method chaining
    """
    if start_date and end_date:
        lower, upper = parse_date(start_date), parse_date(end_date)

        def range_filter(f):
            return ('period_start' in f and 'period_end' in f and
                    parse_date(f['period_start']) >= lower and
                    parse_date(f['period_end']) <= upper)
    elif start_date:
        lower = parse_date(start_date)

        def range_filter(f):
            # Period begins on or after the start date
            return 'period_start' in f and parse_date(f['period_start']) >= lower
    elif end_date:
        upper = parse_date(end_date)

        def range_filter(f):
            # Period finishes on or before the end date
            return 'period_end' in f and parse_date(f['period_end']) <= upper
    else:
        return self

    self._filters.append(range_filter)
    return self
def by_dimension(self, dimension: Optional[str], value: Optional[str] = None) -> FactQuery:
"""
Filter facts by dimension with flexible matching.
This method provides intelligent matching for dimension names and values, handling
common XBRL formatting variations including:
- Namespace prefixes (us-gaap:, srt:, etc.)
- Underscore vs colon separators
- Partial dimension names
Args:
dimension: Dimension name (supports multiple formats), or None to filter for facts with no dimensions
value: Optional dimension value to filter by (supports multiple formats)
Returns:
Self for method chaining
Examples:
# These are all equivalent:
.by_dimension("srt_ProductOrServiceAxis", "us-gaap:ServiceMember")
.by_dimension("srt:ProductOrServiceAxis", "us-gaap_ServiceMember")
.by_dimension("ProductOrServiceAxis", "ServiceMember")
"""
if dimension is None:
# Filter for facts with no dimensions
self._filters.append(lambda f: not any(key.startswith('dim_') for key in f.keys()))
return self
# Normalize the input dimension to match stored format
normalized_dim = self._normalize_dimension_key(dimension)
if value is not None:
# Normalize the value as well
normalized_value = self._normalize_dimension_value(value)
def dimension_filter_with_value(f):
# Try exact match first
if f'dim_{normalized_dim}' in f and f[f'dim_{normalized_dim}'] == normalized_value:
return True
# Try flexible matching for dimensions
for dim_key, dim_value in f.items():
if not dim_key.startswith('dim_'):
continue
# Check if this dimension key matches (flexible)
if self._dimension_key_matches(dim_key, dimension):
# Check if the value matches (flexible)
if self._dimension_value_matches(dim_value, value):
return True
return False
self._filters.append(dimension_filter_with_value)
else:
# Filter for facts that have this dimension (any value)
def dimension_filter_exists(f):
# Try exact match first
if f'dim_{normalized_dim}' in f:
return True
# Try flexible matching
for dim_key in f.keys():
if dim_key.startswith('dim_') and self._dimension_key_matches(dim_key, dimension):
return True
return False
self._filters.append(dimension_filter_exists)
return self
def _normalize_dimension_key(self, dimension: str) -> str:
"""Normalize dimension key to the format used internally (underscores)."""
# Replace colons with underscores (us-gaap:Axis -> us-gaap_Axis)
return dimension.replace(':', '_')
def _normalize_dimension_value(self, value: str) -> str:
"""Normalize dimension value to the format used internally."""
# Replace underscores with colons for values (us-gaap_Member -> us-gaap:Member)
return value.replace('_', ':')
def _dimension_key_matches(self, stored_key: str, query_key: str) -> bool:
"""
Check if a stored dimension key matches a query key with flexible matching.
Args:
stored_key: The dimension key as stored (e.g., 'dim_us-gaap_ProductAxis')
query_key: The dimension key from the query (e.g., 'ProductAxis' or 'us-gaap:ProductAxis')
Returns:
True if the keys match
"""
# Remove 'dim_' prefix from stored key
stored_clean = stored_key[4:] if stored_key.startswith('dim_') else stored_key
# Normalize both keys
stored_normalized = stored_clean.replace(':', '_').replace('-', '_')
query_normalized = query_key.replace(':', '_').replace('-', '_')
# Try exact match
if stored_normalized == query_normalized:
return True
# Try partial match (query might be just the local name without namespace)
if '_' in stored_normalized:
# Extract local name (part after last underscore)
stored_local = stored_normalized.split('_')[-1]
query_local = query_normalized.split('_')[-1]
if stored_local == query_local:
return True
return False
def _dimension_value_matches(self, stored_value: str, query_value: str) -> bool:
"""
Check if a stored dimension value matches a query value with flexible matching.
Args:
stored_value: The dimension value as stored (e.g., 'us-gaap:ServiceMember')
query_value: The dimension value from query (e.g., 'ServiceMember' or 'us-gaap_ServiceMember')
Returns:
True if the values match
"""
if not stored_value or not query_value:
return stored_value == query_value
# Normalize both values (handle colon/underscore variations)
stored_normalized = stored_value.replace('_', ':').replace('-', '_')
query_normalized = query_value.replace('_', ':').replace('-', '_')
# Try exact match
if stored_normalized == query_normalized:
return True
# Try partial match (query might be just the local name without namespace)
if ':' in stored_normalized:
stored_local = stored_normalized.split(':')[-1]
query_local = query_normalized.split(':')[-1] if ':' in query_normalized else query_normalized
if stored_local == query_local:
return True
return False
def by_statement_type(self, statement_type: str) -> FactQuery:
    """
    Restrict the query to facts assigned to one statement type.

    Args:
        statement_type: Statement type ('BalanceSheet', 'IncomeStatement', etc.)

    Returns:
        Self for method chaining
    """
    def statement_type_filter(f):
        return 'statement_type' in f and f['statement_type'] == statement_type

    self._filters.append(statement_type_filter)
    return self
def by_fiscal_period(self, fiscal_period: str) -> FactQuery:
    """
    Restrict the query to one fiscal period (FY, Q1, Q2, Q3, Q4).

    Args:
        fiscal_period: Fiscal period identifier to keep

    Returns:
        Self for method chaining
    """
    def fiscal_period_filter(f):
        return 'fiscal_period' in f and f['fiscal_period'] == fiscal_period

    self._filters.append(fiscal_period_filter)
    return self
def by_fiscal_year(self, fiscal_year: Union[int, str]) -> FactQuery:
    """
    Restrict the query to one fiscal year.

    Both sides are compared as strings, so 2023 and "2023" are interchangeable.

    Args:
        fiscal_year: Fiscal year to keep

    Returns:
        Self for method chaining
    """
    wanted_year = str(fiscal_year)

    def fiscal_year_filter(f):
        return 'fiscal_year' in f and str(f['fiscal_year']) == wanted_year

    self._filters.append(fiscal_year_filter)
    return self
def by_unit(self, unit: str) -> FactQuery:
    """
    Restrict the query to facts reported with a given unit reference.

    Args:
        unit: Unit reference that the fact's 'unit_ref' must equal

    Returns:
        Self for method chaining
    """
    def unit_filter(f):
        return 'unit_ref' in f and f['unit_ref'] == unit

    self._filters.append(unit_filter)
    return self
def by_custom(self, filter_func: Callable) -> FactQuery:
    """
    Register a caller-supplied predicate as a filter.

    Args:
        filter_func: Function that receives a fact dict and returns a truthy
            value for facts that should be kept

    Returns:
        Self for method chaining
    """
    filters = self._filters
    filters.append(filter_func)
    return self
def by_text(self, pattern: str) -> FactQuery:
    """
    Free-text search over the textual fields of each fact.

    The pattern is searched, case-insensitively, in 'concept', 'label',
    'element_label', 'element_name' and 'original_label'. A fact matches as
    soon as any one of these fields contains the pattern; fields that are
    missing or None are skipped.

    Args:
        pattern: Regular expression to look for

    Returns:
        Self for method chaining
    """
    matcher = re.compile(pattern, re.IGNORECASE)
    text_fields = ('concept', 'label', 'element_label', 'element_name', 'original_label')

    def text_filter(f):
        return any(
            field in f and f[field] is not None and matcher.search(str(f[field]))
            for field in text_fields
        )

    self._filters.append(text_filter)
    return self
def exclude_dimensions(self) -> FactQuery:
    """
    Turn off dimension columns in DataFrame output.

    Returns:
        Self for method chaining
    """
    # Read by to_dataframe(), which then drops every column starting with 'dim_'
    self._include_dimensions = False
    return self
def exclude_contexts(self) -> FactQuery:
    """
    Turn off context columns in DataFrame output.

    Returns:
        Self for method chaining
    """
    # Read by to_dataframe(), which then drops the context/entity columns
    self._include_contexts = False
    return self
def exclude_element_info(self) -> FactQuery:
    """
    Turn off element catalog columns in DataFrame output.

    Returns:
        Self for method chaining
    """
    # Read by to_dataframe(), which then drops the 'element_*' columns
    self._include_element_info = False
    return self
def sort_by(self, column: str, ascending: bool = True) -> FactQuery:
    """
    Choose the ordering of the results.

    Calling this again replaces the previous ordering.

    Args:
        column: Name of the fact field to sort by
        ascending: True for ascending order, False for descending

    Returns:
        Self for method chaining
    """
    self._sort_by, self._sort_ascending = column, ascending
    return self
def limit(self, n: int) -> FactQuery:
    """
    Cap the number of results.

    Args:
        n: Maximum number of results to return; execute() slices the result
            list to its first n entries

    Returns:
        Self for method chaining
    """
    self._limit = n
    return self
def from_statement(self, statement_type: str) -> 'FactQuery':
    """
    Keep only facts that belong to the given statement.

    The statement type is also remembered on the query object.

    Args:
        statement_type: Type of statement (e.g., 'BalanceSheet', 'IncomeStatement')

    Returns:
        Self for method chaining
    """
    def statement_filter(f):
        # .get() so that facts without a statement type compare as None
        return f.get('statement_type') == statement_type

    self._statement_type = statement_type
    self._filters.append(statement_filter)
    return self
def transform(self, transform_fn: Callable[[Any], Any]) -> 'FactQuery':
    """
    Register a function that rewrites fact values.

    Transformations run in registration order when the query is executed,
    each receiving the current 'value' of a fact and returning its replacement.

    Args:
        transform_fn: Function to transform values

    Returns:
        Self for method chaining
    """
    pending = self._transformations
    pending.append(transform_fn)
    return self
def scale(self, scale_factor: int) -> 'FactQuery':
    """
    Register a transformation that divides numeric values by a factor.

    Only int, float and Decimal values are divided; anything else (for
    example text values) passes through unchanged.

    Args:
        scale_factor: The scaling factor (e.g., 1000 for thousands)

    Returns:
        Self for method chaining
    """
    numeric_types = (int, float, Decimal)

    def scale_transform(value):
        if not isinstance(value, numeric_types):
            return value
        return value / scale_factor

    return self.transform(scale_transform)
def aggregate(self, dimension: str, func: str = 'sum') -> 'FactQuery':
    """
    Request an aggregation of values grouped by a dimension.

    The request is only recorded here; it is carried out by execute(), which
    groups facts on the key 'dim_<dimension>'.

    Args:
        dimension: The dimension to aggregate by
        func: Aggregation function ('sum' or 'average')

    Returns:
        Self for method chaining
    """
    request = {'dimension': dimension, 'function': func}
    self._aggregations.append(request)
    return self
def execute(self) -> List[Dict[str, Any]]:
    """
    Execute the query and return matching facts.

    Stages run in this order: filters, transformations, aggregations,
    sorting, limit. The list and the fact dicts supplied by the facts view
    are never modified: the list is copied before sorting and fact dicts are
    copied before a transformation rewrites their 'value'. Executing the same
    query twice therefore yields the same result.

    When aggregations are present, the result is a list of dicts of the form
    {'dimension': ..., 'value': <dimension member>, 'values': {<func>: <result>}}
    instead of fact dicts.

    Returns:
        List of fact dictionaries (or aggregation dictionaries)

    Raises:
        ValueError: If an aggregation names a function other than 'sum' or 'average'
    """
    # Shallow copy: the view may hand out its cached list, which must not be
    # reordered by the in-place sort below.
    results = list(self._facts_view.get_facts())

    # Apply filters
    for filter_func in self._filters:
        results = [f for f in results if filter_func(f)]

    # Apply transformations to copies so the view's fact dicts keep their
    # original values (otherwise e.g. scale() would compound on every call).
    if self._transformations:
        results = [dict(fact) for fact in results]
        for transform_fn in self._transformations:
            for fact in results:
                if 'value' in fact and fact['value'] is not None:
                    fact['value'] = transform_fn(fact['value'])

    # Apply aggregations
    if self._aggregations:
        aggregated_results = {}
        for agg in self._aggregations:
            dimension = agg['dimension']
            func = agg['function']

            # Group values by the member of the requested dimension
            groups = {}
            for fact in results:
                dim_value = fact.get(f'dim_{dimension}')
                if dim_value and 'value' in fact and fact['value'] is not None:
                    groups.setdefault(dim_value, []).append(fact['value'])

            # Apply aggregation function
            for dim_value, values in groups.items():
                if func == 'sum':
                    agg_value = sum(values)
                elif func == 'average':
                    agg_value = sum(values) / len(values)
                else:
                    raise ValueError(
                        f"Unsupported aggregation function: {func!r} "
                        "(expected 'sum' or 'average')"
                    )
                key = (dimension, dim_value)
                if key not in aggregated_results:
                    aggregated_results[key] = {'dimension': dimension, 'value': dim_value, 'values': {}}
                aggregated_results[key]['values'][func] = agg_value
        results = list(aggregated_results.values())

    # Apply sorting if specified (only when the first result has the sort field)
    if results and self._sort_by and self._sort_by in results[0]:
        results.sort(key=lambda f: f.get(self._sort_by, ''),
                     reverse=not self._sort_ascending)

    # Apply limit if specified
    if self._limit is not None:
        results = results[:self._limit]

    return results
def to_dataframe(self, *columns) -> pd.DataFrame:
    """
    Execute the query and return results as a DataFrame.

    The query is executed afresh on every call. Results are deliberately not
    memoized: the query object is a mutable builder, so a cache keyed on the
    instance would keep returning the frame built before later filters or
    exclude_* flags were applied, and would hand the same mutable frame to
    every caller.

    Post-processing applied to the raw results:
    - column groups switched off via exclude_* are removed
    - columns that are entirely empty are dropped
    - 'statement_role' is replaced by 'statement_name' (text after the last '/')
    - 'fact_key' and 'original_label' are omitted
    - a fixed set of key columns is placed first, the rest keep their order

    Args:
        *columns: Names of columns to include; names not present in the
            results are ignored. With no names, all columns are returned.

    Returns:
        pandas DataFrame with query results (empty if nothing matched)
    """
    results = self.execute()
    if not results:
        return pd.DataFrame()
    df = pd.DataFrame(results)

    # Filter columns based on inclusion flags
    if not self._include_dimensions:
        df = df.loc[:, [col for col in df.columns if not col.startswith('dim_')]]
    if not self._include_contexts:
        context_cols = ['context_ref', 'entity_identifier', 'entity_scheme',
                        'period_type']
        df = df.loc[:, [col for col in df.columns if col not in context_cols]]
    if not self._include_element_info:
        element_cols = ['element_id', 'element_name', 'element_type', 'element_period_type',
                        'element_balance', 'element_label']
        df = df.loc[:, [col for col in df.columns if col not in element_cols]]

    # Drop empty columns
    df = df.dropna(axis=1, how='all')

    # Filter columns if specified
    if columns:
        selected = [col for col in columns if col in df.columns]
        # Copy so that adding 'statement_name' below never writes into a view
        df = df[selected].copy()

    # Columns never shown in the output.
    # Note: period_key is now included for time series analysis (Issue #464)
    skip_columns = ['fact_key', 'original_label']

    if 'statement_role' in df.columns:
        # Derive the statement name from the role URI, then drop the role itself
        df['statement_name'] = df['statement_role'].fillna('').apply(
            lambda s: s.split('/')[-1] if s else None)
        df = df.drop(columns=['statement_role'])

    # Order columns: key columns first, everything else in existing order
    first_columns = [col for col in
                     ['concept', 'label', 'balance', 'preferred_sign', 'weight', 'value', 'numeric_value',
                      'period_key', 'period_start', 'period_end', 'period_instant',
                      'decimals', 'statement_type', 'statement_name']
                     if col in df.columns]
    ordered = first_columns + [col for col in df.columns
                               if col not in first_columns
                               and col not in skip_columns]
    return df[ordered]
def __rich__(self):
    """
    Build the rich panel used to display this query.

    The panel contains a short usage note listing the available DataFrame
    columns, followed by a table of the query results restricted to the
    columns 'concept', 'label', 'value', 'period_start' and 'period_end'
    (those that are present). The panel is titled "Facts Query" and
    subtitled with the entity name and document type of the facts view.

    Returns:
        rich Panel wrapping the description and the results table
    """
    title = Text.assemble("Facts Query")
    subtitle = Text.assemble((self._facts_view.entity_name, "bold deep_sky_blue1"),
                             " - ",
                             self._facts_view.document_type
                             )
    df = self.to_dataframe().fillna('')
    columns = df.columns.tolist()
    description = Markdown(
        dedent(f"""
        Use *to_dataframe(columns)* to get a DataFrame of the results.
        e.g. `query.to_dataframe('concept', 'value', 'period_end')`
        Available columns:
        '{', '.join(columns)}'
        """)
    )
    display_columns = [col for col in ['concept', 'label', 'value', 'period_start', 'period_end']
                       if col in columns]

    # Size the concept column to its longest entry so names are not wrapped
    max_width = df.concept.apply(len).max() if 'concept' in df.columns else 20

    # One header per displayed column. Building the headers from
    # display_columns keeps them aligned with the data even when 'concept'
    # is absent (e.g. aggregated results), instead of always labelling the
    # first column 'concept'.
    rich_columns = [Column(col, width=max_width) if col == 'concept' else col
                    for col in display_columns]

    df = df[display_columns]
    table = Table(*rich_columns, show_header=True, header_style="bold", box=box.SIMPLE)
    for record in df.itertuples(index=False):
        table.add_row(*[str(cell) for cell in record])

    panel = Panel(Group(description, table), title=title, subtitle=subtitle, box=box.ROUNDED)
    return panel
def __repr__(self):
    """Return repr_rich() applied to the panel built by __rich__()."""
    panel = self.__rich__()
    return repr_rich(panel)
class FactsView:
"""
A view over all facts in an XBRL instance, providing methods to query and analyze facts.
"""
def __init__(self, xbrl):
"""
Initialize the FactsView with an XBRL instance.
Args:
xbrl: XBRL instance containing facts, contexts, and elements
"""
self.xbrl = xbrl
self._facts_cache = None
self._facts_df_cache = None
def __len__(self):
return len(self.get_facts())
@property
def entity_name(self):
    """Entity name, as reported by the wrapped XBRL instance."""
    source = self.xbrl
    return source.entity_name
@property
def document_type(self):
    """Document type, as reported by the wrapped XBRL instance."""
    source = self.xbrl
    return source.document_type
def get_facts(self) -> List[Dict[str, Any]]:
    """
    Get all facts with enriched context and element information.

    The enriched list is built on the first call and cached on the instance;
    later calls return the very same list object, so callers should treat the
    list and the dicts in it as read-only.

    Every fact dict contains 'fact_key', 'concept', 'context_ref', 'value',
    'unit_ref', 'decimals' and 'numeric_value'. Depending on what context and
    element data is available, it may also contain:
    - 'period_type' with 'period_instant' or 'period_start'/'period_end'
    - 'entity_identifier', 'entity_scheme'
    - one 'dim_<name>' key per dimension (colons in the name become underscores)
    - 'period_key', 'fiscal_period', 'fiscal_year'
    - 'label', 'original_label', 'balance', 'preferred_sign', 'weight'
    - 'statement_type', 'statement_role'

    Returns:
        List of enriched fact dictionaries
    """
    # Return cached facts if available
    if self._facts_cache is not None:
        return self._facts_cache
    # Prepare a mapping of roles to statement types for faster lookup
    # This avoids repeated calls to get_all_statements() for each fact
    role_to_statement_type = {}
    statements = self.xbrl.get_all_statements()
    for stmt in statements:
        if stmt['role'] and stmt['type']:
            role_to_statement_type[stmt['role']] = (stmt['type'], stmt['role'])
    # Prepare a mapping of period keys to fiscal info for faster lookup
    period_to_fiscal_info = {}
    for period in self.xbrl.reporting_periods:
        if 'key' in period:
            fiscal_info = {}
            if 'fiscal_period' in period:
                fiscal_info['fiscal_period'] = period['fiscal_period']
            if 'fiscal_year' in period:
                fiscal_info['fiscal_year'] = period['fiscal_year']
            period_to_fiscal_info[period['key']] = fiscal_info
    # Build enriched facts from raw facts, contexts, and elements
    enriched_facts = []
    for fact_key, fact in self.xbrl._facts.items():
        # Create a dict with only necessary fields instead of full model_dump
        fact_dict = {
            'fact_key': fact_key,
            'concept': fact.element_id,
            'context_ref': fact.context_ref,
            'value': fact.value,
            'unit_ref': fact.unit_ref,
            'decimals': fact.decimals,
            'numeric_value': fact.numeric_value
        }
        # Split element name from context for better concept display
        # Don't override if element_id already has a namespace prefix with colon
        # NOTE(review): this keeps only the text before the FIRST underscore of
        # fact_key - presumably fact keys look like "<element>_<context>" with
        # no underscore inside the element part; confirm against the parser.
        if "_" in fact_key and ":" not in fact_dict['concept']:
            parts = fact_key.split("_", 1)
            if len(parts) == 2:
                fact_dict['concept'] = parts[0]
        # Add context information
        if fact.context_ref in self.xbrl.contexts:
            context = self.xbrl.contexts[fact.context_ref]
            # Add period information - extract only what we need
            if context.period:
                # Handle both object and dict representations of period
                # (Model objects are converted to dicts in some contexts)
                if hasattr(context.period, 'type'):
                    # Object access
                    period_type = context.period.type
                    fact_dict['period_type'] = period_type
                    if period_type == 'instant':
                        fact_dict['period_instant'] = context.period.instant
                    elif period_type == 'duration':
                        fact_dict['period_start'] = context.period.startDate
                        fact_dict['period_end'] = context.period.endDate
                elif isinstance(context.period, dict):
                    # Dict access
                    period_type = context.period.get('type')
                    fact_dict['period_type'] = period_type
                    if period_type == 'instant':
                        fact_dict['period_instant'] = context.period.get('instant')
                    elif period_type == 'duration':
                        fact_dict['period_start'] = context.period.get('startDate')
                        fact_dict['period_end'] = context.period.get('endDate')
            # Add entity information - extract only what we need
            if context.entity:
                # Handle both object and dict representations of entity
                if hasattr(context.entity, 'identifier'):
                    # Object access
                    fact_dict['entity_identifier'] = context.entity.identifier
                    fact_dict['entity_scheme'] = context.entity.scheme
                elif isinstance(context.entity, dict):
                    # Dict access
                    fact_dict['entity_identifier'] = context.entity.get('identifier')
                    fact_dict['entity_scheme'] = context.entity.get('scheme')
            # Add dimensions - handle both object and dict representation
            # Each dimension becomes a 'dim_<name>' key, with ':' in the name
            # replaced by '_'; the value is stored as-is.
            if hasattr(context, 'dimensions') and context.dimensions:
                # Check if dimensions is a dict or an attribute
                if isinstance(context.dimensions, dict):
                    for dim_name, dim_value in context.dimensions.items():
                        dim_key = f"dim_{dim_name.replace(':', '_')}"
                        fact_dict[dim_key] = dim_value
                elif hasattr(context.dimensions, 'items'):
                    # Handle case where dimensions has items() method but isn't a dict
                    for dim_name, dim_value in context.dimensions.items():
                        dim_key = f"dim_{dim_name.replace(':', '_')}"
                        fact_dict[dim_key] = dim_value
            # Get period key from context_period_map if available
            period_key = self.xbrl.context_period_map.get(fact.context_ref)
            if period_key:
                fact_dict['period_key'] = period_key
                # Add fiscal info if available
                if period_key in period_to_fiscal_info:
                    fact_dict.update(period_to_fiscal_info[period_key])
        # Add element information and statement type
        # Normalize element_id to match catalog keys (replace ':' with '_')
        element_id = fact.element_id.replace(':', '_')
        if element_id in self.xbrl.element_catalog:
            element = self.xbrl.element_catalog[element_id]
            # First look up preferred_label from presentation trees
            # to ensure label consistency between rendering and facts
            preferred_label = None
            for _role, tree in self.xbrl.presentation_trees.items():
                if element_id in tree.all_nodes:
                    # Get presentation node to find preferred_label
                    pres_node = tree.all_nodes[element_id]
                    if pres_node.preferred_label:
                        preferred_label = pres_node.preferred_label
                        break  # Use the first preferred_label found
            # Add label using the same selection logic as display_label
            # but including the preferred_label we found above
            label = select_display_label(
                labels=element.labels,
                standard_label=element.labels.get(STANDARD_LABEL),
                preferred_label=preferred_label,  # May be None, which is handled by select_display_label
                element_id=element_id,
                element_name=element.name
            )
            fact_dict['label'] = label
            # Store original label (will be used for standardization comparison)
            fact_dict['original_label'] = label
            # Add balance from element catalog (Issue #463)
            # Balance indicates accounting classification (debit/credit)
            # Try element catalog first, then fall back to static US-GAAP mapping
            balance = element.balance
            if balance is None:
                # Import here to avoid circular dependencies
                from edgar.xbrl.parsers.concepts import get_balance_type
                # Try to get balance from static mapping using the original concept ID
                balance = get_balance_type(fact.element_id)
            fact_dict['balance'] = balance  # "debit", "credit", or None
            # Add preferred_sign from presentation linkbase (Issue #463)
            # Convert preferredLabel to a numeric sign multiplier for display
            # -1 means "negate for display", 1 means "use as-is", None means "not specified"
            if preferred_label:
                # Common preferredLabel values that indicate negation
                # (both the short names and the full role URIs are listed)
                negation_labels = [
                    'negatedLabel',
                    'http://www.xbrl.org/2003/role/negatedLabel',
                    'negatedTerseLabel',
                    'http://www.xbrl.org/2003/role/negatedTerseLabel',
                    'negatedPeriodStartLabel',
                    'http://www.xbrl.org/2003/role/negatedPeriodStartLabel',
                    'negatedPeriodEndLabel',
                    'http://www.xbrl.org/2003/role/negatedPeriodEndLabel'
                ]
                fact_dict['preferred_sign'] = -1 if preferred_label in negation_labels else 1
            else:
                fact_dict['preferred_sign'] = None
            # Determine statement type by checking presentation trees using our precomputed mapping
            # The first tree that contains the element and maps to a known statement wins.
            for role, tree in self.xbrl.presentation_trees.items():
                if element_id in tree.all_nodes and role in role_to_statement_type:
                    statement_type, statement_role = role_to_statement_type[role]
                    fact_dict['statement_type'] = statement_type
                    fact_dict['statement_role'] = statement_role
                    break
            # Add weight from calculation tree (Issue #463)
            # Weight indicates calculation role (1.0 = add, -1.0 = subtract)
            # Note: Weight is role-specific, use primary statement role when available
            statement_type = fact_dict.get('statement_type')
            fact_dict['weight'] = self._get_primary_weight(element_id, statement_type)
        enriched_facts.append(fact_dict)
    # Cache the enriched facts
    self._facts_cache = enriched_facts
    return self._facts_cache
def query(self) -> FactQuery:
    """
    Create a fresh, unfiltered query over this view's facts.

    Returns:
        FactQuery: A new query builder bound to this view
    """
    builder = FactQuery(self)
    return builder
def to_dataframe(self) -> pd.DataFrame:
    """
    Return every fact in the view as one DataFrame.

    The frame is built from get_facts() on first use and then cached; later
    calls return the same DataFrame object.

    Returns:
        pandas DataFrame containing all facts
    """
    cached = self._facts_df_cache
    if cached is None:
        cached = pd.DataFrame(self.get_facts())
        self._facts_df_cache = cached
    return cached
def get_statement_facts(self, statement_type: str) -> pd.DataFrame:
    """
    Return the facts that belong to one statement.

    Args:
        statement_type: Type of statement ('BalanceSheet', 'IncomeStatement', etc.)

    Returns:
        pandas DataFrame with facts for the specified statement
    """
    statement_query = self.query().by_statement_type(statement_type)
    return statement_query.to_dataframe()
def get_facts_by_concept(self, concept_pattern: str, exact: bool = False) -> pd.DataFrame:
    """
    Return the facts whose concept name matches a pattern.

    Args:
        concept_pattern: Pattern to match against concept names
        exact: If True, perform exact matching; otherwise, use regex

    Returns:
        pandas DataFrame with matching facts
    """
    concept_query = self.query().by_concept(concept_pattern, exact)
    return concept_query.to_dataframe()
def search_facts(self, text_pattern: str) -> pd.DataFrame:
    """
    Free-text search over the facts.

    Delegates to FactQuery.by_text, which looks for the pattern in concept
    names, labels, and element names.

    Args:
        text_pattern: Text pattern to search for

    Returns:
        pandas DataFrame with matching facts
    """
    text_query = self.query().by_text(text_pattern)
    return text_query.to_dataframe()
def get_facts_with_dimensions(self) -> pd.DataFrame:
    """
    Return the facts that carry at least one dimensional qualifier.

    Returns:
        pandas DataFrame with dimensionally-qualified facts
    """
    def has_dimension(f):
        # Dimensions are stored on a fact under keys prefixed with 'dim_'
        return any(key.startswith('dim_') for key in f.keys())

    return self.query().by_custom(has_dimension).to_dataframe()
def get_facts_by_period(self, period_key: str) -> pd.DataFrame:
    """
    Return the facts reported for one period.

    Args:
        period_key: Period key from reporting_periods

    Returns:
        pandas DataFrame with facts for the specified period
    """
    period_query = self.query().by_period_key(period_key)
    return period_query.to_dataframe()
def get_facts_by_period_view(self, statement_type: str, period_view_name: str) -> pd.DataFrame:
    """
    Return the facts covered by a named period view
    (e.g., "Annual Comparison", "Three-Year Comparison").

    The view is looked up by name among the period views the XBRL instance
    offers for the statement type; the first view with that name is used.

    Args:
        statement_type: Type of statement ('BalanceSheet', 'IncomeStatement', etc.)
        period_view_name: Name of the period view as defined in get_period_views

    Returns:
        pandas DataFrame with facts for the specified period view, or an
        empty DataFrame when no view has that name
    """
    selected_view = None
    for candidate in self.xbrl.get_period_views(statement_type):
        if candidate['name'] == period_view_name:
            selected_view = candidate
            break

    if not selected_view:
        # Unknown view name: nothing to return
        return pd.DataFrame()

    period_keys = selected_view['period_keys']

    # Narrow by statement first (when one was given), then by the view's periods
    query = self.query()
    if statement_type:
        query = query.by_statement_type(statement_type)
    return query.by_period_keys(period_keys).to_dataframe()
def get_facts_by_fiscal_period(self, fiscal_year: Union[int, str],
                               fiscal_period: str) -> pd.DataFrame:
    """
    Return the facts reported for one fiscal year / fiscal period pair.

    Args:
        fiscal_year: Fiscal year
        fiscal_period: Fiscal period ('FY', 'Q1', 'Q2', 'Q3', 'Q4')

    Returns:
        pandas DataFrame with facts for the specified fiscal period
    """
    # Year filter first, then the period filter, as two chained steps
    year_query = self.query().by_fiscal_year(fiscal_year)
    period_query = year_query.by_fiscal_period(fiscal_period)
    return period_query.to_dataframe()
def summarize(self) -> Dict[str, Any]:
    """
    Generate a summary of facts in the XBRL instance.

    All statistics are collected in a single pass over ``self.get_facts()``.

    Returns:
        Dictionary with fact summary statistics:

        - ``total_facts``: number of facts
        - ``by_type``: count per ``element_type`` ('unknown' when absent)
        - ``by_statement``: count per ``statement_type`` ('unknown' when absent)
        - ``by_period_type``: count per ``period_type`` ('unknown' when absent)
        - ``dimensions``: sorted dimension names (fact keys starting with
          ``dim_``, with that prefix removed)
        - ``periods``: sorted distinct ``period_key`` values
    """
    dim_prefix = 'dim_'
    facts = self.get_facts()

    by_type: Dict[Any, int] = {}
    by_statement: Dict[Any, int] = {}
    by_period_type: Dict[Any, int] = {}
    # Each counter is keyed by the value of one fact field
    counters = (
        (by_type, 'element_type'),
        (by_statement, 'statement_type'),
        (by_period_type, 'period_type'),
    )
    dimensions = set()
    periods = set()

    for fact in facts:
        for counts, field in counters:
            bucket = fact.get(field, 'unknown')
            counts[bucket] = counts.get(bucket, 0) + 1

        for key in fact:
            if key.startswith(dim_prefix):
                # Strip only the leading prefix; a 'dim_' occurring later
                # in the name is part of the dimension name itself.
                dimensions.add(key[len(dim_prefix):])

        if 'period_key' in fact:
            periods.add(fact['period_key'])

    return {
        'total_facts': len(facts),
        'by_type': by_type,
        'by_statement': by_statement,
        'by_period_type': by_period_type,
        'dimensions': sorted(dimensions),
        'periods': sorted(periods),
    }
def get_unique_concepts(self) -> List[str]:
    """
    Collect the distinct concept names present in the facts.

    Facts without a ``'concept'`` key are ignored.

    Returns:
        Sorted list of unique concept names
    """
    seen_concepts = set()
    for fact in self.get_facts():
        if 'concept' in fact:
            seen_concepts.add(fact.get('concept'))
    return sorted(seen_concepts)
def get_unique_dimensions(self) -> Dict[str, Set[str]]:
    """
    Get unique dimensions and their values.

    A dimension is any fact key starting with ``dim_``; the dimension name
    is the key with that leading prefix removed.

    Returns:
        Dictionary mapping dimension names to sets of possible values
    """
    dim_prefix = 'dim_'
    dimensions: Dict[str, Set[str]] = {}
    for fact in self.get_facts():
        for key, value in fact.items():
            if key.startswith(dim_prefix):
                # Slice off the prefix only; str.replace would also delete
                # any later 'dim_' inside the dimension name.
                dim_name = key[len(dim_prefix):]
                dimensions.setdefault(dim_name, set()).add(value)
    return dimensions
def get_available_period_views(self, statement_type: str) -> List[Dict[str, Any]]:
    """
    List the period views of a statement, each annotated with a fact count.

    The views come from ``self.xbrl.get_period_views``; every view dict
    gets a ``'facts_count'`` entry added in place. These are the views that
    can be used with get_facts_by_period_view.

    Args:
        statement_type: Type of statement ('BalanceSheet', 'IncomeStatement', etc.)

    Returns:
        List of period view dicts, each with an added ``'facts_count'``
    """
    views = self.xbrl.get_period_views(statement_type)

    for view in views:
        keys = view.get('period_keys', [])
        if not keys:
            # A view without period keys cannot match any fact
            view['facts_count'] = 0
            continue

        matching_facts = (
            self.query()
            .by_statement_type(statement_type)
            .by_period_keys(keys)
            .execute()
        )
        view['facts_count'] = len(matching_facts)

    return views
def pivot_by_period(self, concept_pattern: Optional[str] = None,
                    statement_type: Optional[str] = None) -> pd.DataFrame:
    """
    Create a pivoted view of facts by period.

    Args:
        concept_pattern: Optional concept pattern to filter by
        statement_type: Optional statement type to filter by

    Returns:
        pandas DataFrame with concepts as rows and periods as columns.
        An empty DataFrame is returned when no facts match. If the facts
        lack any of the ``period_key``, ``concept`` or ``numeric_value``
        columns, the unpivoted DataFrame is returned instead.
    """
    query = self.query()
    if concept_pattern:
        query = query.by_concept(concept_pattern)
    if statement_type:
        query = query.by_statement_type(statement_type)

    df = query.to_dataframe()
    if df.empty:
        return pd.DataFrame()

    # Pivoting needs all three of these columns; otherwise hand back the raw frame
    if not {'period_key', 'concept', 'numeric_value'}.issubset(df.columns):
        return df

    # 'label' is part of the row index only when present, so a frame
    # without labels pivots on the concept alone instead of raising KeyError.
    index_cols = ['concept', 'label'] if 'label' in df.columns else ['concept']

    pivot = df.pivot_table(
        values='numeric_value',
        index=index_cols,
        columns='period_key',
        aggfunc='first'  # Take first occurrence for each concept-period combo
    )
    # Reset index to make the index columns regular columns
    return pivot.reset_index()
def pivot_by_dimension(self, dimension: str,
                       concept_pattern: Optional[str] = None,
                       period_key: Optional[str] = None) -> pd.DataFrame:
    """
    Create a pivoted view of facts by dimension values.

    Args:
        dimension: Dimension to pivot by
        concept_pattern: Optional concept pattern to filter by
        period_key: Optional period key to filter by

    Returns:
        pandas DataFrame with concepts as rows and dimension values as
        columns. An empty DataFrame is returned when no facts match. If the
        facts lack the ``dim_<dimension>``, ``concept`` or ``numeric_value``
        column, the unpivoted DataFrame is returned instead.
    """
    query = self.query()

    # Apply filters if provided
    if concept_pattern:
        query = query.by_concept(concept_pattern)
    if period_key:
        query = query.by_custom(lambda f: 'period_key' in f and f['period_key'] == period_key)

    # Ensure we only get facts with this dimension
    query = query.by_dimension(dimension)

    df = query.to_dataframe()
    if df.empty:
        return pd.DataFrame()

    dim_col = f"dim_{dimension}"

    # Pivoting needs all three of these columns; otherwise hand back the raw frame
    if not {dim_col, 'concept', 'numeric_value'}.issubset(df.columns):
        return df

    # 'label' is part of the row index only when present, so a frame
    # without labels pivots on the concept alone instead of raising KeyError.
    index_cols = ['concept', 'label'] if 'label' in df.columns else ['concept']

    pivot = df.pivot_table(
        values='numeric_value',
        index=index_cols,
        columns=dim_col,
        aggfunc='first'  # Take first occurrence for each concept-dimension combo
    )
    # Reset index to make the index columns regular columns
    return pivot.reset_index()
def time_series(self, concept: str, exact: bool = True) -> pd.DataFrame:
    """
    Build a date-ordered series of the facts reported for one concept.

    Each fact is dated by ``period_instant`` when its ``period_type`` is
    'instant' and by ``period_end`` when it is 'duration'; facts that end
    up without a date are dropped.

    Args:
        concept: Concept name to create time series for
        exact: If True, require exact concept match; otherwise, use pattern matching

    Returns:
        pandas DataFrame sorted by ``date`` with the columns ``date``,
        ``numeric_value`` and ``unit_ref``, followed by ``label``,
        ``fiscal_period`` and ``fiscal_year`` when available and then any
        ``dim_*`` columns. Empty DataFrame when no facts match.
    """
    df = self.query().by_concept(concept, exact).to_dataframe()
    if df.empty:
        return pd.DataFrame()

    def _fact_date(row):
        # Instant facts are dated by their instant, durations by their end
        kind = row.get('period_type')
        if kind == 'instant':
            return row.get('period_instant')
        if kind == 'duration':
            return row.get('period_end')
        return None

    df['date'] = df.apply(_fact_date, axis=1)

    # Keep only dated rows, then order them chronologically
    df = df.dropna(subset=['date'])
    df['date'] = pd.to_datetime(df['date'])
    df = df.sort_values('date')

    # Fixed columns first, then optional descriptive ones, then dimensions
    optional_cols = [name for name in ('label', 'fiscal_period', 'fiscal_year')
                     if name in df.columns]
    dimension_cols = [name for name in df.columns if name.startswith('dim_')]
    selected = ['date', 'numeric_value', 'unit_ref'] + optional_cols + dimension_cols

    return df[selected]
def facts_history(self, concept: str, date_col: str = 'period_end',
                  include_dimensions: bool = True) -> pd.DataFrame:
    """
    Get the history of a concept across time, optionally including dimensions.

    The concept is matched exactly. Rows without a value in ``date_col``
    are dropped, and the column is converted to datetimes.

    Args:
        concept: Concept name to track
        date_col: Date column to use for time series ('period_end', 'period_instant')
        include_dimensions: Whether to include dimensional breakdowns

    Returns:
        pandas DataFrame with time series data:

        - with ``include_dimensions`` and at least one ``dim_*`` column: a
          pivot indexed by ``date_col`` with one column per combination of
          dimension values (joined with '-'), sorted by date;
        - otherwise: rows sorted by ``date_col`` with the columns
          ``concept``, ``label``, ``date_col``, ``numeric_value``,
          ``unit_ref`` plus ``fiscal_period`` / ``fiscal_year`` if present;
        - an empty DataFrame when no facts match or the facts have no
          ``date_col`` column at all.
    """
    df = self.query().by_concept(concept, True).to_dataframe()

    # No facts, or none of them carries the requested date column (e.g.
    # asking for 'period_instant' on a duration-only concept): nothing to plot.
    if df.empty or date_col not in df.columns:
        return pd.DataFrame()

    # Filter to only rows with the date column, then convert to datetime
    df = df.dropna(subset=[date_col])
    df[date_col] = pd.to_datetime(df[date_col])

    if include_dimensions:
        dimension_cols = [col for col in df.columns if col.startswith('dim_')]
        if dimension_cols:
            # One key per row combining all of its dimension values
            df['dimension_key'] = df.apply(
                lambda row: '-'.join(str(row.get(col, '')) for col in dimension_cols),
                axis=1
            )
            # Pivot to show time series by dimension
            pivot = df.pivot_table(
                values='numeric_value',
                index=[date_col],
                columns=['dimension_key'],
                aggfunc='first'
            )
            return pivot.sort_index()

    # Simple time series without dimensions; select every column in one go
    # rather than assigning extra columns onto a slice afterwards.
    columns = ['concept', 'label', date_col, 'numeric_value', 'unit_ref']
    for extra in ('fiscal_period', 'fiscal_year'):
        if extra in df.columns and extra not in columns:
            columns.append(extra)
    return df.sort_values(date_col)[columns]
def _get_primary_weight(self, element_id: str, statement_type: Optional[str]) -> Optional[float]:
    """
    Look up the calculation weight of an element, preferring the role of
    its primary statement.

    Calculation trees are scanned in order. A tree is preferred when its
    role URI (lower-cased) contains a keyword associated with
    ``statement_type``; if no preferred tree contains the element, the
    first tree of any role that does is used.

    Args:
        element_id: Normalized element ID (e.g., 'us_gaap_Revenue')
        statement_type: Statement type ('IncomeStatement', 'BalanceSheet', etc.)

    Returns:
        The ``weight`` of the matching calculation node, or None when the
        XBRL object has no calculation trees or the element is in none of them
    """
    if not hasattr(self.xbrl, 'calculation_trees'):
        return None

    # Statement type -> substrings that mark a role URI as belonging to it
    role_hints = (
        ("IncomeStatement", ("income",)),
        ("BalanceSheet", ("balance", "position")),
        ("CashFlowStatement", ("cash",)),
    )

    # First pass: only roles that look like the requested statement
    for role_uri, calc_tree in self.xbrl.calculation_trees.items():
        if not statement_type:
            continue
        role_lower = role_uri.lower()
        for stmt_name, hints in role_hints:
            if statement_type == stmt_name and any(hint in role_lower for hint in hints):
                candidate = calc_tree.all_nodes.get(element_id)
                if candidate:
                    return candidate.weight
                # Role matched but the element is not in it; try the next role
                break

    # Second pass: first weight found in any role
    for calc_tree in self.xbrl.calculation_trees.values():
        candidate = calc_tree.all_nodes.get(element_id)
        if candidate:
            return candidate.weight

    # Element is absent from every calculation tree
    return None
def clear_cache(self) -> None:
    """Reset the cached facts list and the cached facts DataFrame to None."""
    for cache_attr in ('_facts_cache', '_facts_df_cache'):
        setattr(self, cache_attr, None)
def __str__(self):
    """Return a short description naming the underlying XBRL object."""
    return "Facts for {}".format(self.xbrl)
@property
def _title_text(self):
    """Styled title: "XBRL Facts for <entity name> - <document type>"."""
    title_parts = [
        ("XBRL Facts for ", "bold white"),
        (self.xbrl.entity_name, "bold deep_sky_blue1"),
        (" - ", "bold magenta"),
        (self.xbrl.document_type, "bold white"),
    ]
    return Text.assemble(*title_parts)
def add_facts_view(xbrl):
    """
    Create a FactsView for an XBRL object and attach it as ``xbrl.facts_view``.

    Args:
        xbrl: XBRL instance

    Returns:
        The newly created FactsView instance
    """
    xbrl.facts_view = view = FactsView(xbrl)
    return view