Files
edgartools/venv/lib/python3.10/site-packages/edgar/xbrl/parsers/instance.py
2025-12-09 12:13:01 +01:00

769 lines
34 KiB
Python

"""
Instance parser for XBRL documents.
This module handles parsing of XBRL instance documents including facts, contexts,
units, footnotes, and entity information extraction.
"""
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List, Optional, Union

from lxml import etree as ET

from edgar.core import log
from edgar.xbrl.core import NAMESPACES, classify_duration
from edgar.xbrl.models import Context, Fact, XBRLProcessingError

from .base import BaseParser
class InstanceParser(BaseParser):
"""Parser for XBRL instance documents."""
def __init__(self, contexts: Dict[str, Context], facts: Dict[str, Fact],
             units: Dict[str, Any], footnotes: Dict[str, Any],
             calculation_trees: Dict[str, Any], entity_info: Dict[str, Any],
             reporting_periods: List[Dict[str, Any]], context_period_map: Dict[str, str]):
    """
    Initialize the instance parser.

    All arguments are shared, mutable data structures owned by the caller;
    this parser stores references (not copies) and mutates them in place as
    instance documents are parsed.

    Args:
        contexts: Shared contexts dictionary
        facts: Shared facts dictionary
        units: Shared units dictionary
        footnotes: Shared footnotes dictionary
        calculation_trees: Shared calculation trees dictionary
        entity_info: Shared entity info dictionary
        reporting_periods: Shared reporting periods list
        context_period_map: Shared context-to-period map
    """
    super().__init__()
    # Bind each shared structure onto the instance under the same name.
    shared_structures = {
        'contexts': contexts,
        'facts': facts,
        'units': units,
        'footnotes': footnotes,
        'calculation_trees': calculation_trees,
        'entity_info': entity_info,
        'reporting_periods': reporting_periods,
        'context_period_map': context_period_map,
    }
    for attr_name, reference in shared_structures.items():
        setattr(self, attr_name, reference)
    # DEI facts captured during entity-info extraction (concept name -> Fact)
    self.dei_facts: Dict[str, Fact] = {}
def _create_normalized_fact_key(self, element_id: str, context_ref: str, instance_id: int = None) -> str:
"""
Create a normalized fact key using underscore format.
Args:
element_id: The element ID
context_ref: The context reference
instance_id: Optional instance ID for duplicate facts
Returns:
Normalized key in format: element_id_context_ref[_instance_id]
"""
normalized_element_id = element_id
if ':' in element_id:
prefix, name = element_id.split(':', 1)
normalized_element_id = f"{prefix}_{name}"
if instance_id is not None:
return f"{normalized_element_id}_{context_ref}_{instance_id}"
return f"{normalized_element_id}_{context_ref}"
def parse_instance(self, file_path: Union[str, Path]) -> None:
    """Parse instance document file and extract contexts, facts, and units.

    Args:
        file_path: Path to the XBRL instance document on disk.

    Raises:
        XBRLProcessingError: If the file cannot be read or parsed.
    """
    try:
        # Read as UTF-8 explicitly: parse_instance_content re-encodes the
        # text as UTF-8 for lxml, so decoding with the platform's default
        # locale encoding could corrupt non-ASCII content on some systems.
        content = Path(file_path).read_text(encoding='utf-8')
        self.parse_instance_content(content)
    except Exception as e:
        raise XBRLProcessingError(f"Error parsing instance file {file_path}: {str(e)}") from e
def parse_instance_content(self, content: str) -> None:
    """Parse instance document content and extract contexts, facts, and units."""
    try:
        # Accept either str or bytes; lxml parses bytes fastest.
        raw = content.encode('utf-8') if isinstance(content, str) else content
        # Recovery mode tolerates minor document defects; huge_tree lifts
        # lxml's default size limits, needed for large filings.
        xml_parser = ET.XMLParser(remove_blank_text=True, recover=True, huge_tree=True)
        root = ET.XML(raw, xml_parser)
        # Contexts and units are registered before facts so that fact
        # references resolve; footnotes then link back to extracted facts.
        self._extract_contexts(root)
        self._extract_units(root)
        self._extract_facts(root)
        self._extract_footnotes(root)
        # Derived views are built once all raw data is in place.
        self._extract_entity_info()
        self._build_reporting_periods()
    except Exception as e:
        raise XBRLProcessingError(f"Error parsing instance content: {str(e)}") from e
def count_facts(self, content: str) -> tuple:
    """Count the number of facts in the instance document.

    Counts both unique facts (distinct element/context combinations) and
    total fact instances, without constructing Fact objects.

    Args:
        content: Instance document content as str or bytes.

    Returns:
        tuple: (unique_facts_count, total_fact_instances)
    """
    # Use lxml's optimized parser with recovery mode for resilience.
    parser = ET.XMLParser(remove_blank_text=True, recover=True, huge_tree=True)
    if isinstance(content, str):
        content_bytes = content.encode('utf-8')
    else:
        content_bytes = content
    root = ET.XML(content_bytes, parser)
    # Known non-fact local names (namespace-brace form) skipped up front.
    skip_tag_endings = {'}context', '}unit', '}schemaRef'}
    total_fact_instances = 0  # every fact occurrence in the document
    unique_facts = set()      # normalized element_id + context_ref keys
    create_key = self._create_normalized_fact_key

    def count_element(element):
        """Count a single element if it looks like a fact."""
        nonlocal total_fact_instances
        tag = element.tag
        # Comment and processing-instruction nodes have non-string tags;
        # calling .endswith on them would raise AttributeError.
        if not isinstance(tag, str):
            return
        for ending in skip_tag_endings:
            if tag.endswith(ending):
                return
        # Only elements carrying a contextRef attribute are facts.
        context_ref = element.get('contextRef')
        if context_ref is None:
            return
        # Split lxml's '{namespace}local-name' (Clark notation) form.
        if '}' in tag:
            namespace, element_name = tag.split('}', 1)
            namespace = namespace[1:]  # drop the leading '{'
        else:
            element_name = tag
            namespace = None
        # Map the namespace URI to a standard prefix when possible.
        # Guard on namespace: it is None for unqualified tags (previously
        # this crashed with AttributeError on namespace.startswith).
        prefix = None
        if namespace:
            for std_prefix, std_uri_base in NAMESPACES.items():
                if namespace.startswith(std_uri_base):
                    prefix = std_prefix
                    break
            if not prefix:
                # Fall back to the last path segment of the namespace URI.
                parts = namespace.split('/')
                prefix = parts[-1] if parts else ''
        element_id = f"{prefix}:{element_name}" if prefix else element_name
        # Normalized key deduplicates element/context combinations.
        unique_facts.add(create_key(element_id, context_ref))
        total_fact_instances += 1

    # lxml exposes iterchildren/iterdescendants; plain ElementTree does not.
    if hasattr(root, 'iterchildren'):
        for child in root.iterchildren():
            count_element(child)
            for descendant in child.iterdescendants():
                count_element(descendant)
    else:
        for child in root:
            count_element(child)
            for descendant in child.findall('.//*'):
                count_element(descendant)
    return len(unique_facts), total_fact_instances
def _extract_contexts(self, root: ET.Element) -> None:
    """Extract contexts from instance document."""
    try:
        XBRLI = '{http://www.xbrl.org/2003/instance}'
        XBRLDI = '{http://xbrl.org/2006/xbrldi}'
        for ctx_elem in root.findall('.//' + XBRLI + 'context'):
            ctx_id = ctx_elem.get('id')
            if not ctx_id:
                continue
            ctx = Context(context_id=ctx_id)
            # --- entity identifier and segment dimensions ---
            entity = ctx_elem.find('.//' + XBRLI + 'entity')
            if entity is not None:
                ident = entity.find('.//' + XBRLI + 'identifier')
                if ident is not None:
                    ctx.entity = {
                        'scheme': ident.get('scheme', ''),
                        'identifier': ident.text
                    }
                segment = entity.find('.//' + XBRLI + 'segment')
                if segment is not None:
                    # Explicit dimensions: the member name is the element text.
                    for member in segment.findall('.//' + XBRLDI + 'explicitMember'):
                        axis = member.get('dimension')
                        if axis and member.text:
                            ctx.dimensions[axis] = member.text
                    # Typed dimensions: the value is the text of the first
                    # child element; fall back to its tag if there is no text.
                    for member in segment.findall('.//' + XBRLDI + 'typedMember'):
                        axis = member.get('dimension')
                        if axis:
                            for child in member:
                                text = child.text.strip() if child.text else ''
                                ctx.dimensions[axis] = text if text else child.tag
                                break
            # --- period: instant, duration, or forever ---
            period = ctx_elem.find('.//' + XBRLI + 'period')
            if period is not None:
                instant = period.find('.//' + XBRLI + 'instant')
                if instant is not None and instant.text:
                    ctx.period = {'type': 'instant', 'instant': instant.text}
                start = period.find('.//' + XBRLI + 'startDate')
                end = period.find('.//' + XBRLI + 'endDate')
                if start is not None and end is not None and start.text and end.text:
                    ctx.period = {
                        'type': 'duration',
                        'startDate': start.text,
                        'endDate': end.text
                    }
                if period.find('.//' + XBRLI + 'forever') is not None:
                    ctx.period = {'type': 'forever'}
            # Register the completed context.
            self.contexts[ctx_id] = ctx
    except Exception as e:
        raise XBRLProcessingError(f"Error extracting contexts: {str(e)}") from e
def _extract_units(self, root: ET.Element) -> None:
"""Extract units from instance document."""
try:
# Find all unit elements
for unit_elem in root.findall('.//{http://www.xbrl.org/2003/instance}unit'):
unit_id = unit_elem.get('id')
if not unit_id:
continue
# Check for measure
measure_elem = unit_elem.find('.//{http://www.xbrl.org/2003/instance}measure')
if measure_elem is not None and measure_elem.text:
self.units[unit_id] = {
'type': 'simple',
'measure': measure_elem.text
}
continue
# Check for divide
divide_elem = unit_elem.find('.//{http://www.xbrl.org/2003/instance}divide')
if divide_elem is not None:
# Get numerator
numerator_elem = divide_elem.find('.//{http://www.xbrl.org/2003/instance}unitNumerator')
denominator_elem = divide_elem.find('.//{http://www.xbrl.org/2003/instance}unitDenominator')
if numerator_elem is not None and denominator_elem is not None:
# Get measures
numerator_measures = [elem.text for elem in numerator_elem.findall('.//{http://www.xbrl.org/2003/instance}measure') if elem.text]
denominator_measures = [elem.text for elem in denominator_elem.findall('.//{http://www.xbrl.org/2003/instance}measure') if elem.text]
self.units[unit_id] = {
'type': 'divide',
'numerator': numerator_measures,
'denominator': denominator_measures
}
except Exception as e:
raise XBRLProcessingError(f"Error extracting units: {str(e)}") from e
def _extract_facts(self, root: ET.Element) -> None:
    """Extract facts from instance document.

    A fact is any element carrying a ``contextRef`` attribute. Duplicate
    facts (same element and context) are disambiguated by appending an
    ``instance_id`` suffix to the storage key.
    """
    try:
        # Map namespace URI -> prefix. lxml exposes nsmap natively (fast);
        # otherwise fall back to scanning xmlns declarations on the root.
        if hasattr(root, 'nsmap'):
            prefix_map = {uri: prefix for prefix, uri in root.nsmap.items() if prefix is not None}
        else:
            xmlns_pattern = '{http://www.w3.org/2000/xmlns/}'
            prefix_map = {}
            for attr_name, attr_value in root.attrib.items():
                if attr_name.startswith(xmlns_pattern) or attr_name.startswith('xmlns:'):
                    if attr_name.startswith(xmlns_pattern):
                        prefix = attr_name[len(xmlns_pattern):]
                    else:
                        prefix = attr_name.split(':', 1)[1]
                    prefix_map[attr_value] = prefix
        fact_count = 0
        facts_dict = {}
        base_keys = {}  # base key -> list of seen occurrences (duplicates)
        # Known non-fact local names in namespace-brace form, consistent
        # with count_facts. The brace prefix ensures only exact local names
        # match, so a concept whose name merely ends with e.g. 'unit' can
        # never be wrongly skipped. (These elements carry no contextRef, so
        # this is purely a fast path.)
        skip_tag_endings = {
            '}schemaRef',
            '}roleRef',
            '}arcroleRef',
            '}linkbaseRef',
            '}context',
            '}unit'
        }

        def process_element(element):
            """Process a single element as a potential fact."""
            nonlocal fact_count
            # Skip non-element nodes outright.
            if not ET.iselement(element):
                return
            tag = element.tag
            # Comments and processing instructions have callable (non-str)
            # tags; calling .endswith on them would raise AttributeError.
            if not isinstance(tag, str):
                return
            for ending in skip_tag_endings:
                if tag.endswith(ending):
                    return
            # Only elements with a contextRef attribute are facts.
            context_ref = element.get('contextRef')
            if not context_ref:
                return
            # Fact ID (if present) enables footnote linkage later.
            fact_id = element.get('id')
            # Split the Clark-notation tag into namespace URI and local name.
            if '}' in tag:
                namespace, element_name = tag.split('}', 1)
                namespace = namespace[1:]  # drop leading '{'
                prefix = prefix_map.get(namespace)
                if not prefix:
                    # Fall back to the last path segment of the URI.
                    parts = namespace.split('/')
                    prefix = parts[-1] if parts else ''
            else:
                element_name = tag
                prefix = ''
            element_id = f"{prefix}:{element_name}" if prefix else element_name
            unit_ref = element.get('unitRef')
            # Value is the element text, or the first non-empty child text.
            value = element.text
            if not value or not value.strip():
                for sub_elem in element:
                    sub_text = sub_elem.text
                    if sub_text and sub_text.strip():
                        value = sub_text
                        break
            value = value.strip() if value else ""
            decimals = element.get('decimals')
            # Best-effort numeric conversion; non-numeric facts keep None.
            numeric_value = None
            if value:
                try:
                    numeric_value = float(value)
                except (ValueError, TypeError):
                    pass
            # Duplicate handling: the first duplicate retroactively renames
            # the existing fact to instance_id=0; later ones count upward.
            base_key = self._create_normalized_fact_key(element_id, context_ref)
            instance_id = None
            if base_key in base_keys:
                if base_key in facts_dict:
                    existing_fact = facts_dict[base_key]
                    del facts_dict[base_key]
                    existing_fact.instance_id = 0
                    facts_dict[self._create_normalized_fact_key(element_id, context_ref, 0)] = existing_fact
                instance_id = len(base_keys[base_key])
                base_keys[base_key].append(True)
            else:
                # First occurrence of this element/context combination.
                base_keys[base_key] = [True]
            fact = Fact(
                element_id=element_id,
                context_ref=context_ref,
                value=value,
                unit_ref=unit_ref,
                decimals=decimals,
                numeric_value=numeric_value,
                instance_id=instance_id,
                fact_id=fact_id
            )
            key = self._create_normalized_fact_key(element_id, context_ref, instance_id)
            facts_dict[key] = fact
            fact_count += 1

        # lxml exposes iterchildren/iterdescendants; ElementTree does not.
        if hasattr(root, 'iterchildren'):
            for child in root.iterchildren():
                process_element(child)
                for descendant in child.iterdescendants():
                    process_element(descendant)
        else:
            for child in root:
                process_element(child)
                for descendant in child.findall('.//*'):
                    process_element(descendant)
        self.facts.update(facts_dict)
        log.debug(f"Extracted {fact_count} facts ({len(base_keys)} unique fact identifiers)")
    except Exception as e:
        raise XBRLProcessingError(f"Error extracting facts: {str(e)}") from e
def _extract_footnotes(self, root: ET.Element) -> None:
    """Extract footnotes from instance document.

    Footnotes in XBRL are linked to facts via footnoteLink elements that contain:
    1. footnote elements with the actual text content
    2. footnoteArc elements that connect fact IDs to footnote IDs

    Errors are logged rather than raised: footnotes are optional.
    """
    try:
        from edgar.xbrl.models import Footnote
        LINK = '{http://www.xbrl.org/2003/linkbase}'
        XLINK = '{http://www.w3.org/1999/xlink}'
        # Index facts by fact_id once, so arc processing is O(1) per arc
        # instead of a full scan of self.facts for every arc. First-wins
        # insertion preserves the original break-at-first-match semantics.
        facts_by_id = {}
        for fact in self.facts.values():
            if fact.fact_id and fact.fact_id not in facts_by_id:
                facts_by_id[fact.fact_id] = fact
        for footnote_link in root.findall('.//' + LINK + 'footnoteLink'):
            # First, extract all footnote definitions.
            for footnote_elem in footnote_link.findall(LINK + 'footnote'):
                # Try both 'id' and 'xlink:label' attributes.
                footnote_id = footnote_elem.get('id') or footnote_elem.get(XLINK + 'label')
                if not footnote_id:
                    continue
                lang = footnote_elem.get('{http://www.w3.org/XML/1998/namespace}lang', 'en-US')
                role = footnote_elem.get(XLINK + 'role')
                # Extract text content, handling XHTML formatting.
                footnote_text = ""
                xhtml_divs = footnote_elem.findall('.//{http://www.w3.org/1999/xhtml}div')
                if xhtml_divs:
                    # Concatenate all text within the XHTML divs.
                    for div in xhtml_divs:
                        footnote_text += "".join(div.itertext()).strip()
                else:
                    # Fall back to the element's direct text content.
                    footnote_text = "".join(footnote_elem.itertext()).strip()
                self.footnotes[footnote_id] = Footnote(
                    footnote_id=footnote_id,
                    text=footnote_text,
                    lang=lang,
                    role=role,
                    related_fact_ids=[]
                )
            # Second, process footnoteArc elements to link facts to footnotes.
            for arc_elem in footnote_link.findall(LINK + 'footnoteArc'):
                fact_id = arc_elem.get(XLINK + 'from')
                footnote_id = arc_elem.get(XLINK + 'to')
                if fact_id and footnote_id:
                    # Record the fact on the footnote side.
                    if footnote_id in self.footnotes:
                        self.footnotes[footnote_id].related_fact_ids.append(fact_id)
                    else:
                        log.warning(f"Footnote arc references undefined footnote: {footnote_id}")
                    # Record the footnote on the fact side via the index.
                    fact = facts_by_id.get(fact_id)
                    if fact is not None and footnote_id not in fact.footnotes:
                        fact.footnotes.append(footnote_id)
        log.debug(f"Extracted {len(self.footnotes)} footnotes")
    except Exception as e:
        # Log the error but don't fail - footnotes are optional
        log.warning(f"Error extracting footnotes: {str(e)}")
def _extract_entity_info(self) -> None:
    """Extract entity information from contexts and DEI facts.

    Populates ``self.entity_info`` with registrant name, ticker, CIK,
    document type, fiscal-period metadata and report-type flags, and
    caches all DEI facts in ``self.dei_facts`` keyed by concept name.
    Best-effort: any error is logged and swallowed.
    """
    try:
        # Extract CIK/identifier from the first context's entity.
        # NOTE(review): assumes all contexts share one entity identifier —
        # only the first context is inspected; confirm for multi-entity docs.
        identifier = None
        if self.contexts:
            first = next(iter(self.contexts.values()))
            ident = first.entity.get('identifier')
            if ident and ident.isdigit():
                # CIKs are zero-padded; strip the padding.
                identifier = ident.lstrip('0')
        # Collect all DEI facts into a dict: concept -> Fact.
        # Element IDs may use either 'dei:' (colon) or 'dei_' (underscore)
        # form depending on how keys were normalized upstream.
        self.dei_facts: Dict[str, Fact] = {}
        for fact in self.facts.values():
            eid = fact.element_id
            if eid.startswith('dei:'):
                concept = eid.split(':', 1)[1]
            elif eid.startswith('dei_'):
                concept = eid.split('_', 1)[1]
            else:
                continue
            # Later facts overwrite earlier ones for the same concept.
            self.dei_facts[concept] = fact
        # Helper: value of the first DEI fact found among candidate names.
        def get_dei(*names):
            for n in names:
                f = self.dei_facts.get(n)
                if f:
                    return f.value
            return None
        # Build entity_info preserving existing keys
        self.entity_info.update({
            'entity_name': get_dei('EntityRegistrantName'),
            'ticker': get_dei('TradingSymbol'),
            'identifier': identifier,
            'document_type': get_dei('DocumentType'),
            'reporting_end_date': None,
            'document_period_end_date':get_dei('DocumentPeriodEndDate'),
            'fiscal_year': get_dei('DocumentFiscalYearFocus','FiscalYearFocus','FiscalYear'),
            'fiscal_period': get_dei('DocumentFiscalPeriodFocus','FiscalPeriodFocus'),
            'fiscal_year_end_month': None,
            'fiscal_year_end_day': None,
            'annual_report': False,
            'quarterly_report': False,
            'amendment': False,
        })
        # reporting_end_date = latest instant date across all contexts.
        for ctx in self.contexts.values():
            period = getattr(ctx, 'period', {})
            if period.get('type') == 'instant':
                ds = period.get('instant')
                if ds:
                    try:
                        dt_obj = datetime.strptime(ds, '%Y-%m-%d').date()
                        curr = self.entity_info['reporting_end_date']
                        if curr is None or dt_obj > curr:
                            self.entity_info['reporting_end_date'] = dt_obj
                    except Exception:
                        # Ignore unparseable instant dates.
                        pass
        # Parse fiscal year end (e.g. the gMonthDay form '--12-31')
        # into separate month/day components.
        fye = get_dei('CurrentFiscalYearEndDate','FiscalYearEnd')
        if fye:
            try:
                s = fye
                if s.startswith('--'):
                    # xs:gMonthDay values carry a leading '--'.
                    s = s[2:]
                if '-' in s:
                    m, d = s.split('-', 1)
                    if m.isdigit() and d.isdigit():
                        self.entity_info['fiscal_year_end_month'] = int(m)
                        self.entity_info['fiscal_year_end_day'] = int(d)
            except Exception:
                pass
        # Report-type flags derived from the DEI document type.
        # NOTE(review): exact match means amended filings ('10-K/A') set
        # only the amendment flag, not annual/quarterly — confirm intended.
        dt_val = self.entity_info['document_type'] or ''
        self.entity_info['annual_report'] = (dt_val == '10-K')
        self.entity_info['quarterly_report'] = (dt_val == '10-Q')
        self.entity_info['amendment'] = ('/A' in dt_val)
        log.debug(f"Entity info: {self.entity_info}")
    except Exception as e:
        log.warning(f"Warning: Error extracting entity info: {str(e)}")
def _build_reporting_periods(self) -> None:
    """Build reporting periods from contexts."""
    try:
        # Rebuild from scratch on every call.
        self.reporting_periods.clear()
        self.context_period_map.clear()
        # Group context IDs by their period: instant date or date range.
        instant_periods = {}
        duration_periods = {}
        for context_id, context in self.contexts.items():
            if 'period' not in context.model_dump() or 'type' not in context.period:
                continue
            period_type = context.period.get('type')
            if period_type == 'instant':
                date_str = context.period.get('instant')
                if date_str:
                    instant_periods.setdefault(date_str, []).append(context_id)
                    self.context_period_map[context_id] = f"instant_{date_str}"
            elif period_type == 'duration':
                start_date = context.period.get('startDate')
                end_date = context.period.get('endDate')
                if start_date and end_date:
                    duration_periods.setdefault(f"{start_date}_{end_date}", []).append(context_id)
                    self.context_period_map[context_id] = f"duration_{start_date}_{end_date}"
        # Materialize instant periods with parsed dates and display labels.
        for date_str, context_ids in instant_periods.items():
            try:
                date_obj = datetime.strptime(date_str, '%Y-%m-%d').date()
                self.reporting_periods.append({
                    'type': 'instant',
                    'date': date_str,
                    'date_obj': date_obj,
                    'label': date_obj.strftime('%B %d, %Y'),
                    'context_ids': context_ids,
                    'key': f"instant_{date_str}"
                })
            except (ValueError, TypeError):
                # Skip periods whose dates cannot be parsed.
                continue
        # Materialize duration periods, classified by length in days.
        for period_key, context_ids in duration_periods.items():
            start_date, end_date = period_key.split('_')
            try:
                start_obj = datetime.strptime(start_date, '%Y-%m-%d').date()
                end_obj = datetime.strptime(end_date, '%Y-%m-%d').date()
                days = (end_obj - start_obj).days
                period_description = classify_duration(days)
                self.reporting_periods.append({
                    'type': 'duration',
                    'start_date': start_date,
                    'end_date': end_date,
                    'start_obj': start_obj,
                    'end_obj': end_obj,
                    'days': days,
                    'period_type': period_description,
                    'label': f"{period_description}: {start_obj.strftime('%B %d, %Y')} to {end_obj.strftime('%B %d, %Y')}",
                    'context_ids': context_ids,
                    'key': f"duration_{start_date}_{end_date}"
                })
            except (ValueError, TypeError):
                # Skip periods whose dates cannot be parsed.
                continue
        # Most recent first: instants sort by date, durations by end date.
        self.reporting_periods.sort(
            key=lambda p: p['date_obj'] if p['type'] == 'instant' else p['end_obj'],
            reverse=True)
        # Debug printout to verify periods were extracted.
        if len(self.reporting_periods) > 0:
            log.debug(f"Found {len(self.reporting_periods)} reporting periods.")
            log.debug(f"First period: {self.reporting_periods[0]['label']}")
        else:
            log.debug("Warning: No reporting periods found!")
        log.debug(f"Context period map has {len(self.context_period_map)} entries.")
    except Exception as e:
        # Log error but don't fail
        log.debug(f"Warning: Error building reporting periods: {str(e)}")
        self.reporting_periods.clear()