"""
Instance parser for XBRL documents.

This module handles parsing of XBRL instance documents including facts, contexts,
units, footnotes, and entity information extraction.
"""
|
|
|
|
from datetime import datetime
|
|
from pathlib import Path
|
|
from typing import Any, Dict, List, Union
|
|
|
|
from lxml import etree as ET
|
|
|
|
from edgar.core import log
|
|
from edgar.xbrl.core import NAMESPACES, classify_duration
|
|
from edgar.xbrl.models import Context, Fact, XBRLProcessingError
|
|
|
|
from .base import BaseParser
|
|
|
|
|
|
class InstanceParser(BaseParser):
|
|
"""Parser for XBRL instance documents."""
|
|
|
|
def __init__(self, contexts: Dict[str, Context], facts: Dict[str, Fact],
|
|
units: Dict[str, Any], footnotes: Dict[str, Any],
|
|
calculation_trees: Dict[str, Any], entity_info: Dict[str, Any],
|
|
reporting_periods: List[Dict[str, Any]], context_period_map: Dict[str, str]):
|
|
"""
|
|
Initialize instance parser with data structure references.
|
|
|
|
Args:
|
|
contexts: Reference to contexts dictionary
|
|
facts: Reference to facts dictionary
|
|
units: Reference to units dictionary
|
|
footnotes: Reference to footnotes dictionary
|
|
calculation_trees: Reference to calculation trees dictionary
|
|
entity_info: Reference to entity info dictionary
|
|
reporting_periods: Reference to reporting periods list
|
|
context_period_map: Reference to context period map
|
|
"""
|
|
super().__init__()
|
|
|
|
# Store references to data structures
|
|
self.contexts = contexts
|
|
self.facts = facts
|
|
self.units = units
|
|
self.footnotes = footnotes
|
|
self.calculation_trees = calculation_trees
|
|
self.entity_info = entity_info
|
|
self.reporting_periods = reporting_periods
|
|
self.context_period_map = context_period_map
|
|
|
|
# DEI facts extracted during entity info processing
|
|
self.dei_facts: Dict[str, Fact] = {}
|
|
|
|
def _create_normalized_fact_key(self, element_id: str, context_ref: str, instance_id: int = None) -> str:
|
|
"""
|
|
Create a normalized fact key using underscore format.
|
|
|
|
Args:
|
|
element_id: The element ID
|
|
context_ref: The context reference
|
|
instance_id: Optional instance ID for duplicate facts
|
|
|
|
Returns:
|
|
Normalized key in format: element_id_context_ref[_instance_id]
|
|
"""
|
|
normalized_element_id = element_id
|
|
if ':' in element_id:
|
|
prefix, name = element_id.split(':', 1)
|
|
normalized_element_id = f"{prefix}_{name}"
|
|
if instance_id is not None:
|
|
return f"{normalized_element_id}_{context_ref}_{instance_id}"
|
|
return f"{normalized_element_id}_{context_ref}"
|
|
|
|
def parse_instance(self, file_path: Union[str, Path]) -> None:
|
|
"""Parse instance document file and extract contexts, facts, and units."""
|
|
try:
|
|
content = Path(file_path).read_text()
|
|
self.parse_instance_content(content)
|
|
except Exception as e:
|
|
raise XBRLProcessingError(f"Error parsing instance file {file_path}: {str(e)}") from e
|
|
|
|
def parse_instance_content(self, content: str) -> None:
|
|
"""Parse instance document content and extract contexts, facts, and units."""
|
|
try:
|
|
# Use lxml's optimized parser with smart string handling and recovery mode
|
|
parser = ET.XMLParser(remove_blank_text=True, recover=True, huge_tree=True)
|
|
|
|
# Convert to bytes for faster parsing if not already
|
|
if isinstance(content, str):
|
|
content_bytes = content.encode('utf-8')
|
|
else:
|
|
content_bytes = content
|
|
|
|
# Parse content with optimized settings
|
|
root = ET.XML(content_bytes, parser)
|
|
|
|
# Extract data in optimal order (contexts first, then units, then facts)
|
|
# This ensures dependencies are resolved before they're needed
|
|
self._extract_contexts(root)
|
|
self._extract_units(root)
|
|
self._extract_facts(root)
|
|
self._extract_footnotes(root)
|
|
|
|
# Post-processing steps after all raw data is extracted
|
|
self._extract_entity_info()
|
|
self._build_reporting_periods()
|
|
|
|
except Exception as e:
|
|
raise XBRLProcessingError(f"Error parsing instance content: {str(e)}") from e
|
|
|
|
def count_facts(self, content: str) -> tuple:
|
|
"""Count the number of facts in the instance document
|
|
This function counts both unique facts and total fact instances in the XBRL document.
|
|
|
|
Returns:
|
|
tuple: (unique_facts_count, total_fact_instances)
|
|
"""
|
|
|
|
# Use lxml's optimized parser with smart string handling and recovery mode
|
|
parser = ET.XMLParser(remove_blank_text=True, recover=True, huge_tree=True)
|
|
|
|
# Convert to bytes for faster parsing if not already
|
|
if isinstance(content, str):
|
|
content_bytes = content.encode('utf-8')
|
|
else:
|
|
content_bytes = content
|
|
|
|
# Parse content with optimized settings
|
|
root = ET.XML(content_bytes, parser)
|
|
|
|
# Fast path to identify non-fact elements to skip
|
|
skip_tag_endings = {'}context', '}unit', '}schemaRef'}
|
|
|
|
# Track both total instances and unique facts
|
|
total_fact_instances = 0 # Total number of fact references in the document
|
|
unique_facts = set() # Set of unique element_id + context_ref combinations
|
|
create_key = self._create_normalized_fact_key
|
|
|
|
# Define counting function
|
|
def count_element(element):
|
|
"""Process a single element as a potential fact."""
|
|
nonlocal total_fact_instances
|
|
|
|
# Skip known non-fact elements
|
|
tag = element.tag
|
|
for ending in skip_tag_endings:
|
|
if tag.endswith(ending):
|
|
return
|
|
|
|
# Get context reference - key check to identify facts
|
|
context_ref = element.get('contextRef')
|
|
if context_ref is None:
|
|
return
|
|
|
|
# Extract element namespace and name - optimized split
|
|
if '}' in tag:
|
|
namespace, element_name = tag.split('}', 1)
|
|
namespace = namespace[1:] # Faster than strip('{')
|
|
else:
|
|
element_name = tag
|
|
namespace = None
|
|
|
|
# Get namespace prefix - cached for performance
|
|
prefix = None
|
|
for std_prefix, std_uri_base in NAMESPACES.items():
|
|
if namespace.startswith(std_uri_base):
|
|
prefix = std_prefix
|
|
break
|
|
|
|
if not prefix and namespace:
|
|
# Try to extract prefix from the namespace
|
|
parts = namespace.split('/')
|
|
prefix = parts[-1] if parts else ''
|
|
|
|
# Construct element ID with optimized string concatenation
|
|
if prefix:
|
|
element_id = f"{prefix}:{element_name}" if prefix else element_name
|
|
else:
|
|
element_id = element_name
|
|
|
|
# Create a normalized key using underscore format for consistency
|
|
normalized_key = create_key(element_id, context_ref)
|
|
|
|
# Track unique facts
|
|
unique_facts.add(normalized_key)
|
|
|
|
# Increment total instances count
|
|
total_fact_instances += 1
|
|
|
|
# Optimize traversal using lxml's iterchildren and iterdescendants if available
|
|
if hasattr(root, 'iterchildren'):
|
|
# Use lxml's optimized traversal methods
|
|
for child in root.iterchildren():
|
|
count_element(child)
|
|
# Process nested elements with optimized iteration
|
|
for descendant in child.iterdescendants():
|
|
count_element(descendant)
|
|
else:
|
|
# Fallback for ElementTree
|
|
for child in root:
|
|
count_element(child)
|
|
for descendant in child.findall('.//*'):
|
|
count_element(descendant)
|
|
|
|
# Return tuple of counts (unique_facts_count, total_fact_instances)
|
|
return len(unique_facts), total_fact_instances
|
|
|
|
def _extract_contexts(self, root: ET.Element) -> None:
|
|
"""Extract contexts from instance document."""
|
|
try:
|
|
# Find all context elements
|
|
for context_elem in root.findall('.//{http://www.xbrl.org/2003/instance}context'):
|
|
context_id = context_elem.get('id')
|
|
if not context_id:
|
|
continue
|
|
|
|
# Create context object
|
|
context = Context(context_id=context_id)
|
|
|
|
# Extract entity information
|
|
entity_elem = context_elem.find('.//{http://www.xbrl.org/2003/instance}entity')
|
|
if entity_elem is not None:
|
|
# Get identifier
|
|
identifier_elem = entity_elem.find('.//{http://www.xbrl.org/2003/instance}identifier')
|
|
if identifier_elem is not None:
|
|
scheme = identifier_elem.get('scheme', '')
|
|
identifier = identifier_elem.text
|
|
context.entity = {
|
|
'scheme': scheme,
|
|
'identifier': identifier
|
|
}
|
|
|
|
# Get segment dimensions if present
|
|
segment_elem = entity_elem.find('.//{http://www.xbrl.org/2003/instance}segment')
|
|
if segment_elem is not None:
|
|
# Extract explicit dimensions
|
|
for dim_elem in segment_elem.findall('.//{http://xbrl.org/2006/xbrldi}explicitMember'):
|
|
dimension = dim_elem.get('dimension')
|
|
value = dim_elem.text
|
|
if dimension and value:
|
|
context.dimensions[dimension] = value
|
|
|
|
# Extract typed dimensions
|
|
for dim_elem in segment_elem.findall('.//{http://xbrl.org/2006/xbrldi}typedMember'):
|
|
dimension = dim_elem.get('dimension')
|
|
if dimension:
|
|
# The typed dimension value is the text content of the first child element
|
|
for child in dim_elem:
|
|
# Extract the text content, which contains the actual typed member value
|
|
if child.text and child.text.strip():
|
|
context.dimensions[dimension] = child.text.strip()
|
|
else:
|
|
# Fallback to tag if no text content
|
|
context.dimensions[dimension] = child.tag
|
|
break
|
|
|
|
# Extract period information
|
|
period_elem = context_elem.find('.//{http://www.xbrl.org/2003/instance}period')
|
|
if period_elem is not None:
|
|
# Check for instant period
|
|
instant_elem = period_elem.find('.//{http://www.xbrl.org/2003/instance}instant')
|
|
if instant_elem is not None and instant_elem.text:
|
|
context.period = {
|
|
'type': 'instant',
|
|
'instant': instant_elem.text
|
|
}
|
|
|
|
# Check for duration period
|
|
start_elem = period_elem.find('.//{http://www.xbrl.org/2003/instance}startDate')
|
|
end_elem = period_elem.find('.//{http://www.xbrl.org/2003/instance}endDate')
|
|
if start_elem is not None and end_elem is not None and start_elem.text and end_elem.text:
|
|
context.period = {
|
|
'type': 'duration',
|
|
'startDate': start_elem.text,
|
|
'endDate': end_elem.text
|
|
}
|
|
|
|
# Check for forever period
|
|
forever_elem = period_elem.find('.//{http://www.xbrl.org/2003/instance}forever')
|
|
if forever_elem is not None:
|
|
context.period = {
|
|
'type': 'forever'
|
|
}
|
|
|
|
# Add context to registry
|
|
self.contexts[context_id] = context
|
|
|
|
except Exception as e:
|
|
raise XBRLProcessingError(f"Error extracting contexts: {str(e)}") from e
|
|
|
|
def _extract_units(self, root: ET.Element) -> None:
|
|
"""Extract units from instance document."""
|
|
try:
|
|
# Find all unit elements
|
|
for unit_elem in root.findall('.//{http://www.xbrl.org/2003/instance}unit'):
|
|
unit_id = unit_elem.get('id')
|
|
if not unit_id:
|
|
continue
|
|
|
|
# Check for measure
|
|
measure_elem = unit_elem.find('.//{http://www.xbrl.org/2003/instance}measure')
|
|
if measure_elem is not None and measure_elem.text:
|
|
self.units[unit_id] = {
|
|
'type': 'simple',
|
|
'measure': measure_elem.text
|
|
}
|
|
continue
|
|
|
|
# Check for divide
|
|
divide_elem = unit_elem.find('.//{http://www.xbrl.org/2003/instance}divide')
|
|
if divide_elem is not None:
|
|
# Get numerator
|
|
numerator_elem = divide_elem.find('.//{http://www.xbrl.org/2003/instance}unitNumerator')
|
|
denominator_elem = divide_elem.find('.//{http://www.xbrl.org/2003/instance}unitDenominator')
|
|
|
|
if numerator_elem is not None and denominator_elem is not None:
|
|
# Get measures
|
|
numerator_measures = [elem.text for elem in numerator_elem.findall('.//{http://www.xbrl.org/2003/instance}measure') if elem.text]
|
|
denominator_measures = [elem.text for elem in denominator_elem.findall('.//{http://www.xbrl.org/2003/instance}measure') if elem.text]
|
|
|
|
self.units[unit_id] = {
|
|
'type': 'divide',
|
|
'numerator': numerator_measures,
|
|
'denominator': denominator_measures
|
|
}
|
|
|
|
except Exception as e:
|
|
raise XBRLProcessingError(f"Error extracting units: {str(e)}") from e
|
|
|
|
def _extract_facts(self, root: ET.Element) -> None:
|
|
"""Extract facts from instance document."""
|
|
try:
|
|
# Get direct access to nsmap if using lxml (much faster than regex extraction)
|
|
if hasattr(root, 'nsmap'):
|
|
# Leverage lxml's native nsmap functionality
|
|
prefix_map = {uri: prefix for prefix, uri in root.nsmap.items() if prefix is not None}
|
|
else:
|
|
# Fallback for ElementTree - precompile regex patterns for namespace extraction
|
|
xmlns_pattern = '{http://www.w3.org/2000/xmlns/}'
|
|
prefix_map = {}
|
|
|
|
# Extract namespace declarations from root
|
|
for attr_name, attr_value in root.attrib.items():
|
|
if attr_name.startswith(xmlns_pattern) or attr_name.startswith('xmlns:'):
|
|
# Extract the prefix more efficiently
|
|
if attr_name.startswith(xmlns_pattern):
|
|
prefix = attr_name[len(xmlns_pattern):]
|
|
else:
|
|
prefix = attr_name.split(':', 1)[1]
|
|
prefix_map[attr_value] = prefix
|
|
|
|
# Initialize counters and tracking
|
|
fact_count = 0
|
|
facts_dict = {}
|
|
base_keys = {}
|
|
|
|
# Fast path to identify non-fact elements to skip - compile as set for O(1) lookup
|
|
skip_tag_endings = {
|
|
'schemaRef',
|
|
'roleRef',
|
|
'arcroleRef',
|
|
'linkbaseRef',
|
|
'context',
|
|
'unit'
|
|
}
|
|
|
|
def process_element(element):
|
|
"""Process a single element as a potential fact."""
|
|
nonlocal fact_count
|
|
|
|
# Skip annotation nodes and other non element nodes
|
|
if not ET.iselement(element):
|
|
return
|
|
# Skip known non-fact elements - faster check with set membership
|
|
# If the tag is not a string, try calling () to get the string value (in rare cases)
|
|
if callable(element.tag):
|
|
if isinstance(element, ET._Comment):
|
|
return
|
|
if not element.values():
|
|
return
|
|
tag = element.tag
|
|
for ending in skip_tag_endings:
|
|
if tag.endswith(ending):
|
|
return
|
|
|
|
# Get context reference - key check to identify facts
|
|
context_ref = element.get('contextRef')
|
|
if not context_ref:
|
|
return
|
|
|
|
# Get fact ID if present (for footnote linkage)
|
|
fact_id = element.get('id')
|
|
|
|
# Extract element namespace and name - optimized split
|
|
if '}' in tag:
|
|
namespace, element_name = tag.split('}', 1)
|
|
namespace = namespace[1:] # Faster than strip('{')
|
|
|
|
# Try to extract prefix from the namespace
|
|
prefix = prefix_map.get(namespace)
|
|
if not prefix:
|
|
parts = namespace.split('/')
|
|
prefix = parts[-1] if parts else ''
|
|
else:
|
|
element_name = tag
|
|
prefix = ''
|
|
|
|
# Construct element ID with optimized string concatenation
|
|
element_id = f"{prefix}:{element_name}" if prefix else element_name
|
|
|
|
# Get unit reference
|
|
unit_ref = element.get('unitRef')
|
|
|
|
# Get value - optimize string handling
|
|
value = element.text
|
|
if not value or not value.strip():
|
|
# Only check children if text is empty - use direct iteration for speed
|
|
for sub_elem in element:
|
|
sub_text = sub_elem.text
|
|
if sub_text and sub_text.strip():
|
|
value = sub_text
|
|
break
|
|
|
|
# Optimize string handling - inline conditional
|
|
value = value.strip() if value else ""
|
|
|
|
# Get decimals attribute - direct access
|
|
decimals = element.get('decimals')
|
|
|
|
# Optimize numeric conversion with faster try/except
|
|
numeric_value = None
|
|
if value:
|
|
try:
|
|
numeric_value = float(value)
|
|
except (ValueError, TypeError):
|
|
pass
|
|
|
|
# Create base key for duplicate detection
|
|
base_key = self._create_normalized_fact_key(element_id, context_ref)
|
|
|
|
# Handle duplicates
|
|
instance_id = None
|
|
if base_key in base_keys:
|
|
# This is a duplicate - convert existing fact to use instance_id if needed
|
|
if base_key in facts_dict:
|
|
existing_fact = facts_dict[base_key]
|
|
# Move existing fact to new key with instance_id=0
|
|
del facts_dict[base_key]
|
|
existing_fact.instance_id = 0
|
|
facts_dict[self._create_normalized_fact_key(element_id, context_ref, 0)] = existing_fact
|
|
# Add new fact with next instance_id
|
|
instance_id = len(base_keys[base_key])
|
|
base_keys[base_key].append(True)
|
|
else:
|
|
# First instance of this fact
|
|
base_keys[base_key] = [True]
|
|
|
|
# Create fact object
|
|
fact = Fact(
|
|
element_id=element_id,
|
|
context_ref=context_ref,
|
|
value=value,
|
|
unit_ref=unit_ref,
|
|
decimals=decimals,
|
|
numeric_value=numeric_value,
|
|
instance_id=instance_id,
|
|
fact_id=fact_id
|
|
)
|
|
|
|
# Store fact with appropriate key
|
|
key = self._create_normalized_fact_key(element_id, context_ref, instance_id)
|
|
facts_dict[key] = fact
|
|
fact_count += 1
|
|
|
|
# Use lxml's optimized traversal methods
|
|
if hasattr(root, 'iterchildren'):
|
|
# Use lxml's optimized traversal methods
|
|
for child in root.iterchildren():
|
|
process_element(child)
|
|
# Process nested elements with optimized iteration
|
|
for descendant in child.iterdescendants():
|
|
process_element(descendant)
|
|
else:
|
|
# Fallback for ElementTree
|
|
for child in root:
|
|
process_element(child)
|
|
for descendant in child.findall('.//*'):
|
|
process_element(descendant)
|
|
|
|
# Update instance facts
|
|
self.facts.update(facts_dict)
|
|
|
|
log.debug(f"Extracted {fact_count} facts ({len(base_keys)} unique fact identifiers)")
|
|
|
|
except Exception as e:
|
|
raise XBRLProcessingError(f"Error extracting facts: {str(e)}") from e
|
|
|
|
def _extract_footnotes(self, root: ET.Element) -> None:
|
|
"""Extract footnotes from instance document.
|
|
|
|
Footnotes in XBRL are linked to facts via footnoteLink elements that contain:
|
|
1. footnote elements with the actual text content
|
|
2. footnoteArc elements that connect fact IDs to footnote IDs
|
|
"""
|
|
try:
|
|
from edgar.xbrl.models import Footnote
|
|
|
|
# Find all footnoteLink elements
|
|
for footnote_link in root.findall('.//{http://www.xbrl.org/2003/linkbase}footnoteLink'):
|
|
# First, extract all footnote definitions
|
|
for footnote_elem in footnote_link.findall('{http://www.xbrl.org/2003/linkbase}footnote'):
|
|
# Try both 'id' and 'xlink:label' attributes
|
|
footnote_id = footnote_elem.get('id') or footnote_elem.get('{http://www.w3.org/1999/xlink}label')
|
|
if not footnote_id:
|
|
continue
|
|
|
|
# Get footnote attributes
|
|
lang = footnote_elem.get('{http://www.w3.org/XML/1998/namespace}lang', 'en-US')
|
|
role = footnote_elem.get('{http://www.w3.org/1999/xlink}role')
|
|
|
|
# Extract text content, handling XHTML formatting
|
|
footnote_text = ""
|
|
# Check for XHTML content
|
|
xhtml_divs = footnote_elem.findall('.//{http://www.w3.org/1999/xhtml}div')
|
|
if xhtml_divs:
|
|
# Concatenate all text within XHTML elements
|
|
for div in xhtml_divs:
|
|
footnote_text += "".join(div.itertext()).strip()
|
|
else:
|
|
# Fall back to direct text content
|
|
footnote_text = "".join(footnote_elem.itertext()).strip()
|
|
|
|
# Create Footnote object
|
|
footnote = Footnote(
|
|
footnote_id=footnote_id,
|
|
text=footnote_text,
|
|
lang=lang,
|
|
role=role,
|
|
related_fact_ids=[]
|
|
)
|
|
self.footnotes[footnote_id] = footnote
|
|
|
|
# Second, process footnoteArc elements to link facts to footnotes
|
|
for arc_elem in footnote_link.findall('{http://www.xbrl.org/2003/linkbase}footnoteArc'):
|
|
fact_id = arc_elem.get('{http://www.w3.org/1999/xlink}from')
|
|
footnote_id = arc_elem.get('{http://www.w3.org/1999/xlink}to')
|
|
|
|
if fact_id and footnote_id:
|
|
# Add fact ID to footnote's related facts
|
|
if footnote_id in self.footnotes:
|
|
self.footnotes[footnote_id].related_fact_ids.append(fact_id)
|
|
else:
|
|
log.warning(f"Footnote arc references undefined footnote: {footnote_id}")
|
|
|
|
# Also update the fact's footnotes list if we can find it
|
|
# This requires finding the fact by its fact_id
|
|
for fact in self.facts.values():
|
|
if fact.fact_id == fact_id:
|
|
if footnote_id not in fact.footnotes:
|
|
fact.footnotes.append(footnote_id)
|
|
break
|
|
|
|
log.debug(f"Extracted {len(self.footnotes)} footnotes")
|
|
|
|
except Exception as e:
|
|
# Log the error but don't fail - footnotes are optional
|
|
log.warning(f"Error extracting footnotes: {str(e)}")
|
|
|
|
def _extract_entity_info(self) -> None:
|
|
"""Extract entity information from contexts and DEI facts."""
|
|
try:
|
|
# Extract CIK/identifier from first context
|
|
identifier = None
|
|
if self.contexts:
|
|
first = next(iter(self.contexts.values()))
|
|
ident = first.entity.get('identifier')
|
|
if ident and ident.isdigit():
|
|
identifier = ident.lstrip('0')
|
|
|
|
# Collect all DEI facts into a dict: concept -> Fact
|
|
self.dei_facts: Dict[str, Fact] = {}
|
|
for fact in self.facts.values():
|
|
eid = fact.element_id
|
|
if eid.startswith('dei:'):
|
|
concept = eid.split(':', 1)[1]
|
|
elif eid.startswith('dei_'):
|
|
concept = eid.split('_', 1)[1]
|
|
else:
|
|
continue
|
|
self.dei_facts[concept] = fact
|
|
|
|
# Helper: get the first available DEI fact value
|
|
def get_dei(*names):
|
|
for n in names:
|
|
f = self.dei_facts.get(n)
|
|
if f:
|
|
return f.value
|
|
return None
|
|
|
|
# Build entity_info preserving existing keys
|
|
self.entity_info.update({
|
|
'entity_name': get_dei('EntityRegistrantName'),
|
|
'ticker': get_dei('TradingSymbol'),
|
|
'identifier': identifier,
|
|
'document_type': get_dei('DocumentType'),
|
|
'reporting_end_date': None,
|
|
'document_period_end_date':get_dei('DocumentPeriodEndDate'),
|
|
'fiscal_year': get_dei('DocumentFiscalYearFocus','FiscalYearFocus','FiscalYear'),
|
|
'fiscal_period': get_dei('DocumentFiscalPeriodFocus','FiscalPeriodFocus'),
|
|
'fiscal_year_end_month': None,
|
|
'fiscal_year_end_day': None,
|
|
'annual_report': False,
|
|
'quarterly_report': False,
|
|
'amendment': False,
|
|
})
|
|
|
|
# Determine reporting_end_date from contexts
|
|
for ctx in self.contexts.values():
|
|
period = getattr(ctx, 'period', {})
|
|
if period.get('type') == 'instant':
|
|
ds = period.get('instant')
|
|
if ds:
|
|
try:
|
|
dt_obj = datetime.strptime(ds, '%Y-%m-%d').date()
|
|
curr = self.entity_info['reporting_end_date']
|
|
if curr is None or dt_obj > curr:
|
|
self.entity_info['reporting_end_date'] = dt_obj
|
|
except Exception:
|
|
pass
|
|
|
|
# Parse fiscal year end date into month/day
|
|
fye = get_dei('CurrentFiscalYearEndDate','FiscalYearEnd')
|
|
if fye:
|
|
try:
|
|
s = fye
|
|
if s.startswith('--'):
|
|
s = s[2:]
|
|
if '-' in s:
|
|
m, d = s.split('-', 1)
|
|
if m.isdigit() and d.isdigit():
|
|
self.entity_info['fiscal_year_end_month'] = int(m)
|
|
self.entity_info['fiscal_year_end_day'] = int(d)
|
|
except Exception:
|
|
pass
|
|
|
|
# Flags based on document_type
|
|
dt_val = self.entity_info['document_type'] or ''
|
|
self.entity_info['annual_report'] = (dt_val == '10-K')
|
|
self.entity_info['quarterly_report'] = (dt_val == '10-Q')
|
|
self.entity_info['amendment'] = ('/A' in dt_val)
|
|
|
|
log.debug(f"Entity info: {self.entity_info}")
|
|
except Exception as e:
|
|
log.warning(f"Warning: Error extracting entity info: {str(e)}")
|
|
|
|
def _build_reporting_periods(self) -> None:
|
|
"""Build reporting periods from contexts."""
|
|
try:
|
|
# Clear existing periods
|
|
self.reporting_periods.clear()
|
|
self.context_period_map.clear()
|
|
|
|
# Collect unique periods from contexts
|
|
instant_periods = {}
|
|
duration_periods = {}
|
|
|
|
for context_id, context in self.contexts.items():
|
|
if 'period' in context.model_dump() and 'type' in context.period:
|
|
period_type = context.period.get('type')
|
|
|
|
if period_type == 'instant':
|
|
date_str = context.period.get('instant')
|
|
if date_str:
|
|
if date_str not in instant_periods:
|
|
instant_periods[date_str] = []
|
|
|
|
# Add context ID to this period
|
|
instant_periods[date_str].append(context_id)
|
|
|
|
# Map context to period key
|
|
period_key = f"instant_{date_str}"
|
|
self.context_period_map[context_id] = period_key
|
|
|
|
elif period_type == 'duration':
|
|
start_date = context.period.get('startDate')
|
|
end_date = context.period.get('endDate')
|
|
if start_date and end_date:
|
|
duration_key = f"{start_date}_{end_date}"
|
|
if duration_key not in duration_periods:
|
|
duration_periods[duration_key] = []
|
|
|
|
# Add context ID to this period
|
|
duration_periods[duration_key].append(context_id)
|
|
|
|
# Map context to period key
|
|
period_key = f"duration_{start_date}_{end_date}"
|
|
self.context_period_map[context_id] = period_key
|
|
|
|
# Process instant periods
|
|
for date_str, context_ids in instant_periods.items():
|
|
try:
|
|
date_obj = datetime.strptime(date_str, '%Y-%m-%d').date()
|
|
formatted_date = date_obj.strftime('%B %d, %Y')
|
|
|
|
period = {
|
|
'type': 'instant',
|
|
'date': date_str,
|
|
'date_obj': date_obj,
|
|
'label': formatted_date,
|
|
'context_ids': context_ids,
|
|
'key': f"instant_{date_str}"
|
|
}
|
|
self.reporting_periods.append(period)
|
|
except (ValueError, TypeError):
|
|
# Skip invalid dates
|
|
continue
|
|
|
|
# Process duration periods
|
|
for period_key, context_ids in duration_periods.items():
|
|
start_date, end_date = period_key.split('_')
|
|
try:
|
|
start_obj = datetime.strptime(start_date, '%Y-%m-%d').date()
|
|
end_obj = datetime.strptime(end_date, '%Y-%m-%d').date()
|
|
formatted_start = start_obj.strftime('%B %d, %Y')
|
|
formatted_end = end_obj.strftime('%B %d, %Y')
|
|
|
|
# Calculate duration in days
|
|
days = (end_obj - start_obj).days
|
|
|
|
# Determine period type based on duration
|
|
period_description = classify_duration(days)
|
|
|
|
period = {
|
|
'type': 'duration',
|
|
'start_date': start_date,
|
|
'end_date': end_date,
|
|
'start_obj': start_obj,
|
|
'end_obj': end_obj,
|
|
'days': days,
|
|
'period_type': period_description,
|
|
'label': f"{period_description}: {formatted_start} to {formatted_end}",
|
|
'context_ids': context_ids,
|
|
'key': f"duration_{start_date}_{end_date}"
|
|
}
|
|
self.reporting_periods.append(period)
|
|
except (ValueError, TypeError):
|
|
# Skip invalid dates
|
|
continue
|
|
|
|
# Sort periods by date (most recent first)
|
|
self.reporting_periods.sort(key=lambda p: p['date_obj'] if p['type'] == 'instant' else p['end_obj'], reverse=True)
|
|
|
|
# Debug printout to verify periods are extracted
|
|
if len(self.reporting_periods) > 0:
|
|
log.debug(f"Found {len(self.reporting_periods)} reporting periods.")
|
|
log.debug(f"First period: {self.reporting_periods[0]['label']}")
|
|
else:
|
|
log.debug("Warning: No reporting periods found!")
|
|
|
|
# Debug context period map
|
|
log.debug(f"Context period map has {len(self.context_period_map)} entries.")
|
|
|
|
except Exception as e:
|
|
# Log error but don't fail
|
|
log.debug(f"Warning: Error building reporting periods: {str(e)}")
|
|
self.reporting_periods.clear()
|