345 lines
12 KiB
Python
345 lines
12 KiB
Python
"""
|
|
XBRL extraction strategy for inline XBRL documents.
|
|
"""
|
|
|
|
from typing import Dict, Any, Optional
|
|
|
|
from lxml.html import HtmlElement
|
|
|
|
from edgar.documents.types import XBRLFact
|
|
|
|
|
|
class XBRLExtractor:
    """
    Extracts XBRL facts from inline XBRL (iXBRL) documents.

    Handles:
    - ix:nonFraction, ix:nonNumeric, ix:fraction, ix:footnote elements
    - Context and unit resolution
    - Continuation handling
    - Transformation rules

    Context and unit definitions are collected lazily, from the document that
    owns the first ix: element passed to ``extract_context``, and are then
    cached on the instance for all later calls.
    """

    # XBRL namespaces
    NAMESPACES = {
        'ix': 'http://www.xbrl.org/2013/inlineXBRL',
        'xbrli': 'http://www.xbrl.org/2003/instance',
        'xbrldi': 'http://xbrl.org/2006/xbrldi',
        'xsi': 'http://www.w3.org/2001/XMLSchema-instance'
    }

    # Common transformation formats, keyed by the element's ``format``
    # attribute. Each callable receives the already-stripped display text.
    # Both numeric transforms yield a plain decimal string with no grouping
    # separators, so their results are directly comparable.
    TRANSFORMATIONS = {
        'ixt:numdotdecimal': lambda x: x.replace(',', ''),
        'ixt:num-dot-decimal': lambda x: x.replace(',', ''),
        'ixt:numcommadecimal': lambda x: x.replace('.', '').replace(',', '.'),
        'ixt:num-comma-decimal': lambda x: x.replace('.', '').replace(',', '.'),
        # A lone hyphen, en dash or em dash stands for zero.
        'ixt:zerodash': lambda x: '0' if x in ('-', '\u2013', '\u2014') else x,
        'ixt:datedoteu': lambda x: x.replace('.', '-'),
        'ixt:datedotus': lambda x: x.replace('.', '/'),
    }

    # Lower-cased measure local names mapped to their display form.
    _UNIT_ALIASES = {
        'usd': 'USD',
        'shares': 'shares',
        'pure': 'pure',
        'percent': '%'
    }

    def __init__(self):
        """Initialize XBRL extractor."""
        # Context id -> parsed context data (entity, period, dimensions).
        self.contexts: Dict[str, Dict[str, Any]] = {}
        # Unit id -> normalized unit label.
        self.units: Dict[str, str] = {}
        # Continuation id -> metadata dict recorded by _extract_continuation.
        self.continuations: Dict[str, Dict[str, Any]] = {}
        self._initialized = False

    @staticmethod
    def _get_attribute(element, name: str) -> Optional[str]:
        """
        Read an attribute, falling back to its all-lowercase spelling.

        Every extractor uses this so that camelCase attributes such as
        ``contextRef`` are found whether or not the parser preserved case
        (presumably HTML parsing lowercases attribute names).
        An empty attribute value is treated like a missing one.
        """
        return element.get(name) or element.get(name.lower())

    def extract_context(self, element: HtmlElement) -> Optional[Dict[str, Any]]:
        """
        Extract XBRL context from element.

        Args:
            element: HTML element that might contain XBRL

        Returns:
            XBRL metadata if found; None when the element is not an ix: tag
            or is an ix: tag of an unsupported kind
        """
        # Check if element is an ix: tag
        if not self._is_xbrl_element(element):
            return None

        # Initialize context if needed (once per extractor instance)
        if not self._initialized:
            self._initialize_context(element)

        # Dispatch on the lower-cased local tag name
        handlers = {
            'nonfraction': self._extract_nonfraction,
            'nonnumeric': self._extract_nonnumeric,
            'continuation': self._extract_continuation,
            'footnote': self._extract_footnote,
            'fraction': self._extract_fraction,
        }
        handler = handlers.get(self._get_local_name(element.tag))
        if handler is None:
            return None
        return handler(element)

    def extract_fact(self, element: HtmlElement) -> Optional[XBRLFact]:
        """
        Extract XBRL fact from element.

        Returns None when ``extract_context`` yields no metadata (including an
        empty dict). Otherwise builds an XBRLFact from the metadata and the
        transformed value, then attaches the resolved context and unit when
        their ids are known.
        """
        context = self.extract_context(element)
        if not context:
            return None

        # Get fact value
        value = self._get_fact_value(element)

        # Create fact
        fact = XBRLFact(
            concept=context.get('name', ''),
            value=value,
            context_ref=context.get('contextRef'),
            unit_ref=context.get('unitRef'),
            decimals=context.get('decimals'),
            scale=context.get('scale'),
            format=context.get('format'),
            sign=context.get('sign')
        )

        # Resolve references
        if fact.context_ref and fact.context_ref in self.contexts:
            fact.context = self.contexts[fact.context_ref]

        if fact.unit_ref and fact.unit_ref in self.units:
            fact.unit = self.units[fact.unit_ref]

        return fact

    def _is_xbrl_element(self, element: HtmlElement) -> bool:
        """Check if element is an XBRL element."""
        tag = element.tag
        # Non-string tags (e.g. comments / processing instructions) never match
        if not isinstance(tag, str):
            return False

        # Handle both namespaced ({uri}local) and literal "ix:" prefixed tags;
        # the prefix comparison is case-insensitive.
        return (
            tag.startswith('{' + self.NAMESPACES['ix'] + '}') or
            tag.lower().startswith('ix:')
        )

    def _get_local_name(self, tag: str) -> str:
        """Get lower-cased local name from a ``{uri}name`` or ``prefix:name`` tag."""
        if '}' in tag:
            return tag.split('}')[1].lower()
        if ':' in tag:
            return tag.split(':')[1].lower()
        return tag.lower()

    def _initialize_context(self, element: HtmlElement):
        """Initialize context and unit information from document."""
        # Find root element of the tree that owns ``element``
        root = element.getroottree().getroot()

        # Extract contexts
        self._extract_contexts(root)

        # Extract units
        self._extract_units(root)

        self._initialized = True

    def _extract_contexts(self, root: HtmlElement):
        """
        Extract all context definitions into ``self.contexts``.

        Each entry may carry: ``entity``/``scheme``, either ``instant`` or
        ``start_date``/``end_date`` together with ``period_type``, and
        ``dimensions`` (explicit members only).
        """
        ns = self.NAMESPACES

        # Look for xbrli:context elements
        # NOTE(review): this only matches elements bound to the xbrli namespace
        # URI - confirm the upstream parser preserves namespaces.
        for context in root.xpath('//xbrli:context', namespaces=ns):
            context_id = context.get('id')
            if not context_id:
                continue

            context_data = {
                'id': context_id
            }

            # Extract entity
            entity = context.find('.//xbrli:entity', namespaces=ns)
            if entity is not None:
                identifier = entity.find('.//xbrli:identifier', namespaces=ns)
                if identifier is not None:
                    context_data['entity'] = identifier.text
                    context_data['scheme'] = identifier.get('scheme')

            # Extract period: an instant takes precedence over a duration
            period = context.find('.//xbrli:period', namespaces=ns)
            if period is not None:
                instant = period.find('.//xbrli:instant', namespaces=ns)
                if instant is not None:
                    context_data['instant'] = instant.text
                    context_data['period_type'] = 'instant'
                else:
                    start = period.find('.//xbrli:startDate', namespaces=ns)
                    end = period.find('.//xbrli:endDate', namespaces=ns)
                    if start is not None and end is not None:
                        context_data['start_date'] = start.text
                        context_data['end_date'] = end.text
                        context_data['period_type'] = 'duration'

            # Extract dimensions
            segment = context.find('.//xbrli:segment', namespaces=ns)
            if segment is not None:
                dimensions = {}
                for member in segment.findall('.//xbrldi:explicitMember', namespaces=ns):
                    dim = member.get('dimension')
                    if dim:
                        dimensions[dim] = member.text
                if dimensions:
                    context_data['dimensions'] = dimensions

            self.contexts[context_id] = context_data

    def _extract_units(self, root: HtmlElement):
        """
        Extract all unit definitions into ``self.units``.

        Ratio units (``xbrli:divide``) are stored as ``"numerator/denominator"``;
        simple units are stored as their normalized measure.
        """
        ns = self.NAMESPACES

        # Look for xbrli:unit elements
        for unit in root.xpath('//xbrli:unit', namespaces=ns):
            unit_id = unit.get('id')
            if not unit_id:
                continue

            # Check for complex unit (divide) FIRST: the descendant search for
            # a simple measure below would otherwise match the numerator's
            # measure and record a ratio unit as just its numerator.
            divide = unit.find('.//xbrli:divide', namespaces=ns)
            if divide is not None:
                numerator = divide.find('.//xbrli:unitNumerator/xbrli:measure', namespaces=ns)
                denominator = divide.find('.//xbrli:unitDenominator/xbrli:measure', namespaces=ns)

                if numerator is not None and denominator is not None:
                    num_unit = self._normalize_unit(numerator.text)
                    den_unit = self._normalize_unit(denominator.text)
                    self.units[unit_id] = f"{num_unit}/{den_unit}"
                # An incomplete divide is skipped rather than mis-recorded.
                continue

            # Check for simple measure
            measure = unit.find('.//xbrli:measure', namespaces=ns)
            if measure is not None:
                self.units[unit_id] = self._normalize_unit(measure.text)

    def _normalize_unit(self, unit_text: str) -> str:
        """
        Normalize unit text.

        Strips surrounding whitespace, drops any ``prefix:`` and maps a few
        well-known measures to a display form; unknown measures are returned
        with their original casing. Empty or None input yields ''.
        """
        if not unit_text:
            return ''

        unit_text = unit_text.strip()

        # Remove namespace prefix
        if ':' in unit_text:
            unit_text = unit_text.split(':')[-1]

        # Common normalizations
        return self._UNIT_ALIASES.get(unit_text.lower(), unit_text)

    def _extract_nonfraction(self, element: HtmlElement) -> Dict[str, Any]:
        """Extract ix:nonFraction element metadata, omitting absent attributes."""
        attr = self._get_attribute
        metadata = {
            'type': 'nonFraction',
            'name': element.get('name'),
            'contextRef': attr(element, 'contextRef'),
            'unitRef': attr(element, 'unitRef'),
            'decimals': element.get('decimals'),
            'scale': element.get('scale'),
            'format': element.get('format'),
            'sign': element.get('sign')
        }

        # Clean None values
        return {k: v for k, v in metadata.items() if v is not None}

    def _extract_nonnumeric(self, element: HtmlElement) -> Dict[str, Any]:
        """Extract ix:nonNumeric element metadata, omitting absent attributes."""
        metadata = {
            'type': 'nonNumeric',
            'name': element.get('name'),
            'contextRef': self._get_attribute(element, 'contextRef'),
            'format': element.get('format')
        }

        # Clean None values
        return {k: v for k, v in metadata.items() if v is not None}

    def _extract_continuation(self, element: HtmlElement) -> Dict[str, Any]:
        """
        Extract ix:continuation element.

        When both ``id`` and ``continuedAt`` are present, the element is
        registered in ``self.continuations``: if its ``continuedAt`` target is
        already registered, the target's metadata is shared under this id and
        returned; otherwise fresh metadata is stored and returned. Elements
        lacking either attribute yield an empty dict.
        """
        cont_id = element.get('id')
        continued_at = self._get_attribute(element, 'continuedAt')

        if not (cont_id and continued_at):
            return {}

        # Map continuation to original
        if continued_at in self.continuations:
            original = self.continuations[continued_at]
            self.continuations[cont_id] = original
            return original

        # Store for later resolution
        metadata = {
            'type': 'continuation',
            'id': cont_id,
            'continuedAt': continued_at
        }
        self.continuations[cont_id] = metadata
        return metadata

    def _extract_footnote(self, element: HtmlElement) -> Dict[str, Any]:
        """
        Extract ix:footnote element.

        Unlike the fact extractors, both keys are always present in the result
        (with value None when the attribute is absent).
        """
        return {
            'type': 'footnote',
            'footnoteRole': self._get_attribute(element, 'footnoteRole'),
            'footnoteID': self._get_attribute(element, 'footnoteID')
        }

    def _extract_fraction(self, element: HtmlElement) -> Dict[str, Any]:
        """Extract ix:fraction element metadata, including numerator/denominator text."""
        attr = self._get_attribute
        metadata = {
            'type': 'fraction',
            'name': element.get('name'),
            'contextRef': attr(element, 'contextRef'),
            'unitRef': attr(element, 'unitRef')
        }

        # Extract numerator and denominator
        numerator = element.find('.//ix:numerator', namespaces=self.NAMESPACES)
        denominator = element.find('.//ix:denominator', namespaces=self.NAMESPACES)

        if numerator is not None:
            metadata['numerator'] = numerator.text
        if denominator is not None:
            metadata['denominator'] = denominator.text

        return {k: v for k, v in metadata.items() if v is not None}

    def _get_fact_value(self, element: HtmlElement) -> str:
        """
        Get fact value from element with transformations.

        Steps, in order: strip the element text, apply the ``format``
        transformation if it is a known one, apply the power-of-ten ``scale``,
        then prefix '-' when ``sign`` is '-'.

        Returns:
            The transformed value as a string. Scaled values are rendered in
            fixed-point notation without float artefacts (e.g. '1100', not
            '1100.0000000000002'). Text that cannot be parsed as a number is
            returned unscaled.
        """
        from decimal import Decimal

        # Get raw value. Strip up front so padding cannot defeat exact-match
        # transforms (zerodash) or end up between the sign and the digits.
        # NOTE(review): only element.text is read, so text nested inside child
        # elements is not included.
        value = (element.text or '').strip()

        # Apply format transformation if specified
        format_attr = element.get('format')
        transform = self.TRANSFORMATIONS.get(format_attr) if format_attr else None
        if transform is not None:
            value = transform(value)

        # Apply scale if specified
        scale = element.get('scale')
        if scale:
            try:
                # Decimal keeps the shift exact; binary floats do not.
                scaled = Decimal(value.replace(',', '')).scaleb(int(scale))
            except (ValueError, TypeError, ArithmeticError):
                # Best effort: non-numeric text or a bad scale leaves value as is
                pass
            else:
                if scaled.is_finite():
                    # 'f' avoids exponent notation such as '1.234E+6'
                    value = format(scaled, 'f')

        # Apply sign if specified
        sign = element.get('sign')
        if sign == '-' and value and not value.startswith('-'):
            value = '-' + value

        return value.strip()