Initial commit

This commit is contained in:
kdusek
2025-12-09 12:13:01 +01:00
commit 8e654ed209
13332 changed files with 2695056 additions and 0 deletions

View File

@@ -0,0 +1,345 @@
"""
XBRL extraction strategy for inline XBRL documents.
"""
from typing import Dict, Any, Optional
from lxml.html import HtmlElement
from edgar.documents.types import XBRLFact
class XBRLExtractor:
    """
    Extracts XBRL facts from inline XBRL (iXBRL) documents.

    Handles:
    - ix:nonFraction, ix:nonNumeric and ix:fraction facts
    - Context and unit resolution
    - Continuation handling
    - Transformation rules

    Elements are matched in two spellings: namespace-qualified tags
    (``{namespace}local``) and tags carrying a literal ``prefix:local`` name.
    Attribute names are read with a lower-case fallback (``contextRef`` /
    ``contextref``).
    NOTE(review): the literal/lower-case spellings are presumably what an HTML
    parser produces for iXBRL markup -- confirm against the calling parser.
    """

    # XBRL namespaces
    NAMESPACES = {
        'ix': 'http://www.xbrl.org/2013/inlineXBRL',
        'xbrli': 'http://www.xbrl.org/2003/instance',
        'xbrldi': 'http://xbrl.org/2006/xbrldi',
        'xsi': 'http://www.w3.org/2001/XMLSchema-instance'
    }

    # Common transformation formats. Each callable receives the (already
    # whitespace-stripped) displayed text and returns the normalised text.
    TRANSFORMATIONS = {
        # "1,234.56" -> "1234.56"
        'ixt:numdotdecimal': lambda x: x.replace(',', ''),
        # "1.234,56" -> "1234.56": drop '.' group separators, then turn the
        # decimal comma into a decimal point (same output shape as above).
        'ixt:numcommadecimal': lambda x: x.replace('.', '').replace(',', '.'),
        # A lone dash (hyphen, en dash or em dash) stands for zero.
        'ixt:zerodash': lambda x: '0' if x.strip() in ('-', '\u2013', '\u2014') else x,
        # NOTE(review): the two date rules only swap the separator; they do
        # not reorder day/month/year into ISO form.
        'ixt:datedoteu': lambda x: x.replace('.', '-'),
        'ixt:datedotus': lambda x: x.replace('.', '/'),
    }

    # Lower-cased unit name -> normalised spelling.
    _UNIT_MAP = {
        'usd': 'USD',
        'shares': 'shares',
        'pure': 'pure',
        'percent': '%'
    }

    def __init__(self):
        """Initialize XBRL extractor with empty context/unit/continuation tables."""
        # context id -> extracted context data
        self.contexts: Dict[str, Dict[str, Any]] = {}
        # unit id -> normalised unit string
        self.units: Dict[str, str] = {}
        # continuation id -> continuation metadata
        self.continuations: Dict[str, Dict[str, Any]] = {}
        # Set once contexts and units have been read from the document.
        self._initialized = False

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def extract_context(self, element: "HtmlElement") -> Optional[Dict[str, Any]]:
        """
        Extract XBRL metadata from an element.

        On the first ix: element seen, the document's contexts and units are
        loaded from the element's root tree.

        Args:
            element: HTML element that might contain XBRL

        Returns:
            Metadata dict for nonFraction, nonNumeric, continuation, footnote
            and fraction elements; None for non-ix elements and for other ix
            element types.
        """
        if not self._is_xbrl_element(element):
            return None

        if not self._initialized:
            self._initialize_context(element)

        handlers = {
            'nonfraction': self._extract_nonfraction,
            'nonnumeric': self._extract_nonnumeric,
            'continuation': self._extract_continuation,
            'footnote': self._extract_footnote,
            'fraction': self._extract_fraction,
        }
        handler = handlers.get(self._get_local_name(element.tag))
        if handler is None:
            return None
        return handler(element)

    def extract_fact(self, element: "HtmlElement") -> "Optional[XBRLFact]":
        """
        Extract an XBRL fact from an element.

        Args:
            element: HTML element that might contain XBRL

        Returns:
            An XBRLFact built from the element's metadata and transformed
            value, with ``context`` / ``unit`` filled in when the references
            are known; None when no (non-empty) metadata could be extracted.
        """
        context = self.extract_context(element)
        if not context:
            return None

        fact = XBRLFact(
            concept=context.get('name', ''),
            value=self._get_fact_value(element),
            context_ref=context.get('contextRef'),
            unit_ref=context.get('unitRef'),
            decimals=context.get('decimals'),
            scale=context.get('scale'),
            format=context.get('format'),
            sign=context.get('sign')
        )

        # Resolve references against the tables loaded from the document.
        if fact.context_ref and fact.context_ref in self.contexts:
            fact.context = self.contexts[fact.context_ref]
        if fact.unit_ref and fact.unit_ref in self.units:
            fact.unit = self.units[fact.unit_ref]
        return fact

    # ------------------------------------------------------------------
    # Tag / attribute helpers
    # ------------------------------------------------------------------

    def _is_xbrl_element(self, element: "HtmlElement") -> bool:
        """Return True when the element's tag is in the ix namespace or has an ``ix:`` prefix."""
        tag = element.tag
        # Non-element nodes (e.g. comments) do not have a string tag.
        if not isinstance(tag, str):
            return False
        return (
            tag.startswith('{' + self.NAMESPACES['ix'] + '}')
            or tag.lower().startswith('ix:')
        )

    def _get_local_name(self, tag: str) -> str:
        """Return the lower-cased local part of a ``{ns}local`` or ``prefix:local`` tag."""
        if '}' in tag:
            return tag.split('}')[1].lower()
        if ':' in tag:
            return tag.split(':')[1].lower()
        return tag.lower()

    @staticmethod
    def _get_attr(element: "HtmlElement", name: str) -> Optional[str]:
        """Read attribute ``name``, falling back to its lower-cased spelling."""
        return element.get(name) or element.get(name.lower())

    def _has_name(self, element: "HtmlElement", prefix: str, local_name: str) -> bool:
        """
        Check whether ``element`` is ``prefix:local_name``.

        Accepts the namespace-qualified tag (namespace looked up in
        NAMESPACES) as well as the literal ``prefix:local`` tag; the local
        name is compared case-insensitively.
        """
        tag = element.tag
        if not isinstance(tag, str):
            return False
        wanted = local_name.lower()
        if tag.startswith('{'):
            namespace, _, local = tag[1:].partition('}')
            return namespace == self.NAMESPACES[prefix] and local.lower() == wanted
        return tag.lower() == prefix + ':' + wanted

    def _iter_named(self, parent: "HtmlElement", prefix: str, local_name: str,
                    include_self: bool = False):
        """Yield, in document order, the descendants of ``parent`` named ``prefix:local_name``."""
        for node in parent.iter():
            if node is parent and not include_self:
                continue
            if self._has_name(node, prefix, local_name):
                yield node

    def _find_named(self, parent: "HtmlElement", prefix: str, local_name: str):
        """Return the first descendant of ``parent`` named ``prefix:local_name``, or None."""
        return next(self._iter_named(parent, prefix, local_name), None)

    # ------------------------------------------------------------------
    # Context / unit tables
    # ------------------------------------------------------------------

    def _initialize_context(self, element: "HtmlElement"):
        """Load context and unit definitions from the document containing ``element``."""
        root = element.getroottree().getroot()
        self._extract_contexts(root)
        self._extract_units(root)
        self._initialized = True

    def _extract_contexts(self, root: "HtmlElement"):
        """
        Extract all xbrli:context definitions under ``root`` into ``self.contexts``.

        Each entry holds the context id plus, when present: ``entity`` and
        ``scheme`` (from xbrli:identifier), either ``instant`` or
        ``start_date``/``end_date`` with a matching ``period_type``, and a
        ``dimensions`` mapping built from xbrldi:explicitMember elements.
        Contexts without an id are skipped.
        """
        for context in self._iter_named(root, 'xbrli', 'context', include_self=True):
            context_id = context.get('id')
            if not context_id:
                continue

            context_data: Dict[str, Any] = {'id': context_id}

            # Entity
            entity = self._find_named(context, 'xbrli', 'entity')
            if entity is not None:
                identifier = self._find_named(entity, 'xbrli', 'identifier')
                if identifier is not None:
                    context_data['entity'] = identifier.text
                    context_data['scheme'] = identifier.get('scheme')

            # Period: an instant takes precedence over a start/end pair.
            period = self._find_named(context, 'xbrli', 'period')
            if period is not None:
                instant = self._find_named(period, 'xbrli', 'instant')
                if instant is not None:
                    context_data['instant'] = instant.text
                    context_data['period_type'] = 'instant'
                else:
                    start = self._find_named(period, 'xbrli', 'startDate')
                    end = self._find_named(period, 'xbrli', 'endDate')
                    if start is not None and end is not None:
                        context_data['start_date'] = start.text
                        context_data['end_date'] = end.text
                        context_data['period_type'] = 'duration'

            # Dimensions
            segment = self._find_named(context, 'xbrli', 'segment')
            if segment is not None:
                dimensions = {}
                for member in self._iter_named(segment, 'xbrldi', 'explicitMember'):
                    dim = member.get('dimension')
                    if dim:
                        dimensions[dim] = member.text
                if dimensions:
                    context_data['dimensions'] = dimensions

            self.contexts[context_id] = context_data

    def _divide_measure(self, divide: "HtmlElement", side: str):
        """Return the first xbrli:measure that is a direct child of an xbrli:<side> under ``divide``."""
        for holder in self._iter_named(divide, 'xbrli', side):
            for child in holder:
                if self._has_name(child, 'xbrli', 'measure'):
                    return child
        return None

    def _extract_units(self, root: "HtmlElement"):
        """
        Extract all xbrli:unit definitions under ``root`` into ``self.units``.

        Ratio units (xbrli:divide) are stored as ``"numerator/denominator"``;
        other units are stored as their first measure. Units without an id
        are skipped.
        """
        for unit in self._iter_named(root, 'xbrli', 'unit', include_self=True):
            unit_id = unit.get('id')
            if not unit_id:
                continue

            # The divide must be checked first: its measures are descendants
            # of the unit too, so a plain "first measure" lookup would match
            # the numerator and the ratio would never be seen.
            divide = self._find_named(unit, 'xbrli', 'divide')
            if divide is not None:
                numerator = self._divide_measure(divide, 'unitNumerator')
                denominator = self._divide_measure(divide, 'unitDenominator')
                if numerator is not None and denominator is not None:
                    num_unit = self._normalize_unit(numerator.text)
                    den_unit = self._normalize_unit(denominator.text)
                    self.units[unit_id] = f"{num_unit}/{den_unit}"
                    continue

            # Simple measure (also the fallback for an incomplete divide).
            measure = self._find_named(unit, 'xbrli', 'measure')
            if measure is not None:
                self.units[unit_id] = self._normalize_unit(measure.text)

    def _normalize_unit(self, unit_text: str) -> str:
        """
        Normalize unit text.

        Drops any ``prefix:`` and maps a few common names (case-insensitively)
        to a fixed spelling; unknown names are returned without the prefix.
        Empty or missing text yields ``''``.
        """
        if not unit_text:
            return ''
        # Remove namespace prefix
        if ':' in unit_text:
            unit_text = unit_text.split(':')[-1]
        return self._UNIT_MAP.get(unit_text.lower(), unit_text)

    # ------------------------------------------------------------------
    # Per-element metadata
    # ------------------------------------------------------------------

    def _extract_nonfraction(self, element: "HtmlElement") -> Dict[str, Any]:
        """Extract ix:nonFraction metadata; attributes that are absent are omitted."""
        metadata = {
            'type': 'nonFraction',
            'name': element.get('name'),
            'contextRef': self._get_attr(element, 'contextRef'),
            'unitRef': self._get_attr(element, 'unitRef'),
            'decimals': element.get('decimals'),
            'scale': element.get('scale'),
            'format': element.get('format'),
            'sign': element.get('sign')
        }
        return {k: v for k, v in metadata.items() if v is not None}

    def _extract_nonnumeric(self, element: "HtmlElement") -> Dict[str, Any]:
        """Extract ix:nonNumeric metadata; attributes that are absent are omitted."""
        metadata = {
            'type': 'nonNumeric',
            'name': element.get('name'),
            'contextRef': self._get_attr(element, 'contextRef'),
            'format': element.get('format')
        }
        return {k: v for k, v in metadata.items() if v is not None}

    def _extract_continuation(self, element: "HtmlElement") -> Dict[str, Any]:
        """
        Extract ix:continuation metadata and record it in ``self.continuations``.

        Returns:
            The metadata already stored for the ``continuedAt`` target when
            there is one (it is then also registered under this element's id);
            otherwise new metadata for this element. An empty dict when the
            element lacks an id or a ``continuedAt`` attribute.
        """
        cont_id = element.get('id')
        continued_at = self._get_attr(element, 'continuedAt')
        if not (cont_id and continued_at):
            return {}

        if continued_at in self.continuations:
            # Map continuation to original
            original = self.continuations[continued_at]
            self.continuations[cont_id] = original
            return original

        # Store for later resolution
        metadata = {
            'type': 'continuation',
            'id': cont_id,
            'continuedAt': continued_at
        }
        self.continuations[cont_id] = metadata
        return metadata

    def _extract_footnote(self, element: "HtmlElement") -> Dict[str, Any]:
        """Extract ix:footnote metadata (absent attributes are reported as None)."""
        return {
            'type': 'footnote',
            'footnoteRole': self._get_attr(element, 'footnoteRole'),
            'footnoteID': self._get_attr(element, 'footnoteID')
        }

    def _extract_fraction(self, element: "HtmlElement") -> Dict[str, Any]:
        """
        Extract ix:fraction metadata.

        Includes the text of the first ix:numerator / ix:denominator
        descendants; attributes and parts that are absent are omitted.
        """
        metadata = {
            'type': 'fraction',
            'name': element.get('name'),
            'contextRef': self._get_attr(element, 'contextRef'),
            'unitRef': self._get_attr(element, 'unitRef')
        }

        numerator = self._find_named(element, 'ix', 'numerator')
        denominator = self._find_named(element, 'ix', 'denominator')
        if numerator is not None:
            metadata['numerator'] = numerator.text
        if denominator is not None:
            metadata['denominator'] = denominator.text
        return {k: v for k, v in metadata.items() if v is not None}

    # ------------------------------------------------------------------
    # Fact value
    # ------------------------------------------------------------------

    def _get_fact_value(self, element: "HtmlElement") -> str:
        """
        Get the fact value from an element with transformations applied.

        Steps, in order: take the element's own text (stripped), apply the
        ``format`` transformation when it is a known one, multiply by
        ``10 ** scale`` when ``scale`` is set and the text is numeric, and
        prepend ``-`` when ``sign`` is ``-``.

        Scaling uses decimal arithmetic, so the result is an exact plain
        decimal string (e.g. ``'1234000'``), free of binary floating-point
        noise. Text that cannot be parsed as a number is left unscaled.

        Returns:
            The transformed value as a string.
        """
        from decimal import Decimal, InvalidOperation

        # Strip first so padding cannot defeat the transformations or end up
        # between the sign and the digits.
        value = (element.text or '').strip()

        # Apply format transformation if specified
        format_attr = element.get('format')
        transform = self.TRANSFORMATIONS.get(format_attr) if format_attr else None
        if transform is not None:
            value = transform(value)

        # Apply scale if specified
        scale = element.get('scale')
        if scale:
            try:
                exponent = int(scale)
                number = Decimal(value.replace(',', ''))
                # 'f' avoids scientific notation such as '1.234E+6'.
                value = format(number.scaleb(exponent), 'f')
            except (ValueError, TypeError, InvalidOperation):
                # Best effort: keep the unscaled text when it is not numeric.
                pass

        # Apply sign if specified
        if element.get('sign') == '-' and value and not value.startswith('-'):
            value = '-' + value
        return value.strip()