Files
2025-12-09 12:13:01 +01:00

149 lines
5.0 KiB
Python

"""
Base parser functionality for XBRL parsing components.
This module provides common utilities and base functionality shared across
all XBRL parser components.
"""
import math
from typing import Any, Dict, Optional, Union

from lxml import etree as ET

from edgar.core import log
from edgar.xbrl.core import NAMESPACES
class BaseParser:
    """Base class for XBRL parser components with common functionality.

    Offers helpers for parsing XML with lxml, reading the ``order`` attribute
    of an arc, extracting role metadata, splitting/normalizing prefixed
    element ids, and emitting debug progress messages.
    """

    # Prefix -> namespace URI table used by _get_element_namespace_and_name.
    # Kept at class level so it is built once instead of on every call.
    _PREFIX_NAMESPACES: Dict[str, str] = {
        'us-gaap': 'http://fasb.org/us-gaap/2024',
        'dei': 'http://xbrl.sec.gov/dei/2024',
        'invest': 'http://xbrl.sec.gov/invest/2013-01-31',
        'country': 'http://xbrl.sec.gov/country/2023',
        'currency': 'http://xbrl.sec.gov/currency/2023',
        'exch': 'http://xbrl.sec.gov/exch/2023',
        'naics': 'http://xbrl.sec.gov/naics/2023',
        'sic': 'http://xbrl.sec.gov/sic/2023',
        'stpr': 'http://xbrl.sec.gov/stpr/2023',
    }

    def __init__(self):
        """Initialize base parser with common data structures."""
        # Same object as the module-level NAMESPACES (a reference, not a copy).
        self.namespaces = NAMESPACES

    def _safe_parse_xml(self, content: Union[str, bytes]) -> "ET._Element":
        """
        Parse XML content with lxml in recovering mode.

        Args:
            content: XML document as text or as raw bytes

        Returns:
            Parsed XML root element. The parser runs with ``recover=True``,
            so malformed input is repaired on a best-effort basis instead of
            raising (NOTE(review): lxml may hand back ``None`` when nothing
            can be salvaged - confirm callers tolerate that).
        """
        # NOTE(review): entity handling is left at lxml's defaults; confirm
        # that is acceptable if documents can come from untrusted sources.
        if isinstance(content, str):
            # lxml refuses str input that carries an encoding declaration, so
            # the text is handed over as bytes. Because *we* pick UTF-8 for
            # those bytes, the parser is told to use UTF-8 as well; otherwise
            # a prolog such as encoding="ISO-8859-1" would make it decode our
            # UTF-8 bytes with the wrong codec and corrupt non-ASCII text.
            text_parser = ET.XMLParser(
                remove_blank_text=True, recover=True, encoding='utf-8'
            )
            return ET.XML(content.encode('utf-8'), text_parser)

        # Bytes input: let the parser honour the document's own declaration.
        bytes_parser = ET.XMLParser(remove_blank_text=True, recover=True)
        return ET.XML(content, bytes_parser)

    def _parse_order_attribute(self, arc) -> float:
        """
        Read the ordering value of an arc element.

        The xlink-qualified ``order`` attribute is consulted first and the
        plain ``order`` attribute second.

        Args:
            arc: Element-like object exposing ``get`` (and usually ``attrib``)

        Returns:
            The order as a finite float, or 0.0 when the attribute is missing,
            not numeric, or not finite.
        """
        order_value = arc.get('{http://www.w3.org/1999/xlink}order')
        if order_value is None:
            order_value = arc.get('order')

        # Debug output to help understand what the document actually contains.
        if order_value is not None:
            log.debug(f"Found order attribute: {order_value}")
        else:
            all_attrs = dict(arc.attrib) if hasattr(arc, 'attrib') else {}
            log.debug(f"No order attribute found. Available attributes: {all_attrs}")

        return self._coerce_order(order_value)

    @staticmethod
    def _coerce_order(order_value) -> float:
        """Convert a raw order attribute value to a finite float (0.0 on failure)."""
        if order_value is None:
            return 0.0
        try:
            order = float(order_value)
        except (ValueError, TypeError):
            return 0.0
        # float() happily parses "nan" and "inf"; such values are not usable
        # as a sort key (NaN compares false with everything), so treat them
        # like any other unparseable value.
        return order if math.isfinite(order) else 0.0

    def _extract_role_info(self, role_element) -> Dict[str, Any]:
        """
        Extract role information from a role element.

        Args:
            role_element: XML element containing a role definition

        Returns:
            Dictionary with two keys: ``'uri'`` (the ``roleURI`` attribute, or
            ``''`` when absent) and ``'definition'`` (text of the first nested
            linkbase ``definition`` element; when there is no such element,
            the last path segment of the URI with underscores turned into
            spaces).
        """
        role_uri = role_element.get('roleURI', '')
        definition_elem = role_element.find(
            './/{http://www.xbrl.org/2003/linkbase}definition'
        )

        if definition_elem is not None:
            # An element that exists but is empty yields '' rather than None.
            definition = definition_elem.text or ''
        elif role_uri:
            # Fallback: derive a readable label from the tail of the URI.
            definition = role_uri.split('/')[-1].replace('_', ' ')
        else:
            definition = ''

        return {'uri': role_uri, 'definition': definition}

    def _get_element_namespace_and_name(self, element_id: str) -> tuple[str, str]:
        """
        Split an element ID into a namespace URI and a local name.

        Args:
            element_id: Element identifier, optionally ``prefix:local_name``

        Returns:
            Tuple of (namespace, local_name). Without a colon the namespace is
            ``''`` and the whole id is the local name. Known prefixes map to
            the URIs in ``_PREFIX_NAMESPACES``; any other prefix maps to
            ``http://unknown.namespace/<prefix>``.
        """
        # Only the first colon separates prefix from name; later colons stay
        # part of the local name.
        prefix, separator, local_name = element_id.partition(':')
        if not separator:
            return '', element_id

        namespace = self._PREFIX_NAMESPACES.get(
            prefix, f'http://unknown.namespace/{prefix}'
        )
        return namespace, local_name

    def _normalize_element_id(self, element_id: str) -> str:
        """
        Normalize element ID to a consistent format.

        Args:
            element_id: Original element identifier

        Returns:
            The identifier with its first ``:`` replaced by ``_`` (for example
            ``us-gaap:Assets`` becomes ``us-gaap_Assets``); identifiers
            without a colon are returned unchanged.
        """
        return element_id.replace(':', '_', 1)

    def _log_parsing_progress(self, component: str, count: int, total: Optional[int] = None):
        """
        Log parsing progress for debugging.

        Args:
            component: Name of component being parsed
            count: Number of items processed
            total: Total number of items (optional); a total of ``None`` or 0
                produces the short message without the ``/total`` part
        """
        if total:
            log.debug(f"Parsed {count}/{total} {component}")
        else:
            log.debug(f"Parsed {count} {component}")