Initial commit
This commit is contained in:
148
venv/lib/python3.10/site-packages/edgar/xbrl/parsers/base.py
Normal file
148
venv/lib/python3.10/site-packages/edgar/xbrl/parsers/base.py
Normal file
@@ -0,0 +1,148 @@
|
||||
"""
|
||||
Base parser functionality for XBRL parsing components.
|
||||
|
||||
This module provides common utilities and base functionality shared across
|
||||
all XBRL parser components.
|
||||
"""
|
||||
|
||||
from typing import Any, Dict
|
||||
|
||||
from lxml import etree as ET
|
||||
|
||||
from edgar.core import log
|
||||
from edgar.xbrl.core import NAMESPACES
|
||||
|
||||
|
||||
class BaseParser:
|
||||
"""Base class for XBRL parser components with common functionality."""
|
||||
|
||||
def __init__(self):
|
||||
"""Initialize base parser with common data structures."""
|
||||
# Common namespaces and utilities available to all parsers
|
||||
self.namespaces = NAMESPACES
|
||||
|
||||
def _safe_parse_xml(self, content: str) -> ET.Element:
|
||||
"""
|
||||
Safely parse XML content with lxml, handling encoding declarations properly.
|
||||
|
||||
Args:
|
||||
content: XML content as string or bytes
|
||||
|
||||
Returns:
|
||||
parsed XML root element
|
||||
"""
|
||||
parser = ET.XMLParser(remove_blank_text=True, recover=True)
|
||||
|
||||
# Convert to bytes for safer parsing if needed
|
||||
if isinstance(content, str):
|
||||
content_bytes = content.encode('utf-8')
|
||||
else:
|
||||
content_bytes = content
|
||||
|
||||
# Parse with lxml
|
||||
return ET.XML(content_bytes, parser)
|
||||
|
||||
def _parse_order_attribute(self, arc) -> float:
|
||||
"""Parse order attribute from arc, checking both order and xlink:order."""
|
||||
# Try xlink:order first (XBRL standard)
|
||||
order_value = arc.get('{http://www.w3.org/1999/xlink}order')
|
||||
if order_value is None:
|
||||
# Fallback to order attribute
|
||||
order_value = arc.get('order')
|
||||
|
||||
# Debug logging to understand what's in the XBRL document
|
||||
if order_value is not None:
|
||||
log.debug(f"Found order attribute: {order_value}")
|
||||
else:
|
||||
# Log all attributes to see what's actually there
|
||||
all_attrs = dict(arc.attrib) if hasattr(arc, 'attrib') else {}
|
||||
log.debug(f"No order attribute found. Available attributes: {all_attrs}")
|
||||
|
||||
try:
|
||||
return float(order_value) if order_value is not None else 0.0
|
||||
except (ValueError, TypeError):
|
||||
return 0.0
|
||||
|
||||
def _extract_role_info(self, role_element) -> Dict[str, Any]:
|
||||
"""
|
||||
Extract role information from a role element.
|
||||
|
||||
Args:
|
||||
role_element: XML element containing role definition
|
||||
|
||||
Returns:
|
||||
Dictionary with role information
|
||||
"""
|
||||
role_info = {}
|
||||
|
||||
# Get role URI
|
||||
role_uri = role_element.get('roleURI', '')
|
||||
role_info['uri'] = role_uri
|
||||
|
||||
# Extract role definition/label
|
||||
definition_elem = role_element.find('.//{http://www.xbrl.org/2003/linkbase}definition')
|
||||
if definition_elem is not None:
|
||||
role_info['definition'] = definition_elem.text or ''
|
||||
else:
|
||||
# Fallback: create definition from role URI
|
||||
role_info['definition'] = role_uri.split('/')[-1].replace('_', ' ') if role_uri else ''
|
||||
|
||||
return role_info
|
||||
|
||||
def _get_element_namespace_and_name(self, element_id: str) -> tuple[str, str]:
|
||||
"""
|
||||
Extract namespace and local name from an element ID.
|
||||
|
||||
Args:
|
||||
element_id: Element identifier (may include namespace prefix)
|
||||
|
||||
Returns:
|
||||
Tuple of (namespace, local_name)
|
||||
"""
|
||||
if ':' in element_id:
|
||||
prefix, local_name = element_id.split(':', 1)
|
||||
# Map common prefixes to namespaces
|
||||
namespace_map = {
|
||||
'us-gaap': 'http://fasb.org/us-gaap/2024',
|
||||
'dei': 'http://xbrl.sec.gov/dei/2024',
|
||||
'invest': 'http://xbrl.sec.gov/invest/2013-01-31',
|
||||
'country': 'http://xbrl.sec.gov/country/2023',
|
||||
'currency': 'http://xbrl.sec.gov/currency/2023',
|
||||
'exch': 'http://xbrl.sec.gov/exch/2023',
|
||||
'naics': 'http://xbrl.sec.gov/naics/2023',
|
||||
'sic': 'http://xbrl.sec.gov/sic/2023',
|
||||
'stpr': 'http://xbrl.sec.gov/stpr/2023',
|
||||
}
|
||||
namespace = namespace_map.get(prefix, f'http://unknown.namespace/{prefix}')
|
||||
return namespace, local_name
|
||||
else:
|
||||
return '', element_id
|
||||
|
||||
def _normalize_element_id(self, element_id: str) -> str:
|
||||
"""
|
||||
Normalize element ID to a consistent format.
|
||||
|
||||
Args:
|
||||
element_id: Original element identifier
|
||||
|
||||
Returns:
|
||||
Normalized element identifier
|
||||
"""
|
||||
if ':' in element_id:
|
||||
prefix, name = element_id.split(':', 1)
|
||||
return f"{prefix}_{name}"
|
||||
return element_id
|
||||
|
||||
def _log_parsing_progress(self, component: str, count: int, total: int = None):
|
||||
"""
|
||||
Log parsing progress for debugging.
|
||||
|
||||
Args:
|
||||
component: Name of component being parsed
|
||||
count: Number of items processed
|
||||
total: Total number of items (optional)
|
||||
"""
|
||||
if total:
|
||||
log.debug(f"Parsed {count}/{total} {component}")
|
||||
else:
|
||||
log.debug(f"Parsed {count} {component}")
|
||||
Reference in New Issue
Block a user