Files
2025-12-09 12:13:01 +01:00

150 lines
5.9 KiB
Python

"""
Labels parser for XBRL documents.
This module handles parsing of XBRL label linkbases and extracting
element labels for display purposes.
"""
from pathlib import Path
from typing import Dict, Union
from lxml import etree as ET
from edgar.xbrl.core import STANDARD_LABEL, extract_element_id
from edgar.xbrl.models import ElementCatalog, XBRLProcessingError
from .base import BaseParser
class LabelsParser(BaseParser):
"""Parser for XBRL label linkbases."""
def __init__(self, element_catalog: Dict[str, ElementCatalog]):
"""
Initialize labels parser with data structure references.
Args:
element_catalog: Reference to element catalog dictionary
"""
super().__init__()
# Store references to data structures
self.element_catalog = element_catalog
def parse_labels(self, file_path: Union[str, Path]) -> None:
"""Parse label linkbase file and extract label information."""
try:
content = Path(file_path).read_text()
self.parse_labels_content(content)
except Exception as e:
raise XBRLProcessingError(f"Error parsing label file {file_path}: {str(e)}") from e
def parse_labels_content(self, content: str) -> None:
"""Parse label linkbase content and extract label information."""
try:
# Optimize: Register namespaces for faster XPath lookups
nsmap = {
'link': 'http://www.xbrl.org/2003/linkbase',
'xlink': 'http://www.w3.org/1999/xlink',
'xml': 'http://www.w3.org/XML/1998/namespace'
}
# Optimize: Use lxml parser with smart string handling
parser = ET.XMLParser(remove_blank_text=True, recover=True)
root = ET.XML(content.encode('utf-8'), parser)
# Optimize: Use specific XPath expressions with namespaces for faster lookups
# This is much faster than using findall with '//' in element tree
label_arcs = root.xpath('//link:labelArc', namespaces=nsmap)
labels = root.xpath('//link:label', namespaces=nsmap)
# Optimize: Pre-allocate dictionary with expected size
label_lookup = {}
# Optimize: Cache attribute lookups
xlink_label = '{http://www.w3.org/1999/xlink}label'
xlink_role = '{http://www.w3.org/1999/xlink}role'
xml_lang = '{http://www.w3.org/XML/1998/namespace}lang'
default_role = 'http://www.xbrl.org/2003/role/label'
# Optimize: Process labels in a single pass with direct attribute access
for label in labels:
label_id = label.get(xlink_label)
if not label_id:
continue
# Get text first - if empty, skip further processing
text = label.text
if text is None:
continue
# Get attributes - direct lookup is faster than method calls
role = label.get(xlink_role, default_role)
lang = label.get(xml_lang, 'en-US')
# Create nested dictionaries only when needed
if label_id not in label_lookup:
label_lookup[label_id] = {}
if lang not in label_lookup[label_id]:
label_lookup[label_id][lang] = {}
label_lookup[label_id][lang][role] = text
# Optimize: Cache attribute lookups for arcs
xlink_from = '{http://www.w3.org/1999/xlink}from'
xlink_to = '{http://www.w3.org/1999/xlink}to'
xlink_href = '{http://www.w3.org/1999/xlink}href'
# Optimize: Create a lookup table for locators by label for faster access
loc_by_label = {}
for loc in root.xpath('//link:loc', namespaces=nsmap):
loc_label = loc.get(xlink_label)
if loc_label:
loc_by_label[loc_label] = loc.get(xlink_href)
# Connect labels to elements using arcs - with optimized lookups
for arc in label_arcs:
from_ref = arc.get(xlink_from)
to_ref = arc.get(xlink_to)
if not from_ref or not to_ref or to_ref not in label_lookup:
continue
# Use cached locator lookup instead of expensive XPath
href = loc_by_label.get(from_ref)
if not href:
continue
# Extract element ID from href
element_id = extract_element_id(href)
# Find labels for this element - check most likely case first
if 'en-US' in label_lookup[to_ref]:
element_labels = label_lookup[to_ref]['en-US']
# Optimize: Update catalog with minimal overhead
catalog_entry = self.element_catalog.get(element_id)
if catalog_entry:
catalog_entry.labels.update(element_labels)
else:
# Create placeholder in catalog
self.element_catalog[element_id] = ElementCatalog(
name=element_id,
data_type="",
period_type="duration",
labels=element_labels
)
except Exception as e:
raise XBRLProcessingError(f"Error parsing label content: {str(e)}") from e
def get_element_label(self, element_id: str) -> str:
"""Get the label for an element, falling back to the element ID if not found."""
if element_id in self.element_catalog and self.element_catalog[element_id].labels:
# Use standard label if available
standard_label = self.element_catalog[element_id].labels.get(STANDARD_LABEL)
if standard_label:
return standard_label
return element_id # Fallback to element ID