Initial commit
This commit is contained in:
149
venv/lib/python3.10/site-packages/edgar/xbrl/parsers/labels.py
Normal file
149
venv/lib/python3.10/site-packages/edgar/xbrl/parsers/labels.py
Normal file
@@ -0,0 +1,149 @@
|
||||
"""
|
||||
Labels parser for XBRL documents.
|
||||
|
||||
This module handles parsing of XBRL label linkbases and extracting
|
||||
element labels for display purposes.
|
||||
"""
|
||||
|
||||
from pathlib import Path
|
||||
from typing import Dict, Union
|
||||
|
||||
from lxml import etree as ET
|
||||
|
||||
from edgar.xbrl.core import STANDARD_LABEL, extract_element_id
|
||||
from edgar.xbrl.models import ElementCatalog, XBRLProcessingError
|
||||
|
||||
from .base import BaseParser
|
||||
|
||||
|
||||
class LabelsParser(BaseParser):
|
||||
"""Parser for XBRL label linkbases."""
|
||||
|
||||
def __init__(self, element_catalog: Dict[str, ElementCatalog]):
|
||||
"""
|
||||
Initialize labels parser with data structure references.
|
||||
|
||||
Args:
|
||||
element_catalog: Reference to element catalog dictionary
|
||||
"""
|
||||
super().__init__()
|
||||
|
||||
# Store references to data structures
|
||||
self.element_catalog = element_catalog
|
||||
|
||||
def parse_labels(self, file_path: Union[str, Path]) -> None:
|
||||
"""Parse label linkbase file and extract label information."""
|
||||
try:
|
||||
content = Path(file_path).read_text()
|
||||
self.parse_labels_content(content)
|
||||
except Exception as e:
|
||||
raise XBRLProcessingError(f"Error parsing label file {file_path}: {str(e)}") from e
|
||||
|
||||
def parse_labels_content(self, content: str) -> None:
|
||||
"""Parse label linkbase content and extract label information."""
|
||||
try:
|
||||
# Optimize: Register namespaces for faster XPath lookups
|
||||
nsmap = {
|
||||
'link': 'http://www.xbrl.org/2003/linkbase',
|
||||
'xlink': 'http://www.w3.org/1999/xlink',
|
||||
'xml': 'http://www.w3.org/XML/1998/namespace'
|
||||
}
|
||||
|
||||
# Optimize: Use lxml parser with smart string handling
|
||||
parser = ET.XMLParser(remove_blank_text=True, recover=True)
|
||||
root = ET.XML(content.encode('utf-8'), parser)
|
||||
|
||||
# Optimize: Use specific XPath expressions with namespaces for faster lookups
|
||||
# This is much faster than using findall with '//' in element tree
|
||||
label_arcs = root.xpath('//link:labelArc', namespaces=nsmap)
|
||||
labels = root.xpath('//link:label', namespaces=nsmap)
|
||||
|
||||
# Optimize: Pre-allocate dictionary with expected size
|
||||
label_lookup = {}
|
||||
|
||||
# Optimize: Cache attribute lookups
|
||||
xlink_label = '{http://www.w3.org/1999/xlink}label'
|
||||
xlink_role = '{http://www.w3.org/1999/xlink}role'
|
||||
xml_lang = '{http://www.w3.org/XML/1998/namespace}lang'
|
||||
default_role = 'http://www.xbrl.org/2003/role/label'
|
||||
|
||||
# Optimize: Process labels in a single pass with direct attribute access
|
||||
for label in labels:
|
||||
label_id = label.get(xlink_label)
|
||||
if not label_id:
|
||||
continue
|
||||
|
||||
# Get text first - if empty, skip further processing
|
||||
text = label.text
|
||||
if text is None:
|
||||
continue
|
||||
|
||||
# Get attributes - direct lookup is faster than method calls
|
||||
role = label.get(xlink_role, default_role)
|
||||
lang = label.get(xml_lang, 'en-US')
|
||||
|
||||
# Create nested dictionaries only when needed
|
||||
if label_id not in label_lookup:
|
||||
label_lookup[label_id] = {}
|
||||
|
||||
if lang not in label_lookup[label_id]:
|
||||
label_lookup[label_id][lang] = {}
|
||||
|
||||
label_lookup[label_id][lang][role] = text
|
||||
|
||||
# Optimize: Cache attribute lookups for arcs
|
||||
xlink_from = '{http://www.w3.org/1999/xlink}from'
|
||||
xlink_to = '{http://www.w3.org/1999/xlink}to'
|
||||
xlink_href = '{http://www.w3.org/1999/xlink}href'
|
||||
|
||||
# Optimize: Create a lookup table for locators by label for faster access
|
||||
loc_by_label = {}
|
||||
for loc in root.xpath('//link:loc', namespaces=nsmap):
|
||||
loc_label = loc.get(xlink_label)
|
||||
if loc_label:
|
||||
loc_by_label[loc_label] = loc.get(xlink_href)
|
||||
|
||||
# Connect labels to elements using arcs - with optimized lookups
|
||||
for arc in label_arcs:
|
||||
from_ref = arc.get(xlink_from)
|
||||
to_ref = arc.get(xlink_to)
|
||||
|
||||
if not from_ref or not to_ref or to_ref not in label_lookup:
|
||||
continue
|
||||
|
||||
# Use cached locator lookup instead of expensive XPath
|
||||
href = loc_by_label.get(from_ref)
|
||||
if not href:
|
||||
continue
|
||||
|
||||
# Extract element ID from href
|
||||
element_id = extract_element_id(href)
|
||||
|
||||
# Find labels for this element - check most likely case first
|
||||
if 'en-US' in label_lookup[to_ref]:
|
||||
element_labels = label_lookup[to_ref]['en-US']
|
||||
|
||||
# Optimize: Update catalog with minimal overhead
|
||||
catalog_entry = self.element_catalog.get(element_id)
|
||||
if catalog_entry:
|
||||
catalog_entry.labels.update(element_labels)
|
||||
else:
|
||||
# Create placeholder in catalog
|
||||
self.element_catalog[element_id] = ElementCatalog(
|
||||
name=element_id,
|
||||
data_type="",
|
||||
period_type="duration",
|
||||
labels=element_labels
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
raise XBRLProcessingError(f"Error parsing label content: {str(e)}") from e
|
||||
|
||||
def get_element_label(self, element_id: str) -> str:
|
||||
"""Get the label for an element, falling back to the element ID if not found."""
|
||||
if element_id in self.element_catalog and self.element_catalog[element_id].labels:
|
||||
# Use standard label if available
|
||||
standard_label = self.element_catalog[element_id].labels.get(STANDARD_LABEL)
|
||||
if standard_label:
|
||||
return standard_label
|
||||
return element_id # Fallback to element ID
|
||||
Reference in New Issue
Block a user