Initial commit

2025-12-09 12:13:01 +01:00
commit 8e654ed209
13332 changed files with 2695056 additions and 0 deletions
--- a/venv/lib/python3.10/site-packages/edgar/xbrl/parsers/labels.py
+++ b/venv/lib/python3.10/site-packages/edgar/xbrl/parsers/labels.py
@@ -0,0 +1,149 @@
+"""
+Labels parser for XBRL documents.
+
+This module handles parsing of XBRL label linkbases and extracting
+element labels for display purposes.
+"""
+
+from pathlib import Path
+from typing import Dict, Union
+
+from lxml import etree as ET
+
+from edgar.xbrl.core import STANDARD_LABEL, extract_element_id
+from edgar.xbrl.models import ElementCatalog, XBRLProcessingError
+
+from .base import BaseParser
+
+
+class LabelsParser(BaseParser):
+    """Parser for XBRL label linkbases."""
+
+    def __init__(self, element_catalog: Dict[str, ElementCatalog]):
+        """
+        Initialize labels parser with data structure references.
+
+        Args:
+            element_catalog: Reference to element catalog dictionary
+        """
+        super().__init__()
+
+        # Store references to data structures
+        self.element_catalog = element_catalog
+
+    def parse_labels(self, file_path: Union[str, Path]) -> None:
+        """Parse label linkbase file and extract label information."""
+        try:
+            content = Path(file_path).read_text()
+            self.parse_labels_content(content)
+        except Exception as e:
+            raise XBRLProcessingError(f"Error parsing label file {file_path}: {str(e)}") from e
+
+    def parse_labels_content(self, content: str) -> None:
+        """Parse label linkbase content and extract label information."""
+        try:
+            # Optimize: Register namespaces for faster XPath lookups
+            nsmap = {
+                'link': 'http://www.xbrl.org/2003/linkbase',
+                'xlink': 'http://www.w3.org/1999/xlink',
+                'xml': 'http://www.w3.org/XML/1998/namespace'
+            }
+
+            # Optimize: Use lxml parser with smart string handling
+            parser = ET.XMLParser(remove_blank_text=True, recover=True)
+            root = ET.XML(content.encode('utf-8'), parser)
+
+            # Optimize: Use specific XPath expressions with namespaces for faster lookups
+            # This is much faster than using findall with '//' in element tree
+            label_arcs = root.xpath('//link:labelArc', namespaces=nsmap)
+            labels = root.xpath('//link:label', namespaces=nsmap)
+
+            # Optimize: Pre-allocate dictionary with expected size
+            label_lookup = {}
+
+            # Optimize: Cache attribute lookups
+            xlink_label = '{http://www.w3.org/1999/xlink}label'
+            xlink_role = '{http://www.w3.org/1999/xlink}role'
+            xml_lang = '{http://www.w3.org/XML/1998/namespace}lang'
+            default_role = 'http://www.xbrl.org/2003/role/label'
+
+            # Optimize: Process labels in a single pass with direct attribute access
+            for label in labels:
+                label_id = label.get(xlink_label)
+                if not label_id:
+                    continue
+
+                # Get text first - if empty, skip further processing
+                text = label.text
+                if text is None:
+                    continue
+
+                # Get attributes - direct lookup is faster than method calls
+                role = label.get(xlink_role, default_role)
+                lang = label.get(xml_lang, 'en-US')
+
+                # Create nested dictionaries only when needed
+                if label_id not in label_lookup:
+                    label_lookup[label_id] = {}
+
+                if lang not in label_lookup[label_id]:
+                    label_lookup[label_id][lang] = {}
+
+                label_lookup[label_id][lang][role] = text
+
+            # Optimize: Cache attribute lookups for arcs
+            xlink_from = '{http://www.w3.org/1999/xlink}from'
+            xlink_to = '{http://www.w3.org/1999/xlink}to'
+            xlink_href = '{http://www.w3.org/1999/xlink}href'
+
+            # Optimize: Create a lookup table for locators by label for faster access
+            loc_by_label = {}
+            for loc in root.xpath('//link:loc', namespaces=nsmap):
+                loc_label = loc.get(xlink_label)
+                if loc_label:
+                    loc_by_label[loc_label] = loc.get(xlink_href)
+
+            # Connect labels to elements using arcs - with optimized lookups
+            for arc in label_arcs:
+                from_ref = arc.get(xlink_from)
+                to_ref = arc.get(xlink_to)
+
+                if not from_ref or not to_ref or to_ref not in label_lookup:
+                    continue
+
+                # Use cached locator lookup instead of expensive XPath
+                href = loc_by_label.get(from_ref)
+                if not href:
+                    continue
+
+                # Extract element ID from href
+                element_id = extract_element_id(href)
+
+                # Find labels for this element - check most likely case first
+                if 'en-US' in label_lookup[to_ref]:
+                    element_labels = label_lookup[to_ref]['en-US']
+
+                    # Optimize: Update catalog with minimal overhead
+                    catalog_entry = self.element_catalog.get(element_id)
+                    if catalog_entry:
+                        catalog_entry.labels.update(element_labels)
+                    else:
+                        # Create placeholder in catalog
+                        self.element_catalog[element_id] = ElementCatalog(
+                            name=element_id,
+                            data_type="",
+                            period_type="duration",
+                            labels=element_labels
+                        )
+
+        except Exception as e:
+            raise XBRLProcessingError(f"Error parsing label content: {str(e)}") from e
+
+    def get_element_label(self, element_id: str) -> str:
+        """Get the label for an element, falling back to the element ID if not found."""
+        if element_id in self.element_catalog and self.element_catalog[element_id].labels:
+            # Use standard label if available
+            standard_label = self.element_catalog[element_id].labels.get(STANDARD_LABEL)
+            if standard_label:
+                return standard_label
+        return element_id  # Fallback to element ID