"""
Schema parser for XBRL documents.

This module handles parsing of XBRL taxonomy schemas and element catalog
creation with element definitions and properties.
"""

from pathlib import Path
from typing import Any, Dict, Union

from lxml import etree as ET

from edgar.core import log
from edgar.xbrl.models import ElementCatalog, XBRLProcessingError

from .base import BaseParser

# Namespace URIs used throughout the parser.
_XSD_URI = 'http://www.w3.org/2001/XMLSchema'
_XBRLI_URI = 'http://www.xbrl.org/2003/instance'
_LINK_URI = 'http://www.xbrl.org/2003/linkbase'

# Clark-notation prefixes ("{uri}") for ElementTree-style lookups.
_XSD = '{' + _XSD_URI + '}'
_XBRLI = '{' + _XBRLI_URI + '}'


class SchemaParser(BaseParser):
    """Parser for XBRL taxonomy schemas."""

    def __init__(self, element_catalog: Dict[str, ElementCatalog]):
        """
        Initialize schema parser with data structure references.

        Args:
            element_catalog: Reference to element catalog dictionary. Entries
                are written into this dictionary in place while parsing.
        """
        super().__init__()

        # Store references to data structures
        self.element_catalog = element_catalog

        # Callables for embedded linkbase content; they stay None (and the
        # corresponding embedded linkbase is skipped) until
        # set_linkbase_parsers() is called.
        self.parse_labels_content = None
        self.parse_presentation_content = None
        self.parse_calculation_content = None
        self.parse_definition_content = None

    def set_linkbase_parsers(self, labels_parser, presentation_parser,
                             calculation_parser, definition_parser):
        """
        Set references to other parsers for embedded linkbase processing.

        Only the bound ``parse_*_content`` method of each parser is stored.

        Args:
            labels_parser: LabelsParser instance
            presentation_parser: PresentationParser instance
            calculation_parser: CalculationParser instance
            definition_parser: DefinitionParser instance
        """
        self.parse_labels_content = labels_parser.parse_labels_content
        self.parse_presentation_content = presentation_parser.parse_presentation_content
        self.parse_calculation_content = calculation_parser.parse_calculation_content
        self.parse_definition_content = definition_parser.parse_definition_content

    def parse_schema(self, file_path: Union[str, Path]) -> None:
        """
        Parse schema file and extract element information.

        Args:
            file_path: Path of the schema file to read.

        Raises:
            XBRLProcessingError: If the file cannot be read or parsed.
        """
        try:
            # Decode explicitly as UTF-8 (the XML default) rather than with
            # the platform's locale encoding, so results do not vary by host.
            content = Path(file_path).read_text(encoding='utf-8')
            self.parse_schema_content(content)
        except Exception as e:
            raise XBRLProcessingError(f"Error parsing schema file {file_path}: {e}") from e

    def parse_schema_content(self, content: str) -> None:
        """
        Parse schema content and extract element information.

        Every ``xsd:element`` declaration with an ``id`` or ``name`` becomes an
        entry in ``self.element_catalog``. Embedded linkbases found in the
        schema are then handed to the linkbase parsers registered through
        ``set_linkbase_parsers`` (unregistered ones are skipped).

        Args:
            content: XML text of the schema.

        Raises:
            XBRLProcessingError: If anything fails while parsing.
        """
        try:
            # Use the safe XML parsing helper
            root = self._safe_parse_xml(content)

            # Extract element declarations
            for element in root.findall(f'.//{_XSD}element'):
                element_id = element.get('id') or element.get('name')
                if not element_id:
                    continue

                data_type = element.get('type', '')
                abstract = element.get('abstract', 'false').lower() == 'true'

                # First check as attributes on the element (modern XBRL style)
                balance_type = element.get(f'{_XBRLI}balance')
                period_type = element.get(f'{_XBRLI}periodType')

                # If not found as attributes, look in nested annotations
                # (legacy style); the first value found wins.
                if not balance_type or not period_type:
                    annotation = element.find(f'.//{_XSD}annotation')
                    if annotation is not None:
                        for appinfo in annotation.findall(f'.//{_XSD}appinfo'):
                            if not balance_type:
                                balance_element = appinfo.find(f'.//{_XBRLI}balance')
                                if balance_element is not None:
                                    balance_type = balance_element.text
                            if not period_type:
                                period_element = appinfo.find(f'.//{_XBRLI}periodType')
                                if period_element is not None:
                                    period_type = period_element.text

                # Create element catalog entry
                self.element_catalog[element_id] = ElementCatalog(
                    name=element_id,
                    data_type=data_type,
                    period_type=period_type or "duration",  # Default to duration
                    balance=balance_type,
                    abstract=abstract,
                    labels={}
                )

            # Extract embedded linkbases if present
            embedded = self._extract_embedded_linkbases(content) or {}
            linkbases = embedded.get('linkbases') or {}

            # Dispatch each embedded linkbase to its parser, in a fixed order.
            handlers = (
                ('label', self.parse_labels_content),
                ('presentation', self.parse_presentation_content),
                ('calculation', self.parse_calculation_content),
                ('definition', self.parse_definition_content),
            )
            for linkbase_type, handler in handlers:
                if linkbase_type in linkbases and handler:
                    handler(linkbases[linkbase_type])

        except Exception as e:
            raise XBRLProcessingError(f"Error parsing schema content: {e}") from e

    def _extract_embedded_linkbases(self, schema_content: str) -> Dict[str, Dict[str, Any]]:
        """
        Extract embedded linkbases and role types from the schema file.

        Args:
            schema_content: XML content of the schema file

        Returns:
            Dictionary with two keys:

            * ``'linkbases'``: maps a linkbase type (``'presentation'``,
              ``'label'``, ``'calculation'``, ``'definition'``) to an XML
              string holding the links of that type wrapped in the original
              linkbase root element.
            * ``'role_types'``: maps each role URI to a dict with ``'id'``,
              ``'definition'`` and ``'used_on'`` entries.

            Errors are logged rather than raised; whatever was collected
            before the error is returned.
        """
        embedded_data: Dict[str, Dict[str, Any]] = {
            'linkbases': {},
            'role_types': {}
        }

        try:
            # Use the safe XML parsing helper
            root = self._safe_parse_xml(schema_content)

            # Namespace map for use with XPath
            nsmap = {
                'xsd': _XSD_URI,
                'link': _LINK_URI
            }

            for appinfo in root.xpath('.//xsd:appinfo', namespaces=nsmap):
                # Extract role types
                for role_type in appinfo.xpath('./link:roleType', namespaces=nsmap):
                    role_uri = role_type.get('roleURI')
                    if not role_uri:
                        continue

                    definition = role_type.find('./link:definition', nsmap)
                    # An empty <link:definition/> has text None; normalise to "".
                    definition_text = (definition.text or "") if definition is not None else ""

                    used_on = [
                        elem.text
                        for elem in role_type.xpath('./link:usedOn', namespaces=nsmap)
                        if elem.text
                    ]

                    embedded_data['role_types'][role_uri] = {
                        'id': role_type.get('id'),
                        'definition': definition_text,
                        'used_on': used_on
                    }

                # Find the linkbase element directly under this appinfo
                linkbase = appinfo.find('./link:linkbase', nsmap)
                if linkbase is None:
                    continue

                # Serialize the whole linkbase once; only its start tag (with
                # the namespace declarations) is reused as a wrapper.
                linkbase_string = ET.tostring(linkbase, encoding='unicode', method='xml')
                linkbase_header = linkbase_string.split('>', 1)[0] + '>'

                # Recover the qualified root name (e.g. "link:linkbase") from
                # the start tag so the wrapper can be closed with a matching
                # end tag, whatever prefix the serializer chose.
                root_qname = linkbase_header[1:-1].split(None, 1)[0].rstrip('/')
                linkbase_footer = f"</{root_qname}>"

                for linkbase_type in ('presentation', 'label', 'calculation', 'definition'):
                    linkbase_elements = linkbase.xpath(
                        f'./link:{linkbase_type}Link', namespaces=nsmap
                    )
                    if not linkbase_elements:
                        continue

                    # Convert all link elements of this type to strings
                    linkbase_strings = [
                        ET.tostring(elem, encoding='unicode', method='xml')
                        for elem in linkbase_elements
                    ]

                    # Wrap them in the original root element. The end tag is
                    # required for the result to be well-formed XML.
                    embedded_data['linkbases'][linkbase_type] = (
                        f"{linkbase_header}\n"
                        + '\n'.join(linkbase_strings)
                        + f"\n{linkbase_footer}"
                    )

            return embedded_data
        except Exception as e:
            # Log the error but don't fail - just return what was collected
            log.warning(f"Warning: Error extracting embedded linkbases: {str(e)}")
            return embedded_data