"""
|
|
Schema parser for XBRL documents.
|
|
|
|
This module handles parsing of XBRL taxonomy schemas and element catalog
|
|
creation with element definitions and properties.
|
|
"""
|
|
|
|
from pathlib import Path
|
|
from typing import Dict, Union
|
|
|
|
from lxml import etree as ET
|
|
|
|
from edgar.core import log
|
|
from edgar.xbrl.models import ElementCatalog, XBRLProcessingError
|
|
|
|
from .base import BaseParser
|
|
|
|
|
|
class SchemaParser(BaseParser):
    """Parser for XBRL taxonomy schemas."""

    def __init__(self, element_catalog: Dict[str, ElementCatalog]):
        """
        Initialize schema parser with data structure references.

        Args:
            element_catalog: Reference to element catalog dictionary
        """
        super().__init__()

        # Store references to data structures
        self.element_catalog = element_catalog
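        # Illustrative only: after parsing, this maps a concept id such as
        # "us-gaap_Revenues" (hypothetical example) to an ElementCatalog entry
        # holding its data type, period type, balance, abstract flag and labels.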

        # Will be set by coordinator when needed
        self.parse_labels_content = None
        self.parse_presentation_content = None
        self.parse_calculation_content = None
        self.parse_definition_content = None

    def set_linkbase_parsers(self, labels_parser, presentation_parser, calculation_parser, definition_parser):
        """
        Set references to other parsers for embedded linkbase processing.

        Args:
            labels_parser: LabelsParser instance
            presentation_parser: PresentationParser instance
            calculation_parser: CalculationParser instance
            definition_parser: DefinitionParser instance
        """
        self.parse_labels_content = labels_parser.parse_labels_content
        self.parse_presentation_content = presentation_parser.parse_presentation_content
        self.parse_calculation_content = calculation_parser.parse_calculation_content
        self.parse_definition_content = definition_parser.parse_definition_content

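    # Wiring sketch (hedged -- the surrounding coordinator and the parser
    # constructors shown here are assumptions, not the exact API):
    #
    #   schema_parser = SchemaParser(element_catalog={})
    #   schema_parser.set_linkbase_parsers(labels_parser, presentation_parser,
    #                                      calculation_parser, definition_parser)
    #
    # Once wired, linkbases embedded in the schema's <xsd:appinfo> are routed to
    # the matching parser during parse_schema_content().
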
    def parse_schema(self, file_path: Union[str, Path]) -> None:
        """Parse schema file and extract element information."""
        try:
            content = Path(file_path).read_text()
            self.parse_schema_content(content)
        except Exception as e:
            raise XBRLProcessingError(f"Error parsing schema file {file_path}: {str(e)}") from e

    def parse_schema_content(self, content: str) -> None:
        """Parse schema content and extract element information."""
        try:
            # Use the safe XML parsing helper
            root = self._safe_parse_xml(content)

            # Extract element declarations
            for element in root.findall('.//{http://www.w3.org/2001/XMLSchema}element'):
                element_id = element.get('id') or element.get('name')
                if not element_id:
                    continue

                # Extract element properties
                data_type = element.get('type', '')

                # Check for balance and period type
                # First check as attributes on the element (modern XBRL style)
                balance_type = element.get('{http://www.xbrl.org/2003/instance}balance')
                period_type = element.get('{http://www.xbrl.org/2003/instance}periodType')
                abstract = element.get('abstract', 'false').lower() == 'true'

                # If not found as attributes, look in nested annotations (legacy style)
                if not balance_type or not period_type:
                    annotation = element.find('.//{http://www.w3.org/2001/XMLSchema}annotation')
                    if annotation is not None:
                        for appinfo in annotation.findall('.//{http://www.w3.org/2001/XMLSchema}appinfo'):
                            if not balance_type:
                                balance_element = appinfo.find('.//{http://www.xbrl.org/2003/instance}balance')
                                if balance_element is not None:
                                    balance_type = balance_element.text

                            if not period_type:
                                period_element = appinfo.find('.//{http://www.xbrl.org/2003/instance}periodType')
                                if period_element is not None:
                                    period_type = period_element.text

                # Create element catalog entry
                self.element_catalog[element_id] = ElementCatalog(
                    name=element_id,
                    data_type=data_type,
                    period_type=period_type or "duration",  # Default to duration
                    balance=balance_type,
                    abstract=abstract,
                    labels={}
                )

            # Extract embedded linkbases if present
            embedded_linkbases = self._extract_embedded_linkbases(content)

            # If embedded linkbases were found, parse them
            if embedded_linkbases and 'linkbases' in embedded_linkbases:
                if 'label' in embedded_linkbases['linkbases'] and self.parse_labels_content:
                    label_content = embedded_linkbases['linkbases']['label']
                    self.parse_labels_content(label_content)

                if 'presentation' in embedded_linkbases['linkbases'] and self.parse_presentation_content:
                    presentation_content = embedded_linkbases['linkbases']['presentation']
                    self.parse_presentation_content(presentation_content)

                if 'calculation' in embedded_linkbases['linkbases'] and self.parse_calculation_content:
                    calculation_content = embedded_linkbases['linkbases']['calculation']
                    self.parse_calculation_content(calculation_content)

                if 'definition' in embedded_linkbases['linkbases'] and self.parse_definition_content:
                    definition_content = embedded_linkbases['linkbases']['definition']
                    self.parse_definition_content(definition_content)

        except Exception as e:
            raise XBRLProcessingError(f"Error parsing schema content: {str(e)}") from e

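    # For reference, a typical concept declaration picked up above looks roughly
    # like this (illustrative snippet, not taken from any specific taxonomy):
    #
    #   <xsd:element id="us-gaap_Revenues" name="Revenues"
    #                type="xbrli:monetaryItemType"
    #                substitutionGroup="xbrli:item"
    #                xbrli:balance="credit"
    #                xbrli:periodType="duration"
    #                abstract="false" nillable="true"/>
    #
    # Legacy taxonomies may carry balance/periodType inside an
    # <xsd:annotation>/<xsd:appinfo> block instead, which the fallback branch handles.
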
    def _extract_embedded_linkbases(self, schema_content: str) -> Dict[str, Dict[str, str]]:
        """
        Extract embedded linkbases and role types from the schema file.

        Args:
            schema_content: XML content of the schema file

        Returns:
            Dictionary containing embedded linkbases and role type information
        """
        embedded_data = {
            'linkbases': {},
            'role_types': {}
        }

        try:
            # Use the safe XML parsing helper
            root = self._safe_parse_xml(schema_content)

            # Create namespace map for use with XPath
            nsmap = {
                'xsd': 'http://www.w3.org/2001/XMLSchema',
                'link': 'http://www.xbrl.org/2003/linkbase'
            }

            # Find all appinfo elements using optimized XPath
            for appinfo in root.xpath('.//xsd:appinfo', namespaces=nsmap):
                # Extract role types
                for role_type in appinfo.xpath('./link:roleType', namespaces=nsmap):
                    role_uri = role_type.get('roleURI')
                    role_id = role_type.get('id')

                    # Use optimized XPath to find definition
                    definition = role_type.find('./link:definition', nsmap)
                    definition_text = definition.text if definition is not None else ""

                    # Use optimized XPath to find usedOn elements
                    used_on = [elem.text for elem in role_type.xpath('./link:usedOn', namespaces=nsmap) if elem.text]

                    if role_uri:
                        embedded_data['role_types'][role_uri] = {
                            'id': role_id,
                            'definition': definition_text,
                            'used_on': used_on
                        }

                # Find the linkbase element with optimized XPath
                linkbase = appinfo.find('./link:linkbase', nsmap)
                if linkbase is not None:
                    # Extract the entire linkbase element as a string - with proper encoding
                    linkbase_string = ET.tostring(linkbase, encoding='unicode', method='xml')

                    # Extract each type of linkbase with optimized XPath
                    for linkbase_type in ['presentation', 'label', 'calculation', 'definition']:
                        # Use direct child XPath for better performance
                        xpath_expr = f'./link:{linkbase_type}Link'
                        linkbase_elements = linkbase.xpath(xpath_expr, namespaces=nsmap)

                        if linkbase_elements:
                            # Convert all linkbase elements of this type to strings
                            linkbase_strings = [
                                ET.tostring(elem, encoding='unicode', method='xml')
                                for elem in linkbase_elements
                            ]

                            # Join multiple linkbase elements efficiently
                            linkbase_header = linkbase_string.split('>', 1)[0] + '>'
                            embedded_data['linkbases'][linkbase_type] = (
                                f"{linkbase_header}\n" +
                                '\n'.join(linkbase_strings) +
                                "\n</link:linkbase>"
                            )

            return embedded_data

        except Exception as e:
            # Log the error but don't fail - just return empty embedded data
            log.warning(f"Warning: Error extracting embedded linkbases: {str(e)}")
            return embedded_data
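
# Minimal usage sketch (hedged -- assumes a schema file on disk and that
# ElementCatalog exposes these fields as attributes; the XBRL coordinator
# normally drives this):
#
#   catalog: Dict[str, ElementCatalog] = {}
#   parser = SchemaParser(catalog)
#   parser.parse_schema("company-20231231.xsd")  # hypothetical file name
#   for concept_id, entry in catalog.items():
#       print(concept_id, entry.data_type, entry.period_type, entry.balance)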