250 lines
9.5 KiB
Python
250 lines
9.5 KiB
Python
"""
|
|
Presentation parser for XBRL documents.
|
|
|
|
This module handles parsing of XBRL presentation linkbases and building
|
|
presentation trees for financial statement structure.
|
|
"""
|
|
|
|
from pathlib import Path
|
|
from typing import Any, Dict, List, Optional, Union
|
|
|
|
from lxml import etree as ET
|
|
|
|
from edgar.xbrl.core import extract_element_id
|
|
from edgar.xbrl.models import ElementCatalog, PresentationNode, PresentationTree, XBRLProcessingError
|
|
|
|
from .base import BaseParser
|
|
|
|
|
|
class PresentationParser(BaseParser):
    """Parser for XBRL presentation linkbases.

    For every ``link:presentationLink`` element found, the parser records an
    entry in ``presentation_roles`` (keyed by the role URI) and, when the link
    contains usable arcs, stores a ``PresentationTree`` in
    ``presentation_trees`` under the same key. Both dictionaries are supplied
    by the caller and mutated in place.
    """

    def __init__(self, presentation_roles: Dict[str, Dict[str, Any]],
                 presentation_trees: Dict[str, PresentationTree],
                 element_catalog: Dict[str, ElementCatalog]):
        """
        Initialize presentation parser with data structure references.

        Args:
            presentation_roles: Reference to presentation roles dictionary
            presentation_trees: Reference to presentation trees dictionary
            element_catalog: Reference to element catalog dictionary
        """
        super().__init__()

        # Keep references (not copies): parsing writes its results straight
        # into the caller's dictionaries.
        self.presentation_roles = presentation_roles
        self.presentation_trees = presentation_trees
        self.element_catalog = element_catalog

    def parse_presentation(self, file_path: Union[str, Path]) -> None:
        """Parse presentation linkbase file and build presentation trees.

        Args:
            file_path: Location of the presentation linkbase XML file.

        Raises:
            XBRLProcessingError: If the file cannot be read or its content
                cannot be parsed. The original exception is chained.
        """
        try:
            # Decode explicitly as UTF-8 instead of the platform's locale
            # encoding: parse_presentation_content() re-encodes the text as
            # UTF-8 before handing it to lxml, so the two steps must agree.
            content = Path(file_path).read_text(encoding='utf-8')
            self.parse_presentation_content(content)
        except Exception as e:
            raise XBRLProcessingError(f"Error parsing presentation file {file_path}: {str(e)}") from e

    def parse_presentation_content(self, content: str) -> None:
        """Parse presentation linkbase content and build presentation trees.

        Args:
            content: Text of a presentation linkbase XML document.

        Raises:
            XBRLProcessingError: If anything goes wrong while parsing the
                document or building the trees. The original exception is
                chained.
        """
        try:
            nsmap = {
                'link': 'http://www.xbrl.org/2003/linkbase',
                'xlink': 'http://www.w3.org/1999/xlink'
            }

            # Parse from bytes: lxml rejects str input that carries an XML
            # encoding declaration. recover=True makes the parser tolerate
            # malformed markup instead of failing outright.
            parser = ET.XMLParser(remove_blank_text=True, recover=True)
            root = ET.XML(content.encode('utf-8'), parser)
            if root is None:
                # A recovering parser may yield no root at all for unusable
                # input; report that clearly rather than via AttributeError.
                raise ValueError("no XML root element could be recovered")

            presentation_links = root.xpath('//link:presentationLink', namespaces=nsmap)

            # Fully-qualified (Clark notation) names of the xlink attributes.
            xlink_role = '{http://www.w3.org/1999/xlink}role'
            xlink_from = '{http://www.w3.org/1999/xlink}from'
            xlink_to = '{http://www.w3.org/1999/xlink}to'
            xlink_label = '{http://www.w3.org/1999/xlink}label'
            xlink_href = '{http://www.w3.org/1999/xlink}href'

            for link in presentation_links:
                role = link.get(xlink_role)
                if not role:
                    continue

                # The role id is the last path segment of the role URI (the
                # whole string when it has no '/'); the definition is a
                # readable variant with underscores turned into spaces.
                role_id = role.rsplit('/', 1)[-1]
                role_def = role_id.replace('_', ' ')

                self.presentation_roles[role] = {
                    'roleUri': role,
                    'definition': role_def,
                    'roleId': role_id
                }

                # Map each locator label to its href once, so arcs can be
                # resolved with a dictionary lookup instead of an XPath query.
                loc_map = {}
                for loc in link.xpath('.//link:loc', namespaces=nsmap):
                    label = loc.get(xlink_label)
                    if label:
                        loc_map[label] = loc.get(xlink_href)

                relationships = []
                for arc in link.xpath('.//link:presentationArc', namespaces=nsmap):
                    from_ref = arc.get(xlink_from)
                    to_ref = arc.get(xlink_to)
                    if not from_ref or not to_ref:
                        continue

                    # Arcs whose endpoints have no (non-empty) locator href
                    # cannot be resolved to elements and are skipped.
                    from_href = loc_map.get(from_ref)
                    to_href = loc_map.get(to_ref)
                    if not from_href or not to_href:
                        continue

                    # _parse_order_attribute is not defined in this class;
                    # it is expected to come from BaseParser.
                    order = self._parse_order_attribute(arc)
                    preferred_label = arc.get('preferredLabel')

                    relationships.append({
                        'from_element': extract_element_id(from_href),
                        'to_element': extract_element_id(to_href),
                        'order': order,
                        'preferred_label': preferred_label
                    })

                # Only roles with at least one resolvable arc get a tree.
                if relationships:
                    self._build_presentation_tree(role, relationships)

        except Exception as e:
            raise XBRLProcessingError(f"Error parsing presentation content: {str(e)}") from e

    def _build_presentation_tree(self, role: str, relationships: List[Dict[str, Any]]) -> None:
        """
        Build a presentation tree from relationships.

        The tree is stored in ``self.presentation_trees[role]``. Nothing is
        stored when no root element exists (every source element is also the
        target of some relationship).

        Args:
            role: Extended link role URI; must already be a key of
                ``self.presentation_roles``
            relationships: List of relationships (from_element, to_element, order, preferred_label)
        """
        # Group relationships by source element and collect every target.
        from_map: Dict[str, List[Dict[str, Any]]] = {}
        targets = set()
        for rel in relationships:
            from_map.setdefault(rel['from_element'], []).append(rel)
            targets.add(rel['to_element'])

        # Root elements appear as 'from' but never as 'to'. Iterating the
        # dict keeps them in first-appearance order, so the chosen
        # root_element_id and the build order are the same on every run
        # (a set of strings would iterate in a hash-dependent order).
        root_elements = [element_id for element_id in from_map if element_id not in targets]
        if not root_elements:
            return  # No root elements found

        tree = PresentationTree(
            role_uri=role,
            definition=self.presentation_roles[role]['definition'],
            root_element_id=root_elements[0],
            all_nodes={}
        )

        # Every root contributes its subtree to the shared node dictionary.
        for root_id in root_elements:
            self._build_presentation_subtree(root_id, None, 0, from_map, tree.all_nodes)

        self.presentation_trees[role] = tree

    def _build_presentation_subtree(self, element_id: str, parent_id: Optional[str], depth: int,
                                    from_map: Dict[str, List[Dict[str, Any]]],
                                    all_nodes: Dict[str, PresentationNode],
                                    ancestors: Optional[frozenset] = None) -> None:
        """
        Recursively build a presentation subtree.

        A node is created for ``element_id`` and stored in ``all_nodes``
        (replacing any node already stored under that id), then the same is
        done for each of its children in ascending ``order``.

        Args:
            element_id: Current element ID
            parent_id: Parent element ID
            depth: Current depth in tree
            from_map: Map of relationships by source element
            all_nodes: Dictionary to store all nodes
            ancestors: Element IDs on the path from the root down to (but not
                including) this element. Used only to detect cycles; callers
                outside the recursion can leave it unset.
        """
        node = PresentationNode(
            element_id=element_id,
            parent=parent_id,
            children=[],
            depth=depth
        )

        # Add element information if available
        if element_id in self.element_catalog:
            elem_info = self.element_catalog[element_id]
            node.element_name = elem_info.name
            node.standard_label = elem_info.labels.get('http://www.xbrl.org/2003/role/label', elem_info.name)

            # Use enhanced abstract detection (Issue #450 fix)
            # The element catalog may not have correct abstract info for standard taxonomy concepts
            # NOTE(review): imported at call time; presumably this avoids a
            # circular import - confirm before hoisting to module level.
            from edgar.xbrl.abstract_detection import is_abstract_concept
            node.is_abstract = is_abstract_concept(
                concept_name=elem_info.name,
                schema_abstract=elem_info.abstract,
                has_children=False,  # Will be updated after children are processed
                has_values=False  # Will be determined later when facts are loaded
            )

            node.labels = elem_info.labels

        all_nodes[element_id] = node

        if element_id not in from_map:
            return

        # Elements on the current root-to-node path, including this one.
        path = (ancestors or frozenset()) | {element_id}

        for rel in sorted(from_map[element_id], key=lambda r: r['order']):
            child_id = rel['to_element']

            if child_id in path:
                # The arc points back at an ancestor (or at this element).
                # Following it would recurse without end, so the back-edge
                # is dropped and the rest of the tree is still built.
                continue

            node.children.append(child_id)

            self._build_presentation_subtree(
                child_id, element_id, depth + 1, from_map, all_nodes, path
            )

            # The recursive call has just stored the child's node; the arc's
            # preferred label and order are attached to it afterwards.
            child_node = all_nodes[child_id]
            preferred_label = rel['preferred_label']
            if preferred_label:
                child_node.preferred_label = preferred_label
            child_node.order = rel['order']
|