Files
2025-12-09 12:13:01 +01:00

670 lines
26 KiB
Python

"""
Document builder that converts parsed HTML tree into document nodes.
"""
from typing import Dict, Any, Optional
from lxml.html import HtmlElement
from edgar.documents.config import ParserConfig
from edgar.documents.nodes import (
Node, DocumentNode, TextNode, ParagraphNode, HeadingNode,
ContainerNode, SectionNode, ListNode, ListItemNode, LinkNode, ImageNode
)
from edgar.documents.strategies.style_parser import StyleParser
from edgar.documents.table_nodes import TableNode, Cell, Row
from edgar.documents.types import Style, ParseContext, SemanticType
class DocumentBuilder:
    """
    Builds Document node tree from parsed HTML.

    Handles the conversion of HTML elements into structured nodes
    with proper hierarchy and metadata.
    """

    # Block-level elements
    BLOCK_ELEMENTS = {
        'div', 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
        'ul', 'ol', 'li', 'blockquote', 'pre', 'hr',
        'table', 'form', 'fieldset', 'address', 'section',
        'article', 'aside', 'nav', 'header', 'footer', 'main'
    }

    # Inline elements
    INLINE_ELEMENTS = {
        'span', 'a', 'em', 'strong', 'b', 'i', 'u', 's',
        'small', 'mark', 'del', 'ins', 'sub', 'sup',
        'code', 'kbd', 'var', 'samp', 'abbr', 'cite',
        'q', 'time', 'font',
        # IXBRL inline elements for simple values - should not break text flow
        'ix:nonfraction', 'ix:footnote', 'ix:fraction'
    }

    # Elements to skip
    SKIP_ELEMENTS = {
        'script', 'style', 'meta', 'link', 'noscript',
        # IXBRL exclude elements - content that should not appear in final document
        'ix:exclude'
    }

    # Native HTML heading tags; the digit at index 1 is the heading level
    _HEADING_TAGS = frozenset({'h1', 'h2', 'h3', 'h4', 'h5', 'h6'})

    def __init__(self, config: ParserConfig, strategies: Dict[str, Any]):
        """
        Initialize document builder.

        Args:
            config: Parser configuration
            strategies: Dictionary of parsing strategies
        """
        self.config = config
        self.strategies = strategies
        self.style_parser = StyleParser()
        self.context = ParseContext()
        # Track XBRL context
        self.xbrl_context_stack = []
        self.xbrl_continuations = {}

    def build(self, tree: HtmlElement) -> DocumentNode:
        """
        Build document from HTML tree.

        Args:
            tree: Parsed HTML tree

        Returns:
            Document root node
        """
        # Create root document node
        root = DocumentNode()

        # Find body element; if there is none, use the entire tree
        body = tree.find('.//body')
        if body is None:
            body = tree

        # Process body content
        self._process_element(body, root)

        # Apply node merging if configured
        if self.config.merge_adjacent_nodes:
            self._merge_adjacent_nodes(root)

        return root

    def _append_tail_text(self, element: HtmlElement, parent: Node,
                          node: Optional[Node] = None) -> None:
        """
        Attach the text that follows ``element`` (its tail) to ``parent``.

        With ``config.preserve_whitespace`` the tail is added verbatim.
        Otherwise it is stripped and added only when non-empty; a
        whitespace-only tail is recorded as ``has_tail_whitespace`` metadata
        on ``node`` (when one is given and supports ``set_metadata``) so
        later stages can make inline spacing decisions.

        Args:
            element: Element whose tail text is processed
            parent: Node that receives the tail text node
            node: Node created for ``element``, if any
        """
        tail = element.tail
        if not tail:
            return

        if self.config.preserve_whitespace:
            parent.add_child(TextNode(content=tail))
            return

        stripped = tail.strip()
        if stripped:
            parent.add_child(TextNode(content=stripped))
        elif tail.isspace() and node is not None and hasattr(node, 'set_metadata'):
            node.set_metadata('has_tail_whitespace', True)

    def _process_element(self, element: HtmlElement, parent: Node) -> Optional[Node]:
        """
        Process HTML element into node.

        Args:
            element: HTML element to process
            parent: Parent node

        Returns:
            Created node or None if skipped
        """
        # Comments and processing instructions expose a non-string ``tag``;
        # they carry no document content, but the text following them does.
        # Explicitly skipped tags are handled the same way.
        if not isinstance(element.tag, str) or element.tag in self.SKIP_ELEMENTS:
            self._append_tail_text(element, parent)
            return None

        # Skip page number containers, page break elements and navigation
        # containers that follow page breaks. Their tail text is not part of
        # the skipped element, so it is still preserved.
        if (self._is_page_number_container(element)
                or self._is_page_break_element(element)
                or self._is_page_navigation_container(element)):
            self._append_tail_text(element, parent)
            return None

        # Track parsing depth
        self.context.depth += 1
        # Remember the stack depth so a pop happens only if this element
        # actually pushed an XBRL context.
        xbrl_depth_before = len(self.xbrl_context_stack)
        try:
            # Handle XBRL elements
            if element.tag.startswith('{'):  # Namespaced element
                self._enter_xbrl_context(element)

            # Extract style
            style = self._extract_style(element)

            # Create appropriate node based on element type
            node = self._create_node_for_element(element, style)

            if node:
                # Add XBRL metadata if in context
                if self.xbrl_context_stack:
                    node.metadata.update(self._get_current_xbrl_metadata())

                # Add to parent
                parent.add_child(node)

                # Process children for container nodes
                if self._should_process_children(element, node):
                    # Add element's direct text first
                    if element.text:
                        if self.config.preserve_whitespace:
                            node.add_child(TextNode(content=element.text))
                        elif element.text.strip():
                            node.add_child(TextNode(content=element.text.strip()))

                    # Process child elements
                    for child in element:
                        self._process_element(child, node)

                # Tail text belongs to the parent whether or not the
                # children were processed
                self._append_tail_text(element, parent, node)
            else:
                # No node created, process children with same parent
                for child in element:
                    self._process_element(child, parent)

                # Process tail text
                self._append_tail_text(element, parent)

            return node
        finally:
            # Exit XBRL context only when this element entered one, and do
            # so even if processing raised, so the stack stays balanced.
            if len(self.xbrl_context_stack) > xbrl_depth_before:
                self._exit_xbrl_context(element)
            self.context.depth -= 1

    def _create_node_for_element(self, element: HtmlElement, style: Style) -> Optional[Node]:
        """
        Create appropriate node for HTML element.

        Args:
            element: HTML element to convert
            style: Style already extracted for the element

        Returns:
            The node representing the element. Unknown elements yield a
            generic ContainerNode.
        """
        # Namespaced tags ("{uri}name") keep their case; others are lowercased
        tag = element.tag if element.tag.startswith('{') else element.tag.lower()

        # Check for heading; an empty heading falls through to block handling
        if tag in self._HEADING_TAGS:
            text = self._get_element_text(element)
            if text:
                return HeadingNode(content=text, level=int(tag[1]), style=style)

        # Handle specific elements first before header detection
        if tag == 'p':
            return ParagraphNode(style=style)
        if tag == 'li':
            return ListItemNode(style=style)

        # Check if element might be a heading based on style/content
        # Skip header detection for certain tags that should never be headers.
        # Entries are lowercase because ``tag`` was lowercased above.
        skip_header_detection_tags = {
            'td', 'th', 'option', 'a', 'button', 'label',
            # IXBRL inline elements - should not be treated as headers
            'ix:nonfraction', 'ix:footnote', 'ix:fraction',
            # IXBRL elements that can contain tables and complex content
            'ix:nonnumeric', 'ix:continuation'
        }
        detector = self.strategies.get('header_detection')
        if tag not in skip_header_detection_tags and detector:
            header_info = detector.detect(element, self.context)
            if header_info and header_info.confidence > self.config.header_detection_threshold:
                text = self._get_element_text(element)
                if text:
                    node = HeadingNode(
                        content=text,
                        level=header_info.level,
                        style=style
                    )
                    # Add header metadata
                    node.set_metadata('detection_method', header_info.detection_method)
                    node.set_metadata('confidence', header_info.confidence)
                    if header_info.is_item:
                        node.semantic_type = SemanticType.ITEM_HEADER
                        node.set_metadata('item_number', header_info.item_number)
                    return node

        # Continue handling other specific elements
        if tag == 'table':
            table_strategy = self.strategies.get('table_processing')
            if table_strategy:
                return table_strategy.process(element)
            return self._process_table_basic(element, style)
        elif tag in ['ul', 'ol']:
            return ListNode(ordered=(tag == 'ol'), style=style)
        elif tag == 'a':
            href = element.get('href', '')
            title = element.get('title', '')
            text = self._get_element_text(element)
            return LinkNode(content=text, href=href, title=title, style=style)
        elif tag == 'img':
            return ImageNode(
                src=element.get('src'),
                alt=element.get('alt'),
                width=self._parse_dimension(element.get('width')),
                height=self._parse_dimension(element.get('height')),
                style=style
            )
        elif tag == 'br':
            # Line break - add as text node
            return TextNode(content='\n')
        elif tag in ['section', 'article']:
            return SectionNode(style=style)
        elif tag in self.BLOCK_ELEMENTS:
            # Check if CSS display property makes this inline
            if style.display in ['inline', 'inline-block']:
                # Treat as inline element despite being a block tag.
                # NOTE(review): for block tags _get_element_text returns only
                # the direct text, and children of a TextNode are not
                # processed, so child content may be dropped here - confirm.
                text = self._get_element_text(element)
                if text:
                    text_node = TextNode(content=text, style=style)
                    text_node.set_metadata('original_tag', tag)
                    text_node.set_metadata('inline_via_css', True)
                    return text_node
                # If no text but inline, still process children inline
                return ContainerNode(tag_name=tag, style=style)

            # Normal block behavior: a block holding only inline content
            # becomes a ParagraphNode so its text is concatenated properly
            if self._is_text_only_container(element):
                return ParagraphNode(style=style)
            return ContainerNode(tag_name=tag, style=style)
        elif tag in self.INLINE_ELEMENTS:
            # Inline elements - extract text and add to parent
            text = self._get_element_text(element)
            if text:
                text_node = TextNode(content=text, style=style)
                # Preserve inline element metadata
                text_node.set_metadata('original_tag', tag)
                return text_node
        elif tag in ['ix:nonnumeric', 'ix:continuation']:
            # IXBRL elements that can contain complex content including tables
            # Process as container to allow proper table parsing
            return ContainerNode(tag_name=tag, style=style)

        # Default: create container for unknown elements
        return ContainerNode(tag_name=tag, style=style)

    def _is_page_number_container(self, element: HtmlElement) -> bool:
        """Detect and filter page number containers across various SEC filing patterns."""
        # Every pattern below applies to a div or a p only, so test the tag
        # before paying for text_content(), which walks the whole subtree.
        tag = element.tag.lower()
        if tag not in ('div', 'p'):
            return False

        # All page numbers are short (1-8 chars to handle "Page X" format)
        text_content = element.text_content().strip()
        if len(text_content) > 8 or len(text_content) == 0:
            return False

        # Must be numeric, roman numerals, or "Page X" format
        if not self._is_page_number_content(text_content):
            return False

        if tag == 'p':
            # Pattern 2: Center/right aligned paragraphs (common pattern)
            return self._is_aligned_page_number(element)

        # Pattern 1: Oracle-style flexbox containers (highest confidence)
        # Pattern 3: Footer-style divs with centered page numbers
        # Pattern 4: Simple divs with page break context
        return (self._is_flexbox_page_number(element)
                or self._is_footer_page_number(element)
                or self._is_page_break_context(element))

    def _is_page_number_content(self, text: str) -> bool:
        """
        Check if text content looks like a page number.

        Accepts plain digits, well-formed roman numerals (case-insensitive)
        and the "Page X" / "Page X of Y" forms.
        """
        import re

        if not text:
            return False

        # Simple numeric (most common)
        if text.isdigit():
            return True

        lowered = text.lower()

        # Roman numerals: require a well-formed numeral so ordinary words
        # made of the letters i, v, x, l, c, d, m are not mistaken for one
        roman = r'm{0,3}(cm|cd|d?c{0,3})(xc|xl|l?x{0,3})(ix|iv|v?i{0,3})'
        if re.fullmatch(roman, lowered):
            return True

        # "Page X" or "Page X of Y" format
        if re.match(r'^page\s+\d+(\s+of\s+\d+)?$', lowered):
            return True

        return False

    def _is_flexbox_page_number(self, element: HtmlElement) -> bool:
        """Detect Oracle-style flexbox page number containers."""
        import re

        style_attr = element.get('style', '')
        if not style_attr:
            return False

        # Must have: display:flex, justify-content:flex-end, min-height:1in
        required_patterns = [
            r'display:\s*flex',
            r'justify-content:\s*flex-end',
            r'min-height:\s*1in'
        ]
        return all(re.search(pattern, style_attr) for pattern in required_patterns)

    def _is_aligned_page_number(self, element: HtmlElement) -> bool:
        """Detect center or right-aligned page number paragraphs."""
        import re

        style_attr = element.get('style', '')
        # Any center/right aligned short content qualifies; font size does
        # not affect the decision.
        return bool(re.search(r'text-align:\s*(center|right)', style_attr))

    def _is_footer_page_number(self, element: HtmlElement) -> bool:
        """Detect footer-style page number containers."""
        import re

        style_attr = element.get('style', '')
        # Look for bottom positioning or footer-like styling
        footer_patterns = [
            r'bottom:\s*[0-9]',
            r'position:\s*absolute',
            r'margin-bottom:\s*0',
            r'text-align:\s*center'
        ]
        # Need at least 2 footer indicators
        matches = sum(1 for pattern in footer_patterns if re.search(pattern, style_attr))
        return matches >= 2

    def _is_page_break_context(self, element: HtmlElement) -> bool:
        """Check if element is near page breaks (common page number context)."""
        # Check next sibling for page break HR
        next_elem = element.getnext()
        if next_elem is not None and next_elem.tag == 'hr':
            if 'page-break' in next_elem.get('style', ''):
                return True

        # Check if element has page-break styling itself
        return 'page-break' in element.get('style', '')

    def _is_page_break_element(self, element: HtmlElement) -> bool:
        """Detect page break HR elements."""
        if element.tag.lower() != 'hr':
            return False
        # Check for page-break-after:always or similar page break styles
        return 'page-break' in element.get('style', '')

    def _is_page_navigation_container(self, element: HtmlElement) -> bool:
        """Detect navigation containers that appear after page breaks."""
        import re

        if element.tag.lower() != 'div':
            return False

        style_attr = element.get('style', '')
        # Check for navigation container patterns
        # Often have: padding-top, min-height:1in, box-sizing:border-box
        nav_indicators = [
            r'padding-top:\s*0\.5in',
            r'min-height:\s*1in',
            r'box-sizing:\s*border-box'
        ]
        matches = sum(1 for pattern in nav_indicators if re.search(pattern, style_attr))
        # Need at least 2 indicators
        if matches < 2:
            return False

        # Check if it contains typical navigation content
        text_content = element.text_content().strip().lower()
        # Common navigation phrases
        nav_phrases = [
            'table of contents',
            'index to financial statements',
            'table of content',
            'index to financial statement'
        ]
        return any(phrase in text_content for phrase in nav_phrases)

    def _extract_style(self, element: HtmlElement) -> Style:
        """Extract style from element's style attribute, tag and align attribute."""
        style_str = element.get('style', '')
        style = self.style_parser.parse(style_str)

        # Add tag-specific styles
        tag = element.tag.lower()
        if tag == 'b' or tag == 'strong':
            style.font_weight = 'bold'
        elif tag == 'i' or tag == 'em':
            style.font_style = 'italic'
        elif tag == 'u':
            style.text_decoration = 'underline'

        # Handle alignment
        align = element.get('align')
        if align:
            style.text_align = align

        return style

    def _get_element_text(self, element: HtmlElement) -> str:
        """
        Get text content from element.

        For inline elements and native headings the text of child elements
        and the text following each child (its tail) is included; for any
        other tag only the element's direct text is returned.

        Inline elements keep their whitespace and the pieces are concatenated
        directly. For all other tags each piece is stripped, empty pieces are
        dropped, and the rest are joined with single spaces.
        """
        tag = element.tag.lower()
        is_inline = tag in self.INLINE_ELEMENTS
        text_parts = []

        def add_part(text):
            # Inline content is kept verbatim; other content is stripped
            if not text:
                return
            if not is_inline:
                text = text.strip()
            if text:
                text_parts.append(text)

        # Get element's direct text
        add_part(element.text)

        # For simple elements, get all text content including child elements
        if is_inline or tag in self._HEADING_TAGS:
            for child in element:
                # Comments/processing instructions have a non-string tag
                if isinstance(child.tag, str):
                    child_tag = child.tag.lower()
                    if child_tag == 'br':
                        # Keep the line break rather than gluing words together
                        add_part('\n')
                    elif child_tag not in self.SKIP_ELEMENTS:
                        add_part(child.text_content())
                # text_content() excludes the tail, which still belongs to
                # this element's text
                add_part(child.tail)

        separator = '' if is_inline else ' '
        return separator.join(text_parts)

    def _is_text_only_container(self, element: HtmlElement) -> bool:
        """Check if element contains only text and inline elements."""
        for child in element:
            # Comments/processing instructions are not block content
            if not isinstance(child.tag, str):
                continue
            # 'table' is part of BLOCK_ELEMENTS, so tables are covered too
            if child.tag.lower() in self.BLOCK_ELEMENTS:
                return False
        return True

    def _should_process_children(self, element: HtmlElement, node: Node) -> bool:
        """Determine if children should be processed."""
        # Text and heading nodes already carry their text; tables are
        # processed separately
        return not isinstance(node, (TextNode, HeadingNode, TableNode))

    @staticmethod
    def _parse_span(value: Optional[str]) -> int:
        """Parse a colspan/rowspan attribute, falling back to 1 when missing or malformed."""
        try:
            return int(value)
        except (TypeError, ValueError):
            return 1

    def _process_table_basic(self, element: HtmlElement, style: Style) -> TableNode:
        """
        Basic table processing without advanced strategy.

        Args:
            element: The table element
            style: Style extracted for the table

        Returns:
            TableNode with caption, header cell lists and data rows filled in
        """
        table = TableNode(style=style)
        # Set config for rendering decisions
        table._config = self.config

        # Extract caption
        caption_elem = element.find('.//caption')
        if caption_elem is not None:
            table.caption = caption_elem.text_content().strip()

        # Process rows
        for tr in element.findall('.//tr'):
            # iter() yields td and th cells in document order, so mixed
            # header/data rows keep their column order
            cells = [
                Cell(
                    content=td.text_content().strip(),
                    colspan=self._parse_span(td.get('colspan')),
                    rowspan=self._parse_span(td.get('rowspan')),
                    is_header=(td.tag == 'th'),
                    align=td.get('align')
                )
                for td in tr.iter('td', 'th')
            ]
            if not cells:
                continue

            row = Row(cells=cells, is_header=(tr.find('.//th') is not None))
            # Determine if header or data row
            if tr.getparent().tag == 'thead' or row.is_header:
                table.headers.append(cells)
            else:
                table.rows.append(row)

        return table

    def _parse_dimension(self, value: Optional[str]) -> Optional[int]:
        """
        Parse dimension value (width/height).

        Args:
            value: Attribute value such as "120" or "120px"

        Returns:
            The integer size, or None when the value is missing or is not a
            plain integer (for example a percentage).
        """
        if not value:
            return None

        # Remove a 'px' suffix if present (case-insensitive)
        cleaned = value.strip()
        if cleaned.lower().endswith('px'):
            cleaned = cleaned[:-2]

        try:
            return int(cleaned)
        except ValueError:
            return None

    def _enter_xbrl_context(self, element: HtmlElement):
        """Enter XBRL context: push extracted data when extraction is enabled and yields any."""
        if self.config.extract_xbrl and self.strategies.get('xbrl_extraction'):
            xbrl_data = self.strategies['xbrl_extraction'].extract_context(element)
            if xbrl_data:
                self.xbrl_context_stack.append(xbrl_data)

    def _exit_xbrl_context(self, element: HtmlElement):
        """Exit XBRL context: pop the innermost entry if the stack is non-empty."""
        if self.xbrl_context_stack:
            self.xbrl_context_stack.pop()

    def _get_current_xbrl_metadata(self) -> Dict[str, Any]:
        """Get current XBRL metadata; inner contexts override outer ones on key clashes."""
        if not self.xbrl_context_stack:
            return {}

        # Merge all contexts in stack
        metadata = {}
        for context in self.xbrl_context_stack:
            metadata.update(context)
        return metadata

    def _merge_adjacent_nodes(self, root: Node):
        """Merge adjacent text nodes with similar styles."""
        # Implementation would recursively merge adjacent text nodes
        # This is a placeholder for the actual implementation
        pass