"""
Document builder that converts parsed HTML tree into document nodes.
"""
from typing import Dict, Any, Optional
from lxml.html import HtmlElement
from edgar.documents.config import ParserConfig
from edgar.documents.nodes import (
Node, DocumentNode, TextNode, ParagraphNode, HeadingNode,
ContainerNode, SectionNode, ListNode, ListItemNode, LinkNode, ImageNode
)
from edgar.documents.strategies.style_parser import StyleParser
from edgar.documents.table_nodes import TableNode, Cell, Row
from edgar.documents.types import Style, ParseContext, SemanticType
class DocumentBuilder:
"""
Builds Document node tree from parsed HTML.
Handles the conversion of HTML elements into structured nodes
with proper hierarchy and metadata.
"""
# Block-level elements
BLOCK_ELEMENTS = {
'div', 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
'ul', 'ol', 'li', 'blockquote', 'pre', 'hr',
'table', 'form', 'fieldset', 'address', 'section',
'article', 'aside', 'nav', 'header', 'footer', 'main'
}
# Inline elements
INLINE_ELEMENTS = {
'span', 'a', 'em', 'strong', 'b', 'i', 'u', 's',
'small', 'mark', 'del', 'ins', 'sub', 'sup',
'code', 'kbd', 'var', 'samp', 'abbr', 'cite',
'q', 'time', 'font',
# IXBRL inline elements for simple values - should not break text flow
'ix:nonfraction', 'ix:footnote', 'ix:fraction'
}
# Elements to skip
SKIP_ELEMENTS = {
'script', 'style', 'meta', 'link', 'noscript',
# IXBRL exclude elements - content that should not appear in final document
'ix:exclude'
}
def __init__(self, config: ParserConfig, strategies: Dict[str, Any]):
"""
Initialize document builder.
Args:
config: Parser configuration
strategies: Dictionary of parsing strategies
"""
self.config = config
self.strategies = strategies
self.style_parser = StyleParser()
self.context = ParseContext()
# Track XBRL context
self.xbrl_context_stack = []
self.xbrl_continuations = {}
def build(self, tree: HtmlElement) -> DocumentNode:
"""
Build document from HTML tree.
Args:
tree: Parsed HTML tree
Returns:
Document root node
"""
# Create root document node
root = DocumentNode()
# Find body element
body = tree.find('.//body')
if body is None:
# If no body, use the entire tree
body = tree
# Process body content
self._process_element(body, root)
# Apply node merging if configured
if self.config.merge_adjacent_nodes:
self._merge_adjacent_nodes(root)
return root
def _process_element(self, element: HtmlElement, parent: Node) -> Optional[Node]:
"""
Process HTML element into node.
Args:
element: HTML element to process
parent: Parent node
Returns:
Created node or None if skipped
"""
# Skip certain elements but preserve their tail text
if element.tag in self.SKIP_ELEMENTS:
# Process tail text even when skipping element
if element.tail:
if self.config.preserve_whitespace:
text_node = TextNode(content=element.tail)
parent.add_child(text_node)
else:
if element.tail.strip():
text_node = TextNode(content=element.tail.strip())
parent.add_child(text_node)
return None
# Skip page number containers
if self._is_page_number_container(element):
return None
# Skip page break elements
if self._is_page_break_element(element):
return None
# Skip navigation containers that follow page breaks
if self._is_page_navigation_container(element):
return None
# Track parsing depth
self.context.depth += 1
try:
# Handle XBRL elements
if element.tag.startswith('{'): # Namespaced element
self._enter_xbrl_context(element)
# Extract style
style = self._extract_style(element)
# Create appropriate node based on element type
node = self._create_node_for_element(element, style)
if node:
# Add XBRL metadata if in context
if self.xbrl_context_stack:
node.metadata.update(self._get_current_xbrl_metadata())
# Add to parent
parent.add_child(node)
# Process children for container nodes
if self._should_process_children(element, node):
# Add element's direct text first
if element.text:
if self.config.preserve_whitespace:
if element.text: # Don't strip whitespace
text_node = TextNode(content=element.text)
node.add_child(text_node)
else:
if element.text.strip():
text_node = TextNode(content=element.text.strip())
node.add_child(text_node)
# Process child elements
for child in element:
self._process_element(child, node)
# Process text after children
if element.tail:
if self.config.preserve_whitespace:
text_node = TextNode(content=element.tail)
parent.add_child(text_node)
else:
if element.tail.strip():
text_node = TextNode(content=element.tail.strip())
parent.add_child(text_node)
elif element.tail.isspace():
# Even if tail is just whitespace, preserve the spacing info
# This helps with inline element spacing decisions
if hasattr(node, 'set_metadata'):
node.set_metadata('has_tail_whitespace', True)
else:
# Node created but children not processed - still need to handle tail
if element.tail:
if self.config.preserve_whitespace:
text_node = TextNode(content=element.tail)
parent.add_child(text_node)
else:
if element.tail.strip():
text_node = TextNode(content=element.tail.strip())
parent.add_child(text_node)
elif element.tail.isspace():
# Even if tail is just whitespace, preserve the spacing info
if hasattr(node, 'set_metadata'):
node.set_metadata('has_tail_whitespace', True)
else:
# No node created, process children with same parent
for child in element:
self._process_element(child, parent)
# Process tail text
if element.tail:
if self.config.preserve_whitespace:
text_node = TextNode(content=element.tail)
parent.add_child(text_node)
else:
if element.tail.strip():
text_node = TextNode(content=element.tail.strip())
parent.add_child(text_node)
# Exit XBRL context
if element.tag.startswith('{'):
self._exit_xbrl_context(element)
return node
finally:
self.context.depth -= 1
def _create_node_for_element(self, element: HtmlElement, style: Style) -> Optional[Node]:
"""Create appropriate node for HTML element."""
tag = element.tag.lower() if not element.tag.startswith('{') else element.tag
# Check for heading
if tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
level = int(tag[1])
text = self._get_element_text(element)
if text:
return HeadingNode(content=text, level=level, style=style)
# Handle specific elements first before header detection
if tag == 'p':
return ParagraphNode(style=style)
elif tag == 'li':
return ListItemNode(style=style)
# Check if element might be a heading based on style/content
# Skip header detection for certain tags that should never be headers
skip_header_detection_tags = {
'li', 'td', 'th', 'option', 'a', 'button', 'label',
# IXBRL inline elements - should not be treated as headers
'ix:nonfraction', 'ix:footnote', 'ix:fraction',
# IXBRL elements that can contain tables and complex content
'ix:nonNumeric', 'ix:continuation'
}
if tag not in skip_header_detection_tags and self.strategies.get('header_detection'):
header_info = self.strategies['header_detection'].detect(element, self.context)
if header_info and header_info.confidence > self.config.header_detection_threshold:
text = self._get_element_text(element)
if text:
node = HeadingNode(
content=text,
level=header_info.level,
style=style
)
# Add header metadata
node.set_metadata('detection_method', header_info.detection_method)
node.set_metadata('confidence', header_info.confidence)
if header_info.is_item:
node.semantic_type = SemanticType.ITEM_HEADER
node.set_metadata('item_number', header_info.item_number)
return node
# Continue handling other specific elements
if tag == 'table':
if self.strategies.get('table_processing'):
return self.strategies['table_processing'].process(element)
else:
return self._process_table_basic(element, style)
elif tag in ['ul', 'ol']:
return ListNode(ordered=(tag == 'ol'), style=style)
elif tag == 'li':
return ListItemNode(style=style)
elif tag == 'a':
href = element.get('href', '')
title = element.get('title', '')
text = self._get_element_text(element)
return LinkNode(content=text, href=href, title=title, style=style)
elif tag == 'img':
return ImageNode(
src=element.get('src'),
alt=element.get('alt'),
width=self._parse_dimension(element.get('width')),
height=self._parse_dimension(element.get('height')),
style=style
)
elif tag == 'br':
# Line break - add as text node
return TextNode(content='\n')
elif tag in ['section', 'article']:
return SectionNode(style=style)
elif tag == 'div' or tag in self.BLOCK_ELEMENTS:
# Check if CSS display property makes this inline
if style.display in ['inline', 'inline-block']:
# Treat as inline element despite being a div
text = self._get_element_text(element)
if text:
text_node = TextNode(content=text, style=style)
text_node.set_metadata('original_tag', tag)
text_node.set_metadata('inline_via_css', True)
return text_node
# If no text but inline, still process children inline
return ContainerNode(tag_name=tag, style=style)
# Normal block behavior
# Check if this is just a text container with only inline elements
if self._is_text_only_container(element):
# Create ParagraphNode for divs containing only inline elements
# This ensures proper text concatenation for spans, etc.
return ParagraphNode(style=style)
else:
return ContainerNode(tag_name=tag, style=style)
elif tag in self.INLINE_ELEMENTS:
# Inline elements - extract text and add to parent
text = self._get_element_text(element)
if text:
text_node = TextNode(content=text, style=style)
# Preserve inline element metadata
text_node.set_metadata('original_tag', tag)
return text_node
elif tag in ['ix:nonNumeric', 'ix:continuation']:
# IXBRL elements that can contain complex content including tables
# Process as container to allow proper table parsing
return ContainerNode(tag_name=tag, style=style)
# Default: create container for unknown elements
return ContainerNode(tag_name=tag, style=style)
def _is_page_number_container(self, element: HtmlElement) -> bool:
"""Detect and filter page number containers across various SEC filing patterns."""
import re
# Get text content first - all page numbers should be short
text_content = element.text_content().strip()
# Must be short content (1-8 chars to handle "Page X" format)
if len(text_content) > 8 or len(text_content) == 0:
return False
# Must be numeric, roman numerals, or "Page X" format
if not self._is_page_number_content(text_content):
return False
# Check various patterns based on element type and styling
tag = element.tag.lower()
# Pattern 1: Oracle-style flexbox containers (highest confidence)
if tag == 'div' and self._is_flexbox_page_number(element):
return True
# Pattern 2: Center/right aligned paragraphs (common pattern)
if tag == 'p' and self._is_aligned_page_number(element):
return True
# Pattern 3: Footer-style divs with centered page numbers
if tag == 'div' and self._is_footer_page_number(element):
return True
# Pattern 4: Simple divs with page break context
if tag == 'div' and self._is_page_break_context(element):
return True
return False
def _is_page_number_content(self, text: str) -> bool:
"""Check if text content looks like a page number."""
import re
# Simple numeric (most common)
if text.isdigit():
return True
# Roman numerals
if re.match(r'^[ivxlcdm]+$', text.lower()):
return True
# "Page X" or "Page X of Y" format
if re.match(r'^page\s+\d+(\s+of\s+\d+)?$', text.lower()):
return True
return False
def _is_flexbox_page_number(self, element: HtmlElement) -> bool:
"""Detect Oracle-style flexbox page number containers."""
import re
style_attr = element.get('style', '')
if not style_attr:
return False
# Must have: display:flex, justify-content:flex-end, min-height:1in
required_patterns = [
r'display:\s*flex',
r'justify-content:\s*flex-end',
r'min-height:\s*1in'
]
return all(re.search(pattern, style_attr) for pattern in required_patterns)
def _is_aligned_page_number(self, element: HtmlElement) -> bool:
"""Detect center or right-aligned page number paragraphs."""
import re
style_attr = element.get('style', '')
# Check for center or right alignment
alignment_pattern = r'text-align:\s*(center|right)'
if not re.search(alignment_pattern, style_attr):
return False
# Optional: check for smaller font size (common in page numbers)
font_size_pattern = r'font-size:\s*([0-9]+)pt'
font_match = re.search(font_size_pattern, style_attr)
if font_match:
font_size = int(font_match.group(1))
# Page numbers often use smaller fonts (8-12pt)
if font_size <= 12:
return True
return True # Any center/right aligned short content
def _is_footer_page_number(self, element: HtmlElement) -> bool:
"""Detect footer-style page number containers."""
import re
style_attr = element.get('style', '')
# Look for bottom positioning or footer-like styling
footer_patterns = [
r'bottom:\s*[0-9]',
r'position:\s*absolute',
r'margin-bottom:\s*0',
r'text-align:\s*center'
]
# Need at least 2 footer indicators
matches = sum(1 for pattern in footer_patterns if re.search(pattern, style_attr))
return matches >= 2
def _is_page_break_context(self, element: HtmlElement) -> bool:
"""Check if element is near page breaks (common page number context)."""
# Check next sibling for page break HR
next_elem = element.getnext()
if next_elem is not None and next_elem.tag == 'hr':
hr_style = next_elem.get('style', '')
if 'page-break' in hr_style:
return True
# Check if element has page-break styling itself
style_attr = element.get('style', '')
if 'page-break' in style_attr:
return True
return False
def _is_page_break_element(self, element: HtmlElement) -> bool:
"""Detect page break HR elements."""
if element.tag.lower() != 'hr':
return False
style_attr = element.get('style', '')
# Check for page-break-after:always or similar page break styles
return 'page-break' in style_attr
def _is_page_navigation_container(self, element: HtmlElement) -> bool:
"""Detect navigation containers that appear after page breaks."""
if element.tag.lower() != 'div':
return False
style_attr = element.get('style', '')
# Check for navigation container patterns
# Often have: padding-top, min-height:1in, box-sizing:border-box
nav_indicators = [
r'padding-top:\s*0\.5in',
r'min-height:\s*1in',
r'box-sizing:\s*border-box'
]
import re
matches = sum(1 for pattern in nav_indicators if re.search(pattern, style_attr))
# Need at least 2 indicators
if matches < 2:
return False
# Check if it contains typical navigation content
text_content = element.text_content().strip().lower()
# Common navigation phrases
nav_phrases = [
'table of contents',
'index to financial statements',
'table of content',
'index to financial statement'
]
return any(phrase in text_content for phrase in nav_phrases)
def _extract_style(self, element: HtmlElement) -> Style:
"""Extract style from element."""
style_str = element.get('style', '')
style = self.style_parser.parse(style_str)
# Add tag-specific styles
tag = element.tag.lower()
if tag == 'b' or tag == 'strong':
style.font_weight = 'bold'
elif tag == 'i' or tag == 'em':
style.font_style = 'italic'
elif tag == 'u':
style.text_decoration = 'underline'
# Handle alignment
align = element.get('align')
if align:
style.text_align = align
return style
def _get_element_text(self, element: HtmlElement) -> str:
"""Get text content from element."""
text_parts = []
# Get element's direct text
if element.text:
# For inline elements, preserve leading/trailing whitespace
if element.tag.lower() in self.INLINE_ELEMENTS:
text_parts.append(element.text)
else:
text_parts.append(element.text.strip())
# For simple elements, get all text content
if element.tag.lower() in self.INLINE_ELEMENTS or \
element.tag.lower() in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
# Get all text including from child elements
for child in element:
if child.tag.lower() not in self.SKIP_ELEMENTS:
child_text = child.text_content()
if child_text:
# For inline elements, preserve whitespace in child content too
if element.tag.lower() in self.INLINE_ELEMENTS:
text_parts.append(child_text)
else:
text_parts.append(child_text.strip())
# For inline elements with preserved whitespace, concatenate directly
# For others, join with spaces
if element.tag.lower() in self.INLINE_ELEMENTS and len(text_parts) == 1:
return text_parts[0] if text_parts else ''
else:
return ' '.join(text_parts)
def _is_text_only_container(self, element: HtmlElement) -> bool:
"""Check if element contains only text and inline elements."""
for child in element:
if child.tag.lower() in self.BLOCK_ELEMENTS:
return False
if child.tag.lower() == 'table':
return False
return True
def _should_process_children(self, element: HtmlElement, node: Node) -> bool:
"""Determine if children should be processed."""
# Don't process children for certain node types
if isinstance(node, (TextNode, HeadingNode)):
return False
# Tables are processed separately
if isinstance(node, TableNode):
return False
return True
def _process_table_basic(self, element: HtmlElement, style: Style) -> TableNode:
"""Basic table processing without advanced strategy."""
table = TableNode(style=style)
# Set config for rendering decisions
table._config = self.config
# Extract caption
caption_elem = element.find('.//caption')
if caption_elem is not None:
table.caption = caption_elem.text_content().strip()
# Process rows
for tr in element.findall('.//tr'):
cells = []
for td in tr.findall('.//td') + tr.findall('.//th'):
cell = Cell(
content=td.text_content().strip(),
colspan=int(td.get('colspan', '1')),
rowspan=int(td.get('rowspan', '1')),
is_header=(td.tag == 'th'),
align=td.get('align')
)
cells.append(cell)
if cells:
row = Row(cells=cells, is_header=(tr.find('.//th') is not None))
# Determine if header or data row
if tr.getparent().tag == 'thead' or row.is_header:
table.headers.append(cells)
else:
table.rows.append(row)
return table
def _parse_dimension(self, value: Optional[str]) -> Optional[int]:
"""Parse dimension value (width/height)."""
if not value:
return None
# Remove 'px' suffix if present
value = value.strip().rstrip('px')
try:
return int(value)
except ValueError:
return None
def _enter_xbrl_context(self, element: HtmlElement):
"""Enter XBRL context."""
if self.config.extract_xbrl and self.strategies.get('xbrl_extraction'):
xbrl_data = self.strategies['xbrl_extraction'].extract_context(element)
if xbrl_data:
self.xbrl_context_stack.append(xbrl_data)
def _exit_xbrl_context(self, element: HtmlElement):
"""Exit XBRL context."""
if self.xbrl_context_stack:
self.xbrl_context_stack.pop()
def _get_current_xbrl_metadata(self) -> Dict[str, Any]:
"""Get current XBRL metadata."""
if not self.xbrl_context_stack:
return {}
# Merge all contexts in stack
metadata = {}
for context in self.xbrl_context_stack:
metadata.update(context)
return metadata
def _merge_adjacent_nodes(self, root: Node):
"""Merge adjacent text nodes with similar styles."""
# Implementation would recursively merge adjacent text nodes
# This is a placeholder for the actual implementation
pass