"""
Document builder that converts parsed HTML tree into document nodes.
"""
|
|
|
|
from typing import Dict, Any, Optional
|
|
|
|
from lxml.html import HtmlElement
|
|
|
|
from edgar.documents.config import ParserConfig
|
|
from edgar.documents.nodes import (
|
|
Node, DocumentNode, TextNode, ParagraphNode, HeadingNode,
|
|
ContainerNode, SectionNode, ListNode, ListItemNode, LinkNode, ImageNode
|
|
)
|
|
from edgar.documents.strategies.style_parser import StyleParser
|
|
from edgar.documents.table_nodes import TableNode, Cell, Row
|
|
from edgar.documents.types import Style, ParseContext, SemanticType
|
|
|
|
|
|
class DocumentBuilder:
    """
    Builds Document node tree from parsed HTML.

    Converts HTML elements into structured nodes with proper hierarchy
    and metadata.  The three tag sets declared here are consulted while
    elements are classified.
    """

    # Tags treated as block-level elements.
    BLOCK_ELEMENTS = {
        'div', 'p', 'blockquote', 'pre', 'hr', 'address',
        'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
        'ul', 'ol', 'li',
        'table', 'form', 'fieldset',
        'section', 'article', 'aside', 'nav', 'header', 'footer', 'main',
    }

    # Tags treated as inline elements.
    INLINE_ELEMENTS = {
        'span', 'a', 'font',
        'em', 'strong', 'b', 'i', 'u', 's',
        'small', 'mark', 'del', 'ins', 'sub', 'sup',
        'code', 'kbd', 'var', 'samp',
        'abbr', 'cite', 'q', 'time',
        # IXBRL inline elements for simple values - should not break text flow
        'ix:nonfraction', 'ix:footnote', 'ix:fraction',
    }

    # Tags whose content is skipped entirely.
    SKIP_ELEMENTS = {
        'script', 'style', 'meta', 'link', 'noscript',
        # IXBRL exclude elements - content that should not appear in final document
        'ix:exclude',
    }
|
|
|
|
def __init__(self, config: ParserConfig, strategies: Dict[str, Any]):
    """
    Set up a builder for one parsing run.

    Args:
        config: Parser configuration
        strategies: Dictionary of parsing strategies, looked up by name
    """
    self.config, self.strategies = config, strategies

    # Helpers created fresh for every builder instance.
    self.style_parser = StyleParser()
    self.context = ParseContext()

    # XBRL bookkeeping: a stack of active contexts plus a continuation map.
    self.xbrl_context_stack = []
    self.xbrl_continuations = {}
|
|
|
|
def build(self, tree: HtmlElement) -> DocumentNode:
    """
    Convert a parsed HTML tree into a document node tree.

    Args:
        tree: Parsed HTML tree

    Returns:
        Document root node
    """
    document_root = DocumentNode()

    # Start from <body> when present; otherwise walk the whole tree.
    body = tree.find('.//body')
    start = tree if body is None else body
    self._process_element(start, document_root)

    # Optional post-processing pass controlled by configuration.
    if self.config.merge_adjacent_nodes:
        self._merge_adjacent_nodes(document_root)

    return document_root
|
|
|
|
def _process_element(self, element: HtmlElement, parent: Node) -> Optional[Node]:
|
|
"""
|
|
Process HTML element into node.
|
|
|
|
Args:
|
|
element: HTML element to process
|
|
parent: Parent node
|
|
|
|
Returns:
|
|
Created node or None if skipped
|
|
"""
|
|
|
|
# Skip certain elements but preserve their tail text
|
|
if element.tag in self.SKIP_ELEMENTS:
|
|
# Process tail text even when skipping element
|
|
if element.tail:
|
|
if self.config.preserve_whitespace:
|
|
text_node = TextNode(content=element.tail)
|
|
parent.add_child(text_node)
|
|
else:
|
|
if element.tail.strip():
|
|
text_node = TextNode(content=element.tail.strip())
|
|
parent.add_child(text_node)
|
|
return None
|
|
|
|
# Skip page number containers
|
|
if self._is_page_number_container(element):
|
|
return None
|
|
|
|
# Skip page break elements
|
|
if self._is_page_break_element(element):
|
|
return None
|
|
|
|
# Skip navigation containers that follow page breaks
|
|
if self._is_page_navigation_container(element):
|
|
return None
|
|
|
|
# Track parsing depth
|
|
self.context.depth += 1
|
|
|
|
try:
|
|
# Handle XBRL elements
|
|
if element.tag.startswith('{'): # Namespaced element
|
|
self._enter_xbrl_context(element)
|
|
|
|
# Extract style
|
|
style = self._extract_style(element)
|
|
|
|
# Create appropriate node based on element type
|
|
node = self._create_node_for_element(element, style)
|
|
|
|
if node:
|
|
# Add XBRL metadata if in context
|
|
if self.xbrl_context_stack:
|
|
node.metadata.update(self._get_current_xbrl_metadata())
|
|
|
|
# Add to parent
|
|
parent.add_child(node)
|
|
|
|
# Process children for container nodes
|
|
if self._should_process_children(element, node):
|
|
# Add element's direct text first
|
|
if element.text:
|
|
if self.config.preserve_whitespace:
|
|
if element.text: # Don't strip whitespace
|
|
text_node = TextNode(content=element.text)
|
|
node.add_child(text_node)
|
|
else:
|
|
if element.text.strip():
|
|
text_node = TextNode(content=element.text.strip())
|
|
node.add_child(text_node)
|
|
|
|
# Process child elements
|
|
for child in element:
|
|
self._process_element(child, node)
|
|
|
|
# Process text after children
|
|
if element.tail:
|
|
if self.config.preserve_whitespace:
|
|
text_node = TextNode(content=element.tail)
|
|
parent.add_child(text_node)
|
|
else:
|
|
if element.tail.strip():
|
|
text_node = TextNode(content=element.tail.strip())
|
|
parent.add_child(text_node)
|
|
elif element.tail.isspace():
|
|
# Even if tail is just whitespace, preserve the spacing info
|
|
# This helps with inline element spacing decisions
|
|
if hasattr(node, 'set_metadata'):
|
|
node.set_metadata('has_tail_whitespace', True)
|
|
else:
|
|
# Node created but children not processed - still need to handle tail
|
|
if element.tail:
|
|
if self.config.preserve_whitespace:
|
|
text_node = TextNode(content=element.tail)
|
|
parent.add_child(text_node)
|
|
else:
|
|
if element.tail.strip():
|
|
text_node = TextNode(content=element.tail.strip())
|
|
parent.add_child(text_node)
|
|
elif element.tail.isspace():
|
|
# Even if tail is just whitespace, preserve the spacing info
|
|
if hasattr(node, 'set_metadata'):
|
|
node.set_metadata('has_tail_whitespace', True)
|
|
else:
|
|
# No node created, process children with same parent
|
|
for child in element:
|
|
self._process_element(child, parent)
|
|
|
|
# Process tail text
|
|
if element.tail:
|
|
if self.config.preserve_whitespace:
|
|
text_node = TextNode(content=element.tail)
|
|
parent.add_child(text_node)
|
|
else:
|
|
if element.tail.strip():
|
|
text_node = TextNode(content=element.tail.strip())
|
|
parent.add_child(text_node)
|
|
|
|
# Exit XBRL context
|
|
if element.tag.startswith('{'):
|
|
self._exit_xbrl_context(element)
|
|
|
|
return node
|
|
|
|
finally:
|
|
self.context.depth -= 1
|
|
|
|
def _create_node_for_element(self, element: HtmlElement, style: Style) -> Optional[Node]:
    """
    Create appropriate node for HTML element.

    Resolution order: explicit ``h1``-``h6`` headings with text, ``p`` and
    ``li``, strategy-based header detection, then the remaining specific
    tags (table, lists, links, images, line breaks, sections), block
    elements, inline elements, and finally a generic container.

    Args:
        element: HTML element to convert
        style: Style already extracted for the element

    Returns:
        The created node.  Whatever the ``table_processing`` strategy
        returns is passed through unchanged.
    """
    raw_tag = element.tag
    # Namespaced tags ('{uri}name') keep their case; everything else is lowercased.
    tag = raw_tag if raw_tag.startswith('{') else raw_tag.lower()

    # Explicit headings; an empty heading falls through to the checks below.
    if tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6'):
        text = self._get_element_text(element)
        if text:
            return HeadingNode(content=text, level=int(tag[1]), style=style)

    # Handle specific elements first before header detection
    if tag == 'p':
        return ParagraphNode(style=style)
    if tag == 'li':
        return ListItemNode(style=style)

    # Tags that should never be promoted to headers.  Entries are lowercase
    # because ``tag`` is lowercased above; a mixed-case entry such as
    # 'ix:nonNumeric' could never match.
    skip_header_detection_tags = {
        'td', 'th', 'option', 'a', 'button', 'label',
        # IXBRL inline elements - should not be treated as headers
        'ix:nonfraction', 'ix:footnote', 'ix:fraction',
        # IXBRL elements that can contain tables and complex content
        'ix:nonnumeric', 'ix:continuation',
    }
    header_detector = self.strategies.get('header_detection')
    if tag not in skip_header_detection_tags and header_detector:
        header_info = header_detector.detect(element, self.context)
        if header_info and header_info.confidence > self.config.header_detection_threshold:
            text = self._get_element_text(element)
            if text:
                node = HeadingNode(content=text, level=header_info.level, style=style)
                # Add header metadata
                node.set_metadata('detection_method', header_info.detection_method)
                node.set_metadata('confidence', header_info.confidence)
                if header_info.is_item:
                    node.semantic_type = SemanticType.ITEM_HEADER
                    node.set_metadata('item_number', header_info.item_number)
                return node

    # Continue handling other specific elements
    if tag == 'table':
        table_strategy = self.strategies.get('table_processing')
        if table_strategy:
            return table_strategy.process(element)
        return self._process_table_basic(element, style)

    if tag in ('ul', 'ol'):
        return ListNode(ordered=(tag == 'ol'), style=style)

    if tag == 'a':
        return LinkNode(
            content=self._get_element_text(element),
            href=element.get('href', ''),
            title=element.get('title', ''),
            style=style,
        )

    if tag == 'img':
        return ImageNode(
            src=element.get('src'),
            alt=element.get('alt'),
            width=self._parse_dimension(element.get('width')),
            height=self._parse_dimension(element.get('height')),
            style=style,
        )

    if tag == 'br':
        # Line break - add as text node
        return TextNode(content='\n')

    if tag in ('section', 'article'):
        return SectionNode(style=style)

    if tag == 'div' or tag in self.BLOCK_ELEMENTS:
        # A CSS display of inline/inline-block makes a block tag behave inline.
        if style.display in ['inline', 'inline-block']:
            text = self._get_element_text(element)
            if text:
                text_node = TextNode(content=text, style=style)
                text_node.set_metadata('original_tag', tag)
                text_node.set_metadata('inline_via_css', True)
                return text_node
            # If no text but inline, still process children inline
            return ContainerNode(tag_name=tag, style=style)

        # Normal block behavior: a block holding only inline content becomes
        # a paragraph so its text pieces are concatenated together.
        if self._is_text_only_container(element):
            return ParagraphNode(style=style)
        return ContainerNode(tag_name=tag, style=style)

    if tag in self.INLINE_ELEMENTS:
        # Inline elements - extract text and add to parent
        text = self._get_element_text(element)
        if text:
            text_node = TextNode(content=text, style=style)
            # Preserve inline element metadata
            text_node.set_metadata('original_tag', tag)
            return text_node

    # Default: container for empty inline elements, IXBRL block elements
    # (ix:nonnumeric, ix:continuation) and unknown tags, so that their
    # children are still processed.
    return ContainerNode(tag_name=tag, style=style)
|
|
|
|
def _is_page_number_container(self, element: HtmlElement) -> bool:
|
|
"""Detect and filter page number containers across various SEC filing patterns."""
|
|
import re
|
|
|
|
# Get text content first - all page numbers should be short
|
|
text_content = element.text_content().strip()
|
|
|
|
# Must be short content (1-8 chars to handle "Page X" format)
|
|
if len(text_content) > 8 or len(text_content) == 0:
|
|
return False
|
|
|
|
# Must be numeric, roman numerals, or "Page X" format
|
|
if not self._is_page_number_content(text_content):
|
|
return False
|
|
|
|
# Check various patterns based on element type and styling
|
|
tag = element.tag.lower()
|
|
|
|
# Pattern 1: Oracle-style flexbox containers (highest confidence)
|
|
if tag == 'div' and self._is_flexbox_page_number(element):
|
|
return True
|
|
|
|
# Pattern 2: Center/right aligned paragraphs (common pattern)
|
|
if tag == 'p' and self._is_aligned_page_number(element):
|
|
return True
|
|
|
|
# Pattern 3: Footer-style divs with centered page numbers
|
|
if tag == 'div' and self._is_footer_page_number(element):
|
|
return True
|
|
|
|
# Pattern 4: Simple divs with page break context
|
|
if tag == 'div' and self._is_page_break_context(element):
|
|
return True
|
|
|
|
return False
|
|
|
|
def _is_page_number_content(self, text: str) -> bool:
|
|
"""Check if text content looks like a page number."""
|
|
import re
|
|
|
|
# Simple numeric (most common)
|
|
if text.isdigit():
|
|
return True
|
|
|
|
# Roman numerals
|
|
if re.match(r'^[ivxlcdm]+$', text.lower()):
|
|
return True
|
|
|
|
# "Page X" or "Page X of Y" format
|
|
if re.match(r'^page\s+\d+(\s+of\s+\d+)?$', text.lower()):
|
|
return True
|
|
|
|
return False
|
|
|
|
def _is_flexbox_page_number(self, element: HtmlElement) -> bool:
|
|
"""Detect Oracle-style flexbox page number containers."""
|
|
import re
|
|
|
|
style_attr = element.get('style', '')
|
|
if not style_attr:
|
|
return False
|
|
|
|
# Must have: display:flex, justify-content:flex-end, min-height:1in
|
|
required_patterns = [
|
|
r'display:\s*flex',
|
|
r'justify-content:\s*flex-end',
|
|
r'min-height:\s*1in'
|
|
]
|
|
|
|
return all(re.search(pattern, style_attr) for pattern in required_patterns)
|
|
|
|
def _is_aligned_page_number(self, element: HtmlElement) -> bool:
|
|
"""Detect center or right-aligned page number paragraphs."""
|
|
import re
|
|
|
|
style_attr = element.get('style', '')
|
|
|
|
# Check for center or right alignment
|
|
alignment_pattern = r'text-align:\s*(center|right)'
|
|
if not re.search(alignment_pattern, style_attr):
|
|
return False
|
|
|
|
# Optional: check for smaller font size (common in page numbers)
|
|
font_size_pattern = r'font-size:\s*([0-9]+)pt'
|
|
font_match = re.search(font_size_pattern, style_attr)
|
|
if font_match:
|
|
font_size = int(font_match.group(1))
|
|
# Page numbers often use smaller fonts (8-12pt)
|
|
if font_size <= 12:
|
|
return True
|
|
|
|
return True # Any center/right aligned short content
|
|
|
|
def _is_footer_page_number(self, element: HtmlElement) -> bool:
|
|
"""Detect footer-style page number containers."""
|
|
import re
|
|
|
|
style_attr = element.get('style', '')
|
|
|
|
# Look for bottom positioning or footer-like styling
|
|
footer_patterns = [
|
|
r'bottom:\s*[0-9]',
|
|
r'position:\s*absolute',
|
|
r'margin-bottom:\s*0',
|
|
r'text-align:\s*center'
|
|
]
|
|
|
|
# Need at least 2 footer indicators
|
|
matches = sum(1 for pattern in footer_patterns if re.search(pattern, style_attr))
|
|
return matches >= 2
|
|
|
|
def _is_page_break_context(self, element: HtmlElement) -> bool:
|
|
"""Check if element is near page breaks (common page number context)."""
|
|
|
|
# Check next sibling for page break HR
|
|
next_elem = element.getnext()
|
|
if next_elem is not None and next_elem.tag == 'hr':
|
|
hr_style = next_elem.get('style', '')
|
|
if 'page-break' in hr_style:
|
|
return True
|
|
|
|
# Check if element has page-break styling itself
|
|
style_attr = element.get('style', '')
|
|
if 'page-break' in style_attr:
|
|
return True
|
|
|
|
return False
|
|
|
|
def _is_page_break_element(self, element: HtmlElement) -> bool:
|
|
"""Detect page break HR elements."""
|
|
if element.tag.lower() != 'hr':
|
|
return False
|
|
|
|
style_attr = element.get('style', '')
|
|
|
|
# Check for page-break-after:always or similar page break styles
|
|
return 'page-break' in style_attr
|
|
|
|
def _is_page_navigation_container(self, element: HtmlElement) -> bool:
|
|
"""Detect navigation containers that appear after page breaks."""
|
|
if element.tag.lower() != 'div':
|
|
return False
|
|
|
|
style_attr = element.get('style', '')
|
|
|
|
# Check for navigation container patterns
|
|
# Often have: padding-top, min-height:1in, box-sizing:border-box
|
|
nav_indicators = [
|
|
r'padding-top:\s*0\.5in',
|
|
r'min-height:\s*1in',
|
|
r'box-sizing:\s*border-box'
|
|
]
|
|
|
|
import re
|
|
matches = sum(1 for pattern in nav_indicators if re.search(pattern, style_attr))
|
|
|
|
# Need at least 2 indicators
|
|
if matches < 2:
|
|
return False
|
|
|
|
# Check if it contains typical navigation content
|
|
text_content = element.text_content().strip().lower()
|
|
|
|
# Common navigation phrases
|
|
nav_phrases = [
|
|
'table of contents',
|
|
'index to financial statements',
|
|
'table of content',
|
|
'index to financial statement'
|
|
]
|
|
|
|
return any(phrase in text_content for phrase in nav_phrases)
|
|
|
|
def _extract_style(self, element: HtmlElement) -> Style:
|
|
"""Extract style from element."""
|
|
style_str = element.get('style', '')
|
|
style = self.style_parser.parse(style_str)
|
|
|
|
# Add tag-specific styles
|
|
tag = element.tag.lower()
|
|
if tag == 'b' or tag == 'strong':
|
|
style.font_weight = 'bold'
|
|
elif tag == 'i' or tag == 'em':
|
|
style.font_style = 'italic'
|
|
elif tag == 'u':
|
|
style.text_decoration = 'underline'
|
|
|
|
# Handle alignment
|
|
align = element.get('align')
|
|
if align:
|
|
style.text_align = align
|
|
|
|
return style
|
|
|
|
def _get_element_text(self, element: HtmlElement) -> str:
|
|
"""Get text content from element."""
|
|
text_parts = []
|
|
|
|
# Get element's direct text
|
|
if element.text:
|
|
# For inline elements, preserve leading/trailing whitespace
|
|
if element.tag.lower() in self.INLINE_ELEMENTS:
|
|
text_parts.append(element.text)
|
|
else:
|
|
text_parts.append(element.text.strip())
|
|
|
|
# For simple elements, get all text content
|
|
if element.tag.lower() in self.INLINE_ELEMENTS or \
|
|
element.tag.lower() in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
|
|
# Get all text including from child elements
|
|
for child in element:
|
|
if child.tag.lower() not in self.SKIP_ELEMENTS:
|
|
child_text = child.text_content()
|
|
if child_text:
|
|
# For inline elements, preserve whitespace in child content too
|
|
if element.tag.lower() in self.INLINE_ELEMENTS:
|
|
text_parts.append(child_text)
|
|
else:
|
|
text_parts.append(child_text.strip())
|
|
|
|
# For inline elements with preserved whitespace, concatenate directly
|
|
# For others, join with spaces
|
|
if element.tag.lower() in self.INLINE_ELEMENTS and len(text_parts) == 1:
|
|
return text_parts[0] if text_parts else ''
|
|
else:
|
|
return ' '.join(text_parts)
|
|
|
|
def _is_text_only_container(self, element: HtmlElement) -> bool:
|
|
"""Check if element contains only text and inline elements."""
|
|
for child in element:
|
|
if child.tag.lower() in self.BLOCK_ELEMENTS:
|
|
return False
|
|
if child.tag.lower() == 'table':
|
|
return False
|
|
return True
|
|
|
|
def _should_process_children(self, element: HtmlElement, node: Node) -> bool:
    """
    Decide whether the element's children should be processed.

    Text and heading nodes already carry their text, and tables are
    processed separately, so none of these descend into child elements.
    """
    leaf_like = isinstance(node, (TextNode, HeadingNode))
    if leaf_like or isinstance(node, TableNode):
        return False
    return True
|
|
|
|
def _process_table_basic(self, element: HtmlElement, style: Style) -> TableNode:
    """
    Basic table processing without advanced strategy.

    Builds a TableNode from the ``<caption>``, ``<tr>``, ``<td>`` and
    ``<th>`` descendants of ``element``.  Cells are collected in document
    order, so a row mixing ``<th>`` and ``<td>`` keeps its column order.
    A malformed ``colspan``/``rowspan`` value falls back to 1.

    Args:
        element: The ``<table>`` element
        style: Style already extracted for the table

    Returns:
        The populated TableNode.  Rows inside ``<thead>`` or containing a
        ``<th>`` go to ``table.headers`` (as lists of cells); all other
        non-empty rows go to ``table.rows``.
    """
    def span_of(cell_elem, attr: str) -> int:
        # Filings contain non-numeric span values; treat them as a
        # single-cell span rather than aborting the whole table.
        try:
            return int(cell_elem.get(attr, '1'))
        except (TypeError, ValueError):
            return 1

    table = TableNode(style=style)

    # Set config for rendering decisions
    table._config = self.config

    # Extract caption
    caption_elem = element.find('.//caption')
    if caption_elem is not None:
        table.caption = caption_elem.text_content().strip()

    # Process rows
    for tr in element.findall('.//tr'):
        # iter() walks descendants in document order, which preserves the
        # visual left-to-right order of td/th cells.
        cell_elems = list(tr.iter('td', 'th'))
        if not cell_elems:
            continue

        cells = [
            Cell(
                content=cell_elem.text_content().strip(),
                colspan=span_of(cell_elem, 'colspan'),
                rowspan=span_of(cell_elem, 'rowspan'),
                is_header=(cell_elem.tag == 'th'),
                align=cell_elem.get('align'),
            )
            for cell_elem in cell_elems
        ]
        has_header_cell = any(cell_elem.tag == 'th' for cell_elem in cell_elems)
        row = Row(cells=cells, is_header=has_header_cell)

        # Determine if header or data row
        if tr.getparent().tag == 'thead' or row.is_header:
            table.headers.append(cells)
        else:
            table.rows.append(row)

    return table
|
|
|
|
def _parse_dimension(self, value: Optional[str]) -> Optional[int]:
|
|
"""Parse dimension value (width/height)."""
|
|
if not value:
|
|
return None
|
|
|
|
# Remove 'px' suffix if present
|
|
value = value.strip().rstrip('px')
|
|
|
|
try:
|
|
return int(value)
|
|
except ValueError:
|
|
return None
|
|
|
|
def _enter_xbrl_context(self, element: HtmlElement):
|
|
"""Enter XBRL context."""
|
|
if self.config.extract_xbrl and self.strategies.get('xbrl_extraction'):
|
|
xbrl_data = self.strategies['xbrl_extraction'].extract_context(element)
|
|
if xbrl_data:
|
|
self.xbrl_context_stack.append(xbrl_data)
|
|
|
|
def _exit_xbrl_context(self, element: HtmlElement):
|
|
"""Exit XBRL context."""
|
|
if self.xbrl_context_stack:
|
|
self.xbrl_context_stack.pop()
|
|
|
|
def _get_current_xbrl_metadata(self) -> Dict[str, Any]:
|
|
"""Get current XBRL metadata."""
|
|
if not self.xbrl_context_stack:
|
|
return {}
|
|
|
|
# Merge all contexts in stack
|
|
metadata = {}
|
|
for context in self.xbrl_context_stack:
|
|
metadata.update(context)
|
|
|
|
return metadata
|
|
|
|
def _merge_adjacent_nodes(self, root: Node):
|
|
"""Merge adjacent text nodes with similar styles."""
|
|
# Implementation would recursively merge adjacent text nodes
|
|
# This is a placeholder for the actual implementation
|
|
pass |