Initial commit
This commit is contained in:
@@ -0,0 +1,15 @@
|
||||
"""
|
||||
Parsing strategies for different content types.
|
||||
"""
|
||||
|
||||
from edgar.documents.strategies.document_builder import DocumentBuilder
|
||||
from edgar.documents.strategies.header_detection import HeaderDetectionStrategy
|
||||
from edgar.documents.strategies.table_processing import TableProcessor
|
||||
from edgar.documents.strategies.xbrl_extraction import XBRLExtractor
|
||||
|
||||
__all__ = [
|
||||
'DocumentBuilder',
|
||||
'HeaderDetectionStrategy',
|
||||
'TableProcessor',
|
||||
'XBRLExtractor'
|
||||
]
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,670 @@
|
||||
"""
|
||||
Document builder that converts parsed HTML tree into document nodes.
|
||||
"""
|
||||
|
||||
from typing import Dict, Any, Optional
|
||||
|
||||
from lxml.html import HtmlElement
|
||||
|
||||
from edgar.documents.config import ParserConfig
|
||||
from edgar.documents.nodes import (
|
||||
Node, DocumentNode, TextNode, ParagraphNode, HeadingNode,
|
||||
ContainerNode, SectionNode, ListNode, ListItemNode, LinkNode, ImageNode
|
||||
)
|
||||
from edgar.documents.strategies.style_parser import StyleParser
|
||||
from edgar.documents.table_nodes import TableNode, Cell, Row
|
||||
from edgar.documents.types import Style, ParseContext, SemanticType
|
||||
|
||||
|
||||
class DocumentBuilder:
    """
    Builds Document node tree from parsed HTML.

    Handles the conversion of HTML elements into structured nodes
    with proper hierarchy and metadata.
    """

    # HTML elements rendered as blocks (start on their own line).
    BLOCK_ELEMENTS = {
        'div', 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
        'ul', 'ol', 'li', 'blockquote', 'pre', 'hr',
        'table', 'form', 'fieldset', 'address', 'section',
        'article', 'aside', 'nav', 'header', 'footer', 'main',
    }

    # HTML elements that flow inline with surrounding text.
    INLINE_ELEMENTS = {
        'span', 'a', 'em', 'strong', 'b', 'i', 'u', 's',
        'small', 'mark', 'del', 'ins', 'sub', 'sup',
        'code', 'kbd', 'var', 'samp', 'abbr', 'cite',
        'q', 'time', 'font',
        # Inline XBRL value elements - must not break text flow.
        'ix:nonfraction', 'ix:footnote', 'ix:fraction',
    }

    # Elements whose content is dropped entirely.
    SKIP_ELEMENTS = {
        'script', 'style', 'meta', 'link', 'noscript',
        # ix:exclude content must not appear in the final document.
        'ix:exclude',
    }
|
||||
|
||||
def __init__(self, config: ParserConfig, strategies: Dict[str, Any]):
|
||||
"""
|
||||
Initialize document builder.
|
||||
|
||||
Args:
|
||||
config: Parser configuration
|
||||
strategies: Dictionary of parsing strategies
|
||||
"""
|
||||
self.config = config
|
||||
self.strategies = strategies
|
||||
self.style_parser = StyleParser()
|
||||
self.context = ParseContext()
|
||||
|
||||
# Track XBRL context
|
||||
self.xbrl_context_stack = []
|
||||
self.xbrl_continuations = {}
|
||||
|
||||
def build(self, tree: HtmlElement) -> DocumentNode:
|
||||
"""
|
||||
Build document from HTML tree.
|
||||
|
||||
Args:
|
||||
tree: Parsed HTML tree
|
||||
|
||||
Returns:
|
||||
Document root node
|
||||
"""
|
||||
# Create root document node
|
||||
root = DocumentNode()
|
||||
|
||||
# Find body element
|
||||
body = tree.find('.//body')
|
||||
if body is None:
|
||||
# If no body, use the entire tree
|
||||
body = tree
|
||||
|
||||
# Process body content
|
||||
self._process_element(body, root)
|
||||
|
||||
# Apply node merging if configured
|
||||
if self.config.merge_adjacent_nodes:
|
||||
self._merge_adjacent_nodes(root)
|
||||
|
||||
return root
|
||||
|
||||
def _process_element(self, element: HtmlElement, parent: Node) -> Optional[Node]:
|
||||
"""
|
||||
Process HTML element into node.
|
||||
|
||||
Args:
|
||||
element: HTML element to process
|
||||
parent: Parent node
|
||||
|
||||
Returns:
|
||||
Created node or None if skipped
|
||||
"""
|
||||
|
||||
# Skip certain elements but preserve their tail text
|
||||
if element.tag in self.SKIP_ELEMENTS:
|
||||
# Process tail text even when skipping element
|
||||
if element.tail:
|
||||
if self.config.preserve_whitespace:
|
||||
text_node = TextNode(content=element.tail)
|
||||
parent.add_child(text_node)
|
||||
else:
|
||||
if element.tail.strip():
|
||||
text_node = TextNode(content=element.tail.strip())
|
||||
parent.add_child(text_node)
|
||||
return None
|
||||
|
||||
# Skip page number containers
|
||||
if self._is_page_number_container(element):
|
||||
return None
|
||||
|
||||
# Skip page break elements
|
||||
if self._is_page_break_element(element):
|
||||
return None
|
||||
|
||||
# Skip navigation containers that follow page breaks
|
||||
if self._is_page_navigation_container(element):
|
||||
return None
|
||||
|
||||
# Track parsing depth
|
||||
self.context.depth += 1
|
||||
|
||||
try:
|
||||
# Handle XBRL elements
|
||||
if element.tag.startswith('{'): # Namespaced element
|
||||
self._enter_xbrl_context(element)
|
||||
|
||||
# Extract style
|
||||
style = self._extract_style(element)
|
||||
|
||||
# Create appropriate node based on element type
|
||||
node = self._create_node_for_element(element, style)
|
||||
|
||||
if node:
|
||||
# Add XBRL metadata if in context
|
||||
if self.xbrl_context_stack:
|
||||
node.metadata.update(self._get_current_xbrl_metadata())
|
||||
|
||||
# Add to parent
|
||||
parent.add_child(node)
|
||||
|
||||
# Process children for container nodes
|
||||
if self._should_process_children(element, node):
|
||||
# Add element's direct text first
|
||||
if element.text:
|
||||
if self.config.preserve_whitespace:
|
||||
if element.text: # Don't strip whitespace
|
||||
text_node = TextNode(content=element.text)
|
||||
node.add_child(text_node)
|
||||
else:
|
||||
if element.text.strip():
|
||||
text_node = TextNode(content=element.text.strip())
|
||||
node.add_child(text_node)
|
||||
|
||||
# Process child elements
|
||||
for child in element:
|
||||
self._process_element(child, node)
|
||||
|
||||
# Process text after children
|
||||
if element.tail:
|
||||
if self.config.preserve_whitespace:
|
||||
text_node = TextNode(content=element.tail)
|
||||
parent.add_child(text_node)
|
||||
else:
|
||||
if element.tail.strip():
|
||||
text_node = TextNode(content=element.tail.strip())
|
||||
parent.add_child(text_node)
|
||||
elif element.tail.isspace():
|
||||
# Even if tail is just whitespace, preserve the spacing info
|
||||
# This helps with inline element spacing decisions
|
||||
if hasattr(node, 'set_metadata'):
|
||||
node.set_metadata('has_tail_whitespace', True)
|
||||
else:
|
||||
# Node created but children not processed - still need to handle tail
|
||||
if element.tail:
|
||||
if self.config.preserve_whitespace:
|
||||
text_node = TextNode(content=element.tail)
|
||||
parent.add_child(text_node)
|
||||
else:
|
||||
if element.tail.strip():
|
||||
text_node = TextNode(content=element.tail.strip())
|
||||
parent.add_child(text_node)
|
||||
elif element.tail.isspace():
|
||||
# Even if tail is just whitespace, preserve the spacing info
|
||||
if hasattr(node, 'set_metadata'):
|
||||
node.set_metadata('has_tail_whitespace', True)
|
||||
else:
|
||||
# No node created, process children with same parent
|
||||
for child in element:
|
||||
self._process_element(child, parent)
|
||||
|
||||
# Process tail text
|
||||
if element.tail:
|
||||
if self.config.preserve_whitespace:
|
||||
text_node = TextNode(content=element.tail)
|
||||
parent.add_child(text_node)
|
||||
else:
|
||||
if element.tail.strip():
|
||||
text_node = TextNode(content=element.tail.strip())
|
||||
parent.add_child(text_node)
|
||||
|
||||
# Exit XBRL context
|
||||
if element.tag.startswith('{'):
|
||||
self._exit_xbrl_context(element)
|
||||
|
||||
return node
|
||||
|
||||
finally:
|
||||
self.context.depth -= 1
|
||||
|
||||
def _create_node_for_element(self, element: HtmlElement, style: Style) -> Optional[Node]:
|
||||
"""Create appropriate node for HTML element."""
|
||||
tag = element.tag.lower() if not element.tag.startswith('{') else element.tag
|
||||
|
||||
|
||||
# Check for heading
|
||||
if tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
|
||||
level = int(tag[1])
|
||||
text = self._get_element_text(element)
|
||||
if text:
|
||||
return HeadingNode(content=text, level=level, style=style)
|
||||
|
||||
# Handle specific elements first before header detection
|
||||
if tag == 'p':
|
||||
return ParagraphNode(style=style)
|
||||
|
||||
elif tag == 'li':
|
||||
return ListItemNode(style=style)
|
||||
|
||||
# Check if element might be a heading based on style/content
|
||||
# Skip header detection for certain tags that should never be headers
|
||||
skip_header_detection_tags = {
|
||||
'li', 'td', 'th', 'option', 'a', 'button', 'label',
|
||||
# IXBRL inline elements - should not be treated as headers
|
||||
'ix:nonfraction', 'ix:footnote', 'ix:fraction',
|
||||
# IXBRL elements that can contain tables and complex content
|
||||
'ix:nonNumeric', 'ix:continuation'
|
||||
}
|
||||
if tag not in skip_header_detection_tags and self.strategies.get('header_detection'):
|
||||
header_info = self.strategies['header_detection'].detect(element, self.context)
|
||||
if header_info and header_info.confidence > self.config.header_detection_threshold:
|
||||
text = self._get_element_text(element)
|
||||
if text:
|
||||
node = HeadingNode(
|
||||
content=text,
|
||||
level=header_info.level,
|
||||
style=style
|
||||
)
|
||||
# Add header metadata
|
||||
node.set_metadata('detection_method', header_info.detection_method)
|
||||
node.set_metadata('confidence', header_info.confidence)
|
||||
if header_info.is_item:
|
||||
node.semantic_type = SemanticType.ITEM_HEADER
|
||||
node.set_metadata('item_number', header_info.item_number)
|
||||
return node
|
||||
|
||||
# Continue handling other specific elements
|
||||
if tag == 'table':
|
||||
if self.strategies.get('table_processing'):
|
||||
return self.strategies['table_processing'].process(element)
|
||||
else:
|
||||
return self._process_table_basic(element, style)
|
||||
|
||||
elif tag in ['ul', 'ol']:
|
||||
return ListNode(ordered=(tag == 'ol'), style=style)
|
||||
|
||||
elif tag == 'li':
|
||||
return ListItemNode(style=style)
|
||||
|
||||
elif tag == 'a':
|
||||
href = element.get('href', '')
|
||||
title = element.get('title', '')
|
||||
text = self._get_element_text(element)
|
||||
return LinkNode(content=text, href=href, title=title, style=style)
|
||||
|
||||
elif tag == 'img':
|
||||
return ImageNode(
|
||||
src=element.get('src'),
|
||||
alt=element.get('alt'),
|
||||
width=self._parse_dimension(element.get('width')),
|
||||
height=self._parse_dimension(element.get('height')),
|
||||
style=style
|
||||
)
|
||||
|
||||
elif tag == 'br':
|
||||
# Line break - add as text node
|
||||
return TextNode(content='\n')
|
||||
|
||||
elif tag in ['section', 'article']:
|
||||
return SectionNode(style=style)
|
||||
|
||||
elif tag == 'div' or tag in self.BLOCK_ELEMENTS:
|
||||
# Check if CSS display property makes this inline
|
||||
if style.display in ['inline', 'inline-block']:
|
||||
# Treat as inline element despite being a div
|
||||
text = self._get_element_text(element)
|
||||
if text:
|
||||
text_node = TextNode(content=text, style=style)
|
||||
text_node.set_metadata('original_tag', tag)
|
||||
text_node.set_metadata('inline_via_css', True)
|
||||
return text_node
|
||||
# If no text but inline, still process children inline
|
||||
return ContainerNode(tag_name=tag, style=style)
|
||||
|
||||
# Normal block behavior
|
||||
# Check if this is just a text container with only inline elements
|
||||
if self._is_text_only_container(element):
|
||||
# Create ParagraphNode for divs containing only inline elements
|
||||
# This ensures proper text concatenation for spans, etc.
|
||||
return ParagraphNode(style=style)
|
||||
else:
|
||||
return ContainerNode(tag_name=tag, style=style)
|
||||
|
||||
elif tag in self.INLINE_ELEMENTS:
|
||||
# Inline elements - extract text and add to parent
|
||||
text = self._get_element_text(element)
|
||||
if text:
|
||||
text_node = TextNode(content=text, style=style)
|
||||
# Preserve inline element metadata
|
||||
text_node.set_metadata('original_tag', tag)
|
||||
return text_node
|
||||
|
||||
elif tag in ['ix:nonNumeric', 'ix:continuation']:
|
||||
# IXBRL elements that can contain complex content including tables
|
||||
# Process as container to allow proper table parsing
|
||||
return ContainerNode(tag_name=tag, style=style)
|
||||
|
||||
# Default: create container for unknown elements
|
||||
return ContainerNode(tag_name=tag, style=style)
|
||||
|
||||
def _is_page_number_container(self, element: HtmlElement) -> bool:
|
||||
"""Detect and filter page number containers across various SEC filing patterns."""
|
||||
import re
|
||||
|
||||
# Get text content first - all page numbers should be short
|
||||
text_content = element.text_content().strip()
|
||||
|
||||
# Must be short content (1-8 chars to handle "Page X" format)
|
||||
if len(text_content) > 8 or len(text_content) == 0:
|
||||
return False
|
||||
|
||||
# Must be numeric, roman numerals, or "Page X" format
|
||||
if not self._is_page_number_content(text_content):
|
||||
return False
|
||||
|
||||
# Check various patterns based on element type and styling
|
||||
tag = element.tag.lower()
|
||||
|
||||
# Pattern 1: Oracle-style flexbox containers (highest confidence)
|
||||
if tag == 'div' and self._is_flexbox_page_number(element):
|
||||
return True
|
||||
|
||||
# Pattern 2: Center/right aligned paragraphs (common pattern)
|
||||
if tag == 'p' and self._is_aligned_page_number(element):
|
||||
return True
|
||||
|
||||
# Pattern 3: Footer-style divs with centered page numbers
|
||||
if tag == 'div' and self._is_footer_page_number(element):
|
||||
return True
|
||||
|
||||
# Pattern 4: Simple divs with page break context
|
||||
if tag == 'div' and self._is_page_break_context(element):
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
def _is_page_number_content(self, text: str) -> bool:
|
||||
"""Check if text content looks like a page number."""
|
||||
import re
|
||||
|
||||
# Simple numeric (most common)
|
||||
if text.isdigit():
|
||||
return True
|
||||
|
||||
# Roman numerals
|
||||
if re.match(r'^[ivxlcdm]+$', text.lower()):
|
||||
return True
|
||||
|
||||
# "Page X" or "Page X of Y" format
|
||||
if re.match(r'^page\s+\d+(\s+of\s+\d+)?$', text.lower()):
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
def _is_flexbox_page_number(self, element: HtmlElement) -> bool:
|
||||
"""Detect Oracle-style flexbox page number containers."""
|
||||
import re
|
||||
|
||||
style_attr = element.get('style', '')
|
||||
if not style_attr:
|
||||
return False
|
||||
|
||||
# Must have: display:flex, justify-content:flex-end, min-height:1in
|
||||
required_patterns = [
|
||||
r'display:\s*flex',
|
||||
r'justify-content:\s*flex-end',
|
||||
r'min-height:\s*1in'
|
||||
]
|
||||
|
||||
return all(re.search(pattern, style_attr) for pattern in required_patterns)
|
||||
|
||||
def _is_aligned_page_number(self, element: HtmlElement) -> bool:
|
||||
"""Detect center or right-aligned page number paragraphs."""
|
||||
import re
|
||||
|
||||
style_attr = element.get('style', '')
|
||||
|
||||
# Check for center or right alignment
|
||||
alignment_pattern = r'text-align:\s*(center|right)'
|
||||
if not re.search(alignment_pattern, style_attr):
|
||||
return False
|
||||
|
||||
# Optional: check for smaller font size (common in page numbers)
|
||||
font_size_pattern = r'font-size:\s*([0-9]+)pt'
|
||||
font_match = re.search(font_size_pattern, style_attr)
|
||||
if font_match:
|
||||
font_size = int(font_match.group(1))
|
||||
# Page numbers often use smaller fonts (8-12pt)
|
||||
if font_size <= 12:
|
||||
return True
|
||||
|
||||
return True # Any center/right aligned short content
|
||||
|
||||
def _is_footer_page_number(self, element: HtmlElement) -> bool:
|
||||
"""Detect footer-style page number containers."""
|
||||
import re
|
||||
|
||||
style_attr = element.get('style', '')
|
||||
|
||||
# Look for bottom positioning or footer-like styling
|
||||
footer_patterns = [
|
||||
r'bottom:\s*[0-9]',
|
||||
r'position:\s*absolute',
|
||||
r'margin-bottom:\s*0',
|
||||
r'text-align:\s*center'
|
||||
]
|
||||
|
||||
# Need at least 2 footer indicators
|
||||
matches = sum(1 for pattern in footer_patterns if re.search(pattern, style_attr))
|
||||
return matches >= 2
|
||||
|
||||
def _is_page_break_context(self, element: HtmlElement) -> bool:
|
||||
"""Check if element is near page breaks (common page number context)."""
|
||||
|
||||
# Check next sibling for page break HR
|
||||
next_elem = element.getnext()
|
||||
if next_elem is not None and next_elem.tag == 'hr':
|
||||
hr_style = next_elem.get('style', '')
|
||||
if 'page-break' in hr_style:
|
||||
return True
|
||||
|
||||
# Check if element has page-break styling itself
|
||||
style_attr = element.get('style', '')
|
||||
if 'page-break' in style_attr:
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
def _is_page_break_element(self, element: HtmlElement) -> bool:
|
||||
"""Detect page break HR elements."""
|
||||
if element.tag.lower() != 'hr':
|
||||
return False
|
||||
|
||||
style_attr = element.get('style', '')
|
||||
|
||||
# Check for page-break-after:always or similar page break styles
|
||||
return 'page-break' in style_attr
|
||||
|
||||
def _is_page_navigation_container(self, element: HtmlElement) -> bool:
|
||||
"""Detect navigation containers that appear after page breaks."""
|
||||
if element.tag.lower() != 'div':
|
||||
return False
|
||||
|
||||
style_attr = element.get('style', '')
|
||||
|
||||
# Check for navigation container patterns
|
||||
# Often have: padding-top, min-height:1in, box-sizing:border-box
|
||||
nav_indicators = [
|
||||
r'padding-top:\s*0\.5in',
|
||||
r'min-height:\s*1in',
|
||||
r'box-sizing:\s*border-box'
|
||||
]
|
||||
|
||||
import re
|
||||
matches = sum(1 for pattern in nav_indicators if re.search(pattern, style_attr))
|
||||
|
||||
# Need at least 2 indicators
|
||||
if matches < 2:
|
||||
return False
|
||||
|
||||
# Check if it contains typical navigation content
|
||||
text_content = element.text_content().strip().lower()
|
||||
|
||||
# Common navigation phrases
|
||||
nav_phrases = [
|
||||
'table of contents',
|
||||
'index to financial statements',
|
||||
'table of content',
|
||||
'index to financial statement'
|
||||
]
|
||||
|
||||
return any(phrase in text_content for phrase in nav_phrases)
|
||||
|
||||
def _extract_style(self, element: HtmlElement) -> Style:
|
||||
"""Extract style from element."""
|
||||
style_str = element.get('style', '')
|
||||
style = self.style_parser.parse(style_str)
|
||||
|
||||
# Add tag-specific styles
|
||||
tag = element.tag.lower()
|
||||
if tag == 'b' or tag == 'strong':
|
||||
style.font_weight = 'bold'
|
||||
elif tag == 'i' or tag == 'em':
|
||||
style.font_style = 'italic'
|
||||
elif tag == 'u':
|
||||
style.text_decoration = 'underline'
|
||||
|
||||
# Handle alignment
|
||||
align = element.get('align')
|
||||
if align:
|
||||
style.text_align = align
|
||||
|
||||
return style
|
||||
|
||||
def _get_element_text(self, element: HtmlElement) -> str:
|
||||
"""Get text content from element."""
|
||||
text_parts = []
|
||||
|
||||
# Get element's direct text
|
||||
if element.text:
|
||||
# For inline elements, preserve leading/trailing whitespace
|
||||
if element.tag.lower() in self.INLINE_ELEMENTS:
|
||||
text_parts.append(element.text)
|
||||
else:
|
||||
text_parts.append(element.text.strip())
|
||||
|
||||
# For simple elements, get all text content
|
||||
if element.tag.lower() in self.INLINE_ELEMENTS or \
|
||||
element.tag.lower() in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
|
||||
# Get all text including from child elements
|
||||
for child in element:
|
||||
if child.tag.lower() not in self.SKIP_ELEMENTS:
|
||||
child_text = child.text_content()
|
||||
if child_text:
|
||||
# For inline elements, preserve whitespace in child content too
|
||||
if element.tag.lower() in self.INLINE_ELEMENTS:
|
||||
text_parts.append(child_text)
|
||||
else:
|
||||
text_parts.append(child_text.strip())
|
||||
|
||||
# For inline elements with preserved whitespace, concatenate directly
|
||||
# For others, join with spaces
|
||||
if element.tag.lower() in self.INLINE_ELEMENTS and len(text_parts) == 1:
|
||||
return text_parts[0] if text_parts else ''
|
||||
else:
|
||||
return ' '.join(text_parts)
|
||||
|
||||
def _is_text_only_container(self, element: HtmlElement) -> bool:
|
||||
"""Check if element contains only text and inline elements."""
|
||||
for child in element:
|
||||
if child.tag.lower() in self.BLOCK_ELEMENTS:
|
||||
return False
|
||||
if child.tag.lower() == 'table':
|
||||
return False
|
||||
return True
|
||||
|
||||
def _should_process_children(self, element: HtmlElement, node: Node) -> bool:
|
||||
"""Determine if children should be processed."""
|
||||
# Don't process children for certain node types
|
||||
if isinstance(node, (TextNode, HeadingNode)):
|
||||
return False
|
||||
|
||||
# Tables are processed separately
|
||||
if isinstance(node, TableNode):
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
def _process_table_basic(self, element: HtmlElement, style: Style) -> TableNode:
|
||||
"""Basic table processing without advanced strategy."""
|
||||
table = TableNode(style=style)
|
||||
|
||||
# Set config for rendering decisions
|
||||
table._config = self.config
|
||||
|
||||
# Extract caption
|
||||
caption_elem = element.find('.//caption')
|
||||
if caption_elem is not None:
|
||||
table.caption = caption_elem.text_content().strip()
|
||||
|
||||
# Process rows
|
||||
for tr in element.findall('.//tr'):
|
||||
cells = []
|
||||
for td in tr.findall('.//td') + tr.findall('.//th'):
|
||||
cell = Cell(
|
||||
content=td.text_content().strip(),
|
||||
colspan=int(td.get('colspan', '1')),
|
||||
rowspan=int(td.get('rowspan', '1')),
|
||||
is_header=(td.tag == 'th'),
|
||||
align=td.get('align')
|
||||
)
|
||||
cells.append(cell)
|
||||
|
||||
if cells:
|
||||
row = Row(cells=cells, is_header=(tr.find('.//th') is not None))
|
||||
|
||||
# Determine if header or data row
|
||||
if tr.getparent().tag == 'thead' or row.is_header:
|
||||
table.headers.append(cells)
|
||||
else:
|
||||
table.rows.append(row)
|
||||
|
||||
return table
|
||||
|
||||
def _parse_dimension(self, value: Optional[str]) -> Optional[int]:
|
||||
"""Parse dimension value (width/height)."""
|
||||
if not value:
|
||||
return None
|
||||
|
||||
# Remove 'px' suffix if present
|
||||
value = value.strip().rstrip('px')
|
||||
|
||||
try:
|
||||
return int(value)
|
||||
except ValueError:
|
||||
return None
|
||||
|
||||
def _enter_xbrl_context(self, element: HtmlElement):
|
||||
"""Enter XBRL context."""
|
||||
if self.config.extract_xbrl and self.strategies.get('xbrl_extraction'):
|
||||
xbrl_data = self.strategies['xbrl_extraction'].extract_context(element)
|
||||
if xbrl_data:
|
||||
self.xbrl_context_stack.append(xbrl_data)
|
||||
|
||||
def _exit_xbrl_context(self, element: HtmlElement):
|
||||
"""Exit XBRL context."""
|
||||
if self.xbrl_context_stack:
|
||||
self.xbrl_context_stack.pop()
|
||||
|
||||
def _get_current_xbrl_metadata(self) -> Dict[str, Any]:
|
||||
"""Get current XBRL metadata."""
|
||||
if not self.xbrl_context_stack:
|
||||
return {}
|
||||
|
||||
# Merge all contexts in stack
|
||||
metadata = {}
|
||||
for context in self.xbrl_context_stack:
|
||||
metadata.update(context)
|
||||
|
||||
return metadata
|
||||
|
||||
def _merge_adjacent_nodes(self, root: Node):
|
||||
"""Merge adjacent text nodes with similar styles."""
|
||||
# Implementation would recursively merge adjacent text nodes
|
||||
# This is a placeholder for the actual implementation
|
||||
pass
|
||||
@@ -0,0 +1,450 @@
|
||||
"""
|
||||
Multi-strategy header detection for document structure.
|
||||
"""
|
||||
|
||||
import re
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Optional, List, Dict
|
||||
|
||||
from lxml.html import HtmlElement
|
||||
|
||||
from edgar.documents.config import ParserConfig
|
||||
from edgar.documents.types import HeaderInfo, ParseContext
|
||||
|
||||
|
||||
class HeaderDetector(ABC):
    """Abstract base class for header detectors."""

    @abstractmethod
    def detect(self, element: HtmlElement, context: ParseContext) -> Optional[HeaderInfo]:
        """Detect if element is a header; return None when it is not."""

    @property
    @abstractmethod
    def name(self) -> str:
        """Detector name."""
|
||||
|
||||
|
||||
class StyleBasedDetector(HeaderDetector):
    """Detect headers based on CSS styles."""

    @property
    def name(self) -> str:
        return "style"

    def detect(self, element: HtmlElement, context: ParseContext) -> Optional[HeaderInfo]:
        """Detect headers based on style attributes."""
        style = context.get_current_style()
        if not style:
            return None

        text = element.text_content().strip()
        if not text or len(text) > 200:  # very long text is never a header
            return None

        confidence = 0.0
        level = 3  # default level

        # Font size relative to the document base drives both the
        # confidence and the heading level
        if style.font_size and context.base_font_size:
            size_ratio = style.font_size / context.base_font_size
            if size_ratio >= 2.0:
                confidence += 0.8
                level = 1
            elif size_ratio >= 1.5:
                confidence += 0.7
                level = 2
            elif size_ratio >= 1.2:
                confidence += 0.5
                level = 3
            elif size_ratio >= 1.1:
                confidence += 0.3
                level = 4

        # Bold text is header-like; promote a default-level heading
        if style.is_bold:
            confidence += 0.3
            if level == 3:
                level = 2

        # Centered text
        if style.is_centered:
            confidence += 0.2

        # Short ALL-CAPS runs are typical section titles
        if text.isupper() and len(text.split()) <= 10:
            confidence += 0.2

        # Headers often carry extra vertical margin
        if style.margin_top and style.margin_top > 20:
            confidence += 0.1
        if style.margin_bottom and style.margin_bottom > 10:
            confidence += 0.1

        confidence = min(confidence, 1.0)

        if confidence > 0.4:  # threshold for style-based detection
            return HeaderInfo.from_text(text, level, confidence, self.name)
        return None
|
||||
|
||||
|
||||
class PatternBasedDetector(HeaderDetector):
    """Detect headers based on text patterns."""

    # Common header patterns in SEC filings: (regex, level, base confidence).
    # NOTE(review): every pattern is matched with re.IGNORECASE below,
    # which makes the case-sensitive-looking entries (e.g. the all-caps
    # pattern) match any casing - confirm whether that is intended.
    HEADER_PATTERNS = [
        # Item patterns
        (r'^(Item|ITEM)\s+(\d+[A-Z]?)[.\s]+(.+)$', 1, 0.95),
        (r'^Part\s+[IVX]+[.\s]*$', 1, 0.9),
        (r'^PART\s+[IVX]+[.\s]*$', 1, 0.9),

        # Section patterns
        (r'^(BUSINESS|RISK FACTORS|PROPERTIES|LEGAL PROCEEDINGS)$', 2, 0.85),
        (r'^(Management\'?s?\s+Discussion|MD&A)', 2, 0.85),
        (r'^(Financial\s+Statements|Consolidated\s+Financial\s+Statements)$', 2, 0.85),

        # Numbered sections
        (r'^\d+\.\s+[A-Z][A-Za-z\s]+$', 3, 0.7),
        (r'^[A-Z]\.\s+[A-Z][A-Za-z\s]+$', 3, 0.7),
        (r'^\([a-z]\)\s+[A-Z][A-Za-z\s]+$', 4, 0.6),

        # Title case headers
        (r'^[A-Z][A-Za-z\s]+[A-Za-z]$', 3, 0.5),

        # All caps headers
        (r'^[A-Z\s]+$', 3, 0.6),
    ]

    @property
    def name(self) -> str:
        return "pattern"

    def detect(self, element: HtmlElement, context: ParseContext) -> Optional[HeaderInfo]:
        """Detect headers based on text patterns."""
        text = element.text_content().strip()

        # Skip empty or very long text
        if not text or len(text) > 200:
            return None

        # Single punctuation characters are never headers
        if len(text) == 1 and text in '.,!?;:()[]{}':
            return None

        # Multiple sentences indicate a paragraph, not a header
        if text.count('.') > 2:
            return None

        for pattern, level, base_confidence in self.HEADER_PATTERNS:
            if not re.match(pattern, text, re.IGNORECASE):
                continue

            confidence = base_confidence

            # Boost confidence if the element is alone in its parent.
            # Fix: element.getparent() can be None for a detached/root
            # element; previously len(None) raised TypeError.
            parent = element.getparent()
            if parent is not None and len(parent) == 1:
                confidence += 0.1

            # Headers are usually followed by substantial body text
            next_elem = element.getnext()
            if next_elem is not None and len(next_elem.text_content()) > 100:
                confidence += 0.1

            return HeaderInfo.from_text(text, level, min(confidence, 1.0), self.name)

        return None
|
||||
|
||||
|
||||
class StructuralDetector(HeaderDetector):
    """Detect headers based on DOM structure."""

    @property
    def name(self) -> str:
        return "structural"

    def detect(self, element: HtmlElement, context: ParseContext) -> Optional[HeaderInfo]:
        """Detect headers based on structural cues."""
        text = element.text_content().strip()

        # Skip empty or very long text
        if not text or len(text) > 200:
            return None

        # Single punctuation characters are never headers
        if len(text) == 1 and text in '.,!?;:()[]{}':
            return None

        # An explicit heading tag is definitive
        tag = element.tag.lower()
        if tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
            return HeaderInfo.from_text(text, int(tag[1]), 1.0, self.name)

        confidence = 0.0
        level = 3

        parent = element.getparent()
        if parent is not None:
            parent_tag = parent.tag.lower()

            # Header-like containers
            if parent_tag in ['header', 'thead', 'caption']:
                confidence += 0.6
                level = 2

            # Isolated elements (parent with few children) look header-like
            if len(parent) <= 3:
                confidence += 0.3

            # Centered parents
            if parent.get('align') == 'center':
                confidence += 0.2

        # Bold wrappers and centered elements
        if tag in ['strong', 'b']:
            confidence += 0.3
        if element.get('align') == 'center':
            confidence += 0.2

        # Headers are usually followed by block content
        next_elem = element.getnext()
        if next_elem is not None and next_elem.tag.lower() in ['p', 'div', 'table', 'ul', 'ol']:
            confidence += 0.2

        # Short text is header-like
        if 1 <= len(text.split()) <= 10:
            confidence += 0.1

        confidence = min(confidence, 1.0)
        if confidence > 0.5:
            return HeaderInfo.from_text(text, level, confidence, self.name)
        return None
|
||||
|
||||
|
||||
class ContextualDetector(HeaderDetector):
    """Detect headers based on surrounding context."""

    @property
    def name(self) -> str:
        return "contextual"

    def detect(self, element: HtmlElement, context: ParseContext) -> Optional[HeaderInfo]:
        """Detect headers based on contextual clues."""
        text = element.text_content().strip()

        # Empty or very long text is never treated as a header.
        if not text or len(text) > 200:
            return None

        # A lone punctuation character is never a header.
        if len(text) == 1 and text in '.,!?;:()[]{}':
            return None

        score = 0.0
        level = 3

        # The text itself resembling a header is the strongest single clue.
        if self._looks_like_header(text):
            score += 0.4

        # Preceding sibling: a header immediately before suggests a
        # section hierarchy is being built.
        prev_sibling = element.getprevious()
        if prev_sibling is not None:
            prev_text = prev_sibling.text_content().strip()
            if prev_text and self._looks_like_header(prev_text):
                score += 0.3
                # Longer text than the previous header -> treat as a
                # higher (coarser) level.
                level = 2 if len(text) > len(prev_text) else 3

        # Following sibling: headers tend to introduce longer, often
        # indented, content.
        next_sibling = element.getnext()
        if next_sibling is not None:
            following_text = next_sibling.text_content().strip()
            if len(following_text) > len(text) * 3:
                score += 0.3
            # Indented follow-up content is another hint.
            if any(key in next_sibling.get('style', '')
                   for key in ('margin-left', 'padding-left')):
                score += 0.2

        # Early in the document (no section entered yet, shallow depth)
        # headers are more likely.
        if context.current_section is None and context.depth < 5:
            score += 0.2

        score = min(score, 1.0)

        if score > 0.5:
            return HeaderInfo.from_text(text, level, score, self.name)
        return None

    def _looks_like_header(self, text: str) -> bool:
        """Heuristic check of whether text reads like a header."""
        # Headers are short.
        if len(text.split()) > 15:
            return False

        # Sentence-final punctuation (other than a colon) rules it out.
        if text.rstrip().endswith(('.', '!', '?', ';')):
            return False

        # Title case or all caps strongly suggest a header.
        if text.istitle() or text.isupper():
            return True

        # A leading capital letter is accepted as well.
        return bool(text) and text[0].isupper()
|
||||
|
||||
|
||||
class HeaderDetectionStrategy:
    """
    Multi-strategy header detection.

    Runs several independent detectors and combines their results
    with per-detector weighted voting.
    """

    def __init__(self, config: ParserConfig):
        """Initialize with configuration."""
        self.config = config
        self.detectors = self._init_detectors()

    def _init_detectors(self) -> List[HeaderDetector]:
        """Build the list of enabled detectors."""
        # The four heuristic detectors are always active.
        active: List[HeaderDetector] = [
            StyleBasedDetector(),
            PatternBasedDetector(),
            StructuralDetector(),
            ContextualDetector(),
        ]

        # Placeholder: an ML-based detector would be appended here when
        # the feature flag is enabled.
        if self.config.features.get('ml_header_detection'):
            pass

        return active

    def detect(self, element: HtmlElement, context: ParseContext) -> Optional[HeaderInfo]:
        """
        Detect if element is a header using multiple strategies.

        Args:
            element: HTML element to check
            context: Current parsing context

        Returns:
            HeaderInfo if element is detected as header, None otherwise
        """
        # Nothing to classify without text.
        text = element.text_content().strip()
        if not text:
            return None

        # Gather every detector's opinion; one detector failing must not
        # take down the rest.
        candidates: List[HeaderInfo] = []
        for detector in self.detectors:
            try:
                found = detector.detect(element, context)
                if found:
                    candidates.append(found)
            except Exception:
                continue

        if not candidates:
            return None

        # A single vote is accepted only above the configured threshold.
        if len(candidates) == 1:
            sole = candidates[0]
            if sole.confidence >= self.config.header_detection_threshold:
                return sole
            return None

        # Several detectors agreed something is here - merge their votes.
        return self._combine_results(candidates, text)

    def _combine_results(self, results: List[HeaderInfo], text: str) -> HeaderInfo:
        """Combine multiple detection results."""
        # Relative trust placed in each detection method.
        detector_weights = {
            'style': 0.3,
            'pattern': 0.4,
            'structural': 0.2,
            'contextual': 0.1,
            'ml': 0.5  # Would be highest if available
        }

        weighted_sum = 0.0
        weight_total = 0.0
        # Weighted votes per candidate heading level.
        level_votes: Dict[int, float] = {}

        for info in results:
            w = detector_weights.get(info.detection_method, 0.1)
            weighted_sum += info.confidence * w
            weight_total += w
            level_votes[info.level] = level_votes.get(info.level, 0.0) + info.confidence * w

        # Weighted-average confidence across all voters.
        combined_confidence = weighted_sum / weight_total if weight_total > 0 else 0.0

        # The level with the largest weighted vote wins.
        winning_level = max(level_votes.items(), key=lambda kv: kv[1])[0]

        # Item status/number is taken from any detector that found one.
        return HeaderInfo(
            level=winning_level,
            confidence=combined_confidence,
            text=text,
            detection_method='combined',
            is_item=any(info.is_item for info in results),
            item_number=next((info.item_number for info in results if info.item_number), None)
        )
|
||||
@@ -0,0 +1,344 @@
|
||||
"""
|
||||
CSS style parser for HTML elements.
|
||||
"""
|
||||
|
||||
import re
|
||||
from typing import Dict, Optional, Tuple, Union
|
||||
from edgar.documents.types import Style
|
||||
from edgar.documents.utils import get_cache_manager
|
||||
|
||||
|
||||
class StyleParser:
    """
    Parser for CSS style attributes.

    Converts inline style strings into Style objects, caching parsed
    results keyed by the raw style string.
    """

    # Common CSS units
    ABSOLUTE_UNITS = {'px', 'pt', 'pc', 'cm', 'mm', 'in'}
    RELATIVE_UNITS = {'em', 'rem', 'ex', 'ch', 'vw', 'vh', '%'}

    # Font weight mappings
    FONT_WEIGHT_MAP = {
        'normal': '400',
        'bold': '700',
        'bolder': '800',
        'lighter': '300'
    }

    def __init__(self):
        """Initialize style parser with cache."""
        self._cache = get_cache_manager().style_cache

    def parse(self, style_string: str) -> "Style":
        """
        Parse CSS style string into Style object.

        Args:
            style_string: CSS style string (e.g. "font-size: 14px; color: red")

        Returns:
            Parsed Style object
        """
        if not style_string:
            return Style()

        # Identical style strings are parsed only once.
        cached = self._cache.get(style_string)
        if cached is not None:
            return cached

        parsed = Style()
        for prop, value in self._split_declarations(style_string).items():
            self._apply_property(parsed, prop, value)

        self._cache.put(style_string, parsed)
        return parsed

    def _split_declarations(self, style_string: str) -> Dict[str, str]:
        """Split a style string into {property: value} pairs."""
        declarations: Dict[str, str] = {}

        for chunk in style_string.split(';'):
            chunk = chunk.strip()
            # Skip empty fragments and fragments with no property separator.
            if not chunk or ':' not in chunk:
                continue
            prop, _, value = chunk.partition(':')
            prop = prop.strip().lower()
            value = value.strip()
            if prop and value:
                declarations[prop] = value

        return declarations

    def _apply_property(self, style: "Style", prop: str, value: str):
        """Apply one CSS declaration to the Style object."""
        # Individual margin/padding sides all follow the same pattern:
        # parse a length and set the matching snake_case attribute.
        box_sides = {
            'margin-top', 'margin-bottom', 'margin-left', 'margin-right',
            'padding-top', 'padding-bottom', 'padding-left', 'padding-right'
        }

        if prop == 'font-size':
            px = self._parse_length(value)
            if px is not None:
                style.font_size = px
        elif prop == 'font-weight':
            style.font_weight = self._normalize_font_weight(value)
        elif prop == 'font-style':
            # Only the recognized keywords are applied; others are ignored.
            if value in ['italic', 'oblique']:
                style.font_style = 'italic'
            elif value == 'normal':
                style.font_style = 'normal'
        elif prop == 'text-align':
            if value in ['left', 'right', 'center', 'justify']:
                style.text_align = value
        elif prop == 'text-decoration':
            style.text_decoration = value
        elif prop == 'color':
            style.color = self._normalize_color(value)
        elif prop in ['background-color', 'background']:
            bg = self._extract_background_color(value)
            if bg:
                style.background_color = bg
        elif prop in ('margin', 'padding'):
            # Shorthand with 1-4 values.
            self._parse_box_property(style, prop, value)
        elif prop in box_sides:
            px = self._parse_length(value)
            if px is not None:
                setattr(style, prop.replace('-', '_'), px)
        elif prop == 'display':
            style.display = value
        elif prop == 'width':
            style.width = self._parse_dimension(value)
        elif prop == 'height':
            style.height = self._parse_dimension(value)
        elif prop == 'line-height':
            lh = self._parse_line_height(value)
            if lh is not None:
                style.line_height = lh

    def _parse_length(self, value: str) -> Optional[float]:
        """Parse a CSS length into pixels; None when not convertible."""
        value = value.strip().lower()

        # Keyword values carry no convertible length; bare '0' is 0px.
        if value in ['0', 'auto', 'inherit', 'initial']:
            return 0.0 if value == '0' else None

        # Number followed by an optional unit.
        match = re.match(r'^(-?\d*\.?\d+)\s*([a-z%]*)$', value)
        if not match:
            return None

        num_str, unit = match.groups()
        try:
            num = float(num_str)
        except ValueError:
            return None

        # Unitless values and px are already pixels.
        if not unit or unit == 'px':
            return num

        # Fixed conversion factors to CSS pixels (16px base for em/rem).
        factors = {
            'pt': 1.333,   # 1pt = 1.333px
            'em': 16,      # Assume 16px base
            'rem': 16,     # Assume 16px root
            'in': 96,      # 1in = 96px
            'cm': 37.8,    # 1cm = 37.8px
            'mm': 3.78,    # 1mm = 3.78px
        }
        factor = factors.get(unit)
        # Percentages (and unknown units) cannot be resolved without context.
        return num * factor if factor is not None else None

    def _parse_dimension(self, value: str) -> Optional[Union[float, str]]:
        """Parse width/height; percentages are kept as strings."""
        value = value.strip()

        if value.endswith('%'):
            return value  # Return as string

        return self._parse_length(value)

    def _parse_line_height(self, value: str) -> Optional[float]:
        """Parse line-height: unitless multiplier first, then as length."""
        value = value.strip()

        try:
            # Unitless number (multiplier)
            return float(value)
        except ValueError:
            return self._parse_length(value)

    def _normalize_font_weight(self, value: str) -> str:
        """Normalize font-weight keywords to numeric strings."""
        value = value.strip().lower()

        mapped = self.FONT_WEIGHT_MAP.get(value)
        if mapped is not None:
            return mapped

        # Numeric weights (100-900) and anything unrecognized pass through.
        return value

    def _normalize_color(self, value: str) -> str:
        """Normalize a CSS color value."""
        value = value.strip().lower()

        # rgb()/rgba() values are kept verbatim.
        if value.startswith(('rgb(', 'rgba(')):
            return value

        if value.startswith('#'):
            # Expand shorthand #abc to #aabbcc.
            if len(value) == 4:
                return '#' + ''.join(ch * 2 for ch in value[1:])
            return value

        # Named colors pass through unchanged.
        return value

    def _extract_background_color(self, value: str) -> Optional[str]:
        """Pull a color token out of a background shorthand value."""
        # Simple extraction - could be enhanced
        all_units = self.ABSOLUTE_UNITS | self.RELATIVE_UNITS
        for token in value.split():
            if token.startswith('#') or token.startswith('rgb'):
                return self._normalize_color(token)
            # A token carrying no length unit is assumed to be a named color.
            if not any(unit in token for unit in all_units):
                return token

        return None

    def _parse_box_property(self, style: "Style", prop_type: str, value: str):
        """Expand margin/padding shorthand onto the four sides."""
        # Keep only the parts that parse as lengths.
        lengths = [px for px in (self._parse_length(p) for p in value.split())
                   if px is not None]
        if not lengths:
            return

        # CSS shorthand semantics: 1 value = all sides,
        # 2 = vertical/horizontal, 3 = top/horizontal/bottom,
        # 4 = top/right/bottom/left.
        if len(lengths) == 1:
            top = right = bottom = left = lengths[0]
        elif len(lengths) == 2:
            top = bottom = lengths[0]
            left = right = lengths[1]
        elif len(lengths) == 3:
            top, bottom = lengths[0], lengths[2]
            left = right = lengths[1]
        else:
            top, right, bottom, left = lengths[:4]

        setattr(style, f'{prop_type}_top', top)
        setattr(style, f'{prop_type}_right', right)
        setattr(style, f'{prop_type}_bottom', bottom)
        setattr(style, f'{prop_type}_left', left)

    def merge_styles(self, base: "Style", override: "Style") -> "Style":
        """
        Merge two styles with override taking precedence.

        Args:
            base: Base style
            override: Override style

        Returns:
            Merged style
        """
        return base.merge(override)
|
||||
@@ -0,0 +1,637 @@
|
||||
"""
|
||||
Advanced table processing strategy.
|
||||
"""
|
||||
|
||||
import re
|
||||
from functools import lru_cache
|
||||
from typing import List, Optional
|
||||
|
||||
from lxml.html import HtmlElement
|
||||
|
||||
from edgar.documents.config import ParserConfig
|
||||
from edgar.documents.strategies.style_parser import StyleParser
|
||||
from edgar.documents.table_nodes import TableNode, Cell, Row
|
||||
from edgar.documents.types import TableType
|
||||
|
||||
|
||||
class TableProcessor:
|
||||
"""
|
||||
Advanced table processing with type detection and structure analysis.
|
||||
"""
|
||||
|
||||
# HTML entities that need replacement
|
||||
ENTITY_REPLACEMENTS = {
|
||||
'―': '-----',
|
||||
'—': '-----',
|
||||
'–': '---',
|
||||
'−': '-',
|
||||
'‐': '-',
|
||||
'‐': '-',
|
||||
' ': ' ',
|
||||
'&': '&',
|
||||
'<': '<',
|
||||
'>': '>',
|
||||
'"': '"',
|
||||
''': "'",
|
||||
' ': ' ',
|
||||
'​': '',
|
||||
'—': '-----',
|
||||
'–': '---',
|
||||
'−': '-',
|
||||
}
|
||||
|
||||
# Financial keywords for table type detection
|
||||
FINANCIAL_KEYWORDS = {
|
||||
'revenue', 'income', 'expense', 'asset', 'liability',
|
||||
'cash', 'equity', 'profit', 'loss', 'margin',
|
||||
'earnings', 'cost', 'sales', 'operating', 'net',
|
||||
'gross', 'total', 'balance', 'statement', 'consolidated',
|
||||
'provision', 'tax', 'taxes', 'compensation', 'stock',
|
||||
'share', 'shares', 'rsu', 'option', 'grant', 'vest'
|
||||
}
|
||||
|
||||
# Metrics keywords
|
||||
METRICS_KEYWORDS = {
|
||||
'ratio', 'percentage', 'percent', '%', 'rate',
|
||||
'growth', 'change', 'increase', 'decrease',
|
||||
'average', 'median', 'total', 'count', 'number'
|
||||
}
|
||||
|
||||
    def __init__(self, config: ParserConfig):
        """
        Initialize table processor.

        Args:
            config: Parser configuration; controls optional table-type
                detection and relationship extraction in process().
        """
        self.config = config
        # Shared parser for inline CSS on tables and cells.
        self.style_parser = StyleParser()
|
||||
|
||||
def process(self, element: HtmlElement) -> TableNode:
|
||||
"""
|
||||
Process table element into TableNode.
|
||||
|
||||
Args:
|
||||
element: HTML table element
|
||||
|
||||
Returns:
|
||||
Processed TableNode
|
||||
"""
|
||||
# Extract table metadata
|
||||
table_id = element.get('id')
|
||||
table_class = element.get('class', '').split()
|
||||
table_style = self.style_parser.parse(element.get('style', ''))
|
||||
|
||||
# Create table node
|
||||
table = TableNode(style=table_style)
|
||||
|
||||
# Set config for rendering decisions
|
||||
table._config = self.config
|
||||
|
||||
# Add metadata
|
||||
if table_id:
|
||||
table.set_metadata('id', table_id)
|
||||
if table_class:
|
||||
table.set_metadata('classes', table_class)
|
||||
|
||||
# Extract caption
|
||||
caption_elem = element.find('.//caption')
|
||||
if caption_elem is not None:
|
||||
table.caption = self._extract_text(caption_elem)
|
||||
|
||||
# Extract summary
|
||||
summary = element.get('summary')
|
||||
if summary:
|
||||
table.summary = summary
|
||||
|
||||
# Process table structure
|
||||
self._process_table_structure(element, table)
|
||||
|
||||
# Detect table type if configured
|
||||
if self.config.detect_table_types:
|
||||
table.table_type = self._detect_table_type(table)
|
||||
|
||||
# Extract relationships if configured
|
||||
if self.config.extract_table_relationships:
|
||||
self._extract_relationships(table)
|
||||
|
||||
return table
|
||||
|
||||
    def _process_table_structure(self, element: HtmlElement, table: TableNode):
        """
        Process table structure (thead, tbody, tfoot).

        Populates table.headers, table.rows and table.footer in place.
        Rows outside an explicit <thead> can still be classified as header
        rows via heuristics, because SEC filings frequently encode
        multi-row headers (period labels, years, unit notes) as plain
        <tr>/<td> rows at the top of the table.
        """
        # Process thead
        thead = element.find('.//thead')
        if thead is not None:
            for tr in thead.findall('.//tr'):
                cells = self._process_row(tr, is_header=True)
                if cells:
                    table.headers.append(cells)

        # Process tbody (or direct rows)
        tbody = element.find('.//tbody')
        rows_container = tbody if tbody is not None else element

        # Track if we've seen headers and data rows
        headers_found = bool(table.headers)
        # NOTE(review): consecutive_header_rows is written below but never
        # read anywhere in this method - looks like leftover bookkeeping.
        consecutive_header_rows = 0
        data_rows_started = False

        for tr in rows_container.findall('.//tr'):
            # Skip if already processed in thead
            if thead is not None and tr.getparent() == thead:
                continue

            # Check if this might be a header row
            is_header_row = False

            # Continue checking for headers if:
            # 1. We haven't found any headers yet, OR
            # 2. We've found headers but haven't seen data rows yet (multi-row headers)
            if not data_rows_started:
                is_header_row = self._is_header_row(tr)

                # Additional check for multi-row headers in financial tables
                # If the previous row was a header and this row has years or units,
                # it's likely part of the header
                if headers_found and not is_header_row:
                    row_text = tr.text_content().strip()
                    # Check for units like "(in millions)" or "(in thousands)"
                    if '(in millions)' in row_text or '(in thousands)' in row_text or '(in billions)' in row_text:
                        is_header_row = True
                    # Check for year rows that follow "Year Ended" headers
                    elif len(table.headers) > 0:
                        last_header_text = ' '.join(cell.text() for cell in table.headers[-1])
                        if 'year ended' in last_header_text.lower() or 'years ended' in last_header_text.lower():
                            # Check if this row has years
                            year_pattern = r'\b(19\d{2}|20\d{2})\b'
                            years_found = re.findall(year_pattern, row_text)
                            if years_found:
                                is_header_row = True

            cells = self._process_row(tr, is_header=is_header_row)
            if cells:
                if is_header_row:
                    table.headers.append(cells)
                    headers_found = True
                    consecutive_header_rows += 1
                else:
                    # Only mark data_rows_started if this row has actual content
                    # Empty rows at the beginning shouldn't stop header detection
                    row = Row(cells=cells, is_header=False)
                    table.rows.append(row)

                    # Check if row has significant content that indicates data rows have started
                    # But be smart about it - descriptive rows like "(in millions)" or pure spacing
                    # shouldn't stop header detection
                    has_content = any(cell.text().strip() for cell in cells)
                    if has_content:
                        # Get the row text for smarter analysis
                        row_text = ' '.join(cell.text().strip() for cell in cells).strip()
                        row_text_lower = row_text.lower()

                        # Don't consider this as "data started" if it's likely a header-related row
                        is_header_related = (
                            # Unit descriptions
                            '(in millions)' in row_text_lower or
                            '(in thousands)' in row_text_lower or
                            '(in billions)' in row_text_lower or
                            'except per share' in row_text_lower or
                            # Financial period descriptions
                            'year ended' in row_text_lower or
                            'months ended' in row_text_lower or
                            # Mostly just spacing/formatting
                            len(row_text.strip()) < 5 or
                            # Contains years (might be misclassified header)
                            bool(re.search(r'\b(19\d{2}|20\d{2})\b', row_text))
                        )

                        # Only mark data_rows_started if this seems like actual data, not header-related
                        if not is_header_related:
                            data_rows_started = True

                    consecutive_header_rows = 0

        # Process tfoot
        tfoot = element.find('.//tfoot')
        if tfoot is not None:
            for tr in tfoot.findall('.//tr'):
                cells = self._process_row(tr, is_header=False)
                if cells:
                    row = Row(cells=cells, is_header=False)
                    table.footer.append(row)
|
||||
|
||||
def _process_row(self, tr: HtmlElement, is_header: bool) -> List[Cell]:
|
||||
"""Process table row into cells."""
|
||||
cells = []
|
||||
|
||||
# Process both td and th elements
|
||||
for cell_elem in tr.findall('.//td') + tr.findall('.//th'):
|
||||
cell = self._process_cell(cell_elem, is_header or cell_elem.tag == 'th')
|
||||
if cell:
|
||||
cells.append(cell)
|
||||
|
||||
return cells
|
||||
|
||||
def _process_cell(self, elem: HtmlElement, is_header: bool) -> Optional[Cell]:
|
||||
"""Process table cell."""
|
||||
# Extract cell properties
|
||||
colspan = int(elem.get('colspan', '1'))
|
||||
rowspan = int(elem.get('rowspan', '1'))
|
||||
align = elem.get('align')
|
||||
|
||||
# Extract style
|
||||
style = self.style_parser.parse(elem.get('style', ''))
|
||||
if style.text_align:
|
||||
align = style.text_align
|
||||
|
||||
# Extract content
|
||||
content = self._extract_cell_content(elem)
|
||||
|
||||
# Create cell
|
||||
cell = Cell(
|
||||
content=content,
|
||||
colspan=colspan,
|
||||
rowspan=rowspan,
|
||||
is_header=is_header,
|
||||
align=align
|
||||
)
|
||||
|
||||
return cell
|
||||
|
||||
def _extract_cell_content(self, elem: HtmlElement) -> str:
|
||||
"""Extract and clean cell content."""
|
||||
# Check for nested structure
|
||||
divs = elem.findall('.//div')
|
||||
if divs and len(divs) > 1:
|
||||
# Multiple divs - likely multi-line content
|
||||
lines = []
|
||||
for div in divs:
|
||||
text = self._extract_text(div)
|
||||
if text:
|
||||
lines.append(text)
|
||||
return '\n'.join(lines)
|
||||
|
||||
# Handle line breaks
|
||||
for br in elem.findall('.//br'):
|
||||
br.tail = '\n' + (br.tail or '')
|
||||
|
||||
# Extract text
|
||||
text = self._extract_text(elem)
|
||||
|
||||
return text
|
||||
|
||||
def _extract_text(self, elem: HtmlElement) -> str:
|
||||
"""Extract and clean text from element."""
|
||||
# Use itertext() to get all text fragments
|
||||
# This preserves spaces better than text_content()
|
||||
text_parts = []
|
||||
for text in elem.itertext():
|
||||
if text:
|
||||
text_parts.append(text)
|
||||
|
||||
# Join parts, ensuring we don't lose spaces
|
||||
# If a part doesn't end with whitespace and the next doesn't start with whitespace,
|
||||
# we need to add a space between them
|
||||
if not text_parts:
|
||||
return ''
|
||||
|
||||
result = []
|
||||
for i, part in enumerate(text_parts):
|
||||
if i == 0:
|
||||
result.append(part)
|
||||
else:
|
||||
prev_part = text_parts[i-1]
|
||||
# Check if we need to add a space between parts
|
||||
# Don't add space if previous ends with space or current starts with space
|
||||
if prev_part and part:
|
||||
if not prev_part[-1].isspace() and not part[0].isspace():
|
||||
# Check for punctuation that shouldn't have space before it
|
||||
if part[0] not in ',.;:!?%)]':
|
||||
result.append(' ')
|
||||
result.append(part)
|
||||
|
||||
text = ''.join(result)
|
||||
|
||||
# Replace entities
|
||||
for entity, replacement in self.ENTITY_REPLACEMENTS.items():
|
||||
text = text.replace(entity, replacement)
|
||||
|
||||
# Clean whitespace
|
||||
text = text.strip()
|
||||
|
||||
# Normalize internal whitespace but preserve line breaks
|
||||
lines = text.split('\n')
|
||||
cleaned_lines = []
|
||||
for line in lines:
|
||||
# Collapse multiple spaces to single space
|
||||
line = ' '.join(line.split())
|
||||
cleaned_lines.append(line)
|
||||
|
||||
return '\n'.join(cleaned_lines)
|
||||
|
||||
@staticmethod
|
||||
@lru_cache(maxsize=1)
|
||||
def _get_period_header_pattern():
|
||||
"""
|
||||
Compile comprehensive regex for financial period headers.
|
||||
Adapted from old parser's proven patterns.
|
||||
|
||||
Returns:
|
||||
Compiled regex pattern matching financial period headers
|
||||
"""
|
||||
# Base components
|
||||
periods = r'(?:three|six|nine|twelve|[1-4]|first|second|third|fourth)'
|
||||
timeframes = r'(?:month|quarter|year|week)'
|
||||
ended_variants = r'(?:ended|ending|end|period)'
|
||||
as_of_variants = r'(?:as\s+of|at|as\s+at)'
|
||||
|
||||
# Date pattern
|
||||
months = r'(?:january|february|march|april|may|june|july|august|september|october|november|december|jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)'
|
||||
day = r'\d{1,2}'
|
||||
year = r'(?:19|20)\d{2}'
|
||||
date = f'{months}\\s*\\.?\\s*{day}\\s*,?\\s*{year}'
|
||||
|
||||
# Combined patterns
|
||||
patterns = [
|
||||
# Standard period headers
|
||||
f'{periods}\\s+{timeframes}\\s+{ended_variants}(?:\\s+{date})?',
|
||||
f'(?:fiscal\\s+)?{timeframes}\\s+{ended_variants}',
|
||||
f'{timeframes}\\s+{ended_variants}(?:\\s+{date})?',
|
||||
|
||||
# Balance sheet date headers
|
||||
f'{as_of_variants}\\s+{date}',
|
||||
|
||||
# Multiple date sequences
|
||||
f'{date}(?:\\s*(?:and|,)\\s*{date})*',
|
||||
|
||||
# Single dates
|
||||
f'(?:{ended_variants}\\s+)?{date}'
|
||||
]
|
||||
|
||||
pattern = '|'.join(f'(?:{p})' for p in patterns)
|
||||
return re.compile(pattern, re.IGNORECASE)
|
||||
|
||||
def _is_header_row(self, tr: HtmlElement) -> bool:
    """
    Detect if row is likely a header row in SEC filings.

    Applies a cascade of heuristics in priority order: <th> presence,
    date-range/financial-data exclusions, multi-year patterns, period
    phrases, units notation, bold formatting, and text/number cell ratio.
    The ordering is significant: earlier, more reliable checks short-circuit
    the later, weaker ones.

    Args:
        tr: The <tr> element to classify.

    Returns:
        True if the row looks like a header row, False if it looks like data.
    """
    # Check if contains th elements (most reliable indicator)
    if tr.find('.//th') is not None:
        return True

    cells = tr.findall('.//td')
    if not cells:
        # No td cells either — nothing to classify as a header.
        return False

    # Get row text for analysis
    row_text = tr.text_content()
    row_text_lower = row_text.lower()

    # Check for date ranges with financial data (Oracle Table 6 pattern)
    # Date ranges like "March 1, 2024—March 31, 2024" should be data rows, not headers
    # (the character class covers em dash, en dash and hyphen separators)
    date_range_pattern = r'(?:january|february|march|april|may|june|july|august|september|october|november|december)\s+\d{1,2},\s*\d{4}\s*[—–-]\s*(?:january|february|march|april|may|june|july|august|september|october|november|december)\s+\d{1,2},\s*\d{4}'
    has_date_range = bool(re.search(date_range_pattern, row_text_lower))

    # Check for financial data indicators
    has_currency = bool(re.search(r'\$[\s]*[\d,\.]+', row_text))
    has_decimals = bool(re.search(r'\b\d+\.\d+\b', row_text))
    has_large_numbers = bool(re.search(r'\b\d{1,3}(,\d{3})+\b', row_text))

    # If row has date range + financial data, it's definitely a data row
    if has_date_range and (has_currency or has_decimals or has_large_numbers):
        return False

    # Check for year patterns (very common in financial headers)
    year_pattern = r'\b(19\d{2}|20\d{2})\b'
    years_found = re.findall(year_pattern, row_text)
    if len(years_found) >= 2:  # Multiple years suggest header row
        # IMPORTANT: Check for date ranges and same-year repetition
        # Date ranges like "March 1, 2024—March 31, 2024" contain the same year twice
        # but are data rows, not multi-year comparison headers

        # If all years are the same (date range pattern)
        if len(set(years_found)) == 1:
            # Same year repeated - likely a date range like "Jan 1, 2024 - Mar 31, 2024"
            # Not a multi-year comparison header
            pass  # Don't return True
        # Multiple different years suggest multi-year comparison header
        elif 'total' not in row_text_lower[:20]:  # Check first 20 chars
            return True

    # Enhanced year detection - check individual cells for year patterns
    # This handles cases where years are in separate cells
    year_cells = 0
    date_phrases = 0
    for cell in cells:
        cell_text = cell.text_content().strip()
        if cell_text:
            # Check for individual years (a cell that is nothing but a year)
            if re.match(r'^\s*(19\d{2}|20\d{2})\s*$', cell_text):
                year_cells += 1
            # Check for date phrases like "June 30, 2025"
            # NOTE(review): only quarter-end/year-end dates are checked here —
            # presumably the common fiscal boundaries; other dates fall through.
            elif 'june 30' in cell_text.lower() or 'december 31' in cell_text.lower():
                date_phrases += 1

    # If we have multiple year cells or year + date phrases, likely a header
    if year_cells >= 2 or (year_cells >= 1 and date_phrases >= 1):
        if 'total' not in row_text_lower[:20]:
            return True

    # Check for comprehensive financial period patterns (from old parser)
    period_pattern = self._get_period_header_pattern()
    if period_pattern.search(row_text_lower):
        # Additional validation: ensure it's not a data row with period text
        # Check for absence of strong data indicators
        data_pattern = r'(?:\$\s*\d|\d+(?:,\d{3})+|\d+\s*[+\-*/]\s*\d+|\(\s*\d+(?:,\d{3})*\s*\))'
        if not re.search(data_pattern, row_text):
            return True

    # Check for units notation (in millions, thousands, billions)
    units_pattern = r'\(in\s+(?:millions|thousands|billions)\)'
    if re.search(units_pattern, row_text_lower):
        return True

    # Check for period indicators (quarters, months)
    # But be careful with "fiscal" - it could be data like "Fiscal 2025"
    period_keywords = ['quarter', 'q1', 'q2', 'q3', 'q4', 'month',
                       'january', 'february', 'march', 'april', 'may', 'june',
                       'july', 'august', 'september', 'october', 'november', 'december',
                       'ended', 'three months', 'six months', 'nine months']

    # Special handling for "fiscal" - only treat as header if it's part of a phrase like "fiscal year ended"
    if 'fiscal' in row_text_lower:
        # Check if row has numeric values (suggests it's data, not header)
        # Look for patterns like "Fiscal 2025 $10,612"
        has_currency_values = bool(re.search(r'\$[\s]*[\d,]+', row_text))
        has_large_numbers = bool(re.search(r'\b\d{1,3}(,\d{3})+\b', row_text))

        # If it has currency or large numbers, it's likely data
        if has_currency_values or has_large_numbers:
            return False

        # Check if it's just "Fiscal YYYY" which is likely data, not a header
        fiscal_year_only = re.match(r'^\s*fiscal\s+\d{4}\s*$', row_text_lower.strip())
        if fiscal_year_only:
            return False  # This is data, not a header

        # Check for header-like phrases with fiscal
        if 'fiscal year' in row_text_lower and ('ended' in row_text_lower or 'ending' in row_text_lower):
            return True

    if any(keyword in row_text_lower for keyword in period_keywords):
        # Validate it's not a data row with period keywords
        # Check for strong data indicators
        data_pattern = r'(?:\$\s*\d|\d+(?:,\d{3})+|\d+\.\d+|[(]\s*\d+(?:,\d{3})*\s*[)])'
        if not re.search(data_pattern, row_text):
            return True

    # Check for column descriptors (but NOT total)
    # These are words commonly found in headers but not data rows
    header_keywords = ['description', 'item', 'category', 'type', 'classification',
                       'change', 'percent', 'increase', 'decrease', 'variance']
    if any(keyword in row_text_lower for keyword in header_keywords):
        # Make sure it's not a total row
        if 'total' not in row_text_lower[:30]:
            # Additional validation: long narrative text is not a header
            # Headers are typically concise (< 150 chars)
            if len(row_text) > 150:
                return False
            # Check for data indicators (would indicate data row, not header)
            data_pattern = r'(?:\$\s*\d|\d+(?:,\d{3})+|\d+\.\d+|[(]\s*\d+(?:,\d{3})*\s*[)])'
            if re.search(data_pattern, row_text):
                return False
            return True

    # Check if all cells are bold (common header formatting)
    bold_count = 0
    for cell in cells:
        style = cell.get('style', '')
        if 'font-weight' in style and 'bold' in style:
            bold_count += 1
        elif cell.find('.//b') is not None or cell.find('.//strong') is not None:
            bold_count += 1

    # Only consider it a header if ALL cells are bold (not just some)
    if bold_count == len(cells) and bold_count > 0:
        return True

    # Check content type ratio - headers usually have more text than numbers
    # Count cells with primarily text vs primarily numbers
    text_cells = 0
    number_cells = 0
    for cell in cells:
        cell_text = cell.text_content().strip()
        if cell_text:
            # Remove common symbols for analysis
            clean_text = cell_text.replace('$', '').replace('%', '').replace(',', '').replace('(', '').replace(')', '')
            if clean_text.replace('.', '').replace('-', '').strip().isdigit():
                number_cells += 1
            else:
                text_cells += 1

    # Be very careful about treating text-heavy rows as headers
    # Many data rows start with text labels (e.g., "Impact of...", "Effect of...")
    # Only consider it a header if it has mostly text AND doesn't look like a data label
    if text_cells > number_cells * 2 and text_cells >= 3:
        # Check for common data row patterns
        data_row_indicators = [
            'impact of', 'effect of', 'adjustment', 'provision for', 'benefit',
            'expense', 'income from', 'loss on', 'gain on', 'charge', 'credit',
            'earnings', 'computed', 'state taxes', 'research', 'excess tax'
        ]

        # If it starts with any of these, it's likely a data row, not a header
        for indicator in data_row_indicators:
            if row_text_lower.startswith(indicator) or indicator in row_text_lower[:50]:
                return False

        # Also not a header if it starts with "total"
        if not row_text_lower.startswith('total'):
            return True

    return False
|
||||
|
||||
def _detect_table_type(self, table: TableNode) -> TableType:
    """
    Classify a table by inspecting its caption, header cells and the
    first few data rows. Checks run in priority order: financial,
    metrics, table-of-contents, exhibit index, reference, general.
    """
    # Gather lower-cased text samples: caption, then headers, then the
    # first three body rows.
    samples = []
    if table.caption:
        samples.append(table.caption.lower())
    samples.extend(
        cell.text().lower()
        for header_row in table.headers
        for cell in header_row
    )
    samples.extend(
        cell.text().lower()
        for row in table.rows[:3]
        for cell in row.cells
    )
    corpus = ' '.join(samples)

    # Financial statement tables: at least two financial keywords
    # (threshold lowered for better detection).
    financial_hits = sum(keyword in corpus for keyword in self.FINANCIAL_KEYWORDS)
    if financial_hits >= 2:
        return TableType.FINANCIAL

    # Metrics tables: a metrics keyword, or a substantial share of
    # numeric cells (lenient on purpose).
    metric_hits = sum(keyword in corpus for keyword in self.METRICS_KEYWORDS)
    cell_total = sum(len(row.cells) for row in table.rows)
    if cell_total > 0:
        numeric_total = sum(
            1 for row in table.rows for cell in row.cells if cell.is_numeric
        )
        if metric_hits >= 1 or numeric_total / cell_total > 0.3:
            return TableType.METRICS

    # Table of contents: "content"/"index" wording plus page numbers.
    if 'content' in corpus or 'index' in corpus:
        page_number_re = r'\b\d{1,3}\b'
        found_page_number = any(
            re.search(page_number_re, cell.text())
            for row in table.rows
            for cell in row.cells
        )
        if found_page_number:
            return TableType.TABLE_OF_CONTENTS

    # Exhibit index tables.
    if 'exhibit' in corpus:
        return TableType.EXHIBIT_INDEX

    # Reference tables (citations, definitions, glossaries).
    reference_terms = ('reference', 'definition', 'glossary', 'citation')
    if any(term in corpus for term in reference_terms):
        return TableType.REFERENCE

    return TableType.GENERAL
|
||||
|
||||
def _extract_relationships(self, table: TableNode):
    """
    Extract relationships within table data and record them as metadata.

    Currently detects total rows and indentation-based hierarchy; other
    relationship kinds (cross-references between cells, time series,
    totals that sum other rows) are noted but not yet implemented.
    """
    # Flag that relationship processing ran, even when nothing is found.
    table.set_metadata('relationships_extracted', True)

    # Indices of rows flagged as totals.
    total_indices = [
        index for index, row in enumerate(table.rows) if row.is_total_row
    ]
    if total_indices:
        table.set_metadata('total_rows', total_indices)

    # Leading whitespace in each row's first cell hints at parent/child
    # structure (indented child rows under a parent label).
    indents = [
        len(row.cells[0].text()) - len(row.cells[0].text().lstrip())
        for row in table.rows
        if row.cells
    ]
    if any(depth > 0 for depth in indents):
        table.set_metadata('has_hierarchy', True)
        table.set_metadata('indentation_levels', indents)
||||
@@ -0,0 +1,345 @@
|
||||
"""
|
||||
XBRL extraction strategy for inline XBRL documents.
|
||||
"""
|
||||
|
||||
from typing import Dict, Any, Optional
|
||||
|
||||
from lxml.html import HtmlElement
|
||||
|
||||
from edgar.documents.types import XBRLFact
|
||||
|
||||
|
||||
class XBRLExtractor:
    """
    Extracts XBRL facts from inline XBRL (iXBRL) documents.

    Handles:
    - ix:nonFraction, ix:nonNumeric facts
    - Context and unit resolution
    - Continuation handling
    - Transformation rules

    NOTE: externally-defined types (HtmlElement, XBRLFact) are referenced
    via string annotations so that defining this class does not require
    them to be importable.
    """

    # XBRL namespaces used for xpath/find lookups.
    NAMESPACES = {
        'ix': 'http://www.xbrl.org/2013/inlineXBRL',
        'xbrli': 'http://www.xbrl.org/2003/instance',
        'xbrldi': 'http://xbrl.org/2006/xbrldi',
        'xsi': 'http://www.w3.org/2001/XMLSchema-instance'
    }

    # Common transformation formats (subset of the ixt transform registry).
    # Each maps a raw textual fact value to its normalized form.
    TRANSFORMATIONS = {
        'ixt:numdotdecimal': lambda x: x.replace(',', ''),
        # Swap comma/dot roles via a temporary placeholder.
        'ixt:numcommadecimal': lambda x: x.replace('.', '_').replace(',', '.').replace('_', ','),
        'ixt:zerodash': lambda x: '0' if x == '-' else x,
        'ixt:datedoteu': lambda x: x.replace('.', '-'),
        'ixt:datedotus': lambda x: x.replace('.', '/'),
    }

    def __init__(self):
        """Initialize XBRL extractor."""
        # context id -> extracted context data (entity, period, dimensions)
        self.contexts: Dict[str, Dict[str, Any]] = {}
        # unit id -> normalized unit string (e.g. 'USD', 'USD/shares')
        self.units: Dict[str, str] = {}
        # continuation id -> metadata of the chain it continues
        self.continuations: Dict[str, str] = {}
        # Contexts/units are scanned lazily on first XBRL element seen.
        self._initialized = False

    def extract_context(self, element: "HtmlElement") -> Optional[Dict[str, Any]]:
        """
        Extract XBRL metadata from an element, if it is an ix: element.

        Args:
            element: HTML element that might contain XBRL

        Returns:
            XBRL metadata dict if found, otherwise None
        """
        # Only ix:-namespaced elements carry inline XBRL facts.
        if not self._is_xbrl_element(element):
            return None

        # Lazily scan the whole document for contexts/units once.
        if not self._initialized:
            self._initialize_context(element)

        # Dispatch on the element's local tag name.
        tag_name = self._get_local_name(element.tag)

        if tag_name == 'nonfraction':
            return self._extract_nonfraction(element)
        elif tag_name == 'nonnumeric':
            return self._extract_nonnumeric(element)
        elif tag_name == 'continuation':
            return self._extract_continuation(element)
        elif tag_name == 'footnote':
            return self._extract_footnote(element)
        elif tag_name == 'fraction':
            return self._extract_fraction(element)

        # Other ix: elements (e.g. header/hidden) are not extracted here.
        return None

    def extract_fact(self, element: "HtmlElement") -> Optional["XBRLFact"]:
        """
        Extract a complete XBRL fact (value + resolved context/unit)
        from an element, or None if it is not a fact element.
        """
        context = self.extract_context(element)
        if not context:
            return None

        # Get transformed/scaled/signed fact value.
        value = self._get_fact_value(element)

        # Create fact
        fact = XBRLFact(
            concept=context.get('name', ''),
            value=value,
            context_ref=context.get('contextRef'),
            unit_ref=context.get('unitRef'),
            decimals=context.get('decimals'),
            scale=context.get('scale'),
            format=context.get('format'),
            sign=context.get('sign')
        )

        # Resolve id references against the previously scanned tables.
        if fact.context_ref and fact.context_ref in self.contexts:
            fact.context = self.contexts[fact.context_ref]

        if fact.unit_ref and fact.unit_ref in self.units:
            fact.unit = self.units[fact.unit_ref]

        return fact

    def _is_xbrl_element(self, element: "HtmlElement") -> bool:
        """Check if element is an inline-XBRL (ix:) element."""
        tag = element.tag
        # Comment/PI nodes have non-string tags.
        if not isinstance(tag, str):
            return False

        # Accept both fully-qualified ({namespace}local) and prefixed
        # (ix:, any case) tag forms.
        return (
            tag.startswith('{' + self.NAMESPACES['ix'] + '}') or
            tag.lower().startswith('ix:')
        )

    def _get_local_name(self, tag: str) -> str:
        """Get lower-cased local name from a qualified tag."""
        if '}' in tag:
            # '{namespace}local' form
            return tag.split('}')[1].lower()
        elif ':' in tag:
            # 'prefix:local' form
            return tag.split(':')[1].lower()
        return tag.lower()

    def _initialize_context(self, element: "HtmlElement"):
        """Scan the document once for context and unit definitions."""
        # Walk up to the document root from any element.
        root = element.getroottree().getroot()

        # Populate self.contexts and self.units.
        self._extract_contexts(root)
        self._extract_units(root)

        self._initialized = True

    def _extract_contexts(self, root: "HtmlElement"):
        """Extract all xbrli:context definitions into self.contexts."""
        for context in root.xpath('//xbrli:context', namespaces=self.NAMESPACES):
            context_id = context.get('id')
            if not context_id:
                # A context without an id cannot be referenced; skip it.
                continue

            context_data = {
                'id': context_id
            }

            # Entity (reporting company identifier + scheme, e.g. CIK).
            entity = context.find('.//xbrli:entity', namespaces=self.NAMESPACES)
            if entity is not None:
                identifier = entity.find('.//xbrli:identifier', namespaces=self.NAMESPACES)
                if identifier is not None:
                    context_data['entity'] = identifier.text
                    context_data['scheme'] = identifier.get('scheme')

            # Period: either a single instant or a start/end duration.
            period = context.find('.//xbrli:period', namespaces=self.NAMESPACES)
            if period is not None:
                instant = period.find('.//xbrli:instant', namespaces=self.NAMESPACES)
                if instant is not None:
                    context_data['instant'] = instant.text
                    context_data['period_type'] = 'instant'
                else:
                    start = period.find('.//xbrli:startDate', namespaces=self.NAMESPACES)
                    end = period.find('.//xbrli:endDate', namespaces=self.NAMESPACES)
                    if start is not None and end is not None:
                        context_data['start_date'] = start.text
                        context_data['end_date'] = end.text
                        context_data['period_type'] = 'duration'

            # Dimensions (segment explicit members, e.g. business segments).
            segment = context.find('.//xbrli:segment', namespaces=self.NAMESPACES)
            if segment is not None:
                dimensions = {}
                for member in segment.findall('.//xbrldi:explicitMember', namespaces=self.NAMESPACES):
                    dim = member.get('dimension')
                    if dim:
                        dimensions[dim] = member.text
                if dimensions:
                    context_data['dimensions'] = dimensions

            self.contexts[context_id] = context_data

    def _extract_units(self, root: "HtmlElement"):
        """Extract all xbrli:unit definitions into self.units."""
        for unit in root.xpath('//xbrli:unit', namespaces=self.NAMESPACES):
            unit_id = unit.get('id')
            if not unit_id:
                continue

            # Simple unit: a single measure like iso4217:USD.
            measure = unit.find('.//xbrli:measure', namespaces=self.NAMESPACES)
            if measure is not None:
                self.units[unit_id] = self._normalize_unit(measure.text)
                continue

            # Complex unit: a divide, e.g. USD per share.
            divide = unit.find('.//xbrli:divide', namespaces=self.NAMESPACES)
            if divide is not None:
                numerator = divide.find('.//xbrli:unitNumerator/xbrli:measure', namespaces=self.NAMESPACES)
                denominator = divide.find('.//xbrli:unitDenominator/xbrli:measure', namespaces=self.NAMESPACES)

                if numerator is not None and denominator is not None:
                    num_unit = self._normalize_unit(numerator.text)
                    den_unit = self._normalize_unit(denominator.text)
                    self.units[unit_id] = f"{num_unit}/{den_unit}"

    def _normalize_unit(self, unit_text: str) -> str:
        """Normalize unit text (strip namespace prefix, map common names)."""
        if not unit_text:
            return ''

        # Remove namespace prefix, e.g. 'iso4217:USD' -> 'USD'.
        if ':' in unit_text:
            unit_text = unit_text.split(':')[-1]

        # Common normalizations; unrecognized units pass through unchanged.
        unit_map = {
            'usd': 'USD',
            'shares': 'shares',
            'pure': 'pure',
            'percent': '%'
        }

        return unit_map.get(unit_text.lower(), unit_text)

    def _extract_nonfraction(self, element: "HtmlElement") -> Dict[str, Any]:
        """Extract attributes of an ix:nonFraction (numeric fact) element."""
        metadata = {
            'type': 'nonFraction',
            'name': element.get('name'),
            # HTML parsing may lower-case attribute names, so check both
            # camelCase and lower-case spellings.
            'contextRef': element.get('contextRef') or element.get('contextref'),
            'unitRef': element.get('unitRef') or element.get('unitref'),
            'decimals': element.get('decimals'),
            'scale': element.get('scale'),
            'format': element.get('format'),
            'sign': element.get('sign')
        }

        # Drop attributes that were absent.
        return {k: v for k, v in metadata.items() if v is not None}

    def _extract_nonnumeric(self, element: "HtmlElement") -> Dict[str, Any]:
        """Extract attributes of an ix:nonNumeric (textual fact) element."""
        metadata = {
            'type': 'nonNumeric',
            'name': element.get('name'),
            'contextRef': element.get('contextRef') or element.get('contextref'),
            'format': element.get('format')
        }

        # Drop attributes that were absent.
        return {k: v for k, v in metadata.items() if v is not None}

    def _extract_continuation(self, element: "HtmlElement") -> Dict[str, Any]:
        """
        Extract an ix:continuation element, chaining it back to the
        metadata of the fact it continues when already known.
        """
        cont_id = element.get('id')
        continued_at = element.get('continuedAt')

        if cont_id and continued_at:
            if continued_at in self.continuations:
                # The target is known: link this id to the same original
                # metadata so later continuations resolve transitively.
                original = self.continuations[continued_at]
                self.continuations[cont_id] = original
                return original
            else:
                # Target not seen yet: store for later resolution.
                metadata = {
                    'type': 'continuation',
                    'id': cont_id,
                    'continuedAt': continued_at
                }
                self.continuations[cont_id] = metadata
                return metadata

        return {}

    def _extract_footnote(self, element: "HtmlElement") -> Dict[str, Any]:
        """Extract attributes of an ix:footnote element."""
        return {
            'type': 'footnote',
            'footnoteRole': element.get('footnoteRole'),
            'footnoteID': element.get('footnoteID')
        }

    def _extract_fraction(self, element: "HtmlElement") -> Dict[str, Any]:
        """Extract an ix:fraction element with numerator/denominator."""
        metadata = {
            'type': 'fraction',
            'name': element.get('name'),
            'contextRef': element.get('contextRef'),
            'unitRef': element.get('unitRef')
        }

        # Numerator and denominator are child ix: elements.
        numerator = element.find('.//ix:numerator', namespaces=self.NAMESPACES)
        denominator = element.find('.//ix:denominator', namespaces=self.NAMESPACES)

        if numerator is not None:
            metadata['numerator'] = numerator.text
        if denominator is not None:
            metadata['denominator'] = denominator.text

        return {k: v for k, v in metadata.items() if v is not None}

    def _get_fact_value(self, element: "HtmlElement") -> str:
        """
        Get the fact's textual value with format transformation,
        scale and sign applied, stripped of surrounding whitespace.
        """
        # Raw value comes from the element text only.
        # NOTE(review): descendant/tail text is not collected here —
        # confirm facts with nested markup are handled upstream.
        value = element.text or ''

        # Apply format transformation if specified and known.
        format_attr = element.get('format')
        if format_attr and format_attr in self.TRANSFORMATIONS:
            transform = self.TRANSFORMATIONS[format_attr]
            value = transform(value)

        # Apply scale (power of ten) if specified; non-numeric values
        # are left untouched rather than raising.
        scale = element.get('scale')
        if scale:
            try:
                scale_factor = int(scale)
                numeric_value = float(value.replace(',', ''))
                scaled_value = numeric_value * (10 ** scale_factor)
                value = str(scaled_value)
            except (ValueError, TypeError):
                pass

        # Apply sign if specified (sign="-" negates the reported value).
        sign = element.get('sign')
        if sign == '-':
            if value and not value.startswith('-'):
                value = '-' + value

        return value.strip()
|
||||
Reference in New Issue
Block a user