Initial commit

This commit is contained in:
kdusek
2025-12-09 12:13:01 +01:00
commit 8e654ed209
13332 changed files with 2695056 additions and 0 deletions

View File

@@ -0,0 +1,15 @@
"""
Parsing strategies for different content types.
"""
from edgar.documents.strategies.document_builder import DocumentBuilder
from edgar.documents.strategies.header_detection import HeaderDetectionStrategy
from edgar.documents.strategies.table_processing import TableProcessor
from edgar.documents.strategies.xbrl_extraction import XBRLExtractor
__all__ = [
'DocumentBuilder',
'HeaderDetectionStrategy',
'TableProcessor',
'XBRLExtractor'
]

View File

@@ -0,0 +1,670 @@
"""
Document builder that converts parsed HTML tree into document nodes.
"""
from typing import Dict, Any, Optional
from lxml.html import HtmlElement
from edgar.documents.config import ParserConfig
from edgar.documents.nodes import (
Node, DocumentNode, TextNode, ParagraphNode, HeadingNode,
ContainerNode, SectionNode, ListNode, ListItemNode, LinkNode, ImageNode
)
from edgar.documents.strategies.style_parser import StyleParser
from edgar.documents.table_nodes import TableNode, Cell, Row
from edgar.documents.types import Style, ParseContext, SemanticType
class DocumentBuilder:
    """
    Builds Document node tree from parsed HTML.

    Handles the conversion of HTML elements into structured nodes
    with proper hierarchy and metadata.
    """

    # Block-level elements: each one opens a new structural node rather
    # than contributing to the parent's inline text flow.
    BLOCK_ELEMENTS = {
        'div', 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
        'ul', 'ol', 'li', 'blockquote', 'pre', 'hr',
        'table', 'form', 'fieldset', 'address', 'section',
        'article', 'aside', 'nav', 'header', 'footer', 'main'
    }

    # Inline elements: their text is merged into the surrounding flow.
    INLINE_ELEMENTS = {
        'span', 'a', 'em', 'strong', 'b', 'i', 'u', 's',
        'small', 'mark', 'del', 'ins', 'sub', 'sup',
        'code', 'kbd', 'var', 'samp', 'abbr', 'cite',
        'q', 'time', 'font',
        # IXBRL inline elements for simple values - should not break text flow
        'ix:nonfraction', 'ix:footnote', 'ix:fraction'
    }

    # Elements whose content is dropped entirely (their tail text is
    # still emitted, see _process_element).
    SKIP_ELEMENTS = {
        'script', 'style', 'meta', 'link', 'noscript',
        # IXBRL exclude elements - content that should not appear in final document
        'ix:exclude'
    }
def __init__(self, config: ParserConfig, strategies: Dict[str, Any]):
    """
    Initialize document builder.

    Args:
        config: Parser configuration
        strategies: Dictionary of parsing strategies; keys used by this
            class are 'header_detection', 'table_processing' and
            'xbrl_extraction'
    """
    self.config = config
    self.strategies = strategies
    self.style_parser = StyleParser()
    self.context = ParseContext()
    # Stack of XBRL context dicts for the namespaced elements currently
    # being processed (pushed/popped around child processing).
    self.xbrl_context_stack = []
    # XBRL continuation bookkeeping.
    # NOTE(review): not read or written anywhere in this class — confirm
    # whether external collaborators use it before removing.
    self.xbrl_continuations = {}
def build(self, tree: HtmlElement) -> DocumentNode:
    """
    Build document from HTML tree.

    Args:
        tree: Parsed HTML tree

    Returns:
        Document root node
    """
    # Create root document node
    root = DocumentNode()
    # Prefer the <body>; fall back to the whole tree for fragments
    # without one.
    body = tree.find('.//body')
    if body is None:
        body = tree
    # Process body content
    self._process_element(body, root)
    # Apply node merging if configured
    if self.config.merge_adjacent_nodes:
        self._merge_adjacent_nodes(root)
    return root
def _process_element(self, element: HtmlElement, parent: Node) -> Optional[Node]:
    """
    Process HTML element into node.

    Recursively converts `element` (and its subtree) into document nodes
    attached to `parent`. Tail text (text following an element's closing
    tag) belongs to the parent's flow, so it is emitted even when the
    element itself is skipped.

    Args:
        element: HTML element to process
        parent: Parent node

    Returns:
        Created node or None if skipped
    """
    # BUG FIX: lxml models comments/processing instructions with a
    # non-string tag (a callable). They carry no document content, but
    # previously they fell through to helpers that call
    # element.tag.lower() and crashed; skip them while keeping the tail.
    if not isinstance(element.tag, str):
        self._emit_tail_text(element, parent)
        return None

    # Skip certain elements but preserve their tail text
    if element.tag in self.SKIP_ELEMENTS:
        self._emit_tail_text(element, parent)
        return None

    # Skip page number containers
    if self._is_page_number_container(element):
        return None
    # Skip page break elements
    if self._is_page_break_element(element):
        return None
    # Skip navigation containers that follow page breaks
    if self._is_page_navigation_container(element):
        return None

    # Track parsing depth
    self.context.depth += 1
    try:
        # Namespaced (XBRL) tags look like '{uri}local'
        if element.tag.startswith('{'):
            self._enter_xbrl_context(element)

        style = self._extract_style(element)
        node = self._create_node_for_element(element, style)

        if node:
            # Add XBRL metadata if in context
            if self.xbrl_context_stack:
                node.metadata.update(self._get_current_xbrl_metadata())
            parent.add_child(node)

            if self._should_process_children(element, node):
                # The element's direct text comes before any child.
                if element.text:
                    if self.config.preserve_whitespace:
                        node.add_child(TextNode(content=element.text))
                    elif element.text.strip():
                        node.add_child(TextNode(content=element.text.strip()))
                # Process child elements
                for child in element:
                    self._process_element(child, node)

            # Tail text goes to the parent (it follows this element's
            # closing tag); a whitespace hint is recorded on the node.
            self._emit_tail_with_hint(element, parent, node)
        else:
            # No node created: flatten children into the current parent.
            for child in element:
                self._process_element(child, parent)
            self._emit_tail_text(element, parent)

        # Exit XBRL context
        if element.tag.startswith('{'):
            self._exit_xbrl_context(element)

        return node
    finally:
        self.context.depth -= 1

def _emit_tail_text(self, element: HtmlElement, parent: Node) -> None:
    """Append element's tail text to parent, honoring whitespace config."""
    if not element.tail:
        return
    if self.config.preserve_whitespace:
        parent.add_child(TextNode(content=element.tail))
    elif element.tail.strip():
        parent.add_child(TextNode(content=element.tail.strip()))

def _emit_tail_with_hint(self, element: HtmlElement, parent: Node, node: Node) -> None:
    """
    Append element's tail text to parent; when the tail is pure
    whitespace (and whitespace is not preserved), record a hint on
    `node` so later inline-spacing decisions know a separator followed.
    """
    if not element.tail:
        return
    if self.config.preserve_whitespace:
        parent.add_child(TextNode(content=element.tail))
    elif element.tail.strip():
        parent.add_child(TextNode(content=element.tail.strip()))
    elif element.tail.isspace() and hasattr(node, 'set_metadata'):
        node.set_metadata('has_tail_whitespace', True)
def _create_node_for_element(self, element: HtmlElement, style: Style) -> Optional[Node]:
    """
    Create the appropriate node for an HTML element.

    Returns None when no node is produced; the caller then flattens the
    element's children into the current parent.
    """
    # Namespaced (XBRL) tags are kept verbatim; HTML tags are lowercased.
    tag = element.tag.lower() if not element.tag.startswith('{') else element.tag

    # Explicit heading tags win outright.
    if tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
        level = int(tag[1])
        text = self._get_element_text(element)
        if text:
            return HeadingNode(content=text, level=level, style=style)

    # Handle specific elements first before header detection
    if tag == 'p':
        return ParagraphNode(style=style)
    elif tag == 'li':
        return ListItemNode(style=style)

    # Heuristic header detection for styled text. Skipped for tags that
    # should never be treated as headers.
    skip_header_detection_tags = {
        'li', 'td', 'th', 'option', 'a', 'button', 'label',
        # IXBRL inline elements - should not be treated as headers
        'ix:nonfraction', 'ix:footnote', 'ix:fraction',
        # IXBRL elements that can contain tables and complex content.
        # NOTE(review): lxml's HTML parser normally lowercases tag names,
        # so these mixed-case entries may never match — confirm against
        # real parser output.
        'ix:nonNumeric', 'ix:continuation'
    }
    if tag not in skip_header_detection_tags and self.strategies.get('header_detection'):
        header_info = self.strategies['header_detection'].detect(element, self.context)
        if header_info and header_info.confidence > self.config.header_detection_threshold:
            text = self._get_element_text(element)
            if text:
                node = HeadingNode(
                    content=text,
                    level=header_info.level,
                    style=style
                )
                # Record how/why this was classified as a header.
                node.set_metadata('detection_method', header_info.detection_method)
                node.set_metadata('confidence', header_info.confidence)
                if header_info.is_item:
                    node.semantic_type = SemanticType.ITEM_HEADER
                    node.set_metadata('item_number', header_info.item_number)
                return node

    # Continue handling other specific elements
    if tag == 'table':
        if self.strategies.get('table_processing'):
            return self.strategies['table_processing'].process(element)
        else:
            return self._process_table_basic(element, style)
    elif tag in ['ul', 'ol']:
        return ListNode(ordered=(tag == 'ol'), style=style)
    # ('li' is handled above; the original had an unreachable duplicate
    # branch here, removed.)
    elif tag == 'a':
        href = element.get('href', '')
        title = element.get('title', '')
        text = self._get_element_text(element)
        return LinkNode(content=text, href=href, title=title, style=style)
    elif tag == 'img':
        return ImageNode(
            src=element.get('src'),
            alt=element.get('alt'),
            width=self._parse_dimension(element.get('width')),
            height=self._parse_dimension(element.get('height')),
            style=style
        )
    elif tag == 'br':
        # Line break - represented as a newline text node
        return TextNode(content='\n')
    elif tag in ['section', 'article']:
        return SectionNode(style=style)
    elif tag == 'div' or tag in self.BLOCK_ELEMENTS:
        # CSS display can turn a block element into inline flow.
        if style.display in ['inline', 'inline-block']:
            text = self._get_element_text(element)
            if text:
                text_node = TextNode(content=text, style=style)
                text_node.set_metadata('original_tag', tag)
                text_node.set_metadata('inline_via_css', True)
                return text_node
            # If no text but inline, still process children inline
            return ContainerNode(tag_name=tag, style=style)
        # Normal block behavior: divs holding only inline content become
        # paragraphs so spans etc. concatenate into one text flow.
        if self._is_text_only_container(element):
            return ParagraphNode(style=style)
        else:
            return ContainerNode(tag_name=tag, style=style)
    elif tag in self.INLINE_ELEMENTS:
        # Inline elements contribute their text directly; with no text
        # they fall through to the default container below.
        text = self._get_element_text(element)
        if text:
            text_node = TextNode(content=text, style=style)
            # Preserve inline element metadata
            text_node.set_metadata('original_tag', tag)
            return text_node
    elif tag in ['ix:nonNumeric', 'ix:continuation']:
        # IXBRL elements that can contain complex content including tables;
        # process as container to allow proper table parsing.
        return ContainerNode(tag_name=tag, style=style)

    # Default: create container for unknown elements
    return ContainerNode(tag_name=tag, style=style)
def _is_page_number_container(self, element: HtmlElement) -> bool:
"""Detect and filter page number containers across various SEC filing patterns."""
import re
# Get text content first - all page numbers should be short
text_content = element.text_content().strip()
# Must be short content (1-8 chars to handle "Page X" format)
if len(text_content) > 8 or len(text_content) == 0:
return False
# Must be numeric, roman numerals, or "Page X" format
if not self._is_page_number_content(text_content):
return False
# Check various patterns based on element type and styling
tag = element.tag.lower()
# Pattern 1: Oracle-style flexbox containers (highest confidence)
if tag == 'div' and self._is_flexbox_page_number(element):
return True
# Pattern 2: Center/right aligned paragraphs (common pattern)
if tag == 'p' and self._is_aligned_page_number(element):
return True
# Pattern 3: Footer-style divs with centered page numbers
if tag == 'div' and self._is_footer_page_number(element):
return True
# Pattern 4: Simple divs with page break context
if tag == 'div' and self._is_page_break_context(element):
return True
return False
def _is_page_number_content(self, text: str) -> bool:
"""Check if text content looks like a page number."""
import re
# Simple numeric (most common)
if text.isdigit():
return True
# Roman numerals
if re.match(r'^[ivxlcdm]+$', text.lower()):
return True
# "Page X" or "Page X of Y" format
if re.match(r'^page\s+\d+(\s+of\s+\d+)?$', text.lower()):
return True
return False
def _is_flexbox_page_number(self, element: HtmlElement) -> bool:
"""Detect Oracle-style flexbox page number containers."""
import re
style_attr = element.get('style', '')
if not style_attr:
return False
# Must have: display:flex, justify-content:flex-end, min-height:1in
required_patterns = [
r'display:\s*flex',
r'justify-content:\s*flex-end',
r'min-height:\s*1in'
]
return all(re.search(pattern, style_attr) for pattern in required_patterns)
def _is_aligned_page_number(self, element: HtmlElement) -> bool:
"""Detect center or right-aligned page number paragraphs."""
import re
style_attr = element.get('style', '')
# Check for center or right alignment
alignment_pattern = r'text-align:\s*(center|right)'
if not re.search(alignment_pattern, style_attr):
return False
# Optional: check for smaller font size (common in page numbers)
font_size_pattern = r'font-size:\s*([0-9]+)pt'
font_match = re.search(font_size_pattern, style_attr)
if font_match:
font_size = int(font_match.group(1))
# Page numbers often use smaller fonts (8-12pt)
if font_size <= 12:
return True
return True # Any center/right aligned short content
def _is_footer_page_number(self, element: HtmlElement) -> bool:
"""Detect footer-style page number containers."""
import re
style_attr = element.get('style', '')
# Look for bottom positioning or footer-like styling
footer_patterns = [
r'bottom:\s*[0-9]',
r'position:\s*absolute',
r'margin-bottom:\s*0',
r'text-align:\s*center'
]
# Need at least 2 footer indicators
matches = sum(1 for pattern in footer_patterns if re.search(pattern, style_attr))
return matches >= 2
def _is_page_break_context(self, element: HtmlElement) -> bool:
"""Check if element is near page breaks (common page number context)."""
# Check next sibling for page break HR
next_elem = element.getnext()
if next_elem is not None and next_elem.tag == 'hr':
hr_style = next_elem.get('style', '')
if 'page-break' in hr_style:
return True
# Check if element has page-break styling itself
style_attr = element.get('style', '')
if 'page-break' in style_attr:
return True
return False
def _is_page_break_element(self, element: HtmlElement) -> bool:
"""Detect page break HR elements."""
if element.tag.lower() != 'hr':
return False
style_attr = element.get('style', '')
# Check for page-break-after:always or similar page break styles
return 'page-break' in style_attr
def _is_page_navigation_container(self, element: HtmlElement) -> bool:
"""Detect navigation containers that appear after page breaks."""
if element.tag.lower() != 'div':
return False
style_attr = element.get('style', '')
# Check for navigation container patterns
# Often have: padding-top, min-height:1in, box-sizing:border-box
nav_indicators = [
r'padding-top:\s*0\.5in',
r'min-height:\s*1in',
r'box-sizing:\s*border-box'
]
import re
matches = sum(1 for pattern in nav_indicators if re.search(pattern, style_attr))
# Need at least 2 indicators
if matches < 2:
return False
# Check if it contains typical navigation content
text_content = element.text_content().strip().lower()
# Common navigation phrases
nav_phrases = [
'table of contents',
'index to financial statements',
'table of content',
'index to financial statement'
]
return any(phrase in text_content for phrase in nav_phrases)
def _extract_style(self, element: HtmlElement) -> Style:
"""Extract style from element."""
style_str = element.get('style', '')
style = self.style_parser.parse(style_str)
# Add tag-specific styles
tag = element.tag.lower()
if tag == 'b' or tag == 'strong':
style.font_weight = 'bold'
elif tag == 'i' or tag == 'em':
style.font_style = 'italic'
elif tag == 'u':
style.text_decoration = 'underline'
# Handle alignment
align = element.get('align')
if align:
style.text_align = align
return style
def _get_element_text(self, element: HtmlElement) -> str:
    """
    Get text content from element.

    Whitespace handling depends on the tag: inline elements keep their
    leading/trailing whitespace (it matters for text flow); all other
    elements have each fragment stripped and joined with spaces.
    """
    text_parts = []
    # Get element's direct text
    if element.text:
        # For inline elements, preserve leading/trailing whitespace
        if element.tag.lower() in self.INLINE_ELEMENTS:
            text_parts.append(element.text)
        else:
            text_parts.append(element.text.strip())
    # For simple elements (inline and heading tags), pull in the text of
    # the whole subtree as well.
    # NOTE(review): tail text following each child is not collected here
    # — confirm that is intentional for headings with mixed content.
    if element.tag.lower() in self.INLINE_ELEMENTS or \
       element.tag.lower() in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
        # Get all text including from child elements
        for child in element:
            if child.tag.lower() not in self.SKIP_ELEMENTS:
                child_text = child.text_content()
                if child_text:
                    # For inline elements, preserve whitespace in child content too
                    if element.tag.lower() in self.INLINE_ELEMENTS:
                        text_parts.append(child_text)
                    else:
                        text_parts.append(child_text.strip())
    # For inline elements with a single preserved fragment, return it
    # unmodified; everything else is joined with single spaces.
    if element.tag.lower() in self.INLINE_ELEMENTS and len(text_parts) == 1:
        return text_parts[0] if text_parts else ''
    else:
        return ' '.join(text_parts)
def _is_text_only_container(self, element: HtmlElement) -> bool:
"""Check if element contains only text and inline elements."""
for child in element:
if child.tag.lower() in self.BLOCK_ELEMENTS:
return False
if child.tag.lower() == 'table':
return False
return True
def _should_process_children(self, element: HtmlElement, node: Node) -> bool:
    """
    Decide whether the element's children still need processing.

    Text and heading nodes already captured their content, and tables
    are processed by their own strategy, so all three are excluded.
    """
    return not isinstance(node, (TextNode, HeadingNode, TableNode))
def _process_table_basic(self, element: HtmlElement, style: Style) -> TableNode:
    """
    Basic table processing without the advanced strategy.

    Builds a TableNode from <tr>/<td>/<th> structure; rows containing a
    <th> (or living under <thead>) are treated as header rows.
    """
    table = TableNode(style=style)
    # Set config for rendering decisions
    table._config = self.config
    # Extract caption
    caption_elem = element.find('.//caption')
    if caption_elem is not None:
        table.caption = caption_elem.text_content().strip()
    # Process rows.
    # NOTE(review): './/tr' also picks up rows of nested tables —
    # confirm nested tables are out of scope for the basic path.
    for tr in element.findall('.//tr'):
        cells = []
        for td in tr.findall('.//td') + tr.findall('.//th'):
            cell = Cell(
                content=td.text_content().strip(),
                colspan=self._parse_span(td.get('colspan')),
                rowspan=self._parse_span(td.get('rowspan')),
                is_header=(td.tag == 'th'),
                align=td.get('align')
            )
            cells.append(cell)
        if cells:
            row = Row(cells=cells, is_header=(tr.find('.//th') is not None))
            # Determine if header or data row.
            # NOTE(review): headers stores raw cell lists while rows
            # stores Row objects — mirrors TableNode's API; confirm.
            if tr.getparent().tag == 'thead' or row.is_header:
                table.headers.append(cells)
            else:
                table.rows.append(row)
    return table

@staticmethod
def _parse_span(value: Optional[str]) -> int:
    """
    Parse a colspan/rowspan attribute, defaulting to 1.

    BUG FIX: the original did int(td.get('colspan', '1')) which raised
    ValueError on malformed markup such as colspan="" or colspan="50%".
    """
    try:
        return int(value)
    except (TypeError, ValueError):
        return 1
def _parse_dimension(self, value: Optional[str]) -> Optional[int]:
"""Parse dimension value (width/height)."""
if not value:
return None
# Remove 'px' suffix if present
value = value.strip().rstrip('px')
try:
return int(value)
except ValueError:
return None
def _enter_xbrl_context(self, element: HtmlElement):
"""Enter XBRL context."""
if self.config.extract_xbrl and self.strategies.get('xbrl_extraction'):
xbrl_data = self.strategies['xbrl_extraction'].extract_context(element)
if xbrl_data:
self.xbrl_context_stack.append(xbrl_data)
def _exit_xbrl_context(self, element: HtmlElement):
"""Exit XBRL context."""
if self.xbrl_context_stack:
self.xbrl_context_stack.pop()
def _get_current_xbrl_metadata(self) -> Dict[str, Any]:
"""Get current XBRL metadata."""
if not self.xbrl_context_stack:
return {}
# Merge all contexts in stack
metadata = {}
for context in self.xbrl_context_stack:
metadata.update(context)
return metadata
def _merge_adjacent_nodes(self, root: Node):
    """
    Merge adjacent text nodes with similar styles.

    Placeholder: intentionally a no-op until merging is implemented.
    build() calls this only when config.merge_adjacent_nodes is set.
    """
    # Implementation would recursively merge adjacent text nodes
    pass

View File

@@ -0,0 +1,450 @@
"""
Multi-strategy header detection for document structure.
"""
import re
from abc import ABC, abstractmethod
from typing import Optional, List, Dict
from lxml.html import HtmlElement
from edgar.documents.config import ParserConfig
from edgar.documents.types import HeaderInfo, ParseContext
class HeaderDetector(ABC):
    """Abstract base class for header detectors."""

    @abstractmethod
    def detect(self, element: HtmlElement, context: ParseContext) -> Optional[HeaderInfo]:
        """Detect if element is a header; return None when it is not."""
        pass

    @property
    @abstractmethod
    def name(self) -> str:
        """Detector name; tags results for weighting in HeaderDetectionStrategy."""
        pass
class StyleBasedDetector(HeaderDetector):
    """
    Detect headers from CSS styling: font size relative to the base
    font, weight, alignment, case and vertical margins.
    """

    # (size ratio threshold, confidence contribution, heading level),
    # largest first — only the first matching tier applies.
    _SIZE_TIERS = (
        (2.0, 0.8, 1),
        (1.5, 0.7, 2),
        (1.2, 0.5, 3),
        (1.1, 0.3, 4),
    )

    @property
    def name(self) -> str:
        return "style"

    def detect(self, element: HtmlElement, context: ParseContext) -> Optional[HeaderInfo]:
        """Score the element's style; report a header above 0.4 confidence."""
        style = context.get_current_style()
        if not style:
            return None

        text = element.text_content().strip()
        # Very long text is never a header.
        if not text or len(text) > 200:
            return None

        confidence = 0.0
        level = 3  # Default level

        # Font size relative to the document's base size.
        if style.font_size and context.base_font_size:
            ratio = style.font_size / context.base_font_size
            for threshold, boost, tier_level in self._SIZE_TIERS:
                if ratio >= threshold:
                    confidence += boost
                    level = tier_level
                    break

        # Bold text is header-like; promote a default-level heading.
        if style.is_bold:
            confidence += 0.3
            if level == 3:
                level = 2

        if style.is_centered:
            confidence += 0.2

        # Short all-caps runs read as headings.
        if text.isupper() and len(text.split()) <= 10:
            confidence += 0.2

        # Headers often carry extra vertical margin.
        if style.margin_top and style.margin_top > 20:
            confidence += 0.1
        if style.margin_bottom and style.margin_bottom > 10:
            confidence += 0.1

        confidence = min(confidence, 1.0)
        if confidence > 0.4:  # Threshold for style-based detection
            return HeaderInfo.from_text(text, level, confidence, self.name)
        return None
class PatternBasedDetector(HeaderDetector):
    """Detect headers based on text patterns common in SEC filings."""

    # (regex, heading level, base confidence). Matching is case
    # SENSITIVE: several patterns distinguish Title Case from ALL CAPS,
    # and case variants ('Item'/'ITEM', 'Part'/'PART') are spelled out
    # explicitly where both should match.
    HEADER_PATTERNS = [
        # Item patterns
        (r'^(Item|ITEM)\s+(\d+[A-Z]?)[.\s]+(.+)$', 1, 0.95),
        (r'^Part\s+[IVX]+[.\s]*$', 1, 0.9),
        (r'^PART\s+[IVX]+[.\s]*$', 1, 0.9),
        # Section patterns
        (r'^(BUSINESS|RISK FACTORS|PROPERTIES|LEGAL PROCEEDINGS)$', 2, 0.85),
        (r'^(Management\'?s?\s+Discussion|MD&A)', 2, 0.85),
        (r'^(Financial\s+Statements|Consolidated\s+Financial\s+Statements)$', 2, 0.85),
        # Numbered sections
        (r'^\d+\.\s+[A-Z][A-Za-z\s]+$', 3, 0.7),
        (r'^[A-Z]\.\s+[A-Z][A-Za-z\s]+$', 3, 0.7),
        (r'^\([a-z]\)\s+[A-Z][A-Za-z\s]+$', 4, 0.6),
        # Title case headers
        (r'^[A-Z][A-Za-z\s]+[A-Za-z]$', 3, 0.5),
        # All caps headers
        (r'^[A-Z\s]+$', 3, 0.6),
    ]

    @property
    def name(self) -> str:
        return "pattern"

    def detect(self, element: HtmlElement, context: ParseContext) -> Optional[HeaderInfo]:
        """Detect headers based on text patterns."""
        text = element.text_content().strip()
        # Skip empty or very long text
        if not text or len(text) > 200:
            return None
        # Skip single punctuation - never headers
        if len(text) == 1 and text in '.,!?;:()[]{}':
            return None
        # Multiple sentences => paragraph, not header.
        if text.count('.') > 2:
            return None
        for pattern, level, base_confidence in self.HEADER_PATTERNS:
            # BUG FIX: matching previously used re.IGNORECASE, which made
            # the case-sensitive patterns meaningless — the Title Case
            # and ALL CAPS patterns matched ANY alphabetic text, and the
            # explicit 'Item|ITEM' / 'Part'+'PART' alternations show
            # case-sensitive matching was intended.
            match = re.match(pattern, text)
            if match:
                confidence = base_confidence
                # Lone child of its parent: more likely a header.
                # (Guard added: getparent() is None for a detached root.)
                parent = element.getparent()
                if parent is not None and len(parent) == 1:
                    confidence += 0.1
                # Followed by substantial text: header-like.
                next_elem = element.getnext()
                if next_elem is not None and len(next_elem.text_content()) > 100:
                    confidence += 0.1
                confidence = min(confidence, 1.0)
                return HeaderInfo.from_text(text, level, confidence, self.name)
        return None
class StructuralDetector(HeaderDetector):
    """Detect headers based on DOM structure."""

    @property
    def name(self) -> str:
        return "structural"

    def detect(self, element: HtmlElement, context: ParseContext) -> Optional[HeaderInfo]:
        """Detect headers from structural cues: tag, parent, siblings, length."""
        text = element.text_content().strip()
        # Skip empty or very long text
        if not text or len(text) > 200:
            return None
        # Skip single punctuation - never headers
        if len(text) == 1 and text in '.,!?;:()[]{}':
            return None
        confidence = 0.0
        level = 3
        # Real heading tags are definitive: full confidence, level from tag.
        tag = element.tag.lower()
        if tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
            confidence = 1.0
            level = int(tag[1])
            return HeaderInfo.from_text(text, level, confidence, self.name)
        # Check parent structure
        parent = element.getparent()
        if parent is not None:
            parent_tag = parent.tag.lower()
            # Header-like container
            if parent_tag in ['header', 'thead', 'caption']:
                confidence += 0.6
                level = 2
            # Parent has few children (isolated element)
            if len(parent) <= 3:
                confidence += 0.3
            # Parent is centered
            parent_align = parent.get('align')
            if parent_align == 'center':
                confidence += 0.2
        # Element's own emphasis and alignment
        if tag in ['strong', 'b']:
            confidence += 0.3
        if element.get('align') == 'center':
            confidence += 0.2
        # Followed by block content (headers usually introduce blocks)
        next_elem = element.getnext()
        if next_elem is not None:
            next_tag = next_elem.tag.lower()
            if next_tag in ['p', 'div', 'table', 'ul', 'ol']:
                confidence += 0.2
        # Short text is header-like
        words = text.split()
        if 1 <= len(words) <= 10:
            confidence += 0.1
        # Normalize confidence
        confidence = min(confidence, 1.0)
        if confidence > 0.5:
            return HeaderInfo.from_text(text, level, confidence, self.name)
        return None
class ContextualDetector(HeaderDetector):
    """Detect headers from their relationship to surrounding content."""

    @property
    def name(self) -> str:
        return "contextual"

    def detect(self, element: HtmlElement, context: ParseContext) -> Optional[HeaderInfo]:
        """Score contextual clues; report a header above 0.5 confidence."""
        text = element.text_content().strip()
        if not text or len(text) > 200:
            return None
        # Single punctuation characters are never headers.
        if len(text) == 1 and text in '.,!?;:()[]{}':
            return None

        score = 0.0
        level = 3

        # The text itself reads like a header.
        if self._looks_like_header(text):
            score += 0.4

        # A header-like previous sibling suggests a section hierarchy.
        prev_elem = element.getprevious()
        if prev_elem is not None:
            prev_text = prev_elem.text_content().strip()
            if prev_text and self._looks_like_header(prev_text):
                score += 0.3
                # Longer than its predecessor => treat as the higher level.
                level = 2 if len(text) > len(prev_text) else 3

        # Headers are usually followed by longer and/or indented content.
        next_elem = element.getnext()
        if next_elem is not None:
            next_text = next_elem.text_content().strip()
            if len(next_text) > len(text) * 3:
                score += 0.3
            next_style = next_elem.get('style', '')
            if 'margin-left' in next_style or 'padding-left' in next_style:
                score += 0.2

        # Early in the document (no section yet, shallow depth) favors headers.
        if context.current_section is None and context.depth < 5:
            score += 0.2

        score = min(score, 1.0)
        if score > 0.5:
            return HeaderInfo.from_text(text, level, score, self.name)
        return None

    def _looks_like_header(self, text: str) -> bool:
        """Heuristic: short, no terminal punctuation, capitalized."""
        if len(text.split()) > 15:
            return False
        # Sentence-ending punctuation disqualifies (colon is allowed).
        if text.rstrip().endswith(('.', '!', '?', ';')):
            return False
        if text.istitle() or text.isupper():
            return True
        # Any leading capital still counts.
        return bool(text) and text[0].isupper()
class HeaderDetectionStrategy:
    """
    Multi-strategy header detection.

    Combines multiple detection methods with weighted voting.
    """

    def __init__(self, config: ParserConfig):
        """Initialize with configuration."""
        self.config = config
        self.detectors = self._init_detectors()

    def _init_detectors(self) -> List[HeaderDetector]:
        """Initialize enabled detectors."""
        detectors = []
        # Always include basic detectors
        detectors.extend([
            StyleBasedDetector(),
            PatternBasedDetector(),
            StructuralDetector(),
            ContextualDetector()
        ])
        # Add ML detector if enabled (feature-flagged; not yet implemented)
        if self.config.features.get('ml_header_detection'):
            # Would add MLBasedDetector here
            pass
        return detectors

    def detect(self, element: HtmlElement, context: ParseContext) -> Optional[HeaderInfo]:
        """
        Detect if element is a header using multiple strategies.

        Args:
            element: HTML element to check
            context: Current parsing context

        Returns:
            HeaderInfo if element is detected as header, None otherwise
        """
        # Skip if element has no text
        text = element.text_content().strip()
        if not text:
            return None
        # Collect results from all detectors
        results: List[HeaderInfo] = []
        for detector in self.detectors:
            try:
                result = detector.detect(element, context)
                if result:
                    results.append(result)
            except Exception:
                # Don't let one detector failure stop others
                continue
        if not results:
            return None
        # If only one detector fired, use its result if confident enough
        if len(results) == 1:
            if results[0].confidence >= self.config.header_detection_threshold:
                return results[0]
            return None
        # Multiple detectors - combine results via weighted voting
        return self._combine_results(results, text)

    def _combine_results(self, results: List[HeaderInfo], text: str) -> HeaderInfo:
        """Combine multiple detection results into one weighted HeaderInfo."""
        # Per-detector weights; pattern matching is trusted most of the
        # implemented detectors. Unknown methods fall back to 0.1.
        detector_weights = {
            'style': 0.3,
            'pattern': 0.4,
            'structural': 0.2,
            'contextual': 0.1,
            'ml': 0.5  # Would be highest if available
        }
        # Calculate weighted confidence
        total_confidence = 0.0
        total_weight = 0.0
        # Weighted votes per heading level
        level_votes: Dict[int, float] = {}
        for result in results:
            weight = detector_weights.get(result.detection_method, 0.1)
            total_confidence += result.confidence * weight
            total_weight += weight
            # Vote for level
            if result.level not in level_votes:
                level_votes[result.level] = 0.0
            level_votes[result.level] += result.confidence * weight
        # Normalize confidence by the total weight actually applied
        final_confidence = total_confidence / total_weight if total_weight > 0 else 0.0
        # Choose most voted level
        final_level = max(level_votes.items(), key=lambda x: x[1])[0]
        # Check if any detector found this is an item; take the first
        # non-empty item number reported.
        is_item = any(r.is_item for r in results)
        item_number = next((r.item_number for r in results if r.item_number), None)
        return HeaderInfo(
            level=final_level,
            confidence=final_confidence,
            text=text,
            detection_method='combined',
            is_item=is_item,
            item_number=item_number
        )

View File

@@ -0,0 +1,344 @@
"""
CSS style parser for HTML elements.
"""
import re
from typing import Dict, Optional, Tuple, Union
from edgar.documents.types import Style
from edgar.documents.utils import get_cache_manager
class StyleParser:
    """
    Parser for CSS style attributes.

    Handles inline styles and converts them to Style objects. Parsed
    results are cached keyed on the raw style string.
    """

    # Common CSS units (declared for reference/validation by callers).
    ABSOLUTE_UNITS = {'px', 'pt', 'pc', 'cm', 'mm', 'in'}
    RELATIVE_UNITS = {'em', 'rem', 'ex', 'ch', 'vw', 'vh', '%'}

    # Font weight keyword -> numeric weight mappings
    FONT_WEIGHT_MAP = {
        'normal': '400',
        'bold': '700',
        'bolder': '800',
        'lighter': '300'
    }

    def __init__(self):
        """Initialize style parser with the shared style cache."""
        self._cache = get_cache_manager().style_cache
def parse(self, style_string: str) -> Style:
    """
    Parse a CSS inline-style string into a Style object.

    Results are cached keyed on the exact input string, so repeated
    attribute values are parsed only once.

    Args:
        style_string: CSS style string (e.g., "font-size: 14px; color: red")

    Returns:
        Parsed Style object (an empty Style for falsy input)
    """
    if not style_string:
        return Style()

    cached = self._cache.get(style_string)
    if cached is not None:
        return cached

    result = Style()
    for prop, value in self._split_declarations(style_string).items():
        self._apply_property(result, prop, value)

    self._cache.put(style_string, result)
    return result
def _split_declarations(self, style_string: str) -> Dict[str, str]:
"""Split style string into property-value pairs."""
declarations = {}
# Split by semicolon, handling potential issues
parts = style_string.split(';')
for part in parts:
part = part.strip()
if not part:
continue
# Split property and value
if ':' in part:
prop, value = part.split(':', 1)
prop = prop.strip().lower()
value = value.strip()
if prop and value:
declarations[prop] = value
return declarations
def _apply_property(self, style: Style, prop: str, value: str):
"""Apply CSS property to Style object."""
# Font properties
if prop == 'font-size':
size = self._parse_length(value)
if size is not None:
style.font_size = size
elif prop == 'font-weight':
style.font_weight = self._normalize_font_weight(value)
elif prop == 'font-style':
if value in ['italic', 'oblique']:
style.font_style = 'italic'
elif value == 'normal':
style.font_style = 'normal'
# Text properties
elif prop == 'text-align':
if value in ['left', 'right', 'center', 'justify']:
style.text_align = value
elif prop == 'text-decoration':
style.text_decoration = value
# Color properties
elif prop == 'color':
style.color = self._normalize_color(value)
elif prop in ['background-color', 'background']:
color = self._extract_background_color(value)
if color:
style.background_color = color
# Spacing properties
elif prop == 'margin':
self._parse_box_property(style, 'margin', value)
elif prop == 'margin-top':
margin = self._parse_length(value)
if margin is not None:
style.margin_top = margin
elif prop == 'margin-bottom':
margin = self._parse_length(value)
if margin is not None:
style.margin_bottom = margin
elif prop == 'margin-left':
margin = self._parse_length(value)
if margin is not None:
style.margin_left = margin
elif prop == 'margin-right':
margin = self._parse_length(value)
if margin is not None:
style.margin_right = margin
elif prop == 'padding':
self._parse_box_property(style, 'padding', value)
elif prop == 'padding-top':
padding = self._parse_length(value)
if padding is not None:
style.padding_top = padding
elif prop == 'padding-bottom':
padding = self._parse_length(value)
if padding is not None:
style.padding_bottom = padding
elif prop == 'padding-left':
padding = self._parse_length(value)
if padding is not None:
style.padding_left = padding
elif prop == 'padding-right':
padding = self._parse_length(value)
if padding is not None:
style.padding_right = padding
# Display properties
elif prop == 'display':
style.display = value
# Size properties
elif prop == 'width':
style.width = self._parse_dimension(value)
elif prop == 'height':
style.height = self._parse_dimension(value)
# Line height
elif prop == 'line-height':
line_height = self._parse_line_height(value)
if line_height is not None:
style.line_height = line_height
def _parse_length(self, value: str) -> Optional[float]:
"""Parse CSS length value to pixels."""
value = value.strip().lower()
# Handle special values
if value in ['0', 'auto', 'inherit', 'initial']:
return 0.0 if value == '0' else None
# Extract number and unit
match = re.match(r'^(-?\d*\.?\d+)\s*([a-z%]*)$', value)
if not match:
return None
num_str, unit = match.groups()
try:
num = float(num_str)
except ValueError:
return None
# Convert to pixels
if not unit or unit == 'px':
return num
elif unit == 'pt':
return num * 1.333 # 1pt = 1.333px
elif unit == 'em':
return num * 16 # Assume 16px base
elif unit == 'rem':
return num * 16 # Assume 16px root
elif unit == '%':
return None # Can't convert percentage without context
elif unit == 'in':
return num * 96 # 1in = 96px
elif unit == 'cm':
return num * 37.8 # 1cm = 37.8px
elif unit == 'mm':
return num * 3.78 # 1mm = 3.78px
return None
def _parse_dimension(self, value: str) -> Optional[Union[float, str]]:
"""Parse dimension value (width/height)."""
value = value.strip()
# Check for percentage
if value.endswith('%'):
return value # Return as string
# Try to parse as length
length = self._parse_length(value)
return length
def _parse_line_height(self, value: str) -> Optional[float]:
"""Parse line-height value."""
value = value.strip()
# Unitless number (multiplier)
try:
return float(value)
except ValueError:
pass
# Try as length
return self._parse_length(value)
def _normalize_font_weight(self, value: str) -> str:
"""Normalize font weight value."""
value = value.strip().lower()
# Map keywords to numeric values
if value in self.FONT_WEIGHT_MAP:
return self.FONT_WEIGHT_MAP[value]
# Check if it's already numeric
if value.isdigit() and 100 <= int(value) <= 900:
return value
return value
def _normalize_color(self, value: str) -> str:
"""Normalize color value."""
value = value.strip().lower()
# Handle rgb/rgba
if value.startswith(('rgb(', 'rgba(')):
return value
# Handle hex colors
if value.startswith('#'):
# Expand 3-char hex to 6-char
if len(value) == 4:
return '#' + ''.join(c*2 for c in value[1:])
return value
# Return named colors as-is
return value
def _extract_background_color(self, value: str) -> Optional[str]:
"""Extract color from background property."""
# Simple extraction - could be enhanced
parts = value.split()
for part in parts:
if part.startswith('#') or part.startswith('rgb'):
return self._normalize_color(part)
# Check for named colors
if not any(unit in part for unit in self.ABSOLUTE_UNITS | self.RELATIVE_UNITS):
return part
return None
def _parse_box_property(self, style: Style, prop_type: str, value: str):
"""Parse box property (margin/padding) with multiple values."""
parts = value.split()
if not parts:
return
# Convert all parts to lengths
lengths = []
for part in parts:
length = self._parse_length(part)
if length is not None:
lengths.append(length)
if not lengths:
return
# Apply based on number of values (CSS box model)
if len(lengths) == 1:
# All sides
val = lengths[0]
setattr(style, f'{prop_type}_top', val)
setattr(style, f'{prop_type}_right', val)
setattr(style, f'{prop_type}_bottom', val)
setattr(style, f'{prop_type}_left', val)
elif len(lengths) == 2:
# Vertical, horizontal
vert, horiz = lengths
setattr(style, f'{prop_type}_top', vert)
setattr(style, f'{prop_type}_bottom', vert)
setattr(style, f'{prop_type}_left', horiz)
setattr(style, f'{prop_type}_right', horiz)
elif len(lengths) == 3:
# Top, horizontal, bottom
top, horiz, bottom = lengths
setattr(style, f'{prop_type}_top', top)
setattr(style, f'{prop_type}_bottom', bottom)
setattr(style, f'{prop_type}_left', horiz)
setattr(style, f'{prop_type}_right', horiz)
elif len(lengths) >= 4:
# Top, right, bottom, left
setattr(style, f'{prop_type}_top', lengths[0])
setattr(style, f'{prop_type}_right', lengths[1])
setattr(style, f'{prop_type}_bottom', lengths[2])
setattr(style, f'{prop_type}_left', lengths[3])
def merge_styles(self, base: Style, override: Style) -> Style:
"""
Merge two styles with override taking precedence.
Args:
base: Base style
override: Override style
Returns:
Merged style
"""
return base.merge(override)

View File

@@ -0,0 +1,637 @@
"""
Advanced table processing strategy.
"""
import re
from functools import lru_cache
from typing import List, Optional
from lxml.html import HtmlElement
from edgar.documents.config import ParserConfig
from edgar.documents.strategies.style_parser import StyleParser
from edgar.documents.table_nodes import TableNode, Cell, Row
from edgar.documents.types import TableType
class TableProcessor:
    """
    Advanced table processing with type detection and structure analysis.

    Converts an HTML <table> element into a TableNode: classifies header
    rows using SEC-filing-specific heuristics, detects the table type
    (financial, metrics, TOC, ...) and optionally extracts intra-table
    relationships.
    """

    # HTML entities that need replacement
    ENTITY_REPLACEMENTS = {
        '&horbar;': '-----',
        '&mdash;': '-----',
        '&ndash;': '---',
        '&minus;': '-',
        '&hyphen;': '-',
        '&dash;': '-',
        '&nbsp;': ' ',
        '&amp;': '&',
        '&lt;': '<',
        '&gt;': '>',
        '&quot;': '"',
        '&apos;': "'",
        '&#8202;': ' ',
        '&#8203;': '',
        '&#x2014;': '-----',
        '&#x2013;': '---',
        '&#x2212;': '-',
    }

    # Financial keywords for table type detection
    FINANCIAL_KEYWORDS = {
        'revenue', 'income', 'expense', 'asset', 'liability',
        'cash', 'equity', 'profit', 'loss', 'margin',
        'earnings', 'cost', 'sales', 'operating', 'net',
        'gross', 'total', 'balance', 'statement', 'consolidated',
        'provision', 'tax', 'taxes', 'compensation', 'stock',
        'share', 'shares', 'rsu', 'option', 'grant', 'vest'
    }

    # Metrics keywords
    METRICS_KEYWORDS = {
        'ratio', 'percentage', 'percent', '%', 'rate',
        'growth', 'change', 'increase', 'decrease',
        'average', 'median', 'total', 'count', 'number'
    }

    def __init__(self, config: ParserConfig):
        """Initialize table processor."""
        self.config = config
        self.style_parser = StyleParser()

    def process(self, element: HtmlElement) -> TableNode:
        """
        Process table element into TableNode.

        Args:
            element: HTML table element

        Returns:
            Processed TableNode
        """
        # Extract table metadata
        table_id = element.get('id')
        table_class = element.get('class', '').split()
        table_style = self.style_parser.parse(element.get('style', ''))

        # Create table node
        table = TableNode(style=table_style)

        # Set config for rendering decisions
        table._config = self.config

        # Add metadata
        if table_id:
            table.set_metadata('id', table_id)
        if table_class:
            table.set_metadata('classes', table_class)

        # Extract caption
        caption_elem = element.find('.//caption')
        if caption_elem is not None:
            table.caption = self._extract_text(caption_elem)

        # Extract summary
        summary = element.get('summary')
        if summary:
            table.summary = summary

        # Process table structure
        self._process_table_structure(element, table)

        # Detect table type if configured
        if self.config.detect_table_types:
            table.table_type = self._detect_table_type(table)

        # Extract relationships if configured
        if self.config.extract_table_relationships:
            self._extract_relationships(table)

        return table

    def _process_table_structure(self, element: HtmlElement, table: TableNode):
        """Process table structure (thead, tbody, tfoot).

        Rows outside thead are classified as header or data rows with
        heuristics tuned for SEC filings (multi-row headers, unit rows
        like "(in millions)", year rows following "Year Ended", etc.).
        """
        # Process thead
        thead = element.find('.//thead')
        if thead is not None:
            for tr in thead.findall('.//tr'):
                cells = self._process_row(tr, is_header=True)
                if cells:
                    table.headers.append(cells)

        # Process tbody (or direct rows)
        tbody = element.find('.//tbody')
        rows_container = tbody if tbody is not None else element

        # Track if we've seen headers and data rows
        headers_found = bool(table.headers)
        data_rows_started = False

        for tr in rows_container.findall('.//tr'):
            # Skip if already processed in thead
            if thead is not None and tr.getparent() == thead:
                continue

            # Check if this might be a header row
            is_header_row = False

            # Continue checking for headers if:
            # 1. We haven't found any headers yet, OR
            # 2. We've found headers but haven't seen data rows yet (multi-row headers)
            if not data_rows_started:
                is_header_row = self._is_header_row(tr)

                # Additional check for multi-row headers in financial tables.
                # If the previous row was a header and this row has years or units,
                # it's likely part of the header.
                if headers_found and not is_header_row:
                    row_text = tr.text_content().strip()
                    # Check for units like "(in millions)" or "(in thousands)"
                    if '(in millions)' in row_text or '(in thousands)' in row_text or '(in billions)' in row_text:
                        is_header_row = True
                    # Check for year rows that follow "Year Ended" headers
                    elif len(table.headers) > 0:
                        last_header_text = ' '.join(cell.text() for cell in table.headers[-1])
                        if 'year ended' in last_header_text.lower() or 'years ended' in last_header_text.lower():
                            # Check if this row has years
                            year_pattern = r'\b(19\d{2}|20\d{2})\b'
                            years_found = re.findall(year_pattern, row_text)
                            if years_found:
                                is_header_row = True

            cells = self._process_row(tr, is_header=is_header_row)
            if cells:
                if is_header_row:
                    table.headers.append(cells)
                    headers_found = True
                else:
                    # Only mark data_rows_started if this row has actual content.
                    # Empty rows at the beginning shouldn't stop header detection.
                    row = Row(cells=cells, is_header=False)
                    table.rows.append(row)

                    # Check if row has significant content that indicates data rows have
                    # started. But be smart about it - descriptive rows like
                    # "(in millions)" or pure spacing shouldn't stop header detection.
                    has_content = any(cell.text().strip() for cell in cells)
                    if has_content:
                        # Get the row text for smarter analysis
                        row_text = ' '.join(cell.text().strip() for cell in cells).strip()
                        row_text_lower = row_text.lower()

                        # Don't consider this as "data started" if it's likely a
                        # header-related row
                        is_header_related = (
                            # Unit descriptions
                            '(in millions)' in row_text_lower or
                            '(in thousands)' in row_text_lower or
                            '(in billions)' in row_text_lower or
                            'except per share' in row_text_lower or
                            # Financial period descriptions
                            'year ended' in row_text_lower or
                            'months ended' in row_text_lower or
                            # Mostly just spacing/formatting
                            len(row_text.strip()) < 5 or
                            # Contains years (might be misclassified header)
                            bool(re.search(r'\b(19\d{2}|20\d{2})\b', row_text))
                        )

                        # Only mark data_rows_started if this seems like actual data
                        if not is_header_related:
                            data_rows_started = True

        # Process tfoot
        tfoot = element.find('.//tfoot')
        if tfoot is not None:
            for tr in tfoot.findall('.//tr'):
                cells = self._process_row(tr, is_header=False)
                if cells:
                    row = Row(cells=cells, is_header=False)
                    table.footer.append(row)

    def _process_row(self, tr: HtmlElement, is_header: bool) -> List[Cell]:
        """Process table row into cells.

        Cells are visited in document order via an XPath union. The
        previous `findall('.//td') + findall('.//th')` concatenation
        moved all th cells after the td cells whenever a row mixed both
        kinds (e.g. a th row label followed by td data), scrambling the
        column order.
        """
        cells = []

        # Process both td and th elements, preserving document order
        for cell_elem in tr.xpath('.//td | .//th'):
            cell = self._process_cell(cell_elem, is_header or cell_elem.tag == 'th')
            if cell:
                cells.append(cell)

        return cells

    @staticmethod
    def _parse_span(value: Optional[str]) -> int:
        """Parse a colspan/rowspan attribute value.

        Real filings contain malformed values ('', '100%'); these fall
        back to 1 instead of raising ValueError. Non-positive values are
        clamped to 1, matching browser behavior.
        """
        if value is None:
            return 1
        try:
            span = int(value)
        except (TypeError, ValueError):
            return 1
        return span if span > 0 else 1

    def _process_cell(self, elem: HtmlElement, is_header: bool) -> Optional[Cell]:
        """Process table cell into a Cell, tolerating malformed span attributes."""
        # Extract cell properties
        colspan = self._parse_span(elem.get('colspan'))
        rowspan = self._parse_span(elem.get('rowspan'))
        align = elem.get('align')

        # Extract style; inline text-align wins over the align attribute
        style = self.style_parser.parse(elem.get('style', ''))
        if style.text_align:
            align = style.text_align

        # Extract content
        content = self._extract_cell_content(elem)

        # Create cell
        cell = Cell(
            content=content,
            colspan=colspan,
            rowspan=rowspan,
            is_header=is_header,
            align=align
        )

        return cell

    def _extract_cell_content(self, elem: HtmlElement) -> str:
        """Extract and clean cell content, preserving intended line breaks."""
        # Check for nested structure
        divs = elem.findall('.//div')
        if divs and len(divs) > 1:
            # Multiple divs - likely multi-line content
            lines = []
            for div in divs:
                text = self._extract_text(div)
                if text:
                    lines.append(text)
            return '\n'.join(lines)

        # Handle line breaks
        for br in elem.findall('.//br'):
            br.tail = '\n' + (br.tail or '')

        # Extract text
        text = self._extract_text(elem)

        return text

    def _extract_text(self, elem: HtmlElement) -> str:
        """Extract and clean text from element.

        Uses itertext() rather than text_content() so spaces lost at tag
        boundaries can be re-inserted between adjacent fragments.
        """
        # Use itertext() to get all text fragments
        text_parts = []
        for text in elem.itertext():
            if text:
                text_parts.append(text)

        # Join parts, ensuring we don't lose spaces.
        # If a part doesn't end with whitespace and the next doesn't start with
        # whitespace, we need to add a space between them.
        if not text_parts:
            return ''

        result = []
        for i, part in enumerate(text_parts):
            if i == 0:
                result.append(part)
            else:
                prev_part = text_parts[i-1]
                # Check if we need to add a space between parts.
                # Don't add space if previous ends with space or current starts with space.
                if prev_part and part:
                    if not prev_part[-1].isspace() and not part[0].isspace():
                        # Check for punctuation that shouldn't have space before it
                        if part[0] not in ',.;:!?%)]':
                            result.append(' ')
                result.append(part)

        text = ''.join(result)

        # Replace entities
        for entity, replacement in self.ENTITY_REPLACEMENTS.items():
            text = text.replace(entity, replacement)

        # Clean whitespace
        text = text.strip()

        # Normalize internal whitespace but preserve line breaks
        lines = text.split('\n')
        cleaned_lines = []
        for line in lines:
            # Collapse multiple spaces to single space
            line = ' '.join(line.split())
            cleaned_lines.append(line)

        return '\n'.join(cleaned_lines)

    @staticmethod
    @lru_cache(maxsize=1)
    def _get_period_header_pattern():
        """
        Compile comprehensive regex for financial period headers.
        Adapted from old parser's proven patterns.

        Returns:
            Compiled regex pattern matching financial period headers
        """
        # Base components
        periods = r'(?:three|six|nine|twelve|[1-4]|first|second|third|fourth)'
        timeframes = r'(?:month|quarter|year|week)'
        ended_variants = r'(?:ended|ending|end|period)'
        as_of_variants = r'(?:as\s+of|at|as\s+at)'

        # Date pattern
        months = r'(?:january|february|march|april|may|june|july|august|september|october|november|december|jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)'
        day = r'\d{1,2}'
        year = r'(?:19|20)\d{2}'
        date = f'{months}\\s*\\.?\\s*{day}\\s*,?\\s*{year}'

        # Combined patterns
        patterns = [
            # Standard period headers
            f'{periods}\\s+{timeframes}\\s+{ended_variants}(?:\\s+{date})?',
            f'(?:fiscal\\s+)?{timeframes}\\s+{ended_variants}',
            f'{timeframes}\\s+{ended_variants}(?:\\s+{date})?',
            # Balance sheet date headers
            f'{as_of_variants}\\s+{date}',
            # Multiple date sequences
            f'{date}(?:\\s*(?:and|,)\\s*{date})*',
            # Single dates
            f'(?:{ended_variants}\\s+)?{date}'
        ]

        pattern = '|'.join(f'(?:{p})' for p in patterns)
        return re.compile(pattern, re.IGNORECASE)

    def _is_header_row(self, tr: HtmlElement) -> bool:
        """Detect if row is likely a header row in SEC filings."""
        # Check if contains th elements (most reliable indicator)
        if tr.find('.//th') is not None:
            return True

        cells = tr.findall('.//td')
        if not cells:
            return False

        # Get row text for analysis
        row_text = tr.text_content()
        row_text_lower = row_text.lower()

        # Check for date ranges with financial data (Oracle Table 6 pattern).
        # Date ranges like "March 1, 2024—March 31, 2024" should be data rows, not headers.
        date_range_pattern = r'(?:january|february|march|april|may|june|july|august|september|october|november|december)\s+\d{1,2},\s*\d{4}\s*[—–-]\s*(?:january|february|march|april|may|june|july|august|september|october|november|december)\s+\d{1,2},\s*\d{4}'
        has_date_range = bool(re.search(date_range_pattern, row_text_lower))

        # Check for financial data indicators
        has_currency = bool(re.search(r'\$[\s]*[\d,\.]+', row_text))
        has_decimals = bool(re.search(r'\b\d+\.\d+\b', row_text))
        has_large_numbers = bool(re.search(r'\b\d{1,3}(,\d{3})+\b', row_text))

        # If row has date range + financial data, it's definitely a data row
        if has_date_range and (has_currency or has_decimals or has_large_numbers):
            return False

        # Check for year patterns (very common in financial headers)
        year_pattern = r'\b(19\d{2}|20\d{2})\b'
        years_found = re.findall(year_pattern, row_text)
        if len(years_found) >= 2:  # Multiple years suggest header row
            # IMPORTANT: Check for date ranges and same-year repetition.
            # Date ranges like "March 1, 2024—March 31, 2024" contain the same year
            # twice but are data rows, not multi-year comparison headers.

            # If all years are the same (date range pattern)
            if len(set(years_found)) == 1:
                # Same year repeated - likely a date range like "Jan 1, 2024 - Mar 31, 2024"
                # Not a multi-year comparison header
                pass  # Don't return True
            # Multiple different years suggest multi-year comparison header
            elif 'total' not in row_text_lower[:20]:  # Check first 20 chars
                return True

        # Enhanced year detection - check individual cells for year patterns.
        # This handles cases where years are in separate cells.
        year_cells = 0
        date_phrases = 0
        for cell in cells:
            cell_text = cell.text_content().strip()
            if cell_text:
                # Check for individual years
                if re.match(r'^\s*(19\d{2}|20\d{2})\s*$', cell_text):
                    year_cells += 1
                # Check for date phrases like "June 30, 2025"
                elif 'june 30' in cell_text.lower() or 'december 31' in cell_text.lower():
                    date_phrases += 1

        # If we have multiple year cells or year + date phrases, likely a header
        if year_cells >= 2 or (year_cells >= 1 and date_phrases >= 1):
            if 'total' not in row_text_lower[:20]:
                return True

        # Check for comprehensive financial period patterns (from old parser)
        period_pattern = self._get_period_header_pattern()
        if period_pattern.search(row_text_lower):
            # Additional validation: ensure it's not a data row with period text.
            # Check for absence of strong data indicators.
            data_pattern = r'(?:\$\s*\d|\d+(?:,\d{3})+|\d+\s*[+\-*/]\s*\d+|\(\s*\d+(?:,\d{3})*\s*\))'
            if not re.search(data_pattern, row_text):
                return True

        # Check for units notation (in millions, thousands, billions)
        units_pattern = r'\(in\s+(?:millions|thousands|billions)\)'
        if re.search(units_pattern, row_text_lower):
            return True

        # Check for period indicators (quarters, months).
        # But be careful with "fiscal" - it could be data like "Fiscal 2025".
        period_keywords = ['quarter', 'q1', 'q2', 'q3', 'q4', 'month',
                          'january', 'february', 'march', 'april', 'may', 'june',
                          'july', 'august', 'september', 'october', 'november', 'december',
                          'ended', 'three months', 'six months', 'nine months']

        # Special handling for "fiscal" - only treat as header if it's part of a
        # phrase like "fiscal year ended"
        if 'fiscal' in row_text_lower:
            # Check if row has numeric values (suggests it's data, not header).
            # Look for patterns like "Fiscal 2025 $10,612".
            has_currency_values = bool(re.search(r'\$[\s]*[\d,]+', row_text))
            has_large_numbers = bool(re.search(r'\b\d{1,3}(,\d{3})+\b', row_text))

            # If it has currency or large numbers, it's likely data
            if has_currency_values or has_large_numbers:
                return False

            # Check if it's just "Fiscal YYYY" which is likely data, not a header
            fiscal_year_only = re.match(r'^\s*fiscal\s+\d{4}\s*$', row_text_lower.strip())
            if fiscal_year_only:
                return False  # This is data, not a header

            # Check for header-like phrases with fiscal
            if 'fiscal year' in row_text_lower and ('ended' in row_text_lower or 'ending' in row_text_lower):
                return True

        if any(keyword in row_text_lower for keyword in period_keywords):
            # Validate it's not a data row with period keywords.
            # Check for strong data indicators.
            data_pattern = r'(?:\$\s*\d|\d+(?:,\d{3})+|\d+\.\d+|[(]\s*\d+(?:,\d{3})*\s*[)])'
            if not re.search(data_pattern, row_text):
                return True

        # Check for column descriptors (but NOT total).
        # These are words commonly found in headers but not data rows.
        header_keywords = ['description', 'item', 'category', 'type', 'classification',
                          'change', 'percent', 'increase', 'decrease', 'variance']
        if any(keyword in row_text_lower for keyword in header_keywords):
            # Make sure it's not a total row
            if 'total' not in row_text_lower[:30]:
                # Additional validation: long narrative text is not a header.
                # Headers are typically concise (< 150 chars).
                if len(row_text) > 150:
                    return False

                # Check for data indicators (would indicate data row, not header)
                data_pattern = r'(?:\$\s*\d|\d+(?:,\d{3})+|\d+\.\d+|[(]\s*\d+(?:,\d{3})*\s*[)])'
                if re.search(data_pattern, row_text):
                    return False

                return True

        # Check if all cells are bold (common header formatting)
        bold_count = 0
        for cell in cells:
            style = cell.get('style', '')
            if 'font-weight' in style and 'bold' in style:
                bold_count += 1
            elif cell.find('.//b') is not None or cell.find('.//strong') is not None:
                bold_count += 1

        # Only consider it a header if ALL cells are bold (not just some)
        if bold_count == len(cells) and bold_count > 0:
            return True

        # Check content type ratio - headers usually have more text than numbers.
        # Count cells with primarily text vs primarily numbers.
        text_cells = 0
        number_cells = 0
        for cell in cells:
            cell_text = cell.text_content().strip()
            if cell_text:
                # Remove common symbols for analysis
                clean_text = cell_text.replace('$', '').replace('%', '').replace(',', '').replace('(', '').replace(')', '')
                if clean_text.replace('.', '').replace('-', '').strip().isdigit():
                    number_cells += 1
                else:
                    text_cells += 1

        # Be very careful about treating text-heavy rows as headers.
        # Many data rows start with text labels (e.g., "Impact of...", "Effect of...").
        # Only consider it a header if it has mostly text AND doesn't look like a data label.
        if text_cells > number_cells * 2 and text_cells >= 3:
            # Check for common data row patterns
            data_row_indicators = [
                'impact of', 'effect of', 'adjustment', 'provision for', 'benefit',
                'expense', 'income from', 'loss on', 'gain on', 'charge', 'credit',
                'earnings', 'computed', 'state taxes', 'research', 'excess tax'
            ]

            # If it starts with any of these, it's likely a data row, not a header
            for indicator in data_row_indicators:
                if row_text_lower.startswith(indicator) or indicator in row_text_lower[:50]:
                    return False

            # Also not a header if it starts with "total"
            if not row_text_lower.startswith('total'):
                return True

        return False

    def _detect_table_type(self, table: TableNode) -> TableType:
        """Detect the type of table based on content."""
        # Collect text from headers and first few rows
        text_parts = []

        # Add caption
        if table.caption:
            text_parts.append(table.caption.lower())

        # Add headers
        for header_row in table.headers:
            for cell in header_row:
                text_parts.append(cell.text().lower())

        # Add first few rows
        for row in table.rows[:3]:
            for cell in row.cells:
                text_parts.append(cell.text().lower())

        combined_text = ' '.join(text_parts)

        # Check for financial table
        financial_count = sum(1 for keyword in self.FINANCIAL_KEYWORDS if keyword in combined_text)
        if financial_count >= 2:  # Lowered threshold for better detection
            return TableType.FINANCIAL

        # Check for metrics table
        metrics_count = sum(1 for keyword in self.METRICS_KEYWORDS if keyword in combined_text)
        numeric_cells = sum(1 for row in table.rows for cell in row.cells if cell.is_numeric)
        total_cells = sum(len(row.cells) for row in table.rows)

        if total_cells > 0:
            numeric_ratio = numeric_cells / total_cells
            # More lenient metrics detection
            if metrics_count >= 1 or numeric_ratio > 0.3:
                return TableType.METRICS

        # Check for table of contents
        if 'content' in combined_text or 'index' in combined_text:
            # Look for page numbers
            has_page_numbers = any(
                re.search(r'\b\d{1,3}\b', cell.text())
                for row in table.rows
                for cell in row.cells
            )
            if has_page_numbers:
                return TableType.TABLE_OF_CONTENTS

        # Check for exhibit index
        if 'exhibit' in combined_text:
            return TableType.EXHIBIT_INDEX

        # Check for reference table (citations, definitions, etc.)
        if any(word in combined_text for word in ['reference', 'definition', 'glossary', 'citation']):
            return TableType.REFERENCE

        return TableType.GENERAL

    def _extract_relationships(self, table: TableNode):
        """Extract relationships within table data.

        Currently detects total rows and indentation-based hierarchy and
        records them as table metadata.
        """
        # This would implement relationship extraction.
        # For now, just set a flag that relationships were processed.
        table.set_metadata('relationships_extracted', True)

        # Example relationships to extract:
        # - Parent-child relationships (indented rows)
        # - Total rows that sum other rows
        # - Cross-references between cells
        # - Time series relationships

        # Detect total rows
        total_rows = []
        for i, row in enumerate(table.rows):
            if row.is_total_row:
                total_rows.append(i)

        if total_rows:
            table.set_metadata('total_rows', total_rows)

        # Detect indentation patterns (parent-child)
        indentation_levels = []
        for row in table.rows:
            if row.cells:
                first_cell_text = row.cells[0].text()
                # Count leading spaces
                indent = len(first_cell_text) - len(first_cell_text.lstrip())
                indentation_levels.append(indent)

        if any(level > 0 for level in indentation_levels):
            table.set_metadata('has_hierarchy', True)
            table.set_metadata('indentation_levels', indentation_levels)

View File

@@ -0,0 +1,345 @@
"""
XBRL extraction strategy for inline XBRL documents.
"""
from typing import Dict, Any, Optional
from lxml.html import HtmlElement
from edgar.documents.types import XBRLFact
class XBRLExtractor:
"""
Extracts XBRL facts from inline XBRL (iXBRL) documents.
Handles:
- ix:nonFraction, ix:nonNumeric facts
- Context and unit resolution
- Continuation handling
- Transformation rules
"""
# XBRL namespaces
NAMESPACES = {
'ix': 'http://www.xbrl.org/2013/inlineXBRL',
'xbrli': 'http://www.xbrl.org/2003/instance',
'xbrldi': 'http://xbrl.org/2006/xbrldi',
'xsi': 'http://www.w3.org/2001/XMLSchema-instance'
}
# Common transformation formats
TRANSFORMATIONS = {
'ixt:numdotdecimal': lambda x: x.replace(',', ''),
'ixt:numcommadecimal': lambda x: x.replace('.', '_').replace(',', '.').replace('_', ','),
'ixt:zerodash': lambda x: '0' if x == '-' else x,
'ixt:datedoteu': lambda x: x.replace('.', '-'),
'ixt:datedotus': lambda x: x.replace('.', '/'),
}
def __init__(self):
"""Initialize XBRL extractor."""
self.contexts: Dict[str, Dict[str, Any]] = {}
self.units: Dict[str, str] = {}
self.continuations: Dict[str, str] = {}
self._initialized = False
def extract_context(self, element: HtmlElement) -> Optional[Dict[str, Any]]:
"""
Extract XBRL context from element.
Args:
element: HTML element that might contain XBRL
Returns:
XBRL metadata if found
"""
# Check if element is an ix: tag
if not self._is_xbrl_element(element):
return None
# Initialize context if needed
if not self._initialized:
self._initialize_context(element)
# Extract based on element type
tag_name = self._get_local_name(element.tag)
if tag_name == 'nonfraction':
return self._extract_nonfraction(element)
elif tag_name == 'nonnumeric':
return self._extract_nonnumeric(element)
elif tag_name == 'continuation':
return self._extract_continuation(element)
elif tag_name == 'footnote':
return self._extract_footnote(element)
elif tag_name == 'fraction':
return self._extract_fraction(element)
return None
def extract_fact(self, element: HtmlElement) -> Optional[XBRLFact]:
"""Extract XBRL fact from element."""
context = self.extract_context(element)
if not context:
return None
# Get fact value
value = self._get_fact_value(element)
# Create fact
fact = XBRLFact(
concept=context.get('name', ''),
value=value,
context_ref=context.get('contextRef'),
unit_ref=context.get('unitRef'),
decimals=context.get('decimals'),
scale=context.get('scale'),
format=context.get('format'),
sign=context.get('sign')
)
# Resolve references
if fact.context_ref and fact.context_ref in self.contexts:
fact.context = self.contexts[fact.context_ref]
if fact.unit_ref and fact.unit_ref in self.units:
fact.unit = self.units[fact.unit_ref]
return fact
def _is_xbrl_element(self, element: HtmlElement) -> bool:
"""Check if element is an XBRL element."""
tag = element.tag
if not isinstance(tag, str):
return False
# Handle both namespaced and non-namespaced tags
tag_lower = tag.lower()
return (
tag.startswith('{' + self.NAMESPACES['ix'] + '}') or
tag.startswith('ix:') or
tag_lower.startswith('ix:')
)
def _get_local_name(self, tag: str) -> str:
"""Get local name from qualified tag."""
if '}' in tag:
return tag.split('}')[1].lower()
elif ':' in tag:
return tag.split(':')[1].lower()
return tag.lower()
def _initialize_context(self, element: HtmlElement):
"""Initialize context and unit information from document."""
# Find root element
root = element.getroottree().getroot()
# Extract contexts
self._extract_contexts(root)
# Extract units
self._extract_units(root)
self._initialized = True
    def _extract_contexts(self, root: HtmlElement):
        """Extract all context definitions.

        Populates self.contexts with one entry per xbrli:context element,
        keyed by the context's id attribute. Each entry records the
        reporting entity, the period (instant or duration) and any
        explicit dimensional qualifiers found in the context's segment.

        Args:
            root: Root element of the document to scan.
        """
        # Look for xbrli:context elements
        for context in root.xpath('//xbrli:context', namespaces=self.NAMESPACES):
            context_id = context.get('id')
            if not context_id:
                # A context without an id can never be referenced by a fact.
                continue
            context_data = {
                'id': context_id
            }
            # Extract entity
            entity = context.find('.//xbrli:entity', namespaces=self.NAMESPACES)
            if entity is not None:
                identifier = entity.find('.//xbrli:identifier', namespaces=self.NAMESPACES)
                if identifier is not None:
                    # Identifier value (e.g. CIK) plus the scheme URI defining it
                    context_data['entity'] = identifier.text
                    context_data['scheme'] = identifier.get('scheme')
            # Extract period
            period = context.find('.//xbrli:period', namespaces=self.NAMESPACES)
            if period is not None:
                instant = period.find('.//xbrli:instant', namespaces=self.NAMESPACES)
                if instant is not None:
                    # Point-in-time context (balance-sheet style)
                    context_data['instant'] = instant.text
                    context_data['period_type'] = 'instant'
                else:
                    # Duration context (income-statement style)
                    start = period.find('.//xbrli:startDate', namespaces=self.NAMESPACES)
                    end = period.find('.//xbrli:endDate', namespaces=self.NAMESPACES)
                    if start is not None and end is not None:
                        context_data['start_date'] = start.text
                        context_data['end_date'] = end.text
                        context_data['period_type'] = 'duration'
            # Extract dimensions
            segment = context.find('.//xbrli:segment', namespaces=self.NAMESPACES)
            if segment is not None:
                dimensions = {}
                for member in segment.findall('.//xbrldi:explicitMember', namespaces=self.NAMESPACES):
                    dim = member.get('dimension')
                    if dim:
                        dimensions[dim] = member.text
                if dimensions:
                    context_data['dimensions'] = dimensions
            self.contexts[context_id] = context_data
def _extract_units(self, root: HtmlElement):
"""Extract all unit definitions."""
# Look for xbrli:unit elements
for unit in root.xpath('//xbrli:unit', namespaces=self.NAMESPACES):
unit_id = unit.get('id')
if not unit_id:
continue
# Check for simple measure
measure = unit.find('.//xbrli:measure', namespaces=self.NAMESPACES)
if measure is not None:
self.units[unit_id] = self._normalize_unit(measure.text)
continue
# Check for complex unit (divide)
divide = unit.find('.//xbrli:divide', namespaces=self.NAMESPACES)
if divide is not None:
numerator = divide.find('.//xbrli:unitNumerator/xbrli:measure', namespaces=self.NAMESPACES)
denominator = divide.find('.//xbrli:unitDenominator/xbrli:measure', namespaces=self.NAMESPACES)
if numerator is not None and denominator is not None:
num_unit = self._normalize_unit(numerator.text)
den_unit = self._normalize_unit(denominator.text)
self.units[unit_id] = f"{num_unit}/{den_unit}"
def _normalize_unit(self, unit_text: str) -> str:
"""Normalize unit text."""
if not unit_text:
return ''
# Remove namespace prefix
if ':' in unit_text:
unit_text = unit_text.split(':')[-1]
# Common normalizations
unit_map = {
'usd': 'USD',
'shares': 'shares',
'pure': 'pure',
'percent': '%'
}
return unit_map.get(unit_text.lower(), unit_text)
def _extract_nonfraction(self, element: HtmlElement) -> Dict[str, Any]:
"""Extract ix:nonFraction element."""
metadata = {
'type': 'nonFraction',
'name': element.get('name'),
'contextRef': element.get('contextRef') or element.get('contextref'),
'unitRef': element.get('unitRef') or element.get('unitref'),
'decimals': element.get('decimals'),
'scale': element.get('scale'),
'format': element.get('format'),
'sign': element.get('sign')
}
# Clean None values
return {k: v for k, v in metadata.items() if v is not None}
def _extract_nonnumeric(self, element: HtmlElement) -> Dict[str, Any]:
"""Extract ix:nonNumeric element."""
metadata = {
'type': 'nonNumeric',
'name': element.get('name'),
'contextRef': element.get('contextRef') or element.get('contextref'),
'format': element.get('format')
}
# Clean None values
return {k: v for k, v in metadata.items() if v is not None}
def _extract_continuation(self, element: HtmlElement) -> Dict[str, Any]:
"""Extract ix:continuation element."""
cont_id = element.get('id')
continued_at = element.get('continuedAt')
if cont_id and continued_at:
# Map continuation to original
if continued_at in self.continuations:
original = self.continuations[continued_at]
self.continuations[cont_id] = original
return original
else:
# Store for later resolution
metadata = {
'type': 'continuation',
'id': cont_id,
'continuedAt': continued_at
}
self.continuations[cont_id] = metadata
return metadata
return {}
def _extract_footnote(self, element: HtmlElement) -> Dict[str, Any]:
"""Extract ix:footnote element."""
return {
'type': 'footnote',
'footnoteRole': element.get('footnoteRole'),
'footnoteID': element.get('footnoteID')
}
def _extract_fraction(self, element: HtmlElement) -> Dict[str, Any]:
"""Extract ix:fraction element."""
metadata = {
'type': 'fraction',
'name': element.get('name'),
'contextRef': element.get('contextRef'),
'unitRef': element.get('unitRef')
}
# Extract numerator and denominator
numerator = element.find('.//ix:numerator', namespaces=self.NAMESPACES)
denominator = element.find('.//ix:denominator', namespaces=self.NAMESPACES)
if numerator is not None:
metadata['numerator'] = numerator.text
if denominator is not None:
metadata['denominator'] = denominator.text
return {k: v for k, v in metadata.items() if v is not None}
def _get_fact_value(self, element: HtmlElement) -> str:
"""Get fact value from element with transformations."""
# Get raw value
value = element.text or ''
# Apply format transformation if specified
format_attr = element.get('format')
if format_attr and format_attr in self.TRANSFORMATIONS:
transform = self.TRANSFORMATIONS[format_attr]
value = transform(value)
# Apply scale if specified
scale = element.get('scale')
if scale:
try:
scale_factor = int(scale)
numeric_value = float(value.replace(',', ''))
scaled_value = numeric_value * (10 ** scale_factor)
value = str(scaled_value)
except (ValueError, TypeError):
pass
# Apply sign if specified
sign = element.get('sign')
if sign == '-':
if value and not value.startswith('-'):
value = '-' + value
return value.strip()