Initial commit

This commit is contained in:
kdusek
2025-12-09 12:13:01 +01:00
commit 8e654ed209
13332 changed files with 2695056 additions and 0 deletions

View File

@@ -0,0 +1,15 @@
"""
Parsing strategies for different content types.
"""
from edgar.documents.strategies.document_builder import DocumentBuilder
from edgar.documents.strategies.header_detection import HeaderDetectionStrategy
from edgar.documents.strategies.table_processing import TableProcessor
from edgar.documents.strategies.xbrl_extraction import XBRLExtractor
__all__ = [
'DocumentBuilder',
'HeaderDetectionStrategy',
'TableProcessor',
'XBRLExtractor'
]

View File

@@ -0,0 +1,670 @@
"""
Document builder that converts parsed HTML tree into document nodes.
"""
from typing import Dict, Any, Optional
from lxml.html import HtmlElement
from edgar.documents.config import ParserConfig
from edgar.documents.nodes import (
Node, DocumentNode, TextNode, ParagraphNode, HeadingNode,
ContainerNode, SectionNode, ListNode, ListItemNode, LinkNode, ImageNode
)
from edgar.documents.strategies.style_parser import StyleParser
from edgar.documents.table_nodes import TableNode, Cell, Row
from edgar.documents.types import Style, ParseContext, SemanticType
class DocumentBuilder:
    """
    Builds Document node tree from parsed HTML.

    Handles the conversion of HTML elements into structured nodes
    with proper hierarchy and metadata.
    """

    # Block-level elements: each one opens a new structural node rather
    # than contributing to the parent's inline text flow.
    BLOCK_ELEMENTS = {
        'div', 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
        'ul', 'ol', 'li', 'blockquote', 'pre', 'hr',
        'table', 'form', 'fieldset', 'address', 'section',
        'article', 'aside', 'nav', 'header', 'footer', 'main'
    }

    # Inline elements: their text is merged into the surrounding flow.
    INLINE_ELEMENTS = {
        'span', 'a', 'em', 'strong', 'b', 'i', 'u', 's',
        'small', 'mark', 'del', 'ins', 'sub', 'sup',
        'code', 'kbd', 'var', 'samp', 'abbr', 'cite',
        'q', 'time', 'font',
        # IXBRL inline elements for simple values - should not break text flow
        'ix:nonfraction', 'ix:footnote', 'ix:fraction'
    }

    # Elements whose content is dropped entirely (their tail text is
    # still emitted, see _process_element).
    SKIP_ELEMENTS = {
        'script', 'style', 'meta', 'link', 'noscript',
        # IXBRL exclude elements - content that should not appear in final document
        'ix:exclude'
    }
def __init__(self, config: ParserConfig, strategies: Dict[str, Any]):
    """
    Initialize document builder.

    Args:
        config: Parser configuration
        strategies: Dictionary of parsing strategies; keys used by this
            class are 'header_detection', 'table_processing' and
            'xbrl_extraction'
    """
    self.config = config
    self.strategies = strategies
    self.style_parser = StyleParser()
    self.context = ParseContext()
    # Stack of XBRL context dicts for the namespaced elements currently
    # being processed (pushed/popped around child processing).
    self.xbrl_context_stack = []
    # XBRL continuation bookkeeping.
    # NOTE(review): not read or written anywhere in this class — confirm
    # whether external collaborators use it before removing.
    self.xbrl_continuations = {}
def build(self, tree: HtmlElement) -> DocumentNode:
    """
    Build document from HTML tree.

    Args:
        tree: Parsed HTML tree

    Returns:
        Document root node
    """
    # Create root document node
    root = DocumentNode()
    # Prefer the <body>; fall back to the whole tree for fragments
    # without one.
    body = tree.find('.//body')
    if body is None:
        body = tree
    # Process body content
    self._process_element(body, root)
    # Apply node merging if configured
    if self.config.merge_adjacent_nodes:
        self._merge_adjacent_nodes(root)
    return root
def _process_element(self, element: HtmlElement, parent: Node) -> Optional[Node]:
    """
    Process HTML element into node.

    Recursively converts `element` (and its subtree) into document nodes
    attached to `parent`. Tail text (text following an element's closing
    tag) belongs to the parent's flow, so it is emitted even when the
    element itself is skipped.

    Args:
        element: HTML element to process
        parent: Parent node

    Returns:
        Created node or None if skipped
    """
    # BUG FIX: lxml models comments/processing instructions with a
    # non-string tag (a callable). They carry no document content, but
    # previously they fell through to helpers that call
    # element.tag.lower() and crashed; skip them while keeping the tail.
    if not isinstance(element.tag, str):
        self._emit_tail_text(element, parent)
        return None

    # Skip certain elements but preserve their tail text
    if element.tag in self.SKIP_ELEMENTS:
        self._emit_tail_text(element, parent)
        return None

    # Skip page number containers
    if self._is_page_number_container(element):
        return None
    # Skip page break elements
    if self._is_page_break_element(element):
        return None
    # Skip navigation containers that follow page breaks
    if self._is_page_navigation_container(element):
        return None

    # Track parsing depth
    self.context.depth += 1
    try:
        # Namespaced (XBRL) tags look like '{uri}local'
        if element.tag.startswith('{'):
            self._enter_xbrl_context(element)

        style = self._extract_style(element)
        node = self._create_node_for_element(element, style)

        if node:
            # Add XBRL metadata if in context
            if self.xbrl_context_stack:
                node.metadata.update(self._get_current_xbrl_metadata())
            parent.add_child(node)

            if self._should_process_children(element, node):
                # The element's direct text comes before any child.
                if element.text:
                    if self.config.preserve_whitespace:
                        node.add_child(TextNode(content=element.text))
                    elif element.text.strip():
                        node.add_child(TextNode(content=element.text.strip()))
                # Process child elements
                for child in element:
                    self._process_element(child, node)

            # Tail text goes to the parent (it follows this element's
            # closing tag); a whitespace hint is recorded on the node.
            self._emit_tail_with_hint(element, parent, node)
        else:
            # No node created: flatten children into the current parent.
            for child in element:
                self._process_element(child, parent)
            self._emit_tail_text(element, parent)

        # Exit XBRL context
        if element.tag.startswith('{'):
            self._exit_xbrl_context(element)

        return node
    finally:
        self.context.depth -= 1

def _emit_tail_text(self, element: HtmlElement, parent: Node) -> None:
    """Append element's tail text to parent, honoring whitespace config."""
    if not element.tail:
        return
    if self.config.preserve_whitespace:
        parent.add_child(TextNode(content=element.tail))
    elif element.tail.strip():
        parent.add_child(TextNode(content=element.tail.strip()))

def _emit_tail_with_hint(self, element: HtmlElement, parent: Node, node: Node) -> None:
    """
    Append element's tail text to parent; when the tail is pure
    whitespace (and whitespace is not preserved), record a hint on
    `node` so later inline-spacing decisions know a separator followed.
    """
    if not element.tail:
        return
    if self.config.preserve_whitespace:
        parent.add_child(TextNode(content=element.tail))
    elif element.tail.strip():
        parent.add_child(TextNode(content=element.tail.strip()))
    elif element.tail.isspace() and hasattr(node, 'set_metadata'):
        node.set_metadata('has_tail_whitespace', True)
def _create_node_for_element(self, element: HtmlElement, style: Style) -> Optional[Node]:
    """
    Create the appropriate node for an HTML element.

    Returns None when no node is produced; the caller then flattens the
    element's children into the current parent.
    """
    # Namespaced (XBRL) tags are kept verbatim; HTML tags are lowercased.
    tag = element.tag.lower() if not element.tag.startswith('{') else element.tag

    # Explicit heading tags win outright.
    if tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
        level = int(tag[1])
        text = self._get_element_text(element)
        if text:
            return HeadingNode(content=text, level=level, style=style)

    # Handle specific elements first before header detection
    if tag == 'p':
        return ParagraphNode(style=style)
    elif tag == 'li':
        return ListItemNode(style=style)

    # Heuristic header detection for styled text. Skipped for tags that
    # should never be treated as headers.
    skip_header_detection_tags = {
        'li', 'td', 'th', 'option', 'a', 'button', 'label',
        # IXBRL inline elements - should not be treated as headers
        'ix:nonfraction', 'ix:footnote', 'ix:fraction',
        # IXBRL elements that can contain tables and complex content.
        # NOTE(review): lxml's HTML parser normally lowercases tag names,
        # so these mixed-case entries may never match — confirm against
        # real parser output.
        'ix:nonNumeric', 'ix:continuation'
    }
    if tag not in skip_header_detection_tags and self.strategies.get('header_detection'):
        header_info = self.strategies['header_detection'].detect(element, self.context)
        if header_info and header_info.confidence > self.config.header_detection_threshold:
            text = self._get_element_text(element)
            if text:
                node = HeadingNode(
                    content=text,
                    level=header_info.level,
                    style=style
                )
                # Record how/why this was classified as a header.
                node.set_metadata('detection_method', header_info.detection_method)
                node.set_metadata('confidence', header_info.confidence)
                if header_info.is_item:
                    node.semantic_type = SemanticType.ITEM_HEADER
                    node.set_metadata('item_number', header_info.item_number)
                return node

    # Continue handling other specific elements
    if tag == 'table':
        if self.strategies.get('table_processing'):
            return self.strategies['table_processing'].process(element)
        else:
            return self._process_table_basic(element, style)
    elif tag in ['ul', 'ol']:
        return ListNode(ordered=(tag == 'ol'), style=style)
    # ('li' is handled above; the original had an unreachable duplicate
    # branch here, removed.)
    elif tag == 'a':
        href = element.get('href', '')
        title = element.get('title', '')
        text = self._get_element_text(element)
        return LinkNode(content=text, href=href, title=title, style=style)
    elif tag == 'img':
        return ImageNode(
            src=element.get('src'),
            alt=element.get('alt'),
            width=self._parse_dimension(element.get('width')),
            height=self._parse_dimension(element.get('height')),
            style=style
        )
    elif tag == 'br':
        # Line break - represented as a newline text node
        return TextNode(content='\n')
    elif tag in ['section', 'article']:
        return SectionNode(style=style)
    elif tag == 'div' or tag in self.BLOCK_ELEMENTS:
        # CSS display can turn a block element into inline flow.
        if style.display in ['inline', 'inline-block']:
            text = self._get_element_text(element)
            if text:
                text_node = TextNode(content=text, style=style)
                text_node.set_metadata('original_tag', tag)
                text_node.set_metadata('inline_via_css', True)
                return text_node
            # If no text but inline, still process children inline
            return ContainerNode(tag_name=tag, style=style)
        # Normal block behavior: divs holding only inline content become
        # paragraphs so spans etc. concatenate into one text flow.
        if self._is_text_only_container(element):
            return ParagraphNode(style=style)
        else:
            return ContainerNode(tag_name=tag, style=style)
    elif tag in self.INLINE_ELEMENTS:
        # Inline elements contribute their text directly; with no text
        # they fall through to the default container below.
        text = self._get_element_text(element)
        if text:
            text_node = TextNode(content=text, style=style)
            # Preserve inline element metadata
            text_node.set_metadata('original_tag', tag)
            return text_node
    elif tag in ['ix:nonNumeric', 'ix:continuation']:
        # IXBRL elements that can contain complex content including tables;
        # process as container to allow proper table parsing.
        return ContainerNode(tag_name=tag, style=style)

    # Default: create container for unknown elements
    return ContainerNode(tag_name=tag, style=style)
def _is_page_number_container(self, element: HtmlElement) -> bool:
"""Detect and filter page number containers across various SEC filing patterns."""
import re
# Get text content first - all page numbers should be short
text_content = element.text_content().strip()
# Must be short content (1-8 chars to handle "Page X" format)
if len(text_content) > 8 or len(text_content) == 0:
return False
# Must be numeric, roman numerals, or "Page X" format
if not self._is_page_number_content(text_content):
return False
# Check various patterns based on element type and styling
tag = element.tag.lower()
# Pattern 1: Oracle-style flexbox containers (highest confidence)
if tag == 'div' and self._is_flexbox_page_number(element):
return True
# Pattern 2: Center/right aligned paragraphs (common pattern)
if tag == 'p' and self._is_aligned_page_number(element):
return True
# Pattern 3: Footer-style divs with centered page numbers
if tag == 'div' and self._is_footer_page_number(element):
return True
# Pattern 4: Simple divs with page break context
if tag == 'div' and self._is_page_break_context(element):
return True
return False
def _is_page_number_content(self, text: str) -> bool:
"""Check if text content looks like a page number."""
import re
# Simple numeric (most common)
if text.isdigit():
return True
# Roman numerals
if re.match(r'^[ivxlcdm]+$', text.lower()):
return True
# "Page X" or "Page X of Y" format
if re.match(r'^page\s+\d+(\s+of\s+\d+)?$', text.lower()):
return True
return False
def _is_flexbox_page_number(self, element: HtmlElement) -> bool:
"""Detect Oracle-style flexbox page number containers."""
import re
style_attr = element.get('style', '')
if not style_attr:
return False
# Must have: display:flex, justify-content:flex-end, min-height:1in
required_patterns = [
r'display:\s*flex',
r'justify-content:\s*flex-end',
r'min-height:\s*1in'
]
return all(re.search(pattern, style_attr) for pattern in required_patterns)
def _is_aligned_page_number(self, element: HtmlElement) -> bool:
"""Detect center or right-aligned page number paragraphs."""
import re
style_attr = element.get('style', '')
# Check for center or right alignment
alignment_pattern = r'text-align:\s*(center|right)'
if not re.search(alignment_pattern, style_attr):
return False
# Optional: check for smaller font size (common in page numbers)
font_size_pattern = r'font-size:\s*([0-9]+)pt'
font_match = re.search(font_size_pattern, style_attr)
if font_match:
font_size = int(font_match.group(1))
# Page numbers often use smaller fonts (8-12pt)
if font_size <= 12:
return True
return True # Any center/right aligned short content
def _is_footer_page_number(self, element: HtmlElement) -> bool:
"""Detect footer-style page number containers."""
import re
style_attr = element.get('style', '')
# Look for bottom positioning or footer-like styling
footer_patterns = [
r'bottom:\s*[0-9]',
r'position:\s*absolute',
r'margin-bottom:\s*0',
r'text-align:\s*center'
]
# Need at least 2 footer indicators
matches = sum(1 for pattern in footer_patterns if re.search(pattern, style_attr))
return matches >= 2
def _is_page_break_context(self, element: HtmlElement) -> bool:
"""Check if element is near page breaks (common page number context)."""
# Check next sibling for page break HR
next_elem = element.getnext()
if next_elem is not None and next_elem.tag == 'hr':
hr_style = next_elem.get('style', '')
if 'page-break' in hr_style:
return True
# Check if element has page-break styling itself
style_attr = element.get('style', '')
if 'page-break' in style_attr:
return True
return False
def _is_page_break_element(self, element: HtmlElement) -> bool:
"""Detect page break HR elements."""
if element.tag.lower() != 'hr':
return False
style_attr = element.get('style', '')
# Check for page-break-after:always or similar page break styles
return 'page-break' in style_attr
def _is_page_navigation_container(self, element: HtmlElement) -> bool:
"""Detect navigation containers that appear after page breaks."""
if element.tag.lower() != 'div':
return False
style_attr = element.get('style', '')
# Check for navigation container patterns
# Often have: padding-top, min-height:1in, box-sizing:border-box
nav_indicators = [
r'padding-top:\s*0\.5in',
r'min-height:\s*1in',
r'box-sizing:\s*border-box'
]
import re
matches = sum(1 for pattern in nav_indicators if re.search(pattern, style_attr))
# Need at least 2 indicators
if matches < 2:
return False
# Check if it contains typical navigation content
text_content = element.text_content().strip().lower()
# Common navigation phrases
nav_phrases = [
'table of contents',
'index to financial statements',
'table of content',
'index to financial statement'
]
return any(phrase in text_content for phrase in nav_phrases)
def _extract_style(self, element: HtmlElement) -> Style:
"""Extract style from element."""
style_str = element.get('style', '')
style = self.style_parser.parse(style_str)
# Add tag-specific styles
tag = element.tag.lower()
if tag == 'b' or tag == 'strong':
style.font_weight = 'bold'
elif tag == 'i' or tag == 'em':
style.font_style = 'italic'
elif tag == 'u':
style.text_decoration = 'underline'
# Handle alignment
align = element.get('align')
if align:
style.text_align = align
return style
def _get_element_text(self, element: HtmlElement) -> str:
    """
    Get text content from element.

    Whitespace handling depends on the tag: inline elements keep their
    leading/trailing whitespace (it matters for text flow); all other
    elements have each fragment stripped and joined with spaces.
    """
    text_parts = []
    # Get element's direct text
    if element.text:
        # For inline elements, preserve leading/trailing whitespace
        if element.tag.lower() in self.INLINE_ELEMENTS:
            text_parts.append(element.text)
        else:
            text_parts.append(element.text.strip())
    # For simple elements (inline and heading tags), pull in the text of
    # the whole subtree as well.
    # NOTE(review): tail text following each child is not collected here
    # — confirm that is intentional for headings with mixed content.
    if element.tag.lower() in self.INLINE_ELEMENTS or \
       element.tag.lower() in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
        # Get all text including from child elements
        for child in element:
            if child.tag.lower() not in self.SKIP_ELEMENTS:
                child_text = child.text_content()
                if child_text:
                    # For inline elements, preserve whitespace in child content too
                    if element.tag.lower() in self.INLINE_ELEMENTS:
                        text_parts.append(child_text)
                    else:
                        text_parts.append(child_text.strip())
    # For inline elements with a single preserved fragment, return it
    # unmodified; everything else is joined with single spaces.
    if element.tag.lower() in self.INLINE_ELEMENTS and len(text_parts) == 1:
        return text_parts[0] if text_parts else ''
    else:
        return ' '.join(text_parts)
def _is_text_only_container(self, element: HtmlElement) -> bool:
"""Check if element contains only text and inline elements."""
for child in element:
if child.tag.lower() in self.BLOCK_ELEMENTS:
return False
if child.tag.lower() == 'table':
return False
return True
def _should_process_children(self, element: HtmlElement, node: Node) -> bool:
    """
    Decide whether the element's children still need processing.

    Text and heading nodes already captured their content, and tables
    are processed by their own strategy, so all three are excluded.
    """
    return not isinstance(node, (TextNode, HeadingNode, TableNode))
def _process_table_basic(self, element: HtmlElement, style: Style) -> TableNode:
    """
    Basic table processing without the advanced strategy.

    Builds a TableNode from <tr>/<td>/<th> structure; rows containing a
    <th> (or living under <thead>) are treated as header rows.
    """
    table = TableNode(style=style)
    # Set config for rendering decisions
    table._config = self.config
    # Extract caption
    caption_elem = element.find('.//caption')
    if caption_elem is not None:
        table.caption = caption_elem.text_content().strip()
    # Process rows.
    # NOTE(review): './/tr' also picks up rows of nested tables —
    # confirm nested tables are out of scope for the basic path.
    for tr in element.findall('.//tr'):
        cells = []
        for td in tr.findall('.//td') + tr.findall('.//th'):
            cell = Cell(
                content=td.text_content().strip(),
                colspan=self._parse_span(td.get('colspan')),
                rowspan=self._parse_span(td.get('rowspan')),
                is_header=(td.tag == 'th'),
                align=td.get('align')
            )
            cells.append(cell)
        if cells:
            row = Row(cells=cells, is_header=(tr.find('.//th') is not None))
            # Determine if header or data row.
            # NOTE(review): headers stores raw cell lists while rows
            # stores Row objects — mirrors TableNode's API; confirm.
            if tr.getparent().tag == 'thead' or row.is_header:
                table.headers.append(cells)
            else:
                table.rows.append(row)
    return table

@staticmethod
def _parse_span(value: Optional[str]) -> int:
    """
    Parse a colspan/rowspan attribute, defaulting to 1.

    BUG FIX: the original did int(td.get('colspan', '1')) which raised
    ValueError on malformed markup such as colspan="" or colspan="50%".
    """
    try:
        return int(value)
    except (TypeError, ValueError):
        return 1
def _parse_dimension(self, value: Optional[str]) -> Optional[int]:
"""Parse dimension value (width/height)."""
if not value:
return None
# Remove 'px' suffix if present
value = value.strip().rstrip('px')
try:
return int(value)
except ValueError:
return None
def _enter_xbrl_context(self, element: HtmlElement):
"""Enter XBRL context."""
if self.config.extract_xbrl and self.strategies.get('xbrl_extraction'):
xbrl_data = self.strategies['xbrl_extraction'].extract_context(element)
if xbrl_data:
self.xbrl_context_stack.append(xbrl_data)
def _exit_xbrl_context(self, element: HtmlElement):
"""Exit XBRL context."""
if self.xbrl_context_stack:
self.xbrl_context_stack.pop()
def _get_current_xbrl_metadata(self) -> Dict[str, Any]:
"""Get current XBRL metadata."""
if not self.xbrl_context_stack:
return {}
# Merge all contexts in stack
metadata = {}
for context in self.xbrl_context_stack:
metadata.update(context)
return metadata
def _merge_adjacent_nodes(self, root: Node):
    """
    Merge adjacent text nodes with similar styles.

    Placeholder: intentionally a no-op until merging is implemented.
    build() calls this only when config.merge_adjacent_nodes is set.
    """
    # Implementation would recursively merge adjacent text nodes
    pass

View File

@@ -0,0 +1,450 @@
"""
Multi-strategy header detection for document structure.
"""
import re
from abc import ABC, abstractmethod
from typing import Optional, List, Dict
from lxml.html import HtmlElement
from edgar.documents.config import ParserConfig
from edgar.documents.types import HeaderInfo, ParseContext
class HeaderDetector(ABC):
    """Abstract base class for header detectors."""

    @abstractmethod
    def detect(self, element: HtmlElement, context: ParseContext) -> Optional[HeaderInfo]:
        """Detect if element is a header; return None when it is not."""
        pass

    @property
    @abstractmethod
    def name(self) -> str:
        """Detector name; tags results for weighting in HeaderDetectionStrategy."""
        pass
class StyleBasedDetector(HeaderDetector):
    """
    Detect headers from CSS styling: font size relative to the base
    font, weight, alignment, case and vertical margins.
    """

    # (size ratio threshold, confidence contribution, heading level),
    # largest first — only the first matching tier applies.
    _SIZE_TIERS = (
        (2.0, 0.8, 1),
        (1.5, 0.7, 2),
        (1.2, 0.5, 3),
        (1.1, 0.3, 4),
    )

    @property
    def name(self) -> str:
        return "style"

    def detect(self, element: HtmlElement, context: ParseContext) -> Optional[HeaderInfo]:
        """Score the element's style; report a header above 0.4 confidence."""
        style = context.get_current_style()
        if not style:
            return None

        text = element.text_content().strip()
        # Very long text is never a header.
        if not text or len(text) > 200:
            return None

        confidence = 0.0
        level = 3  # Default level

        # Font size relative to the document's base size.
        if style.font_size and context.base_font_size:
            ratio = style.font_size / context.base_font_size
            for threshold, boost, tier_level in self._SIZE_TIERS:
                if ratio >= threshold:
                    confidence += boost
                    level = tier_level
                    break

        # Bold text is header-like; promote a default-level heading.
        if style.is_bold:
            confidence += 0.3
            if level == 3:
                level = 2

        if style.is_centered:
            confidence += 0.2

        # Short all-caps runs read as headings.
        if text.isupper() and len(text.split()) <= 10:
            confidence += 0.2

        # Headers often carry extra vertical margin.
        if style.margin_top and style.margin_top > 20:
            confidence += 0.1
        if style.margin_bottom and style.margin_bottom > 10:
            confidence += 0.1

        confidence = min(confidence, 1.0)
        if confidence > 0.4:  # Threshold for style-based detection
            return HeaderInfo.from_text(text, level, confidence, self.name)
        return None
class PatternBasedDetector(HeaderDetector):
    """Detect headers based on text patterns common in SEC filings."""

    # (regex, heading level, base confidence). Matching is case
    # SENSITIVE: several patterns distinguish Title Case from ALL CAPS,
    # and case variants ('Item'/'ITEM', 'Part'/'PART') are spelled out
    # explicitly where both should match.
    HEADER_PATTERNS = [
        # Item patterns
        (r'^(Item|ITEM)\s+(\d+[A-Z]?)[.\s]+(.+)$', 1, 0.95),
        (r'^Part\s+[IVX]+[.\s]*$', 1, 0.9),
        (r'^PART\s+[IVX]+[.\s]*$', 1, 0.9),
        # Section patterns
        (r'^(BUSINESS|RISK FACTORS|PROPERTIES|LEGAL PROCEEDINGS)$', 2, 0.85),
        (r'^(Management\'?s?\s+Discussion|MD&A)', 2, 0.85),
        (r'^(Financial\s+Statements|Consolidated\s+Financial\s+Statements)$', 2, 0.85),
        # Numbered sections
        (r'^\d+\.\s+[A-Z][A-Za-z\s]+$', 3, 0.7),
        (r'^[A-Z]\.\s+[A-Z][A-Za-z\s]+$', 3, 0.7),
        (r'^\([a-z]\)\s+[A-Z][A-Za-z\s]+$', 4, 0.6),
        # Title case headers
        (r'^[A-Z][A-Za-z\s]+[A-Za-z]$', 3, 0.5),
        # All caps headers
        (r'^[A-Z\s]+$', 3, 0.6),
    ]

    @property
    def name(self) -> str:
        return "pattern"

    def detect(self, element: HtmlElement, context: ParseContext) -> Optional[HeaderInfo]:
        """Detect headers based on text patterns."""
        text = element.text_content().strip()
        # Skip empty or very long text
        if not text or len(text) > 200:
            return None
        # Skip single punctuation - never headers
        if len(text) == 1 and text in '.,!?;:()[]{}':
            return None
        # Multiple sentences => paragraph, not header.
        if text.count('.') > 2:
            return None
        for pattern, level, base_confidence in self.HEADER_PATTERNS:
            # BUG FIX: matching previously used re.IGNORECASE, which made
            # the case-sensitive patterns meaningless — the Title Case
            # and ALL CAPS patterns matched ANY alphabetic text, and the
            # explicit 'Item|ITEM' / 'Part'+'PART' alternations show
            # case-sensitive matching was intended.
            match = re.match(pattern, text)
            if match:
                confidence = base_confidence
                # Lone child of its parent: more likely a header.
                # (Guard added: getparent() is None for a detached root.)
                parent = element.getparent()
                if parent is not None and len(parent) == 1:
                    confidence += 0.1
                # Followed by substantial text: header-like.
                next_elem = element.getnext()
                if next_elem is not None and len(next_elem.text_content()) > 100:
                    confidence += 0.1
                confidence = min(confidence, 1.0)
                return HeaderInfo.from_text(text, level, confidence, self.name)
        return None
class StructuralDetector(HeaderDetector):
    """Detect headers based on DOM structure."""

    @property
    def name(self) -> str:
        return "structural"

    def detect(self, element: HtmlElement, context: ParseContext) -> Optional[HeaderInfo]:
        """Detect headers from structural cues: tag, parent, siblings, length."""
        text = element.text_content().strip()
        # Skip empty or very long text
        if not text or len(text) > 200:
            return None
        # Skip single punctuation - never headers
        if len(text) == 1 and text in '.,!?;:()[]{}':
            return None
        confidence = 0.0
        level = 3
        # Real heading tags are definitive: full confidence, level from tag.
        tag = element.tag.lower()
        if tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
            confidence = 1.0
            level = int(tag[1])
            return HeaderInfo.from_text(text, level, confidence, self.name)
        # Check parent structure
        parent = element.getparent()
        if parent is not None:
            parent_tag = parent.tag.lower()
            # Header-like container
            if parent_tag in ['header', 'thead', 'caption']:
                confidence += 0.6
                level = 2
            # Parent has few children (isolated element)
            if len(parent) <= 3:
                confidence += 0.3
            # Parent is centered
            parent_align = parent.get('align')
            if parent_align == 'center':
                confidence += 0.2
        # Element's own emphasis and alignment
        if tag in ['strong', 'b']:
            confidence += 0.3
        if element.get('align') == 'center':
            confidence += 0.2
        # Followed by block content (headers usually introduce blocks)
        next_elem = element.getnext()
        if next_elem is not None:
            next_tag = next_elem.tag.lower()
            if next_tag in ['p', 'div', 'table', 'ul', 'ol']:
                confidence += 0.2
        # Short text is header-like
        words = text.split()
        if 1 <= len(words) <= 10:
            confidence += 0.1
        # Normalize confidence
        confidence = min(confidence, 1.0)
        if confidence > 0.5:
            return HeaderInfo.from_text(text, level, confidence, self.name)
        return None
class ContextualDetector(HeaderDetector):
    """Detect headers from their relationship to surrounding content."""

    @property
    def name(self) -> str:
        return "contextual"

    def detect(self, element: HtmlElement, context: ParseContext) -> Optional[HeaderInfo]:
        """Score contextual clues; report a header above 0.5 confidence."""
        text = element.text_content().strip()
        if not text or len(text) > 200:
            return None
        # Single punctuation characters are never headers.
        if len(text) == 1 and text in '.,!?;:()[]{}':
            return None

        score = 0.0
        level = 3

        # The text itself reads like a header.
        if self._looks_like_header(text):
            score += 0.4

        # A header-like previous sibling suggests a section hierarchy.
        prev_elem = element.getprevious()
        if prev_elem is not None:
            prev_text = prev_elem.text_content().strip()
            if prev_text and self._looks_like_header(prev_text):
                score += 0.3
                # Longer than its predecessor => treat as the higher level.
                level = 2 if len(text) > len(prev_text) else 3

        # Headers are usually followed by longer and/or indented content.
        next_elem = element.getnext()
        if next_elem is not None:
            next_text = next_elem.text_content().strip()
            if len(next_text) > len(text) * 3:
                score += 0.3
            next_style = next_elem.get('style', '')
            if 'margin-left' in next_style or 'padding-left' in next_style:
                score += 0.2

        # Early in the document (no section yet, shallow depth) favors headers.
        if context.current_section is None and context.depth < 5:
            score += 0.2

        score = min(score, 1.0)
        if score > 0.5:
            return HeaderInfo.from_text(text, level, score, self.name)
        return None

    def _looks_like_header(self, text: str) -> bool:
        """Heuristic: short, no terminal punctuation, capitalized."""
        if len(text.split()) > 15:
            return False
        # Sentence-ending punctuation disqualifies (colon is allowed).
        if text.rstrip().endswith(('.', '!', '?', ';')):
            return False
        if text.istitle() or text.isupper():
            return True
        # Any leading capital still counts.
        return bool(text) and text[0].isupper()
class HeaderDetectionStrategy:
    """
    Multi-strategy header detection.

    Combines multiple detection methods with weighted voting.
    """

    def __init__(self, config: ParserConfig):
        """Initialize with configuration."""
        self.config = config
        self.detectors = self._init_detectors()

    def _init_detectors(self) -> List[HeaderDetector]:
        """Initialize enabled detectors."""
        detectors = []
        # Always include basic detectors
        detectors.extend([
            StyleBasedDetector(),
            PatternBasedDetector(),
            StructuralDetector(),
            ContextualDetector()
        ])
        # Add ML detector if enabled (feature-flagged; not yet implemented)
        if self.config.features.get('ml_header_detection'):
            # Would add MLBasedDetector here
            pass
        return detectors

    def detect(self, element: HtmlElement, context: ParseContext) -> Optional[HeaderInfo]:
        """
        Detect if element is a header using multiple strategies.

        Args:
            element: HTML element to check
            context: Current parsing context

        Returns:
            HeaderInfo if element is detected as header, None otherwise
        """
        # Skip if element has no text
        text = element.text_content().strip()
        if not text:
            return None
        # Collect results from all detectors
        results: List[HeaderInfo] = []
        for detector in self.detectors:
            try:
                result = detector.detect(element, context)
                if result:
                    results.append(result)
            except Exception:
                # Don't let one detector failure stop others
                continue
        if not results:
            return None
        # If only one detector fired, use its result if confident enough
        if len(results) == 1:
            if results[0].confidence >= self.config.header_detection_threshold:
                return results[0]
            return None
        # Multiple detectors - combine results via weighted voting
        return self._combine_results(results, text)

    def _combine_results(self, results: List[HeaderInfo], text: str) -> HeaderInfo:
        """Combine multiple detection results into one weighted HeaderInfo."""
        # Per-detector weights; pattern matching is trusted most of the
        # implemented detectors. Unknown methods fall back to 0.1.
        detector_weights = {
            'style': 0.3,
            'pattern': 0.4,
            'structural': 0.2,
            'contextual': 0.1,
            'ml': 0.5  # Would be highest if available
        }
        # Calculate weighted confidence
        total_confidence = 0.0
        total_weight = 0.0
        # Weighted votes per heading level
        level_votes: Dict[int, float] = {}
        for result in results:
            weight = detector_weights.get(result.detection_method, 0.1)
            total_confidence += result.confidence * weight
            total_weight += weight
            # Vote for level
            if result.level not in level_votes:
                level_votes[result.level] = 0.0
            level_votes[result.level] += result.confidence * weight
        # Normalize confidence by the total weight actually applied
        final_confidence = total_confidence / total_weight if total_weight > 0 else 0.0
        # Choose most voted level
        final_level = max(level_votes.items(), key=lambda x: x[1])[0]
        # Check if any detector found this is an item; take the first
        # non-empty item number reported.
        is_item = any(r.is_item for r in results)
        item_number = next((r.item_number for r in results if r.item_number), None)
        return HeaderInfo(
            level=final_level,
            confidence=final_confidence,
            text=text,
            detection_method='combined',
            is_item=is_item,
            item_number=item_number
        )

View File

@@ -0,0 +1,344 @@
"""
CSS style parser for HTML elements.
"""
import re
from typing import Dict, Optional, Tuple, Union
from edgar.documents.types import Style
from edgar.documents.utils import get_cache_manager
class StyleParser:
    """
    Parser for CSS style attributes.

    Handles inline styles and converts them to Style objects. Parsed
    results are cached keyed on the raw style string.
    """

    # Common CSS units (declared for reference/validation by callers).
    ABSOLUTE_UNITS = {'px', 'pt', 'pc', 'cm', 'mm', 'in'}
    RELATIVE_UNITS = {'em', 'rem', 'ex', 'ch', 'vw', 'vh', '%'}

    # Font weight keyword -> numeric weight mappings
    FONT_WEIGHT_MAP = {
        'normal': '400',
        'bold': '700',
        'bolder': '800',
        'lighter': '300'
    }

    def __init__(self):
        """Initialize style parser with the shared style cache."""
        self._cache = get_cache_manager().style_cache
def parse(self, style_string: str) -> Style:
    """
    Parse a CSS inline-style string into a Style object.

    Results are cached keyed on the exact input string, so repeated
    attribute values are parsed only once.

    Args:
        style_string: CSS style string (e.g., "font-size: 14px; color: red")

    Returns:
        Parsed Style object (an empty Style for falsy input)
    """
    if not style_string:
        return Style()

    cached = self._cache.get(style_string)
    if cached is not None:
        return cached

    result = Style()
    for prop, value in self._split_declarations(style_string).items():
        self._apply_property(result, prop, value)

    self._cache.put(style_string, result)
    return result
def _split_declarations(self, style_string: str) -> Dict[str, str]:
"""Split style string into property-value pairs."""
declarations = {}
# Split by semicolon, handling potential issues
parts = style_string.split(';')
for part in parts:
part = part.strip()
if not part:
continue
# Split property and value
if ':' in part:
prop, value = part.split(':', 1)
prop = prop.strip().lower()
value = value.strip()
if prop and value:
declarations[prop] = value
return declarations
def _apply_property(self, style: Style, prop: str, value: str):
"""Apply CSS property to Style object."""
# Font properties
if prop == 'font-size':
size = self._parse_length(value)
if size is not None:
style.font_size = size
elif prop == 'font-weight':
style.font_weight = self._normalize_font_weight(value)
elif prop == 'font-style':
if value in ['italic', 'oblique']:
style.font_style = 'italic'
elif value == 'normal':
style.font_style = 'normal'
# Text properties
elif prop == 'text-align':
if value in ['left', 'right', 'center', 'justify']:
style.text_align = value
elif prop == 'text-decoration':
style.text_decoration = value
# Color properties
elif prop == 'color':
style.color = self._normalize_color(value)
elif prop in ['background-color', 'background']:
color = self._extract_background_color(value)
if color:
style.background_color = color
# Spacing properties
elif prop == 'margin':
self._parse_box_property(style, 'margin', value)
elif prop == 'margin-top':
margin = self._parse_length(value)
if margin is not None:
style.margin_top = margin
elif prop == 'margin-bottom':
margin = self._parse_length(value)
if margin is not None:
style.margin_bottom = margin
elif prop == 'margin-left':
margin = self._parse_length(value)
if margin is not None:
style.margin_left = margin
elif prop == 'margin-right':
margin = self._parse_length(value)
if margin is not None:
style.margin_right = margin
elif prop == 'padding':
self._parse_box_property(style, 'padding', value)
elif prop == 'padding-top':
padding = self._parse_length(value)
if padding is not None:
style.padding_top = padding
elif prop == 'padding-bottom':
padding = self._parse_length(value)
if padding is not None:
style.padding_bottom = padding
elif prop == 'padding-left':
padding = self._parse_length(value)
if padding is not None:
style.padding_left = padding
elif prop == 'padding-right':
padding = self._parse_length(value)
if padding is not None:
style.padding_right = padding
# Display properties
elif prop == 'display':
style.display = value
# Size properties
elif prop == 'width':
style.width = self._parse_dimension(value)
elif prop == 'height':
style.height = self._parse_dimension(value)
# Line height
elif prop == 'line-height':
line_height = self._parse_line_height(value)
if line_height is not None:
style.line_height = line_height
def _parse_length(self, value: str) -> Optional[float]:
"""Parse CSS length value to pixels."""
value = value.strip().lower()
# Handle special values
if value in ['0', 'auto', 'inherit', 'initial']:
return 0.0 if value == '0' else None
# Extract number and unit
match = re.match(r'^(-?\d*\.?\d+)\s*([a-z%]*)$', value)
if not match:
return None
num_str, unit = match.groups()
try:
num = float(num_str)
except ValueError:
return None
# Convert to pixels
if not unit or unit == 'px':
return num
elif unit == 'pt':
return num * 1.333 # 1pt = 1.333px
elif unit == 'em':
return num * 16 # Assume 16px base
elif unit == 'rem':
return num * 16 # Assume 16px root
elif unit == '%':
return None # Can't convert percentage without context
elif unit == 'in':
return num * 96 # 1in = 96px
elif unit == 'cm':
return num * 37.8 # 1cm = 37.8px
elif unit == 'mm':
return num * 3.78 # 1mm = 3.78px
return None
def _parse_dimension(self, value: str) -> Optional[Union[float, str]]:
"""Parse dimension value (width/height)."""
value = value.strip()
# Check for percentage
if value.endswith('%'):
return value # Return as string
# Try to parse as length
length = self._parse_length(value)
return length
def _parse_line_height(self, value: str) -> Optional[float]:
"""Parse line-height value."""
value = value.strip()
# Unitless number (multiplier)
try:
return float(value)
except ValueError:
pass
# Try as length
return self._parse_length(value)
def _normalize_font_weight(self, value: str) -> str:
"""Normalize font weight value."""
value = value.strip().lower()
# Map keywords to numeric values
if value in self.FONT_WEIGHT_MAP:
return self.FONT_WEIGHT_MAP[value]
# Check if it's already numeric
if value.isdigit() and 100 <= int(value) <= 900:
return value
return value
def _normalize_color(self, value: str) -> str:
"""Normalize color value."""
value = value.strip().lower()
# Handle rgb/rgba
if value.startswith(('rgb(', 'rgba(')):
return value
# Handle hex colors
if value.startswith('#'):
# Expand 3-char hex to 6-char
if len(value) == 4:
return '#' + ''.join(c*2 for c in value[1:])
return value
# Return named colors as-is
return value
def _extract_background_color(self, value: str) -> Optional[str]:
"""Extract color from background property."""
# Simple extraction - could be enhanced
parts = value.split()
for part in parts:
if part.startswith('#') or part.startswith('rgb'):
return self._normalize_color(part)
# Check for named colors
if not any(unit in part for unit in self.ABSOLUTE_UNITS | self.RELATIVE_UNITS):
return part
return None
def _parse_box_property(self, style: Style, prop_type: str, value: str):
"""Parse box property (margin/padding) with multiple values."""
parts = value.split()
if not parts:
return
# Convert all parts to lengths
lengths = []
for part in parts:
length = self._parse_length(part)
if length is not None:
lengths.append(length)
if not lengths:
return
# Apply based on number of values (CSS box model)
if len(lengths) == 1:
# All sides
val = lengths[0]
setattr(style, f'{prop_type}_top', val)
setattr(style, f'{prop_type}_right', val)
setattr(style, f'{prop_type}_bottom', val)
setattr(style, f'{prop_type}_left', val)
elif len(lengths) == 2:
# Vertical, horizontal
vert, horiz = lengths
setattr(style, f'{prop_type}_top', vert)
setattr(style, f'{prop_type}_bottom', vert)
setattr(style, f'{prop_type}_left', horiz)
setattr(style, f'{prop_type}_right', horiz)
elif len(lengths) == 3:
# Top, horizontal, bottom
top, horiz, bottom = lengths
setattr(style, f'{prop_type}_top', top)
setattr(style, f'{prop_type}_bottom', bottom)
setattr(style, f'{prop_type}_left', horiz)
setattr(style, f'{prop_type}_right', horiz)
elif len(lengths) >= 4:
# Top, right, bottom, left
setattr(style, f'{prop_type}_top', lengths[0])
setattr(style, f'{prop_type}_right', lengths[1])
setattr(style, f'{prop_type}_bottom', lengths[2])
setattr(style, f'{prop_type}_left', lengths[3])
def merge_styles(self, base: Style, override: Style) -> Style:
"""
Merge two styles with override taking precedence.
Args:
base: Base style
override: Override style
Returns:
Merged style
"""
return base.merge(override)

View File

@@ -0,0 +1,637 @@
"""
Advanced table processing strategy.
"""
import re
from functools import lru_cache
from typing import List, Optional
from lxml.html import HtmlElement
from edgar.documents.config import ParserConfig
from edgar.documents.strategies.style_parser import StyleParser
from edgar.documents.table_nodes import TableNode, Cell, Row
from edgar.documents.types import TableType
class TableProcessor:
    """
    Advanced table processing with type detection and structure analysis.

    Converts an HTML <table> element into a TableNode: classifies header
    rows using SEC-filing-specific heuristics, detects the table type
    (financial, metrics, TOC, ...) and optionally extracts intra-table
    relationships.
    """

    # HTML entities that need replacement
    ENTITY_REPLACEMENTS = {
        '&horbar;': '-----',
        '&mdash;': '-----',
        '&ndash;': '---',
        '&minus;': '-',
        '&hyphen;': '-',
        '&dash;': '-',
        '&nbsp;': ' ',
        '&amp;': '&',
        '&lt;': '<',
        '&gt;': '>',
        '&quot;': '"',
        '&apos;': "'",
        '&#8202;': ' ',
        '&#8203;': '',
        '&#x2014;': '-----',
        '&#x2013;': '---',
        '&#x2212;': '-',
    }

    # Financial keywords for table type detection
    FINANCIAL_KEYWORDS = {
        'revenue', 'income', 'expense', 'asset', 'liability',
        'cash', 'equity', 'profit', 'loss', 'margin',
        'earnings', 'cost', 'sales', 'operating', 'net',
        'gross', 'total', 'balance', 'statement', 'consolidated',
        'provision', 'tax', 'taxes', 'compensation', 'stock',
        'share', 'shares', 'rsu', 'option', 'grant', 'vest'
    }

    # Metrics keywords
    METRICS_KEYWORDS = {
        'ratio', 'percentage', 'percent', '%', 'rate',
        'growth', 'change', 'increase', 'decrease',
        'average', 'median', 'total', 'count', 'number'
    }

    def __init__(self, config: ParserConfig):
        """Initialize table processor."""
        self.config = config
        self.style_parser = StyleParser()

    def process(self, element: HtmlElement) -> TableNode:
        """
        Process table element into TableNode.

        Args:
            element: HTML table element

        Returns:
            Processed TableNode
        """
        # Extract table metadata
        table_id = element.get('id')
        table_class = element.get('class', '').split()
        table_style = self.style_parser.parse(element.get('style', ''))

        # Create table node
        table = TableNode(style=table_style)

        # Set config for rendering decisions
        table._config = self.config

        # Add metadata
        if table_id:
            table.set_metadata('id', table_id)
        if table_class:
            table.set_metadata('classes', table_class)

        # Extract caption
        caption_elem = element.find('.//caption')
        if caption_elem is not None:
            table.caption = self._extract_text(caption_elem)

        # Extract summary
        summary = element.get('summary')
        if summary:
            table.summary = summary

        # Process table structure
        self._process_table_structure(element, table)

        # Detect table type if configured
        if self.config.detect_table_types:
            table.table_type = self._detect_table_type(table)

        # Extract relationships if configured
        if self.config.extract_table_relationships:
            self._extract_relationships(table)

        return table

    def _process_table_structure(self, element: HtmlElement, table: TableNode):
        """Process table structure (thead, tbody, tfoot).

        Rows outside thead are classified as header or data rows with
        heuristics tuned for SEC filings (multi-row headers, unit rows
        like "(in millions)", year rows following "Year Ended", etc.).
        """
        # Process thead
        thead = element.find('.//thead')
        if thead is not None:
            for tr in thead.findall('.//tr'):
                cells = self._process_row(tr, is_header=True)
                if cells:
                    table.headers.append(cells)

        # Process tbody (or direct rows)
        tbody = element.find('.//tbody')
        rows_container = tbody if tbody is not None else element

        # Track if we've seen headers and data rows
        headers_found = bool(table.headers)
        data_rows_started = False

        for tr in rows_container.findall('.//tr'):
            # Skip if already processed in thead
            if thead is not None and tr.getparent() == thead:
                continue

            # Check if this might be a header row
            is_header_row = False

            # Continue checking for headers if:
            # 1. We haven't found any headers yet, OR
            # 2. We've found headers but haven't seen data rows yet (multi-row headers)
            if not data_rows_started:
                is_header_row = self._is_header_row(tr)

                # Additional check for multi-row headers in financial tables.
                # If the previous row was a header and this row has years or units,
                # it's likely part of the header.
                if headers_found and not is_header_row:
                    row_text = tr.text_content().strip()
                    # Check for units like "(in millions)" or "(in thousands)"
                    if '(in millions)' in row_text or '(in thousands)' in row_text or '(in billions)' in row_text:
                        is_header_row = True
                    # Check for year rows that follow "Year Ended" headers
                    elif len(table.headers) > 0:
                        last_header_text = ' '.join(cell.text() for cell in table.headers[-1])
                        if 'year ended' in last_header_text.lower() or 'years ended' in last_header_text.lower():
                            # Check if this row has years
                            year_pattern = r'\b(19\d{2}|20\d{2})\b'
                            years_found = re.findall(year_pattern, row_text)
                            if years_found:
                                is_header_row = True

            cells = self._process_row(tr, is_header=is_header_row)
            if cells:
                if is_header_row:
                    table.headers.append(cells)
                    headers_found = True
                else:
                    # Only mark data_rows_started if this row has actual content.
                    # Empty rows at the beginning shouldn't stop header detection.
                    row = Row(cells=cells, is_header=False)
                    table.rows.append(row)

                    # Check if row has significant content that indicates data rows have
                    # started. But be smart about it - descriptive rows like
                    # "(in millions)" or pure spacing shouldn't stop header detection.
                    has_content = any(cell.text().strip() for cell in cells)
                    if has_content:
                        # Get the row text for smarter analysis
                        row_text = ' '.join(cell.text().strip() for cell in cells).strip()
                        row_text_lower = row_text.lower()

                        # Don't consider this as "data started" if it's likely a
                        # header-related row
                        is_header_related = (
                            # Unit descriptions
                            '(in millions)' in row_text_lower or
                            '(in thousands)' in row_text_lower or
                            '(in billions)' in row_text_lower or
                            'except per share' in row_text_lower or
                            # Financial period descriptions
                            'year ended' in row_text_lower or
                            'months ended' in row_text_lower or
                            # Mostly just spacing/formatting
                            len(row_text.strip()) < 5 or
                            # Contains years (might be misclassified header)
                            bool(re.search(r'\b(19\d{2}|20\d{2})\b', row_text))
                        )

                        # Only mark data_rows_started if this seems like actual data
                        if not is_header_related:
                            data_rows_started = True

        # Process tfoot
        tfoot = element.find('.//tfoot')
        if tfoot is not None:
            for tr in tfoot.findall('.//tr'):
                cells = self._process_row(tr, is_header=False)
                if cells:
                    row = Row(cells=cells, is_header=False)
                    table.footer.append(row)

    def _process_row(self, tr: HtmlElement, is_header: bool) -> List[Cell]:
        """Process table row into cells.

        Cells are visited in document order via an XPath union. The
        previous `findall('.//td') + findall('.//th')` concatenation
        moved all th cells after the td cells whenever a row mixed both
        kinds (e.g. a th row label followed by td data), scrambling the
        column order.
        """
        cells = []

        # Process both td and th elements, preserving document order
        for cell_elem in tr.xpath('.//td | .//th'):
            cell = self._process_cell(cell_elem, is_header or cell_elem.tag == 'th')
            if cell:
                cells.append(cell)

        return cells

    @staticmethod
    def _parse_span(value: Optional[str]) -> int:
        """Parse a colspan/rowspan attribute value.

        Real filings contain malformed values ('', '100%'); these fall
        back to 1 instead of raising ValueError. Non-positive values are
        clamped to 1, matching browser behavior.
        """
        if value is None:
            return 1
        try:
            span = int(value)
        except (TypeError, ValueError):
            return 1
        return span if span > 0 else 1

    def _process_cell(self, elem: HtmlElement, is_header: bool) -> Optional[Cell]:
        """Process table cell into a Cell, tolerating malformed span attributes."""
        # Extract cell properties
        colspan = self._parse_span(elem.get('colspan'))
        rowspan = self._parse_span(elem.get('rowspan'))
        align = elem.get('align')

        # Extract style; inline text-align wins over the align attribute
        style = self.style_parser.parse(elem.get('style', ''))
        if style.text_align:
            align = style.text_align

        # Extract content
        content = self._extract_cell_content(elem)

        # Create cell
        cell = Cell(
            content=content,
            colspan=colspan,
            rowspan=rowspan,
            is_header=is_header,
            align=align
        )

        return cell

    def _extract_cell_content(self, elem: HtmlElement) -> str:
        """Extract and clean cell content, preserving intended line breaks."""
        # Check for nested structure
        divs = elem.findall('.//div')
        if divs and len(divs) > 1:
            # Multiple divs - likely multi-line content
            lines = []
            for div in divs:
                text = self._extract_text(div)
                if text:
                    lines.append(text)
            return '\n'.join(lines)

        # Handle line breaks
        for br in elem.findall('.//br'):
            br.tail = '\n' + (br.tail or '')

        # Extract text
        text = self._extract_text(elem)

        return text

    def _extract_text(self, elem: HtmlElement) -> str:
        """Extract and clean text from element.

        Uses itertext() rather than text_content() so spaces lost at tag
        boundaries can be re-inserted between adjacent fragments.
        """
        # Use itertext() to get all text fragments
        text_parts = []
        for text in elem.itertext():
            if text:
                text_parts.append(text)

        # Join parts, ensuring we don't lose spaces.
        # If a part doesn't end with whitespace and the next doesn't start with
        # whitespace, we need to add a space between them.
        if not text_parts:
            return ''

        result = []
        for i, part in enumerate(text_parts):
            if i == 0:
                result.append(part)
            else:
                prev_part = text_parts[i-1]
                # Check if we need to add a space between parts.
                # Don't add space if previous ends with space or current starts with space.
                if prev_part and part:
                    if not prev_part[-1].isspace() and not part[0].isspace():
                        # Check for punctuation that shouldn't have space before it
                        if part[0] not in ',.;:!?%)]':
                            result.append(' ')
                result.append(part)

        text = ''.join(result)

        # Replace entities
        for entity, replacement in self.ENTITY_REPLACEMENTS.items():
            text = text.replace(entity, replacement)

        # Clean whitespace
        text = text.strip()

        # Normalize internal whitespace but preserve line breaks
        lines = text.split('\n')
        cleaned_lines = []
        for line in lines:
            # Collapse multiple spaces to single space
            line = ' '.join(line.split())
            cleaned_lines.append(line)

        return '\n'.join(cleaned_lines)

    @staticmethod
    @lru_cache(maxsize=1)
    def _get_period_header_pattern():
        """
        Compile comprehensive regex for financial period headers.
        Adapted from old parser's proven patterns.

        Returns:
            Compiled regex pattern matching financial period headers
        """
        # Base components
        periods = r'(?:three|six|nine|twelve|[1-4]|first|second|third|fourth)'
        timeframes = r'(?:month|quarter|year|week)'
        ended_variants = r'(?:ended|ending|end|period)'
        as_of_variants = r'(?:as\s+of|at|as\s+at)'

        # Date pattern
        months = r'(?:january|february|march|april|may|june|july|august|september|october|november|december|jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)'
        day = r'\d{1,2}'
        year = r'(?:19|20)\d{2}'
        date = f'{months}\\s*\\.?\\s*{day}\\s*,?\\s*{year}'

        # Combined patterns
        patterns = [
            # Standard period headers
            f'{periods}\\s+{timeframes}\\s+{ended_variants}(?:\\s+{date})?',
            f'(?:fiscal\\s+)?{timeframes}\\s+{ended_variants}',
            f'{timeframes}\\s+{ended_variants}(?:\\s+{date})?',
            # Balance sheet date headers
            f'{as_of_variants}\\s+{date}',
            # Multiple date sequences
            f'{date}(?:\\s*(?:and|,)\\s*{date})*',
            # Single dates
            f'(?:{ended_variants}\\s+)?{date}'
        ]

        pattern = '|'.join(f'(?:{p})' for p in patterns)
        return re.compile(pattern, re.IGNORECASE)

    def _is_header_row(self, tr: HtmlElement) -> bool:
        """Detect if row is likely a header row in SEC filings."""
        # Check if contains th elements (most reliable indicator)
        if tr.find('.//th') is not None:
            return True

        cells = tr.findall('.//td')
        if not cells:
            return False

        # Get row text for analysis
        row_text = tr.text_content()
        row_text_lower = row_text.lower()

        # Check for date ranges with financial data (Oracle Table 6 pattern).
        # Date ranges like "March 1, 2024—March 31, 2024" should be data rows, not headers.
        date_range_pattern = r'(?:january|february|march|april|may|june|july|august|september|october|november|december)\s+\d{1,2},\s*\d{4}\s*[—–-]\s*(?:january|february|march|april|may|june|july|august|september|october|november|december)\s+\d{1,2},\s*\d{4}'
        has_date_range = bool(re.search(date_range_pattern, row_text_lower))

        # Check for financial data indicators
        has_currency = bool(re.search(r'\$[\s]*[\d,\.]+', row_text))
        has_decimals = bool(re.search(r'\b\d+\.\d+\b', row_text))
        has_large_numbers = bool(re.search(r'\b\d{1,3}(,\d{3})+\b', row_text))

        # If row has date range + financial data, it's definitely a data row
        if has_date_range and (has_currency or has_decimals or has_large_numbers):
            return False

        # Check for year patterns (very common in financial headers)
        year_pattern = r'\b(19\d{2}|20\d{2})\b'
        years_found = re.findall(year_pattern, row_text)
        if len(years_found) >= 2:  # Multiple years suggest header row
            # IMPORTANT: Check for date ranges and same-year repetition.
            # Date ranges like "March 1, 2024—March 31, 2024" contain the same year
            # twice but are data rows, not multi-year comparison headers.

            # If all years are the same (date range pattern)
            if len(set(years_found)) == 1:
                # Same year repeated - likely a date range like "Jan 1, 2024 - Mar 31, 2024"
                # Not a multi-year comparison header
                pass  # Don't return True
            # Multiple different years suggest multi-year comparison header
            elif 'total' not in row_text_lower[:20]:  # Check first 20 chars
                return True

        # Enhanced year detection - check individual cells for year patterns.
        # This handles cases where years are in separate cells.
        year_cells = 0
        date_phrases = 0
        for cell in cells:
            cell_text = cell.text_content().strip()
            if cell_text:
                # Check for individual years
                if re.match(r'^\s*(19\d{2}|20\d{2})\s*$', cell_text):
                    year_cells += 1
                # Check for date phrases like "June 30, 2025"
                elif 'june 30' in cell_text.lower() or 'december 31' in cell_text.lower():
                    date_phrases += 1

        # If we have multiple year cells or year + date phrases, likely a header
        if year_cells >= 2 or (year_cells >= 1 and date_phrases >= 1):
            if 'total' not in row_text_lower[:20]:
                return True

        # Check for comprehensive financial period patterns (from old parser)
        period_pattern = self._get_period_header_pattern()
        if period_pattern.search(row_text_lower):
            # Additional validation: ensure it's not a data row with period text.
            # Check for absence of strong data indicators.
            data_pattern = r'(?:\$\s*\d|\d+(?:,\d{3})+|\d+\s*[+\-*/]\s*\d+|\(\s*\d+(?:,\d{3})*\s*\))'
            if not re.search(data_pattern, row_text):
                return True

        # Check for units notation (in millions, thousands, billions)
        units_pattern = r'\(in\s+(?:millions|thousands|billions)\)'
        if re.search(units_pattern, row_text_lower):
            return True

        # Check for period indicators (quarters, months).
        # But be careful with "fiscal" - it could be data like "Fiscal 2025".
        period_keywords = ['quarter', 'q1', 'q2', 'q3', 'q4', 'month',
                          'january', 'february', 'march', 'april', 'may', 'june',
                          'july', 'august', 'september', 'october', 'november', 'december',
                          'ended', 'three months', 'six months', 'nine months']

        # Special handling for "fiscal" - only treat as header if it's part of a
        # phrase like "fiscal year ended"
        if 'fiscal' in row_text_lower:
            # Check if row has numeric values (suggests it's data, not header).
            # Look for patterns like "Fiscal 2025 $10,612".
            has_currency_values = bool(re.search(r'\$[\s]*[\d,]+', row_text))
            has_large_numbers = bool(re.search(r'\b\d{1,3}(,\d{3})+\b', row_text))

            # If it has currency or large numbers, it's likely data
            if has_currency_values or has_large_numbers:
                return False

            # Check if it's just "Fiscal YYYY" which is likely data, not a header
            fiscal_year_only = re.match(r'^\s*fiscal\s+\d{4}\s*$', row_text_lower.strip())
            if fiscal_year_only:
                return False  # This is data, not a header

            # Check for header-like phrases with fiscal
            if 'fiscal year' in row_text_lower and ('ended' in row_text_lower or 'ending' in row_text_lower):
                return True

        if any(keyword in row_text_lower for keyword in period_keywords):
            # Validate it's not a data row with period keywords.
            # Check for strong data indicators.
            data_pattern = r'(?:\$\s*\d|\d+(?:,\d{3})+|\d+\.\d+|[(]\s*\d+(?:,\d{3})*\s*[)])'
            if not re.search(data_pattern, row_text):
                return True

        # Check for column descriptors (but NOT total).
        # These are words commonly found in headers but not data rows.
        header_keywords = ['description', 'item', 'category', 'type', 'classification',
                          'change', 'percent', 'increase', 'decrease', 'variance']
        if any(keyword in row_text_lower for keyword in header_keywords):
            # Make sure it's not a total row
            if 'total' not in row_text_lower[:30]:
                # Additional validation: long narrative text is not a header.
                # Headers are typically concise (< 150 chars).
                if len(row_text) > 150:
                    return False

                # Check for data indicators (would indicate data row, not header)
                data_pattern = r'(?:\$\s*\d|\d+(?:,\d{3})+|\d+\.\d+|[(]\s*\d+(?:,\d{3})*\s*[)])'
                if re.search(data_pattern, row_text):
                    return False

                return True

        # Check if all cells are bold (common header formatting)
        bold_count = 0
        for cell in cells:
            style = cell.get('style', '')
            if 'font-weight' in style and 'bold' in style:
                bold_count += 1
            elif cell.find('.//b') is not None or cell.find('.//strong') is not None:
                bold_count += 1

        # Only consider it a header if ALL cells are bold (not just some)
        if bold_count == len(cells) and bold_count > 0:
            return True

        # Check content type ratio - headers usually have more text than numbers.
        # Count cells with primarily text vs primarily numbers.
        text_cells = 0
        number_cells = 0
        for cell in cells:
            cell_text = cell.text_content().strip()
            if cell_text:
                # Remove common symbols for analysis
                clean_text = cell_text.replace('$', '').replace('%', '').replace(',', '').replace('(', '').replace(')', '')
                if clean_text.replace('.', '').replace('-', '').strip().isdigit():
                    number_cells += 1
                else:
                    text_cells += 1

        # Be very careful about treating text-heavy rows as headers.
        # Many data rows start with text labels (e.g., "Impact of...", "Effect of...").
        # Only consider it a header if it has mostly text AND doesn't look like a data label.
        if text_cells > number_cells * 2 and text_cells >= 3:
            # Check for common data row patterns
            data_row_indicators = [
                'impact of', 'effect of', 'adjustment', 'provision for', 'benefit',
                'expense', 'income from', 'loss on', 'gain on', 'charge', 'credit',
                'earnings', 'computed', 'state taxes', 'research', 'excess tax'
            ]

            # If it starts with any of these, it's likely a data row, not a header
            for indicator in data_row_indicators:
                if row_text_lower.startswith(indicator) or indicator in row_text_lower[:50]:
                    return False

            # Also not a header if it starts with "total"
            if not row_text_lower.startswith('total'):
                return True

        return False

    def _detect_table_type(self, table: TableNode) -> TableType:
        """Detect the type of table based on content."""
        # Collect text from headers and first few rows
        text_parts = []

        # Add caption
        if table.caption:
            text_parts.append(table.caption.lower())

        # Add headers
        for header_row in table.headers:
            for cell in header_row:
                text_parts.append(cell.text().lower())

        # Add first few rows
        for row in table.rows[:3]:
            for cell in row.cells:
                text_parts.append(cell.text().lower())

        combined_text = ' '.join(text_parts)

        # Check for financial table
        financial_count = sum(1 for keyword in self.FINANCIAL_KEYWORDS if keyword in combined_text)
        if financial_count >= 2:  # Lowered threshold for better detection
            return TableType.FINANCIAL

        # Check for metrics table
        metrics_count = sum(1 for keyword in self.METRICS_KEYWORDS if keyword in combined_text)
        numeric_cells = sum(1 for row in table.rows for cell in row.cells if cell.is_numeric)
        total_cells = sum(len(row.cells) for row in table.rows)

        if total_cells > 0:
            numeric_ratio = numeric_cells / total_cells
            # More lenient metrics detection
            if metrics_count >= 1 or numeric_ratio > 0.3:
                return TableType.METRICS

        # Check for table of contents
        if 'content' in combined_text or 'index' in combined_text:
            # Look for page numbers
            has_page_numbers = any(
                re.search(r'\b\d{1,3}\b', cell.text())
                for row in table.rows
                for cell in row.cells
            )
            if has_page_numbers:
                return TableType.TABLE_OF_CONTENTS

        # Check for exhibit index
        if 'exhibit' in combined_text:
            return TableType.EXHIBIT_INDEX

        # Check for reference table (citations, definitions, etc.)
        if any(word in combined_text for word in ['reference', 'definition', 'glossary', 'citation']):
            return TableType.REFERENCE

        return TableType.GENERAL

    def _extract_relationships(self, table: TableNode):
        """Extract relationships within table data.

        Currently detects total rows and indentation-based hierarchy and
        records them as table metadata.
        """
        # This would implement relationship extraction.
        # For now, just set a flag that relationships were processed.
        table.set_metadata('relationships_extracted', True)

        # Example relationships to extract:
        # - Parent-child relationships (indented rows)
        # - Total rows that sum other rows
        # - Cross-references between cells
        # - Time series relationships

        # Detect total rows
        total_rows = []
        for i, row in enumerate(table.rows):
            if row.is_total_row:
                total_rows.append(i)

        if total_rows:
            table.set_metadata('total_rows', total_rows)

        # Detect indentation patterns (parent-child)
        indentation_levels = []
        for row in table.rows:
            if row.cells:
                first_cell_text = row.cells[0].text()
                # Count leading spaces
                indent = len(first_cell_text) - len(first_cell_text.lstrip())
                indentation_levels.append(indent)

        if any(level > 0 for level in indentation_levels):
            table.set_metadata('has_hierarchy', True)
            table.set_metadata('indentation_levels', indentation_levels)

View File

@@ -0,0 +1,345 @@
"""
XBRL extraction strategy for inline XBRL documents.
"""
from typing import Dict, Any, Optional
from lxml.html import HtmlElement
from edgar.documents.types import XBRLFact
class XBRLExtractor:
"""
Extracts XBRL facts from inline XBRL (iXBRL) documents.
Handles:
- ix:nonFraction, ix:nonNumeric facts
- Context and unit resolution
- Continuation handling
- Transformation rules
"""
# XBRL namespaces
NAMESPACES = {
'ix': 'http://www.xbrl.org/2013/inlineXBRL',
'xbrli': 'http://www.xbrl.org/2003/instance',
'xbrldi': 'http://xbrl.org/2006/xbrldi',
'xsi': 'http://www.w3.org/2001/XMLSchema-instance'
}
# Common transformation formats
TRANSFORMATIONS = {
'ixt:numdotdecimal': lambda x: x.replace(',', ''),
'ixt:numcommadecimal': lambda x: x.replace('.', '_').replace(',', '.').replace('_', ','),
'ixt:zerodash': lambda x: '0' if x == '-' else x,
'ixt:datedoteu': lambda x: x.replace('.', '-'),
'ixt:datedotus': lambda x: x.replace('.', '/'),
}
def __init__(self):
"""Initialize XBRL extractor."""
self.contexts: Dict[str, Dict[str, Any]] = {}
self.units: Dict[str, str] = {}
self.continuations: Dict[str, str] = {}
self._initialized = False
def extract_context(self, element: HtmlElement) -> Optional[Dict[str, Any]]:
"""
Extract XBRL context from element.
Args:
element: HTML element that might contain XBRL
Returns:
XBRL metadata if found
"""
# Check if element is an ix: tag
if not self._is_xbrl_element(element):
return None
# Initialize context if needed
if not self._initialized:
self._initialize_context(element)
# Extract based on element type
tag_name = self._get_local_name(element.tag)
if tag_name == 'nonfraction':
return self._extract_nonfraction(element)
elif tag_name == 'nonnumeric':
return self._extract_nonnumeric(element)
elif tag_name == 'continuation':
return self._extract_continuation(element)
elif tag_name == 'footnote':
return self._extract_footnote(element)
elif tag_name == 'fraction':
return self._extract_fraction(element)
return None
def extract_fact(self, element: HtmlElement) -> Optional[XBRLFact]:
"""Extract XBRL fact from element."""
context = self.extract_context(element)
if not context:
return None
# Get fact value
value = self._get_fact_value(element)
# Create fact
fact = XBRLFact(
concept=context.get('name', ''),
value=value,
context_ref=context.get('contextRef'),
unit_ref=context.get('unitRef'),
decimals=context.get('decimals'),
scale=context.get('scale'),
format=context.get('format'),
sign=context.get('sign')
)
# Resolve references
if fact.context_ref and fact.context_ref in self.contexts:
fact.context = self.contexts[fact.context_ref]
if fact.unit_ref and fact.unit_ref in self.units:
fact.unit = self.units[fact.unit_ref]
return fact
def _is_xbrl_element(self, element: HtmlElement) -> bool:
"""Check if element is an XBRL element."""
tag = element.tag
if not isinstance(tag, str):
return False
# Handle both namespaced and non-namespaced tags
tag_lower = tag.lower()
return (
tag.startswith('{' + self.NAMESPACES['ix'] + '}') or
tag.startswith('ix:') or
tag_lower.startswith('ix:')
)
def _get_local_name(self, tag: str) -> str:
"""Get local name from qualified tag."""
if '}' in tag:
return tag.split('}')[1].lower()
elif ':' in tag:
return tag.split(':')[1].lower()
return tag.lower()
def _initialize_context(self, element: HtmlElement):
"""Initialize context and unit information from document."""
# Find root element
root = element.getroottree().getroot()
# Extract contexts
self._extract_contexts(root)
# Extract units
self._extract_units(root)
self._initialized = True
    def _extract_contexts(self, root: HtmlElement):
        """Extract all context definitions.

        Populates self.contexts with one entry per xbrli:context element,
        keyed by the context's id attribute. Each entry records the
        reporting entity, the period (instant or duration) and any
        explicit dimensional qualifiers found in the context's segment.

        Args:
            root: Root element of the document to scan.
        """
        # Look for xbrli:context elements
        for context in root.xpath('//xbrli:context', namespaces=self.NAMESPACES):
            context_id = context.get('id')
            if not context_id:
                # A context without an id can never be referenced by a fact.
                continue
            context_data = {
                'id': context_id
            }
            # Extract entity
            entity = context.find('.//xbrli:entity', namespaces=self.NAMESPACES)
            if entity is not None:
                identifier = entity.find('.//xbrli:identifier', namespaces=self.NAMESPACES)
                if identifier is not None:
                    # Identifier value (e.g. CIK) plus the scheme URI defining it
                    context_data['entity'] = identifier.text
                    context_data['scheme'] = identifier.get('scheme')
            # Extract period
            period = context.find('.//xbrli:period', namespaces=self.NAMESPACES)
            if period is not None:
                instant = period.find('.//xbrli:instant', namespaces=self.NAMESPACES)
                if instant is not None:
                    # Point-in-time context (balance-sheet style)
                    context_data['instant'] = instant.text
                    context_data['period_type'] = 'instant'
                else:
                    # Duration context (income-statement style)
                    start = period.find('.//xbrli:startDate', namespaces=self.NAMESPACES)
                    end = period.find('.//xbrli:endDate', namespaces=self.NAMESPACES)
                    if start is not None and end is not None:
                        context_data['start_date'] = start.text
                        context_data['end_date'] = end.text
                        context_data['period_type'] = 'duration'
            # Extract dimensions
            segment = context.find('.//xbrli:segment', namespaces=self.NAMESPACES)
            if segment is not None:
                dimensions = {}
                for member in segment.findall('.//xbrldi:explicitMember', namespaces=self.NAMESPACES):
                    dim = member.get('dimension')
                    if dim:
                        dimensions[dim] = member.text
                if dimensions:
                    context_data['dimensions'] = dimensions
            self.contexts[context_id] = context_data
def _extract_units(self, root: HtmlElement):
"""Extract all unit definitions."""
# Look for xbrli:unit elements
for unit in root.xpath('//xbrli:unit', namespaces=self.NAMESPACES):
unit_id = unit.get('id')
if not unit_id:
continue
# Check for simple measure
measure = unit.find('.//xbrli:measure', namespaces=self.NAMESPACES)
if measure is not None:
self.units[unit_id] = self._normalize_unit(measure.text)
continue
# Check for complex unit (divide)
divide = unit.find('.//xbrli:divide', namespaces=self.NAMESPACES)
if divide is not None:
numerator = divide.find('.//xbrli:unitNumerator/xbrli:measure', namespaces=self.NAMESPACES)
denominator = divide.find('.//xbrli:unitDenominator/xbrli:measure', namespaces=self.NAMESPACES)
if numerator is not None and denominator is not None:
num_unit = self._normalize_unit(numerator.text)
den_unit = self._normalize_unit(denominator.text)
self.units[unit_id] = f"{num_unit}/{den_unit}"
def _normalize_unit(self, unit_text: str) -> str:
"""Normalize unit text."""
if not unit_text:
return ''
# Remove namespace prefix
if ':' in unit_text:
unit_text = unit_text.split(':')[-1]
# Common normalizations
unit_map = {
'usd': 'USD',
'shares': 'shares',
'pure': 'pure',
'percent': '%'
}
return unit_map.get(unit_text.lower(), unit_text)
def _extract_nonfraction(self, element: HtmlElement) -> Dict[str, Any]:
"""Extract ix:nonFraction element."""
metadata = {
'type': 'nonFraction',
'name': element.get('name'),
'contextRef': element.get('contextRef') or element.get('contextref'),
'unitRef': element.get('unitRef') or element.get('unitref'),
'decimals': element.get('decimals'),
'scale': element.get('scale'),
'format': element.get('format'),
'sign': element.get('sign')
}
# Clean None values
return {k: v for k, v in metadata.items() if v is not None}
def _extract_nonnumeric(self, element: HtmlElement) -> Dict[str, Any]:
"""Extract ix:nonNumeric element."""
metadata = {
'type': 'nonNumeric',
'name': element.get('name'),
'contextRef': element.get('contextRef') or element.get('contextref'),
'format': element.get('format')
}
# Clean None values
return {k: v for k, v in metadata.items() if v is not None}
def _extract_continuation(self, element: HtmlElement) -> Dict[str, Any]:
"""Extract ix:continuation element."""
cont_id = element.get('id')
continued_at = element.get('continuedAt')
if cont_id and continued_at:
# Map continuation to original
if continued_at in self.continuations:
original = self.continuations[continued_at]
self.continuations[cont_id] = original
return original
else:
# Store for later resolution
metadata = {
'type': 'continuation',
'id': cont_id,
'continuedAt': continued_at
}
self.continuations[cont_id] = metadata
return metadata
return {}
def _extract_footnote(self, element: HtmlElement) -> Dict[str, Any]:
"""Extract ix:footnote element."""
return {
'type': 'footnote',
'footnoteRole': element.get('footnoteRole'),
'footnoteID': element.get('footnoteID')
}
def _extract_fraction(self, element: HtmlElement) -> Dict[str, Any]:
"""Extract ix:fraction element."""
metadata = {
'type': 'fraction',
'name': element.get('name'),
'contextRef': element.get('contextRef'),
'unitRef': element.get('unitRef')
}
# Extract numerator and denominator
numerator = element.find('.//ix:numerator', namespaces=self.NAMESPACES)
denominator = element.find('.//ix:denominator', namespaces=self.NAMESPACES)
if numerator is not None:
metadata['numerator'] = numerator.text
if denominator is not None:
metadata['denominator'] = denominator.text
return {k: v for k, v in metadata.items() if v is not None}
def _get_fact_value(self, element: HtmlElement) -> str:
"""Get fact value from element with transformations."""
# Get raw value
value = element.text or ''
# Apply format transformation if specified
format_attr = element.get('format')
if format_attr and format_attr in self.TRANSFORMATIONS:
transform = self.TRANSFORMATIONS[format_attr]
value = transform(value)
# Apply scale if specified
scale = element.get('scale')
if scale:
try:
scale_factor = int(scale)
numeric_value = float(value.replace(',', ''))
scaled_value = numeric_value * (10 ** scale_factor)
value = str(scaled_value)
except (ValueError, TypeError):
pass
# Apply sign if specified
sign = element.get('sign')
if sign == '-':
if value and not value.startswith('-'):
value = '-' + value
return value.strip()