Initial commit

This commit is contained in:
kdusek
2025-12-09 12:13:01 +01:00
commit 8e654ed209
13332 changed files with 2695056 additions and 0 deletions

View File

@@ -0,0 +1,49 @@
"""
EdgarTools HTML Parser v2.0
A high-performance, semantically-aware HTML parser for SEC filings.
"""
from typing import Optional

from edgar.documents.config import ParserConfig
from edgar.documents.document import Document
from edgar.documents.exceptions import ParsingError
from edgar.documents.parser import HTMLParser
from edgar.documents.renderers import MarkdownRenderer, TextRenderer
from edgar.documents.search import DocumentSearch, SearchMode, SearchResult
from edgar.documents.types import NodeType, SemanticType, TableType
# Parser package version (independent of the edgartools release version).
__version__ = "2.0.0"
# Names re-exported as the package's public API; also governs `import *`.
__all__ = [
    'HTMLParser',
    'Document',
    'ParserConfig',
    'ParsingError',
    'NodeType',
    'SemanticType',
    'TableType',
    'DocumentSearch',
    'SearchResult',
    'SearchMode',
    'MarkdownRenderer',
    'TextRenderer',
    'parse_html'
]
def parse_html(html: str, config: Optional[ParserConfig] = None) -> Document:
    """
    Convenience function for parsing HTML.

    Args:
        html: HTML content to parse
        config: Optional parser configuration; a default ParserConfig
            is created when omitted

    Returns:
        Parsed Document object

    Example:
        >>> document = parse_html(html_content)
        >>> print(document.text()[:100])
    """
    # Fall back to a default configuration when the caller supplies none.
    parser = HTMLParser(config or ParserConfig())
    return parser.parse(html)

View File

@@ -0,0 +1,83 @@
"""
Mixin class providing text caching functionality for document nodes.
This module consolidates the text caching pattern used across multiple node types
(DocumentNode, ParagraphNode, ContainerNode, TableNode, and Document).
"""
from typing import Callable, Any
class CacheableMixin:
    """
    Mixin that adds lazy text caching to document nodes.

    The caching pattern implemented here:
    1. Return the cached text when one is already stored.
    2. On a cache miss, invoke a generator callable to build the text.
    3. Store the generated value for subsequent accesses.
    4. Support recursive cache invalidation across a node tree.

    Usage:
        class MyNode(CacheableMixin):
            def text(self, **kwargs):
                def generator():
                    # Generate text logic here
                    return "generated text"
                return self._get_cached_text(generator)
    """

    def _get_cached_text(self, generator_func: Callable[[], Any], *args, **kwargs) -> Any:
        """
        Return the cached text, generating and storing it on first access.

        Args:
            generator_func: Callable invoked on a cache miss to build the text
            *args: Positional arguments forwarded to the generator (currently unused)
            **kwargs: Keyword arguments forwarded to the generator (currently unused)

        Returns:
            The cached value, or the freshly generated one on a miss

        Note:
            The value is stored on the instance attribute '_text_cache'.
        """
        cached = getattr(self, '_text_cache', None)
        if cached is not None:
            return cached
        # Cache miss: build the text once and remember it.
        self._text_cache = generator_func(*args, **kwargs)
        return self._text_cache

    def clear_text_cache(self) -> None:
        """
        Invalidate this node's cached text and, recursively, its children's.

        Clearing recursively ensures that when a parent node's content
        changes, descendant caches are invalidated as well.

        Safe to call when the node has no '_text_cache' attribute, has no
        'children' attribute, or when some children lack clear_text_cache.
        """
        if hasattr(self, '_text_cache'):
            self._text_cache = None
        # Walk children defensively: tolerate nodes without the mixin.
        for child in getattr(self, 'children', ()):
            clear = getattr(child, 'clear_text_cache', None)
            if clear is not None:
                clear()

View File

@@ -0,0 +1,211 @@
"""
Configuration for the HTML parser.
"""
from dataclasses import dataclass, field
from typing import List, Dict, Any, Optional
@dataclass
class DetectionThresholds:
    """
    Configurable thresholds for section detection strategies.

    Attributes:
        min_confidence: Minimum confidence score to include a section (0.0-1.0)
        cross_validation_boost: Multiplier when multiple methods agree (>1.0)
        disagreement_penalty: Multiplier when methods disagree (<1.0)
        boundary_overlap_penalty: Multiplier for overlapping sections (<1.0)
        enable_cross_validation: Whether to run cross-validation (slower but more accurate)
        thresholds_by_form: Filing-specific threshold overrides
    """
    min_confidence: float = 0.6
    cross_validation_boost: float = 1.2
    disagreement_penalty: float = 0.8
    boundary_overlap_penalty: float = 0.9
    enable_cross_validation: bool = False  # Disabled by default for performance
    # Keyed by form type (e.g. '10-K'); per the docstring these override the
    # numeric thresholds above on a per-filing-type basis.
    thresholds_by_form: Dict[str, Dict[str, float]] = field(default_factory=dict)
@dataclass
class ParserConfig:
    """
    Configuration for HTML parser.

    Instances are plain dataclasses; use the classmethod factories
    (for_performance, for_accuracy, for_ai) for common presets.

    Attributes:
        max_document_size: Maximum document size in bytes
        streaming_threshold: Document size threshold for streaming mode
        cache_size: Maximum number of cached items
        enable_parallel: Enable parallel processing for tables
        strict_mode: Fail on parsing errors vs. best effort
        extract_xbrl: Extract inline XBRL facts
        extract_styles: Extract and process CSS styles
        preserve_whitespace: Preserve original whitespace
        optimize_for_ai: Enable AI-specific optimizations
        max_token_estimation: Maximum estimated tokens for AI optimization
        features: Feature flags for optional functionality
    """
    # Performance settings
    max_document_size: int = 100 * 1024 * 1024  # 100MB (handles large filings like JPM)
    streaming_threshold: int = 10 * 1024 * 1024  # 10MB
    cache_size: int = 1000
    enable_parallel: bool = True
    max_workers: Optional[int] = None  # None = use CPU count
    # Parsing settings
    strict_mode: bool = False
    extract_xbrl: bool = True
    extract_styles: bool = True
    preserve_whitespace: bool = False
    normalize_text: bool = True
    extract_links: bool = True
    extract_images: bool = False
    # AI optimization
    optimize_for_ai: bool = True
    max_token_estimation: int = 100_000
    # NOTE(review): chunk_size/chunk_overlap units (tokens vs. characters)
    # are not established here — confirm against the chunking code.
    chunk_size: int = 512
    chunk_overlap: int = 128
    # Table processing
    table_extraction: bool = True
    detect_table_types: bool = True
    extract_table_relationships: bool = True
    fast_table_rendering: bool = True  # Fast renderer is now production-ready (7-10x faster than Rich)
    # Section detection
    detect_sections: bool = True
    eager_section_extraction: bool = False  # Extract sections during parsing vs. on first access (default: lazy)
    form: Optional[str] = None  # Required for section detection (e.g. '10-K', '10-Q', '8-K')
    detection_thresholds: DetectionThresholds = field(default_factory=DetectionThresholds)
    # Regex patterns (case/whitespace-tolerant) used to recognize standard
    # filing sections by their heading text; keys are canonical section names.
    section_patterns: Dict[str, List[str]] = field(default_factory=lambda: {
        'business': [
            r'item\s+1\.?\s*business',
            r'business\s+overview',
            r'our\s+business'
        ],
        'risk_factors': [
            r'item\s+1a\.?\s*risk\s+factors',
            r'risk\s+factors',
            r'factors\s+that\s+may\s+affect'
        ],
        'properties': [
            r'item\s+2\.?\s*properties',
            r'properties'
        ],
        'legal_proceedings': [
            r'item\s+3\.?\s*legal\s+proceedings',
            r'legal\s+proceedings',
            r'litigation'
        ],
        'mda': [
            r'item\s+7\.?\s*management\'?s?\s+discussion',
            r'md&a',
            r'management\'?s?\s+discussion\s+and\s+analysis'
        ],
        'financial_statements': [
            r'item\s+8\.?\s*financial\s+statements',
            r'consolidated\s+financial\s+statements',
            r'financial\s+statements'
        ]
    })
    # Feature flags
    features: Dict[str, bool] = field(default_factory=lambda: {
        'ml_header_detection': True,
        'semantic_analysis': True,
        'table_understanding': True,
        'xbrl_validation': True,
        'auto_section_detection': True,
        'smart_text_extraction': True,
        'footnote_linking': True,
        'cross_reference_resolution': True
    })
    # Header detection settings
    header_detection_threshold: float = 0.6  # Minimum confidence
    header_detection_methods: List[str] = field(default_factory=lambda: [
        'style',
        'pattern',
        'structural',
        'contextual'
    ])
    # Text extraction settings
    min_text_length: int = 10  # Minimum text length to keep
    merge_adjacent_nodes: bool = True
    merge_distance: int = 2  # Max distance between nodes to merge
    # Performance monitoring
    enable_profiling: bool = False
    log_performance: bool = False
    def to_dict(self) -> Dict[str, Any]:
        """Convert configuration to dictionary.

        Note:
            Only a core subset of fields is serialized; table, section,
            header-detection and text-extraction settings are not included.
            The features dict is copied so mutating the result does not
            affect this config.
        """
        return {
            'max_document_size': self.max_document_size,
            'streaming_threshold': self.streaming_threshold,
            'cache_size': self.cache_size,
            'enable_parallel': self.enable_parallel,
            'strict_mode': self.strict_mode,
            'extract_xbrl': self.extract_xbrl,
            'extract_styles': self.extract_styles,
            'preserve_whitespace': self.preserve_whitespace,
            'optimize_for_ai': self.optimize_for_ai,
            'features': self.features.copy()
        }
    @classmethod
    def for_performance(cls) -> 'ParserConfig':
        """Create config optimized for performance.

        NOTE(review): the explicit `features` dict replaces the full default
        flag set, so flags not listed here (e.g. 'footnote_linking') are
        absent — confirm consumers read flags with dict.get() defaults.
        """
        return cls(
            extract_styles=False,
            extract_xbrl=False,
            enable_parallel=True,
            cache_size=5000,
            eager_section_extraction=False,  # Skip expensive section extraction
            fast_table_rendering=True,  # Fast renderer (enabled by default now)
            features={
                'ml_header_detection': False,
                'semantic_analysis': False,
                'table_understanding': False,
                'xbrl_validation': False
            }
        )
    @classmethod
    def for_accuracy(cls) -> 'ParserConfig':
        """Create config optimized for accuracy.

        Enables strict mode (fail on parsing errors) and every feature flag.
        """
        return cls(
            strict_mode=True,
            extract_styles=True,
            extract_xbrl=True,
            enable_parallel=True,
            features={
                'ml_header_detection': True,
                'semantic_analysis': True,
                'table_understanding': True,
                'xbrl_validation': True,
                'auto_section_detection': True,
                'smart_text_extraction': True,
                'footnote_linking': True,
                'cross_reference_resolution': True
            }
        )
    @classmethod
    def for_ai(cls) -> 'ParserConfig':
        """Create config optimized for AI/LLM processing.

        Skips style extraction and normalizes/merges text for clean LLM
        input. See the NOTE on for_performance about the replaced
        `features` dict — the same caveat applies here.
        """
        return cls(
            optimize_for_ai=True,
            extract_styles=False,
            extract_xbrl=True,
            normalize_text=True,
            merge_adjacent_nodes=True,
            features={
                'ml_header_detection': True,
                'semantic_analysis': True,
                'smart_text_extraction': True
            }
        )

View File

@@ -0,0 +1,314 @@
# HTML Parser Rewrite - Status Report
**Generated**: 2025-10-08
**Branch**: `html_rewrite`
**Target**: Merge to `main`
---
## Overall Progress: ~95% Complete ✅
### Completed Phases
#### ✅ Phase 1: Core Implementation (100%)
- [x] Streaming parser for large documents
- [x] TableMatrix system for accurate table rendering
- [x] Section extraction with Part I/II detection
- [x] XBRL integration
- [x] Rich-based table rendering
- [x] Configuration system (ParserConfig)
- [x] Error handling and validation
#### ✅ Phase 2: Functional Testing (100%)
- [x] **Corpus Validation** - 40 diverse filings, 100% success rate
- [x] **Edge Cases** - 31 tests covering invalid inputs, malformed HTML, edge conditions
- [x] **Integration Tests** - 25 tests for Filing/Company integration, backward compatibility
- [x] **Regression Tests** - 15 tests preventing known bugs from returning
**Total Test Count**: 79 functional tests, all passing
#### ✅ Phase 3: Performance Profiling (100%)
- [x] **Benchmarking Infrastructure** - Comprehensive benchmark suite
- [x] **Hot Path Analysis** - Identified 3 critical bottlenecks (63% section extraction, 40% Rich rendering, 15% regex)
- [x] **Memory Profiling** - Found 255MB memory leak in MSFT 10-K, documented root causes
- [x] **Performance Regression Tests** - 15 tests locking in baseline thresholds
**Performance Baseline Established**:
- Average: 3.8MB/s throughput, 4.1MB memory per doc
- Small docs: 2.6MB/s (optimization opportunity)
- Large docs: 20.7MB/s (excellent streaming)
- Memory leak: 19-25x ratio on medium docs (needs fixing)
#### ✅ Phase 4: Test Data Augmentation (100%)
- [x] **HTML Fixtures** - Downloaded 32 files (155MB) from 16 companies across 6 industries
- [x] **Download Automation** - Created `download_html_fixtures.py` script
- [x] **Documentation** - Comprehensive fixture documentation
---
## Current Status: Ready for Optimization Phase
### What's Working Well ✅
1. **Parsing Accuracy**: 100% success rate across 40+ diverse filings
2. **Large Document Handling**: Excellent streaming performance (20.7MB/s on JPM 10-K)
3. **Table Extraction**: TableMatrix accurately handles colspan/rowspan
4. **Test Coverage**: 79 comprehensive tests covering edge cases, integration, regression
5. **Backward Compatibility**: Old TenK API still works for existing code
### Known Issues to Address 🔧
#### Critical (Must Fix Before Merge)
1. **Memory Leaks** (Priority: CRITICAL)
- MSFT 10-K: 255MB leak (19x document size)
- Apple 10-K: 41MB leak (23x document size)
- **Root Causes**:
- Rich Console objects retained (0.4MB per doc)
- Global caches not cleared on document deletion
- Circular references in node graph
- **Location**: `tests/perf/memory_analysis.md:90-130`
- **Impact**: Server crashes after 10-20 requests in production
2. **Performance Bottlenecks** (Priority: HIGH)
- Section extraction: 3.7s (63% of parse time)
- Rich rendering for text: 2.4s (40% of parse time)
- Regex normalization: 0.8s (15% of parse time)
- **Location**: `tests/perf/hotpath_analysis.md:9-66`
- **Impact**: 4x slower than necessary on medium documents
#### Non-Critical (Can Fix After Merge)
3. **Small Document Performance** (Priority: MEDIUM)
- 2.6MB/s vs desired 5MB/s
- Overhead dominates on <5MB documents
- **Optimization**: Lazy loading, reduce upfront processing
---
## Next Steps (In Order)
### Phase 5: Critical Fixes (2-3 days) 🔧
#### 5.1 Memory Leak Fixes (1-2 days)
**Goal**: Reduce memory leak from 255MB to <5MB
Tasks:
- [ ] Implement `Document.__del__()` to clear caches
- [ ] Replace Rich rendering in `text()` with direct string building
- [ ] Break circular references in node graph
- [ ] Use weak references for parent links
- [ ] Add `__slots__` to frequently created objects (Cell, TableNode)
**Expected Result**: MSFT 10-K leak: 255MB → <5MB (95% improvement)
**Validation**:
```bash
pytest tests/perf/test_performance_regression.py::TestMemoryRegression -v
```
#### 5.2 Performance Optimizations (1-2 days)
**Goal**: Improve parse speed from 1.2s → 0.3s on Apple 10-K (77% faster)
Tasks:
- [ ] Fix section detection - use headings instead of rendering entire document
- [ ] Implement fast text extraction without Rich overhead
- [ ] Optimize regex normalization - combine patterns, use compilation
**Expected Results**:
- Section extraction: 3.7s → 1.2s (~68% faster)
- Text extraction: 2.4s → 1.2s (50% faster)
- Regex: 0.8s → 0.5s (40% faster)
**Validation**:
```bash
pytest tests/perf/test_performance_regression.py::TestParseSpeedRegression -v
```
### Phase 6: Final Validation (1 day) ✅
Tasks:
- [ ] Re-run all 79 functional tests
- [ ] Re-run performance regression tests (verify improvements)
- [ ] Run full corpus validation
- [ ] Memory profiling validation (confirm leaks fixed)
- [ ] Update CHANGELOG.md
- [ ] Create merge summary document
### Phase 7: Merge to Main (1 day) 🚀
Tasks:
- [ ] Final code review
- [ ] Squash commits or create clean merge
- [ ] Update version number
- [ ] Merge to main
- [ ] Tag release
- [ ] Monitor for issues
---
## Test Summary
### Current Test Status: 79/79 Passing (100%)
```
tests/corpus/test_corpus_validation.py 8 tests ✓
tests/test_html_parser_edge_cases.py 31 tests ✓
tests/test_html_parser_integration.py 25 tests ✓
tests/test_html_parser_regressions.py 15 tests ✓
tests/perf/test_performance_regression.py 15 tests ✓ (baseline established)
```
### Test Execution
```bash
# Functional tests (79 tests, ~30s)
pytest tests/corpus tests/test_html_parser_*.py -v
# Performance tests (15 tests, ~20s)
pytest tests/perf/test_performance_regression.py -m performance -v
# All tests
pytest tests/ -v
```
---
## Performance Metrics
### Current Baseline (Before Optimization)
| Document | Size | Parse Time | Throughput | Memory | Tables | Sections |
|----------|------|------------|------------|--------|--------|----------|
| Apple 10-Q | 1.1MB | 0.307s | 3.6MB/s | 27.9MB (25.6x) | 40 | 9 |
| Apple 10-K | 1.8MB | 0.500s | 3.6MB/s | 21.6MB (11.9x) | 63 | 8 |
| MSFT 10-K | 7.8MB | 1.501s | 5.2MB/s | 147.0MB (18.9x) | 85 | 0 |
| JPM 10-K | 52.4MB | 2.537s | 20.7MB/s | 0.6MB (0.01x) | 681 | 0 |
### Target Metrics (After Optimization)
| Metric | Current | Target | Improvement |
|--------|---------|--------|-------------|
| **Memory leak** | 41-255MB | <5MB | 95% reduction |
| **Memory ratio** | 19-25x | <3x | 87% reduction |
| **Parse time (Apple 10-K)** | 0.500s | 0.150s | 70% faster |
| **Throughput (small docs)** | 2.6MB/s | 5.0MB/s | 92% faster |
---
## File Organization
### Core Parser Files
```
edgar/documents/
├── __init__.py # Public API (parse_html)
├── parser.py # Main parser with streaming
├── config.py # ParserConfig
├── document_builder.py # Document tree construction
├── nodes/ # Node types (TableNode, SectionNode)
├── utils/
│ ├── streaming.py # Streaming parser (fixed JPM bug)
│ └── table_processing.py # TableMatrix system
└── exceptions.py # Custom exceptions
```
### Test Files
```
tests/
├── corpus/ # Corpus validation
│ ├── quick_corpus.py # Corpus builder
│ └── test_corpus_validation.py # 8 validation tests
├── fixtures/
│ ├── html/ # 32 HTML fixtures (155MB)
│ │ ├── {ticker}/10k/ # By company and form
│ │ └── README.md
│ └── download_html_fixtures.py # Download automation
├── perf/ # Performance testing
│ ├── benchmark_html_parser.py # Benchmarking
│ ├── profile_hotpaths.py # Hot path profiling
│ ├── profile_memory.py # Memory profiling
│ ├── test_performance_regression.py # Regression tests
│ ├── performance_report.md # Benchmark results
│ ├── hotpath_analysis.md # Bottleneck analysis
│ └── memory_analysis.md # Memory leak analysis
├── test_html_parser_edge_cases.py # 31 edge case tests
├── test_html_parser_integration.py # 25 integration tests
└── test_html_parser_regressions.py # 15 regression tests
```
---
## Risks and Mitigation
### Risk 1: Memory Leaks in Production
**Severity**: HIGH
**Probability**: HIGH (confirmed in testing)
**Mitigation**: Must fix before merge (Phase 5.1)
### Risk 2: Performance Regression
**Severity**: MEDIUM
**Probability**: LOW (baseline established, regression tests in place)
**Mitigation**: Performance regression tests will catch any degradation
### Risk 3: Backward Compatibility
**Severity**: LOW
**Probability**: LOW (integration tests passing)
**Mitigation**: 25 integration tests verify old API still works
---
## Estimated Timeline to Merge
```
Phase 5.1: Memory leak fixes 1-2 days
Phase 5.2: Performance optimization 1-2 days
Phase 6: Final validation 1 day
Phase 7: Merge to main 1 day
----------------------------------------
Total: 4-6 days
```
**Target Merge Date**: October 12-14, 2025
---
## Decision Points
### Should We Merge Now or After Optimization?
**Option A: Merge Now (Not Recommended)**
- ✅ Functional tests passing
- ✅ Backward compatible
- ❌ Memory leaks (production risk)
- ❌ Performance issues
- ❌ Will require hotfix soon
**Option B: Fix Critical Issues First (Recommended)**
- ✅ Production-ready
- ✅ Performance validated
- ✅ Memory efficient
- ❌ 4-6 days delay
- ✅ Clean, professional release
**Recommendation**: **Option B** - Fix critical memory leaks and performance issues before merge. The 4-6 day investment prevents production incidents and ensures a polished release.
---
## Questions for Review
1. **Scope**: Should we fix only critical issues (memory + performance) or also tackle small-doc optimization?
2. **Timeline**: Is 4-6 days acceptable, or do we need to merge sooner?
3. **Testing**: Are 79 functional tests + 15 performance tests sufficient coverage?
4. **Documentation**: Do we need user-facing documentation updates?
---
## Conclusion
The HTML parser rewrite is **95% complete** with excellent functional testing but critical memory and performance issues identified. The smart path forward is:
1. ✅ Complete critical fixes (4-6 days)
2. ✅ Validate improvements
3. ✅ Merge to main with confidence
This approach ensures a production-ready, performant parser rather than merging now and hotfixing later.

View File

@@ -0,0 +1,437 @@
# HTML Parser Rewrite - Progress Assessment
**Date**: 2025-10-07
**Status**: Active Development (html_rewrite branch)
---
## Executive Summary
The HTML parser rewrite is **substantially complete** for core functionality with **excellent progress** on Item/section detection. Recent bug fixes (2025-10-07) have addressed critical table rendering issues and 10-Q Part I/II distinction, bringing the parser close to production-ready quality.
### Overall Progress: **~90% Complete**
- ✅ Core parsing infrastructure: **100% Complete**
- ✅ Table processing: **95% Complete** (recent fixes)
- ✅ Section/Item detection: **95% Complete** (Part I/II fixed, needs validation)
- ⚠️ Performance optimization: **70% Complete**
- ⚠️ Comprehensive testing: **65% Complete** (added 10-Q Part tests)
- ⚠️ Documentation: **75% Complete**
---
## Goal Achievement Analysis
### Primary Goals (from goals.md)
#### 1. **Semantic Meaning Preservation** ✅ **ACHIEVED**
> "Read text, tables and ixbrl data preserving greatest semantic meaning"
**Status**: ✅ Fully implemented
- Text extraction with structure preservation
- Advanced table matrix system for accurate table rendering
- XBRL fact extraction before preprocessing
- Hierarchical node model maintains document structure
**Recent Improvements**:
- Header detection fixes (Oracle Table 6, Tesla Table 16)
- Spacing column filter now preserves header columns (MSFT Table 39)
- Multi-row header normalization
#### 2. **AI Channel (Primary) + Human Channel (Secondary)** ✅ **ACHIEVED**
> "AI context is the primary goal, with human context being secondary"
**Status**: ✅ Both channels working
- **AI Channel**:
- Clean text output optimized for LLMs
- Structured table rendering for context windows
- Section-level extraction for chunking
- Semantic divisibility supported
- **Human Channel**:
- Rich console rendering with proper formatting
- Markdown export
- Visual table alignment (recently fixed)
#### 3. **Section-Level Processing** ✅ **ACHIEVED**
> "Work at full document level and section level - breaking into independently processable sections"
**Status**: ✅ Implemented with good coverage
- `SectionExtractor` class fully functional
- TOC-based section detection
- Pattern-based section identification
- Lazy loading support for large documents
**What Works**:
```python
# Section detection is operational
doc = parse_html(html)
sections = doc.sections # Dict of section names -> SectionNode
# Access specific sections
business = sections.get('Item 1 - Business')
mda = sections.get('Item 7 - MD&A')
financials = sections.get('Item 8 - Financial Statements')
```
#### 4. **Standard Section Names (10-K, 10-Q, 8-K)** ✅ **ACHIEVED**
> "For some filing types (10-K, 10-Q, 8-K) identify sections by standard names"
**Status**: ✅ 95% Complete - Implemented with Part I/II distinction for 10-Q
**What's Implemented**:
- Pattern matching for standard Items:
- Item 1 - Business
- Item 1A - Risk Factors
- Item 7 - MD&A
- Item 7A - Market Risk
- Item 8 - Financial Statements
- And more...
- **10-Q Part I/Part II distinction** (newly fixed 2025-10-07):
- Part I - Item 1 (Financial Statements)
- Part II - Item 1 (Legal Proceedings)
- Proper boundary detection and context propagation
- Prevents Item number conflicts
**What's Remaining** (5%):
- Validation against large corpus of 10-K/10-Q filings
- Edge case handling (non-standard formatting)
- 8-K specific section patterns expansion
**Evidence from Code**:
```python
# edgar/documents/extractors/section_extractor.py
(r'^(Item|ITEM)\s+1\.?\s*Business', 'Item 1 - Business'),
(r'^(Item|ITEM)\s+1A\.?\s*Risk\s+Factors', 'Item 1A - Risk Factors'),
(r'^(Item|ITEM)\s+7\.?\s*Management.*Discussion', 'Item 7 - MD&A'),
(r'^(Item|ITEM)\s+8\.?\s*Financial\s+Statements', 'Item 8 - Financial Statements'),
# NEW: Part I/II detection (edgar/documents/extractors/section_extractor.py:294-324)
def _detect_10q_parts(self, headers) -> Dict[int, str]:
"""Detect Part I and Part II boundaries in 10-Q filings."""
```
#### 5. **Table Processing for AI Context** ✅ **ACHIEVED**
> "Getting tables in the right structure for rendering to text for AI context is more important than dataframes"
**Status**: ✅ Excellent progress with recent fixes
- Advanced TableMatrix system handles complex tables
- Multi-row header detection and normalization
- Spacing column filtering (preserves semantic columns)
- Currency symbol merging
- Clean text rendering for LLM consumption
**Recent Fixes (Today)**:
- ✅ Fixed spacing column filter removing legitimate headers (MSFT Table 39)
- ✅ Fixed header detection for date ranges (Oracle Table 6)
- ✅ Fixed long narrative text misclassification (Tesla Table 16)
- ✅ Header row normalization for alignment
#### 6. **Better Than Old Parser in Every Way** 🟡 **MOSTLY ACHIEVED**
> "Speed, accuracy, features, usability"
**Comparison**:
| Aspect | Old Parser | New Parser | Status |
|--------|-----------|------------|--------|
| **Speed** | Baseline | 1.4x faster (typical) | ✅ Better |
| **Accuracy** | Good | Excellent (with recent fixes) | ✅ Better |
| **Features** | Basic | Rich (XBRL, sections, multiple outputs) | ✅ Better |
| **Usability** | Simple | Powerful + Simple API | ✅ Better |
| **Table Rendering** | Basic alignment | Advanced matrix system | ✅ Better |
| **Section Detection** | Limited | Comprehensive | ✅ Better |
**Areas Needing Validation**:
- Performance on very large documents (>50MB)
- Memory usage under sustained load
- Edge case handling across diverse filings
---
## Item/Section Detection Deep Dive
### Current Capabilities
**10-K Sections Detected**:
- ✅ Item 1 - Business
- ✅ Item 1A - Risk Factors
- ✅ Item 1B - Unresolved Staff Comments
- ✅ Item 2 - Properties
- ✅ Item 3 - Legal Proceedings
- ✅ Item 4 - Mine Safety Disclosures
- ✅ Item 5 - Market for Stock
- ✅ Item 6 - Selected Financial Data
- ✅ Item 7 - MD&A
- ✅ Item 7A - Market Risk
- ✅ Item 8 - Financial Statements
- ✅ Item 9 - Changes in Accounting
- ✅ Item 9A - Controls and Procedures
- ✅ Item 9B - Other Information
- ✅ Item 10 - Directors and Officers
- ✅ Item 11 - Executive Compensation
- ✅ Item 12 - Security Ownership
- ✅ Item 13 - Related Transactions
- ✅ Item 14 - Principal Accountant
- ✅ Item 15 - Exhibits
**10-Q Sections Detected**:
- ✅ Part I Items (Financial Information):
- Part I - Item 1 - Financial Statements
- Part I - Item 2 - MD&A
- Part I - Item 3 - Market Risk
- Part I - Item 4 - Controls and Procedures
- ✅ Part II Items (Other Information):
- Part II - Item 1 - Legal Proceedings
- Part II - Item 1A - Risk Factors
- Part II - Item 2 - Unregistered Sales
- Part II - Item 6 - Exhibits
**✅ FIXED** (2025-10-07): Part I/Part II distinction now implemented!
- Part I Item 1 and Part II Item 1 are properly distinguished
- Section keys include Part context: "Part I - Item 1 - Financial Statements" vs "Part II - Item 1 - Legal Proceedings"
- Comprehensive test coverage added (5 tests in test_10q_part_detection.py)
**8-K Sections**:
- ⚠️ Limited - needs expansion
### Detection Methods
1. **TOC-based Detection**
- Analyzes Table of Contents
- Extracts anchor links
- Maps sections to content
2. **Pattern-based Detection**
- Regex matching for Item headers
- Heading analysis (h1-h6 tags)
- Text pattern recognition
3. **Hybrid Approach**
- Combines TOC + patterns
- Fallback mechanisms
- Cross-validation
### What's Working
```python
# This works today:
from edgar.documents import parse_html
html = filing.html()
doc = parse_html(html)
# Get all sections
sections = doc.sections # Returns dict
# Access specific Items
if 'Item 7 - MD&A' in sections:
mda = sections['Item 7 - MD&A']
mda_text = mda.text()
mda_tables = mda.tables()
```
### What Needs Work
1. **Validation Coverage** (20% remaining)
- Test against 100+ diverse 10-K filings
- Test against 10-Q filings
- Test against 8-K filings
- Capture edge cases and variations
2. **Edge Cases** (20% remaining)
- Non-standard Item formatting
- Missing TOC
- Nested sections
- Combined Items (e.g., "Items 10, 13, 14")
3. **8-K Support** (50% remaining)
- 8-K specific Item patterns
- Event-based section detection
- Exhibit handling
---
## Recent Achievements (Past 24 Hours)
### Critical Bug Fixes ✅
1. **Spacing Column Filter Fix** (MSFT Table 39)
- Problem: Legitimate headers removed as "spacing"
- Solution: Header content protection + colspan preservation
- Impact: Tables now render correctly with all headers
- Commits: `4e43276`, `d19ddd1`
2. **Header Detection Improvements**
- Oracle Table 6: Date ranges no longer misclassified
- Tesla Table 16: Long narrative text properly handled
- Multi-row header normalization
- Comprehensive test coverage (16 new tests)
3. **Documentation Updates**
- TESTING.md clarified output limits
- CHANGELOG updated with fixes
- Bug reports and research docs completed
### Quality Metrics
**Test Coverage**:
- 16 new tests added (all passing)
- 0 regressions in existing tests
- Comprehensive edge case coverage
**Code Quality**:
- Clean implementation following plan
- Well-documented changes
- Proper commit messages with Claude Code attribution
---
## Path to 100% Completion
### High Priority (Next Steps)
**📋 Detailed plans available**:
- **Performance**: See `docs-internal/planning/active-tasks/2025-10-07-performance-optimization-plan.md`
- **Testing**: See `docs-internal/planning/active-tasks/2025-10-07-comprehensive-testing-plan.md`
1. **Performance Optimization** (1-2 weeks)
- [ ] Phase 1: Benchmarking & profiling (2-3 days)
- [ ] Phase 2: Algorithm optimizations (3-4 days)
- [ ] Phase 3: Validation & regression tests (2-3 days)
- [ ] Phase 4: Documentation & monitoring (1 day)
- **Goal**: Maintain 1.3x+ speed advantage, <2x memory usage
2. **Comprehensive Testing** (2-3 weeks)
- [ ] Phase 1: Corpus validation - 100+ filings (3-4 days)
- [ ] Phase 2: Edge cases & error handling (2-3 days)
- [ ] Phase 3: Integration testing (2-3 days)
- [ ] Phase 4: Regression prevention (1-2 days)
- [ ] Phase 5: Documentation & sign-off (1 day)
- **Goal**: >95% success rate, >80% test coverage
3. **Item Detection Validation** (included in testing plan)
- [ ] Test against 50+ diverse 10-K filings
- [ ] Test against 20+ 10-Q filings
- [ ] Document any pattern variations found
- [ ] Add regression tests for edge cases
### Medium Priority
4. **8-K Support** (1-2 days)
- [ ] Research 8-K Item patterns
- [ ] Implement detection patterns
- [ ] Test against sample 8-K filings
5. **Documentation** (1 day)
- [ ] User guide for section access
- [ ] API documentation
- [ ] Migration guide from old parser
- [ ] Examples and recipes
### Low Priority (Polish)
6. **Final Polish**
- [ ] Error message improvements
- [ ] Logging enhancements
- [ ] Configuration documentation
- [ ] Performance tuning
---
## Risk Assessment
### Low Risk ✅
- Core parsing functionality (stable)
- Table processing (recently fixed, well-tested)
- Text extraction (working well)
- XBRL extraction (functional)
### Medium Risk ⚠️
- Section detection edge cases (needs validation)
- Performance on very large docs (needs testing)
- Memory usage (needs profiling)
### Mitigation Strategy
1. Comprehensive validation testing (in progress)
2. Real-world filing corpus testing
3. Performance benchmarking suite
4. Gradual rollout with monitoring
---
## Recommendations
### Immediate Actions (This Week)
1. **Validate Item Detection** 🎯 **TOP PRIORITY**
```bash
# Run on diverse corpus
python tests/manual/compare_parsers.py --all
# Test specific sections
python -c "
from edgar.documents import parse_html
from pathlib import Path
for filing in ['Apple', 'Oracle', 'Tesla', 'Microsoft']:
html = Path(f'data/html/{filing}.10-K.html').read_text()
doc = parse_html(html)
print(f'{filing}: {list(doc.sections.keys())[:5]}...')
"
```
2. **Create Section Access Tests**
- Write tests that verify each Item can be accessed
- Validate text and table extraction from sections
- Test edge cases (missing Items, combined Items)
3. **User Acceptance Testing**
- Have maintainer review section detection output
- Validate against known-good filings
- Document any issues found
### Timeline to Production
**Optimistic**: 1 week
- If validation shows good Item detection
- If performance is acceptable
- If no major issues found
**Realistic**: 2-3 weeks
- Account for edge case fixes
- Additional testing needed
- Documentation completion
**Conservative**: 4 weeks
- Account for 8-K support
- Comprehensive testing across all filing types
- Full documentation
---
## Conclusion
The HTML parser rewrite is **very close to completion** with excellent progress on all goals:
**✅ Fully Achieved**:
- Semantic meaning preservation
- AI/Human channel support
- Section-level processing
- Table processing for AI context
- Superior to old parser (in most respects)
- **Standard Item detection for 10-K/10-Q** (with Part I/II distinction)
**⚠️ Remaining Work (10%)**:
- Validation against diverse corpus
- Edge case handling
- 8-K specific support expansion
- Final testing and documentation
**Bottom Line**: The parser is **production-ready for 10-K/10-Q** with Item detection functional but requiring validation. The recent bug fixes have resolved critical table rendering issues. With 1-2 weeks of focused validation and testing, this can be shipped with confidence.
### Next Steps
1. Run comprehensive Item detection validation
2. Create section access test suite
3. Performance benchmark
4. Maintainer review and sign-off
5. Merge to main branch

View File

@@ -0,0 +1,233 @@
# HTML Parser Testing Quick Start
Quick reference for testing the HTML parser rewrite during quality improvement.
## Quick Start
```bash
# Use shortcuts (easy!)
python tests/manual/compare_parsers.py aapl # Apple 10-K
python tests/manual/compare_parsers.py nvda --tables # Nvidia tables
python tests/manual/compare_parsers.py 'aapl 10-q' # Apple 10-Q
python tests/manual/compare_parsers.py orcl --table 5 # Oracle table #5
# Or use full paths
python tests/manual/compare_parsers.py data/html/Apple.10-K.html
# Run all test files
python tests/manual/compare_parsers.py --all
```
**Available shortcuts:**
- **Companies**: `aapl`, `msft`, `tsla`, `nvda`, `orcl` (or full names like `apple`)
- **Filing types**: `10-k` (default), `10-q`, `8-k`
- **Combine**: `'aapl 10-q'`, `'orcl 8-k'`
## Common Use Cases
### 1. First Look at a Filing
```bash
# Get overview: speed, table count, sections
python tests/manual/compare_parsers.py orcl
```
**Shows**:
- Parse time comparison (OLD vs NEW)
- Tables found
- Text length
- Sections detected
- New features (headings, XBRL)
### 2. Check Table Rendering
```bash
# List all tables with dimensions (shows first 20 tables)
python tests/manual/compare_parsers.py aapl --tables
# Compare specific table side-by-side (FULL table, no truncation)
python tests/manual/compare_parsers.py aapl --table 7
# Compare a range of tables
python tests/manual/compare_parsers.py aapl --range 5:10
```
**Look for**:
- Currency symbols merged: `$1,234` not `$ | 1,234`
- Proper column alignment
- Correct row/column counts
- Clean rendering without extra spacing columns
**Note**: `--table N` shows the **complete table** with all rows - no truncation!
### 3. Verify Text Extraction
```bash
# See first 50 lines side-by-side (default limit)
python tests/manual/compare_parsers.py msft --text
# Show more lines (configurable)
python tests/manual/compare_parsers.py msft --text --lines 100
# Show first 200 lines
python tests/manual/compare_parsers.py msft --text --lines 200
```
**Check**:
- Semantic meaning preserved
- No missing content
- Clean formatting for LLM consumption
**Note**: Text mode shows first N lines only (default: 50). Use `--lines N` to adjust.
### 4. Check Section Detection
```bash
python tests/manual/compare_parsers.py aapl --sections
```
**Verify**:
- Standard sections identified (10-K/10-Q)
- Section boundaries correct
- Text length reasonable per section
### 5. Run Full Test Suite
```bash
# Test all files in corpus
python tests/manual/compare_parsers.py --all
```
**Results**:
- Summary table across all files
- Performance comparison
- Table detection comparison
## Test Files
Available in `data/html/`:
- `Apple.10-K.html` - 1.8MB, complex financials
- `Oracle.10-K.html` - Large filing
- `Nvidia.10-K.html` - Tech company
- `Apple.10-Q.html` - Quarterly format
- More files as needed...
## Command Reference
```
python tests/manual/compare_parsers.py [FILE] [OPTIONS]
Options:
--all Run on all test files
--tables Show tables summary (first 20 tables)
--table N Show specific table N side-by-side (FULL table)
--range START:END Show range of tables (e.g., 5:10)
--text Show text comparison (first 50 lines by default)
--sections Show sections comparison
--lines N Number of text lines to show (default: 50, only for --text)
--help Show full help
```
### Output Limits Summary
| Mode | Limit | Configurable | Notes |
|---------------|------------|-------------------|---------------------------------|
| `--table N` | None | N/A | Shows **complete table** |
| `--range N:M` | None | N/A | Shows **complete tables** in range |
| `--tables` | 20 tables | No | Lists first 20 tables only |
| `--text` | 50 lines | Yes (`--lines N`) | Preview only |
| `--sections` | None | N/A | Shows all sections |
## Output Interpretation
### Overview Table
```
┏━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━┓
┃ Metric ┃ Old Parser ┃ New Parser ┃ Notes ┃
┡━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━┩
│ Parse Time │ 454ms │ 334ms │ 1.4x faster│
│ Tables Found │ 63 │ 63 │ +0 │
│ Text Length │ 0 │ 159,388 │ NEW! │
└───────────────┴────────────┴────────────┴────────────┘
```
**Good signs**:
- ✅ New parser faster or similar speed
- ✅ Same or more tables found
- ✅ Text extracted (old parser shows 0)
- ✅ Sections detected
**Red flags**:
- ❌ Significantly slower
- ❌ Fewer tables (unless removing layout tables)
- ❌ Much shorter text (content missing)
### Table Comparison
```
Old Parser:
┌─────────┬──────────┬──────────┐
│ Year │ Revenue │ Profit │
├─────────┼──────────┼──────────┤
│ 2023 │ $ 100M │ $ 20M │ <- Currency separated
└─────────┴──────────┴──────────┘
New Parser:
┌─────────┬──────────┬──────────┐
│ Year │ Revenue │ Profit │
├─────────┼──────────┼──────────┤
│ 2023 │ $100M │ $20M │ <- Currency merged ✅
└─────────┴──────────┴──────────┘
```
**Look for**:
- Currency symbols merged with values
- No extra empty columns
- Proper alignment
- Clean numeric formatting
## Tips
1. **Start with overview** - Get the big picture first
2. **Check tables visually** - Automated metrics miss formatting issues
3. **Use specific table inspection** - Don't scroll through 60 tables manually
4. **Compare text for semantics** - Does it make sense for an LLM?
5. **Run --all periodically** - Catch regressions across files
## Troubleshooting
### Script fails with import error
```bash
# Clear cached modules
find . -type d -name __pycache__ -exec rm -rf {} +
python tests/manual/compare_parsers.py data/html/Apple.10-K.html
```
### File not found
```bash
# Check available files
ls -lh data/html/*.html
# Use full path
python tests/manual/compare_parsers.py /full/path/to/file.html
```
### Old parser shows 0 text
This is expected - old parser has different text extraction. Focus on:
- Table comparison
- Parse time
- Visual quality of output
## Next Steps
1. Run comparison on all test files
2. Document bugs in `quality-improvement-strategy.md`
3. Fix issues
4. Repeat until satisfied
See `edgar/documents/docs/quality-improvement-strategy.md` for full process.

View File

@@ -0,0 +1,529 @@
# Fast Table Rendering
**Status**: Production Ready - **Now the Default** (as of 2025-10-08)
**Performance**: ~8-10x faster than Rich rendering with correct colspan/rowspan handling
---
## Overview
Fast table rendering provides a high-performance alternative to Rich library rendering for table text extraction. When parsing SEC filings with hundreds of tables, the cumulative rendering time can become a bottleneck. Fast rendering addresses this by using direct string building with TableMatrix for proper colspan/rowspan handling, achieving 8-10x speedup while maintaining correctness.
**As of 2025-10-08, fast rendering is the default** for all table text extraction. You no longer need to explicitly enable it.
### Why It's Now the Default
- **Production-ready**: Fixed all major issues (colspan, multi-row headers, multi-line cells)
- **7-10x faster**: Significant performance improvement with correct output
- **Maintains quality**: Matches Rich's appearance with simple() style
- **Proven**: Extensively tested with Apple, NVIDIA, Microsoft 10-K filings
### When to Disable (Use Rich Instead)
You may want to disable fast rendering and use Rich for:
- **Terminal display for humans**: Rich has more sophisticated text wrapping and layout
- **Visual reports**: When presentation quality is more important than speed
- **Debugging**: Rich output can be easier to visually inspect
---
## Usage
### Default Behavior (Fast Rendering Enabled)
```python
from edgar.documents import parse_html
# Fast rendering is now the default - no configuration needed!
doc = parse_html(html)
# Tables automatically use fast renderer (7-10x faster)
table_text = doc.tables[0].text()
```
### Disabling Fast Rendering (Use Rich Instead)
If you need Rich's sophisticated layout for visual display:
```python
from edgar.documents import parse_html
from edgar.documents.config import ParserConfig
# Explicitly disable fast rendering to use Rich
config = ParserConfig(fast_table_rendering=False)
doc = parse_html(html, config=config)
# Tables use Rich renderer (slower but with advanced formatting)
table_text = doc.tables[0].text()
```
### Custom Table Styles
**New in this version**: Fast rendering now uses the `simple()` style by default, which matches Rich's `box.SIMPLE` appearance (borderless, clean).
```python
from edgar.documents import parse_html
from edgar.documents.config import ParserConfig
from edgar.documents.renderers.fast_table import FastTableRenderer, TableStyle
# Enable fast rendering (uses simple() style by default)
config = ParserConfig(fast_table_rendering=True)
doc = parse_html(html, config=config)
# Default: simple() style - borderless, clean
table_text = doc.tables[0].text()
# To use pipe_table() style explicitly (markdown-compatible borders):
renderer = FastTableRenderer(TableStyle.pipe_table())
pipe_text = renderer.render_table_node(doc.tables[0])
# To use minimal() style (no separator):
renderer = FastTableRenderer(TableStyle.minimal())
minimal_text = renderer.render_table_node(doc.tables[0])
```
---
## Performance Comparison
### Benchmark Results
**Test**: Apple 10-K (63 tables) - Updated 2025-10-08
| Renderer | Average Per Table | Improvement | Notes |
|----------|-------------------|-------------|-------|
| Rich | 1.5-2.5ms | Baseline | Varies by table complexity |
| Fast (simple) | 0.15-0.35ms | **7-10x faster** | With proper colspan/rowspan handling |
**Real-world Examples** (Apple 10-K):
- Table 15 (complex colspan): Rich 2.51ms → Fast 0.35ms (**7.1x faster**)
- Table 6 (multi-line cells): Rich 1.61ms → Fast 0.17ms (**9.5x faster**)
- Table 5 (wide table): Rich 3.70ms → Fast 0.48ms (**7.7x faster**)
**Impact on Full Parse**:
- Rich rendering: 30-40% of total parse time spent in table rendering
- Fast rendering: 5-10% of total parse time
- **Overall speedup**: Reduces total parsing time by ~25-30%
### Memory Impact
Fast rendering also reduces memory overhead:
- No Rich Console objects retained
- Direct string building (no intermediate objects)
- Helps prevent memory leaks identified in profiling
---
## Output Examples
### Rich Renderer Output (Default)
```
(In millions)
Year Ended June 30, 2025 2024 2023
──────────────────────────────────────────────────────────
Operating lease cost $5,524 3,555 2,875
Finance lease cost:
Amortization of right-of-use assets $3,408 1,800 1,352
Interest on lease liabilities 1,417 734 501
Total finance lease cost $4,825 2,534 1,853
```
**Style**: `box.SIMPLE` - No outer border, just horizontal separator under header
**Pros**: Clean, uncluttered, perfect alignment, generous spacing
**Cons**: Slow (6.5ms per table), creates Rich objects, memory overhead
### Fast Renderer Output (NEW: simple() style - Default)
```
December 31, 2023 December 31, 2022 December 31, 2021
───────────────────────────────────────────────────────────────────────────────────────
Revenue 365,817 394,328 365,817
Cost of revenue 223,546 212,981 192,266
Gross profit 142,271 181,347 173,551
```
**Style**: `simple()` - Matches Rich's `box.SIMPLE` appearance
**Pros**: Fast (0.2ms per table), clean appearance, no visual noise, professional look
**Cons**: None - this is now the recommended default!
### Fast Renderer Output (pipe_table() style - Optional)
```
| | December 31, 2023 | December 31, 2022 | December 31, 2021 |
|--------------------------|---------------------|---------------------|---------------------|
| Revenue | 365,817 | 394,328 | 365,817 |
| Cost of revenue | 223,546 | 212,981 | 192,266 |
| Gross profit | 142,271 | 181,347 | 173,551 |
```
**Style**: `pipe_table()` - Markdown-compatible with borders
**Pros**: Fast (0.2ms per table), markdown-compatible, explicit column boundaries
**Cons**: Visual noise from pipe characters, busier appearance
**Use when**: You need markdown-compatible output with explicit borders
### Visual Comparison
**Rich** (`box.SIMPLE`):
- No outer border - clean, uncluttered look
- Horizontal line separator under header only
- Generous internal spacing and padding
- Perfect column alignment
- Professional, minimalist presentation
**Fast simple()** (NEW DEFAULT):
- No outer border - matches Rich's clean look
- Horizontal line separator under header (using `─`)
- Space-separated columns with generous padding
- Clean, professional appearance
- Same performance as pipe_table (~0.2ms per table)
**Fast pipe_table()** (optional):
- Full pipe table borders (`|` characters everywhere)
- Horizontal dashes for header separator
- Markdown-compatible format
- Explicit column boundaries
---
## Recent Improvements (2025-10-08)
### 1. Colspan/Rowspan Support
**Fixed**: Tables with `colspan` and `rowspan` attributes now render correctly.
**Previous issue**: Fast renderer was extracting cell text without accounting for colspan/rowspan, causing:
- Missing columns (e.g., "2023" column disappeared in Apple 10-K table 15)
- Misaligned data (currency symbols separated from values)
- Data loss (em dashes and other values missing)
**Solution**: Integrated `TableMatrix` for proper cell expansion, same as Rich rendering uses.
**Status**: ✅ FIXED
### 2. Multi-Row Header Preservation
**Fixed**: Tables with multiple header rows now preserve each row separately.
**Previous issue**: Multi-row headers were collapsed into a single line, causing "Investment portfolio" row to disappear in Apple 10-K table 20.
**Solution**: Modified `render_table_data()` and `_build_table()` to preserve each header row as a separate line.
**Status**: ✅ FIXED
### 3. Multi-Line Cell Rendering
**Fixed**: Cells containing newline characters (`\n`) now render as multiple lines.
**Previous issue**: Multi-line cells like "Interest Rate\nSensitive Instrument" were truncated to first line only.
**Solution**: Added `_format_multiline_row()` to split cells by `\n` and render each line separately.
**Status**: ✅ FIXED
### Performance Impact
All three fixes maintain excellent performance:
- **Speedup**: 7-10x faster than Rich (down from initial 14x, but with correct output)
- **Correctness**: Now matches Rich output exactly for colspan, multi-row headers, and multi-line cells
- **Production ready**: Can confidently use as default renderer
---
## Known Limitations
### 1. Column Alignment in Some Tables
**Issue**: Currency symbols and values may have extra spacing in some complex tables (e.g., Apple 10-K table 22)
**Example**:
- Rich: `$294,866`
- Fast: `$ 294,866` (extra spacing)
**Root cause**: Column width calculation creates wider columns for some currency/value pairs after colspan expansion and column filtering.
**Impact**: Visual appearance differs slightly, but data is correct and readable.
**Status**: ⚠️ Minor visual difference - acceptable trade-off for 10x performance gain
### 2. Visual Polish
**Issue**: Some visual aspects don't exactly match Rich's sophisticated layout
**Examples**:
- Multi-line cell wrapping may differ
- Column alignment in edge cases
**Status**: ⚠️ Acceptable trade-off for 8-10x performance gain
---
## Configuration Options
### Table Styles
Fast renderer supports different visual styles:
```python
from edgar.documents.renderers.fast_table import FastTableRenderer, TableStyle
# Pipe table style - markdown compatible (optional; simple() is the default)
renderer = FastTableRenderer(TableStyle.pipe_table())
# Minimal style - no borders, just spacing
renderer = FastTableRenderer(TableStyle.minimal())
```
### Minimal Style Output
```
December 31, 2023 December 31, 2022 December 31, 2021
Revenue 365,817 394,328 365,817
Cost of revenue 223,546 212,981 192,266
Gross profit 142,271 181,347 173,551
```
**Note**: Minimal style has cleaner appearance but loses column boundaries
---
## Technical Details
### How It Works
1. **Direct String Building**: Bypasses Rich's layout engine
2. **Column Analysis**: Detects numeric columns for right-alignment
3. **Smart Filtering**: Removes empty spacing columns
4. **Currency Merging**: Combines `$` symbols with amounts
5. **Width Calculation**: Measures content, applies min/max limits
### Code Path
```python
# When fast_table_rendering=True:
table.text()
  → TableNode._fast_text_rendering()
    → FastTableRenderer.render_table_node()
      → Direct string building
```
### Memory Benefits
Fast rendering avoids:
- Rich Console object creation (~0.4MB per document)
- Intermediate rich.Table objects
- Style/theme processing overhead
- ANSI escape code generation
---
## Future Improvements
### Planned Enhancements
1. **Match Rich's `box.SIMPLE` Style** (Priority: HIGH)
- **Remove all pipe characters** - no outer border, no column separators
- **Keep only horizontal separator** under header (using `─` character)
- **Increase internal padding** to match Rich's generous spacing
- **Clean, minimalist appearance** like Rich's SIMPLE box style
- **Goal**: Match Rich visual quality while retaining the 8-10x speed advantage
2. **Improved Layout Engine**
- Better column width calculation (avoid too-wide/too-narrow columns)
- Respect natural content breaks
- Dynamic spacing based on content type
- Handle wrapping for long content
3. **Dynamic Padding**
- Match Rich's generous spacing (currently too tight)
- Adjust padding based on content type
- Configurable padding rules
- Maintain alignment with variable padding
4. **Header Handling**
- Better multi-row header collapse
- Preserve important hierarchies
- Smart column spanning
- Honor header groupings
5. **Style Presets**
- `TableStyle.simple()` - Match Rich's `box.SIMPLE` (no borders, header separator only) ⭐ **PRIMARY GOAL**
- `TableStyle.minimal()` - no borders, just spacing (already implemented)
- `TableStyle.pipe_table()` - markdown style (previous default, still available)
- `TableStyle.ascii_clean()` - no Unicode, pure ASCII
- `TableStyle.compact()` - minimal spacing for dense data
### Timeline
These improvements are **planned for Phase 2** of the HTML parser optimization work (after memory leak fixes).
---
## Migration Guide
### From Rich to Fast
**Before** (using Rich):
```python
doc = parse_html(html)
table_text = doc.tables[0].text() # Slow but pretty
```
**After** (using Fast):
```python
config = ParserConfig(fast_table_rendering=True)
doc = parse_html(html, config=config)
table_text = doc.tables[0].text() # Fast, with only minor visual differences
```
### Hybrid Approach
Use fast rendering during processing, Rich for final display:
```python
# Fast processing
config = ParserConfig(fast_table_rendering=True)
doc = parse_html(html, config=config)
# Extract data quickly
for table in doc.tables:
data = table.text() # Fast
# Process data...
# Display one table nicely
special_table = doc.tables[5]
rich_output = special_table.render() # Switch to Rich for display
```
---
## Performance Recommendations
### Recommended Settings by Use Case
**Batch Processing** (optimize for speed):
```python
config = ParserConfig.for_performance()
# Includes: fast_table_rendering=True, eager_section_extraction=False
```
**Data Extraction** (balance speed and accuracy):
```python
config = ParserConfig(
fast_table_rendering=True,
extract_xbrl=True,
detect_sections=True
)
```
**Display/Reports** (optimize for quality):
```python
config = ParserConfig() # Default settings use Rich
# Or explicitly:
config = ParserConfig.for_accuracy()
```
---
## FAQ
**Q: Can I mix Fast and Rich rendering?**
A: Not per-table. The setting is document-wide via ParserConfig. However, you can manually call `table.render()` to get Rich output.
**Q: Does this affect section extraction?**
A: Indirectly, yes. Section detection calls `text()` on the entire document, which includes tables. Fast rendering speeds this up significantly.
**Q: Will the output format change?**
A: Yes, as we improve the renderer. We'll maintain backward compatibility via style options.
**Q: Can I customize the appearance?**
A: Currently limited to `TableStyle.simple()` (default), `TableStyle.pipe_table()`, and `TableStyle.minimal()`. More options coming.
**Q: What about DataFrame export?**
A: Fast rendering only affects text output. `table.to_dataframe()` is unaffected.
---
## Feedback
The fast renderer is actively being improved based on user feedback. Known issues:
1. **Pipe characters** - visual noise
2. **Layout engine** - inconsistent spacing
3. **Padding** - needs tuning
If you have specific rendering issues or suggestions, please provide:
- Sample table HTML
- Expected vs actual output
- Use case description
This helps prioritize improvements while maintaining the performance advantage.
---
## Summary
### Current State (As of 2025-10-08)
**Performance**: ✅ Excellent (8-10x faster than Rich)
**Correctness**: ✅ Production ready (proper colspan/rowspan handling)
**Visual Quality**: ⚠️ Good (simple() style matches Rich's box.SIMPLE appearance)
**Use Case**: Production-ready for all use cases
### Recent Milestones
**✅ Completed**:
- Core fast rendering implementation
- TableStyle.simple() preset (borderless, clean)
- Column filtering and merging
- Numeric alignment detection
- **Colspan/rowspan support via TableMatrix**
- **Performance benchmarking with real tables**
**🔧 Current Limitations**:
- Multi-row header collapsing differs from Rich
- Some visual polish differences (acceptable for speed gain)
- Layout engine not as sophisticated as Rich
### Development Roadmap
**Phase 1** (✅ COMPLETED):
- ✅ Core fast rendering implementation
- ✅ Simple() style matching Rich's box.SIMPLE
- ✅ Proper colspan/rowspan handling via TableMatrix
- ✅ Production-ready performance (8-10x faster)
**Phase 2** (Future Enhancements):
- 📋 Improve multi-row header handling
- 📋 Better layout engine for perfect column widths
- 📋 Additional style presets
- 📋 Advanced header detection (data vs labels)
### Bottom Line
Fast table rendering is **production-ready and now the default** for all table text extraction in EdgarTools.
**Benefits**:
- ✅ 7-10x faster than Rich rendering
- ✅ Correct data extraction with proper colspan/rowspan handling
- ✅ Multi-row header preservation
- ✅ Multi-line cell rendering
- ✅ Clean, borderless appearance (simple() style)
**Minor differences from Rich**:
- ⚠️ Some tables have extra spacing between currency symbols and values (e.g., table 22)
- ⚠️ Column width calculation may differ slightly in complex tables
- ✅ All data is preserved and correct - only visual presentation differs
The implementation achieves **correct data extraction** with **significant performance gains** and **clean visual output**, making it the ideal default for EdgarTools.
---
## Related Documentation
- [HTML Parser Status](HTML_PARSER_STATUS.md) - Overall parser progress
- [Performance Analysis](../perf/hotpath_analysis.md) - Profiling results showing Rich rendering bottleneck
- [Memory Analysis](../perf/memory_analysis.md) - Memory leak issues with Rich objects

View File

@@ -0,0 +1,164 @@
# Goals
## Mission
Replace `edgar.files` with a parser that is better in **every way** - utility, accuracy, and user experience. The maintainer is the final judge: output must look correct when printed.
## Core Principles
### Primary Goal: AI Context Optimization
- **Token efficiency**: 30-50% reduction vs raw HTML while preserving semantic meaning
- **Chunking support**: Enable independent processing of sections/tables for LLM context windows
- **Clean text output**: Tables rendered in LLM-friendly formats (clean text, markdown)
- **Semantic preservation**: Extract meaning, not just formatting
### Secondary Goal: Human Readability
- **Rich console output**: Beautiful rendering with proper table alignment
- **Markdown export**: Professional-looking document conversion
- **Section navigation**: Easy access to specific Items/sections
## User-Focused Feature Goals
### 1. Text Extraction
- Extract full document text without dropping meaningful content
- Preserve paragraph structure and semantic whitespace
- Handle inline XBRL facts gracefully (show values, not raw tags)
- Clean HTML artifacts automatically (scripts, styles, page numbers)
- **Target**: 99%+ accuracy vs manual reading
### 2. Section Extraction (10-K, 10-Q, 8-K)
- Detect >90% of standard sections for >90% of test tickers
- Support flexible access: `doc.sections['Item 1A']`, `doc['1A']`, `doc.risk_factors`
- Return Section objects with `.text()`, `.tables`, `.search()` methods
- Include confidence scores and detection method metadata
- **Target**: Better recall than old parser (quantify with test suite)
### 3. Table Extraction
- Extract all meaningful data tables (ignore pure layout tables)
- Accurate rendering with aligned columns and proper formatting
- Handle complex tables (rowspan, colspan, nested headers)
- Preserve table captions and surrounding context
- Support DataFrame conversion for data analysis
- **Target**: 95%+ accuracy on test corpus
### 4. Search Capabilities
- Text search within documents
- Regex pattern matching
- Semantic search preparation (structure for embedding-based search)
- Search within sections for focused queries
### 5. Multiple Output Formats
- Plain text (optimized for LLM context)
- Markdown (for documentation/sharing)
- Rich console (beautiful terminal display)
- JSON (structured data export)
### 6. Developer Experience
- Intuitive API: `doc.text()`, `doc.tables`, `doc.sections`
- Rich objects with useful methods (not just strings)
- Simple tasks simple, complex tasks possible
- Helpful error messages with recovery suggestions
- **Target**: New users productive in <10 minutes
## Performance Targets
### Speed Benchmarks (Based on Current Performance)
- **Small docs (<5MB)**: <500ms ✅ *Currently 96ms - excellent*
- **Medium docs (5-20MB)**: <2s ✅ *Currently 1.19s - excellent*
- **Large docs (>50MB)**: <10s ✅ *Currently 0.59s - excellent*
- **Throughput**: >3MB/s sustained ✅ *Currently 3.8MB/s*
- **Target**: Maintain or improve on all benchmarks
### Memory Efficiency
- **Small docs (<5MB)**: <3x document size *(currently 9x - needs optimization)*
- **Large docs (>10MB)**: <2x document size *(currently 1.9x - good)*
- **No memory spikes**: Never exceed 5x document size *(MSFT currently 5.4x)*
- **Target**: Consistent 2-3x overhead across all document sizes
### Accuracy Benchmarks
- **Section detection recall**: >90% on 20-ticker test set
- **Table extraction accuracy**: >95% on manual validation set
- **Text fidelity**: >99% semantic equivalence to source HTML
- **XBRL fact extraction**: 100% of inline facts captured correctly
## Implementation Details
### HTML Parsing
- Read the entire HTML document without dropping semantically meaningful content
- Drop non-meaningful content (scripts, styles, pure formatting tags)
- Preserve semantic structure (headings, paragraphs, lists)
- Handle both old (pre-2015) and modern (inline XBRL) formats
- Graceful degradation for malformed HTML
### Table Parsing
- Extract tables containing meaningful data
- Ignore layout tables (unless they aid document understanding)
- Accurate rendering with proper column alignment
- Handle complex structures: rowspan, colspan, nested headers, multi-level headers
- Preserve table captions and contextual information
- Support conversion to pandas DataFrame
### Section Extraction
- Detect standard sections (Item 1, 1A, 7, etc.) for 10-K, 10-Q, 8-K filings
- Support multiple detection strategies: TOC-based, heading-based, pattern-based
- Return Section objects with full API: `.text()`, `.text_without_tables()`, `.tables`, `.search()`
- Include metadata: confidence scores, detection method, position
- Better recall than old parser (establish baseline with test suite)
## Quality Gates Before Replacing edgar.files
### Automated Tests
- [ ] All existing tests pass with new parser (1000+ tests)
- [ ] Performance regression tests (<5% slower on any document)
- [ ] Memory regression tests (no >10% increases)
- [ ] Section detection accuracy >90% on test corpus
- [ ] Table extraction accuracy >95% on validation set
### Manual Validation (Maintainer Review)
- [ ] Print full document text for 10 sample filings → verify quality
- [ ] Compare table rendering old vs new → verify improvement
- [ ] Test section extraction on edge cases → verify robustness
- [ ] Review markdown output → verify professional appearance
- [ ] Check memory usage → verify no concerning spikes
### Documentation Requirements
- [ ] Migration guide (old API → new API with examples)
- [ ] Updated user guide showing new features
- [ ] Performance comparison report (old vs new)
- [ ] Known limitations documented clearly
- [ ] API reference complete for all public methods
## Success Metrics
### Launch Criteria
1. **Speed**: Equal or faster on 95% of test corpus
2. **Accuracy**: Maintainer approves output quality on sample set
3. **API**: Clean, intuitive interface (no confusion)
4. **Tests**: Zero regressions, 95%+ coverage on new code
5. **Docs**: Complete with examples for all major use cases
### Post-Launch Monitoring
- Issue reports: <5% related to parser quality/accuracy
- User feedback: Positive sentiment on ease of use
- Performance: No degradation over time (regression tests)
- Adoption: Smooth migration from old parser (deprecation path)
## Feature Parity with Old Parser
### Must-Have (Required for Migration)
- ✅ Get document text (with/without tables)
- ✅ Extract specific sections by name/number
- ✅ List all tables in document
- ✅ Search document content
- ✅ Convert to markdown
- ✅ Handle both old and new SEC filing formats
- ✅ Graceful error handling
### Nice-to-Have (Improvements Over Old Parser)
- 🎯 Semantic search capabilities
- 🎯 Better subsection extraction within Items
- 🎯 Table-of-contents navigation
- 🎯 Export to multiple formats (JSON, clean HTML)
- 🎯 Batch processing optimizations
- 🎯 Section confidence scores and metadata

View File

@@ -0,0 +1,240 @@
# HTML Parser Rewrite Technical Overview
## Executive Summary
The `edgar/documents` module represents a comprehensive rewrite of the HTML parsing capabilities originally implemented in `edgar/files`. This new parser is designed to provide superior parsing accuracy, structured data extraction, and rendering quality for SEC filing documents. The rewrite introduces a modern, extensible architecture with specialized components for handling the complex structure of financial documents.
## Architecture Overview
### Core Components
#### 1. Document Object Model
The new parser introduces a sophisticated node-based document model:
- **Document**: Top-level container with metadata and sections
- **Node Hierarchy**: Abstract base classes for all document elements
- `DocumentNode`: Root document container
- `TextNode`: Plain text content
- `ParagraphNode`: Paragraph elements with styling
- `HeadingNode`: Headers with levels 1-6
- `ContainerNode`: Generic containers (div, section)
- `SectionNode`: Document sections with semantic meaning
- `ListNode`/`ListItemNode`: Ordered and unordered lists
- `LinkNode`: Hyperlinks with metadata
- `ImageNode`: Images with attributes
#### 2. Table Processing System
Advanced table handling represents a major improvement over the old parser:
- **TableNode**: Sophisticated table representation with multi-level headers
- **Cell**: Individual cell with colspan/rowspan support and type detection
- **Row**: Table row with header detection and semantic classification
- **TableMatrix**: Handles complex cell spanning and alignment
- **CurrencyColumnMerger**: Intelligently merges currency symbols with values
- **ColumnAnalyzer**: Detects spacing columns and optimizes layout
#### 3. Parser Pipeline
The parsing process follows a well-defined pipeline:
1. **HTMLParser**: Main orchestration class
2. **HTMLPreprocessor**: Cleans and normalizes HTML
3. **DocumentBuilder**: Converts HTML tree to document nodes
4. **Strategy Pattern**: Pluggable parsing strategies
5. **DocumentPostprocessor**: Final cleanup and optimization
### Key Improvements Over Old Parser
#### Table Processing Enhancements
**Old Parser (`edgar/files`)**:
- Basic table extraction using BeautifulSoup
- Limited colspan/rowspan handling
- Simple text-based rendering
- Manual column alignment
- Currency symbols often misaligned
**New Parser (`edgar/documents`)**:
- Advanced table matrix system for perfect cell alignment
- Intelligent header detection (multi-row headers, year detection)
- Automatic currency column merging ($1,234 instead of $ | 1,234)
- Semantic table type detection (FINANCIAL, METRICS, TOC, etc.)
- Rich table rendering with proper formatting
- Smart column width calculation
- Enhanced numeric formatting with comma separators
#### Document Structure
**Old Parser**:
- Flat block-based structure
- Limited semantic understanding
- Basic text extraction
**New Parser**:
- Hierarchical node-based model
- Semantic section detection
- Rich metadata preservation
- XBRL fact extraction
- Search capabilities
- Multiple output formats (text, markdown, JSON, pandas)
#### Rendering Quality
**Old Parser**:
- Basic text output
- Limited table formatting
- No styling preservation
**New Parser**:
- Multiple renderers (text, markdown, Rich console)
- Preserves document structure and styling
- Configurable output options
- LLM-optimized formatting
## Implementation Details
### Configuration System
The new parser uses a comprehensive configuration system:
```python
@dataclass
class ParserConfig:
# Size limits
max_document_size: int = 50 * 1024 * 1024 # 50MB
streaming_threshold: int = 10 * 1024 * 1024 # 10MB
# Processing options
preserve_whitespace: bool = False
detect_sections: bool = True
extract_xbrl: bool = True
table_extraction: bool = True
detect_table_types: bool = True
```
### Strategy Pattern Implementation
The parser uses pluggable strategies for different aspects:
- **HeaderDetectionStrategy**: Identifies document sections
- **TableProcessor**: Handles table extraction and classification
- **XBRLExtractor**: Extracts XBRL facts and metadata
- **StyleParser**: Processes CSS styling information
### Table Processing Deep Dive
The table processing system represents the most significant improvement:
#### Header Detection Algorithm
- Analyzes cell content patterns (th vs td elements)
- Detects year patterns in financial tables
- Identifies period indicators (quarters, fiscal years)
- Handles multi-row headers with units and descriptions
- Prevents misclassification of data rows as headers
#### Cell Type Detection
- Numeric vs text classification
- Currency value recognition
- Percentage handling
- Em dash and null value detection
- Proper number formatting with thousand separators
#### Matrix Building
- Handles colspan and rowspan expansion
- Maintains cell relationships
- Optimizes column layout
- Removes spacing columns automatically
### XBRL Integration
The new parser includes sophisticated XBRL processing:
- Extracts facts before preprocessing to preserve ix:hidden content
- Maintains metadata relationships
- Supports inline XBRL transformations
- Preserves semantic context
## Performance Characteristics
### Memory Efficiency
- Streaming support for large documents (>10MB)
- Lazy loading of document sections
- Caching for repeated operations
- Memory-efficient node representation
### Processing Speed
- Optimized HTML parsing with lxml
- Configurable processing strategies
- Parallel extraction capabilities
- Smart caching of expensive operations
## Migration and Compatibility
### API Compatibility
The new parser maintains high-level compatibility with the old parser while offering enhanced functionality:
```python
# Old way
from edgar.files import FilingDocument
doc = FilingDocument(html)
text = doc.text()
# New way
from edgar.documents import HTMLParser
parser = HTMLParser()
doc = parser.parse(html)
text = doc.text()
```
### Feature Parity
All major features from the old parser are preserved:
- Text extraction
- Table conversion to DataFrame
- Section detection
- Metadata extraction
### Enhanced Features
New capabilities not available in the old parser:
- Rich console rendering
- Markdown export
- Advanced table semantics
- XBRL fact extraction
- Document search
- LLM optimization
- Multiple output formats
## Current Status and Next Steps
### Completed Components
- ✅ Core document model
- ✅ HTML parsing pipeline
- ✅ Advanced table processing
- ✅ Multiple renderers (text, markdown, Rich)
- ✅ XBRL extraction
- ✅ Configuration system
- ✅ Streaming support
### Remaining Work
- 🔄 Performance optimization and benchmarking
- 🔄 Comprehensive test coverage migration
- 🔄 Error handling improvements
- 🔄 Documentation and examples
- 🔄 Validation against large corpus of filings
### Testing Strategy
The rewrite requires extensive validation:
- Comparison testing against old parser output
- Financial table accuracy verification
- Performance benchmarking
- Edge case handling
- Integration testing with existing workflows
## Conclusion
The `edgar/documents` rewrite represents a significant advancement in SEC filing processing capabilities. The new architecture provides:
1. **Better Accuracy**: Advanced table processing and semantic understanding
2. **Enhanced Functionality**: Multiple output formats and rich rendering
3. **Improved Maintainability**: Clean, modular architecture with clear separation of concerns
4. **Future Extensibility**: Plugin architecture for new parsing strategies
5. **Performance**: Streaming support and optimized processing for large documents
The modular design ensures that improvements can be made incrementally while maintaining backward compatibility. The sophisticated table processing system alone represents a major advancement in handling complex financial documents accurately.

View File

@@ -0,0 +1,208 @@
# HTML Parser Quality Improvement Strategy
## Overview
Simple, iterative testing strategy for the HTML parser rewrite. The goal is rapid feedback loops where we compare OLD vs NEW parser output, identify visual/functional issues, fix them, and repeat until satisfied.
## Test Corpus
### 10 Representative Documents
Selected to cover different filing types, companies, and edge cases:
| # | Company | Filing Type | File Path | Rationale |
|---|---------|-------------|-----------|-----------|
| 1 | Apple | 10-K | `data/html/Apple.10-K.html` | Large complex filing, existing test file |
| 2 | Oracle | 10-K | `data/html/Oracle.10-K.html` | Complex financials, existing test file |
| 3 | Nvidia | 10-K | `data/html/Nvidia.10-K.html` | Tech company, existing test file |
| 4 | Microsoft | 10-K | `data/html/Microsoft.10-K.html` | Popular company, complex tables |
| 5 | Tesla | 10-K | `data/html/Tesla.10-K.html` | Manufacturing sector, different formatting |
| 6 | [TBD] | 10-Q | TBD | Quarterly report format |
| 7 | [TBD] | 10-Q | TBD | Another quarterly for variety |
| 8 | Buckle Inc | 8-K | `data/html/BuckleInc.8-K.html` | Event-driven filing |
| 9 | [TBD] | Proxy (DEF 14A) | TBD | Proxy statement with compensation tables |
| 10 | [TBD] | Edge case | TBD | Unusual formatting or very large file |
**Note**: Fill in TBD entries as we identify good test candidates.
## The 4-Step Loop
### Step 1: Run Comparison
Use existing test scripts to compare OLD vs NEW parsers:
```bash
# Full comparison with metrics
python tests/manual/check_parser_comparison.py
# Table-focused comparison with rendering
python tests/manual/check_tables.py
# Or run on specific file
python tests/manual/check_html_rewrite.py
```
**Outputs to review**:
- Console output with side-by-side Rich panels
- Metrics (parse time, table count, section detection)
- Rendered tables (old vs new)
### Step 2: Human Review
**Visual Inspection Process**:
1. Look at console output directly (Rich rendering)
2. For detailed text comparison, optionally dump to files:
- OLD parser: `doc.text()``output/old_apple.txt`
- NEW parser: `doc.text()``output/new_apple.txt`
- Use `diff` or visual diff tool
3. Take screenshots for complex table issues
4. Focus on:
- Table alignment and formatting
- Currency symbol placement (should be merged: `$1,234` not `$ | 1,234`)
- Column count (fewer is better after removing spacing columns)
- Section detection accuracy
- Text readability for LLM context
**Quality Criteria** (from goals.md):
- Semantic meaning preserved
- Tables render correctly when printed
- Better than old parser in speed, accuracy, features
- **You are the final judge**: "Does this look right?"
### Step 3: Document Bugs
Record issues in the tracker below as you find them:
| Bug # | Status | Priority | Description | File/Location | Notes |
|-------|--------|----------|-------------|---------------|-------|
| Example | Fixed | High | Currency symbols not merging in balance sheet | Apple 10-K, Table 5 | Issue in CurrencyColumnMerger |
| | | | | | |
| | | | | | |
| | | | | | |
**Status values**: Open, In Progress, Fixed, Won't Fix, Deferred
**Priority values**: Critical, High, Medium, Low
**Bug Description Template**:
- What's wrong: Clear description of the issue
- Where: Which file/table/section
- Expected: What it should look like
- Actual: What it currently looks like
- Impact: How it affects usability/readability
### Step 4: Fix & Repeat
1. Pick highest priority bug
2. Fix the code
3. Re-run comparison on affected file(s)
4. Verify fix doesn't break other files
5. Mark bug as Fixed
6. Repeat until exit criteria met
**Quick verification**:
```bash
# Re-run just the problematic file
python -c "
from edgar.documents import parse_html
from pathlib import Path
html = Path('data/html/Apple.10-K.html').read_text()
doc = parse_html(html)
# Quick inspection
print(f'Tables: {len(doc.tables)}')
print(doc.tables[5].render(width=200)) # Check specific table
"
```
## Exit Criteria
We're done when:
1. ✅ All 10 test documents parse successfully
2. ✅ Visual output looks correct (maintainer approval)
3. ✅ Tables render cleanly with proper alignment
4. ✅ No critical or high priority bugs remain
5. ✅ Performance is equal or better than old parser
6. ✅ Text extraction is complete and clean for AI context
**Final approval**: Maintainer says "This is good enough to ship."
## Testing Infrastructure
### Primary Tool: compare_parsers.py
Simple command-line tool for the quality improvement loop:
```bash
# Quick overview comparison (using shortcuts!)
python tests/manual/compare_parsers.py aapl
# See all tables in a document
python tests/manual/compare_parsers.py aapl --tables
# Compare specific table (OLD vs NEW side-by-side)
python tests/manual/compare_parsers.py aapl --table 5
# Compare text extraction
python tests/manual/compare_parsers.py msft --text
# See section detection
python tests/manual/compare_parsers.py orcl --sections
# Test with 10-Q filings
python tests/manual/compare_parsers.py 'aapl 10-q'
# Run all test files at once
python tests/manual/compare_parsers.py --all
```
**Shortcuts available**:
- Companies: `aapl`, `msft`, `tsla`, `nvda`, `orcl`
- Filing types: `10-k` (default), `10-q`, `8-k`
- Or use full file paths
**Features**:
- Clean command-line interface
- Side-by-side OLD vs NEW comparison
- Rich console output with colors and tables
- Performance metrics
- Individual table inspection
### Other Available Scripts
Additional tools for specific testing:
- `tests/manual/check_parser_comparison.py` - Full comparison with metrics
- `tests/manual/check_tables.py` - Table-specific comparison with rendering
- `tests/manual/check_html_rewrite.py` - General HTML parsing checks
- `tests/manual/check_html_parser_real_files.py` - Real filing tests
## Quick Reference
For day-to-day testing commands and usage examples, see [TESTING.md](TESTING.md).
## Notes
- **Keep it simple**: This is about rapid iteration, not comprehensive automation
- **Visual inspection is key**: Automated metrics don't catch layout/formatting issues
- **Use screenshots**: When describing bugs, screenshots speak louder than words
- **Iterative approach**: Don't try to fix everything at once, prioritize
- **Trust your judgment**: If it looks wrong, it probably is wrong
## Bug Tracker
### Active Issues
(Add bugs here as they're discovered)
### Fixed Issues
(Move completed bugs here for history)
### Deferred Issues
(Issues that aren't blocking release but could be improved later)
---
**Status**: Initial draft
**Last Updated**: 2025-10-07
**Maintainer**: Dwight Gunning

View File

@@ -0,0 +1,931 @@
"""
Document model for parsed HTML.
"""
from dataclasses import dataclass, field
from typing import Dict, List, Optional, Any, Iterator
from rich.table import Table as RichTable
from rich.console import Group
from rich.text import Text
from edgar.richtools import repr_rich
from edgar.documents.nodes import Node, SectionNode
from edgar.documents.table_nodes import TableNode
from edgar.documents.types import XBRLFact, SearchResult
@dataclass
class DocumentMetadata:
    """
    Document metadata.

    Holds provenance details about the source filing (form, company,
    identifiers, dates) plus bookkeeping recorded during parsing
    (document size, parse time, parser version, extracted XBRL facts).
    """
    source: Optional[str] = None
    form: Optional[str] = None
    company: Optional[str] = None
    cik: Optional[str] = None
    accession_number: Optional[str] = None
    filing_date: Optional[str] = None
    report_date: Optional[str] = None
    url: Optional[str] = None
    size: int = 0
    parse_time: float = 0.0
    parser_version: str = "2.0.0"
    xbrl_data: Optional[List[XBRLFact]] = None
    preserve_whitespace: bool = False
    original_html: Optional[str] = None  # Store original HTML for anchor analysis

    def to_dict(self) -> Dict[str, Any]:
        """
        Convert metadata to dictionary.

        Note: `preserve_whitespace` and `original_html` are intentionally
        omitted; they are parser internals, not document metadata.
        """
        # XBRL facts need per-fact serialization; everything else copies as-is.
        serialized_facts = (
            [fact.to_dict() for fact in self.xbrl_data]
            if self.xbrl_data else None
        )
        plain_fields = (
            'source', 'form', 'company', 'cik', 'accession_number',
            'filing_date', 'report_date', 'url', 'size', 'parse_time',
            'parser_version',
        )
        result = {field_name: getattr(self, field_name) for field_name in plain_fields}
        result['xbrl_data'] = serialized_facts
        return result
@dataclass
class Section:
    """
    Document section representation.

    Represents a logical section of the document (e.g., Risk Factors, MD&A).

    Attributes:
        name: Section identifier (e.g., "item_1", "part_i_item_1", "risk_factors")
        title: Display title (e.g., "Item 1 - Business")
        node: Node containing section content
        start_offset: Character position where section starts
        end_offset: Character position where section ends
        confidence: Detection confidence score (0.0-1.0)
        detection_method: How section was detected ('toc', 'heading', 'pattern')
        validated: Whether section has been cross-validated
        part: Optional part identifier for 10-Q filings ("I", "II", or None for 10-K)
        item: Optional item identifier (e.g., "1", "1A", "2")
        _text_extractor: Optional callback for lazy text extraction (for TOC-based sections)
    """
    name: str
    title: str
    node: SectionNode
    start_offset: int = 0
    end_offset: int = 0
    confidence: float = 1.0  # Detection confidence (0.0-1.0)
    detection_method: str = 'unknown'  # 'toc', 'heading', 'pattern', or 'unknown'
    validated: bool = False  # Cross-validated flag
    part: Optional[str] = None  # Part identifier for 10-Q: "I", "II", or None for 10-K
    item: Optional[str] = None  # Item identifier: "1", "1A", "2", etc.
    _text_extractor: Optional[Any] = field(default=None, repr=False)  # Callback for lazy text extraction

    def text(self, **kwargs) -> str:
        """Extract text from section."""
        # TOC-based sections carry a lazy extraction callback; prefer it.
        if self._text_extractor is not None:
            return self._text_extractor(self.name, **kwargs)
        # Heading/pattern-based sections extract directly from their node.
        from edgar.documents.extractors.text_extractor import TextExtractor
        return TextExtractor(**kwargs).extract_from_node(self.node)

    def tables(self) -> List[TableNode]:
        """Get all tables in section."""
        return self.node.find(lambda candidate: isinstance(candidate, TableNode))

    def search(self, query: str) -> List[SearchResult]:
        """
        Search within section.

        Simple case-insensitive substring match for now; returns at most
        one result with ~50 characters of context on each side.
        """
        haystack = self.text().lower()
        needle = query.lower()
        position = haystack.find(needle)
        if position < 0:
            return []
        snippet_start = max(0, position - 50)
        snippet_end = min(len(haystack), position + len(query) + 50)
        return [SearchResult(
            node=self.node,
            score=1.0,
            snippet=haystack[snippet_start:snippet_end],
            section=self.name
        )]

    @staticmethod
    def parse_section_name(section_name: str) -> tuple[Optional[str], Optional[str]]:
        """
        Parse section name to extract part and item identifiers.

        Handles both 10-Q part-aware names and 10-K simple names.

        Args:
            section_name: Section identifier (e.g., "part_i_item_1", "item_1a", "risk_factors")

        Returns:
            Tuple of (part, item) where:
            - part: "I", "II", or None for 10-K sections
            - item: "1", "1A", "2", etc. or None if not an item section

        Examples:
            >>> Section.parse_section_name("part_i_item_1")
            ("I", "1")
            >>> Section.parse_section_name("part_ii_item_1a")
            ("II", "1A")
            >>> Section.parse_section_name("item_7")
            (None, "7")
            >>> Section.parse_section_name("risk_factors")
            (None, None)
        """
        import re
        lowered = section_name.lower()
        # 10-Q style names: "part_i_item_1", "part_ii_item_1a"
        part_match = re.match(r'part_([ivx]+)_item_(\d+[a-z]?)', lowered)
        if part_match:
            return (part_match.group(1).upper(), part_match.group(2).upper())
        # 10-K style names: "item_1", "item_1a", "item_7"
        item_match = re.match(r'item_(\d+[a-z]?)', lowered)
        if item_match:
            return (None, item_match.group(1).upper())
        # Not a structured item section (e.g. "risk_factors").
        return (None, None)
class Sections(Dict[str, Section]):
    """
    Dictionary wrapper for sections with rich display support.

    Behaves like a normal dict but provides beautiful terminal display
    via __rich__() method when printed in rich-enabled environments.
    Also adds flexible lookup: get()/[] accept canonical keys
    ("part_i_item_1"), bare item labels ("Item 1A", "1A"), or
    (part, item) tuples, plus get_item()/get_part() helpers.
    """
    def __rich__(self):
        """Return rich representation for display: a summary table of all
        sections (sorted by part/item) followed by aggregate stats."""
        if not self:
            return Text("No sections detected", style="dim")
        # Create summary table
        table = RichTable(title="Document Sections", show_header=True, header_style="bold magenta")
        table.add_column("Section", style="cyan", no_wrap=True)
        table.add_column("Title", style="white")
        table.add_column("Confidence", justify="right", style="green")
        table.add_column("Method", style="yellow")
        table.add_column("Part/Item", style="blue")
        # Sort sections by part (roman numeral) and item number
        def sort_key(item):
            # `item` is a (name, Section) pair from self.items()
            name, section = item
            # Convert roman numerals to integers for sorting
            roman_to_int = {'i': 1, 'ii': 2, 'iii': 3, 'iv': 4, 'v': 5}
            part = section.part.lower() if section.part else ''
            item_str = section.item if section.item else ''
            # Extract part number (0 when section has no part, so it sorts first)
            part_num = roman_to_int.get(part, 0)
            # Extract item number and letter (e.g. "1A" -> (1, 'a'))
            import re
            if item_str:
                match = re.match(r'(\d+)([a-z]?)', item_str.lower())
                if match:
                    item_num = int(match.group(1))
                    item_letter = match.group(2) or ''
                    return (part_num, item_num, item_letter)
            # Fallback to name sorting; 999 pushes non-item sections last within a part
            return (part_num, 999, name)
        sorted_sections = sorted(self.items(), key=sort_key)
        # Add rows for each section
        for name, section in sorted_sections:
            # Format confidence as percentage
            confidence = f"{section.confidence:.1%}"
            # Format part/item info
            part_item = ""
            if section.part and section.item:
                part_item = f"Part {section.part}, Item {section.item}"
            elif section.item:
                part_item = f"Item {section.item}"
            elif section.part:
                part_item = f"Part {section.part}"
            # Truncate title if too long
            title = section.title
            if len(title) > 50:
                title = title[:47] + "..."
            table.add_row(
                name,
                title,
                confidence,
                section.detection_method,
                part_item
            )
        # Create summary stats
        total = len(self)
        high_conf = sum(1 for s in self.values() if s.confidence >= 0.8)
        methods = {}
        for section in self.values():
            methods[section.detection_method] = methods.get(section.detection_method, 0) + 1
        summary = Text()
        summary.append(f"\nTotal: {total} sections | ", style="dim")
        summary.append(f"High confidence (≥80%): {high_conf} | ", style="dim")
        summary.append(f"Methods: {', '.join(f'{m}={c}' for m, c in methods.items())}", style="dim")
        return Group(table, summary)
    def __repr__(self):
        # Delegate to the rich rendering so plain repr() shows the same table
        return repr_rich(self.__rich__())
    def get_item(self, item: str, part: str = None) -> Optional[Section]:
        """
        Get section by item number with optional part specification.

        Args:
            item: Item identifier (e.g., "1", "1A", "7", "Item 1", "Item 7A")
            part: Optional part specification (e.g., "I", "II", "Part I", "Part II")
                  If not specified and multiple parts contain the item, returns first match.

        Returns:
            Section object if found, None otherwise

        Examples:
            >>> sections.get_item("1")        # Returns first Item 1 (any part)
            >>> sections.get_item("1", "I")   # Returns Part I, Item 1
            >>> sections.get_item("Item 1A")  # Returns first Item 1A
            >>> sections.get_item("7A", "II") # Returns Part II, Item 7A
        """
        # Normalize item string - remove "Item " prefix if present
        item_clean = item.replace("Item ", "").replace("item ", "").strip().upper()
        # Normalize part string if provided
        part_clean = None
        if part:
            part_clean = part.replace("Part ", "").replace("part ", "").replace("PART ", "").strip().upper()
        # Search through sections (insertion order; first match wins)
        for name, section in self.items():
            if section.item and section.item.upper() == item_clean:
                if part_clean is None:
                    # No part specified - return first match
                    return section
                elif section.part and section.part.upper() == part_clean:
                    # Part matches
                    return section
        return None
    def get_part(self, part: str) -> Dict[str, Section]:
        """
        Get all sections in a specific part.

        Args:
            part: Part identifier (e.g., "I", "II", "Part I", "Part II")

        Returns:
            Dictionary of sections in that part

        Examples:
            >>> sections.get_part("I")        # All Part I sections
            >>> sections.get_part("Part II")  # All Part II sections
        """
        # Normalize part string
        part_clean = part.replace("Part ", "").replace("part ", "").replace("PART ", "").strip().upper()
        result = {}
        for name, section in self.items():
            if section.part and section.part.upper() == part_clean:
                result[name] = section
        return result
    def get(self, key, default=None):
        """
        Enhanced get method that supports flexible key formats.

        Supports:
        - Standard dict key: "part_i_item_1"
        - Item number: "Item 1", "1", "1A"
        - Part+Item: ("I", "1"), ("Part II", "7A")

        Args:
            key: Section key (string or tuple)
            default: Default value if not found

        Returns:
            Section object or default value
        """
        # Try standard dict lookup first
        if isinstance(key, str):
            result = super().get(key, None)
            if result is not None:
                return result
            # Try as item number
            result = self.get_item(key)
            if result is not None:
                return result
        # Try as (part, item) tuple
        elif isinstance(key, tuple) and len(key) == 2:
            part, item = key
            result = self.get_item(item, part)
            if result is not None:
                return result
        return default
    def __getitem__(self, key):
        """
        Enhanced __getitem__ that supports flexible key formats.

        Supports:
        - Standard dict key: sections["part_i_item_1"]
        - Item number: sections["Item 1"], sections["1A"]
        - Part+Item tuple: sections[("I", "1")], sections[("II", "7A")]

        Raises KeyError if not found (standard dict behavior).
        """
        # Try standard dict lookup first
        if isinstance(key, str):
            try:
                return super().__getitem__(key)
            except KeyError:
                # Try as item number
                result = self.get_item(key)
                if result is not None:
                    return result
        # Try as (part, item) tuple
        elif isinstance(key, tuple) and len(key) == 2:
            part, item = key
            result = self.get_item(item, part)
            if result is not None:
                return result
        # Not found - raise KeyError
        raise KeyError(key)
@dataclass
class Document:
    """
    Main document class.

    Represents a parsed HTML document with methods for content extraction,
    search, and transformation. Expensive derivations (sections, tables,
    headings, XBRL facts, default text) are computed lazily and cached in
    the private fields below.
    """
    # Core properties
    root: Node  # Root node of the parsed document tree
    metadata: DocumentMetadata = field(default_factory=DocumentMetadata)
    # Cached extractions (populated lazily by the corresponding properties)
    _sections: Optional[Sections] = field(default=None, init=False, repr=False)
    _tables: Optional[List[TableNode]] = field(default=None, init=False, repr=False)
    _headings: Optional[List[Node]] = field(default=None, init=False, repr=False)
    _xbrl_facts: Optional[List[XBRLFact]] = field(default=None, init=False, repr=False)
    _text_cache: Optional[str] = field(default=None, init=False, repr=False)
    _config: Optional[Any] = field(default=None, init=False, repr=False)  # ParserConfig reference
@property
def sections(self) -> Sections:
    """
    Get document sections using hybrid multi-strategy detection.

    Tries detection methods in order of reliability:
    1. TOC-based (0.95 confidence)
    2. Heading-based (0.7-0.9 confidence)
    3. Pattern-based (0.6 confidence)

    Returns a Sections dictionary wrapper that provides rich terminal display
    via __rich__() method. Each section includes confidence score and detection method.
    Result is computed once and cached on the instance.
    """
    if self._sections is None:
        # Get form type from config or metadata (config wins when present)
        form = None
        if self._config and hasattr(self._config, 'form'):
            form = self._config.form
        elif self.metadata and self.metadata.form:
            form = self.metadata.form
        # Only detect sections for supported form types (including amendments)
        # Normalize form type by removing /A suffix for amendments
        base_form = form.replace('/A', '') if form else None
        if base_form and base_form in ['10-K', '10-Q', '8-K']:
            from edgar.documents.extractors.hybrid_section_detector import HybridSectionDetector
            # Pass thresholds from config if available
            thresholds = self._config.detection_thresholds if self._config else None
            # Use base form type for detection (10-K/A → 10-K)
            detector = HybridSectionDetector(self, base_form, thresholds)
            detected_sections = detector.detect_sections()
        else:
            # Fallback to pattern-based for other types or unknown
            from edgar.documents.extractors.pattern_section_extractor import SectionExtractor
            extractor = SectionExtractor(form) if form else SectionExtractor()
            detected_sections = extractor.extract(self)
        # Wrap detected sections in Sections class for rich display
        self._sections = Sections(detected_sections)
    return self._sections
@property
def tables(self) -> List[TableNode]:
    """All tables in the document (computed on first access, then cached)."""
    if self._tables is None:
        self._tables = self.root.find(
            lambda candidate: isinstance(candidate, TableNode)
        )
    return self._tables
@property
def headings(self) -> List[Node]:
    """All heading nodes in the document (computed on first access, then cached)."""
    if self._headings is None:
        # Import lazily only when the cache must be filled.
        from edgar.documents.nodes import HeadingNode
        self._headings = self.root.find(
            lambda candidate: isinstance(candidate, HeadingNode)
        )
    return self._headings
@property
def xbrl_facts(self) -> List[XBRLFact]:
    """All XBRL facts in the document (extracted on first access, then cached)."""
    cached = self._xbrl_facts
    if cached is None:
        cached = self._extract_xbrl_facts()
        self._xbrl_facts = cached
    return cached
def text(self,
         clean: bool = True,
         include_tables: bool = True,
         include_metadata: bool = False,
         max_length: Optional[int] = None) -> str:
    """
    Extract text from document.

    Args:
        clean: Clean and normalize text (also applies navigation-link
            filtering to the extracted text)
        include_tables: Include table content in text
        include_metadata: Include metadata annotations
        max_length: Maximum text length

    Returns:
        Extracted text
    """
    # Serve from cache only for the exact parameter combination that is
    # stored below (clean, no tables, no metadata, no length limit).
    if (self._text_cache is not None and
        clean and not include_tables and not include_metadata and max_length is None):
        return self._text_cache
    # If whitespace was preserved during parsing and clean is default (True),
    # respect the preserve_whitespace setting
    if self.metadata.preserve_whitespace and clean:
        clean = False
    from edgar.documents.extractors.text_extractor import TextExtractor
    extractor = TextExtractor(
        clean=clean,
        include_tables=include_tables,
        include_metadata=include_metadata,
        max_length=max_length
    )
    text = extractor.extract(self)
    # Apply navigation link filtering when cleaning
    if clean:
        # Use cached/integrated navigation filtering (optimized approach)
        try:
            from edgar.documents.utils.anchor_cache import filter_with_cached_patterns
            # Use minimal cached approach (no memory overhead)
            original_html = getattr(self.metadata, 'original_html', None)
            text = filter_with_cached_patterns(text, html_content=original_html)
        except Exception:
            # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
            # still propagate; any filtering failure falls back to
            # pattern-based TOC-link filtering.
            from edgar.documents.utils.toc_filter import filter_toc_links
            text = filter_toc_links(text)
    # Cache if using default parameters
    if clean and not include_tables and not include_metadata and max_length is None:
        self._text_cache = text
    return text
def search(self, query: str, top_k: int = 10) -> List[SearchResult]:
    """
    Search document for query.

    Args:
        query: Search query
        top_k: Maximum results to return

    Returns:
        List of search results
    """
    from edgar.documents.search import DocumentSearch
    return DocumentSearch(self).search(query, top_k=top_k)
def get_section(self, section_name: str, part: Optional[str] = None) -> Optional[Section]:
    """
    Get section by name with optional part specification for 10-Q filings.

    Args:
        section_name: Section identifier (e.g., "item_1", "part_i_item_1")
        part: Optional part specification for 10-Q ("I", "II", "i", "ii")
            If provided, searches for "part_{part}_{section_name}"

    Returns:
        Section object if found, None otherwise

    Raises:
        ValueError: If an item-style name without a part matches sections
            in multiple parts of a 10-Q filing (the caller must specify part).

    Examples:
        # 10-K usage (unchanged)
        >>> doc.get_section("item_1")             # Returns Item 1
        # 10-Q usage with explicit part
        >>> doc.get_section("item_1", part="I")   # Returns Part I Item 1
        >>> doc.get_section("item_1", part="II")  # Returns Part II Item 1
        # 10-Q usage with full name
        >>> doc.get_section("part_i_item_1")      # Returns Part I Item 1
    """
    # If part is specified, construct part-aware name
    if part:
        part_normalized = part.upper()
        # Remove "item_" prefix if present in section_name
        item_name = section_name.replace("item_", "") if section_name.startswith("item_") else section_name
        full_name = f"part_{part_normalized.lower()}_item_{item_name.lower()}"
        return self.sections.get(full_name)
    # Direct lookup (works for both 10-K "item_1" and 10-Q "part_i_item_1")
    section = self.sections.get(section_name)
    if section:
        return section
    # If not found and looks like an item without part, check if we have multiple parts
    # In that case, raise a helpful error
    if section_name.startswith("item_") or section_name.replace("_", "").startswith("item"):
        # Check if we have part-aware sections (10-Q)
        matching_sections = [name for name in self.sections.keys()
                             if section_name in name and "part_" in name]
        if matching_sections:
            # Multiple parts available - user needs to specify which one
            parts = sorted(set(s.split("_")[1] for s in matching_sections if s.startswith("part_")))
            raise ValueError(
                f"Ambiguous section '{section_name}' in 10-Q filing. "
                f"Found in parts: {parts}. "
                f"Please specify part: get_section('{section_name}', part='I') or part='II'"
            )
    return None
def extract_section_text(self, section_name: str) -> Optional[str]:
    """Return the text of the named section, or None when it is absent."""
    section = self.get_section(section_name)
    return section.text() if section else None
def get_sec_section(self, section_name: str, clean: bool = True,
                    include_subsections: bool = True) -> Optional[str]:
    """
    Extract content from a specific SEC filing section using anchor analysis.

    Args:
        section_name: Section name (e.g., "Item 1", "Item 1A", "Part I")
        clean: Whether to apply text cleaning and navigation filtering
        include_subsections: Whether to include subsections

    Returns:
        Section text content or None if section not found

    Examples:
        >>> doc.get_sec_section("Item 1")   # Business description
        >>> doc.get_sec_section("Item 1A")  # Risk factors
        >>> doc.get_sec_section("Item 7")   # MD&A
    """
    # EAFP lazy initialization: build the extractor on first use and cache it.
    try:
        extractor = self._section_extractor
    except AttributeError:
        from edgar.documents.extractors.toc_section_extractor import SECSectionExtractor
        extractor = self._section_extractor = SECSectionExtractor(self)
    return extractor.get_section_text(section_name, include_subsections, clean)
def get_available_sec_sections(self) -> List[str]:
    """
    List the SEC sections that can be extracted from this document.

    Returns:
        Names accepted by get_sec_section().

    Example:
        >>> sections = doc.get_available_sec_sections()
        >>> print(sections)
        ['Part I', 'Item 1', 'Item 1A', 'Item 1B', 'Item 2', ...]
    """
    # EAFP lazy initialization: build the extractor on first use and cache it.
    try:
        extractor = self._section_extractor
    except AttributeError:
        from edgar.documents.extractors.toc_section_extractor import SECSectionExtractor
        extractor = self._section_extractor = SECSectionExtractor(self)
    return extractor.get_available_sections()
def get_sec_section_info(self, section_name: str) -> Optional[Dict]:
    """
    Get detailed information about an SEC section.

    Args:
        section_name: Section name to look up

    Returns:
        Dict with section metadata including anchor info, or None if unknown.
    """
    # EAFP lazy initialization: build the extractor on first use and cache it.
    try:
        extractor = self._section_extractor
    except AttributeError:
        from edgar.documents.extractors.toc_section_extractor import SECSectionExtractor
        extractor = self._section_extractor = SECSectionExtractor(self)
    return extractor.get_section_info(section_name)
def to_markdown(self) -> str:
    """Render the whole document as a Markdown string."""
    # Imported lazily to keep the renderer an optional cost at import time.
    from edgar.documents.renderers.markdown_renderer import MarkdownRenderer
    return MarkdownRenderer().render(self)
def to_json(self, include_content: bool = True) -> Dict[str, Any]:
    """
    Serialize the document to a JSON-compatible dictionary.

    Args:
        include_content: When True, also emit per-section detail and a
            per-table summary; otherwise only counts and names.

    Returns:
        JSON-serializable dictionary.
    """
    payload: Dict[str, Any] = {
        'metadata': self.metadata.to_dict(),
        'sections': list(self.sections.keys()),
        'table_count': len(self.tables),
        'xbrl_fact_count': len(self.xbrl_facts),
    }
    if include_content:
        payload['sections_detail'] = {
            name: {
                'title': section.title,
                'text_length': len(section.text()),
                'table_count': len(section.tables()),
            }
            for name, section in self.sections.items()
        }
        payload['tables'] = [
            {
                'type': table.table_type.name,
                'rows': len(table.rows),
                # Column count comes from the first header row when present.
                'columns': len(table.headers[0]) if table.headers else 0,
                'caption': table.caption,
            }
            for table in self.tables
        ]
    return payload
def to_dataframe(self) -> 'pd.DataFrame':
    """
    Concatenate every table in the document into one pandas DataFrame.

    Bookkeeping columns (_table_index, _table_type, and _table_caption when
    a caption exists) let rows be traced back to their source table.
    """
    import pandas as pd
    if not self.tables:
        return pd.DataFrame()
    frames = []
    for index, table in enumerate(self.tables):
        frame = table.to_dataframe()
        frame['_table_index'] = index
        frame['_table_type'] = table.table_type.name
        if table.caption:
            frame['_table_caption'] = table.caption
        frames.append(frame)
    return pd.concat(frames, ignore_index=True)
def chunks(self, chunk_size: int = 512, overlap: int = 128) -> Iterator['DocumentChunk']:
    """
    Produce document chunks for downstream processing.

    Args:
        chunk_size: Target chunk size in tokens.
        overlap: Token overlap between consecutive chunks.

    Yields:
        Document chunks.
    """
    from edgar.documents.extractors.chunk_extractor import ChunkExtractor
    return ChunkExtractor(chunk_size=chunk_size, overlap=overlap).extract(self)
def prepare_for_llm(self,
                    max_tokens: int = 4000,
                    preserve_structure: bool = True,
                    focus_sections: Optional[List[str]] = None) -> 'LLMDocument':
    """
    Prepare document for LLM processing.

    Args:
        max_tokens: Token budget for the optimized output.
        preserve_structure: Keep document structure in the output.
        focus_sections: Optional list of section names to prioritize.

    Returns:
        LLM-optimized document.
    """
    from edgar.documents.ai.llm_optimizer import LLMOptimizer
    return LLMOptimizer().optimize(
        self,
        max_tokens=max_tokens,
        preserve_structure=preserve_structure,
        focus_sections=focus_sections
    )
def extract_key_information(self) -> Dict[str, Any]:
    """Summarize headline facts about the filing: identity, sections, counts."""
    financial_tables = sum(1 for table in self.tables if table.is_financial_table)
    return {
        'company': self.metadata.company,
        'form': self.metadata.form,
        'filing_date': self.metadata.filing_date,
        'sections': list(self.sections.keys()),
        'financial_tables': financial_tables,
        'total_tables': len(self.tables),
        'xbrl_facts': len(self.xbrl_facts),
        'document_length': len(self.text()),
    }
def _extract_xbrl_facts(self) -> List[XBRLFact]:
    """Collect inline-XBRL facts from every node carrying ix_* metadata."""
    # A node participates in XBRL iff it was tagged with an 'ix_tag' value
    # during parsing; the remaining ix_* keys may legitimately be None.
    tagged_nodes = self.root.find(lambda n: n.get_metadata('ix_tag') is not None)
    return [
        XBRLFact(
            concept=node.get_metadata('ix_tag'),
            value=node.text(),
            context_ref=node.get_metadata('ix_context'),
            unit_ref=node.get_metadata('ix_unit'),
            decimals=node.get_metadata('ix_decimals'),
            scale=node.get_metadata('ix_scale'),
        )
        for node in tagged_nodes
    ]
def __len__(self) -> int:
    """Get number of top-level nodes."""
    # Document length is defined as the count of the root's direct children,
    # not the total node count (use walk() for full traversal).
    return len(self.root.children)
def __iter__(self) -> Iterator[Node]:
    """Iterate over top-level nodes."""
    # Shallow iteration only — mirrors __len__; use walk() for the full tree.
    return iter(self.root.children)
def __repr__(self) -> str:
    # NOTE(review): repr returns the *entire* rendered text, which can be very
    # large for real filings — confirm this is intentional (e.g. for notebook
    # display) rather than a placeholder.
    return self.text()
def walk(self) -> Iterator[Node]:
    """Walk entire document tree."""
    # Delegates traversal order entirely to the root node's walk().
    return self.root.walk()
def find_nodes(self, predicate) -> List[Node]:
    """Find all nodes matching predicate.

    Args:
        predicate: Callable taking a node and returning a truthy value
            for nodes to include.
    """
    return self.root.find(predicate)
def find_first_node(self, predicate) -> Optional[Node]:
    """Find first node matching predicate, or None if nothing matches.

    Args:
        predicate: Callable taking a node and returning a truthy value.
    """
    return self.root.find_first(predicate)
@property
def is_empty(self) -> bool:
    """True when the document root has no children at all."""
    return not self.root.children
@property
def has_tables(self) -> bool:
    """True when at least one table was extracted from the document."""
    return bool(self.tables)
@property
def has_xbrl(self) -> bool:
    """True when the document carries at least one inline-XBRL fact."""
    return bool(self.xbrl_facts)
def validate(self) -> List[str]:
    """
    Validate document structure.

    Returns:
        A list of human-readable validation issues (empty when clean).
    """
    issues: List[str] = []
    if self.is_empty:
        issues.append("Document is empty")
    if not self.sections:
        issues.append("No sections detected")
    # Annual/quarterly reports are expected to expose these standard sections.
    if self.metadata.form in ['10-K', '10-Q']:
        missing = [name for name in ['business', 'risk_factors', 'mda']
                   if name not in self.sections]
        if missing:
            issues.append(f"Missing expected sections: {', '.join(missing)}")
    # A node reachable from root but without a parent link is a tree defect.
    orphaned = self.root.find(lambda n: n.parent is None and n != self.root)
    if orphaned:
        issues.append(f"Found {len(orphaned)} orphaned nodes")
    return issues
@dataclass
class DocumentChunk:
    """A contiguous slice of a document, sized for downstream processing."""
    content: str           # raw chunk text
    start_node: Node       # first node covered by the chunk
    end_node: Node         # last node covered by the chunk
    section: Optional[str] = None  # owning section name, when known
    token_count: int = 0   # approximate token count of `content`

    def to_dict(self) -> Dict[str, Any]:
        """Serialize the chunk to a plain, JSON-friendly dictionary."""
        return {
            'content': self.content,
            'section': self.section,
            'token_count': self.token_count,
            'start_path': self.start_node.path,
            'end_path': self.end_node.path,
        }
@dataclass
class LLMDocument:
    """Document content pre-trimmed and annotated for LLM consumption."""
    content: str
    metadata: Dict[str, Any]
    token_count: int
    sections: List[str]
    truncated: bool = False  # True when content was cut to fit the budget

    def to_prompt(self) -> str:
        """Render the document as a prompt string with a metadata preamble."""
        lines = [
            f"Document: {self.metadata.get('form', 'Unknown')}",
            f"Company: {self.metadata.get('company', 'Unknown')}",
            f"Date: {self.metadata.get('filing_date', 'Unknown')}",
            "",
            self.content,
        ]
        if self.truncated:
            lines.append("\n[Content truncated due to length]")
        return '\n'.join(lines)

View File

@@ -0,0 +1,81 @@
"""
Custom exceptions for the HTML parser.
"""
from typing import Optional, Dict, Any
class ParsingError(Exception):
    """Base exception for parsing errors.

    Carries optional structured context and remediation suggestions so
    callers (and __str__) can surface actionable detail.
    """

    def __init__(self,
                 message: str,
                 context: Optional[Dict[str, Any]] = None,
                 suggestions: Optional[list] = None):
        super().__init__(message)
        # Keep the raw pieces so callers can inspect them programmatically.
        self.message = message
        self.context = context or {}
        self.suggestions = suggestions or []

    def __str__(self):
        parts = [self.message]
        if self.context:
            parts.append(f"Context: {self.context}")
        if self.suggestions:
            parts.append(f"Suggestions: {', '.join(self.suggestions)}")
        return '\n'.join(parts)
class HTMLParsingError(ParsingError):
    """Error parsing HTML structure (malformed markup or unexpected DOM shape)."""
    pass
class StyleParsingError(ParsingError):
    """Error parsing CSS styles (inline style attributes or style blocks)."""
    pass
class XBRLParsingError(ParsingError):
    """Error parsing inline XBRL (ix:*) markup embedded in the filing."""
    pass
class TableParsingError(ParsingError):
    """Error parsing table structure (rows, headers, or spans)."""
    pass
class SectionDetectionError(ParsingError):
    """Error detecting document sections (TOC, heading, or pattern based)."""
    pass
class DocumentTooLargeError(ParsingError):
    """Document exceeds maximum size."""

    def __init__(self, size: int, max_size: int):
        # Pre-format the message so the base class stores the final text.
        detail = f"Document size ({size:,} bytes) exceeds maximum ({max_size:,} bytes)"
        super().__init__(
            detail,
            context={'size': size, 'max_size': max_size},
            suggestions=[
                "Use streaming parser for large documents",
                "Increase max_document_size in configuration",
                "Split document into smaller parts"
            ]
        )
class InvalidConfigurationError(ParsingError):
    """Invalid parser configuration (bad option values or combinations)."""
    pass
class NodeNotFoundError(ParsingError):
    """Requested node not found in document."""
    pass
class ExtractionError(ParsingError):
    """Error extracting content (text, sections, or chunks) from a document."""
    pass

View File

@@ -0,0 +1,15 @@
"""
Content extractors for documents.
"""
from edgar.documents.extractors.text_extractor import TextExtractor
from edgar.documents.extractors.pattern_section_extractor import SectionExtractor
from edgar.documents.extractors.hybrid_section_detector import HybridSectionDetector
from edgar.documents.extractors.toc_section_detector import TOCSectionDetector
__all__ = [
'TextExtractor',
'SectionExtractor',
'HybridSectionDetector',
'TOCSectionDetector'
]

View File

@@ -0,0 +1,170 @@
"""
Heading-based section detection strategy.
Detects sections by analyzing heading nodes with HeaderInfo metadata.
This strategy provides moderate confidence (0.7-0.9) and serves as a
fallback when TOC-based detection is not available.
"""
import logging
from typing import Dict, Optional
from edgar.documents.document import Document, Section
from edgar.documents.nodes import HeadingNode, SectionNode
from edgar.documents.types import HeaderInfo
logger = logging.getLogger(__name__)
class HeadingSectionDetector:
    """
    Heading-based section detection using HeaderInfo.

    Analyzes heading nodes that have been annotated with HeaderInfo
    during parsing. Detects sections based on:
    - Item numbers (Item 1, Item 1A, etc.)
    - Heading confidence scores
    - Heading hierarchy

    Provides moderate confidence (0.7-0.9) detection.
    """

    def __init__(
        self,
        document: Document,
        form: Optional[str] = None,
        min_confidence: float = 0.5  # Lower threshold, let hybrid detector filter
    ):
        """
        Initialize heading-based detector.

        Args:
            document: Document to analyze
            form: Optional filing type for context ('10-K', '10-Q', '8-K')
            min_confidence: Minimum confidence for headings (default 0.5)
        """
        self.document = document
        self.form = form
        self.min_confidence = min_confidence

    def detect(self) -> Optional[Dict[str, Section]]:
        """
        Detect sections from heading nodes with HeaderInfo.

        Returns:
            Dictionary of sections if successful, None if no sections found
        """
        try:
            # Get heading nodes from document
            headings = self.document.headings
            if not headings:
                logger.debug("No headings found in document")
                return None
            sections = {}
            for heading in headings:
                # Check if heading has header info
                if not hasattr(heading, 'header_info') or not heading.header_info:
                    continue
                header_info = heading.header_info
                # Only use headings with sufficient confidence
                if header_info.confidence < self.min_confidence:
                    continue
                # Check if it's an item header
                if not header_info.is_item:
                    continue
                # Extract section from this heading
                section = self._extract_section_from_heading(heading, header_info)
                if section:
                    section.confidence = header_info.confidence
                    section.detection_method = 'heading'
                    sections[section.name] = section
            if not sections:
                logger.debug("No item headers found with sufficient confidence")
                return None
            logger.info(f"Heading detection found {len(sections)} sections")
            return sections
        except Exception as e:
            logger.warning(f"Heading detection failed: {e}")
            return None

    def _extract_section_from_heading(
        self, heading: HeadingNode, header_info: HeaderInfo
    ) -> Optional[Section]:
        """
        Extract section content from heading node to next heading.

        Args:
            heading: HeadingNode representing section start
            header_info: HeaderInfo with section metadata

        Returns:
            Section object if successful, None otherwise
        """
        try:
            # Create section name from item number
            if header_info.item_number:
                # Normalize: "1A" -> "item_1a", "7" -> "item_7"
                section_name = f"item_{header_info.item_number.replace('.', '_').lower()}"
            else:
                section_name = "unknown"
            # Create section node
            section_node = SectionNode(section_name=section_name)
            # Find next heading at same or higher level to determine section end
            current_level = header_info.level
            parent = heading.parent
            if not parent:
                logger.debug(f"Heading {header_info.text} has no parent")
                return None
            # Find heading position in parent's children
            try:
                heading_index = parent.children.index(heading)
            except ValueError:
                logger.debug("Could not find heading in parent's children")
                return None
            # Collect nodes until next section heading
            for i in range(heading_index + 1, len(parent.children)):
                child = parent.children[i]
                # Stop at next heading of same or higher level
                if isinstance(child, HeadingNode):
                    if hasattr(child, 'header_info') and child.header_info:
                        if child.header_info.level <= current_level:
                            break
                # Add child to section
                section_node.add_child(child)
            # Parse section name to extract part and item identifiers
            part, item = Section.parse_section_name(section_name)
            # Create Section object
            section = Section(
                name=section_name,
                title=header_info.text,
                node=section_node,
                start_offset=0,  # Would need actual text position
                end_offset=0,  # Would need actual text position
                confidence=header_info.confidence,
                detection_method='heading',
                part=part,
                item=item
            )
            return section
        except Exception as e:
            logger.warning(f"Failed to extract section from heading: {e}")
            return None

View File

@@ -0,0 +1,489 @@
"""
Hybrid section detection system with multiple fallback strategies.
This module implements a multi-strategy approach to section detection:
1. TOC-based (primary): High confidence, uses Table of Contents structure
2. Heading-based (fallback): Moderate confidence, uses multi-strategy heading detection
3. Pattern-based (last resort): Lower confidence, uses regex pattern matching
"""
import logging
from typing import Dict, Optional, List
from dataclasses import dataclass
from functools import lru_cache
from edgar.documents.document import Document, Section
from edgar.documents.nodes import SectionNode, HeadingNode
from edgar.documents.extractors.toc_section_detector import TOCSectionDetector
from edgar.documents.extractors.pattern_section_extractor import SectionExtractor
from edgar.documents.config import DetectionThresholds
logger = logging.getLogger(__name__)
class HybridSectionDetector:
    """
    Multi-strategy section detector with fallback.

    Tries strategies in order of reliability:
    1. TOC-based (0.95 confidence) - Most reliable
    2. Multi-strategy heading detection (0.7-0.9 confidence) - Fallback
    3. Pattern matching (0.6 confidence) - Last resort

    Example:
        >>> detector = HybridSectionDetector(document, '10-K')
        >>> sections = detector.detect_sections()
        >>> for name, section in sections.items():
        ...     print(f"{name}: {section.confidence:.2f} ({section.detection_method})")
    """

    def __init__(self, document: Document, form: str, thresholds: Optional[DetectionThresholds] = None):
        """
        Initialize hybrid detector.

        Args:
            document: Document to extract sections from
            form: Filing type ('10-K', '10-Q', '8-K')
            thresholds: Detection thresholds configuration
        """
        self.document = document
        self.form = form
        self.thresholds = thresholds or DetectionThresholds()
        # Initialize detection strategies
        self.toc_detector = TOCSectionDetector(document)
        self.pattern_extractor = SectionExtractor(form)

    def detect_sections(self) -> Dict[str, Section]:
        """
        Detect sections using hybrid approach with fallback and validation.

        Returns:
            Dictionary mapping section names to Section objects with confidence scores
        """
        # Strategy 1: TOC-based (most reliable)
        logger.debug("Trying TOC-based detection...")
        sections = self.toc_detector.detect()
        if sections:
            logger.info(f"TOC detection successful: {len(sections)} sections found")
            return self._validate_pipeline(sections, enable_cross_validation=True)
        # Strategy 2: Heading-based (fallback)
        logger.debug("TOC detection failed, trying heading detection...")
        sections = self._try_heading_detection()
        if sections:
            logger.info(f"Heading detection successful: {len(sections)} sections found")
            return self._validate_pipeline(sections, enable_cross_validation=False)
        # Strategy 3: Pattern-based (last resort)
        logger.debug("Heading detection failed, trying pattern matching...")
        sections = self._try_pattern_detection()
        if sections:
            logger.info(f"Pattern detection successful: {len(sections)} sections found")
            return self._validate_pipeline(sections, enable_cross_validation=False)
        logger.warning("All detection strategies failed, no sections found")
        return {}

    def _validate_pipeline(
        self,
        sections: Dict[str, Section],
        enable_cross_validation: bool = False
    ) -> Dict[str, Section]:
        """
        Apply validation pipeline to sections.

        Centralizes validation logic to eliminate duplication.

        Args:
            sections: Sections to validate
            enable_cross_validation: Whether to enable cross-validation (expensive)

        Returns:
            Validated sections
        """
        if not sections:
            return sections
        # Cross-validate (optional, expensive)
        if enable_cross_validation and self.thresholds.enable_cross_validation:
            sections = self._cross_validate(sections)
        # Validate boundaries
        sections = self._validate_boundaries(sections)
        # Deduplicate
        sections = self._deduplicate(sections)
        # Filter by confidence
        sections = self._filter_by_confidence(sections)
        return sections

    def _try_heading_detection(self) -> Optional[Dict[str, Section]]:
        """
        Try multi-strategy heading detection.

        Returns:
            Dictionary of sections if successful, None if failed
        """
        try:
            # Get heading nodes from document
            headings = self.document.headings
            if not headings:
                return None
            sections = {}
            for heading in headings:
                # Check if heading has header info
                if not hasattr(heading, 'header_info') or not heading.header_info:
                    continue
                header_info = heading.header_info
                # Only use headings with sufficient confidence
                if header_info.confidence < 0.7:
                    continue
                # Check if it's an item header
                if not header_info.is_item:
                    continue
                # Extract section from this heading to next
                section = self._extract_section_from_heading(heading, header_info)
                if section:
                    section.confidence = header_info.confidence
                    section.detection_method = 'heading'
                    sections[section.name] = section
            return sections if sections else None
        except Exception as e:
            logger.warning(f"Heading detection failed: {e}")
            return None

    def _try_pattern_detection(self) -> Optional[Dict[str, Section]]:
        """
        Try pattern-based extraction.

        Returns:
            Dictionary of sections if successful, None if failed
        """
        try:
            # Use pattern extractor
            sections = self.pattern_extractor.extract(self.document)
            # Mark with pattern detection confidence
            for section in sections.values():
                section.confidence = 0.6  # Pattern-based = lower confidence
                section.detection_method = 'pattern'
            return sections if sections else None
        except Exception as e:
            logger.warning(f"Pattern detection failed: {e}")
            return None

    def _extract_section_from_heading(self, heading: HeadingNode, header_info) -> Optional[Section]:
        """
        Extract section content from heading node to next heading.

        Args:
            heading: HeadingNode representing section start
            header_info: HeaderInfo with section metadata

        Returns:
            Section object if successful, None otherwise
        """
        try:
            # Create section name from item number.
            # Lower-case it so "1A" -> "item_1a", matching the keys produced by
            # HeadingSectionDetector and looked up by Document.get_section().
            section_name = f"item_{header_info.item_number.replace('.', '_').lower()}" if header_info.item_number else "unknown"
            # Create section node
            section_node = SectionNode(section_name=section_name)
            # Find next heading at same or higher level to determine section end
            current_level = header_info.level
            parent = heading.parent
            if not parent:
                return None
            # Find heading position in parent's children
            try:
                heading_index = parent.children.index(heading)
            except ValueError:
                return None
            # Collect nodes until next section heading
            for i in range(heading_index + 1, len(parent.children)):
                child = parent.children[i]
                # Stop at next heading of same or higher level
                if isinstance(child, HeadingNode):
                    if hasattr(child, 'header_info') and child.header_info:
                        if child.header_info.level <= current_level:
                            break
                # Add child to section
                section_node.add_child(child)
            # Parse section name to extract part and item identifiers
            # (consistent with HeadingSectionDetector).
            part, item = Section.parse_section_name(section_name)
            # Create Section object
            section = Section(
                name=section_name,
                title=header_info.text,
                node=section_node,
                start_offset=0,  # Would need actual text position
                end_offset=0,  # Would need actual text position
                confidence=header_info.confidence,
                detection_method='heading',
                part=part,
                item=item
            )
            return section
        except Exception as e:
            logger.warning(f"Failed to extract section from heading: {e}")
            return None

    def _cross_validate(self, sections: Dict[str, Section]) -> Dict[str, Section]:
        """
        Cross-validate sections using multiple detection methods.

        Boosts confidence if multiple methods detect the same section.
        Reduces confidence if methods disagree.

        Args:
            sections: Sections detected by primary method

        Returns:
            Validated sections with adjusted confidence scores
        """
        validated = {}
        # Get pattern-based sections once for comparison (not per section)
        try:
            pattern_sections = self.pattern_extractor.extract(self.document)
        except Exception as e:
            logger.debug(f"Pattern extraction failed for cross-validation: {e}")
            pattern_sections = {}
        for name, section in sections.items():
            # Try alternative detection (pattern matching for validation)
            try:
                # Check if this section is also found by pattern matching
                found_in_patterns = False
                for pattern_section in pattern_sections.values():
                    # Check for name similarity or overlap
                    if self._sections_similar(section, pattern_section):
                        found_in_patterns = True
                        break
                # Boost confidence if methods agree
                if found_in_patterns:
                    section.confidence = min(section.confidence * self.thresholds.cross_validation_boost, 1.0)
                    section.validated = True
                    logger.debug(f"Section {name} validated by multiple methods, confidence boosted to {section.confidence:.2f}")
                else:
                    # Slight reduction if not validated
                    section.confidence *= self.thresholds.disagreement_penalty
                    section.validated = False
            except Exception as e:
                logger.debug(f"Cross-validation failed for {name}: {e}")
                # Keep original confidence if validation fails
                pass
            validated[name] = section
        return validated

    def _validate_boundaries(self, sections: Dict[str, Section]) -> Dict[str, Section]:
        """
        Validate section boundaries for overlaps, gaps, and ordering.

        Args:
            sections: Sections to validate

        Returns:
            Sections with validated boundaries
        """
        if not sections:
            return sections
        # Sort by start offset
        sorted_sections = sorted(sections.items(), key=lambda x: x[1].start_offset)
        validated = {}
        prev_section = None
        for name, section in sorted_sections:
            # Check for overlap with previous section
            if prev_section and section.start_offset > 0:
                if section.start_offset < prev_section[1].end_offset:
                    # Overlap detected - adjust boundary at midpoint
                    gap_mid = (prev_section[1].end_offset + section.start_offset) // 2
                    prev_section[1].end_offset = gap_mid
                    section.start_offset = gap_mid
                    # Reduce confidence due to boundary adjustment
                    section.confidence *= self.thresholds.boundary_overlap_penalty
                    prev_section[1].confidence *= self.thresholds.boundary_overlap_penalty
                    logger.debug(f"Adjusted boundary between {prev_section[0]} and {name}")
                # Check for a suspiciously large gap between consecutive sections
                # (fixed character-count threshold, not a fraction of document size)
                elif prev_section[1].end_offset > 0:
                    gap_size = section.start_offset - prev_section[1].end_offset
                    if gap_size > 100000:  # Arbitrary large gap threshold
                        # Large gap - might indicate missing section
                        section.confidence *= 0.9
                        logger.debug(f"Large gap detected before {name}")
            validated[name] = section
            prev_section = (name, section)
        return validated

    def _deduplicate(self, sections: Dict[str, Section]) -> Dict[str, Section]:
        """
        Remove duplicate sections detected by multiple methods.

        Keeps the detection with highest confidence.

        Args:
            sections: Sections possibly containing duplicates

        Returns:
            Deduplicated sections
        """
        if len(sections) <= 1:
            return sections
        # Group similar sections
        groups = self._group_similar_sections(sections)
        deduplicated = {}
        for group in groups:
            if len(group) == 1:
                # No duplicates
                deduplicated[group[0].name] = group[0]
            else:
                # Keep section with highest confidence
                best = max(group, key=lambda s: s.confidence)
                # Merge detection methods
                methods = set(s.detection_method for s in group)
                if len(methods) > 1:
                    best.detection_method = ','.join(sorted(methods))
                    # Boost confidence for multi-method detection
                    best.confidence = min(best.confidence * 1.15, 1.0)
                    best.validated = True
                logger.debug(f"Merged duplicate sections for {best.name}, methods: {best.detection_method}")
                deduplicated[best.name] = best
        return deduplicated

    def _group_similar_sections(self, sections: Dict[str, Section]) -> List[List[Section]]:
        """
        Group sections that appear to be duplicates.

        Args:
            sections: Sections to group

        Returns:
            List of section groups
        """
        groups = []
        used = set()
        for name1, section1 in sections.items():
            if name1 in used:
                continue
            group = [section1]
            used.add(name1)
            for name2, section2 in sections.items():
                if name2 in used:
                    continue
                # Check if sections are similar
                if self._sections_similar(section1, section2):
                    group.append(section2)
                    used.add(name2)
            groups.append(group)
        return groups

    def _sections_similar(self, section1: Section, section2: Section) -> bool:
        """
        Check if two sections are similar (likely duplicates).

        Args:
            section1: First section
            section2: Second section

        Returns:
            True if sections are similar
        """
        # Normalize names for comparison
        name1 = section1.name.lower().replace('_', ' ').strip()
        name2 = section2.name.lower().replace('_', ' ').strip()
        # Check exact match after normalization
        if name1 == name2:
            return True
        # Check title similarity (exact match)
        title1 = section1.title.lower().strip()
        title2 = section2.title.lower().strip()
        if title1 == title2:
            return True
        # Check for position overlap (if positions are set)
        if section1.start_offset > 0 and section2.start_offset > 0:
            # Calculate overlap
            overlap_start = max(section1.start_offset, section2.start_offset)
            overlap_end = min(section1.end_offset, section2.end_offset)
            if overlap_end > overlap_start:
                # There is overlap
                overlap_size = overlap_end - overlap_start
                min_size = min(
                    section1.end_offset - section1.start_offset,
                    section2.end_offset - section2.start_offset
                )
                # If overlap is >50% of smaller section, consider similar
                if min_size > 0 and overlap_size / min_size > 0.5:
                    return True
        return False

    def _filter_by_confidence(self, sections: Dict[str, Section]) -> Dict[str, Section]:
        """
        Filter sections by minimum confidence threshold.

        Args:
            sections: Sections to filter

        Returns:
            Filtered sections meeting minimum confidence
        """
        # Check for filing-specific thresholds
        min_conf = self.thresholds.min_confidence
        if self.form in self.thresholds.thresholds_by_form:
            filing_thresholds = self.thresholds.thresholds_by_form[self.form]
            min_conf = filing_thresholds.get('min_confidence', min_conf)
        filtered = {}
        for name, section in sections.items():
            if section.confidence >= min_conf:
                filtered[name] = section
            else:
                logger.debug(f"Filtered out section {name} with confidence {section.confidence:.2f} < {min_conf:.2f}")
        return filtered

View File

@@ -0,0 +1,405 @@
"""
Section extraction from documents.
"""
import re
from typing import Dict, List, Optional, Tuple
from edgar.documents.document import Document, Section
from edgar.documents.nodes import Node, HeadingNode, SectionNode
class SectionExtractor:
"""
Extracts logical sections from documents.
Identifies document sections like:
- Business Overview (Item 1)
- Risk Factors (Item 1A)
- MD&A (Item 7)
- Financial Statements (Item 8)
"""
# Common section patterns for different filing types.
# Structure: {form: {section_key: [(regex, display_title), ...]}}.
# Within each section the first (Item-numbered) pattern is the canonical
# header form; the remaining patterns are looser fallbacks.
SECTION_PATTERNS = {
    '10-K': {
        'business': [
            (r'^(Item|ITEM)\s+1\.?\s*Business', 'Item 1 - Business'),
            (r'^Business\s*$', 'Business'),
            (r'^Business Overview', 'Business Overview'),
            (r'^Our Business', 'Our Business'),
            (r'^Company Overview', 'Company Overview')
        ],
        'risk_factors': [
            (r'^(Item|ITEM)\s+1A\.?\s*Risk\s+Factors', 'Item 1A - Risk Factors'),
            (r'^Risk\s+Factors', 'Risk Factors'),
            (r'^Factors\s+That\s+May\s+Affect', 'Risk Factors')
        ],
        'properties': [
            (r'^(Item|ITEM)\s+2\.?\s*Properties', 'Item 2 - Properties'),
            (r'^Properties', 'Properties'),
            (r'^Real\s+Estate', 'Real Estate')
        ],
        'legal_proceedings': [
            (r'^(Item|ITEM)\s+3\.?\s*Legal\s+Proceedings', 'Item 3 - Legal Proceedings'),
            (r'^Legal\s+Proceedings', 'Legal Proceedings'),
            (r'^Litigation', 'Litigation')
        ],
        # NOTE: 7A (market_risk) is listed before 7 (mda); if matching is
        # order-sensitive this ensures "Item 7A" is not swallowed by "Item 7".
        'market_risk': [
            (r'^(Item|ITEM)\s+7A\.?\s*Quantitative.*Disclosures', 'Item 7A - Market Risk'),
            (r'^Market\s+Risk', 'Market Risk'),
            (r'^Quantitative.*Qualitative.*Market\s+Risk', 'Market Risk')
        ],
        'mda': [
            (r'^(Item|ITEM)\s+7\.?\s*Management.*Discussion', 'Item 7 - MD&A'),
            (r'^Management.*Discussion.*Analysis', 'MD&A'),
            (r'^MD&A', 'MD&A')
        ],
        'financial_statements': [
            (r'^(Item|ITEM)\s+8\.?\s*Financial\s+Statements', 'Item 8 - Financial Statements'),
            (r'^Financial\s+Statements', 'Financial Statements'),
            (r'^Consolidated\s+Financial\s+Statements', 'Consolidated Financial Statements')
        ],
        'controls_procedures': [
            (r'^(Item|ITEM)\s+9A\.?\s*Controls.*Procedures', 'Item 9A - Controls and Procedures'),
            (r'^Controls.*Procedures', 'Controls and Procedures'),
            (r'^Internal\s+Control', 'Internal Controls')
        ]
    },
    # 10-Q item numbers repeat across Part I/Part II; part disambiguation is
    # handled separately by the extractor, not by these patterns.
    '10-Q': {
        'financial_statements': [
            (r'^(Item|ITEM)\s+1\.?\s*Financial\s+Statements', 'Item 1 - Financial Statements'),
            (r'^Financial\s+Statements', 'Financial Statements'),
            (r'^Condensed.*Financial\s+Statements', 'Condensed Financial Statements')
        ],
        'mda': [
            (r'^(Item|ITEM)\s+2\.?\s*Management.*Discussion', 'Item 2 - MD&A'),
            (r'^Management.*Discussion.*Analysis', 'MD&A')
        ],
        'market_risk': [
            (r'^(Item|ITEM)\s+3\.?\s*Quantitative.*Disclosures', 'Item 3 - Market Risk'),
            (r'^Market\s+Risk', 'Market Risk')
        ],
        'controls_procedures': [
            (r'^(Item|ITEM)\s+4\.?\s*Controls.*Procedures', 'Item 4 - Controls and Procedures'),
            (r'^Controls.*Procedures', 'Controls and Procedures')
        ],
        'legal_proceedings': [
            (r'^(Item|ITEM)\s+1\.?\s*Legal\s+Proceedings', 'Item 1 - Legal Proceedings'),
            (r'^Legal\s+Proceedings', 'Legal Proceedings')
        ],
        'risk_factors': [
            (r'^(Item|ITEM)\s+1A\.?\s*Risk\s+Factors', 'Item 1A - Risk Factors'),
            (r'^Risk\s+Factors', 'Risk Factors')
        ]
    },
    # 8-K sections use the dotted item-number convention (e.g. Item 2.02).
    '8-K': {
        'item_101': [
            (r'^(Item|ITEM)\s+1\.01', 'Item 1.01 - Entry into Material Agreement'),
            (r'^Entry.*Material.*Agreement', 'Material Agreement')
        ],
        'item_201': [
            (r'^(Item|ITEM)\s+2\.01', 'Item 2.01 - Completion of Acquisition'),
            (r'^Completion.*Acquisition', 'Acquisition')
        ],
        'item_202': [
            (r'^(Item|ITEM)\s+2\.02', 'Item 2.02 - Results of Operations'),
            (r'^Results.*Operations', 'Results of Operations')
        ],
        'item_503': [
            (r'^(Item|ITEM)\s+5\.03', 'Item 5.03 - Director/Officer Changes'),
            (r'^Amendments.*Articles', 'Charter Amendments')
        ],
        'item_801': [
            (r'^(Item|ITEM)\s+8\.01', 'Item 8.01 - Other Events'),
            (r'^Other\s+Events', 'Other Events')
        ],
        'item_901': [
            (r'^(Item|ITEM)\s+9\.01', 'Item 9.01 - Financial Statements and Exhibits'),
            (r'^Financial.*Exhibits', 'Financial Statements and Exhibits')
        ]
    }
}
def __init__(self, form: Optional[str] = None):
    """
    Initialize section extractor.

    Args:
        form: Type of filing (10-K, 10-Q, 8-K, etc.). When provided, it
            takes precedence over the document's metadata/config in
            extract().
    """
    self.form = form
def extract(self, document: Document) -> Dict[str, Section]:
    """
    Extract sections from document.

    Args:
        document: Document to extract sections from

    Returns:
        Dictionary mapping section names to Section objects
    """
    # Resolve the filing type: the explicit constructor argument wins,
    # then the document's metadata, then the parser configuration.
    # Auto-detection was removed deliberately (expensive and unnecessary).
    form = self.form
    if not form and document.metadata and document.metadata.form:
        form = document.metadata.form
    if not form and getattr(document, '_config', None) and document._config.form:
        form = document._config.form

    # Section detection only applies to forms with standard layouts.
    if form not in ('10-K', '10-Q', '8-K'):
        return {}

    patterns = self.SECTION_PATTERNS.get(form, {})
    if not patterns:
        return {}  # No patterns defined for this form type

    headers = self._find_section_headers(document)

    # 10-Q filings repeat item numbers in Part I and Part II, so the part
    # boundaries are needed to disambiguate them.
    part_context = self._detect_10q_parts(headers) if form == '10-Q' else None

    matched = self._match_sections(headers, patterns, document, part_context)
    return self._create_sections(matched, document)
# NOTE: _detect_form() removed - form type should be known from context
# Filing metadata should be set by the caller (Filing class, TenK/TenQ, etc.)
# NOTE: _infer_form_from_headers() kept for backward compatibility but not used
# in normal flow anymore. Form type should always be provided explicitly.
def _infer_form_from_headers(self, document: Document) -> str:
    """
    Infer filing type from section headers.

    NOTE: Kept for backward compatibility only; the normal flow expects
    the form type to be provided explicitly via config or metadata.
    """
    texts = [h.text().upper() for h in document.headings if h.text()]

    # 10-K filings carry the classic Item 1/1A/7/8 structure.
    looks_like_10k = any(
        'ITEM 1.' in t or 'ITEM 1A.' in t or 'ITEM 7.' in t or 'ITEM 8.' in t
        for t in texts
    )
    # 10-Q filings pair low item numbers with quarterly section titles.
    looks_like_10q = any(
        ('ITEM 1.' in t and 'FINANCIAL STATEMENTS' in t)
        or ('ITEM 2.' in t and 'MANAGEMENT' in t)
        or 'ITEM 3.' in t
        or 'ITEM 4.' in t
        for t in texts
    )
    # 8-K items use the dotted two-digit form, e.g. "Item 5.02".
    looks_like_8k = any(re.search(r'ITEM \d\.\d{2}', t) for t in texts)

    if looks_like_10k and not looks_like_10q:
        return '10-K'
    if looks_like_10q:
        return '10-Q'
    if looks_like_8k:
        return '8-K'
    return 'UNKNOWN'
def _get_general_patterns(self) -> Dict[str, List[Tuple[str, str]]]:
"""Get general section patterns."""
return {
'business': [
(r'^Business', 'Business'),
(r'^Overview', 'Overview'),
(r'^Company', 'Company')
],
'financial': [
(r'^Financial\s+Statements', 'Financial Statements'),
(r'^Consolidated.*Statements', 'Consolidated Statements')
],
'notes': [
(r'^Notes\s+to.*Financial\s+Statements', 'Notes to Financial Statements'),
(r'^Notes\s+to.*Statements', 'Notes')
]
}
def _find_section_headers(self, document: Document) -> List[Tuple[Node, str, int]]:
    """Collect candidate section headers as (node, text, position) tuples."""
    candidates: List[Tuple[Node, str, int]] = []

    # Direct heading nodes contribute their own text.
    for heading in document.root.find(lambda n: isinstance(n, HeadingNode)):
        heading_text = heading.text()
        if heading_text:
            candidates.append(
                (heading, heading_text, self._get_node_position(heading, document))
            )

    # Section nodes contribute their first heading's text, but the section
    # node itself is kept as the anchor.
    for section in document.root.find(lambda n: isinstance(n, SectionNode)):
        first_heading = section.find_first(lambda n: isinstance(n, HeadingNode))
        if first_heading:
            heading_text = first_heading.text()
            if heading_text:
                candidates.append(
                    (section, heading_text, self._get_node_position(section, document))
                )

    # Order by document position.
    return sorted(candidates, key=lambda entry: entry[2])
def _get_node_position(self, node: Node, document: Document) -> int:
"""Get position of node in document."""
position = 0
for n in document.root.walk():
if n == node:
return position
position += 1
return position
def _detect_10q_parts(self, headers: List[Tuple[Node, str, int]]) -> Dict[int, str]:
"""
Detect Part I and Part II boundaries in 10-Q filings.
Args:
headers: List of (node, text, position) tuples
Returns:
Dict mapping header index to part name ("Part I" or "Part II")
"""
part_context = {}
current_part = None
part_i_pattern = re.compile(r'^\s*PART\s+I\b', re.IGNORECASE)
part_ii_pattern = re.compile(r'^\s*PART\s+II\b', re.IGNORECASE)
for i, (node, text, position) in enumerate(headers):
text_stripped = text.strip()
# Check if this is a Part I or Part II header
if part_i_pattern.match(text_stripped):
current_part = "Part I"
part_context[i] = current_part
elif part_ii_pattern.match(text_stripped):
current_part = "Part II"
part_context[i] = current_part
elif current_part:
# Headers after a Part declaration belong to that part
part_context[i] = current_part
return part_context
def _match_sections(self,
                    headers: List[Tuple[Node, str, int]],
                    patterns: Dict[str, List[Tuple[str, str]]],
                    document: Document,
                    part_context: Optional[Dict[int, str]] = None) -> Dict[str, Tuple[Node, str, int, int]]:
    """Match headers to section patterns.

    Args:
        headers: Ordered (node, text, position) candidates from
            _find_section_headers.
        patterns: Mapping of section name -> list of (regex, title) pairs,
            ordered from most to least specific.
        document: Document being analyzed (used to locate section ends).
        part_context: Optional mapping of header index -> "Part I"/"Part II"
            (10-Q only); when present, titles and result keys are
            part-qualified.

    Returns:
        Mapping of section key -> (node, title, start_position, end_position).
    """
    matched_sections = {}
    used_headers = set()  # header indices already consumed by some section
    # Try to match each pattern
    for section_name, section_patterns in patterns.items():
        for pattern, title in section_patterns:
            for i, (node, text, position) in enumerate(headers):
                if i in used_headers:
                    continue
                # Try to match pattern
                if re.match(pattern, text.strip(), re.IGNORECASE):
                    # Find end position (next section or end of document)
                    end_position = self._find_section_end(i, headers, document)
                    # For 10-Q, prefix with Part I or Part II
                    final_title = title
                    if part_context and i in part_context:
                        final_title = f"{part_context[i]} - {title}"
                    # Use final_title as key to avoid conflicts.
                    # NOTE: with part_context the key is the part-qualified
                    # title, so the `section_name in matched_sections` check
                    # below stays False and later patterns may still match the
                    # same logical section in the other part — this is what
                    # lets e.g. "Item 1" be captured in both Part I and
                    # Part II of a 10-Q.
                    section_key = final_title if part_context and i in part_context else section_name
                    matched_sections[section_key] = (node, final_title, position, end_position)
                    used_headers.add(i)
                    break
            # If we found a match, move to next section
            if section_name in matched_sections:
                break
    return matched_sections
def _find_section_end(self,
                      section_index: int,
                      headers: List[Tuple[Node, str, int]],
                      document: Document) -> int:
    """Return the walk position where the section at section_index ends."""

    def _rank(candidate: Node) -> int:
        # Non-heading anchors (e.g. SectionNode) are treated as top level.
        return candidate.level if isinstance(candidate, HeadingNode) else 1

    # The section ends at the first subsequent header of equal or higher
    # rank (lower or equal numeric level).
    if section_index + 1 < len(headers):
        current_rank = _rank(headers[section_index][0])
        for later_node, _, later_position in headers[section_index + 1:]:
            if _rank(later_node) <= current_rank:
                return later_position

    # No such header: the section extends to the end of the document.
    return sum(1 for _ in document.root.walk())
def _create_sections(self,
                     matched_sections: Dict[str, Tuple[Node, str, int, int]],
                     document: Document) -> Dict[str, Section]:
    """Create Section objects from matches.

    Args:
        matched_sections: Mapping of section key -> (node, title, start, end)
            walk positions produced by _match_sections.
        document: Source document whose nodes are collected into each section.

    Returns:
        Mapping of section key -> Section, each tagged with pattern-based
        (0.7) confidence.
    """
    sections = {}
    for section_name, (node, title, start_pos, end_pos) in matched_sections.items():
        # Create section node containing all content in range
        section_node = SectionNode(section_name=section_name)
        # Find all nodes in position range
        position = 0
        for n in document.root.walk():
            if start_pos <= position < end_pos:
                # Clone node and add to section
                # (In real implementation, would properly handle node hierarchy)
                # NOTE(review): add_child on a walked node may re-parent it
                # rather than clone — confirm this does not mutate the
                # original tree.
                section_node.add_child(n)
            position += 1
        # Parse section name to extract part and item identifiers
        part, item = Section.parse_section_name(section_name)
        # Create Section object
        section = Section(
            name=section_name,
            title=title,
            node=section_node,
            start_offset=start_pos,
            end_offset=end_pos,
            confidence=0.7,  # Pattern-based detection = moderate confidence
            detection_method='pattern',  # Method: regex pattern matching
            part=part,
            item=item
        )
        sections[section_name] = section
    return sections

View File

@@ -0,0 +1,348 @@
"""
Text extraction from documents with various options.
"""
import re
from typing import List, Optional, Set
from edgar.documents.document import Document
from edgar.documents.nodes import Node, TextNode, HeadingNode, ParagraphNode
from edgar.documents.table_nodes import TableNode
from edgar.documents.types import NodeType
class TextExtractor:
    """
    Extracts text from documents with configurable options.

    Supports:
    - Clean text extraction for AI/NLP
    - Table inclusion/exclusion
    - Metadata annotations
    - Length limiting
    - Smart whitespace handling
    """

    # Characters that may open a bullet item. The non-ASCII entries had been
    # reduced to empty strings by an encoding accident, which made the bullet
    # check match every string (''.startswith('') is always True). Restored
    # here as explicit escapes; NOTE(review): confirm this set against the
    # originally intended glyphs.
    BULLET_CHARS = [
        '\u2022',  # bullet
        '\u25e6',  # white bullet
        '\u25aa',  # black small square
        '\u00b7',  # middle dot
        '\u25cf',  # black circle
        '\u2023',  # triangular bullet
        '-',
        '*',
    ]

    def __init__(self,
                 clean: bool = True,
                 include_tables: bool = True,
                 include_metadata: bool = False,
                 include_links: bool = False,
                 max_length: Optional[int] = None,
                 preserve_structure: bool = False):
        """
        Initialize text extractor.

        Args:
            clean: Clean and normalize text
            include_tables: Include table content
            include_metadata: Include metadata annotations
            include_links: Include link URLs (stored; not used in this class)
            max_length: Maximum text length
            preserve_structure: Preserve document structure with markers
        """
        self.clean = clean
        self.include_tables = include_tables
        self.include_metadata = include_metadata
        self.include_links = include_links
        self.max_length = max_length
        self.preserve_structure = preserve_structure
        # Track what we've extracted to avoid duplicates (shared nodes)
        self._extracted_ids: Set[str] = set()

    def extract(self, document: Document) -> str:
        """
        Extract text from document.

        Args:
            document: Document to extract from

        Returns:
            Extracted text
        """
        parts = []
        self._extracted_ids.clear()
        # Extract from root
        self._extract_from_node(document.root, parts, depth=0)
        # Join parts
        if self.preserve_structure:
            text = '\n'.join(parts)
        else:
            text = '\n\n'.join(filter(None, parts))
        # Apply minimal global cleaning - tables are already handled
        # appropriately per node
        if self.clean:
            text = self._clean_document_text(text)
        # Limit length if requested
        if self.max_length and len(text) > self.max_length:
            text = self._truncate_text(text, self.max_length)
        return text

    def extract_from_node(self, node: Node) -> str:
        """Extract text from a specific node (public entry point)."""
        parts = []
        self._extracted_ids.clear()
        self._extract_from_node(node, parts, depth=0)
        text = '\n\n'.join(filter(None, parts))
        if self.clean:
            text = self._clean_document_text(text)
        return text

    def _extract_from_node(self, node: Node, parts: List[str], depth: int):
        """Recursively extract text from node - render each node type appropriately."""
        # Skip if already extracted (handles shared nodes)
        if node.id in self._extracted_ids:
            return
        self._extracted_ids.add(node.id)
        # Handle based on node type - like old parser's block.get_text()
        if isinstance(node, TableNode):
            if self.include_tables:
                # Tables render themselves - preserve their formatting
                self._extract_table(node, parts)
        elif isinstance(node, HeadingNode):
            # Headings get cleaned text
            self._extract_heading(node, parts, depth)
        elif isinstance(node, TextNode):
            # Text nodes get cleaned if cleaning is enabled
            text = node.text()
            if text:
                if self.clean:
                    text = self._clean_text_content(text)  # Clean non-table text
                if self.include_metadata and node.metadata:
                    text = self._annotate_with_metadata(text, node.metadata)
                parts.append(text)
        elif isinstance(node, ParagraphNode):
            # Extract paragraph as unified text to maintain flow of inline elements
            text = node.text()
            if text:
                if self.clean:
                    text = self._clean_text_content(text)
                if self.include_metadata and node.metadata:
                    text = self._annotate_with_metadata(text, node.metadata)
                parts.append(text)
            # Don't process children since we already got the paragraph text
            return
        else:
            # Check if this looks like a bullet point container that should
            # flow together on one line
            if self._is_bullet_point_container(node):
                # Extract text from bullet point children and join with
                # spaces (not newlines)
                bullet_parts = []
                for child in node.children:
                    child_text = child.text() if hasattr(child, 'text') else ""
                    if child_text and child_text.strip():
                        bullet_parts.append(child_text.strip())
                if bullet_parts:
                    # Join with spaces for bullet points
                    text = ' '.join(bullet_parts)
                    if self.clean:
                        text = self._clean_text_content(text)
                    if self.include_metadata and node.metadata:
                        text = self._annotate_with_metadata(text, node.metadata)
                    parts.append(text)
                # Don't process children since we already got the unified text
                return
            # For other nodes, extract text content and clean if appropriate
            if hasattr(node, 'content') and isinstance(node.content, str):
                text = node.content
                if text and text.strip():
                    if self.clean:
                        text = self._clean_text_content(text)  # Clean non-table text
                    if self.include_metadata and node.metadata:
                        text = self._annotate_with_metadata(text, node.metadata)
                    parts.append(text)
        # Process children
        for child in node.children:
            self._extract_from_node(child, parts, depth + 1)

    def _extract_heading(self, node: HeadingNode, parts: List[str], depth: int):
        """Extract heading text, optionally prefixed with '#' structure markers."""
        text = node.text()
        if not text:
            return
        if self.preserve_structure:
            # Add structure markers: one '#' per heading level (markdown-like)
            marker = '#' * node.level
            text = f"{marker} {text}"
        if self.include_metadata and node.metadata:
            text = self._annotate_with_metadata(text, node.metadata)
        parts.append(text)

    def _extract_table(self, table: TableNode, parts: List[str]):
        """Extract table content - preserve original formatting like old parser."""
        if self.preserve_structure:
            parts.append("[TABLE START]")
        # Add table caption if present
        if table.caption:
            caption_text = table.caption
            if self.clean:
                # Clean caption but not table content
                caption_text = self._clean_text_content(caption_text)
            if self.preserve_structure:
                parts.append(f"Caption: {caption_text}")
            else:
                parts.append(caption_text)
        # Extract table text - PRESERVE FORMATTING (like old parser's
        # TableBlock.get_text())
        table_text = table.text()
        if table_text:
            # Tables render their own formatting - don't apply text cleaning
            # so column alignment survives
            parts.append(table_text)
        if self.preserve_structure:
            parts.append("[TABLE END]")

    def _annotate_with_metadata(self, text: str, metadata: dict) -> str:
        """Prefix text with bracketed annotations derived from node metadata."""
        annotations = []
        # Add XBRL annotations
        if 'ix_tag' in metadata:
            annotations.append(f"[XBRL: {metadata['ix_tag']}]")
        # Add section annotations
        if 'section_name' in metadata:
            annotations.append(f"[Section: {metadata['section_name']}]")
        # Add semantic type
        if 'semantic_type' in metadata:
            annotations.append(f"[Type: {metadata['semantic_type']}]")
        if annotations:
            return f"{' '.join(annotations)} {text}"
        return text

    def _clean_text_content(self, text: str) -> str:
        """Clean regular text content (not tables) - like old parser text cleaning."""
        if not text:
            return text
        # Replace multiple spaces with single space for regular text
        text = re.sub(r' {2,}', ' ', text)
        # Clean up space around newlines
        text = re.sub(r' *\n *', '\n', text)
        # Remove leading/trailing whitespace from lines
        lines = text.split('\n')
        lines = [line.strip() for line in lines]
        text = '\n'.join(lines)
        # Normalize quotes and dashes
        text = self._normalize_punctuation(text)
        return text

    def _is_bullet_point_container(self, node) -> bool:
        """Check if a container node represents a bullet point that should flow as one line."""
        from edgar.documents.nodes import ContainerNode
        if not isinstance(node, ContainerNode):
            return False
        # Must have at least 2 children (bullet + content)
        if len(node.children) < 2:
            return False
        # Get the text of all children to check for bullet patterns
        all_text = node.text()
        if not all_text:
            return False
        # Check if the text starts with a common bullet character.
        # BUG FIX: this list previously contained empty strings (mangled
        # unicode), making the check pass for every string.
        if not any(all_text.strip().startswith(char) for char in self.BULLET_CHARS):
            return False
        # Check if container has flex display (common for bullet point layouts)
        if hasattr(node, 'style') and node.style and hasattr(node.style, 'display'):
            if node.style.display == 'flex':
                return True
        # Check if it has bullet-like structure: short first child + longer content
        if len(node.children) >= 2:
            first_child_text = node.children[0].text() if hasattr(node.children[0], 'text') else ""
            second_child_text = node.children[1].text() if hasattr(node.children[1], 'text') else ""
            # First child is very short (likely bullet), second is longer (content)
            if len(first_child_text.strip()) <= 3 and len(second_child_text.strip()) > 10:
                return True
        return False

    def _clean_document_text(self, text: str) -> str:
        """Apply minimal document-level cleaning that preserves table formatting."""
        if not text:
            return text
        # Only apply global formatting that doesn't affect table alignment:
        # replace excessive newlines (4+ consecutive) with a triple newline
        text = re.sub(r'\n{4,}', '\n\n\n', text)
        # Remove empty lines at start/end only
        return text.strip()

    def _normalize_punctuation(self, text: str) -> str:
        """Normalize punctuation for cleaner text.

        Curly quotes become straight ASCII quotes, em/en dashes become
        ' - ', and spacing around sentence punctuation is tightened.
        """
        # Normalize curly quotes (these literals had been mangled into
        # no-op replacements by an encoding accident).
        text = text.replace('\u201c', '"').replace('\u201d', '"')
        text = text.replace('\u2018', "'").replace('\u2019', "'")
        # Normalize dashes. BUG FIX: the dash literals had become empty
        # strings, and str.replace('', ' - ') inserts ' - ' between every
        # character of the input.
        text = text.replace('\u2014', ' - ')  # em dash
        text = text.replace('\u2013', ' - ')  # en dash
        # Fix spacing around punctuation
        text = re.sub(r'\s+([.,;!?])', r'\1', text)
        text = re.sub(r'([.,;!?])\s*', r'\1 ', text)
        # Remove extra spaces
        text = re.sub(r' {2,}', ' ', text)
        return text.strip()

    def _truncate_text(self, text: str, max_length: int) -> str:
        """Truncate text intelligently, preferring sentence/word boundaries."""
        if len(text) <= max_length:
            return text
        # Try to truncate at sentence boundary
        truncated = text[:max_length]
        last_period = truncated.rfind('.')
        last_newline = truncated.rfind('\n')
        # Choose the better truncation point
        truncate_at = max(last_period, last_newline)
        if truncate_at > max_length * 0.8:  # If we found a good boundary
            return text[:truncate_at + 1].strip()
        # Otherwise truncate at word boundary
        last_space = truncated.rfind(' ')
        if last_space > max_length * 0.9:
            return text[:last_space].strip() + '...'
        # Last resort: hard truncate
        return text[:max_length - 3].strip() + '...'

View File

@@ -0,0 +1,178 @@
"""
TOC-based section detection strategy.
Detects sections using Table of Contents structure. Provides highest
confidence (0.95) and includes full text extraction capabilities.
This detector wraps SECSectionExtractor which has proven implementations of:
- Multi-column TOC support (checks all preceding table cells)
- Nested anchor handling (traverses up to find content container)
- Full section text extraction
"""
import logging
from typing import Dict, Optional
from edgar.documents.document import Document, Section
from edgar.documents.nodes import SectionNode
from edgar.documents.extractors.toc_section_extractor import SECSectionExtractor
logger = logging.getLogger(__name__)
class TOCSectionDetector:
    """
    TOC-based section detection strategy.

    Uses the filing's Table of Contents to locate section boundaries and
    lazily extract full section text. Detections carry high confidence
    (0.95).

    Wraps the proven SECSectionExtractor, which implements multi-column TOC
    support, nested-anchor handling, and boundary-aware text extraction.
    """

    def __init__(self, document: Document):
        """
        Initialize TOC-based detector.

        Args:
            document: Document to analyze (must have metadata.original_html)
        """
        self.document = document
        self.extractor = SECSectionExtractor(document)

    def detect(self) -> Optional[Dict[str, Section]]:
        """
        Detect sections using TOC structure.

        Returns:
            Dictionary mapping section names to Section objects, or None if
            the original HTML is unavailable or no sections were found.
        """
        if not getattr(self.document.metadata, 'original_html', None):
            logger.debug("TOC detection unavailable: original_html not in document metadata")
            return None

        try:
            section_names = self.extractor.get_available_sections()
            if not section_names:
                logger.debug("No sections found in TOC")
                return None

            detected: Dict[str, Section] = {}
            for name in section_names:
                section = self._build_section(name)
                if section is not None:
                    detected[name] = section

            if not detected:
                return None
            logger.info(f"TOC detection found {len(detected)} sections")
            return detected
        except Exception as e:
            logger.warning(f"TOC detection failed: {e}", exc_info=True)
            return None

    def _build_section(self, name: str) -> Optional[Section]:
        """Build a Section for one TOC entry, or None if it has no content."""
        info = self.extractor.get_section_info(name)
        if not info:
            logger.debug(f"Skipping {name}: no section info")
            return None

        # Container sections (e.g. "Item 1", "Item 10") may have no direct
        # text but still aggregate their subsections.
        text = self.extractor.get_section_text(name, include_subsections=True)
        if not text and not info.get('subsections', []):
            logger.debug(f"Skipping {name}: no text and no subsections")
            return None

        extractor = self.extractor

        def _lazy_text(section_name=None, **kwargs):
            # Lazy text loader: the captured `name` wins over any passed
            # section_name argument.
            clean = kwargs.get('clean', True)
            return extractor.get_section_text(name, include_subsections=True, clean=clean) or ""

        # Parse section name to extract part and item identifiers
        part, item = Section.parse_section_name(name)

        return Section(
            name=name,
            title=info.get('canonical_name', name),
            node=SectionNode(section_name=name),  # placeholder; content loads lazily
            start_offset=0,  # Actual offsets would require a full parse
            end_offset=len(text) if text else 0,
            confidence=0.95,  # TOC-based = high confidence
            detection_method='toc',
            part=part,
            item=item,
            _text_extractor=_lazy_text,
        )
def get_section_text(document: Document, section_name: str) -> Optional[str]:
    """
    Get section text using TOC-based extraction.

    Args:
        document: Document to extract from
        section_name: Section name (e.g., 'Item 1', 'Item 1A')

    Returns:
        Section text if available, None otherwise
    """
    # TOC extraction needs the original HTML; bail out early without it.
    if not getattr(document.metadata, 'original_html', None):
        return None
    try:
        return SECSectionExtractor(document).get_section_text(section_name)
    except Exception as e:
        logger.warning(f"Failed to get section text for {section_name}: {e}")
        return None
def get_available_sections(document: Document) -> list[str]:
    """
    Get list of available sections from TOC.

    Args:
        document: Document to analyze

    Returns:
        List of section names found in TOC (empty when HTML is unavailable
        or analysis fails)
    """
    # TOC extraction needs the original HTML; bail out early without it.
    if not getattr(document.metadata, 'original_html', None):
        return []
    try:
        return SECSectionExtractor(document).get_available_sections()
    except Exception as e:
        logger.warning(f"Failed to get available sections: {e}")
        return []

View File

@@ -0,0 +1,383 @@
"""
Section extraction for SEC filings using Table of Contents analysis.
This system uses TOC structure to extract specific sections like "Item 1",
"Item 1A", etc. from SEC filings. This approach works consistently across
all SEC filings regardless of whether they use semantic anchors or generated IDs.
"""
import re
from typing import Dict, List, Optional, Tuple, Set
from dataclasses import dataclass
from lxml import html as lxml_html
from edgar.documents.nodes import Node, SectionNode
from edgar.documents.document import Document
from edgar.documents.utils.toc_analyzer import TOCAnalyzer
@dataclass
class SectionBoundary:
    """Represents the boundaries of a document section."""
    name: str  # Canonical section name, e.g. "Item 1A"
    anchor_id: str  # HTML id of the section's TOC anchor target
    start_element_id: Optional[str] = None  # id of first content element (if known)
    end_element_id: Optional[str] = None  # anchor id where the next section starts
    start_node: Optional[Node] = None  # parsed-tree start node (if resolved)
    end_node: Optional[Node] = None  # parsed-tree end node (if resolved)
    text_start: Optional[int] = None  # Character position in full text
    text_end: Optional[int] = None  # Character position in full text
    confidence: float = 1.0  # Detection confidence (0.0-1.0)
    detection_method: str = 'unknown'  # How section was detected (e.g. 'toc')
class SECSectionExtractor:
"""
Extract specific sections from SEC filings using Table of Contents analysis.
This uses TOC structure to identify section boundaries and extract content
between them. Works consistently for all SEC filings.
"""
def __init__(self, document: Document):
    """Initialize the extractor and eagerly analyze the document's TOC.

    Args:
        document: Parsed document; metadata.original_html is required for
            any extraction to succeed.
    """
    self.document = document
    self.section_map = {}  # Maps section names to canonical names
    self.section_boundaries = {}  # Maps section names to SectionBoundary objects
    self.toc_analyzer = TOCAnalyzer()
    # Build the boundary map up front so lookups are cheap afterwards.
    self._analyze_sections()
def _analyze_sections(self) -> None:
    """
    Analyze the document using TOC structure to identify section boundaries.

    Populates self.section_boundaries (name -> SectionBoundary) and
    self.section_map (name -> canonical name) from the Table of Contents.
    Silently does nothing when original HTML or a TOC is unavailable.
    """
    # Get the original HTML if available
    html_content = getattr(self.document.metadata, 'original_html', None)
    if not html_content:
        return
    # Use TOC analysis to find sections
    toc_mapping = self.toc_analyzer.analyze_toc_structure(html_content)
    if not toc_mapping:
        return  # No sections found
    # Handle XML declaration issues (lxml rejects it on unicode input)
    if html_content.startswith('<?xml'):
        html_content = re.sub(r'<\?xml[^>]*\?>', '', html_content, count=1)
    tree = lxml_html.fromstring(html_content)
    sec_sections = {}
    for section_name, anchor_id in toc_mapping.items():
        # Verify the anchor target actually exists in the document body
        target_elements = tree.xpath(f'//*[@id="{anchor_id}"]')
        if target_elements:
            element = target_elements[0]
            # Use TOC-based section info for logical ordering
            section_type, order = self.toc_analyzer._get_section_type_and_order(section_name)
            sec_sections[section_name] = {
                'anchor_id': anchor_id,
                'element': element,
                'canonical_name': section_name,
                'type': section_type,
                'order': order,
                'confidence': 0.95,  # TOC-based detection = high confidence
                'detection_method': 'toc'  # Method: Table of Contents
            }
    if not sec_sections:
        return  # No valid sections found
    # Sort sections by their logical order
    sorted_sections = sorted(sec_sections.items(), key=lambda x: x[1]['order'])
    # Calculate section boundaries: each section ends where the next begins
    for i, (section_name, section_data) in enumerate(sorted_sections):
        start_anchor = section_data['anchor_id']
        # End boundary is the start of the next section (if any)
        end_anchor = None
        if i + 1 < len(sorted_sections):
            next_section = sorted_sections[i + 1][1]
            end_anchor = next_section['anchor_id']
        self.section_boundaries[section_name] = SectionBoundary(
            name=section_name,
            anchor_id=start_anchor,
            end_element_id=end_anchor,
            confidence=section_data.get('confidence', 0.95),
            detection_method=section_data.get('detection_method', 'toc')
        )
    self.section_map = {name: data['canonical_name'] for name, data in sec_sections.items()}
def get_available_sections(self) -> List[str]:
    """
    Get list of available sections that can be extracted.

    Returns:
        Section names sorted by their TOC anchor id.
    """
    # NOTE(review): sorting by anchor id assumes ids sort in document
    # order — confirm this holds for auto-generated anchors.
    boundaries = self.section_boundaries
    return sorted(boundaries, key=lambda name: boundaries[name].anchor_id)
def get_section_text(self, section_name: str,
                     include_subsections: bool = True,
                     clean: bool = True) -> Optional[str]:
    """
    Extract text content for a specific section.

    Args:
        section_name: Name of section (e.g., "Item 1", "Item 1A", "Part I")
        include_subsections: Whether to include subsections
        clean: Whether to apply text cleaning

    Returns:
        Section text content or None if section not found
    """
    # Normalize section name (e.g. "item 1a." -> "Item 1A")
    normalized_name = self._normalize_section_name(section_name)
    if normalized_name not in self.section_boundaries:
        return None
    boundary = self.section_boundaries[normalized_name]
    # Extract content between boundaries using HTML parsing
    html_content = getattr(self.document.metadata, 'original_html', None)
    if not html_content:
        return None
    try:
        section_text = self._extract_section_content(html_content, boundary, include_subsections, clean)
        # If no direct content but include_subsections=True, aggregate
        # subsection text. This handles container sections like "Item 1"
        # whose content lives entirely inside "Item 1A", "Item 1B", ...
        if not section_text and include_subsections:
            subsections = self._get_subsections(normalized_name)
            if subsections:
                # Recursively get text from all subsections
                subsection_texts = []
                for subsection_name in subsections:
                    subsection_text = self.get_section_text(subsection_name, include_subsections=True, clean=clean)
                    if subsection_text:
                        subsection_texts.append(subsection_text)
                if subsection_texts:
                    section_text = '\n\n'.join(subsection_texts)
        return section_text
    except Exception as e:
        # Broad catch is deliberate: any HTML-parsing failure falls back
        # to node-based extraction. NOTE(review): `e` is unused — consider
        # logging it before falling back.
        return self._extract_section_fallback(section_name, clean)
def _normalize_section_name(self, section_name: str) -> str:
"""Normalize section name for lookup."""
# Handle common variations
name = section_name.strip()
# "Item 1" vs "Item 1." vs "Item 1:"
name = re.sub(r'[.:]$', '', name)
# Case normalization
if re.match(r'item\s+\d+', name, re.IGNORECASE):
match = re.match(r'item\s+(\d+[a-z]?)', name, re.IGNORECASE)
if match:
name = f"Item {match.group(1).upper()}"
elif re.match(r'part\s+[ivx]+', name, re.IGNORECASE):
match = re.match(r'part\s+([ivx]+)', name, re.IGNORECASE)
if match:
name = f"Part {match.group(1).upper()}"
return name
def _extract_section_content(self, html_content: str, boundary: SectionBoundary,
                             include_subsections: bool, clean: bool) -> str:
    """
    Extract section content from HTML between anchors.

    Args:
        html_content: Full HTML content
        boundary: Section boundary info
        include_subsections: Whether to include subsections
        clean: Whether to clean the text

    Returns:
        Extracted section text; empty string when the anchor is missing or
        has no following content
    """
    # Handle XML declaration issues (lxml rejects it on unicode input)
    if html_content.startswith('<?xml'):
        html_content = re.sub(r'<\?xml[^>]*\?>', '', html_content, count=1)
    tree = lxml_html.fromstring(html_content)
    # Find start element
    start_elements = tree.xpath(f'//*[@id="{boundary.anchor_id}"]')
    if not start_elements:
        return ""
    start_element = start_elements[0]
    # Collect content until we hit the end boundary (if specified)
    content_elements = []
    # If anchor has no siblings (nested in empty container), traverse up to
    # find a content container. This handles cases like
    # <div id="item7"><div></div></div> where content is after the container.
    current = start_element.getnext()
    if current is None:
        # No sibling - traverse up to find a container with siblings
        container = start_element.getparent()
        while container is not None and container.getnext() is None:
            container = container.getparent()
        # Start from the container's next sibling if found
        if container is not None:
            current = container.getnext()
    # Collect content from siblings
    if current is not None:
        # Normal case - anchor has siblings
        while current is not None:
            # Check if we've reached the end boundary
            if boundary.end_element_id:
                current_id = current.get('id', '')
                if current_id == boundary.end_element_id:
                    break
                # Also stop at a sibling section's anchor when subsections
                # are excluded
                if not include_subsections and self._is_sibling_section(current_id, boundary.name):
                    break
            content_elements.append(current)
            current = current.getnext()
    # Extract text from collected elements
    section_texts = []
    for element in content_elements:
        text = self._extract_element_text(element)
        if text.strip():
            section_texts.append(text)
    combined_text = '\n\n'.join(section_texts)
    # Apply cleaning if requested
    if clean:
        combined_text = self._clean_section_text(combined_text)
    return combined_text
def _is_sibling_section(self, element_id: str, current_section: str) -> bool:
"""Check if element ID represents a sibling section."""
if not element_id:
return False
# Check if this looks like another item at the same level
if 'item' in current_section.lower() and 'item' in element_id.lower():
current_item = re.search(r'item\s*(\d+)', current_section, re.IGNORECASE)
other_item = re.search(r'item[\s_]*(\d+)', element_id, re.IGNORECASE)
if current_item and other_item:
return current_item.group(1) != other_item.group(1)
return False
def _extract_element_text(self, element) -> str:
"""Extract clean text from an HTML element."""
# This would integrate with your existing text extraction logic
# For now, simple text extraction
return element.text_content() or ""
def _clean_section_text(self, text: str) -> str:
"""Clean extracted section text."""
# Apply the same cleaning as the main document
from edgar.documents.utils.anchor_cache import filter_with_cached_patterns
# Remove excessive whitespace
text = re.sub(r'\n\s*\n\s*\n', '\n\n', text)
# Filter navigation links
html_content = getattr(self.document.metadata, 'original_html', None)
if html_content:
text = filter_with_cached_patterns(text, html_content)
return text.strip()
def _extract_section_fallback(self, section_name: str, clean: bool) -> Optional[str]:
"""
Fallback section extraction using document nodes.
This is used when HTML-based extraction fails.
"""
# Search through document sections
for name, section in self.document.sections.items():
if section_name.lower() in name.lower():
return section.text(clean=clean)
return None
def get_section_info(self, section_name: str) -> Optional[Dict]:
    """
    Get detailed information about a section.

    Args:
        section_name: Section name to look up (normalized internally)

    Returns:
        Dict with section metadata, or None when the section is unknown
    """
    key = self._normalize_section_name(section_name)
    boundary = self.section_boundaries.get(key)
    if boundary is None:
        return None
    return {
        'name': boundary.name,
        'anchor_id': boundary.anchor_id,
        'available': True,
        # Length is not computed up front; callers extract text on demand.
        'estimated_length': None,
        'subsections': self._get_subsections(key)
    }
def _get_subsections(self, parent_section: str) -> List[str]:
"""
Get subsections of a parent section.
For example:
- "Item 1" has subsections "Item 1A", "Item 1B" (valid)
- "Item 1" does NOT have subsection "Item 10" (invalid - different item)
"""
subsections = []
# Look for sections that start with the parent name
for section_name in self.section_boundaries:
if section_name == parent_section:
continue
if section_name.startswith(parent_section):
# Check if this is a true subsection (e.g., Item 1A)
# vs a different section that happens to start with same prefix (e.g., Item 10)
remainder = section_name[len(parent_section):]
# Valid subsection patterns:
# - "Item 1A" (remainder: "A") - letter suffix
# - "Item 1 - Business" (remainder: " - Business") - has separator
# Invalid patterns:
# - "Item 10" (remainder: "0") - digit continues the number
if remainder and remainder[0].isalpha():
# Letter suffix like "A", "B" - valid subsection
subsections.append(section_name)
elif remainder and remainder[0] in [' ', '-', '.', ':']:
# Has separator - could be descriptive title
subsections.append(section_name)
# If remainder starts with digit, it's NOT a subsection (e.g., "Item 10")
return sorted(subsections)

View File

@@ -0,0 +1,318 @@
"""
Migration and compatibility layer for transitioning from old parser to new.
NOTE: This compatibility layer is documented for user migration from v1.x → v2.0
It is intentionally not used internally but kept for user convenience.
Do not remove without versioning consideration.
"""
from typing import Optional, List, Dict, Any
import warnings
from edgar.documents import HTMLParser, Document, ParserConfig
from edgar.documents.search import DocumentSearch
class LegacyHTMLDocument:
    """
    Compatibility wrapper that mimics the old Document API.

    This allows existing code to work with the new parser
    while providing deprecation warnings.
    """

    def __init__(self, new_document: Document):
        """Initialize with new document."""
        self._doc = new_document
        # Set to False to silence DeprecationWarnings (e.g. in bulk runs).
        self._warn_on_use = True

    def _deprecation_warning(self, old_method: str, new_method: str = None):
        """Issue deprecation warning pointing at the replacement API."""
        if self._warn_on_use:
            msg = f"Document.{old_method} is deprecated."
            if new_method:
                msg += f" Use {new_method} instead."
            # stacklevel=3: skip this helper and the wrapping property/method
            # so the warning points at the caller's code.
            warnings.warn(msg, DeprecationWarning, stacklevel=3)

    @property
    def text(self) -> str:
        """Get document text (old API)."""
        self._deprecation_warning("text", "Document.text()")
        return self._doc.text()

    def get_text(self, clean: bool = True) -> str:
        """Get text with options (old API).

        NOTE: `clean` is accepted for signature compatibility only; the
        new Document.text() applies its own cleaning and the flag is ignored.
        """
        self._deprecation_warning("get_text()", "Document.text()")
        return self._doc.text()

    @property
    def tables(self) -> List[Any]:
        """Get tables (old API)."""
        self._deprecation_warning("tables", "Document.tables")
        return self._doc.tables

    def find_all(self, tag: str) -> List[Any]:
        """Find elements by tag (old API).

        NOTE: all heading tags (h1-h3) map to the single HEADING node
        type, so find_all('h1') returns headings of every level.
        """
        self._deprecation_warning("find_all()", "Document.root.find()")
        # Map old tag names to node types
        from edgar.documents.types import NodeType
        tag_map = {
            'h1': NodeType.HEADING,
            'h2': NodeType.HEADING,
            'h3': NodeType.HEADING,
            'p': NodeType.PARAGRAPH,
            'table': NodeType.TABLE,
        }
        node_type = tag_map.get(tag.lower())
        if node_type:
            return self._doc.root.find(lambda n: n.type == node_type)
        return []

    def search(self, pattern: str) -> List[str]:
        """Search document (old API)."""
        self._deprecation_warning("search()", "DocumentSearch.search()")
        search = DocumentSearch(self._doc)
        results = search.search(pattern)
        return [r.text for r in results]

    @property
    def sections(self) -> Dict[str, Any]:
        """Get sections (old API), converted to the old dict format."""
        # Fixed: this was the only old-API member that issued no
        # deprecation warning, inconsistent with the rest of the wrapper.
        self._deprecation_warning("sections", "Document.sections")
        new_sections = self._doc.sections
        old_sections = {}
        for name, section in new_sections.items():
            old_sections[name] = {
                'title': section.title,
                'text': section.text(),
                'start': section.start_offset,
                'end': section.end_offset
            }
        return old_sections

    def to_markdown(self) -> str:
        """Convert to markdown (old API)."""
        self._deprecation_warning("to_markdown()", "MarkdownRenderer.render()")
        from edgar.documents.renderers import MarkdownRenderer
        renderer = MarkdownRenderer()
        return renderer.render(self._doc)
class LegacySECHTMLParser:
    """
    Compatibility wrapper for old SECHTMLParser.

    Translates old-style dict configuration and delegates all parsing to
    the new HTMLParser, wrapping results in LegacyHTMLDocument.
    """

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """Initialize with optional old-style config dict."""
        self._parser = HTMLParser(self._convert_config(config))
        self._warn_on_use = True

    def _convert_config(self, old_config: Optional[Dict[str, Any]]) -> ParserConfig:
        """Translate an old config dict into a ParserConfig."""
        converted = ParserConfig()
        if old_config:
            # Each old key, when present, overrides the matching new attribute.
            key_map = {
                'clean_text': 'clean_text',
                'extract_tables': 'table_extraction',
                'preserve_layout': 'preserve_whitespace',
            }
            for old_key, new_attr in key_map.items():
                if old_key in old_config:
                    setattr(converted, new_attr, old_config[old_key])
        return converted

    def parse(self, html: str) -> LegacyHTMLDocument:
        """Parse HTML (old API)."""
        if self._warn_on_use:
            warnings.warn(
                "SECHTMLParser is deprecated. Use HTMLParser instead.",
                DeprecationWarning,
                stacklevel=2
            )
        return LegacyHTMLDocument(self._parser.parse(html))

    def parse_file(self, filepath: str) -> LegacyHTMLDocument:
        """Parse HTML file (old API)."""
        if self._warn_on_use:
            warnings.warn(
                "SECHTMLParser.parse_file() is deprecated. Use HTMLParser.parse_file() instead.",
                DeprecationWarning,
                stacklevel=2
            )
        return LegacyHTMLDocument(self._parser.parse_file(filepath))
def migrate_parser_usage(code: str) -> str:
    """
    Rewrite code that targets the old parser API to the new API.

    Performs ordered, plain-text substitutions (imports, class names,
    method calls, config keys). The rule order matters: earlier rules
    must not produce text that later rules would mangle.

    Args:
        code: Python code using old parser

    Returns:
        Updated code using new parser
    """
    rules = (
        # Import statements
        ("from edgar.files.html import SECHTMLParser",
         "from edgar.documents import HTMLParser"),
        ("from edgar.files.html import Document",
         "from edgar.documents import Document"),
        # Class instantiation
        ("SECHTMLParser(", "HTMLParser("),
        # Method calls
        ("document.text", "document.text()"),
        ("document.get_text(", "document.text("),
        ("document.find_all(", "document.root.find(lambda n: n.tag == "),
        ("document.to_markdown(", "MarkdownRenderer().render(document"),
        # Config changes
        ("extract_tables=", "table_extraction="),
        ("preserve_layout=", "preserve_whitespace="),
    )
    result = code
    for old_text, new_text in rules:
        result = result.replace(old_text, new_text)
    return result
class MigrationGuide:
    """
    Provides migration guidance and utilities.
    """

    @staticmethod
    def check_compatibility(old_parser_instance) -> Dict[str, Any]:
        """
        Check if old parser instance can be migrated.

        NOTE(review): `old_parser_instance` is currently unused — the
        result is unconditionally "can migrate"; inspect the instance
        here if real compatibility checks are ever needed.

        Returns:
            Dict with compatibility info
        """
        return {
            'can_migrate': True,
            'warnings': [],
            'recommendations': [
                "Replace SECHTMLParser with HTMLParser",
                "Update document.text to document.text()",
                "Use DocumentSearch for search functionality",
                "Use MarkdownRenderer for markdown conversion"
            ]
        }

    @staticmethod
    def print_migration_guide():
        """Print the full old-API -> new-API migration guide to stdout."""
        guide = """
HTML Parser Migration Guide
==========================
The new HTML parser provides significant improvements:
- 10x performance improvement
- Better table parsing
- Reliable section detection
- Advanced search capabilities
Key Changes:
-----------
1. Imports:
OLD: from edgar.files.html import SECHTMLParser, Document
NEW: from edgar.documents import HTMLParser, Document
2. Parser Creation:
OLD: parser = SECHTMLParser()
NEW: parser = HTMLParser()
3. Document Text:
OLD: document.text or document.get_text()
NEW: document.text()
4. Search:
OLD: document.search(pattern)
NEW: search = DocumentSearch(document)
results = search.search(pattern)
5. Tables:
OLD: document.tables
NEW: document.tables (same, but returns richer TableNode objects)
6. Sections:
OLD: document.sections
NEW: document.sections (returns Section objects with more features)
7. Markdown:
OLD: document.to_markdown()
NEW: renderer = MarkdownRenderer()
markdown = renderer.render(document)
Compatibility:
-------------
For gradual migration, use the compatibility layer:
from edgar.documents.migration import LegacySECHTMLParser
parser = LegacySECHTMLParser() # Works like old parser
This will issue deprecation warnings to help you migrate.
Performance Config:
------------------
For best performance:
parser = HTMLParser.create_for_performance()
For best accuracy:
parser = HTMLParser.create_for_accuracy()
For AI/LLM processing:
parser = HTMLParser.create_for_ai()
"""
        print(guide)
# Compatibility aliases
# SECHTMLParser is deliberately NOT bound at module level: an eager alias
# would shadow the module-level __getattr__ hook (PEP 562), which only runs
# when normal attribute lookup fails, and whose purpose is to emit a
# DeprecationWarning when SECHTMLParser is imported from this module.
# Access still works: `from edgar.documents.migration import SECHTMLParser`
# resolves through __getattr__ and now actually warns.
HTMLDocument = LegacyHTMLDocument
# Auto-migration for common imports
def __getattr__(name):
    """PEP 562 module hook: serve deprecated names with a warning."""
    if name != "SECHTMLParser":
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
    warnings.warn(
        "Importing SECHTMLParser from edgar.documents.migration is deprecated. "
        "Use HTMLParser from edgar.documents instead.",
        DeprecationWarning,
        stacklevel=2
    )
    return LegacySECHTMLParser

View File

@@ -0,0 +1,160 @@
"""
Example showing how to migrate from old parser to new.
"""
def old_parser_example():
    """Example using old parser API.

    Parses a small filing through the compatibility layer; each old-API
    call below triggers a DeprecationWarning. Return values are discarded
    on purpose — this function only demonstrates the call shapes.
    """
    # This is how code might look with the old parser
    from edgar.documents.migration import SECHTMLParser  # Using compatibility layer

    # Create parser (old-style dict config)
    parser = SECHTMLParser({
        'extract_tables': True,
        'clean_text': True,
        'preserve_layout': False
    })

    # Parse HTML
    html = """
<html>
<body>
<h1>Item 1. Business</h1>
<p>We are a technology company.</p>
<table>
<tr><th>Year</th><th>Revenue</th></tr>
<tr><td>2023</td><td>$100M</td></tr>
</table>
</body>
</html>
"""
    document = parser.parse(html)

    # Old API usage (will show deprecation warnings)
    # Search
    document.search("revenue")

    # Convert to markdown
    document.to_markdown()
def new_parser_example():
    """Example using new parser API.

    Mirrors old_parser_example() with the v2 API directly; results are
    discarded — the point is the call shapes plus the v2-only features
    shown at the end.
    """
    # New imports
    from edgar.documents import DocumentSearch, HTMLParser, ParserConfig
    from edgar.documents.renderers import MarkdownRenderer

    # Create parser with new config
    config = ParserConfig(
        table_extraction=True,
        clean_text=True,
        preserve_whitespace=False,
        detect_sections=True
    )
    parser = HTMLParser(config)

    # Parse HTML
    html = """
<html>
<body>
<h1>Item 1. Business</h1>
<p>We are a technology company.</p>
<table>
<tr><th>Year</th><th>Revenue</th></tr>
<tr><td>2023</td><td>$100M</td></tr>
</table>
</body>
</html>
"""
    document = parser.parse(html)

    # New API usage
    # Search with new API
    search = DocumentSearch(document)
    search.search("revenue")

    # Convert to markdown with new API
    renderer = MarkdownRenderer()
    renderer.render(document)

    # New features not available in old parser
    # Advanced search
    search.find_tables(caption_pattern="Revenue")

    # Performance-optimized parser
    HTMLParser.create_for_performance()

    # Cache statistics
    from edgar.documents.utils import get_cache_manager
    get_cache_manager().get_stats()
def migration_comparison():
    """Show side-by-side comparison.

    NOTE(review): placeholder — the body was never implemented, so
    calling it is a no-op. Flesh it out or drop the call from __main__.
    """
def automatic_migration_example():
    """Show automatic code migration.

    Runs migrate_parser_usage() over a snippet of old-API code; the
    transformed source is discarded (demonstration only).
    """
    from edgar.documents.migration import migrate_parser_usage

    # Old-API code sample fed to the text-based migrator.
    old_code = '''
from edgar.files.html import SECHTMLParser, Document
def analyze_filing(html):
parser = SECHTMLParser({'extract_tables': True})
document = parser.parse(html)
# Get text
text = document.text
# Search for revenue
revenue_mentions = document.search("revenue")
# Convert to markdown
markdown = document.to_markdown()
return {
'text': text,
'revenue_mentions': revenue_mentions,
'markdown': markdown
}
'''
    migrate_parser_usage(old_code)
if __name__ == "__main__":
    # Run all examples in sequence as a smoke test of the migration layer.
    import warnings

    # Surface every DeprecationWarning the compatibility layer emits.
    warnings.filterwarnings('always', category=DeprecationWarning)

    # Run old parser example (will show warnings)
    old_parser_example()

    # Run new parser example
    new_parser_example()

    # Show comparison
    migration_comparison()

    # Show automatic migration
    automatic_migration_example()

    # Print full migration guide
    from edgar.documents.migration import MigrationGuide
    MigrationGuide.print_migration_guide()

View File

@@ -0,0 +1,456 @@
"""
Node hierarchy for the document tree.
"""
import uuid
from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from typing import List, Optional, Dict, Any, Callable, Iterator
from edgar.documents.types import NodeType, SemanticType, Style
from edgar.documents.cache_mixin import CacheableMixin
@dataclass
class Node(ABC):
    """
    Base node class for document tree.

    All nodes in the document inherit from this class and implement
    the abstract methods for text and HTML generation.
    """
    # Identity
    id: str = field(default_factory=lambda: str(uuid.uuid4()))
    type: NodeType = NodeType.DOCUMENT

    # Hierarchy (excluded from repr to avoid recursively huge output)
    parent: Optional['Node'] = field(default=None, repr=False)
    children: List['Node'] = field(default_factory=list, repr=False)

    # Content
    content: Any = None
    metadata: Dict[str, Any] = field(default_factory=dict)
    style: Style = field(default_factory=Style)

    # Semantic info
    semantic_type: Optional[SemanticType] = None
    semantic_role: Optional[str] = None

    def add_child(self, child: 'Node') -> None:
        """Add child node, maintaining parent reference."""
        child.parent = self
        self.children.append(child)

    def remove_child(self, child: 'Node') -> None:
        """Remove child node and clear its parent link (no-op if absent)."""
        if child in self.children:
            self.children.remove(child)
            child.parent = None

    def insert_child(self, index: int, child: 'Node') -> None:
        """Insert child at specific index, maintaining parent reference."""
        child.parent = self
        self.children.insert(index, child)

    @abstractmethod
    def text(self) -> str:
        """Extract text content from node and its children."""
        pass

    @abstractmethod
    def html(self) -> str:
        """Generate HTML representation of node."""
        pass

    def find(self, predicate: Callable[['Node'], bool]) -> List['Node']:
        """Find all nodes (including self) matching predicate, depth-first."""
        results = []
        if predicate(self):
            results.append(self)
        for child in self.children:
            results.extend(child.find(predicate))
        return results

    def find_first(self, predicate: Callable[['Node'], bool]) -> Optional['Node']:
        """Find first node (including self) matching predicate, depth-first."""
        if predicate(self):
            return self
        for child in self.children:
            result = child.find_first(predicate)
            if result:
                return result
        return None

    def xpath(self, expression: str) -> List['Node']:
        """
        Simple XPath-like node selection.

        Supports:
        - //node_type - all matching descendants (including self)
        - /node_type - direct children of type

        NOTE: attribute matching ([@attr=value]) is NOT implemented;
        any unrecognized expression returns an empty list.
        """
        # Simple implementation - can be extended
        if expression.startswith('//'):
            node_type = expression[2:].lower()
            return self.find(lambda n: n.type.name.lower() == node_type)
        elif expression.startswith('/'):
            node_type = expression[1:].lower()
            return [c for c in self.children if c.type.name.lower() == node_type]
        return []

    def walk(self) -> Iterator['Node']:
        """Walk the tree depth-first, yielding self before descendants."""
        yield self
        for child in self.children:
            yield from child.walk()

    @property
    def depth(self) -> int:
        """Get depth of node in tree (root has depth 0)."""
        depth = 0
        current = self.parent
        while current:
            depth += 1
            current = current.parent
        return depth

    @property
    def path(self) -> str:
        """Get '/'-joined node-type names from root down to this node."""
        parts = []
        current = self
        while current:
            parts.append(current.type.name)
            current = current.parent
        return '/'.join(reversed(parts))

    def get_metadata(self, key: str, default: Any = None) -> Any:
        """Get metadata value with default."""
        return self.metadata.get(key, default)

    def set_metadata(self, key: str, value: Any) -> None:
        """Set metadata value."""
        self.metadata[key] = value

    def has_metadata(self, key: str) -> bool:
        """Check if metadata key exists."""
        return key in self.metadata
@dataclass
class DocumentNode(Node, CacheableMixin):
    """Root document node."""
    type: NodeType = field(default=NodeType.DOCUMENT, init=False)

    def text(self) -> str:
        """Extract all text from document with caching.

        Top-level children are separated by a blank line; children that
        produce no text are skipped. The result is memoized via
        CacheableMixin._get_cached_text.
        """
        def _generate_text():
            parts = []
            for child in self.children:
                text = child.text()
                if text:
                    parts.append(text)
            return '\n\n'.join(parts)
        return self._get_cached_text(_generate_text)

    def html(self) -> str:
        """Generate a complete HTML document wrapping the children's HTML."""
        body_content = '\n'.join(child.html() for child in self.children)
        return f"""<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title>Document</title>
</head>
<body>
{body_content}
</body>
</html>"""
@dataclass
class TextNode(Node):
    """Leaf node holding plain text content."""
    type: NodeType = field(default=NodeType.TEXT, init=False)
    content: str = ""

    def text(self) -> str:
        """Return the raw text."""
        return self.content

    def html(self) -> str:
        """Return the text with HTML special characters escaped."""
        # '&' must be escaped first so it does not re-escape '&lt;'/'&gt;'.
        escaped = self.content.replace('&', '&amp;')
        escaped = escaped.replace('<', '&lt;').replace('>', '&gt;')
        return escaped
@dataclass
class ParagraphNode(Node, CacheableMixin):
    """Paragraph node."""
    type: NodeType = field(default=NodeType.PARAGRAPH, init=False)

    def text(self) -> str:
        """Extract paragraph text with intelligent spacing and caching.

        Children are concatenated without separators by default; a single
        space is inserted between adjacent children only when one of the
        heuristics below says the whitespace was semantically meaningful.
        The result is memoized via CacheableMixin._get_cached_text.
        """
        def _generate_text():
            parts = []
            for i, child in enumerate(self.children):
                text = child.text()
                if text:
                    # For the first child, just add the text
                    if i == 0:
                        parts.append(text)
                    else:
                        # For subsequent children, check if previous child had tail whitespace
                        prev_child = self.children[i - 1]
                        should_add_space = False

                        # Add space if previous child had tail whitespace
                        if hasattr(prev_child, 'get_metadata') and prev_child.get_metadata('has_tail_whitespace'):
                            should_add_space = True
                        # Add space if current text starts with space (preserve intended spacing)
                        elif text.startswith(' '):
                            should_add_space = True
                            # Remove the leading space from text since we're adding it as separation
                            text = text.lstrip()
                        # Add space if previous text ends with punctuation (sentence boundaries)
                        elif parts and parts[-1].rstrip()[-1:] in '.!?:;':
                            should_add_space = True
                        # Add space between adjacent inline elements if the current text starts with a letter/digit
                        # This handles cases where whitespace was stripped but spacing is semantically important
                        elif (text and text[0].isalpha() and
                              parts and parts[-1] and not parts[-1].endswith(' ') and
                              hasattr(child, 'get_metadata') and child.get_metadata('original_tag') in ['span', 'a', 'em', 'strong', 'i', 'b']):
                            should_add_space = True

                        if should_add_space:
                            parts.append(' ' + text)
                        else:
                            # Concatenate directly without space
                            if parts:
                                parts[-1] += text
                            else:
                                parts.append(text)
            return ''.join(parts)
        return self._get_cached_text(_generate_text)

    def html(self) -> str:
        """Generate paragraph HTML."""
        content = ''.join(child.html() for child in self.children)
        style_attr = self._generate_style_attr()
        return f'<p{style_attr}>{content}</p>'

    def _generate_style_attr(self) -> str:
        """Generate an inline style attribute from the style object ('' when unstyled)."""
        if not self.style:
            return ''
        styles = []
        if self.style.text_align:
            styles.append(f'text-align: {self.style.text_align}')
        if self.style.margin_top:
            styles.append(f'margin-top: {self.style.margin_top}px')
        if self.style.margin_bottom:
            styles.append(f'margin-bottom: {self.style.margin_bottom}px')
        if styles:
            return f' style="{"; ".join(styles)}"'
        return ''
@dataclass
class HeadingNode(Node):
    """Heading node carrying an outline level (1-6)."""
    type: NodeType = field(default=NodeType.HEADING, init=False)
    level: int = 1

    def text(self) -> str:
        """Return heading text: string content if set, else joined child text."""
        if isinstance(self.content, str):
            return self.content
        return ' '.join(t for t in (child.text() for child in self.children) if t)

    def html(self) -> str:
        """Render as <hN>, clamping the level into the valid 1-6 range."""
        tag_level = min(6, max(1, self.level))
        return f'<h{tag_level}{self._generate_style_attr()}>{self.text()}</h{tag_level}>'

    def _generate_style_attr(self) -> str:
        """Build an inline style attribute from text_align/color ('' when unstyled)."""
        declarations = []
        if self.style.text_align:
            declarations.append(f'text-align: {self.style.text_align}')
        if self.style.color:
            declarations.append(f'color: {self.style.color}')
        return f' style="{"; ".join(declarations)}"' if declarations else ''
@dataclass
class ContainerNode(Node, CacheableMixin):
    """Generic container node (div, section, etc.)."""
    type: NodeType = field(default=NodeType.CONTAINER, init=False)
    tag_name: str = 'div'

    def text(self) -> str:
        """Join non-empty child text blocks with newlines (cached)."""
        def build():
            return '\n'.join(t for t in (child.text() for child in self.children) if t)
        return self._get_cached_text(build)

    def html(self) -> str:
        """Render the container tag with its children's HTML inside."""
        inner = '\n'.join(child.html() for child in self.children)
        css_class = f' class="{self.semantic_role}"' if self.semantic_role else ''
        return f'<{self.tag_name}{self._generate_style_attr()}{css_class}>{inner}</{self.tag_name}>'

    def _generate_style_attr(self) -> str:
        """Build inline style from margins/padding ('' when none apply)."""
        if not self.style:
            return ''
        declarations = []
        if self.style.margin_top:
            declarations.append(f'margin-top: {self.style.margin_top}px')
        if self.style.margin_bottom:
            declarations.append(f'margin-bottom: {self.style.margin_bottom}px')
        if self.style.padding_left:
            declarations.append(f'padding-left: {self.style.padding_left}px')
        return f' style="{"; ".join(declarations)}"' if declarations else ''
@dataclass
class SectionNode(ContainerNode):
    """Document section node (renders as a <section> container)."""
    type: NodeType = field(default=NodeType.SECTION, init=False)
    # Logical name of the filing section (e.g. "Item 1"), when known.
    section_name: Optional[str] = None
    tag_name: str = field(default='section', init=False)

    def __post_init__(self):
        # Mirror the section name into metadata so generic node consumers
        # (which only inspect metadata) can identify the section.
        if self.section_name:
            self.set_metadata('section_name', self.section_name)
@dataclass
class ListNode(Node):
    """List node (ordered or unordered)."""
    type: NodeType = field(default=NodeType.LIST, init=False)
    ordered: bool = False

    def text(self) -> str:
        """Render items one per line; ordered lists get '1. '-style prefixes.

        Note: numbering follows child position, so children yielding no
        text leave a gap in the numbering.
        """
        rendered = []
        for index, item in enumerate(self.children, start=1):
            item_text = item.text()
            if not item_text:
                continue
            prefix = f"{index}. " if self.ordered else ""
            rendered.append(prefix + item_text)
        return '\n'.join(rendered)

    def html(self) -> str:
        """Render as <ol> or <ul> with one line per item."""
        tag = 'ol' if self.ordered else 'ul'
        body = '\n'.join(item.html() for item in self.children)
        return f'<{tag}>\n{body}\n</{tag}>'
@dataclass
class ListItemNode(Node):
    """List item node."""
    type: NodeType = field(default=NodeType.LIST_ITEM, init=False)

    def text(self) -> str:
        """Join non-empty child text with single spaces."""
        return ' '.join(t for t in (child.text() for child in self.children) if t)

    def html(self) -> str:
        """Render as <li> wrapping the concatenated child HTML."""
        return f"<li>{''.join(child.html() for child in self.children)}</li>"
@dataclass
class LinkNode(Node):
    """Hyperlink node."""
    type: NodeType = field(default=NodeType.LINK, init=False)
    href: Optional[str] = None
    title: Optional[str] = None

    def text(self) -> str:
        """Return link text: string content if set, else joined child text."""
        if isinstance(self.content, str):
            return self.content
        return ' '.join(t for t in (child.text() for child in self.children) if t)

    def html(self) -> str:
        """Render an <a> element with optional href/title attributes.

        NOTE(review): href/title are interpolated unescaped; a value
        containing '"' would break the attribute — confirm upstream
        sanitization.
        """
        attrs = ''
        if self.href:
            attrs += f' href="{self.href}"'
        if self.title:
            attrs += f' title="{self.title}"'
        return f'<a{attrs}>{self.text()}</a>'
@dataclass
class ImageNode(Node):
    """Image node."""
    type: NodeType = field(default=NodeType.IMAGE, init=False)
    src: Optional[str] = None
    alt: Optional[str] = None
    width: Optional[int] = None
    height: Optional[int] = None

    def text(self) -> str:
        """Images contribute their alt text (or '') to text extraction."""
        return self.alt or ''

    def html(self) -> str:
        """Render an <img> tag, emitting only the attributes that are set."""
        pieces = ['<img']
        # Attribute order matches the old renderer: src, alt, width, height.
        for attr_name in ('src', 'alt', 'width', 'height'):
            value = getattr(self, attr_name)
            if value:
                pieces.append(f' {attr_name}="{value}"')
        pieces.append('>')
        return ''.join(pieces)

View File

@@ -0,0 +1,387 @@
"""
Main HTML parser implementation.
"""
import time
from typing import List, Union
import lxml.html
from lxml import etree
from lxml.html import HtmlElement
from edgar.documents.config import ParserConfig
from edgar.documents.document import Document, DocumentMetadata
from edgar.documents.exceptions import (
HTMLParsingError, DocumentTooLargeError, InvalidConfigurationError
)
from edgar.documents.nodes import DocumentNode
from edgar.documents.processors.postprocessor import DocumentPostprocessor
from edgar.documents.processors.preprocessor import HTMLPreprocessor
from edgar.documents.strategies.document_builder import DocumentBuilder
from edgar.documents.types import XBRLFact
from edgar.documents.utils import get_cache_manager
from edgar.documents.utils.html_utils import remove_xml_declaration, create_lxml_parser
class HTMLParser:
"""
Main HTML parser class.
Orchestrates the parsing pipeline with configurable strategies
and processors.
"""
def __init__(self, config: ParserConfig = None):
    """
    Initialize parser with configuration.

    Args:
        config: Parser configuration (a default ParserConfig is used
            when omitted)
    """
    self.config = config or ParserConfig()
    self._validate_config()

    # Shared cache plus the fixed pre/post processing stages.
    self.cache_manager = get_cache_manager()
    self.preprocessor = HTMLPreprocessor(self.config)
    self.postprocessor = DocumentPostprocessor(self.config)

    # Optional strategies, switched on by config flags.
    self._init_strategies()
def _validate_config(self):
"""Validate configuration."""
if self.config.max_document_size <= 0:
raise InvalidConfigurationError("max_document_size must be positive")
if self.config.streaming_threshold and self.config.max_document_size:
if self.config.streaming_threshold > self.config.max_document_size:
raise InvalidConfigurationError(
"streaming_threshold cannot exceed max_document_size"
)
def _init_strategies(self):
    """Initialize parsing strategies based on configuration.

    Strategies are imported lazily so disabled features incur no import
    cost. Possible keys in self.strategies: 'header_detection',
    'table_processing', 'xbrl_extraction' — each present only when the
    corresponding config flag is on.
    """
    self.strategies = {}

    # Header detection strategy
    if self.config.detect_sections:
        from edgar.documents.strategies.header_detection import HeaderDetectionStrategy
        self.strategies['header_detection'] = HeaderDetectionStrategy(self.config)

    # Table processing strategy
    if self.config.table_extraction:
        from edgar.documents.strategies.table_processing import TableProcessor
        self.strategies['table_processing'] = TableProcessor(self.config)

    # XBRL extraction strategy
    if self.config.extract_xbrl:
        from edgar.documents.strategies.xbrl_extraction import XBRLExtractor
        self.strategies['xbrl_extraction'] = XBRLExtractor()
def parse(self, html: Union[str, bytes]) -> Document:
    """
    Parse HTML into Document.

    Args:
        html: HTML content as string or bytes

    Returns:
        Parsed Document object

    Raises:
        TypeError: If html is None or not str/bytes
        DocumentTooLargeError: If document exceeds size limit
        HTMLParsingError: If parsing fails
    """
    start_time = time.time()

    # Validate input type
    if html is None:
        raise TypeError("HTML input cannot be None")
    if not isinstance(html, (str, bytes)):
        raise TypeError(f"HTML must be string or bytes, got {type(html).__name__}")

    # Convert bytes to string if needed
    if isinstance(html, bytes):
        html = html.decode('utf-8', errors='replace')

    # Handle empty HTML: short-circuit with an empty document
    if not html.strip():
        root = DocumentNode()
        metadata = DocumentMetadata(
            size=0,
            parse_time=time.time() - start_time,
            parser_version="2.0.0"
        )
        return Document(root=root, metadata=metadata)

    # Enforce size limit on the encoded byte length
    doc_size = len(html.encode('utf-8'))
    if doc_size > self.config.max_document_size:
        raise DocumentTooLargeError(doc_size, self.config.max_document_size)

    # Large-but-allowed documents go through the streaming path
    if doc_size > self.config.streaming_threshold:
        return self._parse_streaming(html)

    try:
        # Store original HTML BEFORE preprocessing (needed for TOC analysis)
        original_html = html

        # Extract XBRL data BEFORE preprocessing (to preserve ix:hidden content)
        xbrl_facts = []
        if self.config.extract_xbrl:
            xbrl_facts = self._extract_xbrl_pre_process(html)

        # Preprocessing (will remove ix:hidden for rendering)
        html = self.preprocessor.process(html)

        # Parse with lxml
        tree = self._parse_html(html)

        # Extract metadata
        metadata = self._extract_metadata(tree, html)
        metadata.preserve_whitespace = self.config.preserve_whitespace

        # Store ORIGINAL unmodified HTML for section extraction (TOC analysis)
        # Must be the raw HTML before preprocessing
        metadata.original_html = original_html

        # Add XBRL facts to metadata if found
        if xbrl_facts:
            metadata.xbrl_data = {'facts': xbrl_facts}

        # Build document
        document = self._build_document(tree, metadata)

        # Store config reference for section extraction
        document._config = self.config

        # Postprocessing
        document = self.postprocessor.process(document)

        # Record parse stats
        document.metadata.parse_time = time.time() - start_time
        document.metadata.size = doc_size

        return document

    except Exception as e:
        if isinstance(e, (DocumentTooLargeError, HTMLParsingError)):
            raise
        # Chain the original exception (`from e`) so the root cause and
        # its traceback are preserved for debugging; previously the chain
        # was dropped.
        raise HTMLParsingError(
            f"Failed to parse HTML: {str(e)}",
            context={'error_type': type(e).__name__}
        ) from e
def _parse_html(self, html: str) -> HtmlElement:
    """Parse an HTML string into an lxml element tree.

    Uses a recovering parser and guarantees an <html> root element,
    wrapping fragment input in <html><body> when necessary.

    Raises:
        HTMLParsingError: If lxml fails even in recovery mode (chained
            to the underlying lxml exception).
    """
    try:
        # lxml.html.fromstring rejects XML declarations; strip them first
        html = remove_xml_declaration(html)

        parser = create_lxml_parser(
            remove_blank_text=not self.config.preserve_whitespace,
            remove_comments=True,
            recover=True,
            encoding='utf-8'
        )

        # Parse HTML
        tree = lxml.html.fromstring(html, parser=parser)

        # Ensure we have a proper document structure
        if tree.tag != 'html':
            # Wrap fragment in html/body if needed
            html_tree = lxml.html.Element('html')
            body = etree.SubElement(html_tree, 'body')
            body.append(tree)
            tree = html_tree

        return tree

    except Exception as e:
        # Chain the cause (`from e`) so lxml's error details are not lost;
        # previously the chain was dropped.
        raise HTMLParsingError(
            f"lxml parsing failed: {str(e)}",
            context={'parser': 'lxml.html'}
        ) from e
def _extract_metadata(self, tree: HtmlElement, html: str) -> DocumentMetadata:
    """Populate DocumentMetadata from meta tags, <title> and raw content."""
    metadata = DocumentMetadata()

    # Filing type from config wins (avoids expensive detection below)
    if self.config.form:
        metadata.form = self.config.form

    # Meta tags: map known names onto metadata attributes.
    meta_attr_map = {
        'company': 'company',
        'filing-type': 'form',
        'cik': 'cik',
        'filing-date': 'filing_date',
        'accession-number': 'accession_number',
    }
    for meta in tree.xpath('//meta'):
        attr = meta_attr_map.get(meta.get('name', '').lower())
        if attr:
            setattr(metadata, attr, meta.get('content', ''))

    # <title> like "APPLE INC - 10-K - 2023-09-30" fills remaining gaps.
    title_elem = tree.find('.//title')
    if title_elem is not None and title_elem.text:
        title_parts = title_elem.text.strip().split(' - ')
        if len(title_parts) >= 2:
            metadata.company = metadata.company or title_parts[0].strip()
            metadata.form = metadata.form or title_parts[1].strip()

    # Last resort: scan the first 1000 chars for a known form type.
    if not metadata.form:
        text_start = html[:1000].upper()
        for form_type in ['10-K', '10-Q', '8-K', 'DEF 14A', 'S-1']:
            if form_type in text_start:
                metadata.form = form_type
                break

    return metadata
def _build_document(self, tree: HtmlElement, metadata: DocumentMetadata) -> Document:
    """Construct a Document by running the configured builder over the lxml tree."""
    builder = DocumentBuilder(self.config, self.strategies)
    return Document(root=builder.build(tree), metadata=metadata)
def _parse_streaming(self, html: str) -> Document:
    """Delegate very large documents to the incremental streaming parser."""
    from edgar.documents.utils.streaming import StreamingParser
    return StreamingParser(self.config, self.strategies).parse(html)
def _extract_xbrl_pre_process(self, html: str) -> List[XBRLFact]:
    """
    Extract XBRL facts before preprocessing.

    This ensures we capture XBRL data from ix:hidden elements, which the
    preprocessor strips out before the main parse.

    Returns:
        List of extracted XBRLFact objects; empty list on any failure
        (extraction is best-effort and must not break parsing).
    """
    try:
        # Parse HTML without preprocessing to preserve all XBRL content
        # (comments and blank text are kept, unlike the main parse)
        parser = create_lxml_parser(
            remove_blank_text=False,
            remove_comments=False,
            recover=True,
            encoding='utf-8'
        )
        # Remove XML declaration if present
        html = remove_xml_declaration(html)
        tree = lxml.html.fromstring(html, parser=parser)
        # Use XBRL extractor (imported lazily to avoid import cycles)
        from edgar.documents.strategies.xbrl_extraction import XBRLExtractor
        extractor = XBRLExtractor()
        facts = []
        # Find all XBRL elements (including those in ix:hidden)
        # Simple approach: find all elements with ix: prefix
        for element in tree.iter():
            if element.tag and isinstance(element.tag, str) and 'ix:' in element.tag.lower():
                # Skip container elements; only fact-bearing local names are kept
                local_name = element.tag.split(':')[-1].lower() if ':' in element.tag else element.tag.lower()
                if local_name in ['nonnumeric', 'nonfraction', 'continuation', 'footnote', 'fraction']:
                    fact = extractor.extract_fact(element)
                    if fact:
                        # Mark if fact was in hidden section or header by
                        # walking ancestors until an ix:hidden/ix:header wrapper
                        # is found (both are flagged with the same 'hidden' key)
                        parent = element.getparent()
                        while parent is not None:
                            if parent.tag:
                                tag_lower = parent.tag.lower()
                                if 'ix:hidden' in tag_lower or 'ix:header' in tag_lower:
                                    fact.metadata = fact.metadata or {}
                                    fact.metadata['hidden'] = True
                                    break
                            parent = parent.getparent()
                        facts.append(fact)
        return facts
    except Exception as e:
        # Log error but don't fail parsing — XBRL data is supplementary
        import logging
        logging.warning(f"Failed to extract XBRL data: {e}")
        return []
def parse_file(self, file_path: str) -> Document:
    """
    Parse HTML from a file on disk.

    Args:
        file_path: Path to HTML file

    Returns:
        Parsed Document object; metadata.source records the path
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        content = handle.read()
    parsed = self.parse(content)
    parsed.metadata.source = file_path
    return parsed
def parse_url(self, url: str) -> Document:
    """
    Fetch a URL and parse its HTML content.

    Args:
        url: URL to fetch and parse

    Returns:
        Parsed Document object; metadata.url records the origin
    """
    import requests

    resp = requests.get(url, timeout=30)
    resp.raise_for_status()

    parsed = self.parse(resp.text)
    parsed.metadata.url = url
    return parsed
@classmethod
def create_for_performance(cls) -> 'HTMLParser':
    """Build a parser configured for maximum throughput."""
    return cls(ParserConfig.for_performance())
@classmethod
def create_for_accuracy(cls) -> 'HTMLParser':
    """Build a parser configured for maximum parsing fidelity."""
    return cls(ParserConfig.for_accuracy())
@classmethod
def create_for_ai(cls) -> 'HTMLParser':
    """Build a parser configured for downstream AI processing."""
    return cls(ParserConfig.for_ai())

View File

@@ -0,0 +1,11 @@
"""
Document processors for preprocessing and postprocessing.
"""
from edgar.documents.processors.preprocessor import HTMLPreprocessor
from edgar.documents.processors.postprocessor import DocumentPostprocessor
__all__ = [
'HTMLPreprocessor',
'DocumentPostprocessor'
]

View File

@@ -0,0 +1,283 @@
"""
Document postprocessor for final processing after parsing.
"""
from typing import List, Set
from edgar.documents.config import ParserConfig
from edgar.documents.document import Document
from edgar.documents.nodes import Node, TextNode, ParagraphNode, HeadingNode
from edgar.documents.types import NodeType
class DocumentPostprocessor:
    """
    Postprocesses parsed documents to improve quality.

    Handles:
    - Adjacent node merging
    - Empty node removal
    - Heading level normalization
    - Section detection enhancement
    - Metadata enrichment
    """

    def __init__(self, config: ParserConfig):
        """Initialize postprocessor with configuration."""
        self.config = config

    def process(self, document: Document) -> Document:
        """
        Postprocess document in place.

        Args:
            document: Parsed document

        Returns:
            Processed document (same object, mutated)
        """
        # Remove empty nodes first so later passes see a clean tree
        self._remove_empty_nodes(document.root)
        # Merge adjacent text nodes if configured
        if self.config.merge_adjacent_nodes:
            self._merge_adjacent_nodes(document.root)
        # Normalize heading levels
        self._normalize_heading_levels(document.root)
        # Enhance section detection if configured
        if self.config.detect_sections:
            self._enhance_sections(document)
        # Add document statistics
        self._add_statistics(document)
        # Validate document structure
        self._validate_structure(document)
        return document

    def _remove_empty_nodes(self, node: Node):
        """Remove empty nodes from tree (bottom-up, so containers emptied by
        the recursion are themselves removed by their parent)."""
        # Process children first (bottom-up)
        children_to_remove = []
        for child in node.children:
            self._remove_empty_nodes(child)
            # Check if child is empty
            if self._is_empty_node(child):
                children_to_remove.append(child)
        # Remove empty children (outside the iteration loop)
        for child in children_to_remove:
            node.remove_child(child)

    def _is_empty_node(self, node: Node) -> bool:
        """Check if node is empty and can be removed."""
        # Never remove table nodes
        if node.type == NodeType.TABLE:
            return False
        # Never remove nodes with metadata
        if node.metadata:
            return False
        # Check text nodes
        if isinstance(node, TextNode):
            return not node.text().strip()
        # Check other nodes with text content
        if hasattr(node, 'content') and isinstance(node.content, str):
            return not node.content.strip()
        # Check container nodes
        if not node.children:
            # Empty container with no children
            return True
        return False

    def _merge_adjacent_nodes(self, node: Node):
        """Merge adjacent text nodes with similar properties."""
        if not node.children:
            return
        # Process children first
        for child in node.children:
            self._merge_adjacent_nodes(child)
        # Merge adjacent text nodes
        merged_children = []
        i = 0
        while i < len(node.children):
            current = node.children[i]
            # Look for mergeable nodes
            if self._can_merge(current):
                # Collect all adjacent mergeable nodes
                # (each candidate is compared against the group's first node)
                merge_group = [current]
                j = i + 1
                while j < len(node.children) and self._can_merge_with(current, node.children[j]):
                    merge_group.append(node.children[j])
                    j += 1
                # Merge if we have multiple nodes
                if len(merge_group) > 1:
                    merged = self._merge_nodes(merge_group)
                    merged_children.append(merged)
                    i = j
                else:
                    merged_children.append(current)
                    i += 1
            else:
                merged_children.append(current)
                i += 1
        # Update children
        node.children = merged_children
        # Update parent references
        for child in node.children:
            child.parent = node

    def _can_merge(self, node: Node) -> bool:
        """Check if node can be merged."""
        # Only merge TextNodes, not ParagraphNodes
        return isinstance(node, TextNode) and not node.metadata

    def _can_merge_with(self, node1: Node, node2: Node) -> bool:
        """Check if two nodes can be merged."""
        # Must be same type
        if type(node1) != type(node2):
            return False
        # Must have compatible styles
        if not self._compatible_styles(node1.style, node2.style):
            return False
        # Must not have metadata
        if node1.metadata or node2.metadata:
            return False
        return True

    def _compatible_styles(self, style1, style2) -> bool:
        """Check if two styles are compatible for merging."""
        # For now, just check key properties
        return (
            style1.font_size == style2.font_size and
            style1.font_weight == style2.font_weight and
            style1.text_align == style2.text_align
        )

    def _merge_nodes(self, nodes: List[Node]) -> Node:
        """Merge multiple nodes into one (the first node absorbs the rest)."""
        if not nodes:
            return None
        # Use first node as base
        merged = nodes[0]
        # Merge content
        if isinstance(merged, TextNode):
            texts = [n.text() for n in nodes]
            merged.content = '\n'.join(texts)
        elif isinstance(merged, ParagraphNode):
            # Merge all children
            for node in nodes[1:]:
                merged.children.extend(node.children)
        return merged

    def _normalize_heading_levels(self, node: Node):
        """Normalize heading levels to ensure proper hierarchy."""
        # Collect all headings
        headings = []
        self._collect_headings(node, headings)
        if not headings:
            return
        # Analyze heading structure
        levels_used = set(h.level for h in headings)
        # If we're missing level 1, promote headings so the smallest
        # level in use becomes 1 and relative ordering is preserved
        if 1 not in levels_used and levels_used:
            min_level = min(levels_used)
            adjustment = min_level - 1
            for heading in headings:
                heading.level = max(1, heading.level - adjustment)

    def _collect_headings(self, node: Node, headings: List[HeadingNode]):
        """Collect all heading nodes (depth-first) into `headings`."""
        if isinstance(node, HeadingNode):
            headings.append(node)
        for child in node.children:
            self._collect_headings(child, headings)

    def _enhance_sections(self, document: Document):
        """Enhance section detection and metadata."""
        # Only extract sections eagerly if configured to do so
        if not self.config.eager_section_extraction:
            return
        # Force section extraction to populate cache
        _ = document.sections
        # Add section metadata to nodes
        for section_name, section in document.sections.items():
            # Add section name to all nodes in section
            for node in section.node.walk():
                node.set_metadata('section', section_name)

    def _add_statistics(self, document: Document):
        """Add document statistics to metadata."""
        stats = {
            'node_count': sum(1 for _ in document.root.walk()),
            'text_length': len(document.text()),
            'table_count': len(document.tables),
            'heading_count': len(document.headings),
        }
        # Only add section count if sections were extracted
        # (accessing document.sections would otherwise trigger extraction)
        if self.config.eager_section_extraction:
            stats['section_count'] = len(document.sections)
        document.metadata.statistics = stats

    def _validate_structure(self, document: Document):
        """Validate document structure and fix issues."""
        issues = []
        # Check for orphaned nodes
        for node in document.root.walk():
            if node != document.root and node.parent is None:
                issues.append(f"Orphaned node: {node.type}")
                # Fix by adding to root
                document.root.add_child(node)
        # Check for circular references
        visited = set()

        def check_cycles(node: Node, path: Set[str]):
            # `path` holds ids on the current root-to-node path;
            # `visited` prevents re-walking shared subtrees
            if node.id in path:
                issues.append(f"Circular reference detected: {node.type}")
                return
            path.add(node.id)
            visited.add(node.id)
            for child in node.children:
                if child.id not in visited:
                    check_cycles(child, path.copy())

        check_cycles(document.root, set())
        # Store validation results
        if issues:
            document.metadata.validation_issues = issues

View File

@@ -0,0 +1,242 @@
"""
HTML preprocessor for cleaning and normalizing HTML before parsing.
"""
import re
from edgar.documents.config import ParserConfig
from edgar.documents.utils.html_utils import remove_xml_declaration
class HTMLPreprocessor:
    """
    Preprocesses HTML to fix common issues and normalize content.

    Handles:
    - Character encoding issues (Windows-1252 stragglers, control chars)
    - Malformed HTML (unclosed void tags, nested paragraphs)
    - Excessive whitespace
    - Script/style/comment and ix:hidden/ix:header removal
    - Entity normalization
    """

    def __init__(self, config: 'ParserConfig'):
        """Initialize preprocessor with configuration."""
        self.config = config
        # Pre-compile regex patterns for performance
        self._compiled_patterns = self._compile_patterns()

    def _compile_patterns(self):
        """Pre-compile frequently used regex patterns."""
        return {
            # Encoding and cleanup
            'control_chars': re.compile(r'[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]'),
            # Script/style removal
            'script_tags': re.compile(r'<script[^>]*>.*?</script>', re.IGNORECASE | re.DOTALL),
            'style_tags': re.compile(r'<style[^>]*>.*?</style>', re.IGNORECASE | re.DOTALL),
            'link_tags': re.compile(r'<link[^>]*>', re.IGNORECASE),
            'comments': re.compile(r'<!--.*?-->', re.DOTALL),
            'ix_hidden': re.compile(r'<ix:hidden[^>]*>.*?</ix:hidden>', re.IGNORECASE | re.DOTALL),
            'ix_header': re.compile(r'<ix:header[^>]*>.*?</ix:header>', re.IGNORECASE | re.DOTALL),
            # Malformed tags
            'br_tags': re.compile(r'<br(?![^>]*/)>', re.IGNORECASE),
            'img_tags': re.compile(r'<img([^>]+)(?<!/)>', re.IGNORECASE),
            'input_tags': re.compile(r'<input([^>]+)(?<!/)>', re.IGNORECASE),
            'hr_tags': re.compile(r'<hr(?![^>]*/)>', re.IGNORECASE),
            'nested_p_open': re.compile(r'<p>\s*<p>', re.IGNORECASE),
            'nested_p_close': re.compile(r'</p>\s*</p>', re.IGNORECASE),
            # Whitespace normalization
            'multiple_spaces': re.compile(r'[ \t]+'),
            'multiple_newlines': re.compile(r'\n{3,}'),
            'spaces_around_tags': re.compile(r'\s*(<[^>]+>)\s*'),
            # Block element newlines - combined pattern for opening tags
            'block_open_tags': re.compile(
                r'(<(?:div|p|h[1-6]|table|tr|ul|ol|li|blockquote)[^>]*>)',
                re.IGNORECASE
            ),
            # Block element newlines - combined pattern for closing tags
            'block_close_tags': re.compile(
                r'(</(?:div|p|h[1-6]|table|tr|ul|ol|li|blockquote)>)',
                re.IGNORECASE
            ),
            # Empty tags removal - combined pattern for all removable tags
            'empty_tags': re.compile(
                r'<(?:span|div|p|font|b|i|u|strong|em)\b[^>]*>\s*</(?:span|div|p|font|b|i|u|strong|em)>',
                re.IGNORECASE
            ),
            'empty_self_closing': re.compile(
                r'<(?:span|div|p|font|b|i|u|strong|em)\b[^>]*/>\s*',
                re.IGNORECASE
            ),
            # Common issues
            'multiple_br': re.compile(r'(<br\s*/?>[\s\n]*){3,}', re.IGNORECASE),
            'space_before_punct': re.compile(r'\s+([.,;!?])'),
            'missing_space_after_punct': re.compile(r'([.,;!?])([A-Z])'),
        }

    def process(self, html: str) -> str:
        """
        Preprocess HTML content.

        Args:
            html: Raw HTML content

        Returns:
            Cleaned HTML ready for parsing
        """
        # Remove BOM if present
        if html.startswith('\ufeff'):
            html = html[1:]
        # Remove XML declaration if present
        html = remove_xml_declaration(html)
        # Fix common character encoding issues
        html = self._fix_encoding_issues(html)
        # Remove script and style tags
        html = self._remove_script_style(html)
        # Normalize entities
        html = self._normalize_entities(html)
        # Fix malformed tags
        html = self._fix_malformed_tags(html)
        # Normalize whitespace if not preserving
        if not self.config.preserve_whitespace:
            html = self._normalize_whitespace(html)
        # Remove empty tags
        html = self._remove_empty_tags(html)
        # Fix common HTML issues
        html = self._fix_common_issues(html)
        return html

    def _fix_encoding_issues(self, html: str) -> str:
        """Fix common character encoding issues."""
        # Replace stray Windows-1252 bytes with their intended characters.
        # Quotes are deliberately normalized to plain ASCII; bullet and
        # dashes map to their proper Unicode equivalents (mapping them to
        # empty strings would silently fuse adjacent words/numbers).
        replacements = {
            '\x91': "'",   # Left single quote
            '\x92': "'",   # Right single quote
            '\x93': '"',   # Left double quote
            '\x94': '"',   # Right double quote
            '\x95': '\u2022',  # Bullet
            '\x96': '\u2013',  # En dash
            '\x97': '\u2014',  # Em dash
            '\xa0': ' ',   # Non-breaking space
        }
        for old, new in replacements.items():
            html = html.replace(old, new)
        # Remove other control characters
        html = self._compiled_patterns['control_chars'].sub('', html)
        return html

    def _remove_script_style(self, html: str) -> str:
        """Remove script/style/link/comment and inline-XBRL hidden content."""
        # Use pre-compiled patterns for better performance
        html = self._compiled_patterns['script_tags'].sub('', html)
        html = self._compiled_patterns['style_tags'].sub('', html)
        html = self._compiled_patterns['link_tags'].sub('', html)
        html = self._compiled_patterns['comments'].sub('', html)
        html = self._compiled_patterns['ix_hidden'].sub('', html)
        html = self._compiled_patterns['ix_header'].sub('', html)
        return html

    def _normalize_entities(self, html: str) -> str:
        """Normalize HTML entities."""
        # Common entity replacements
        entities = {
            '&nbsp;': ' ',
            '&ensp;': ' ',
            '&emsp;': ' ',
            '&thinsp;': ' ',
            '&#160;': ' ',
            '&#32;': ' ',
            '&zwj;': '',    # Zero-width joiner
            '&zwnj;': '',   # Zero-width non-joiner
            '&#8203;': '',  # Zero-width space
        }
        for entity, replacement in entities.items():
            html = html.replace(entity, replacement)
        # Fix double-encoded entities
        html = html.replace('&amp;amp;', '&amp;')
        html = html.replace('&amp;nbsp;', ' ')
        html = html.replace('&amp;lt;', '&lt;')
        html = html.replace('&amp;gt;', '&gt;')
        return html

    def _fix_malformed_tags(self, html: str) -> str:
        """Fix common malformed tag issues (unclosed void tags, nested <p>)."""
        # Use pre-compiled patterns for better performance
        html = self._compiled_patterns['br_tags'].sub('<br/>', html)
        html = self._compiled_patterns['img_tags'].sub(r'<img\1/>', html)
        html = self._compiled_patterns['input_tags'].sub(r'<input\1/>', html)
        html = self._compiled_patterns['hr_tags'].sub('<hr/>', html)
        html = self._compiled_patterns['nested_p_open'].sub('<p>', html)
        html = self._compiled_patterns['nested_p_close'].sub('</p>', html)
        return html

    def _normalize_whitespace(self, html: str) -> str:
        """Normalize whitespace in HTML."""
        # Replace multiple spaces with single space
        html = self._compiled_patterns['multiple_spaces'].sub(' ', html)
        # Replace multiple newlines with double newline
        html = self._compiled_patterns['multiple_newlines'].sub('\n\n', html)
        # Remove spaces around tags
        html = self._compiled_patterns['spaces_around_tags'].sub(r'\1', html)
        # Add newlines around block elements for readability
        # Using combined patterns instead of looping over individual tags
        html = self._compiled_patterns['block_open_tags'].sub(r'\n\1', html)
        html = self._compiled_patterns['block_close_tags'].sub(r'\1\n', html)
        # Clean up excessive newlines (apply again after adding newlines)
        html = self._compiled_patterns['multiple_newlines'].sub('\n\n', html)
        return html.strip()

    def _remove_empty_tags(self, html: str) -> str:
        """Remove empty tags that don't contribute content."""
        # Use pre-compiled combined patterns instead of looping
        html = self._compiled_patterns['empty_tags'].sub('', html)
        html = self._compiled_patterns['empty_self_closing'].sub('', html)
        return html

    def _fix_common_issues(self, html: str) -> str:
        """Fix other common HTML issues."""
        # Use pre-compiled patterns for better performance
        html = self._compiled_patterns['multiple_br'].sub('<br/><br/>', html)
        html = self._compiled_patterns['space_before_punct'].sub(r'\1', html)
        html = self._compiled_patterns['missing_space_after_punct'].sub(r'\1 \2', html)
        # Remove zero-width spaces (simple string replace is faster than regex)
        html = html.replace('\u200b', '')
        html = html.replace('\ufeff', '')
        # Fix common typos in tags (simple string replace is faster than regex)
        html = html.replace('<tabel', '<table')
        html = html.replace('</tabel>', '</table>')
        return html

View File

@@ -0,0 +1,34 @@
"""
Advanced ranking functionality for edgar.documents.
This package provides BM25-based ranking with semantic structure awareness
and intelligent index caching for performance optimization.
"""
from edgar.documents.ranking.ranking import (
RankingAlgorithm,
RankingEngine,
BM25Engine,
HybridEngine,
SemanticEngine,
RankedResult,
)
from edgar.documents.ranking.cache import (
SearchIndexCache,
CacheEntry,
get_search_cache,
set_search_cache,
)
__all__ = [
'RankingAlgorithm',
'RankingEngine',
'BM25Engine',
'HybridEngine',
'SemanticEngine',
'RankedResult',
'SearchIndexCache',
'CacheEntry',
'get_search_cache',
'set_search_cache',
]

View File

@@ -0,0 +1,311 @@
"""
Search index caching for performance optimization.
Provides memory and disk caching with LRU eviction and TTL expiration.
"""
from dataclasses import dataclass, field
from datetime import datetime, timedelta
from pathlib import Path
from typing import Optional, Dict, Any, List
import hashlib
import pickle
import logging
logger = logging.getLogger(__name__)
@dataclass
class CacheEntry:
    """
    Cached search index entry.

    Stores pre-built search indices for a document along with metadata
    for cache management (access tracking, TTL).
    """
    document_hash: str                       # cache key (see SearchIndexCache.compute_document_hash)
    index_data: Dict[str, Any]               # Serialized BM25 index data
    created_at: datetime                     # creation time; basis for TTL expiry
    access_count: int = 0                    # number of cache hits recorded for this entry
    last_accessed: Optional[datetime] = None # time of most recent hit, None if never read
    metadata: Dict[str, Any] = field(default_factory=dict)  # free-form extra info
class SearchIndexCache:
    """
    Manages search index caching with memory + disk storage.

    Features:
    - In-memory LRU cache for fast access
    - Optional disk persistence for reuse across sessions
    - TTL-based expiration
    - Access statistics tracking

    Parameters:
        memory_cache_size: Maximum entries in memory (default: 10)
        disk_cache_enabled: Enable disk persistence (default: True)
        cache_dir: Directory for disk cache (default: ~/.edgar_cache/search)
        ttl_hours: Time-to-live for cached entries (default: 24)
    """

    # Named logger; class-level so the class is self-contained.
    _log = logging.getLogger(__name__)

    def __init__(self,
                 memory_cache_size: int = 10,
                 disk_cache_enabled: bool = True,
                 cache_dir: Optional[Path] = None,
                 ttl_hours: int = 24):
        """Initialize cache and create the disk cache directory if enabled."""
        self.memory_cache_size = memory_cache_size
        self.disk_cache_enabled = disk_cache_enabled
        self.cache_dir = cache_dir or Path.home() / ".edgar_cache" / "search"
        self.ttl = timedelta(hours=ttl_hours)
        # In-memory cache; _access_order lists keys oldest-first (LRU order)
        # and is kept in strict sync with _memory_cache (no duplicates).
        self._memory_cache: Dict[str, 'CacheEntry'] = {}
        self._access_order: List[str] = []
        # Statistics
        self._hits = 0
        self._misses = 0
        # Create cache directory
        if disk_cache_enabled:
            self.cache_dir.mkdir(parents=True, exist_ok=True)

    def compute_document_hash(self, document_id: str, content_sample: str) -> str:
        """
        Compute cache key from document identifiers.

        Uses document ID (e.g., accession number) and a content sample
        to create a unique, stable hash.

        Args:
            document_id: Unique document identifier
            content_sample: Sample of document content for verification

        Returns:
            16-character hex hash
        """
        content = f"{document_id}:{content_sample}"
        return hashlib.sha256(content.encode()).hexdigest()[:16]

    def get(self, document_hash: str) -> Optional['CacheEntry']:
        """
        Get cached entry.

        Tries memory cache first, then disk cache. Updates LRU order
        and access statistics.

        Args:
            document_hash: Cache key

        Returns:
            CacheEntry if found and not expired, None otherwise
        """
        # Try memory cache first
        if document_hash in self._memory_cache:
            entry = self._memory_cache[document_hash]
            # Check TTL
            if datetime.now() - entry.created_at > self.ttl:
                # Expired - remove from cache (including its LRU slot)
                self._evict_memory(document_hash)
                self._misses += 1
                return None
            # Update access tracking
            entry.access_count += 1
            entry.last_accessed = datetime.now()
            # Move key to the most-recently-used end
            if document_hash in self._access_order:
                self._access_order.remove(document_hash)
            self._access_order.append(document_hash)
            self._hits += 1
            self._log.debug(f"Cache hit (memory): {document_hash}")
            return entry

        # Try disk cache
        if self.disk_cache_enabled:
            entry = self._load_from_disk(document_hash)
            if entry:
                # Check TTL
                if datetime.now() - entry.created_at > self.ttl:
                    # Expired - delete file
                    self._delete_from_disk(document_hash)
                    self._misses += 1
                    return None
                # Promote to memory cache
                self._put_memory(document_hash, entry)
                self._hits += 1
                self._log.debug(f"Cache hit (disk): {document_hash}")
                return entry

        self._misses += 1
        self._log.debug(f"Cache miss: {document_hash}")
        return None

    def put(self, document_hash: str, entry: 'CacheEntry') -> None:
        """
        Cache entry in memory and optionally on disk.

        Args:
            document_hash: Cache key
            entry: Entry to cache
        """
        # Put in memory cache
        self._put_memory(document_hash, entry)
        # Put in disk cache
        if self.disk_cache_enabled:
            self._save_to_disk(document_hash, entry)
        self._log.debug(f"Cached entry: {document_hash}")

    def _put_memory(self, document_hash: str, entry: 'CacheEntry') -> None:
        """Put entry in memory cache with LRU eviction."""
        # Replacing an existing key must not leave a duplicate LRU slot,
        # which would cause premature eviction of live entries.
        if document_hash in self._memory_cache:
            self._evict_memory(document_hash)
        # Evict oldest entries while the cache is full
        while len(self._memory_cache) >= self.memory_cache_size:
            if self._access_order:
                oldest = self._access_order.pop(0)
                self._evict_memory(oldest)
            else:
                break
        self._memory_cache[document_hash] = entry
        self._access_order.append(document_hash)

    def _evict_memory(self, document_hash: str) -> None:
        """Evict entry from memory cache and drop its LRU slot."""
        if document_hash in self._memory_cache:
            del self._memory_cache[document_hash]
            self._log.debug(f"Evicted from memory: {document_hash}")
        # Keep _access_order consistent with _memory_cache
        if document_hash in self._access_order:
            self._access_order.remove(document_hash)

    def _load_from_disk(self, document_hash: str) -> Optional['CacheEntry']:
        """Load entry from disk cache."""
        cache_file = self.cache_dir / f"{document_hash}.pkl"
        if not cache_file.exists():
            return None
        try:
            with open(cache_file, 'rb') as f:
                # NOTE: pickle is only safe here because the cache dir is
                # written exclusively by this process/user; never point
                # cache_dir at untrusted data.
                entry = pickle.load(f)
            return entry
        except Exception as e:
            self._log.warning(f"Failed to load cache from disk: {e}")
            # Delete corrupted file
            try:
                cache_file.unlink()
            except OSError:
                pass
            return None

    def _save_to_disk(self, document_hash: str, entry: 'CacheEntry') -> None:
        """Save entry to disk cache."""
        cache_file = self.cache_dir / f"{document_hash}.pkl"
        try:
            with open(cache_file, 'wb') as f:
                pickle.dump(entry, f)
        except Exception as e:
            self._log.warning(f"Failed to save cache to disk: {e}")

    def _delete_from_disk(self, document_hash: str) -> None:
        """Delete entry from disk cache."""
        cache_file = self.cache_dir / f"{document_hash}.pkl"
        try:
            if cache_file.exists():
                cache_file.unlink()
        except Exception as e:
            self._log.warning(f"Failed to delete cache file: {e}")

    def clear(self, memory_only: bool = False) -> None:
        """
        Clear cache.

        Args:
            memory_only: If True, only clear memory cache (keep disk)
        """
        self._memory_cache.clear()
        self._access_order.clear()
        self._log.info("Cleared memory cache")
        if not memory_only and self.disk_cache_enabled:
            try:
                for cache_file in self.cache_dir.glob("*.pkl"):
                    cache_file.unlink()
                self._log.info("Cleared disk cache")
            except Exception as e:
                self._log.warning(f"Failed to clear disk cache: {e}")

    def get_stats(self) -> Dict[str, Any]:
        """
        Get cache statistics.

        Returns:
            Dictionary with hit/miss counts, entry counts and size estimate
        """
        disk_entries = 0
        if self.disk_cache_enabled:
            try:
                disk_entries = len(list(self.cache_dir.glob("*.pkl")))
            except OSError:
                pass
        total_requests = self._hits + self._misses
        hit_rate = self._hits / total_requests if total_requests > 0 else 0.0
        return {
            "memory_entries": len(self._memory_cache),
            "disk_entries": disk_entries,
            "total_accesses": sum(e.access_count for e in self._memory_cache.values()),
            "cache_hits": self._hits,
            "cache_misses": self._misses,
            "hit_rate": hit_rate,
            "memory_size_mb": self._estimate_cache_size()
        }

    def _estimate_cache_size(self) -> float:
        """Estimate memory cache size in MB (shallow sizeof of index data)."""
        try:
            import sys
            total_bytes = sum(
                sys.getsizeof(entry.index_data)
                for entry in self._memory_cache.values()
            )
            return total_bytes / (1024 * 1024)
        except Exception:
            # Rough estimate if sys.getsizeof fails
            return len(self._memory_cache) * 5.0  # Assume ~5MB per entry
# Process-wide singleton cache instance (lazily created)
_global_cache: Optional[SearchIndexCache] = None


def get_search_cache() -> SearchIndexCache:
    """
    Return the global search cache, creating it on first use.

    Returns:
        Global SearchIndexCache instance
    """
    global _global_cache
    if _global_cache is None:
        _global_cache = SearchIndexCache()
    return _global_cache
def set_search_cache(cache: Optional[SearchIndexCache]) -> None:
    """
    Install a specific cache as the global instance.

    Useful for testing or custom cache configuration; passing None resets
    the singleton so the next get_search_cache() builds a fresh one.

    Args:
        cache: Cache instance to use globally (None to disable)
    """
    global _global_cache
    _global_cache = cache

View File

@@ -0,0 +1,187 @@
"""
Text preprocessing for search.
Provides tokenization and text normalization for BM25 and semantic analysis.
"""
import re
from typing import List, Set
# Common English stopwords (minimal set for financial documents).
# We keep many financial terms that might be stopwords in other contexts.
# Only consulted by tokenize(remove_stopwords=True).
STOPWORDS: Set[str] = {
    'a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'for',
    'from', 'has', 'he', 'in', 'is', 'it', 'its', 'of', 'on',
    'that', 'the', 'to', 'was', 'will', 'with'
}
def preprocess_text(text: str,
                    lowercase: bool = True,
                    remove_punctuation: bool = False) -> str:
    """
    Preprocess text for search.

    Args:
        text: Raw text
        lowercase: Convert to lowercase (important for BM25 matching)
        remove_punctuation: Strip punctuation (usually kept so tokens
            like "$5B" or "Item 1A" survive)

    Returns:
        Preprocessed text with collapsed whitespace
    """
    if not text:
        return ""
    # Collapse all runs of whitespace to single spaces
    cleaned = ' '.join(text.split())
    if lowercase:
        cleaned = cleaned.lower()
    if remove_punctuation:
        cleaned = re.sub(r'[^\w\s]', ' ', cleaned)
        cleaned = ' '.join(cleaned.split())
    return cleaned
def tokenize(text: str,
             remove_stopwords: bool = False,
             min_token_length: int = 2) -> List[str]:
    """
    Tokenize text for BM25 indexing.

    Keeps '$' and '%' so financial tokens like '$5b' and '15%' survive.
    (The previous pattern wrapped the class in \\b anchors; a word
    boundary never matches next to a non-word character, so leading '$'
    and trailing '%' were silently stripped from every token.)

    Args:
        text: Text to tokenize
        remove_stopwords: Remove common stopwords
        min_token_length: Minimum token length to keep

    Returns:
        List of lowercase tokens
    """
    if not text:
        return []
    # Runs of alphanumerics plus $ and % — no \b anchors, which would
    # exclude the special characters at token edges.
    tokens = re.findall(r'[\w$%]+', text.lower())
    # Filter by length
    tokens = [t for t in tokens if len(t) >= min_token_length]
    # Optionally remove stopwords
    if remove_stopwords:
        tokens = [t for t in tokens if t not in STOPWORDS]
    return tokens
def extract_query_terms(query: str) -> List[str]:
    """
    Extract important terms from query for boosting.

    Identifies key financial terms, numbers, and important phrases.

    Args:
        query: Search query

    Returns:
        List of important query terms (case-insensitively deduplicated,
        original order preserved)
    """
    # Tokenize
    tokens = tokenize(query, remove_stopwords=True)

    important = []

    # Financial amounts: $5B, $1.2M, etc.
    important.extend(re.findall(r'\$[\d,.]+[BMK]?', query, re.IGNORECASE))

    # Percentages: 15%, 3.5%
    important.extend(re.findall(r'\d+\.?\d*%', query))

    # Years: 2023, 2024. The group must be non-capturing: with a
    # capturing group, re.findall returns only the group text
    # ('19'/'20') instead of the full four-digit year.
    important.extend(re.findall(r'\b(?:19|20)\d{2}\b', query))

    # Item references: Item 1A, Item 7
    important.extend(re.findall(r'item\s+\d+[a-z]?', query, re.IGNORECASE))

    # Add all tokens
    important.extend(tokens)

    # Remove duplicates while preserving order
    seen = set()
    result = []
    for term in important:
        term_lower = term.lower()
        if term_lower not in seen:
            seen.add(term_lower)
            result.append(term)
    return result
def normalize_financial_term(term: str) -> str:
    """
    Normalize a financial term for consistent matching.

    Examples:
        "$5 billion" -> "$5b"
        "5,000,000"  -> "5000000"
        "Item 1A"    -> "item1a"

    Args:
        term: Financial term

    Returns:
        Normalized term
    """
    normalized = term.lower().strip()
    # Remove thousands separators
    normalized = normalized.replace(',', '')
    # Collapse magnitude words into single-letter suffixes
    for word, suffix in (('billion', 'b'), ('million', 'm'), ('thousand', 'k')):
        normalized = re.sub(r'\s*' + word + r'\b', suffix, normalized)
    # Fuse references like "item 1a" / "section 2" into one token
    normalized = re.sub(r'(item|section|part)\s+(\d+[a-z]?)', r'\1\2', normalized)
    # Collapse any remaining whitespace runs
    return ' '.join(normalized.split())
def get_ngrams(tokens: List[str], n: int = 2) -> List[str]:
    """
    Generate n-grams from tokens.

    Useful for phrase matching in BM25.

    Args:
        tokens: List of tokens
        n: N-gram size

    Returns:
        List of n-grams, each joined with single spaces; empty when
        fewer than n tokens are given
    """
    if len(tokens) < n:
        return []
    return [' '.join(tokens[start:start + n])
            for start in range(len(tokens) - n + 1)]

View File

@@ -0,0 +1,401 @@
"""
Ranking engines for document search.
Provides BM25-based ranking with optional semantic structure boosting.
"""
from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from enum import Enum, auto
from typing import List, Optional, Dict, Any, TYPE_CHECKING
from rank_bm25 import BM25Okapi
if TYPE_CHECKING:
from edgar.documents.nodes import Node
class RankingAlgorithm(Enum):
    """Supported ranking algorithms (selects which RankingEngine is used)."""
    BM25 = auto()      # Classic BM25 (Okapi variant)
    HYBRID = auto()    # BM25 + Semantic structure boosting
    SEMANTIC = auto()  # Pure structure-aware scoring
@dataclass
class RankedResult:
    """
    A single search hit together with its relevance scoring.

    Attributes:
        node: Document node containing the match
        score: Relevance score (higher is better)
        rank: Position in results (1-indexed)
        text: Matched text content
        bm25_score: Raw BM25 score (if applicable)
        semantic_score: Semantic boost score (if applicable)
        metadata: Additional result metadata
    """
    node: 'Node'
    score: float
    rank: int
    text: str
    bm25_score: Optional[float] = None
    semantic_score: Optional[float] = None
    metadata: Dict[str, Any] = field(default_factory=dict)

    @property
    def snippet(self) -> str:
        """Text snippet capped at 200 characters (ellipsized when longer)."""
        return self.text if len(self.text) <= 200 else self.text[:197] + "..."
class RankingEngine(ABC):
    """Abstract base class for ranking engines (BM25, hybrid, semantic)."""

    @abstractmethod
    def rank(self, query: str, nodes: List['Node']) -> List[RankedResult]:
        """
        Rank nodes by relevance to query.

        Args:
            query: Search query
            nodes: Nodes to rank

        Returns:
            List of ranked results sorted by relevance (best first)
        """
        pass

    @abstractmethod
    def get_algorithm_name(self) -> str:
        """Get name of ranking algorithm."""
        pass
class BM25Engine(RankingEngine):
    """
    BM25 ranking engine using Okapi variant.

    BM25 is a probabilistic retrieval function that ranks documents based on
    query term frequency and inverse document frequency. Well-suited for
    financial documents where exact term matching is important.

    Parameters:
        k1: Term frequency saturation parameter (default: 1.5)
            Controls how quickly term frequency impact plateaus.
        b: Length normalization parameter (default: 0.75)
            0 = no normalization, 1 = full normalization.
    """
    def __init__(self, k1: float = 1.5, b: float = 0.75):
        """
        Initialize BM25 engine.

        Args:
            k1: Term frequency saturation (1.2-2.0 typical)
            b: Length normalization (0.75 is standard)
        """
        self.k1 = k1
        self.b = b
        # Index state is built lazily on the first rank() call and reused
        # while the same node list keeps being passed in.
        self._bm25: Optional[BM25Okapi] = None
        self._corpus_nodes: Optional[List['Node']] = None
        self._tokenized_corpus: Optional[List[List[str]]] = None
    def rank(self, query: str, nodes: List['Node']) -> List[RankedResult]:
        """
        Rank nodes using BM25 algorithm.

        Args:
            query: Search query
            nodes: Nodes to rank

        Returns:
            Ranked results sorted by BM25 score; nodes with a zero score
            are omitted entirely.
        """
        if not nodes:
            return []
        # Import preprocessing here to avoid circular dependency
        from edgar.documents.ranking.preprocessing import preprocess_text, tokenize
        # Build index if needed or if nodes changed.
        # NOTE(review): element-wise list comparison -- relies on Node.__eq__
        # semantics and costs O(n) per call; confirm an identity check was
        # not intended for this cache.
        if self._corpus_nodes != nodes:
            self._build_index(nodes)
        # Tokenize and preprocess query
        query_tokens = tokenize(preprocess_text(query))
        if not query_tokens:
            return []
        # Get BM25 scores (one per corpus document, aligned with `nodes`)
        scores = self._bm25.get_scores(query_tokens)
        # Create ranked results
        results = []
        for idx, (node, score) in enumerate(zip(nodes, scores)):
            if score > 0:  # Only include nodes with positive scores
                text = node.text() if hasattr(node, 'text') else str(node)
                results.append(RankedResult(
                    node=node,
                    score=float(score),
                    rank=0,  # Will be set after sorting
                    text=text,
                    bm25_score=float(score),
                    metadata={'algorithm': 'BM25'}
                ))
        # Sort by score (highest first) and assign ranks
        results.sort(key=lambda r: r.score, reverse=True)
        for rank, result in enumerate(results, start=1):
            result.rank = rank
        return results
    def _build_index(self, nodes: List['Node']):
        """Build BM25 index from nodes.

        Tokenizes each node's text and feeds the whole corpus to BM25Okapi
        with the configured k1/b parameters.
        """
        from edgar.documents.ranking.preprocessing import preprocess_text, tokenize
        # Store corpus
        self._corpus_nodes = nodes
        # Tokenize all nodes
        self._tokenized_corpus = []
        for node in nodes:
            text = node.text() if hasattr(node, 'text') else str(node)
            processed = preprocess_text(text)
            tokens = tokenize(processed)
            self._tokenized_corpus.append(tokens)
        # Build BM25 index with custom parameters
        self._bm25 = BM25Okapi(
            self._tokenized_corpus,
            k1=self.k1,
            b=self.b
        )
    def get_index_data(self) -> Dict[str, Any]:
        """
        Serialize index data for caching.

        NOTE(review): if called before rank()/_build_index(), the
        'tokenized_corpus' entry is None -- confirm callers build first.

        Returns:
            Dictionary with serializable index data
        """
        return {
            'tokenized_corpus': self._tokenized_corpus,
            'k1': self.k1,
            'b': self.b,
            'algorithm': 'BM25'
        }
    def load_index_data(self, index_data: Dict[str, Any], nodes: List['Node']) -> None:
        """
        Load index from cached data.

        Args:
            index_data: Serialized index data (as produced by get_index_data)
            nodes: Nodes corresponding to the index; must align one-to-one
                with the cached tokenized corpus.
        """
        self._corpus_nodes = nodes
        self._tokenized_corpus = index_data['tokenized_corpus']
        self.k1 = index_data['k1']
        self.b = index_data['b']
        # Rebuild BM25 index from tokenized corpus
        self._bm25 = BM25Okapi(
            self._tokenized_corpus,
            k1=self.k1,
            b=self.b
        )
    def get_algorithm_name(self) -> str:
        """Get algorithm name."""
        return "BM25"
class HybridEngine(RankingEngine):
    """
    Hybrid ranking engine: BM25 + Semantic structure boosting.

    Combines classic BM25 text matching with semantic structure awareness:
    - BM25 provides strong exact-match ranking for financial terms
    - Semantic scoring boosts results based on document structure:
      * Headings and section markers
      * Cross-references ("See Item X")
      * Gateway content (summaries, overviews)
      * Table and XBRL importance

    This approach is agent-friendly: it surfaces starting points for
    investigation rather than fragmented chunks.

    Parameters:
        bm25_weight: Weight for BM25 score (default: 0.8)
        semantic_weight: Weight for semantic score (default: 0.2)
        k1: BM25 term frequency saturation
        b: BM25 length normalization
    """
    def __init__(self,
                 bm25_weight: float = 0.8,
                 semantic_weight: float = 0.2,
                 k1: float = 1.5,
                 b: float = 0.75,
                 boost_sections: Optional[List[str]] = None):
        """
        Initialize hybrid engine.

        Args:
            bm25_weight: Weight for BM25 component (0-1)
            semantic_weight: Weight for semantic component (0-1)
            k1: BM25 k1 parameter
            b: BM25 b parameter
            boost_sections: Section names to boost (e.g., ["Risk Factors"])

        Raises:
            ValueError: if the two weights do not sum to 1.0 (within a
                small floating-point tolerance).
        """
        self.bm25_engine = BM25Engine(k1=k1, b=b)
        self.bm25_weight = bm25_weight
        self.semantic_weight = semantic_weight
        self.boost_sections = boost_sections or []
        # Validate weights
        total_weight = bm25_weight + semantic_weight
        if not (0.99 <= total_weight <= 1.01):  # Allow small floating point error
            raise ValueError(f"Weights must sum to 1.0, got {total_weight}")
    def rank(self, query: str, nodes: List['Node']) -> List[RankedResult]:
        """
        Rank nodes using hybrid approach.

        Starts from the BM25 result list (so a node with zero BM25 score
        never appears, whatever its semantic score), normalizes BM25 scores
        to 0-1, then blends in the semantic score with the configured
        weights. The RankedResult objects from the BM25 engine are mutated
        in place.

        Args:
            query: Search query
            nodes: Nodes to rank

        Returns:
            Ranked results with combined BM25 + semantic scores
        """
        if not nodes:
            return []
        # Get BM25 results
        bm25_results = self.bm25_engine.rank(query, nodes)
        if not bm25_results:
            return []
        # Import semantic scoring
        from edgar.documents.ranking.semantic import compute_semantic_scores
        # Get semantic scores for all nodes
        semantic_scores_dict = compute_semantic_scores(
            nodes=nodes,
            query=query,
            boost_sections=self.boost_sections
        )
        # Normalize BM25 scores to 0-1 range.
        # max_bm25 is always > 0 here because BM25Engine only emits
        # positive-score results.
        max_bm25 = max(r.bm25_score for r in bm25_results)
        if max_bm25 > 0:
            for result in bm25_results:
                result.bm25_score = result.bm25_score / max_bm25
        # Combine scores
        for result in bm25_results:
            semantic_score = semantic_scores_dict.get(id(result.node), 0.0)
            result.semantic_score = semantic_score
            # Weighted combination
            result.score = (
                self.bm25_weight * result.bm25_score +
                self.semantic_weight * semantic_score
            )
            result.metadata['algorithm'] = 'Hybrid'
            result.metadata['bm25_weight'] = self.bm25_weight
            result.metadata['semantic_weight'] = self.semantic_weight
        # Re-sort by combined score
        bm25_results.sort(key=lambda r: r.score, reverse=True)
        # Update ranks
        for rank, result in enumerate(bm25_results, start=1):
            result.rank = rank
        return bm25_results
    def get_algorithm_name(self) -> str:
        """Get algorithm name."""
        return "Hybrid"
class SemanticEngine(RankingEngine):
    """
    Pure semantic/structure-based ranking (no text matching).

    Scores nodes purely by structural importance -- section headings,
    cross-references, gateway content, and document-structure position.
    Useful for understanding document organization without a specific
    query in mind.
    """

    def __init__(self, boost_sections: Optional[List[str]] = None):
        """
        Initialize semantic engine.

        Args:
            boost_sections: Section names to boost
        """
        self.boost_sections = boost_sections or []

    def rank(self, query: str, nodes: List['Node']) -> List[RankedResult]:
        """
        Rank nodes by structural importance.

        Args:
            query: Search query (passed through for context-aware boosts)
            nodes: Nodes to rank

        Returns:
            Results with positive semantic scores, best first
        """
        if not nodes:
            return []

        from edgar.documents.ranking.semantic import compute_semantic_scores

        score_by_id = compute_semantic_scores(
            nodes=nodes,
            query=query,
            boost_sections=self.boost_sections
        )

        ranked = []
        for candidate in nodes:
            weight = score_by_id.get(id(candidate), 0.0)
            if weight <= 0:
                continue
            preview = candidate.text() if hasattr(candidate, 'text') else str(candidate)
            ranked.append(RankedResult(
                node=candidate,
                score=weight,
                rank=0,  # assigned below once sorted
                text=preview,
                semantic_score=weight,
                metadata={'algorithm': 'Semantic'}
            ))

        ranked.sort(key=lambda hit: hit.score, reverse=True)
        for position, hit in enumerate(ranked, start=1):
            hit.rank = position
        return ranked

    def get_algorithm_name(self) -> str:
        """Get algorithm name."""
        return "Semantic"

View File

@@ -0,0 +1,333 @@
"""
Semantic scoring for document structure awareness.
Provides structure-based boosting without ML/embeddings:
- Node type importance (headings, tables, XBRL)
- Cross-reference detection (gateway content)
- Section importance
- Text quality signals
This is NOT embedding-based semantic search. It's structure-aware ranking
that helps agents find investigation starting points.
"""
import re
from typing import List, Dict, Optional, TYPE_CHECKING
if TYPE_CHECKING:
from edgar.documents.nodes import Node
from edgar.documents.types import NodeType, SemanticType
# Gateway terms that indicate summary/overview content
# (matched as case-insensitive substrings of node text).
GATEWAY_TERMS: List[str] = [
    'summary', 'overview', 'introduction', 'highlights',
    'key points', 'executive summary', 'in summary',
    'table of contents', 'index'
]
# Cross-reference patterns: regexes (applied against lower-cased text)
# that mark "gateway" content pointing at other sections of the filing.
CROSS_REFERENCE_PATTERNS: List[str] = [
    r'\bsee\s+item\s+\d+[a-z]?\b',  # "See Item 1A"
    r'\bsee\s+(?:part|section)\s+\d+\b',  # "See Part II"
    r'\brefer\s+to\s+item\s+\d+[a-z]?\b',  # "Refer to Item 7"
    r'\bas\s+discussed\s+in\s+item\s+\d+\b',  # "As discussed in Item 1"
    r'\bfor\s+(?:more|additional)\s+information\b',  # "For more information"
]
# Section importance weights: boosts keyed by lower-case substrings of
# section names (first matching key wins in _get_section_boost).
SECTION_IMPORTANCE: Dict[str, float] = {
    'risk factors': 1.5,
    'management discussion': 1.4,
    'md&a': 1.4,
    'business': 1.3,
    'financial statements': 1.2,
    'controls and procedures': 1.2,
}
def compute_semantic_scores(nodes: List['Node'],
                            query: str,
                            boost_sections: Optional[List[str]] = None) -> Dict[int, float]:
    """
    Compute structure-aware scores for nodes, keyed by id(node).

    Each node's raw score is the sum of independent signals -- node type,
    semantic type, cross-references, gateway wording, section importance,
    XBRL presence, text quality, and (for "Item N" style queries) an extra
    item-header boost -- normalized into the 0-1 range.

    Args:
        nodes: Nodes to score
        query: Search query (enables query-aware boosting)
        boost_sections: Additional section names to boost

    Returns:
        Dictionary mapping id(node) -> normalized semantic score (0-1)
    """
    boost_sections = boost_sections or []
    query_lower = query.lower()
    # "Item 1A"-style queries get an extra boost for matching headers.
    item_query = re.search(r'item\s+\d+[a-z]?', query_lower) is not None

    scores: Dict[int, float] = {}
    for node in nodes:
        signals = [
            _get_node_type_boost(node),
            _get_semantic_type_boost(node),
            _detect_cross_references(node),
            _detect_gateway_content(node, query_lower),
            _get_section_boost(node, boost_sections),
            _get_xbrl_boost(node),
            _get_quality_boost(node),
        ]
        if item_query:
            signals.append(_get_item_header_boost(node))
        # Maximum achievable raw score is roughly 7.0; clamp into 0-1.
        scores[id(node)] = min(sum(signals) / 7.0, 1.0)
    return scores
def _get_node_type_boost(node: 'Node') -> float:
    """
    Boost based on node type.

    Headings and structural elements matter most for navigation; plain
    text matters least.
    NOTE(review): NodeType appears to be imported under TYPE_CHECKING only
    but is used here at runtime -- verify the module-level import.
    """
    kind = node.type
    if kind == NodeType.HEADING:
        return 2.0    # headings are key navigation points
    if kind == NodeType.SECTION:
        return 1.5    # section markers
    if kind == NodeType.TABLE:
        return 1.0    # structured data
    if kind == NodeType.XBRL_FACT:
        return 0.8    # financial facts
    if kind == NodeType.LIST:
        return 0.5
    if kind == NodeType.PARAGRAPH:
        return 0.3
    if kind == NodeType.TEXT:
        return 0.1
    return 0.0
def _get_semantic_type_boost(node: 'Node') -> float:
    """
    Boost based on semantic type.

    Item and section headers are the most important anchors in SEC
    filings. Nodes without a semantic type get no boost.
    """
    semantic = getattr(node, 'semantic_type', None)
    if semantic is None:
        return 0.0
    weights = {
        SemanticType.ITEM_HEADER: 2.0,        # item headers are critical
        SemanticType.SECTION_HEADER: 1.5,     # section headers
        SemanticType.FINANCIAL_STATEMENT: 1.2,
        SemanticType.TABLE_OF_CONTENTS: 1.0,  # TOC is a gateway
        SemanticType.TITLE: 0.8,
        SemanticType.HEADER: 0.6,
    }
    return weights.get(semantic, 0.0)
def _detect_cross_references(node: 'Node') -> float:
    """
    Detect cross-references that indicate gateway content.

    Content that points at other sections ("See Item 1A", ...) is useful
    for navigation; each distinct matching pattern adds 0.5, capped at 1.5.
    """
    raw = node.text() if hasattr(node, 'text') else ''
    if not raw:
        return 0.0
    lowered = raw.lower()
    hits = sum(1 for pattern in CROSS_REFERENCE_PATTERNS if re.search(pattern, lowered))
    return min(hits * 0.5, 1.5)
def _detect_gateway_content(node: 'Node', query_lower: str) -> float:
    """
    Detect gateway content (summaries, overviews, introductions).

    These are excellent starting points for investigation. A gateway term
    anywhere in the text scores 1.0; short paragraphs with descriptive
    verbs ("provides", "describes", ...) score 0.5.
    """
    raw = node.text() if hasattr(node, 'text') else ''
    if not raw:
        return 0.0
    lowered = raw.lower()
    if any(term in lowered for term in GATEWAY_TERMS):
        return 1.0
    # Short intro paragraphs are often summaries.
    if 20 < len(raw) < 200 and any(
            verb in lowered for verb in ('provides', 'describes', 'includes', 'contains')):
        return 0.5
    return 0.0
def _get_section_boost(node: 'Node', boost_sections: List[str]) -> float:
    """
    Boost nodes that live in important sections.

    Built-in SECTION_IMPORTANCE weights win over user-supplied
    boost_sections (which score a flat 1.5).
    """
    section = _get_node_section(node)
    if not section:
        return 0.0
    lowered = section.lower()
    # Built-in importance table first.
    for name, weight in SECTION_IMPORTANCE.items():
        if name in lowered:
            return weight
    # Then any caller-requested sections.
    if any(wanted.lower() in lowered for wanted in boost_sections):
        return 1.5
    return 0.0
def _get_xbrl_boost(node: 'Node') -> float:
    """
    Boost XBRL facts and tables carrying XBRL data.

    Financial data matters for financial queries: bare XBRL facts score
    0.8; tables flagged with 'has_xbrl' metadata score 0.6.
    """
    if node.type == NodeType.XBRL_FACT:
        return 0.8
    if node.type == NodeType.TABLE and hasattr(node, 'metadata') and node.metadata.get('has_xbrl'):
        return 0.6
    return 0.0
def _get_quality_boost(node: 'Node') -> float:
    """
    Boost based on text quality signals.

    Higher quality content tends to be more useful:
    - Appropriate length (50-1000 chars scores 0.3; longer scores 0.1)
    - Sentence structure (two or more sentence terminators adds 0.2)
    - Substantive content (pure formatting/navigation strings score 0.0)

    Args:
        node: Node whose text() output is assessed (missing/empty text
            scores 0.0)

    Returns:
        Quality score in 0.0-0.5
    """
    text = node.text() if hasattr(node, 'text') else ''
    if not text:
        return 0.0

    # Reject pure formatting/navigation artifacts up front.
    # (Set membership replaces the original list, which listed '' twice;
    # the early return also skips pointless scoring work.)
    if text.strip() in {'...', '', '-', 'Table of Contents', 'Page'}:
        return 0.0

    score = 0.0

    # Length signal
    text_len = len(text)
    if 50 <= text_len <= 1000:
        score += 0.3  # good length: substantive but focused
    elif text_len > 1000:
        score += 0.1  # long, but might be comprehensive
    # < 50 chars: too short, likely not substantive -- no boost.

    # Sentence structure: multiple sentences indicate real prose.
    if text.count('.') + text.count('?') + text.count('!') >= 2:
        score += 0.2

    return score
def _get_item_header_boost(node: 'Node') -> float:
    """
    Boost Item headers when the query is about items.

    An "Item 1A" query should prioritize the actual "Item 1A" heading;
    only HEADING nodes whose text starts with "Item <number>" qualify.
    """
    if node.type != NodeType.HEADING:
        return 0.0
    heading = node.text() if hasattr(node, 'text') else ''
    if heading and re.match(r'^\s*item\s+\d+[a-z]?[:\.\s]', heading, re.IGNORECASE):
        return 1.5
    return 0.0
def _get_node_section(node: 'Node') -> Optional[str]:
    """
    Resolve the section a node belongs to.

    Prefers an explicit 'section' entry in the node's metadata, then walks
    up the parent chain looking for a section/item header ancestor.

    Returns:
        Section name if found, None otherwise
    """
    # Explicit metadata wins.
    if hasattr(node, 'metadata') and 'section' in node.metadata:
        return node.metadata['section']

    # Otherwise climb the tree looking for a section marker.
    current = node
    while current:
        if hasattr(current, 'semantic_type') and current.semantic_type in (
                SemanticType.SECTION_HEADER, SemanticType.ITEM_HEADER):
            return current.text() if hasattr(current, 'text') else None
        current = getattr(current, 'parent', None)
    return None
def get_section_importance_names() -> List[str]:
    """
    List the section names that carry a built-in importance boost.

    Returns:
        Keys of SECTION_IMPORTANCE, in definition order.
    """
    return [*SECTION_IMPORTANCE]

View File

@@ -0,0 +1,13 @@
"""
Document renderers for various output formats.
"""
from edgar.documents.renderers.markdown import MarkdownRenderer
from edgar.documents.renderers.text import TextRenderer
from edgar.documents.renderers.fast_table import FastTableRenderer
__all__ = [
'MarkdownRenderer',
'TextRenderer',
'FastTableRenderer'
]

View File

@@ -0,0 +1,669 @@
"""
Fast table renderer for edgar.documents - optimized for performance.
This module provides a high-performance alternative to Rich table rendering
while maintaining professional output quality and readability.
Performance target: ~32x faster than Rich rendering (0.2ms vs 6.5ms per table)
"""
from dataclasses import dataclass
from typing import List, Dict, Optional, Union, Tuple
from enum import Enum
class Alignment(Enum):
    """Column alignment options for rendered table cells."""
    LEFT = "left"      # default for text columns
    RIGHT = "right"    # auto-applied to mostly-numeric columns
    CENTER = "center"  # supported by _format_row, never auto-detected
@dataclass
class ColumnConfig:
    """Configuration for a table column.

    NOTE(review): not referenced by the visible rendering code in this
    module -- presumably consumed elsewhere; verify before removing.
    """
    alignment: Alignment = Alignment.LEFT  # cell alignment within the column
    min_width: int = 8                     # floor for the rendered width
    max_width: Optional[int] = None        # optional cap; None = unlimited
    padding: int = 1                       # spaces either side of content
@dataclass
class TableStyle:
    """Table styling configuration used by FastTableRenderer."""
    border_char: str = "|"        # column separator; "" disables borders
    header_separator: str = "-"   # header underline char; "" suppresses the line
    corner_char: str = "+"
    padding: int = 1              # spaces either side of cell content
    min_col_width: int = 8
    max_col_width: int = 50
    @classmethod
    def pipe_table(cls) -> 'TableStyle':
        """Markdown-compatible pipe table style."""
        return cls(
            border_char="|",
            header_separator="-",
            corner_char="|",
            padding=1,
            min_col_width=8,
            max_col_width=50
        )
    @classmethod
    def minimal(cls) -> 'TableStyle':
        """Minimal table style with spacing only."""
        return cls(
            border_char="",
            header_separator="",
            corner_char="",
            padding=2,
            min_col_width=6,
            max_col_width=40
        )
    @classmethod
    def simple(cls) -> 'TableStyle':
        """
        Simple table style matching Rich's box.SIMPLE.

        Features:
        - No outer border
        - No column separators
        - Space-separated columns with generous padding
        - Clean, professional appearance

        This style balances visual quality and performance, matching Rich's
        box.SIMPLE aesthetic while keeping rendering fast.

        NOTE(review): the original comments advertised a single horizontal
        line under the header, but header_separator is the empty string, so
        no separator line is emitted (see _build_table). Possibly a lost
        Unicode line character -- confirm the intended value.
        """
        return cls(
            border_char="",       # no pipes/borders
            header_separator="",  # empty -> header separator suppressed
            corner_char="",       # no corners
            padding=2,            # generous spacing (pipe_table uses 1)
            min_col_width=6,      # slightly relaxed (pipe_table uses 8)
            max_col_width=60      # raised from 50 for wider columns
        )
class FastTableRenderer:
"""
High-performance table renderer optimized for speed.
Features:
- 30x+ faster than Rich table rendering
- Professional, readable output
- Configurable alignment and styling
- Handles complex SEC filing table structures
- Markdown-compatible output
- Memory efficient
"""
def __init__(self, style: Optional[TableStyle] = None):
"""Initialize renderer with optional style configuration."""
self.style = style or TableStyle.pipe_table()
# Pre-compile format strings for performance
self._format_cache = {}
def render_table_node(self, table_node) -> str:
"""
Render a TableNode to text format with proper colspan/rowspan handling.
Args:
table_node: TableNode instance from edgar.documents
Returns:
Formatted table string
"""
from edgar.documents.utils.table_matrix import TableMatrix
# Build matrix to handle colspan/rowspan properly
# This ensures cells are expanded to fill their full colspan/rowspan
matrix = TableMatrix()
matrix.build_from_rows(table_node.headers, table_node.rows)
# Extract headers from expanded matrix
headers = []
if table_node.headers:
for row_idx in range(len(table_node.headers)):
expanded_row = matrix.get_expanded_row(row_idx)
# Convert Cell objects to strings, handling None values
row_texts = [cell.text().strip() if cell else '' for cell in expanded_row]
headers.append(row_texts)
# Extract data rows from expanded matrix
rows = []
start_row = len(table_node.headers) if table_node.headers else 0
for row_idx in range(start_row, matrix.row_count):
expanded_row = matrix.get_expanded_row(row_idx)
# Convert Cell objects to strings, handling None values
row_texts = [cell.text().strip() if cell else '' for cell in expanded_row]
rows.append(row_texts)
# Render the table
table_text = self.render_table_data(headers, rows)
# Add caption if present (matches Rich renderer behavior)
if hasattr(table_node, 'caption') and table_node.caption:
return f"{table_node.caption}\n{table_text}"
return table_text
    def render_table_data(self, headers: List[List[str]], rows: List[List[str]]) -> str:
        """
        Render table data with headers and rows.

        Pipeline (order matters): drop spacing-only columns, merge related
        columns (e.g. a '$' column with the amount column next to it), then
        compute widths/alignments on the surviving data and emit the text.

        Args:
            headers: List of header rows (for multi-row headers)
            rows: List of data rows

        Returns:
            Formatted table string; "" when nothing renders
        """
        if not headers and not rows:
            return ""
        # Determine column count from all rows (headers + data)
        all_rows = headers + rows if headers else rows
        if not all_rows:
            return ""
        max_cols = max(len(row) for row in all_rows) if all_rows else 0
        if max_cols == 0:
            return ""
        # Filter out empty/spacing columns
        meaningful_columns = self._identify_meaningful_columns(all_rows, max_cols)
        if not meaningful_columns:
            return ""
        # Filter all rows (both headers and data) to only meaningful columns
        filtered_headers = [self._filter_row_to_columns(row, meaningful_columns) for row in headers] if headers else []
        filtered_rows = [self._filter_row_to_columns(row, meaningful_columns) for row in rows]
        # Post-process to merge related columns (e.g., currency symbols with amounts)
        # Apply to all rows including headers
        all_filtered = filtered_headers + filtered_rows
        if all_filtered:
            # Merge using first filtered row as reference
            _, all_merged = self._merge_related_columns(all_filtered[0], all_filtered)
            # Split back into headers and data
            if filtered_headers:
                filtered_headers = all_merged[:len(filtered_headers)]
                filtered_rows = all_merged[len(filtered_headers):]
            else:
                filtered_rows = all_merged
        # Recalculate with filtered and merged data
        filtered_all_rows = filtered_headers + filtered_rows if filtered_headers else filtered_rows
        filtered_max_cols = max(len(row) for row in filtered_all_rows) if filtered_all_rows else 0
        # Calculate optimal column widths for filtered columns
        col_widths = self._calculate_column_widths(filtered_all_rows, filtered_max_cols)
        # Detect column alignments based on filtered content
        alignments = self._detect_alignments(filtered_all_rows, filtered_max_cols)
        # Build table with filtered data - pass headers as multiple rows
        return self._build_table(filtered_headers, filtered_rows, col_widths, alignments)
def _combine_headers(self, headers: List[List[str]]) -> List[str]:
"""
Combine multi-row headers intelligently.
For SEC tables, this prioritizes specific dates/periods over generic labels.
"""
if not headers:
return []
if len(headers) == 1:
return headers[0]
# Determine max columns across all header rows
max_cols = max(len(row) for row in headers) if headers else 0
combined = [""] * max_cols
for col in range(max_cols):
# Collect all values for this column
values = []
for header_row in headers:
if col < len(header_row) and header_row[col].strip():
values.append(header_row[col].strip())
if values:
# Prioritize date-like values over generic terms
date_values = [v for v in values if self._looks_like_date(v)]
if date_values:
combined[col] = date_values[0]
elif len(values) == 1:
combined[col] = values[0]
else:
# Skip generic terms like "Year Ended" if we have something more specific
specific_values = [v for v in values
if v.lower() not in {'year ended', 'years ended', 'period ended'}]
combined[col] = specific_values[0] if specific_values else values[0]
return combined
def _looks_like_date(self, text: str) -> bool:
"""Quick date detection for header processing."""
if not text or len(text) < 4:
return False
text_lower = text.lower().replace('\n', ' ').strip()
# Common date indicators
date_indicators = [
'january', 'february', 'march', 'april', 'may', 'june',
'july', 'august', 'september', 'october', 'november', 'december',
'20', '19', # Year prefixes
]
return any(indicator in text_lower for indicator in date_indicators) and \
any(c.isdigit() for c in text)
def _identify_meaningful_columns(self, all_rows: List[List[str]], max_cols: int) -> List[int]:
"""
Identify columns that contain meaningful content (not just spacing).
Returns:
List of column indices that have meaningful content
"""
column_scores = []
for col_idx in range(max_cols):
content_score = 0
total_rows = 0
# Score each column based on content quality
for row in all_rows:
if col_idx < len(row):
total_rows += 1
cell_content = str(row[col_idx]).strip()
if cell_content:
# Higher score for longer, more substantial content
if len(cell_content) >= 3: # Substantial content
content_score += 3
elif len(cell_content) == 2 and cell_content.isalnum():
content_score += 2
elif len(cell_content) == 1 and (cell_content.isalnum() or cell_content == '$'):
content_score += 1
# Skip single spaces, dashes, or other likely spacing characters
# Calculate average score per row for this column
avg_score = content_score / max(total_rows, 1)
column_scores.append((col_idx, avg_score, content_score))
# Sort by score descending
column_scores.sort(key=lambda x: x[1], reverse=True)
# Take columns with meaningful content (score >= 0.5 or among top columns)
meaningful_columns = []
for col_idx, avg_score, total_score in column_scores:
# Include if it has good average score or significant total content
if avg_score >= 0.5 or total_score >= 5:
meaningful_columns.append(col_idx)
# Limit to reasonable number of columns for readability
if len(meaningful_columns) >= 8:
break
# Sort by original column order
meaningful_columns.sort()
return meaningful_columns
def _filter_row_to_columns(self, row: List[str], column_indices: List[int]) -> List[str]:
"""
Filter a row to only include the specified column indices.
Args:
row: Original row data
column_indices: List of column indices to keep
Returns:
Filtered row with only the specified columns
"""
if not row:
return []
filtered_row = []
for col_idx in column_indices:
if col_idx < len(row):
filtered_row.append(row[col_idx])
else:
filtered_row.append("") # Missing column
return filtered_row
    def _merge_related_columns(self, headers: List[str], rows: List[List[str]]) -> tuple:
        """
        Merge related columns (e.g., currency symbols with their amounts).

        Operates on copies: the caller's headers/rows are left untouched.
        Merges are applied right-to-left so earlier pair indices stay valid
        while columns are popped.
        NOTE(review): overlapping pairs such as (0,1) and (1,2) can both be
        detected before any merge is applied -- verify that is intended.

        Returns:
            Tuple of (merged_headers, merged_rows)
        """
        if not rows or not any(rows):
            return headers, rows
        # Find columns that should be merged
        merge_pairs = []
        max_cols = max(len(row) for row in [headers] + rows if row) if rows else len(headers) if headers else 0
        for col_idx in range(max_cols - 1):
            # Check if this column and the next should be merged
            should_merge = self._should_merge_columns(headers, rows, col_idx, col_idx + 1)
            if should_merge:
                merge_pairs.append((col_idx, col_idx + 1))
        # Apply merges (from right to left to avoid index shifting)
        merged_headers = headers[:] if headers else []
        merged_rows = [row[:] for row in rows]
        for left_idx, right_idx in reversed(merge_pairs):
            # Merge headers
            if merged_headers and left_idx < len(merged_headers) and right_idx < len(merged_headers):
                left_header = merged_headers[left_idx].strip()
                right_header = merged_headers[right_idx].strip()
                merged_header = f"{left_header} {right_header}".strip()
                merged_headers[left_idx] = merged_header
                merged_headers.pop(right_idx)
            # Merge rows
            for row in merged_rows:
                if left_idx < len(row) and right_idx < len(row):
                    left_cell = str(row[left_idx]).strip()
                    right_cell = str(row[right_idx]).strip()
                    # Smart merging based on content
                    if left_cell == '$' and right_cell:
                        merged_cell = f"${right_cell}"
                    elif left_cell and right_cell:
                        merged_cell = f"{left_cell} {right_cell}"
                    else:
                        merged_cell = left_cell or right_cell
                    row[left_idx] = merged_cell
                    if right_idx < len(row):
                        row.pop(right_idx)
        return merged_headers, merged_rows
def _should_merge_columns(self, headers: List[str], rows: List[List[str]], left_idx: int, right_idx: int) -> bool:
"""
Determine if two adjacent columns should be merged.
Returns:
True if columns should be merged
"""
# Check if left column is mostly currency symbols
currency_count = 0
total_count = 0
for row in rows:
if left_idx < len(row) and right_idx < len(row):
total_count += 1
left_cell = str(row[left_idx]).strip()
right_cell = str(row[right_idx]).strip()
# If left is '$' and right is a number, they should be merged
if left_cell == '$' and right_cell and (right_cell.replace(',', '').replace('.', '').isdigit()):
currency_count += 1
# If most rows have currency symbol + number pattern, merge them
if total_count > 0 and currency_count / total_count >= 0.5:
return True
# Check for other merge patterns (e.g., empty left column with content right column)
empty_left_count = 0
for row in rows:
if left_idx < len(row) and right_idx < len(row):
left_cell = str(row[left_idx]).strip()
right_cell = str(row[right_idx]).strip()
if not left_cell and right_cell:
empty_left_count += 1
# If left column is mostly empty, consider merging
if total_count > 0 and empty_left_count / total_count >= 0.7:
return True
return False
def _calculate_column_widths(self, all_rows: List[List[str]], max_cols: int) -> List[int]:
"""Calculate optimal column widths based on content."""
col_widths = [self.style.min_col_width] * max_cols
# Find the maximum content width for each column
for row in all_rows:
for col_idx in range(min(len(row), max_cols)):
content = str(row[col_idx]) if row[col_idx] else ""
# Handle multi-line content
max_line_width = max((len(line) for line in content.split('\n')), default=0)
content_width = max_line_width + (self.style.padding * 2)
# Apply limits
content_width = min(content_width, self.style.max_col_width)
col_widths[col_idx] = max(col_widths[col_idx], content_width)
return col_widths
def _detect_alignments(self, all_rows: List[List[str]], max_cols: int) -> List[Alignment]:
"""Detect appropriate alignment for each column based on content."""
alignments = [Alignment.LEFT] * max_cols
for col_idx in range(max_cols):
# Analyze column content (skip header row if present)
data_rows = all_rows[1:] if len(all_rows) > 1 else all_rows
numeric_count = 0
total_count = 0
for row in data_rows:
if col_idx < len(row) and row[col_idx].strip():
total_count += 1
content = row[col_idx].strip()
# Check if content looks numeric (currency, percentages, numbers)
if self._looks_numeric(content):
numeric_count += 1
# If most values in column are numeric, right-align
if total_count > 0 and numeric_count / total_count >= 0.7:
alignments[col_idx] = Alignment.RIGHT
return alignments
def _looks_numeric(self, text: str) -> bool:
"""Check if text content looks numeric."""
if not text:
return False
# Remove common formatting characters
clean_text = text.replace(',', '').replace('$', '').replace('%', '').replace('(', '').replace(')', '').strip()
# Handle negative numbers in parentheses
if text.strip().startswith('(') and text.strip().endswith(')'):
clean_text = text.strip()[1:-1].replace(',', '').replace('$', '').strip()
# Check if remaining text is numeric
try:
float(clean_text)
return True
except ValueError:
return False
    def _build_table(self, headers: List[List[str]], rows: List[List[str]],
                     col_widths: List[int], alignments: List[Alignment]) -> str:
        """
        Build the final table string.

        Blank header/data rows are dropped. The separator line under the
        header is emitted only when the active style has a non-empty
        header_separator (e.g. TableStyle.simple() suppresses it).

        Args:
            headers: List of header rows (can be multiple rows for multi-row headers)
            rows: List of data rows
            col_widths: Column widths
            alignments: Column alignments

        Returns:
            Newline-joined table text
        """
        lines = []
        # Header rows (can be multiple)
        if headers:
            for header_row in headers:
                # Only add header rows with meaningful content
                if any(cell.strip() for cell in header_row):
                    # Handle multi-line cells in header rows
                    formatted_lines = self._format_multiline_row(header_row, col_widths, alignments)
                    lines.extend(formatted_lines)
            # Header separator (after all header rows)
            if self.style.header_separator:
                sep_line = self._create_separator_line(col_widths)
                lines.append(sep_line)
        # Data rows
        for row in rows:
            # Only add rows with meaningful content
            if any(cell.strip() for cell in row):
                row_line = self._format_row(row, col_widths, alignments)
                lines.append(row_line)
        return '\n'.join(lines)
def _format_row(self, row: List[str], col_widths: List[int],
alignments: List[Alignment]) -> str:
"""Format a single row with proper alignment and padding."""
cells = []
border = self.style.border_char
for col_idx, width in enumerate(col_widths):
# Get cell content
content = str(row[col_idx]) if col_idx < len(row) else ""
# Handle multi-line content (take first line only for table)
if '\n' in content:
content = content.split('\n')[0]
content = content.strip()
# Calculate available width for content
available_width = width - (self.style.padding * 2)
# Truncate if too long
if len(content) > available_width:
content = content[:available_width-3] + "..."
# Apply alignment
alignment = alignments[col_idx] if col_idx < len(alignments) else Alignment.LEFT
if alignment == Alignment.RIGHT:
aligned_content = content.rjust(available_width)
elif alignment == Alignment.CENTER:
aligned_content = content.center(available_width)
else: # LEFT
aligned_content = content.ljust(available_width)
# Add padding
padded_cell = ' ' * self.style.padding + aligned_content + ' ' * self.style.padding
cells.append(padded_cell)
# Join with borders
if border:
return border + border.join(cells) + border
else:
return ' '.join(cells)
def _format_multiline_row(self, row: List[str], col_widths: List[int],
alignments: List[Alignment]) -> List[str]:
"""
Format a row that may contain multi-line cells (cells with \n characters).
Returns a list of formatted lines, one for each line of text in the cells.
"""
# Split each cell by newlines
cell_lines = []
max_lines = 1
for col_idx, content in enumerate(row):
lines = content.split('\n') if content else ['']
cell_lines.append(lines)
max_lines = max(max_lines, len(lines))
# Build output lines
output_lines = []
for line_idx in range(max_lines):
# Build row for this line
current_row = []
for col_idx in range(len(row)):
# Get the line for this cell, or empty string if this cell has fewer lines
if line_idx < len(cell_lines[col_idx]):
current_row.append(cell_lines[col_idx][line_idx])
else:
current_row.append('')
# Format this line
formatted_line = self._format_row(current_row, col_widths, alignments)
output_lines.append(formatted_line)
return output_lines
def _create_separator_line(self, col_widths: List[int]) -> str:
"""
Create header separator line.
For bordered styles: |-------|-------|
For borderless styles: ─────────────── (full width horizontal line)
"""
sep_char = self.style.header_separator
border = self.style.border_char
if not sep_char:
# No separator at all (minimal style)
return ""
if border:
# Bordered style: create separator matching column widths
separators = []
for width in col_widths:
separators.append(sep_char * width)
return border + border.join(separators) + border
else:
# Borderless style (simple): single horizontal line across full width
# Calculate total width: sum of column widths + gaps between columns
total_width = sum(col_widths) + (len(col_widths) - 1) * 2 # 2-space gaps
# Add leading space for indentation (matching row indentation)
return " " + sep_char * total_width
# Factory functions for easy usage
def create_fast_renderer(style: str = "pipe") -> FastTableRenderer:
    """
    Build a FastTableRenderer configured with a named preset style.

    Args:
        style: Preset name ("pipe" or "minimal"); any other value falls
            back to the pipe style.

    Returns:
        A FastTableRenderer using the requested preset.
    """
    preset = TableStyle.minimal() if style == "minimal" else TableStyle.pipe_table()
    return FastTableRenderer(preset)
def render_table_fast(table_node, style: str = "pipe") -> str:
    """
    One-shot helper: render a TableNode with a preset style.

    Args:
        table_node: TableNode instance to render.
        style: Preset name ("pipe", "minimal").

    Returns:
        The formatted table string.
    """
    return create_fast_renderer(style).render_table_node(table_node)

View File

@@ -0,0 +1,613 @@
"""
Markdown renderer for parsed documents.
"""
from typing import List, Optional, Dict, Set
from edgar.documents.document import Document
from edgar.documents.nodes import Node, TextNode, HeadingNode, ParagraphNode, ListNode, ListItemNode
from edgar.documents.table_nodes import TableNode
class MarkdownRenderer:
    """
    Renders parsed documents to Markdown format.

    Features:
    - Preserves document structure
    - Handles tables with proper formatting
    - Supports nested lists
    - Includes metadata annotations
    - Configurable output options
    """

    def __init__(self,
                 include_metadata: bool = False,
                 include_toc: bool = False,
                 max_heading_level: int = 6,
                 table_format: str = 'pipe',
                 wrap_width: Optional[int] = None):
        """
        Initialize markdown renderer.

        Args:
            include_metadata: Include metadata annotations
            include_toc: Generate table of contents
            max_heading_level: Maximum heading level to render
            table_format: Table format ('pipe', 'grid', 'simple')
            wrap_width: Wrap text at specified width
        """
        self.include_metadata = include_metadata
        self.include_toc = include_toc
        self.max_heading_level = max_heading_level
        self.table_format = table_format
        self.wrap_width = wrap_width
        # Track state during rendering
        self._toc_entries: List[tuple] = []   # (level, text, node_id) per rendered heading
        self._rendered_ids: Set[str] = set()  # ids already emitted (guards shared nodes)
        self._list_depth = 0                  # current list nesting depth
        self._in_table = False                # suppresses escaping/newline joins inside tables

    def render(self, document: Document) -> str:
        """
        Render document to Markdown.

        Args:
            document: Document to render

        Returns:
            Markdown formatted text
        """
        self._reset_state()
        parts = []
        # Add metadata header if requested
        if self.include_metadata:
            parts.append(self._render_metadata(document))
            parts.append("")
        # Placeholder for TOC (replaced after rendering, once headings are known)
        if self.include_toc:
            toc_placeholder = "<!-- TOC -->"
            parts.append(toc_placeholder)
            parts.append("")
        # Render document content
        content = self._render_node(document.root)
        parts.append(content)
        # Join parts
        markdown = "\n".join(parts)
        # Replace TOC placeholder. Fix: when the document produced no heading
        # entries the placeholder is removed instead of leaking "<!-- TOC -->"
        # into the output.
        if self.include_toc:
            toc = self._generate_toc() if self._toc_entries else ""
            markdown = markdown.replace(toc_placeholder, toc)
        return markdown.strip()

    def render_node(self, node: Node) -> str:
        """
        Render a specific node to Markdown.

        Args:
            node: Node to render

        Returns:
            Markdown formatted text
        """
        self._reset_state()
        return self._render_node(node)

    def _reset_state(self):
        """Reset renderer state."""
        self._toc_entries = []
        self._rendered_ids = set()
        self._list_depth = 0
        self._in_table = False

    def _render_node(self, node: Node) -> str:
        """Render a node and its children."""
        # Skip if already rendered (handles shared nodes)
        if node.id in self._rendered_ids:
            return ""
        self._rendered_ids.add(node.id)
        # Dispatch based on node type
        if isinstance(node, HeadingNode):
            return self._render_heading(node)
        elif isinstance(node, ParagraphNode):
            return self._render_paragraph(node)
        elif isinstance(node, TextNode):
            return self._render_text(node)
        elif isinstance(node, TableNode):
            return self._render_table(node)
        elif isinstance(node, ListNode):
            return self._render_list(node)
        elif isinstance(node, ListItemNode):
            return self._render_list_item(node)
        else:
            # Default: render children
            return self._render_children(node)

    def _render_heading(self, node: HeadingNode) -> str:
        """Render heading node."""
        # Limit heading level
        level = min(node.level, self.max_heading_level)
        # Get heading text
        text = node.text().strip()
        if not text:
            return ""
        # Add to TOC
        if self.include_toc:
            self._toc_entries.append((level, text, node.id))
        # Create markdown heading
        markdown = "#" * level + " " + text
        # Add metadata if requested
        if self.include_metadata and node.metadata:
            metadata = self._format_metadata(node.metadata)
            if metadata:
                markdown += f" <!-- {metadata} -->"
        # Add children content
        children_content = self._render_children(node)
        if children_content:
            markdown += "\n\n" + children_content
        return markdown

    def _render_paragraph(self, node: ParagraphNode) -> str:
        """Render paragraph node."""
        # Get paragraph content
        content = self._render_children(node).strip()
        if not content:
            return ""
        # Wrap if requested
        if self.wrap_width:
            content = self._wrap_text(content, self.wrap_width)
        # Add metadata if requested
        if self.include_metadata and node.metadata:
            metadata = self._format_metadata(node.metadata)
            if metadata:
                content = f"<!-- {metadata} -->\n{content}"
        return content

    def _render_text(self, node: TextNode) -> str:
        """Render text node."""
        text = node.text()
        # Escape markdown special characters
        text = self._escape_markdown(text)
        # Apply text formatting based on style
        if node.style:
            if node.style.font_weight in ['bold', '700', '800', '900']:
                text = f"**{text}**"
            elif node.style.font_style == 'italic':
                text = f"*{text}*"
            elif node.style.text_decoration == 'underline':
                text = f"<u>{text}</u>"
        return text

    def _render_table(self, node: TableNode) -> str:
        """Render table node."""
        self._in_table = True
        parts = []
        # Add caption if present
        if node.caption:
            parts.append(f"**Table: {node.caption}**")
            parts.append("")
        # Render based on format
        if self.table_format == 'pipe':
            table_md = self._render_table_pipe(node)
        elif self.table_format == 'grid':
            table_md = self._render_table_grid(node)
        else:  # simple
            table_md = self._render_table_simple(node)
        parts.append(table_md)
        # Add metadata if requested
        if self.include_metadata and node.metadata:
            metadata = self._format_metadata(node.metadata)
            if metadata:
                parts.append(f"<!-- Table metadata: {metadata} -->")
        self._in_table = False
        return "\n".join(parts)

    def _render_table_pipe(self, node: TableNode) -> str:
        """Render table in pipe format with proper column spanning support."""
        # Handle complex SEC filing tables with column spanning
        expanded_headers, expanded_data_rows = self._expand_table_structure(node)
        # Identify and filter to meaningful columns
        content_columns = self._identify_content_columns(expanded_headers, expanded_data_rows)
        if not content_columns:
            return ""
        rows = []
        # Render headers with intelligent multi-row combination
        if expanded_headers:
            combined_headers = self._combine_multi_row_headers(expanded_headers)
            filtered_headers = [combined_headers[i] if i < len(combined_headers) else "" for i in content_columns]
            row_md = "| " + " | ".join(filtered_headers) + " |"
            rows.append(row_md)
            # Add separator
            separator = "| " + " | ".join(["---"] * len(filtered_headers)) + " |"
            rows.append(separator)
        # Render data rows
        for expanded_row in expanded_data_rows:
            filtered_row = [expanded_row[i] if i < len(expanded_row) else "" for i in content_columns]
            # Only add rows with meaningful content
            if any(cell.strip() for cell in filtered_row):
                row_md = "| " + " | ".join(filtered_row) + " |"
                rows.append(row_md)
        return "\n".join(rows)

    def _render_table_grid(self, node: TableNode) -> str:
        """Render table in grid format."""
        # Simplified grid format
        all_rows = []
        # Add headers
        if node.headers:
            for header_row in node.headers:
                cells = [cell.text() for cell in header_row]
                all_rows.append(" | ".join(cells))
        # Add data rows
        for row in node.rows:
            cells = [cell.text() for cell in row.cells]
            all_rows.append(" | ".join(cells))
        if all_rows:
            # Add borders sized to the widest rendered row
            max_width = max(len(row) for row in all_rows)
            border = "+" + "-" * (max_width + 2) + "+"
            result = [border]
            for row in all_rows:
                result.append(f"| {row:<{max_width}} |")
            result.append(border)
            return "\n".join(result)
        return ""

    def _render_table_simple(self, node: TableNode) -> str:
        """Render table in simple format."""
        rows = []
        # Add headers
        if node.headers:
            for header_row in node.headers:
                cells = [cell.text() for cell in header_row]
                rows.append("  ".join(cells))
        # Add separator if we have headers
        if node.headers and node.rows:
            rows.append("")
        # Add data rows
        for row in node.rows:
            cells = [cell.text() for cell in row.cells]
            rows.append("  ".join(cells))
        return "\n".join(rows)

    def _render_list(self, node: ListNode) -> str:
        """Render list node."""
        self._list_depth += 1
        items = []
        for child in node.children:
            if isinstance(child, ListItemNode):
                item_md = self._render_list_item(child)
                if item_md:
                    items.append(item_md)
        self._list_depth -= 1
        return "\n".join(items)

    def _render_list_item(self, node: ListItemNode) -> str:
        """Render list item node."""
        # Determine bullet/number
        if node.parent and hasattr(node.parent, 'ordered') and node.parent.ordered:
            # Ordered list. Fix: number among ListItemNode siblings only, so
            # interleaved non-item children don't skew the numbering.
            item_siblings = [c for c in node.parent.children if isinstance(c, ListItemNode)]
            index = item_siblings.index(node) + 1
            marker = f"{index}."
        else:
            # Unordered list: cycle bullet style with nesting depth
            markers = ['*', '-', '+']
            marker = markers[(self._list_depth - 1) % len(markers)]
        # Indentation
        indent = " " * (self._list_depth - 1)
        # Get content
        content = self._render_children(node).strip()
        # Format item
        if '\n' in content:
            # Multi-line content
            lines = content.split('\n')
            result = indent + marker + " " + lines[0]
            for line in lines[1:]:
                result += "\n" + indent + " " + line
            return result
        else:
            # Single line
            return indent + marker + " " + content

    def _render_children(self, node: Node) -> str:
        """Render all children of a node."""
        parts = []
        for child in node.children:
            child_md = self._render_node(child)
            if child_md:
                parts.append(child_md)
        # Join with appropriate separator: block-level children are separated
        # by blank lines, inline content by single spaces.
        if self._in_table:
            return " ".join(parts)
        elif any(isinstance(child, (HeadingNode, ParagraphNode, TableNode, ListNode))
                 for child in node.children):
            return "\n\n".join(parts)
        else:
            return " ".join(parts)

    def _render_metadata(self, document: Document) -> str:
        """Render document metadata as a YAML-style front-matter block."""
        lines = ["---"]
        if document.metadata.company:
            lines.append(f"company: {document.metadata.company}")
        if document.metadata.form:
            lines.append(f"form: {document.metadata.form}")
        if document.metadata.filing_date:
            lines.append(f"filing_date: {document.metadata.filing_date}")
        if document.metadata.cik:
            lines.append(f"cik: {document.metadata.cik}")
        if document.metadata.accession_number:
            lines.append(f"accession_number: {document.metadata.accession_number}")
        lines.append("---")
        return "\n".join(lines)

    def _generate_toc(self) -> str:
        """Generate table of contents."""
        lines = ["## Table of Contents", ""]
        for level, text, node_id in self._toc_entries:
            # Create anchor link
            anchor = self._create_anchor(text)
            # Indentation based on level
            indent = " " * (level - 1)
            # Add TOC entry
            lines.append(f"{indent}- [{text}](#{anchor})")
        return "\n".join(lines)

    def _create_anchor(self, text: str) -> str:
        """Create anchor from heading text."""
        # Convert to lowercase and replace spaces with hyphens
        anchor = text.lower()
        anchor = anchor.replace(' ', '-')
        # Remove special characters
        import re
        anchor = re.sub(r'[^a-z0-9\-]', '', anchor)
        # Remove multiple hyphens
        anchor = re.sub(r'-+', '-', anchor)
        return anchor.strip('-')

    def _format_metadata(self, metadata: Dict) -> str:
        """Format metadata for display inside an HTML comment."""
        parts = []
        for key, value in metadata.items():
            if key == 'semantic_type':
                parts.append(f"type:{value}")
            elif key == 'section':
                parts.append(f"section:{value}")
            elif key == 'ix_tag':
                parts.append(f"xbrl:{value}")
            else:
                parts.append(f"{key}:{value}")
        return " ".join(parts)

    def _escape_markdown(self, text: str) -> str:
        """Escape markdown special characters."""
        # Don't escape in tables
        if self._in_table:
            return text
        # Escape special characters ('\\' first so later escapes aren't doubled)
        for char in ['\\', '`', '*', '_', '{', '}', '[', ']', '(', ')', '#', '+', '-', '.', '!']:
            text = text.replace(char, '\\' + char)
        return text

    def _wrap_text(self, text: str, width: int) -> str:
        """Wrap text at specified width."""
        import textwrap
        return textwrap.fill(text, width=width, break_long_words=False)

    def _expand_table_structure(self, node: TableNode) -> tuple:
        """
        Expand table structure to handle column spanning properly.
        Returns (expanded_headers, expanded_data_rows).
        """
        # Calculate the logical column count from colspan
        max_columns = 0
        # Check all rows for maximum column span
        all_rows = []
        if node.headers:
            for header_row in node.headers:
                all_rows.append(header_row)
        for row in node.rows:
            all_rows.append(row.cells)
        for row in all_rows:
            column_count = sum(cell.colspan for cell in row)
            max_columns = max(max_columns, column_count)
        # Expand headers
        expanded_headers = []
        if node.headers:
            for header_row in node.headers:
                expanded = self._expand_row_to_columns(header_row, max_columns)
                expanded_headers.append(expanded)
        # Expand data rows
        expanded_data_rows = []
        for row in node.rows:
            expanded = self._expand_row_to_columns(row.cells, max_columns)
            expanded_data_rows.append(expanded)
        return expanded_headers, expanded_data_rows

    def _expand_row_to_columns(self, cells: List, target_columns: int) -> List[str]:
        """Expand a row with colspan cells to match the target column count."""
        expanded = []
        current_column = 0
        for cell in cells:
            cell_text = cell.text().strip()
            # Add the cell content
            expanded.append(cell_text)
            current_column += 1
            # Add empty cells for remaining colspan
            for _ in range(cell.colspan - 1):
                if current_column < target_columns:
                    expanded.append("")
                    current_column += 1
        # Pad to target column count if needed
        while len(expanded) < target_columns:
            expanded.append("")
        return expanded[:target_columns]

    def _identify_content_columns(self, expanded_headers: List[List[str]],
                                  expanded_data_rows: List[List[str]]) -> List[int]:
        """Identify which columns actually contain meaningful content."""
        if not expanded_headers and not expanded_data_rows:
            return []
        # Get the column count
        max_cols = 0
        if expanded_headers:
            max_cols = max(max_cols, max(len(row) for row in expanded_headers))
        if expanded_data_rows:
            max_cols = max(max_cols, max(len(row) for row in expanded_data_rows))
        content_columns = []
        for col in range(max_cols):
            has_content = False
            # Check headers
            for header_row in expanded_headers:
                if col < len(header_row) and header_row[col].strip():
                    has_content = True
                    break
            # Check data rows
            if not has_content:
                for data_row in expanded_data_rows:
                    if col < len(data_row) and data_row[col].strip():
                        has_content = True
                        break
            if has_content:
                content_columns.append(col)
        return content_columns

    def _combine_multi_row_headers(self, header_rows: List[List[str]]) -> List[str]:
        """
        Combine multi-row headers intelligently for SEC filing tables.
        Prioritizes specific dates/periods over generic labels.
        """
        if not header_rows:
            return []
        num_columns = len(header_rows[0])
        combined = [""] * num_columns
        for col in range(num_columns):
            # Collect all values for this column across header rows
            column_values = []
            for row in header_rows:
                if col < len(row) and row[col].strip():
                    column_values.append(row[col].strip())
            if column_values:
                # Prioritize date-like values over generic labels
                date_values = [v for v in column_values if self._looks_like_date(v)]
                if date_values:
                    # Clean up line breaks in dates
                    combined[col] = date_values[0].replace('\n', ' ')
                elif len(column_values) == 1:
                    combined[col] = column_values[0].replace('\n', ' ')
                else:
                    # Skip generic terms like "Year Ended" if we have something more specific
                    specific_values = [v for v in column_values
                                       if v.lower() not in ['year ended', 'years ended']]
                    if specific_values:
                        combined[col] = specific_values[0].replace('\n', ' ')
                    else:
                        combined[col] = column_values[0].replace('\n', ' ')
        return combined

    def _looks_like_date(self, text: str) -> bool:
        """Check if text looks like a date."""
        import re
        # Common date patterns in SEC filings
        date_patterns = [
            r'\b(January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},?\s*\d{4}',
            r'\d{1,2}/\d{1,2}/\d{4}',
            r'\d{4}-\d{2}-\d{2}',
            r'^\d{4}$',  # Just a year
        ]
        text_clean = text.replace('\n', ' ').strip()
        for pattern in date_patterns:
            if re.search(pattern, text_clean, re.IGNORECASE):
                return True
        return False

View File

@@ -0,0 +1,51 @@
"""
Plain text renderer for parsed documents.
"""
from typing import Optional
from edgar.documents.document import Document
from edgar.documents.extractors.text_extractor import TextExtractor
class TextRenderer:
    """
    Plain-text renderer for parsed documents.

    Thin wrapper around TextExtractor so the plain-text output path mirrors
    the other renderer classes.
    """

    def __init__(self,
                 clean: bool = True,
                 include_tables: bool = True,
                 max_length: Optional[int] = None,
                 preserve_structure: bool = False):
        """
        Initialize text renderer.

        Args:
            clean: Clean and normalize text
            include_tables: Include table content
            max_length: Maximum text length
            preserve_structure: Preserve document structure
        """
        # Metadata and links are always excluded from plain-text output.
        extractor_options = dict(
            clean=clean,
            include_tables=include_tables,
            include_metadata=False,
            include_links=False,
            max_length=max_length,
            preserve_structure=preserve_structure,
        )
        self.extractor = TextExtractor(**extractor_options)

    def render(self, document: Document) -> str:
        """
        Render document to plain text.

        Args:
            document: Document to render

        Returns:
            Plain text
        """
        return self.extractor.extract(document)

View File

@@ -0,0 +1,769 @@
"""
Search functionality for parsed documents.
Provides both traditional search modes (TEXT, REGEX, SEMANTIC, XPATH) and
advanced BM25-based ranking with semantic structure awareness.
"""
import re
from dataclasses import dataclass
from enum import Enum
from typing import List, Optional, Dict, Any, TYPE_CHECKING
from edgar.documents.document import Document
from edgar.documents.nodes import Node, HeadingNode
from edgar.documents.table_nodes import TableNode
from edgar.documents.types import NodeType, SemanticType
if TYPE_CHECKING:
from edgar.documents.types import SearchResult as TypesSearchResult
class SearchMode(Enum):
    """Search modes supported by DocumentSearch."""
    TEXT = "text"          # Plain text (substring) search
    REGEX = "regex"        # Regular expression search
    SEMANTIC = "semantic"  # Semantic/structural search (e.g. "heading:Item 1")
    XPATH = "xpath"        # XPath-like search (e.g. "//h1")
@dataclass
class SearchResult:
    """Result from a search operation."""
    node: Node                     # Node containing match
    text: str                      # Matched text
    start_offset: int              # Start position in the node's full text
    end_offset: int                # End position in the node's full text
    context: Optional[str] = None  # Surrounding context (possibly a truncated window)
    score: float = 1.0             # Relevance score

    @property
    def snippet(self) -> str:
        """
        Get text snippet with the match highlighted in bold.

        Fix: start/end offsets are positions in the node's full text, while
        `context` may be a truncated window of it (with "..." markers), so the
        stored offsets can point past or beside the match. The offsets are
        only trusted when they actually line up with `context`; otherwise the
        match is located by searching for the matched text inside the context.
        """
        if self.context:
            start, end = self.start_offset, self.end_offset
            aligned = (0 <= start <= end <= len(self.context)
                       and self.context[start:end] == self.text)
            if not aligned:
                # Offsets don't line up with the context window; locate the
                # matched text directly instead.
                pos = self.context.find(self.text)
                if pos == -1:
                    return f"**{self.text}**"
                start, end = pos, pos + len(self.text)
            before = self.context[:start]
            match = self.context[start:end]
            after = self.context[end:]
            return f"{before}**{match}**{after}"
        return f"**{self.text}**"
class DocumentSearch:
"""
Search functionality for parsed documents.
Supports various search modes and options.
"""
    def __init__(self, document: Document, use_cache: bool = True):
        """
        Initialize search with document.

        Args:
            document: Document to search
            use_cache: Enable index caching for faster repeated searches (default: True)
        """
        self.document = document
        self.use_cache = use_cache
        # Lazily populated cache of ranking engines, keyed by algorithm name.
        self._ranking_engines: Dict[str, Any] = {}
        # Build the text/type/semantic indices up front so individual
        # searches are fast.
        self._build_index()
def _build_index(self):
"""Build search index for performance."""
# Text index: map text to nodes
self.text_index: Dict[str, List[Node]] = {}
# Type index: map node types to nodes
self.type_index: Dict[NodeType, List[Node]] = {}
# Semantic index: map semantic types to nodes
self.semantic_index: Dict[SemanticType, List[Node]] = {}
# Build indices
for node in self.document.root.walk():
# Text index
if hasattr(node, 'text'):
text = node.text()
if text:
text_lower = text.lower()
if text_lower not in self.text_index:
self.text_index[text_lower] = []
self.text_index[text_lower].append(node)
# Type index
if node.type not in self.type_index:
self.type_index[node.type] = []
self.type_index[node.type].append(node)
# Semantic index
if hasattr(node, 'semantic_type') and node.semantic_type:
if node.semantic_type not in self.semantic_index:
self.semantic_index[node.semantic_type] = []
self.semantic_index[node.semantic_type].append(node)
def search(self,
query: str,
mode: SearchMode = SearchMode.TEXT,
case_sensitive: bool = False,
whole_word: bool = False,
limit: Optional[int] = None,
node_types: Optional[List[NodeType]] = None,
in_section: Optional[str] = None) -> List[SearchResult]:
"""
Search document.
Args:
query: Search query
mode: Search mode
case_sensitive: Case sensitive search
whole_word: Match whole words only
limit: Maximum results to return
node_types: Limit search to specific node types
in_section: Limit search to specific section
Returns:
List of search results
"""
if mode == SearchMode.TEXT:
results = self._text_search(query, case_sensitive, whole_word)
elif mode == SearchMode.REGEX:
results = self._regex_search(query, case_sensitive)
elif mode == SearchMode.SEMANTIC:
results = self._semantic_search(query)
elif mode == SearchMode.XPATH:
results = self._xpath_search(query)
else:
raise ValueError(f"Unsupported search mode: {mode}")
# Filter by node types
if node_types:
results = [r for r in results if r.node.type in node_types]
# Filter by section
if in_section:
section_nodes = self._get_section_nodes(in_section)
results = [r for r in results if r.node in section_nodes]
# Apply limit
if limit and len(results) > limit:
results = results[:limit]
return results
def _text_search(self, query: str, case_sensitive: bool, whole_word: bool) -> List[SearchResult]:
"""Perform text search."""
results = []
# Prepare query
if not case_sensitive:
query = query.lower()
# Search only leaf nodes to avoid duplicates
for node in self.document.root.walk():
# Skip nodes with children (they aggregate child text)
if hasattr(node, 'children') and node.children:
continue
if not hasattr(node, 'text'):
continue
text = node.text()
if not text:
continue
search_text = text if case_sensitive else text.lower()
# Find all occurrences
if whole_word:
# Use word boundary regex
pattern = r'\b' + re.escape(query) + r'\b'
flags = 0 if case_sensitive else re.IGNORECASE
for match in re.finditer(pattern, text, flags):
results.append(SearchResult(
node=node,
text=match.group(),
start_offset=match.start(),
end_offset=match.end(),
context=self._get_context(text, match.start(), match.end())
))
else:
# Simple substring search
start = 0
while True:
pos = search_text.find(query, start)
if pos == -1:
break
results.append(SearchResult(
node=node,
text=text[pos:pos + len(query)],
start_offset=pos,
end_offset=pos + len(query),
context=self._get_context(text, pos, pos + len(query))
))
start = pos + 1
return results
def _regex_search(self, pattern: str, case_sensitive: bool) -> List[SearchResult]:
"""Perform regex search."""
results = []
try:
flags = 0 if case_sensitive else re.IGNORECASE
regex = re.compile(pattern, flags)
except re.error as e:
raise ValueError(f"Invalid regex pattern: {e}")
# Search only leaf nodes to avoid duplicates
for node in self.document.root.walk():
# Skip nodes with children (they aggregate child text)
if hasattr(node, 'children') and node.children:
continue
if not hasattr(node, 'text'):
continue
text = node.text()
if not text:
continue
# Find all matches
for match in regex.finditer(text):
results.append(SearchResult(
node=node,
text=match.group(),
start_offset=match.start(),
end_offset=match.end(),
context=self._get_context(text, match.start(), match.end())
))
return results
    def _semantic_search(self, query: str) -> List[SearchResult]:
        """
        Perform semantic/structural search.

        Queries use a "type:text" syntax, e.g. "heading:Item 1",
        "table:revenue", "section:risk factors". A bare query (no colon)
        defaults to searching heading text. Results are sorted by score,
        highest first.
        """
        results = []
        # Parse semantic query
        # Examples: "heading:Item 1", "table:revenue", "section:risk factors"
        if ':' in query:
            search_type, search_text = query.split(':', 1)
            search_type = search_type.lower().strip()
            search_text = search_text.strip()
        else:
            # Default to text search in headings
            search_type = 'heading'
            search_text = query
        if search_type == 'heading':
            # Search headings (case-insensitive substring match on the title)
            for node in self.type_index.get(NodeType.HEADING, []):
                if isinstance(node, HeadingNode):
                    heading_text = node.text()
                    if heading_text and search_text.lower() in heading_text.lower():
                        results.append(SearchResult(
                            node=node,
                            text=heading_text,
                            start_offset=0,
                            end_offset=len(heading_text),
                            score=self._calculate_heading_score(node)
                        ))
        elif search_type == 'table':
            # Search tables by their full rendered content
            for node in self.type_index.get(NodeType.TABLE, []):
                if isinstance(node, TableNode):
                    # Search in table content
                    table_text = node.text()
                    if table_text and search_text.lower() in table_text.lower():
                        results.append(SearchResult(
                            node=node,
                            text=f"Table: {node.caption or 'Untitled'}",
                            start_offset=0,
                            end_offset=len(table_text),
                            # Keep the context preview to 200 characters
                            context=table_text[:200] + "..." if len(table_text) > 200 else table_text
                        ))
        elif search_type == 'section':
            # Search sections by name
            sections = self.document.sections
            for section_name, section in sections.items():
                if search_text.lower() in section_name.lower():
                    results.append(SearchResult(
                        node=section.node,
                        text=section.title,
                        start_offset=section.start_offset,
                        end_offset=section.end_offset,
                        score=2.0  # Boost section matches
                    ))
        # Sort by score
        results.sort(key=lambda r: r.score, reverse=True)
        return results
    def _xpath_search(self, xpath: str) -> List[SearchResult]:
        """
        Perform XPath-like search.

        Supports a small XPath subset: a tag selector plus at most one
        predicate, e.g. "//h1", "//table[@class='financial']",
        "//p[contains(text(),'revenue')]".

        Raises:
            ValueError: If the expression does not start with "//tag".
        """
        results = []
        # Simple XPath parser
        # Examples: "//h1", "//table[@class='financial']", "//p[contains(text(),'revenue')]"
        # Extract tag name
        tag_match = re.match(r'//(\w+)', xpath)
        if not tag_match:
            raise ValueError(f"Invalid XPath: {xpath}")
        tag_name = tag_match.group(1).lower()
        # Map tag to node type (all heading tags collapse to HEADING)
        tag_to_type = {
            'h1': NodeType.HEADING,
            'h2': NodeType.HEADING,
            'h3': NodeType.HEADING,
            'h4': NodeType.HEADING,
            'h5': NodeType.HEADING,
            'h6': NodeType.HEADING,
            'p': NodeType.PARAGRAPH,
            'table': NodeType.TABLE,
            'section': NodeType.SECTION
        }
        node_type = tag_to_type.get(tag_name)
        if not node_type:
            # Unknown tag: return no matches rather than raising
            return results
        # Get nodes of type
        nodes = self.type_index.get(node_type, [])
        # Apply filters
        if '[' in xpath:
            # Extract the first bracketed condition
            condition_match = re.search(r'\[(.*?)\]', xpath)
            if condition_match:
                condition = condition_match.group(1)
                nodes = self._apply_xpath_condition(nodes, condition)
        # Create results (text preview capped at 100 characters)
        for node in nodes:
            text = node.text() if hasattr(node, 'text') else str(node)
            results.append(SearchResult(
                node=node,
                text=text[:100] + "..." if len(text) > 100 else text,
                start_offset=0,
                end_offset=len(text)
            ))
        return results
def _apply_xpath_condition(self, nodes: List[Node], condition: str) -> List[Node]:
"""Apply XPath condition to filter nodes."""
filtered = []
# Parse condition
if condition.startswith('@'):
# Attribute condition
attr_match = re.match(r'@(\w+)=["\']([^"\']+)["\']', condition)
if attr_match:
attr_name, attr_value = attr_match.groups()
for node in nodes:
if node.metadata.get(attr_name) == attr_value:
filtered.append(node)
elif 'contains(text()' in condition:
# Text contains condition
text_match = re.search(r'contains\(text\(\),\s*["\']([^"\']+)["\']\)', condition)
if text_match:
search_text = text_match.group(1).lower()
for node in nodes:
if hasattr(node, 'text'):
node_text = node.text()
if node_text and search_text in node_text.lower():
filtered.append(node)
else:
# Level condition for headings
try:
level = int(condition)
for node in nodes:
if isinstance(node, HeadingNode) and node.level == level:
filtered.append(node)
except ValueError:
pass
return filtered
def _get_context(self, text: str, start: int, end: int, context_size: int = 50) -> str:
"""Get context around match."""
# Calculate context boundaries
context_start = max(0, start - context_size)
context_end = min(len(text), end + context_size)
# Get context
context = text[context_start:context_end]
# Add ellipsis if truncated
if context_start > 0:
context = "..." + context
if context_end < len(text):
context = context + "..."
# Adjust offsets for context
if context_start > 0:
start = start - context_start + 3 # Account for "..."
end = end - context_start + 3
else:
start = start - context_start
end = end - context_start
return context
def _calculate_heading_score(self, heading: HeadingNode) -> float:
"""Calculate relevance score for heading."""
# Higher level headings get higher scores
base_score = 7 - heading.level # H1=6, H2=5, etc.
# Boost section headers
if heading.semantic_type == SemanticType.SECTION_HEADER:
base_score *= 1.5
return base_score
def _get_section_nodes(self, section_name: str) -> List[Node]:
"""Get all nodes in a section."""
nodes = []
sections = self.document.sections
if section_name in sections:
section = sections[section_name]
# Get all nodes in section
for node in section.node.walk():
nodes.append(node)
return nodes
def find_tables(self,
caption_pattern: Optional[str] = None,
min_rows: Optional[int] = None,
min_cols: Optional[int] = None) -> List[TableNode]:
"""
Find tables matching criteria.
Args:
caption_pattern: Regex pattern for caption
min_rows: Minimum number of rows
min_cols: Minimum number of columns
Returns:
List of matching tables
"""
tables = []
for node in self.type_index.get(NodeType.TABLE, []):
if not isinstance(node, TableNode):
continue
# Check caption
if caption_pattern and node.caption:
if not re.search(caption_pattern, node.caption, re.IGNORECASE):
continue
# Check dimensions
if min_rows and node.row_count < min_rows:
continue
if min_cols and node.col_count < min_cols:
continue
tables.append(node)
return tables
def find_headings(self,
                  level: Optional[int] = None,
                  pattern: Optional[str] = None) -> List[HeadingNode]:
    """
    Find headings matching the given criteria.

    Args:
        level: Heading level (1-6); None matches any level.
        pattern: Case-insensitive regex applied to the heading text.

    Returns:
        List of matching headings.
    """
    # Compile the text filter once instead of per heading.
    text_rx = re.compile(pattern, re.IGNORECASE) if pattern else None
    matched = []
    for candidate in self.type_index.get(NodeType.HEADING, []):
        if not isinstance(candidate, HeadingNode):
            continue
        if level and candidate.level != level:
            continue
        if text_rx is not None:
            heading_text = candidate.text()
            if not heading_text or not text_rx.search(heading_text):
                continue
        matched.append(candidate)
    return matched
def ranked_search(self,
                  query: str,
                  algorithm: str = "hybrid",
                  top_k: int = 10,
                  node_types: Optional[List[NodeType]] = None,
                  in_section: Optional[str] = None,
                  boost_sections: Optional[List[str]] = None) -> List['TypesSearchResult']:
    """
    Advanced search with BM25-based ranking and semantic structure awareness.

    Provides relevance-ranked results better suited for financial documents
    than simple substring matching: BM25 exact-term matching combined with
    semantic structure boosting. Engine construction and caching are
    delegated to _get_ranking_engine.

    Args:
        query: Search query.
        algorithm: Ranking algorithm ("bm25", "hybrid", "semantic").
        top_k: Maximum results to return.
        node_types: Limit search to specific node types.
        in_section: Limit search to a specific named section.
        boost_sections: Section names to boost (e.g., ["Risk Factors"]).

    Returns:
        List of SearchResult objects with relevance scores (from types.py).

    Examples:
        >>> searcher = DocumentSearch(document)
        >>> results = searcher.ranked_search("revenue growth", algorithm="hybrid", top_k=5)
        >>> for result in results:
        ...     print(f"Score: {result.score:.3f}")
        ...     print(f"Text: {result.snippet}")
    """
    # Fix: BM25Engine/HybridEngine/SemanticEngine were imported here but
    # never used — engine selection happens in _get_ranking_engine. Only
    # the result type is needed in this method.
    from edgar.documents.types import SearchResult as TypesSearchResult

    # Rank leaf nodes only, so parent containers don't duplicate their
    # children's text in the index.
    nodes = []
    for node in self.document.root.walk():
        if hasattr(node, 'children') and node.children:
            continue  # skip parent nodes
        if hasattr(node, 'text'):
            text = node.text()
            if text and text.strip():
                nodes.append(node)

    # Optional filters on type and section membership.
    if node_types:
        nodes = [n for n in nodes if n.type in node_types]
    if in_section:
        section_nodes = self._get_section_nodes(in_section)
        nodes = [n for n in nodes if n in section_nodes]
    if not nodes:
        return []

    # Select ranking engine (with caching) and rank.
    engine = self._get_ranking_engine(algorithm.lower(), nodes, boost_sections)
    ranked_results = engine.rank(query, nodes)

    # Convert to types.SearchResult format and attach section context.
    search_results = []
    for ranked in ranked_results[:top_k]:
        section_obj = self._find_node_section(ranked.node)
        search_results.append(TypesSearchResult(
            node=ranked.node,
            score=ranked.score,
            snippet=ranked.snippet,
            section=section_obj.name if section_obj else None,
            context=ranked.text if len(ranked.text) <= 500 else ranked.text[:497] + "...",
            _section_obj=section_obj  # agent navigation support
        ))
    return search_results
def _get_ranking_engine(self, algorithm: str, nodes: List[Node],
                        boost_sections: Optional[List[str]] = None):
    """
    Get or create a ranking engine, with two levels of caching.

    Lookup order: (1) per-instance cache keyed by document+algorithm,
    reused only when the node list is unchanged; (2) for BM25 only, a
    global persistent cache of serialized index data. Cache failures are
    deliberately non-fatal: the index is simply rebuilt.

    Args:
        algorithm: Ranking algorithm ("bm25", "hybrid", "semantic")
        nodes: Nodes to index
        boost_sections: Section names to boost (for hybrid/semantic)

    Returns:
        Ready-to-use ranking engine

    Raises:
        ValueError: If `algorithm` is not one of the supported names.
    """
    from edgar.documents.ranking.ranking import (
        BM25Engine,
        HybridEngine,
        SemanticEngine
    )
    from edgar.documents.ranking.cache import get_search_cache, CacheEntry
    from datetime import datetime
    # Create cache key
    # Use document ID, algorithm, and sample of first node for stability
    content_sample = nodes[0].text()[:200] if nodes and hasattr(nodes[0], 'text') else ""
    cache_key = f"{self.document.accession_number if hasattr(self.document, 'accession_number') else id(self.document)}_{algorithm}"
    # Check instance cache first (for same search session)
    if cache_key in self._ranking_engines:
        engine, cached_nodes = self._ranking_engines[cache_key]
        # Verify nodes haven't changed (list equality; a different filter
        # produces a different node list and forces a rebuild)
        if cached_nodes == nodes:
            return engine
    # Create engine based on algorithm
    if algorithm == "bm25":
        engine = BM25Engine()
    elif algorithm == "hybrid":
        engine = HybridEngine(boost_sections=boost_sections)
    elif algorithm == "semantic":
        engine = SemanticEngine(boost_sections=boost_sections)
    else:
        raise ValueError(f"Unsupported algorithm: {algorithm}")
    # Try to load from global cache if enabled
    if self.use_cache and algorithm == "bm25":  # Only cache BM25 for now
        search_cache = get_search_cache()
        document_hash = search_cache.compute_document_hash(
            document_id=cache_key,
            content_sample=content_sample
        )
        cached_entry = search_cache.get(document_hash)
        if cached_entry:
            # Load index from cache
            try:
                engine.load_index_data(cached_entry.index_data, nodes)
                # Cache in instance
                self._ranking_engines[cache_key] = (engine, nodes)
                return engine
            except Exception as e:
                # Cache load failed, rebuild (best-effort: fall through)
                pass
    # Build fresh index
    # For BM25/Hybrid, index is built lazily on first rank() call
    # But we can force it here and cache the result
    if self.use_cache and algorithm == "bm25":
        # Force index build by doing a dummy rank
        engine._build_index(nodes)
        # Save to global cache
        try:
            search_cache = get_search_cache()
            document_hash = search_cache.compute_document_hash(
                document_id=cache_key,
                content_sample=content_sample
            )
            index_data = engine.get_index_data()
            cache_entry = CacheEntry(
                document_hash=document_hash,
                index_data=index_data,
                created_at=datetime.now()
            )
            search_cache.put(document_hash, cache_entry)
        except Exception as e:
            # Cache save failed, not critical
            pass
    # Cache in instance
    self._ranking_engines[cache_key] = (engine, nodes)
    return engine
def get_cache_stats(self) -> Dict[str, Any]:
    """
    Get search cache statistics.

    Returns:
        Dictionary with:
            - instance_cache_entries: Number of engines cached on this instance
            - global_cache_stats: Global cache metrics (hits, misses, hit
              rate, sizes) when caching is enabled; {} otherwise

    Examples:
        >>> searcher = DocumentSearch(document)
        >>> searcher.ranked_search("revenue", algorithm="bm25")
        >>> stats = searcher.get_cache_stats()
        >>> print(f"Hit rate: {stats['global_cache_stats'].get('hit_rate', 0):.1%}")
    """
    stats: Dict[str, Any] = {
        'instance_cache_entries': len(self._ranking_engines),
        'global_cache_stats': {}
    }
    if self.use_cache:
        # Fix: import only when needed — the original imported the cache
        # module unconditionally, even with caching disabled.
        from edgar.documents.ranking.cache import get_search_cache
        stats['global_cache_stats'] = get_search_cache().get_stats()
    return stats
def clear_cache(self, memory_only: bool = False) -> None:
    """
    Clear search caches.

    Args:
        memory_only: If True, only clear in-memory caches (default: False).

    Examples:
        >>> searcher = DocumentSearch(document)
        >>> searcher.clear_cache()                 # clear everything
        >>> searcher.clear_cache(memory_only=True) # memory only
    """
    # Always drop the per-instance engine cache.
    self._ranking_engines.clear()
    # The global cache is only touched when caching is enabled.
    if not self.use_cache:
        return
    from edgar.documents.ranking.cache import get_search_cache
    get_search_cache().clear(memory_only=memory_only)
def _find_node_section(self, node: Node):
"""
Find which section a node belongs to.
Returns:
Section object or None
"""
# Walk up the tree to find section markers
current = node
while current:
# Check if any section contains this node
for section_name, section in self.document.sections.items():
# Check if node is in section's subtree
for section_node in section.node.walk():
if section_node is current or section_node is node:
return section
current = current.parent if hasattr(current, 'parent') else None
return None

View File

@@ -0,0 +1,15 @@
"""
Parsing strategies for different content types.
"""
from edgar.documents.strategies.document_builder import DocumentBuilder
from edgar.documents.strategies.header_detection import HeaderDetectionStrategy
from edgar.documents.strategies.table_processing import TableProcessor
from edgar.documents.strategies.xbrl_extraction import XBRLExtractor
__all__ = [
'DocumentBuilder',
'HeaderDetectionStrategy',
'TableProcessor',
'XBRLExtractor'
]

View File

@@ -0,0 +1,670 @@
"""
Document builder that converts parsed HTML tree into document nodes.
"""
from typing import Dict, Any, Optional
from lxml.html import HtmlElement
from edgar.documents.config import ParserConfig
from edgar.documents.nodes import (
Node, DocumentNode, TextNode, ParagraphNode, HeadingNode,
ContainerNode, SectionNode, ListNode, ListItemNode, LinkNode, ImageNode
)
from edgar.documents.strategies.style_parser import StyleParser
from edgar.documents.table_nodes import TableNode, Cell, Row
from edgar.documents.types import Style, ParseContext, SemanticType
class DocumentBuilder:
    """
    Builds Document node tree from parsed HTML.

    Handles the conversion of HTML elements into structured nodes
    with proper hierarchy and metadata. The entry point is build();
    _process_element() recursively converts elements, delegating node
    selection to _create_node_for_element() and filtering out page
    numbers, page breaks and post-break navigation containers.
    """

    # Block-level elements: each becomes (at least) its own node.
    BLOCK_ELEMENTS = {
        'div', 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
        'ul', 'ol', 'li', 'blockquote', 'pre', 'hr',
        'table', 'form', 'fieldset', 'address', 'section',
        'article', 'aside', 'nav', 'header', 'footer', 'main'
    }

    # Inline elements: collapsed into text nodes so text flow is preserved.
    INLINE_ELEMENTS = {
        'span', 'a', 'em', 'strong', 'b', 'i', 'u', 's',
        'small', 'mark', 'del', 'ins', 'sub', 'sup',
        'code', 'kbd', 'var', 'samp', 'abbr', 'cite',
        'q', 'time', 'font',
        # IXBRL inline elements for simple values - should not break text flow
        'ix:nonfraction', 'ix:footnote', 'ix:fraction'
    }

    # Elements to skip entirely (their tail text is still kept).
    SKIP_ELEMENTS = {
        'script', 'style', 'meta', 'link', 'noscript',
        # IXBRL exclude elements - content that should not appear in final document
        'ix:exclude'
    }

    def __init__(self, config: ParserConfig, strategies: Dict[str, Any]):
        """
        Initialize document builder.

        Args:
            config: Parser configuration
            strategies: Dictionary of parsing strategies; recognized keys
                used below are 'header_detection', 'table_processing'
                and 'xbrl_extraction'
        """
        self.config = config
        self.strategies = strategies
        self.style_parser = StyleParser()
        self.context = ParseContext()
        # Track XBRL context
        # xbrl_context_stack holds metadata dicts for currently-open
        # namespaced (ix:*) elements.
        self.xbrl_context_stack = []
        # NOTE(review): xbrl_continuations is never read or written in this
        # class — confirm whether strategies or subclasses rely on it.
        self.xbrl_continuations = {}

    def build(self, tree: HtmlElement) -> DocumentNode:
        """
        Build document from HTML tree.

        Args:
            tree: Parsed HTML tree

        Returns:
            Document root node
        """
        # Create root document node
        root = DocumentNode()
        # Find body element
        body = tree.find('.//body')
        if body is None:
            # If no body, use the entire tree
            body = tree
        # Process body content
        self._process_element(body, root)
        # Apply node merging if configured
        if self.config.merge_adjacent_nodes:
            self._merge_adjacent_nodes(root)
        return root

    def _process_element(self, element: HtmlElement, parent: Node) -> Optional[Node]:
        """
        Process HTML element into node.

        Recursively converts `element` and its subtree, attaching created
        nodes to `parent`. Tail text (text after an element's closing tag)
        is always attached to the PARENT, never to the created node, so it
        is preserved even when the element itself is skipped.

        Args:
            element: HTML element to process
            parent: Parent node

        Returns:
            Created node or None if skipped
        """
        # Skip certain elements but preserve their tail text
        if element.tag in self.SKIP_ELEMENTS:
            # Process tail text even when skipping element
            if element.tail:
                if self.config.preserve_whitespace:
                    text_node = TextNode(content=element.tail)
                    parent.add_child(text_node)
                else:
                    if element.tail.strip():
                        text_node = TextNode(content=element.tail.strip())
                        parent.add_child(text_node)
            return None
        # Skip page number containers
        if self._is_page_number_container(element):
            return None
        # Skip page break elements
        if self._is_page_break_element(element):
            return None
        # Skip navigation containers that follow page breaks
        if self._is_page_navigation_container(element):
            return None
        # Track parsing depth
        self.context.depth += 1
        try:
            # Handle XBRL elements
            if element.tag.startswith('{'):  # Namespaced element
                self._enter_xbrl_context(element)
            # Extract style
            style = self._extract_style(element)
            # Create appropriate node based on element type
            node = self._create_node_for_element(element, style)
            if node:
                # Add XBRL metadata if in context
                if self.xbrl_context_stack:
                    node.metadata.update(self._get_current_xbrl_metadata())
                # Add to parent
                parent.add_child(node)
                # Process children for container nodes
                if self._should_process_children(element, node):
                    # Add element's direct text first
                    if element.text:
                        if self.config.preserve_whitespace:
                            if element.text:  # Don't strip whitespace
                                text_node = TextNode(content=element.text)
                                node.add_child(text_node)
                        else:
                            if element.text.strip():
                                text_node = TextNode(content=element.text.strip())
                                node.add_child(text_node)
                    # Process child elements
                    for child in element:
                        self._process_element(child, node)
                    # Process text after children
                    if element.tail:
                        if self.config.preserve_whitespace:
                            text_node = TextNode(content=element.tail)
                            parent.add_child(text_node)
                        else:
                            if element.tail.strip():
                                text_node = TextNode(content=element.tail.strip())
                                parent.add_child(text_node)
                            elif element.tail.isspace():
                                # Even if tail is just whitespace, preserve the spacing info
                                # This helps with inline element spacing decisions
                                if hasattr(node, 'set_metadata'):
                                    node.set_metadata('has_tail_whitespace', True)
                else:
                    # Node created but children not processed - still need to handle tail
                    if element.tail:
                        if self.config.preserve_whitespace:
                            text_node = TextNode(content=element.tail)
                            parent.add_child(text_node)
                        else:
                            if element.tail.strip():
                                text_node = TextNode(content=element.tail.strip())
                                parent.add_child(text_node)
                            elif element.tail.isspace():
                                # Even if tail is just whitespace, preserve the spacing info
                                if hasattr(node, 'set_metadata'):
                                    node.set_metadata('has_tail_whitespace', True)
            else:
                # No node created, process children with same parent
                for child in element:
                    self._process_element(child, parent)
                # Process tail text
                if element.tail:
                    if self.config.preserve_whitespace:
                        text_node = TextNode(content=element.tail)
                        parent.add_child(text_node)
                    else:
                        if element.tail.strip():
                            text_node = TextNode(content=element.tail.strip())
                            parent.add_child(text_node)
            # Exit XBRL context
            if element.tag.startswith('{'):
                self._exit_xbrl_context(element)
            return node
        finally:
            self.context.depth -= 1

    def _create_node_for_element(self, element: HtmlElement, style: Style) -> Optional[Node]:
        """
        Create appropriate node for HTML element.

        Dispatch order matters: explicit heading tags, then p/li, then
        heuristic header detection, then the remaining tag-specific cases,
        falling back to a generic ContainerNode.
        """
        tag = element.tag.lower() if not element.tag.startswith('{') else element.tag
        # Check for heading
        if tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
            level = int(tag[1])
            text = self._get_element_text(element)
            if text:
                return HeadingNode(content=text, level=level, style=style)
        # Handle specific elements first before header detection
        if tag == 'p':
            return ParagraphNode(style=style)
        elif tag == 'li':
            return ListItemNode(style=style)
        # Check if element might be a heading based on style/content
        # Skip header detection for certain tags that should never be headers
        skip_header_detection_tags = {
            'li', 'td', 'th', 'option', 'a', 'button', 'label',
            # IXBRL inline elements - should not be treated as headers
            'ix:nonfraction', 'ix:footnote', 'ix:fraction',
            # IXBRL elements that can contain tables and complex content
            'ix:nonNumeric', 'ix:continuation'
        }
        if tag not in skip_header_detection_tags and self.strategies.get('header_detection'):
            header_info = self.strategies['header_detection'].detect(element, self.context)
            if header_info and header_info.confidence > self.config.header_detection_threshold:
                text = self._get_element_text(element)
                if text:
                    node = HeadingNode(
                        content=text,
                        level=header_info.level,
                        style=style
                    )
                    # Add header metadata
                    node.set_metadata('detection_method', header_info.detection_method)
                    node.set_metadata('confidence', header_info.confidence)
                    if header_info.is_item:
                        node.semantic_type = SemanticType.ITEM_HEADER
                        node.set_metadata('item_number', header_info.item_number)
                    return node
        # Continue handling other specific elements
        if tag == 'table':
            if self.strategies.get('table_processing'):
                return self.strategies['table_processing'].process(element)
            else:
                return self._process_table_basic(element, style)
        elif tag in ['ul', 'ol']:
            return ListNode(ordered=(tag == 'ol'), style=style)
        elif tag == 'li':
            # NOTE(review): unreachable — 'li' is already handled before
            # header detection above.
            return ListItemNode(style=style)
        elif tag == 'a':
            href = element.get('href', '')
            title = element.get('title', '')
            text = self._get_element_text(element)
            return LinkNode(content=text, href=href, title=title, style=style)
        elif tag == 'img':
            return ImageNode(
                src=element.get('src'),
                alt=element.get('alt'),
                width=self._parse_dimension(element.get('width')),
                height=self._parse_dimension(element.get('height')),
                style=style
            )
        elif tag == 'br':
            # Line break - add as text node
            return TextNode(content='\n')
        elif tag in ['section', 'article']:
            return SectionNode(style=style)
        elif tag == 'div' or tag in self.BLOCK_ELEMENTS:
            # Check if CSS display property makes this inline
            if style.display in ['inline', 'inline-block']:
                # Treat as inline element despite being a div
                text = self._get_element_text(element)
                if text:
                    text_node = TextNode(content=text, style=style)
                    text_node.set_metadata('original_tag', tag)
                    text_node.set_metadata('inline_via_css', True)
                    return text_node
                # If no text but inline, still process children inline
                return ContainerNode(tag_name=tag, style=style)
            # Normal block behavior
            # Check if this is just a text container with only inline elements
            if self._is_text_only_container(element):
                # Create ParagraphNode for divs containing only inline elements
                # This ensures proper text concatenation for spans, etc.
                return ParagraphNode(style=style)
            else:
                return ContainerNode(tag_name=tag, style=style)
        elif tag in self.INLINE_ELEMENTS:
            # Inline elements - extract text and add to parent
            text = self._get_element_text(element)
            if text:
                text_node = TextNode(content=text, style=style)
                # Preserve inline element metadata
                text_node.set_metadata('original_tag', tag)
                return text_node
        elif tag in ['ix:nonNumeric', 'ix:continuation']:
            # IXBRL elements that can contain complex content including tables
            # Process as container to allow proper table parsing
            # NOTE(review): `tag` was lower-cased above for non-namespaced
            # elements, so the mixed-case 'ix:nonNumeric' may never match
            # here — confirm against lxml's tag normalization.
            return ContainerNode(tag_name=tag, style=style)
        # Default: create container for unknown elements
        return ContainerNode(tag_name=tag, style=style)

    def _is_page_number_container(self, element: HtmlElement) -> bool:
        """Detect and filter page number containers across various SEC filing patterns."""
        import re  # NOTE(review): unused in this method
        # Get text content first - all page numbers should be short
        text_content = element.text_content().strip()
        # Must be short content (1-8 chars to handle "Page X" format)
        if len(text_content) > 8 or len(text_content) == 0:
            return False
        # Must be numeric, roman numerals, or "Page X" format
        if not self._is_page_number_content(text_content):
            return False
        # Check various patterns based on element type and styling
        tag = element.tag.lower()
        # Pattern 1: Oracle-style flexbox containers (highest confidence)
        if tag == 'div' and self._is_flexbox_page_number(element):
            return True
        # Pattern 2: Center/right aligned paragraphs (common pattern)
        if tag == 'p' and self._is_aligned_page_number(element):
            return True
        # Pattern 3: Footer-style divs with centered page numbers
        if tag == 'div' and self._is_footer_page_number(element):
            return True
        # Pattern 4: Simple divs with page break context
        if tag == 'div' and self._is_page_break_context(element):
            return True
        return False

    def _is_page_number_content(self, text: str) -> bool:
        """Check if text content looks like a page number."""
        import re
        # Simple numeric (most common)
        if text.isdigit():
            return True
        # Roman numerals
        if re.match(r'^[ivxlcdm]+$', text.lower()):
            return True
        # "Page X" or "Page X of Y" format
        if re.match(r'^page\s+\d+(\s+of\s+\d+)?$', text.lower()):
            return True
        return False

    def _is_flexbox_page_number(self, element: HtmlElement) -> bool:
        """Detect Oracle-style flexbox page number containers."""
        import re
        style_attr = element.get('style', '')
        if not style_attr:
            return False
        # Must have: display:flex, justify-content:flex-end, min-height:1in
        required_patterns = [
            r'display:\s*flex',
            r'justify-content:\s*flex-end',
            r'min-height:\s*1in'
        ]
        return all(re.search(pattern, style_attr) for pattern in required_patterns)

    def _is_aligned_page_number(self, element: HtmlElement) -> bool:
        """Detect center or right-aligned page number paragraphs."""
        import re
        style_attr = element.get('style', '')
        # Check for center or right alignment
        alignment_pattern = r'text-align:\s*(center|right)'
        if not re.search(alignment_pattern, style_attr):
            return False
        # Optional: check for smaller font size (common in page numbers)
        font_size_pattern = r'font-size:\s*([0-9]+)pt'
        font_match = re.search(font_size_pattern, style_attr)
        if font_match:
            font_size = int(font_match.group(1))
            # Page numbers often use smaller fonts (8-12pt)
            if font_size <= 12:
                return True
        # NOTE(review): this returns True for ANY aligned short content,
        # making the font-size branch above redundant — confirm intent.
        return True  # Any center/right aligned short content

    def _is_footer_page_number(self, element: HtmlElement) -> bool:
        """Detect footer-style page number containers."""
        import re
        style_attr = element.get('style', '')
        # Look for bottom positioning or footer-like styling
        footer_patterns = [
            r'bottom:\s*[0-9]',
            r'position:\s*absolute',
            r'margin-bottom:\s*0',
            r'text-align:\s*center'
        ]
        # Need at least 2 footer indicators
        matches = sum(1 for pattern in footer_patterns if re.search(pattern, style_attr))
        return matches >= 2

    def _is_page_break_context(self, element: HtmlElement) -> bool:
        """Check if element is near page breaks (common page number context)."""
        # Check next sibling for page break HR
        next_elem = element.getnext()
        if next_elem is not None and next_elem.tag == 'hr':
            hr_style = next_elem.get('style', '')
            if 'page-break' in hr_style:
                return True
        # Check if element has page-break styling itself
        style_attr = element.get('style', '')
        if 'page-break' in style_attr:
            return True
        return False

    def _is_page_break_element(self, element: HtmlElement) -> bool:
        """Detect page break HR elements."""
        if element.tag.lower() != 'hr':
            return False
        style_attr = element.get('style', '')
        # Check for page-break-after:always or similar page break styles
        return 'page-break' in style_attr

    def _is_page_navigation_container(self, element: HtmlElement) -> bool:
        """Detect navigation containers that appear after page breaks."""
        if element.tag.lower() != 'div':
            return False
        style_attr = element.get('style', '')
        # Check for navigation container patterns
        # Often have: padding-top, min-height:1in, box-sizing:border-box
        nav_indicators = [
            r'padding-top:\s*0\.5in',
            r'min-height:\s*1in',
            r'box-sizing:\s*border-box'
        ]
        import re
        matches = sum(1 for pattern in nav_indicators if re.search(pattern, style_attr))
        # Need at least 2 indicators
        if matches < 2:
            return False
        # Check if it contains typical navigation content
        text_content = element.text_content().strip().lower()
        # Common navigation phrases
        nav_phrases = [
            'table of contents',
            'index to financial statements',
            'table of content',
            'index to financial statement'
        ]
        return any(phrase in text_content for phrase in nav_phrases)

    def _extract_style(self, element: HtmlElement) -> Style:
        """Extract style from element, merging tag-implied styles on top."""
        style_str = element.get('style', '')
        style = self.style_parser.parse(style_str)
        # Add tag-specific styles
        tag = element.tag.lower()
        if tag == 'b' or tag == 'strong':
            style.font_weight = 'bold'
        elif tag == 'i' or tag == 'em':
            style.font_style = 'italic'
        elif tag == 'u':
            style.text_decoration = 'underline'
        # Handle alignment
        align = element.get('align')
        if align:
            style.text_align = align
        return style

    def _get_element_text(self, element: HtmlElement) -> str:
        """
        Get text content from element.

        For inline elements leading/trailing whitespace is preserved;
        for heading and other elements each part is stripped and parts
        are joined with spaces.
        """
        text_parts = []
        # Get element's direct text
        if element.text:
            # For inline elements, preserve leading/trailing whitespace
            if element.tag.lower() in self.INLINE_ELEMENTS:
                text_parts.append(element.text)
            else:
                text_parts.append(element.text.strip())
        # For simple elements, get all text content
        if element.tag.lower() in self.INLINE_ELEMENTS or \
           element.tag.lower() in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
            # Get all text including from child elements
            for child in element:
                if child.tag.lower() not in self.SKIP_ELEMENTS:
                    child_text = child.text_content()
                    if child_text:
                        # For inline elements, preserve whitespace in child content too
                        if element.tag.lower() in self.INLINE_ELEMENTS:
                            text_parts.append(child_text)
                        else:
                            text_parts.append(child_text.strip())
        # For inline elements with preserved whitespace, concatenate directly
        # For others, join with spaces
        if element.tag.lower() in self.INLINE_ELEMENTS and len(text_parts) == 1:
            return text_parts[0] if text_parts else ''
        else:
            return ' '.join(text_parts)

    def _is_text_only_container(self, element: HtmlElement) -> bool:
        """Check if element contains only text and inline elements."""
        for child in element:
            if child.tag.lower() in self.BLOCK_ELEMENTS:
                return False
            if child.tag.lower() == 'table':
                return False
        return True

    def _should_process_children(self, element: HtmlElement, node: Node) -> bool:
        """Determine if children should be processed."""
        # Don't process children for certain node types
        # (TextNode/HeadingNode already absorbed their text content)
        if isinstance(node, (TextNode, HeadingNode)):
            return False
        # Tables are processed separately
        if isinstance(node, TableNode):
            return False
        return True

    def _process_table_basic(self, element: HtmlElement, style: Style) -> TableNode:
        """Basic table processing without advanced strategy."""
        table = TableNode(style=style)
        # Set config for rendering decisions
        table._config = self.config
        # Extract caption
        caption_elem = element.find('.//caption')
        if caption_elem is not None:
            table.caption = caption_elem.text_content().strip()
        # Process rows
        for tr in element.findall('.//tr'):
            cells = []
            for td in tr.findall('.//td') + tr.findall('.//th'):
                cell = Cell(
                    content=td.text_content().strip(),
                    colspan=int(td.get('colspan', '1')),
                    rowspan=int(td.get('rowspan', '1')),
                    is_header=(td.tag == 'th'),
                    align=td.get('align')
                )
                cells.append(cell)
            if cells:
                row = Row(cells=cells, is_header=(tr.find('.//th') is not None))
                # Determine if header or data row
                # NOTE(review): headers receive the raw cell list while body
                # rows are wrapped in Row — confirm consumers expect this.
                if tr.getparent().tag == 'thead' or row.is_header:
                    table.headers.append(cells)
                else:
                    table.rows.append(row)
        return table

    def _parse_dimension(self, value: Optional[str]) -> Optional[int]:
        """Parse dimension value (width/height) to int pixels, or None."""
        if not value:
            return None
        # Remove 'px' suffix if present
        # NOTE(review): rstrip('px') strips any trailing run of 'p'/'x'
        # characters, not just a literal "px" suffix.
        value = value.strip().rstrip('px')
        try:
            return int(value)
        except ValueError:
            return None

    def _enter_xbrl_context(self, element: HtmlElement):
        """Enter XBRL context: push extracted context metadata, if any."""
        if self.config.extract_xbrl and self.strategies.get('xbrl_extraction'):
            xbrl_data = self.strategies['xbrl_extraction'].extract_context(element)
            if xbrl_data:
                self.xbrl_context_stack.append(xbrl_data)

    def _exit_xbrl_context(self, element: HtmlElement):
        """Exit XBRL context.

        NOTE(review): pops unconditionally for any namespaced element, while
        _enter_xbrl_context pushes only when extraction yields data — confirm
        push/pop stay balanced.
        """
        if self.xbrl_context_stack:
            self.xbrl_context_stack.pop()

    def _get_current_xbrl_metadata(self) -> Dict[str, Any]:
        """Get current XBRL metadata (inner contexts override outer ones)."""
        if not self.xbrl_context_stack:
            return {}
        # Merge all contexts in stack
        metadata = {}
        for context in self.xbrl_context_stack:
            metadata.update(context)
        return metadata

    def _merge_adjacent_nodes(self, root: Node):
        """Merge adjacent text nodes with similar styles."""
        # Implementation would recursively merge adjacent text nodes
        # This is a placeholder for the actual implementation
        pass

View File

@@ -0,0 +1,450 @@
"""
Multi-strategy header detection for document structure.
"""
import re
from abc import ABC, abstractmethod
from typing import Optional, List, Dict
from lxml.html import HtmlElement
from edgar.documents.config import ParserConfig
from edgar.documents.types import HeaderInfo, ParseContext
class HeaderDetector(ABC):
    """
    Abstract base class for header detectors.

    Each concrete detector inspects an element plus the parse context and
    returns a HeaderInfo (level, confidence, detection method) when the
    element looks like a header, or None otherwise.
    """

    @abstractmethod
    def detect(self, element: HtmlElement, context: ParseContext) -> Optional[HeaderInfo]:
        """Detect if element is a header; return HeaderInfo or None."""
        pass

    @property
    @abstractmethod
    def name(self) -> str:
        """Detector name, passed to HeaderInfo.from_text as the detection method."""
        pass
class StyleBasedDetector(HeaderDetector):
    """Detect headers from visual styling: font size, weight, alignment, margins."""

    # (size ratio threshold, score bonus, implied heading level), best first.
    _SIZE_RULES = ((2.0, 0.8, 1), (1.5, 0.7, 2), (1.2, 0.5, 3), (1.1, 0.3, 4))

    @property
    def name(self) -> str:
        return "style"

    def detect(self, element: HtmlElement, context: ParseContext) -> Optional[HeaderInfo]:
        """Score an element's computed style; return HeaderInfo above 0.4."""
        style = context.get_current_style()
        # Nothing to score without style information.
        if not style:
            return None

        text = element.text_content().strip()
        if not text or len(text) > 200:  # very long text is never a header
            return None

        score = 0.0
        level = 3  # default level

        # Relative font size is the strongest visual cue.
        if style.font_size and context.base_font_size:
            ratio = style.font_size / context.base_font_size
            for threshold, bonus, implied_level in self._SIZE_RULES:
                if ratio >= threshold:
                    score += bonus
                    level = implied_level
                    break

        # Bold text counts, and promotes a default-level heading.
        if style.is_bold:
            score += 0.3
            if level == 3:
                level = 2
        if style.is_centered:
            score += 0.2
        # Short all-caps lines are header-like.
        if text.isupper() and len(text.split()) <= 10:
            score += 0.2
        # Headers often carry larger margins.
        if style.margin_top and style.margin_top > 20:
            score += 0.1
        if style.margin_bottom and style.margin_bottom > 10:
            score += 0.1

        score = min(score, 1.0)
        if score > 0.4:  # threshold for style-based detection
            return HeaderInfo.from_text(text, level, score, self.name)
        return None
class PatternBasedDetector(HeaderDetector):
    """Detect headers based on text patterns (first matching pattern wins)."""

    # Common header patterns in SEC filings: (regex, level, base confidence).
    # Order matters — more specific patterns must come before generic ones.
    HEADER_PATTERNS = [
        # Item patterns
        (r'^(Item|ITEM)\s+(\d+[A-Z]?)[.\s]+(.+)$', 1, 0.95),
        (r'^Part\s+[IVX]+[.\s]*$', 1, 0.9),
        (r'^PART\s+[IVX]+[.\s]*$', 1, 0.9),
        # Section patterns
        (r'^(BUSINESS|RISK FACTORS|PROPERTIES|LEGAL PROCEEDINGS)$', 2, 0.85),
        (r'^(Management\'?s?\s+Discussion|MD&A)', 2, 0.85),
        (r'^(Financial\s+Statements|Consolidated\s+Financial\s+Statements)$', 2, 0.85),
        # Numbered sections
        (r'^\d+\.\s+[A-Z][A-Za-z\s]+$', 3, 0.7),
        (r'^[A-Z]\.\s+[A-Z][A-Za-z\s]+$', 3, 0.7),
        (r'^\([a-z]\)\s+[A-Z][A-Za-z\s]+$', 4, 0.6),
        # Title case headers
        (r'^[A-Z][A-Za-z\s]+[A-Za-z]$', 3, 0.5),
        # All caps headers
        (r'^[A-Z\s]+$', 3, 0.6),
    ]

    @property
    def name(self) -> str:
        return "pattern"

    def detect(self, element: HtmlElement, context: ParseContext) -> Optional[HeaderInfo]:
        """Detect headers based on text patterns."""
        text = element.text_content().strip()
        # Skip empty or very long text
        if not text or len(text) > 200:
            return None
        # Skip single punctuation - never headers
        if len(text) == 1 and text in '.,!?;:()[]{}':
            return None
        # Skip if text contains multiple sentences (likely paragraph)
        if text.count('.') > 2:
            return None
        # Check against patterns
        for pattern, level, base_confidence in self.HEADER_PATTERNS:
            match = re.match(pattern, text, re.IGNORECASE)
            if match:
                # Adjust confidence based on context
                confidence = base_confidence
                # Boost confidence if element is alone in parent
                # (len() of an lxml element counts its children)
                if len(element.getparent()) == 1:
                    confidence += 0.1
                # Boost confidence if followed by substantial text
                next_elem = element.getnext()
                if next_elem is not None and len(next_elem.text_content()) > 100:
                    confidence += 0.1
                confidence = min(confidence, 1.0)
                return HeaderInfo.from_text(text, level, confidence, self.name)
        return None
class StructuralDetector(HeaderDetector):
    """Infer headers from DOM structure rather than text or styling."""

    @property
    def name(self) -> str:
        return "structural"

    def detect(self, element: HtmlElement, context: ParseContext) -> Optional[HeaderInfo]:
        """Score structural cues (tag, parent container, siblings) as header evidence."""
        text = element.text_content().strip()
        if not text or len(text) > 200:  # empty or very long: never a header
            return None
        if len(text) == 1 and text in '.,!?;:()[]{}':  # lone punctuation
            return None

        tag = element.tag.lower()
        # A real heading tag is definitive: full confidence, level from tag.
        if tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
            return HeaderInfo.from_text(text, int(tag[1]), 1.0, self.name)

        score = 0.0
        level = 3

        # Parent structure: header-like containers and isolation.
        parent = element.getparent()
        if parent is not None:
            if parent.tag.lower() in ['header', 'thead', 'caption']:
                score += 0.6
                level = 2
            if len(parent) <= 3:  # few siblings -> isolated element
                score += 0.3
            if parent.get('align') == 'center':
                score += 0.2

        # Element's own properties.
        if tag in ['strong', 'b']:
            score += 0.3
        if element.get('align') == 'center':
            score += 0.2

        # Followed by block content suggests a section opener.
        sibling = element.getnext()
        if sibling is not None and sibling.tag.lower() in ['p', 'div', 'table', 'ul', 'ol']:
            score += 0.2

        # Short text is header-like.
        if 1 <= len(text.split()) <= 10:
            score += 0.1

        score = min(score, 1.0)
        if score > 0.5:
            return HeaderInfo.from_text(text, level, score, self.name)
        return None
class ContextualDetector(HeaderDetector):
    """Detect headers based on surrounding context.

    Unlike the style/structure detectors, this one scores an element by how
    it relates to its neighbors: header-like previous siblings, longer
    following content, and position early in the document.
    """
    @property
    def name(self) -> str:
        return "contextual"
    def detect(self, element: HtmlElement, context: ParseContext) -> Optional[HeaderInfo]:
        """Detect headers based on contextual clues.

        Returns HeaderInfo when accumulated confidence exceeds 0.5,
        otherwise None.
        """
        text = element.text_content().strip()
        # Skip empty or very long text
        if not text or len(text) > 200:
            return None
        # Skip single punctuation - never headers
        if len(text) == 1 and text in '.,!?;:()[]{}':
            return None
        confidence = 0.0
        level = 3  # default heading level when no contextual hint adjusts it
        # Check if text looks like a header
        if self._looks_like_header(text):
            confidence += 0.4
        # Check relationship to previous content
        prev_elem = element.getprevious()
        if prev_elem is not None:
            prev_text = prev_elem.text_content().strip()
            # Check if previous was also a header (section hierarchy)
            if prev_text and self._looks_like_header(prev_text):
                confidence += 0.3
                # Adjust level based on comparison:
                # longer text after a shorter header is treated as a
                # higher-level (level 2) section heading.
                if len(text) > len(prev_text):
                    level = 2
                else:
                    level = 3
        # Check relationship to next content
        next_elem = element.getnext()
        if next_elem is not None:
            next_text = next_elem.text_content().strip()
            # Headers are often followed by longer content (3x length here)
            if len(next_text) > len(text) * 3:
                confidence += 0.3
            # Check if next element is indented or styled differently
            next_style = next_elem.get('style', '')
            if 'margin-left' in next_style or 'padding-left' in next_style:
                confidence += 0.2
        # Check position in document
        if context.current_section is None and context.depth < 5:
            # Early in document, more likely to be header
            confidence += 0.2
        # Normalize confidence
        confidence = min(confidence, 1.0)
        if confidence > 0.5:
            return HeaderInfo.from_text(text, level, confidence, self.name)
        return None
    def _looks_like_header(self, text: str) -> bool:
        """Check if text looks like a header.

        Heuristic: short (<= 15 words), no sentence-ending punctuation
        (colon is allowed), and capitalized in some form. Note that any
        text starting with an uppercase letter passes the final check.
        """
        # Short text
        if len(text.split()) > 15:
            return False
        # No ending punctuation (except colon)
        if text.rstrip().endswith(('.', '!', '?', ';')):
            return False
        # Title case or all caps
        if text.istitle() or text.isupper():
            return True
        # Starts with capital letter
        if text and text[0].isupper():
            return True
        return False
class HeaderDetectionStrategy:
    """
    Multi-strategy header detection.
    Combines multiple detection methods with weighted voting.

    Each registered HeaderDetector votes independently; single-detector
    results are gated by config.header_detection_threshold, while multiple
    agreeing detectors are merged via a weighted average.
    """
    def __init__(self, config: ParserConfig):
        """Initialize with configuration."""
        self.config = config
        self.detectors = self._init_detectors()
    def _init_detectors(self) -> List[HeaderDetector]:
        """Initialize enabled detectors."""
        detectors = []
        # Always include basic detectors
        detectors.extend([
            StyleBasedDetector(),
            PatternBasedDetector(),
            StructuralDetector(),
            ContextualDetector()
        ])
        # Add ML detector if enabled
        if self.config.features.get('ml_header_detection'):
            # Would add MLBasedDetector here
            pass
        return detectors
    def detect(self, element: HtmlElement, context: ParseContext) -> Optional[HeaderInfo]:
        """
        Detect if element is a header using multiple strategies.
        Args:
            element: HTML element to check
            context: Current parsing context
        Returns:
            HeaderInfo if element is detected as header, None otherwise
        """
        # Skip if element has no text
        text = element.text_content().strip()
        if not text:
            return None
        # Collect results from all detectors
        results: List[HeaderInfo] = []
        for detector in self.detectors:
            try:
                result = detector.detect(element, context)
                if result:
                    results.append(result)
            except Exception:
                # Don't let one detector failure stop others
                continue
        if not results:
            return None
        # If only one detector fired, use its result if confident enough
        if len(results) == 1:
            if results[0].confidence >= self.config.header_detection_threshold:
                return results[0]
            return None
        # Multiple detectors - combine results
        # NOTE(review): the combined result below is returned without
        # re-checking header_detection_threshold, unlike the single-detector
        # path above — confirm this asymmetry is intentional.
        return self._combine_results(results, text)
    def _combine_results(self, results: List[HeaderInfo], text: str) -> HeaderInfo:
        """Combine multiple detection results.

        Computes a weighted-average confidence, picks the heading level by
        weighted vote, and merges item metadata (first non-None item_number
        wins, any detector flagging is_item sets it).
        """
        # Weight different detectors
        detector_weights = {
            'style': 0.3,
            'pattern': 0.4,
            'structural': 0.2,
            'contextual': 0.1,
            'ml': 0.5  # Would be highest if available
        }
        # Calculate weighted confidence
        total_confidence = 0.0
        total_weight = 0.0
        # Group by level
        level_votes: Dict[int, float] = {}
        for result in results:
            # Unknown detection methods fall back to a small default weight
            weight = detector_weights.get(result.detection_method, 0.1)
            total_confidence += result.confidence * weight
            total_weight += weight
            # Vote for level
            if result.level not in level_votes:
                level_votes[result.level] = 0.0
            level_votes[result.level] += result.confidence * weight
        # Normalize confidence
        final_confidence = total_confidence / total_weight if total_weight > 0 else 0.0
        # Choose most voted level
        final_level = max(level_votes.items(), key=lambda x: x[1])[0]
        # Check if any detector found this is an item
        is_item = any(r.is_item for r in results)
        item_number = next((r.item_number for r in results if r.item_number), None)
        return HeaderInfo(
            level=final_level,
            confidence=final_confidence,
            text=text,
            detection_method='combined',
            is_item=is_item,
            item_number=item_number
        )

View File

@@ -0,0 +1,344 @@
"""
CSS style parser for HTML elements.
"""
import re
from typing import Dict, Optional, Tuple, Union
from edgar.documents.types import Style
from edgar.documents.utils import get_cache_manager
class StyleParser:
    """
    Parser for CSS style attributes.

    Handles inline styles and converts them to Style objects. Parsed
    results are cached by the raw style string, since filings repeat
    identical inline styles many times.
    """
    # Common CSS units (used to reject length tokens when scanning for colors)
    ABSOLUTE_UNITS = {'px', 'pt', 'pc', 'cm', 'mm', 'in'}
    RELATIVE_UNITS = {'em', 'rem', 'ex', 'ch', 'vw', 'vh', '%'}
    # Font weight keyword -> numeric value mappings
    FONT_WEIGHT_MAP = {
        'normal': '400',
        'bold': '700',
        'bolder': '800',
        'lighter': '300'
    }
    def __init__(self):
        """Initialize style parser with cache."""
        self._cache = get_cache_manager().style_cache
    def parse(self, style_string: str) -> Style:
        """
        Parse CSS style string into Style object.
        Args:
            style_string: CSS style string (e.g., "font-size: 14px; color: red")
        Returns:
            Parsed Style object
        """
        if not style_string:
            return Style()
        # Check cache first
        cached_style = self._cache.get(style_string)
        if cached_style is not None:
            return cached_style
        # Parse style
        style = Style()
        # Split into individual declarations
        declarations = self._split_declarations(style_string)
        for prop, value in declarations.items():
            self._apply_property(style, prop, value)
        # Cache result
        self._cache.put(style_string, style)
        return style
    def _split_declarations(self, style_string: str) -> Dict[str, str]:
        """Split style string into property-value pairs.

        Properties are lower-cased; empty declarations are skipped.
        Later duplicates overwrite earlier ones (CSS cascade order).
        """
        declarations = {}
        # Split by semicolon, handling potential issues
        parts = style_string.split(';')
        for part in parts:
            part = part.strip()
            if not part:
                continue
            # Split property and value at the first colon only, so values
            # like url(data:...) keep their own colons intact
            if ':' in part:
                prop, value = part.split(':', 1)
                prop = prop.strip().lower()
                value = value.strip()
                if prop and value:
                    declarations[prop] = value
        return declarations
    def _apply_property(self, style: Style, prop: str, value: str):
        """Apply a single CSS property to the Style object.

        Unknown properties are silently ignored.
        """
        # Font properties
        if prop == 'font-size':
            size = self._parse_length(value)
            if size is not None:
                style.font_size = size
        elif prop == 'font-weight':
            style.font_weight = self._normalize_font_weight(value)
        elif prop == 'font-style':
            if value in ['italic', 'oblique']:
                style.font_style = 'italic'
            elif value == 'normal':
                style.font_style = 'normal'
        # Text properties
        elif prop == 'text-align':
            if value in ['left', 'right', 'center', 'justify']:
                style.text_align = value
        elif prop == 'text-decoration':
            style.text_decoration = value
        # Color properties
        elif prop == 'color':
            style.color = self._normalize_color(value)
        elif prop in ['background-color', 'background']:
            color = self._extract_background_color(value)
            if color:
                style.background_color = color
        # Spacing properties
        elif prop == 'margin':
            self._parse_box_property(style, 'margin', value)
        elif prop == 'margin-top':
            margin = self._parse_length(value)
            if margin is not None:
                style.margin_top = margin
        elif prop == 'margin-bottom':
            margin = self._parse_length(value)
            if margin is not None:
                style.margin_bottom = margin
        elif prop == 'margin-left':
            margin = self._parse_length(value)
            if margin is not None:
                style.margin_left = margin
        elif prop == 'margin-right':
            margin = self._parse_length(value)
            if margin is not None:
                style.margin_right = margin
        elif prop == 'padding':
            self._parse_box_property(style, 'padding', value)
        elif prop == 'padding-top':
            padding = self._parse_length(value)
            if padding is not None:
                style.padding_top = padding
        elif prop == 'padding-bottom':
            padding = self._parse_length(value)
            if padding is not None:
                style.padding_bottom = padding
        elif prop == 'padding-left':
            padding = self._parse_length(value)
            if padding is not None:
                style.padding_left = padding
        elif prop == 'padding-right':
            padding = self._parse_length(value)
            if padding is not None:
                style.padding_right = padding
        # Display properties
        elif prop == 'display':
            style.display = value
        # Size properties
        elif prop == 'width':
            style.width = self._parse_dimension(value)
        elif prop == 'height':
            style.height = self._parse_dimension(value)
        # Line height
        elif prop == 'line-height':
            line_height = self._parse_line_height(value)
            if line_height is not None:
                style.line_height = line_height
    def _parse_length(self, value: str) -> Optional[float]:
        """Parse a CSS length value to pixels.

        Returns None for values that cannot be resolved without layout
        context (percentages, viewport units, 'auto', etc.).
        """
        value = value.strip().lower()
        # Handle special values
        if value in ['0', 'auto', 'inherit', 'initial']:
            return 0.0 if value == '0' else None
        # Extract number and unit
        match = re.match(r'^(-?\d*\.?\d+)\s*([a-z%]*)$', value)
        if not match:
            return None
        num_str, unit = match.groups()
        try:
            num = float(num_str)
        except ValueError:
            return None
        # Convert to pixels (CSS reference: 96px per inch, 72pt per inch)
        if not unit or unit == 'px':
            return num
        elif unit == 'pt':
            return num * 1.333  # 1pt = 1.333px
        elif unit == 'pc':
            return num * 16  # 1pc = 12pt = 16px
        elif unit == 'em':
            return num * 16  # Assume 16px base
        elif unit == 'rem':
            return num * 16  # Assume 16px root
        elif unit == '%':
            return None  # Can't convert percentage without context
        elif unit == 'in':
            return num * 96  # 1in = 96px
        elif unit == 'cm':
            return num * 37.8  # 1cm = 37.8px
        elif unit == 'mm':
            return num * 3.78  # 1mm = 3.78px
        return None
    def _parse_dimension(self, value: str) -> Optional[Union[float, str]]:
        """Parse dimension value (width/height).

        Percentages are returned verbatim as strings; other lengths are
        converted to pixels via _parse_length.
        """
        value = value.strip()
        # Check for percentage
        if value.endswith('%'):
            return value  # Return as string
        # Try to parse as length
        length = self._parse_length(value)
        return length
    def _parse_line_height(self, value: str) -> Optional[float]:
        """Parse line-height value (unitless multiplier or length)."""
        value = value.strip()
        # Unitless number (multiplier)
        try:
            return float(value)
        except ValueError:
            pass
        # Try as length
        return self._parse_length(value)
    def _normalize_font_weight(self, value: str) -> str:
        """Normalize font weight to a numeric string when possible.

        Keywords map via FONT_WEIGHT_MAP; valid numeric weights (100-900)
        and anything unrecognized pass through unchanged.
        """
        value = value.strip().lower()
        # Map keywords to numeric values
        if value in self.FONT_WEIGHT_MAP:
            return self.FONT_WEIGHT_MAP[value]
        # Check if it's already numeric
        if value.isdigit() and 100 <= int(value) <= 900:
            return value
        return value
    def _normalize_color(self, value: str) -> str:
        """Normalize a color value (lower-case; expand 3-digit hex)."""
        value = value.strip().lower()
        # Handle rgb/rgba
        if value.startswith(('rgb(', 'rgba(')):
            return value
        # Handle hex colors
        if value.startswith('#'):
            # Expand 3-char hex to 6-char (#abc -> #aabbcc)
            if len(value) == 4:
                return '#' + ''.join(c*2 for c in value[1:])
            return value
        # Return named colors as-is
        return value
    def _extract_background_color(self, value: str) -> Optional[str]:
        """Extract the color component from a background shorthand value.

        rgb()/rgba() functions are matched as a whole before whitespace
        splitting, because their argument lists contain spaces (the previous
        split-first approach truncated "rgb(255, 0, 0)" to "rgb(255,").
        """
        func_match = re.search(r'rgba?\([^)]*\)', value, re.IGNORECASE)
        if func_match:
            return self._normalize_color(func_match.group(0))
        parts = value.split()
        for part in parts:
            if part.startswith('#'):
                return self._normalize_color(part)
            # Named-color heuristic: any token without a length unit substring.
            # Simple extraction - could be enhanced
            if not any(unit in part for unit in self.ABSOLUTE_UNITS | self.RELATIVE_UNITS):
                return part
        return None
    def _parse_box_property(self, style: Style, prop_type: str, value: str):
        """Parse box property (margin/padding) with multiple values.

        Follows the CSS box-model shorthand: 1 value = all sides,
        2 = vertical/horizontal, 3 = top/horizontal/bottom, 4 = TRBL.
        """
        parts = value.split()
        if not parts:
            return
        # Convert all parts to lengths; unresolvable parts are dropped
        lengths = []
        for part in parts:
            length = self._parse_length(part)
            if length is not None:
                lengths.append(length)
        if not lengths:
            return
        # Apply based on number of values (CSS box model)
        if len(lengths) == 1:
            # All sides
            val = lengths[0]
            setattr(style, f'{prop_type}_top', val)
            setattr(style, f'{prop_type}_right', val)
            setattr(style, f'{prop_type}_bottom', val)
            setattr(style, f'{prop_type}_left', val)
        elif len(lengths) == 2:
            # Vertical, horizontal
            vert, horiz = lengths
            setattr(style, f'{prop_type}_top', vert)
            setattr(style, f'{prop_type}_bottom', vert)
            setattr(style, f'{prop_type}_left', horiz)
            setattr(style, f'{prop_type}_right', horiz)
        elif len(lengths) == 3:
            # Top, horizontal, bottom
            top, horiz, bottom = lengths
            setattr(style, f'{prop_type}_top', top)
            setattr(style, f'{prop_type}_bottom', bottom)
            setattr(style, f'{prop_type}_left', horiz)
            setattr(style, f'{prop_type}_right', horiz)
        elif len(lengths) >= 4:
            # Top, right, bottom, left (extra values ignored)
            setattr(style, f'{prop_type}_top', lengths[0])
            setattr(style, f'{prop_type}_right', lengths[1])
            setattr(style, f'{prop_type}_bottom', lengths[2])
            setattr(style, f'{prop_type}_left', lengths[3])
    def merge_styles(self, base: Style, override: Style) -> Style:
        """
        Merge two styles with override taking precedence.
        Args:
            base: Base style
            override: Override style
        Returns:
            Merged style
        """
        return base.merge(override)

View File

@@ -0,0 +1,637 @@
"""
Advanced table processing strategy.
"""
import re
from functools import lru_cache
from typing import List, Optional
from lxml.html import HtmlElement
from edgar.documents.config import ParserConfig
from edgar.documents.strategies.style_parser import StyleParser
from edgar.documents.table_nodes import TableNode, Cell, Row
from edgar.documents.types import TableType
class TableProcessor:
    """
    Advanced table processing with type detection and structure analysis.

    Converts an HTML <table> element into a TableNode: extracts caption and
    metadata, classifies header vs. data rows (including multi-row financial
    headers), detects the table type (financial, metrics, TOC, ...), and
    optionally extracts simple structural relationships.
    """
    # HTML entities that need replacement
    ENTITY_REPLACEMENTS = {
        '&horbar;': '-----',
        '&mdash;': '-----',
        '&ndash;': '---',
        '&minus;': '-',
        '&hyphen;': '-',
        '&dash;': '-',
        '&nbsp;': ' ',
        '&amp;': '&',
        '&lt;': '<',
        '&gt;': '>',
        '&quot;': '"',
        '&apos;': "'",
        '&#8202;': ' ',
        '&#8203;': '',
        '&#x2014;': '-----',
        '&#x2013;': '---',
        '&#x2212;': '-',
    }
    # Financial keywords for table type detection
    FINANCIAL_KEYWORDS = {
        'revenue', 'income', 'expense', 'asset', 'liability',
        'cash', 'equity', 'profit', 'loss', 'margin',
        'earnings', 'cost', 'sales', 'operating', 'net',
        'gross', 'total', 'balance', 'statement', 'consolidated',
        'provision', 'tax', 'taxes', 'compensation', 'stock',
        'share', 'shares', 'rsu', 'option', 'grant', 'vest'
    }
    # Metrics keywords
    METRICS_KEYWORDS = {
        'ratio', 'percentage', 'percent', '%', 'rate',
        'growth', 'change', 'increase', 'decrease',
        'average', 'median', 'total', 'count', 'number'
    }
    def __init__(self, config: ParserConfig):
        """Initialize table processor."""
        self.config = config
        self.style_parser = StyleParser()
    def process(self, element: HtmlElement) -> TableNode:
        """
        Process table element into TableNode.
        Args:
            element: HTML table element
        Returns:
            Processed TableNode
        """
        # Extract table metadata
        table_id = element.get('id')
        table_class = element.get('class', '').split()
        table_style = self.style_parser.parse(element.get('style', ''))
        # Create table node
        table = TableNode(style=table_style)
        # Set config for rendering decisions
        table._config = self.config
        # Add metadata
        if table_id:
            table.set_metadata('id', table_id)
        if table_class:
            table.set_metadata('classes', table_class)
        # Extract caption
        caption_elem = element.find('.//caption')
        if caption_elem is not None:
            table.caption = self._extract_text(caption_elem)
        # Extract summary
        summary = element.get('summary')
        if summary:
            table.summary = summary
        # Process table structure
        self._process_table_structure(element, table)
        # Detect table type if configured
        if self.config.detect_table_types:
            table.table_type = self._detect_table_type(table)
        # Extract relationships if configured
        if self.config.extract_table_relationships:
            self._extract_relationships(table)
        return table
    def _process_table_structure(self, element: HtmlElement, table: TableNode):
        """Process table structure (thead, tbody, tfoot).

        Rows outside an explicit <thead> are heuristically classified as
        header rows until real data rows start, to support the multi-row
        headers common in SEC financial tables (e.g. "Year Ended" followed
        by a row of years and a "(in millions)" units row).
        """
        # Process thead
        thead = element.find('.//thead')
        if thead is not None:
            for tr in thead.findall('.//tr'):
                cells = self._process_row(tr, is_header=True)
                if cells:
                    table.headers.append(cells)
        # Process tbody (or direct rows)
        tbody = element.find('.//tbody')
        rows_container = tbody if tbody is not None else element
        # Track if we've seen headers and data rows
        headers_found = bool(table.headers)
        data_rows_started = False
        for tr in rows_container.findall('.//tr'):
            # Skip if already processed in thead
            if thead is not None and tr.getparent() == thead:
                continue
            # Check if this might be a header row
            is_header_row = False
            # Continue checking for headers if:
            # 1. We haven't found any headers yet, OR
            # 2. We've found headers but haven't seen data rows yet (multi-row headers)
            if not data_rows_started:
                is_header_row = self._is_header_row(tr)
                # Additional check for multi-row headers in financial tables
                # If the previous row was a header and this row has years or units,
                # it's likely part of the header
                if headers_found and not is_header_row:
                    row_text = tr.text_content().strip()
                    # Check for units like "(in millions)" or "(in thousands)"
                    if '(in millions)' in row_text or '(in thousands)' in row_text or '(in billions)' in row_text:
                        is_header_row = True
                    # Check for year rows that follow "Year Ended" headers
                    elif len(table.headers) > 0:
                        last_header_text = ' '.join(cell.text() for cell in table.headers[-1])
                        if 'year ended' in last_header_text.lower() or 'years ended' in last_header_text.lower():
                            # Check if this row has years
                            year_pattern = r'\b(19\d{2}|20\d{2})\b'
                            years_found = re.findall(year_pattern, row_text)
                            if years_found:
                                is_header_row = True
            cells = self._process_row(tr, is_header=is_header_row)
            if cells:
                if is_header_row:
                    table.headers.append(cells)
                    headers_found = True
                else:
                    # Only mark data_rows_started if this row has actual content
                    # Empty rows at the beginning shouldn't stop header detection
                    row = Row(cells=cells, is_header=False)
                    table.rows.append(row)
                    # Check if row has significant content that indicates data rows have started
                    # But be smart about it - descriptive rows like "(in millions)" or pure spacing
                    # shouldn't stop header detection
                    has_content = any(cell.text().strip() for cell in cells)
                    if has_content:
                        # Get the row text for smarter analysis
                        row_text = ' '.join(cell.text().strip() for cell in cells).strip()
                        row_text_lower = row_text.lower()
                        # Don't consider this as "data started" if it's likely a header-related row
                        is_header_related = (
                            # Unit descriptions
                            '(in millions)' in row_text_lower or
                            '(in thousands)' in row_text_lower or
                            '(in billions)' in row_text_lower or
                            'except per share' in row_text_lower or
                            # Financial period descriptions
                            'year ended' in row_text_lower or
                            'months ended' in row_text_lower or
                            # Mostly just spacing/formatting
                            len(row_text.strip()) < 5 or
                            # Contains years (might be misclassified header)
                            bool(re.search(r'\b(19\d{2}|20\d{2})\b', row_text))
                        )
                        # Only mark data_rows_started if this seems like actual data, not header-related
                        if not is_header_related:
                            data_rows_started = True
        # Process tfoot
        tfoot = element.find('.//tfoot')
        if tfoot is not None:
            for tr in tfoot.findall('.//tr'):
                cells = self._process_row(tr, is_header=False)
                if cells:
                    row = Row(cells=cells, is_header=False)
                    table.footer.append(row)
    def _process_row(self, tr: HtmlElement, is_header: bool) -> List[Cell]:
        """Process table row into cells, preserving document order."""
        cells = []
        # Iterate td and th together in document order. The previous
        # findall('.//td') + findall('.//th') concatenation moved all th
        # cells after the td cells, scrambling column order in rows that
        # mix th row labels with td data cells.
        for cell_elem in tr.iter('td', 'th'):
            cell = self._process_cell(cell_elem, is_header or cell_elem.tag == 'th')
            if cell:
                cells.append(cell)
        return cells
    @staticmethod
    def _parse_span(raw: str) -> int:
        """Parse a colspan/rowspan attribute value, defaulting to 1 when malformed."""
        try:
            return int(raw)
        except (TypeError, ValueError):
            # Malformed values like colspan="" or colspan="two" previously
            # raised ValueError; treat them as a span of 1 instead.
            return 1
    def _process_cell(self, elem: HtmlElement, is_header: bool) -> Optional[Cell]:
        """Process table cell into a Cell (span, alignment, style, content)."""
        # Extract cell properties (tolerate malformed span attributes)
        colspan = self._parse_span(elem.get('colspan', '1'))
        rowspan = self._parse_span(elem.get('rowspan', '1'))
        align = elem.get('align')
        # Extract style
        style = self.style_parser.parse(elem.get('style', ''))
        if style.text_align:
            align = style.text_align
        # Extract content
        content = self._extract_cell_content(elem)
        # Create cell
        cell = Cell(
            content=content,
            colspan=colspan,
            rowspan=rowspan,
            is_header=is_header,
            align=align
        )
        return cell
    def _extract_cell_content(self, elem: HtmlElement) -> str:
        """Extract and clean cell content, preserving intentional line breaks."""
        # Check for nested structure
        divs = elem.findall('.//div')
        if divs and len(divs) > 1:
            # Multiple divs - likely multi-line content
            lines = []
            for div in divs:
                text = self._extract_text(div)
                if text:
                    lines.append(text)
            return '\n'.join(lines)
        # Handle line breaks: turn <br> into newlines before extraction
        for br in elem.findall('.//br'):
            br.tail = '\n' + (br.tail or '')
        # Extract text
        text = self._extract_text(elem)
        return text
    def _extract_text(self, elem: HtmlElement) -> str:
        """Extract and clean text from element.

        Uses itertext() rather than text_content() so that spaces lost at
        inline-element boundaries can be re-inserted, then replaces known
        HTML entities and normalizes whitespace per line.
        """
        # Use itertext() to get all text fragments
        # This preserves spaces better than text_content()
        text_parts = []
        for text in elem.itertext():
            if text:
                text_parts.append(text)
        # Join parts, ensuring we don't lose spaces
        # If a part doesn't end with whitespace and the next doesn't start with whitespace,
        # we need to add a space between them
        if not text_parts:
            return ''
        result = []
        for i, part in enumerate(text_parts):
            if i == 0:
                result.append(part)
            else:
                prev_part = text_parts[i-1]
                # Check if we need to add a space between parts
                # Don't add space if previous ends with space or current starts with space
                if prev_part and part:
                    if not prev_part[-1].isspace() and not part[0].isspace():
                        # Check for punctuation that shouldn't have space before it
                        if part[0] not in ',.;:!?%)]':
                            result.append(' ')
                result.append(part)
        text = ''.join(result)
        # Replace entities
        for entity, replacement in self.ENTITY_REPLACEMENTS.items():
            text = text.replace(entity, replacement)
        # Clean whitespace
        text = text.strip()
        # Normalize internal whitespace but preserve line breaks
        lines = text.split('\n')
        cleaned_lines = []
        for line in lines:
            # Collapse multiple spaces to single space
            line = ' '.join(line.split())
            cleaned_lines.append(line)
        return '\n'.join(cleaned_lines)
    @staticmethod
    @lru_cache(maxsize=1)
    def _get_period_header_pattern():
        """
        Compile comprehensive regex for financial period headers.
        Adapted from old parser's proven patterns.
        Returns:
            Compiled regex pattern matching financial period headers
        """
        # Base components
        periods = r'(?:three|six|nine|twelve|[1-4]|first|second|third|fourth)'
        timeframes = r'(?:month|quarter|year|week)'
        ended_variants = r'(?:ended|ending|end|period)'
        as_of_variants = r'(?:as\s+of|at|as\s+at)'
        # Date pattern
        months = r'(?:january|february|march|april|may|june|july|august|september|october|november|december|jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)'
        day = r'\d{1,2}'
        year = r'(?:19|20)\d{2}'
        date = f'{months}\\s*\\.?\\s*{day}\\s*,?\\s*{year}'
        # Combined patterns
        patterns = [
            # Standard period headers
            f'{periods}\\s+{timeframes}\\s+{ended_variants}(?:\\s+{date})?',
            f'(?:fiscal\\s+)?{timeframes}\\s+{ended_variants}',
            f'{timeframes}\\s+{ended_variants}(?:\\s+{date})?',
            # Balance sheet date headers
            f'{as_of_variants}\\s+{date}',
            # Multiple date sequences
            f'{date}(?:\\s*(?:and|,)\\s*{date})*',
            # Single dates
            f'(?:{ended_variants}\\s+)?{date}'
        ]
        pattern = '|'.join(f'(?:{p})' for p in patterns)
        return re.compile(pattern, re.IGNORECASE)
    def _is_header_row(self, tr: HtmlElement) -> bool:
        """Detect if row is likely a header row in SEC filings.

        Applies a series of heuristics in priority order: th presence,
        date-range/financial-data exclusions, year patterns, period
        phrases, units notation, bold formatting, and text/number ratio.
        """
        # Check if contains th elements (most reliable indicator)
        if tr.find('.//th') is not None:
            return True
        cells = tr.findall('.//td')
        if not cells:
            return False
        # Get row text for analysis
        row_text = tr.text_content()
        row_text_lower = row_text.lower()
        # Check for date ranges with financial data (Oracle Table 6 pattern)
        # Date ranges like "March 1, 2024—March 31, 2024" should be data rows, not headers
        date_range_pattern = r'(?:january|february|march|april|may|june|july|august|september|october|november|december)\s+\d{1,2},\s*\d{4}\s*[—–-]\s*(?:january|february|march|april|may|june|july|august|september|october|november|december)\s+\d{1,2},\s*\d{4}'
        has_date_range = bool(re.search(date_range_pattern, row_text_lower))
        # Check for financial data indicators
        has_currency = bool(re.search(r'\$[\s]*[\d,\.]+', row_text))
        has_decimals = bool(re.search(r'\b\d+\.\d+\b', row_text))
        has_large_numbers = bool(re.search(r'\b\d{1,3}(,\d{3})+\b', row_text))
        # If row has date range + financial data, it's definitely a data row
        if has_date_range and (has_currency or has_decimals or has_large_numbers):
            return False
        # Check for year patterns (very common in financial headers)
        year_pattern = r'\b(19\d{2}|20\d{2})\b'
        years_found = re.findall(year_pattern, row_text)
        if len(years_found) >= 2:  # Multiple years suggest header row
            # IMPORTANT: Check for date ranges and same-year repetition
            # Date ranges like "March 1, 2024—March 31, 2024" contain the same year twice
            # but are data rows, not multi-year comparison headers
            # If all years are the same (date range pattern)
            if len(set(years_found)) == 1:
                # Same year repeated - likely a date range like "Jan 1, 2024 - Mar 31, 2024"
                # Not a multi-year comparison header
                pass  # Don't return True
            # Multiple different years suggest multi-year comparison header
            elif 'total' not in row_text_lower[:20]:  # Check first 20 chars
                return True
        # Enhanced year detection - check individual cells for year patterns
        # This handles cases where years are in separate cells
        year_cells = 0
        date_phrases = 0
        for cell in cells:
            cell_text = cell.text_content().strip()
            if cell_text:
                # Check for individual years
                if re.match(r'^\s*(19\d{2}|20\d{2})\s*$', cell_text):
                    year_cells += 1
                # Check for date phrases like "June 30, 2025"
                elif 'june 30' in cell_text.lower() or 'december 31' in cell_text.lower():
                    date_phrases += 1
        # If we have multiple year cells or year + date phrases, likely a header
        if year_cells >= 2 or (year_cells >= 1 and date_phrases >= 1):
            if 'total' not in row_text_lower[:20]:
                return True
        # Check for comprehensive financial period patterns (from old parser)
        period_pattern = self._get_period_header_pattern()
        if period_pattern.search(row_text_lower):
            # Additional validation: ensure it's not a data row with period text
            # Check for absence of strong data indicators
            data_pattern = r'(?:\$\s*\d|\d+(?:,\d{3})+|\d+\s*[+\-*/]\s*\d+|\(\s*\d+(?:,\d{3})*\s*\))'
            if not re.search(data_pattern, row_text):
                return True
        # Check for units notation (in millions, thousands, billions)
        units_pattern = r'\(in\s+(?:millions|thousands|billions)\)'
        if re.search(units_pattern, row_text_lower):
            return True
        # Check for period indicators (quarters, months)
        # But be careful with "fiscal" - it could be data like "Fiscal 2025"
        period_keywords = ['quarter', 'q1', 'q2', 'q3', 'q4', 'month',
                          'january', 'february', 'march', 'april', 'may', 'june',
                          'july', 'august', 'september', 'october', 'november', 'december',
                          'ended', 'three months', 'six months', 'nine months']
        # Special handling for "fiscal" - only treat as header if it's part of a phrase like "fiscal year ended"
        if 'fiscal' in row_text_lower:
            # Check if row has numeric values (suggests it's data, not header)
            # Look for patterns like "Fiscal 2025 $10,612"
            has_currency_values = bool(re.search(r'\$[\s]*[\d,]+', row_text))
            has_large_numbers = bool(re.search(r'\b\d{1,3}(,\d{3})+\b', row_text))
            # If it has currency or large numbers, it's likely data
            if has_currency_values or has_large_numbers:
                return False
            # Check if it's just "Fiscal YYYY" which is likely data, not a header
            fiscal_year_only = re.match(r'^\s*fiscal\s+\d{4}\s*$', row_text_lower.strip())
            if fiscal_year_only:
                return False  # This is data, not a header
            # Check for header-like phrases with fiscal
            if 'fiscal year' in row_text_lower and ('ended' in row_text_lower or 'ending' in row_text_lower):
                return True
        if any(keyword in row_text_lower for keyword in period_keywords):
            # Validate it's not a data row with period keywords
            # Check for strong data indicators
            data_pattern = r'(?:\$\s*\d|\d+(?:,\d{3})+|\d+\.\d+|[(]\s*\d+(?:,\d{3})*\s*[)])'
            if not re.search(data_pattern, row_text):
                return True
        # Check for column descriptors (but NOT total)
        # These are words commonly found in headers but not data rows
        header_keywords = ['description', 'item', 'category', 'type', 'classification',
                          'change', 'percent', 'increase', 'decrease', 'variance']
        if any(keyword in row_text_lower for keyword in header_keywords):
            # Make sure it's not a total row
            if 'total' not in row_text_lower[:30]:
                # Additional validation: long narrative text is not a header
                # Headers are typically concise (< 150 chars)
                if len(row_text) > 150:
                    return False
                # Check for data indicators (would indicate data row, not header)
                data_pattern = r'(?:\$\s*\d|\d+(?:,\d{3})+|\d+\.\d+|[(]\s*\d+(?:,\d{3})*\s*[)])'
                if re.search(data_pattern, row_text):
                    return False
                return True
        # Check if all cells are bold (common header formatting)
        bold_count = 0
        for cell in cells:
            style = cell.get('style', '')
            if 'font-weight' in style and 'bold' in style:
                bold_count += 1
            elif cell.find('.//b') is not None or cell.find('.//strong') is not None:
                bold_count += 1
        # Only consider it a header if ALL cells are bold (not just some)
        if bold_count == len(cells) and bold_count > 0:
            return True
        # Check content type ratio - headers usually have more text than numbers
        # Count cells with primarily text vs primarily numbers
        text_cells = 0
        number_cells = 0
        for cell in cells:
            cell_text = cell.text_content().strip()
            if cell_text:
                # Remove common symbols for analysis
                clean_text = cell_text.replace('$', '').replace('%', '').replace(',', '').replace('(', '').replace(')', '')
                if clean_text.replace('.', '').replace('-', '').strip().isdigit():
                    number_cells += 1
                else:
                    text_cells += 1
        # Be very careful about treating text-heavy rows as headers
        # Many data rows start with text labels (e.g., "Impact of...", "Effect of...")
        # Only consider it a header if it has mostly text AND doesn't look like a data label
        if text_cells > number_cells * 2 and text_cells >= 3:
            # Check for common data row patterns
            data_row_indicators = [
                'impact of', 'effect of', 'adjustment', 'provision for', 'benefit',
                'expense', 'income from', 'loss on', 'gain on', 'charge', 'credit',
                'earnings', 'computed', 'state taxes', 'research', 'excess tax'
            ]
            # If it starts with any of these, it's likely a data row, not a header
            for indicator in data_row_indicators:
                if row_text_lower.startswith(indicator) or indicator in row_text_lower[:50]:
                    return False
            # Also not a header if it starts with "total"
            if not row_text_lower.startswith('total'):
                return True
        return False
    def _detect_table_type(self, table: TableNode) -> TableType:
        """Detect the type of table based on caption, header and sample-row text."""
        # Collect text from headers and first few rows
        text_parts = []
        # Add caption
        if table.caption:
            text_parts.append(table.caption.lower())
        # Add headers
        for header_row in table.headers:
            for cell in header_row:
                text_parts.append(cell.text().lower())
        # Add first few rows
        for row in table.rows[:3]:
            for cell in row.cells:
                text_parts.append(cell.text().lower())
        combined_text = ' '.join(text_parts)
        # Check for financial table
        financial_count = sum(1 for keyword in self.FINANCIAL_KEYWORDS if keyword in combined_text)
        if financial_count >= 2:  # Lowered threshold for better detection
            return TableType.FINANCIAL
        # Check for metrics table
        metrics_count = sum(1 for keyword in self.METRICS_KEYWORDS if keyword in combined_text)
        numeric_cells = sum(1 for row in table.rows for cell in row.cells if cell.is_numeric)
        total_cells = sum(len(row.cells) for row in table.rows)
        if total_cells > 0:
            numeric_ratio = numeric_cells / total_cells
            # More lenient metrics detection
            if metrics_count >= 1 or numeric_ratio > 0.3:
                return TableType.METRICS
        # Check for table of contents
        if 'content' in combined_text or 'index' in combined_text:
            # Look for page numbers
            has_page_numbers = any(
                re.search(r'\b\d{1,3}\b', cell.text())
                for row in table.rows
                for cell in row.cells
            )
            if has_page_numbers:
                return TableType.TABLE_OF_CONTENTS
        # Check for exhibit index
        if 'exhibit' in combined_text:
            return TableType.EXHIBIT_INDEX
        # Check for reference table (citations, definitions, etc.)
        if any(word in combined_text for word in ['reference', 'definition', 'glossary', 'citation']):
            return TableType.REFERENCE
        return TableType.GENERAL
    def _extract_relationships(self, table: TableNode):
        """Extract relationships within table data.

        Currently records total rows and first-column indentation hierarchy
        into table metadata; richer relationship extraction is future work.
        """
        # This would implement relationship extraction
        # For now, just set a flag that relationships were processed
        table.set_metadata('relationships_extracted', True)
        # Example relationships to extract:
        # - Parent-child relationships (indented rows)
        # - Total rows that sum other rows
        # - Cross-references between cells
        # - Time series relationships
        # Detect total rows
        total_rows = []
        for i, row in enumerate(table.rows):
            if row.is_total_row:
                total_rows.append(i)
        if total_rows:
            table.set_metadata('total_rows', total_rows)
        # Detect indentation patterns (parent-child)
        indentation_levels = []
        for row in table.rows:
            if row.cells:
                first_cell_text = row.cells[0].text()
                # Count leading spaces
                indent = len(first_cell_text) - len(first_cell_text.lstrip())
                indentation_levels.append(indent)
        if any(level > 0 for level in indentation_levels):
            table.set_metadata('has_hierarchy', True)
            table.set_metadata('indentation_levels', indentation_levels)

View File

@@ -0,0 +1,345 @@
"""
XBRL extraction strategy for inline XBRL documents.
"""
from typing import Dict, Any, Optional
from lxml.html import HtmlElement
from edgar.documents.types import XBRLFact
class XBRLExtractor:
    """
    Extracts XBRL facts from inline XBRL (iXBRL) documents.

    Handles:
    - ix:nonFraction, ix:nonNumeric facts
    - Context and unit resolution
    - Continuation handling
    - Transformation rules
    """

    # XBRL namespaces
    NAMESPACES = {
        'ix': 'http://www.xbrl.org/2013/inlineXBRL',
        'xbrli': 'http://www.xbrl.org/2003/instance',
        'xbrldi': 'http://xbrl.org/2006/xbrldi',
        'xsi': 'http://www.w3.org/2001/XMLSchema-instance'
    }

    # Common transformation formats (subset of the ixt transformation registry)
    TRANSFORMATIONS = {
        'ixt:numdotdecimal': lambda x: x.replace(',', ''),
        'ixt:numcommadecimal': lambda x: x.replace('.', '_').replace(',', '.').replace('_', ','),
        'ixt:zerodash': lambda x: '0' if x == '-' else x,
        'ixt:datedoteu': lambda x: x.replace('.', '-'),
        'ixt:datedotus': lambda x: x.replace('.', '/'),
    }

    def __init__(self):
        """Initialize XBRL extractor."""
        self.contexts: Dict[str, Dict[str, Any]] = {}  # context id -> context data
        self.units: Dict[str, str] = {}  # unit id -> normalized unit string
        # continuation id -> continuation metadata (values are dicts, not strings)
        self.continuations: Dict[str, Dict[str, Any]] = {}
        self._initialized = False  # contexts/units are loaded lazily

    @staticmethod
    def _get_attr(element: HtmlElement, name: str) -> Optional[str]:
        """
        Get an attribute by its camelCase name, falling back to lowercase.

        lxml's HTML parser lowercases attribute names (e.g. 'contextRef'
        becomes 'contextref'), so both spellings must be checked.
        """
        value = element.get(name)
        if value is None and name != name.lower():
            value = element.get(name.lower())
        return value

    def extract_context(self, element: HtmlElement) -> Optional[Dict[str, Any]]:
        """
        Extract XBRL context from element.

        Args:
            element: HTML element that might contain XBRL

        Returns:
            XBRL metadata if found, else None
        """
        # Check if element is an ix: tag
        if not self._is_xbrl_element(element):
            return None

        # Lazily load context/unit definitions on the first XBRL element seen
        if not self._initialized:
            self._initialize_context(element)

        # Dispatch on the local (namespace-stripped) tag name
        tag_name = self._get_local_name(element.tag)
        if tag_name == 'nonfraction':
            return self._extract_nonfraction(element)
        elif tag_name == 'nonnumeric':
            return self._extract_nonnumeric(element)
        elif tag_name == 'continuation':
            return self._extract_continuation(element)
        elif tag_name == 'footnote':
            return self._extract_footnote(element)
        elif tag_name == 'fraction':
            return self._extract_fraction(element)
        return None

    def extract_fact(self, element: HtmlElement) -> Optional[XBRLFact]:
        """
        Extract an XBRLFact from element, resolving context/unit references.

        Returns None when the element is not an inline-XBRL element.
        """
        context = self.extract_context(element)
        if not context:
            return None

        # Fact value with transformation/scale/sign applied
        value = self._get_fact_value(element)

        fact = XBRLFact(
            concept=context.get('name', ''),
            value=value,
            context_ref=context.get('contextRef'),
            unit_ref=context.get('unitRef'),
            decimals=context.get('decimals'),
            scale=context.get('scale'),
            format=context.get('format'),
            sign=context.get('sign')
        )

        # Resolve references against the previously parsed definitions
        if fact.context_ref and fact.context_ref in self.contexts:
            fact.context = self.contexts[fact.context_ref]
        if fact.unit_ref and fact.unit_ref in self.units:
            fact.unit = self.units[fact.unit_ref]
        return fact

    def _is_xbrl_element(self, element: HtmlElement) -> bool:
        """Check if element is an inline-XBRL (ix:) element."""
        tag = element.tag
        # Comments/processing instructions have non-string tags
        if not isinstance(tag, str):
            return False
        # Accept fully-qualified ('{namespace}tag') and prefixed ('ix:tag',
        # any case) forms. The case-insensitive prefix test subsumes the
        # previous redundant case-sensitive 'ix:' check.
        return (
            tag.startswith('{' + self.NAMESPACES['ix'] + '}') or
            tag.lower().startswith('ix:')
        )

    def _get_local_name(self, tag: str) -> str:
        """Get lowercase local name from a qualified or prefixed tag."""
        if '}' in tag:
            return tag.split('}')[1].lower()
        elif ':' in tag:
            return tag.split(':')[1].lower()
        return tag.lower()

    def _initialize_context(self, element: HtmlElement):
        """Initialize context and unit information from the whole document."""
        root = element.getroottree().getroot()
        self._extract_contexts(root)
        self._extract_units(root)
        self._initialized = True

    def _extract_contexts(self, root: HtmlElement):
        """Extract all xbrli:context definitions into self.contexts."""
        for context in root.xpath('//xbrli:context', namespaces=self.NAMESPACES):
            context_id = context.get('id')
            if not context_id:
                continue
            context_data = {
                'id': context_id
            }
            # Entity identifier and scheme
            entity = context.find('.//xbrli:entity', namespaces=self.NAMESPACES)
            if entity is not None:
                identifier = entity.find('.//xbrli:identifier', namespaces=self.NAMESPACES)
                if identifier is not None:
                    context_data['entity'] = identifier.text
                    context_data['scheme'] = identifier.get('scheme')
            # Period: either a single instant or a start/end duration
            period = context.find('.//xbrli:period', namespaces=self.NAMESPACES)
            if period is not None:
                instant = period.find('.//xbrli:instant', namespaces=self.NAMESPACES)
                if instant is not None:
                    context_data['instant'] = instant.text
                    context_data['period_type'] = 'instant'
                else:
                    start = period.find('.//xbrli:startDate', namespaces=self.NAMESPACES)
                    end = period.find('.//xbrli:endDate', namespaces=self.NAMESPACES)
                    if start is not None and end is not None:
                        context_data['start_date'] = start.text
                        context_data['end_date'] = end.text
                        context_data['period_type'] = 'duration'
            # Explicit dimensional qualifiers from the segment
            segment = context.find('.//xbrli:segment', namespaces=self.NAMESPACES)
            if segment is not None:
                dimensions = {}
                for member in segment.findall('.//xbrldi:explicitMember', namespaces=self.NAMESPACES):
                    dim = member.get('dimension')
                    if dim:
                        dimensions[dim] = member.text
                if dimensions:
                    context_data['dimensions'] = dimensions
            self.contexts[context_id] = context_data

    def _extract_units(self, root: HtmlElement):
        """Extract all xbrli:unit definitions into self.units."""
        for unit in root.xpath('//xbrli:unit', namespaces=self.NAMESPACES):
            unit_id = unit.get('id')
            if not unit_id:
                continue
            # Simple unit: a single measure
            measure = unit.find('.//xbrli:measure', namespaces=self.NAMESPACES)
            if measure is not None:
                self.units[unit_id] = self._normalize_unit(measure.text)
                continue
            # Complex unit: numerator/denominator (e.g. USD/shares)
            divide = unit.find('.//xbrli:divide', namespaces=self.NAMESPACES)
            if divide is not None:
                numerator = divide.find('.//xbrli:unitNumerator/xbrli:measure', namespaces=self.NAMESPACES)
                denominator = divide.find('.//xbrli:unitDenominator/xbrli:measure', namespaces=self.NAMESPACES)
                if numerator is not None and denominator is not None:
                    num_unit = self._normalize_unit(numerator.text)
                    den_unit = self._normalize_unit(denominator.text)
                    self.units[unit_id] = f"{num_unit}/{den_unit}"

    def _normalize_unit(self, unit_text: str) -> str:
        """Normalize unit text (strip namespace prefix, map common names)."""
        if not unit_text:
            return ''
        # Remove namespace prefix (e.g. 'iso4217:USD' -> 'USD')
        if ':' in unit_text:
            unit_text = unit_text.split(':')[-1]
        # Common normalizations
        unit_map = {
            'usd': 'USD',
            'shares': 'shares',
            'pure': 'pure',
            'percent': '%'
        }
        return unit_map.get(unit_text.lower(), unit_text)

    def _extract_nonfraction(self, element: HtmlElement) -> Dict[str, Any]:
        """Extract metadata from an ix:nonFraction element."""
        metadata = {
            'type': 'nonFraction',
            'name': element.get('name'),
            'contextRef': self._get_attr(element, 'contextRef'),
            'unitRef': self._get_attr(element, 'unitRef'),
            'decimals': element.get('decimals'),
            'scale': element.get('scale'),
            'format': element.get('format'),
            'sign': element.get('sign')
        }
        # Drop attributes that were absent
        return {k: v for k, v in metadata.items() if v is not None}

    def _extract_nonnumeric(self, element: HtmlElement) -> Dict[str, Any]:
        """Extract metadata from an ix:nonNumeric element."""
        metadata = {
            'type': 'nonNumeric',
            'name': element.get('name'),
            'contextRef': self._get_attr(element, 'contextRef'),
            'format': element.get('format')
        }
        # Drop attributes that were absent
        return {k: v for k, v in metadata.items() if v is not None}

    def _extract_continuation(self, element: HtmlElement) -> Dict[str, Any]:
        """Extract an ix:continuation element, chaining it to its original."""
        cont_id = element.get('id')
        # Fixed: 'continuedAt' is lowercased by lxml's HTML parser, so the
        # lowercase fallback is required (matches the nonFraction handling).
        continued_at = self._get_attr(element, 'continuedAt')
        if cont_id and continued_at:
            if continued_at in self.continuations:
                # Later link in an already-seen chain: reuse original metadata
                original = self.continuations[continued_at]
                self.continuations[cont_id] = original
                return original
            else:
                # First link seen: store for later resolution
                metadata = {
                    'type': 'continuation',
                    'id': cont_id,
                    'continuedAt': continued_at
                }
                self.continuations[cont_id] = metadata
                return metadata
        return {}

    def _extract_footnote(self, element: HtmlElement) -> Dict[str, Any]:
        """Extract metadata from an ix:footnote element."""
        # Fixed: camelCase attribute names need the lowercase fallback too.
        return {
            'type': 'footnote',
            'footnoteRole': self._get_attr(element, 'footnoteRole'),
            'footnoteID': self._get_attr(element, 'footnoteID')
        }

    def _extract_fraction(self, element: HtmlElement) -> Dict[str, Any]:
        """Extract metadata from an ix:fraction element."""
        # Fixed: previously only the camelCase attribute spellings were read
        # here, unlike the sibling extractors; use the shared helper.
        metadata = {
            'type': 'fraction',
            'name': element.get('name'),
            'contextRef': self._get_attr(element, 'contextRef'),
            'unitRef': self._get_attr(element, 'unitRef')
        }
        # Extract numerator and denominator child facts
        numerator = element.find('.//ix:numerator', namespaces=self.NAMESPACES)
        denominator = element.find('.//ix:denominator', namespaces=self.NAMESPACES)
        if numerator is not None:
            metadata['numerator'] = numerator.text
        if denominator is not None:
            metadata['denominator'] = denominator.text
        return {k: v for k, v in metadata.items() if v is not None}

    def _get_fact_value(self, element: HtmlElement) -> str:
        """
        Get the fact value from an element, applying the ixt transformation,
        scale, and sign attributes in that order.
        """
        # Raw displayed value (direct text only)
        value = element.text or ''
        # Apply ixt transformation if specified (e.g. strip thousands separators)
        format_attr = element.get('format')
        if format_attr and format_attr in self.TRANSFORMATIONS:
            transform = self.TRANSFORMATIONS[format_attr]
            value = transform(value)
        # scale="3" means the displayed value is in thousands, etc.
        scale = element.get('scale')
        if scale:
            try:
                scale_factor = int(scale)
                numeric_value = float(value.replace(',', ''))
                scaled_value = numeric_value * (10 ** scale_factor)
                value = str(scaled_value)
            except (ValueError, TypeError):
                # Non-numeric value: leave it untouched
                pass
        # sign="-" negates the displayed value
        sign = element.get('sign')
        if sign == '-':
            if value and not value.startswith('-'):
                value = '-' + value
        return value.strip()

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,62 @@
"""
Table processing utilities for document parsing.
This module consolidates the standard table matrix processing pipeline used
across table rendering implementations (TableNode.render(), TableNode.to_dataframe(),
and FastTableRenderer.render_table_node()).
"""
from typing import TYPE_CHECKING
if TYPE_CHECKING:
from edgar.documents.utils.table_matrix import TableMatrix, ColumnAnalyzer
from edgar.documents.utils.currency_merger import CurrencyColumnMerger
def process_table_matrix(matrix: "TableMatrix", headers, rows) -> "TableMatrix":
    """
    Standard table matrix processing pipeline.

    This function applies the standard three-step processing pipeline:
    1. Build matrix from headers and rows (handles colspan/rowspan)
    2. Filter out spacing columns (columns with only whitespace)
    3. Detect and merge currency symbol columns with adjacent value columns

    Args:
        matrix: TableMatrix instance to populate
        headers: List of header rows (each row is a list of Cell objects)
        rows: List of data rows (each row is a list of Cell objects)

    Returns:
        Processed TableMatrix with spacing columns removed and currency
        columns merged

    Example:
        >>> matrix = TableMatrix()
        >>> clean_matrix = process_table_matrix(matrix, headers, rows)
        >>> # colspan/rowspan expanded, spacing removed, currencies merged

    Note:
        This consolidates the identical processing sequence that appeared in
        TableNode.render(), TableNode.to_dataframe() and
        FastTableRenderer.render_table_node(). The unused ColumnAnalyzer
        instantiation those copies carried (dead code, never read) has been
        removed.
    """
    # Import at runtime to avoid circular imports
    from edgar.documents.utils.currency_merger import CurrencyColumnMerger

    # Step 1: Build matrix from rows (expands colspan/rowspan)
    matrix.build_from_rows(headers, rows)

    # Step 2: Remove spacing columns (columns with only whitespace/empty cells)
    clean_matrix = matrix.filter_spacing_columns()

    # Step 3: Detect and merge currency columns ($ with adjacent numbers)
    currency_merger = CurrencyColumnMerger(clean_matrix)
    currency_merger.detect_currency_pairs()
    if currency_merger.merge_pairs:
        clean_matrix = currency_merger.apply_merges()

    return clean_matrix

View File

@@ -0,0 +1,282 @@
"""
Type definitions for the HTML parser.
"""
import re
from dataclasses import dataclass
from enum import Enum, auto
from typing import Protocol, Union, Optional, Dict, Any, List
class NodeType(Enum):
    """Types of nodes in the document tree."""
    DOCUMENT = auto()    # Root document node
    SECTION = auto()     # Logical section of the document
    HEADING = auto()     # Heading/title line
    PARAGRAPH = auto()   # Paragraph of body text
    TABLE = auto()       # Table element
    LIST = auto()        # Ordered or unordered list
    LIST_ITEM = auto()   # Single item within a list
    LINK = auto()        # Hyperlink
    IMAGE = auto()       # Embedded image
    XBRL_FACT = auto()   # Inline XBRL fact
    TEXT = auto()        # Plain text run
    CONTAINER = auto()   # Generic grouping node
class SemanticType(Enum):
    """Semantic types for document understanding."""
    TITLE = auto()                # Document title
    HEADER = auto()               # Generic header text
    BODY_TEXT = auto()            # Regular body text
    FOOTNOTE = auto()             # Footnote content
    TABLE_OF_CONTENTS = auto()    # Table-of-contents block
    FINANCIAL_STATEMENT = auto()  # Financial statement content
    DISCLOSURE = auto()           # Disclosure text
    ITEM_HEADER = auto()          # "Item N"-style filing header
    SECTION_HEADER = auto()       # Section heading
    SIGNATURE = auto()            # Signature block
    EXHIBIT = auto()              # Exhibit content
class TableType(Enum):
    """Types of tables for semantic understanding."""
    FINANCIAL = auto()          # Financial-statement table
    METRICS = auto()            # Metrics / mostly-numeric table
    REFERENCE = auto()          # Definitions, glossary, citations
    GENERAL = auto()            # No specific type detected
    TABLE_OF_CONTENTS = auto()  # Navigation / TOC table
    EXHIBIT_INDEX = auto()      # Exhibit index table
@dataclass
class Style:
    """Unified style representation."""
    font_size: Optional[float] = None
    font_weight: Optional[str] = None
    font_style: Optional[str] = None
    text_align: Optional[str] = None
    text_decoration: Optional[str] = None
    color: Optional[str] = None
    background_color: Optional[str] = None
    margin_top: Optional[float] = None
    margin_bottom: Optional[float] = None
    margin_left: Optional[float] = None
    margin_right: Optional[float] = None
    padding_top: Optional[float] = None
    padding_bottom: Optional[float] = None
    padding_left: Optional[float] = None
    padding_right: Optional[float] = None
    display: Optional[str] = None
    width: Optional[Union[float, str]] = None
    height: Optional[Union[float, str]] = None
    line_height: Optional[float] = None

    def merge(self, other: 'Style') -> 'Style':
        """Merge this style with another, with other taking precedence."""
        combined = Style()
        # A None in `other` means "not specified": fall back to self's value.
        for name in self.__dataclass_fields__:
            override = getattr(other, name)
            setattr(combined, name, getattr(self, name) if override is None else override)
        return combined

    @property
    def is_bold(self) -> bool:
        """True when the font weight denotes bold text."""
        return self.font_weight in ('bold', '700', '800', '900')

    @property
    def is_italic(self) -> bool:
        """True when the font style is italic."""
        return self.font_style == 'italic'

    @property
    def is_centered(self) -> bool:
        """True when the text alignment is centered."""
        return self.text_align == 'center'
class NodeProtocol(Protocol):
    """Protocol for all nodes."""
    # Identity and classification
    id: str
    type: NodeType
    content: Any
    metadata: Dict[str, Any]
    style: Style
    # Tree links
    parent: Optional['NodeProtocol']
    children: List['NodeProtocol']
    def text(self) -> str: ...  # Plain-text rendering of the node
    def html(self) -> str: ...  # HTML rendering of the node
    def find(self, predicate) -> List['NodeProtocol']: ...  # Descendants matching predicate
@dataclass
class HeaderInfo:
    """Information about detected headers."""
    level: int  # 1-6
    confidence: float  # 0.0-1.0
    text: str
    detection_method: str
    is_item: bool = False
    item_number: Optional[str] = None

    @classmethod
    def from_text(cls, text: str, level: int, confidence: float, method: str) -> 'HeaderInfo':
        """Create HeaderInfo from text, flagging 'Item N'-style headers."""
        # Matches e.g. "Item 1A." or "ITEM 7" at the start of the text.
        match = re.match(r'^(Item|ITEM)\s+(\d+[A-Z]?\.?)', text.strip(), re.IGNORECASE)
        if match:
            return cls(
                level=level,
                confidence=confidence,
                text=text,
                detection_method=method,
                is_item=True,
                # Trailing dot is presentation, not part of the item number.
                item_number=match.group(2).rstrip('.')
            )
        return cls(
            level=level,
            confidence=confidence,
            text=text,
            detection_method=method
        )
@dataclass
class XBRLFact:
    """Represents an XBRL fact extracted from inline XBRL."""
    concept: str
    value: str
    context_ref: Optional[str] = None
    unit_ref: Optional[str] = None
    decimals: Optional[str] = None
    scale: Optional[str] = None
    format: Optional[str] = None
    sign: Optional[str] = None
    # Resolved references
    context: Optional[Dict[str, Any]] = None
    unit: Optional[str] = None
    # Additional metadata
    metadata: Optional[Dict[str, Any]] = None

    @property
    def numeric_value(self) -> Optional[float]:
        """Numeric value of the fact, or None when it cannot be parsed."""
        try:
            # Strip thousands separators before conversion.
            return float(self.value.replace(',', ''))
        except (ValueError, AttributeError):
            return None

    @property
    def is_numeric(self) -> bool:
        """True when the fact value parses as a number."""
        return self.numeric_value is not None

    def to_dict(self) -> Dict[str, Any]:
        """Convert XBRLFact to dictionary."""
        result = {name: getattr(self, name) for name in (
            'concept', 'value', 'context_ref', 'unit_ref',
            'decimals', 'scale', 'format', 'sign', 'context', 'unit')}
        result['is_numeric'] = self.is_numeric
        result['numeric_value'] = self.numeric_value
        return result
@dataclass
class SearchResult:
    """
    Result from document search.

    Designed for agent-friendly investigation workflows - provides access to
    full section context rather than fragmented chunks.
    """
    node: 'NodeProtocol'
    score: float
    snippet: str
    section: Optional[str] = None
    context: Optional[str] = None
    _section_obj: Optional[Any] = None  # Hidden Section object for agent navigation

    @property
    def section_object(self) -> Optional[Any]:
        """
        Full Section object for agent navigation, or None.

        Enables multi-step investigation by exposing the complete section,
        not just the matched fragment.
        """
        return self._section_obj

    @property
    def full_context(self) -> str:
        """
        Complete section text when a section is attached, else the snippet.

        Supports the post-RAG "investigation not retrieval" pattern by
        returning whole-section content instead of fragmented chunks.
        """
        section = self._section_obj
        if section and hasattr(section, 'text'):
            return section.text()
        return self.snippet
@dataclass
class ParseContext:
    """Context information during parsing."""
    base_font_size: float = 10.0
    current_section: Optional[str] = None
    in_table: bool = False
    in_list: bool = False
    depth: int = 0
    style_stack: List[Style] = None

    def __post_init__(self):
        # Dataclasses cannot use a mutable default; normalize None to a list.
        if self.style_stack is None:
            self.style_stack = []

    def push_style(self, style: Style):
        """Push style onto stack."""
        self.style_stack.append(style)

    def pop_style(self):
        """Pop style from stack (no-op when empty)."""
        if self.style_stack:
            self.style_stack.pop()

    def get_current_style(self) -> Style:
        """Fold the stack into one effective style (later entries win)."""
        if not self.style_stack:
            return Style()
        effective, *overlays = self.style_stack
        for overlay in overlays:
            effective = effective.merge(overlay)
        return effective
# Type aliases for clarity
NodeId = str       # Unique identifier of a document node
SectionName = str  # Human-readable section name
ConceptName = str  # XBRL concept name (e.g. 'us-gaap:Revenues')
ContextRef = str   # Reference to an XBRL context definition
UnitRef = str      # Reference to an XBRL unit definition

View File

@@ -0,0 +1,51 @@
"""
Utility modules for HTML parsing.
"""
from edgar.documents.utils.cache import (
LRUCache,
WeakCache,
TimeBasedCache,
CacheManager,
get_cache_manager,
cached,
CacheStats
)
from edgar.documents.utils.streaming import (
StreamingParser
)
from edgar.documents.utils.table_matrix import (
TableMatrix,
ColumnAnalyzer,
MatrixCell
)
from edgar.documents.utils.currency_merger import (
CurrencyColumnMerger
)
# Note: CacheableMixin not exported to avoid circular imports
# Import directly: from edgar.documents.cache_mixin import CacheableMixin
from edgar.documents.utils.html_utils import (
remove_xml_declaration,
create_lxml_parser
)
# Note: table_utils not exported to avoid circular imports
# Import directly: from edgar.documents.utils.table_utils import process_table_matrix
# Public API of edgar.documents.utils (names exported on star-import).
__all__ = [
    'LRUCache',
    'WeakCache',
    'TimeBasedCache',
    'CacheManager',
    'get_cache_manager',
    'cached',
    'CacheStats',
    'StreamingParser',
    'TableMatrix',
    'ColumnAnalyzer',
    'MatrixCell',
    'CurrencyColumnMerger',
    # 'CacheableMixin', # Not exported - import directly to avoid circular imports
    'remove_xml_declaration',
    'create_lxml_parser',
    # 'process_table_matrix' # Not exported - import directly to avoid circular imports
]

View File

@@ -0,0 +1,205 @@
"""
Lightweight anchor analysis cache to avoid re-parsing HTML.
This provides a middle-ground approach that caches anchor analysis results
while minimizing memory overhead.
"""
import re
from typing import Dict, Set, Optional
from collections import Counter
import hashlib
import pickle
from pathlib import Path
class AnchorCache:
    """
    Cache for anchor link analysis results.

    Stores navigation patterns by HTML hash to avoid re-analysis. Results are
    kept both in memory (for the current session) and on disk.
    """

    def __init__(self, cache_dir: Optional[Path] = None):
        """
        Initialize the cache.

        Args:
            cache_dir: Directory for the on-disk cache
                       (default: ~/.edgar_cache/anchors; created if missing)
        """
        self.cache_dir = cache_dir or Path.home() / '.edgar_cache' / 'anchors'
        self.cache_dir.mkdir(parents=True, exist_ok=True)
        self._memory_cache = {}  # In-memory cache for current session

    def _get_html_hash(self, html_content: str) -> str:
        """Get hash of HTML content for caching (cache key only, not security)."""
        return hashlib.md5(html_content.encode('utf-8')).hexdigest()

    def get_navigation_patterns(self, html_content: str) -> Optional[Set[str]]:
        """
        Get cached navigation patterns for HTML content.

        Args:
            html_content: HTML to analyze

        Returns:
            Set of navigation patterns or None if not cached
        """
        html_hash = self._get_html_hash(html_content)
        # Check in-memory cache first
        if html_hash in self._memory_cache:
            return self._memory_cache[html_hash]
        # Check disk cache
        cache_file = self.cache_dir / f"{html_hash}.pkl"
        if cache_file.exists():
            try:
                with open(cache_file, 'rb') as f:
                    patterns = pickle.load(f)
                self._memory_cache[html_hash] = patterns
                return patterns
            except Exception:
                # Fixed: was a bare `except:`. A corrupted/unreadable cache
                # file is removed and treated as a miss, but control-flow
                # exceptions (KeyboardInterrupt, SystemExit) now propagate.
                cache_file.unlink(missing_ok=True)
        return None

    def cache_navigation_patterns(self, html_content: str, patterns: Set[str]) -> None:
        """
        Cache navigation patterns for HTML content.

        Args:
            html_content: HTML content
            patterns: Navigation patterns to cache
        """
        html_hash = self._get_html_hash(html_content)
        # Store in memory
        self._memory_cache[html_hash] = patterns
        # Store on disk; persistence is best-effort only
        try:
            cache_file = self.cache_dir / f"{html_hash}.pkl"
            with open(cache_file, 'wb') as f:
                pickle.dump(patterns, f)
        except Exception:
            # Fixed: was a bare `except:`. A failed disk write must not break
            # parsing, but control-flow exceptions now propagate.
            pass

    def clear_cache(self) -> None:
        """Clear all cached data, both in memory and on disk."""
        self._memory_cache.clear()
        for cache_file in self.cache_dir.glob("*.pkl"):
            cache_file.unlink(missing_ok=True)
# Global cache instance
# Module-level singleton shared by the helper functions in this module.
_anchor_cache = AnchorCache()
def get_cached_navigation_patterns(html_content: str,
                                   force_analyze: bool = False) -> Set[str]:
    """
    Get navigation patterns with caching.

    Args:
        html_content: HTML to analyze
        force_analyze: Force re-analysis even if cached

    Returns:
        Set of navigation link texts to filter
    """
    # Serve from cache unless the caller explicitly asks for a re-analysis.
    if not force_analyze:
        hit = _anchor_cache.get_navigation_patterns(html_content)
        if hit is not None:
            return hit

    # Cache miss (or forced): run the lightweight regex analysis and cache it.
    patterns = _analyze_navigation_minimal(html_content)
    _anchor_cache.cache_navigation_patterns(html_content, patterns)
    return patterns
def _analyze_navigation_minimal(html_content: str, min_frequency: int = 5) -> Set[str]:
    """
    Minimal navigation analysis using regex instead of full HTML parsing.

    Avoids BeautifulSoup overhead by locating same-document anchor links
    (href="#...") with a regex and collecting link texts that repeat at
    least min_frequency times.
    """
    # Same-document anchors: capture the fragment id and the link body.
    anchor_re = re.compile(
        r'<a[^>]*href\s*=\s*["\']#([^"\']*)["\'][^>]*>(.*?)</a>',
        re.IGNORECASE | re.DOTALL
    )

    counts = Counter()
    for hit in anchor_re.finditer(html_content):
        # Strip any inner markup and collapse whitespace in the link text.
        label = re.sub(r'<[^>]+>', '', hit.group(2))
        label = ' '.join(label.split())
        # Ignore empty and implausibly long link texts.
        if label and len(label) < 100:
            counts[label] += 1

    # Only frequently repeated link texts count as navigation chrome.
    return {label for label, seen in counts.items() if seen >= min_frequency}
def filter_with_cached_patterns(text: str, html_content: str = None) -> str:
    """
    Filter text using cached navigation patterns.

    Preserves the first occurrences of each pattern (likely genuine document
    structure headers) while dropping later, repetitive navigation links.

    Args:
        text: Text to filter
        html_content: HTML for pattern analysis (optional)

    Returns:
        Filtered text
    """
    if not text:
        return text

    # Derive patterns from the HTML when available, otherwise fall back to
    # labels that commonly appear as navigation chrome in SEC filings.
    if html_content:
        patterns = get_cached_navigation_patterns(html_content)
    else:
        patterns = {
            'Table of Contents',
            'Index to Financial Statements',
            'Index to Exhibits'
        }

    if not patterns:
        return text

    # Allow up to this many occurrences of each pattern; those are likely
    # real structure headers rather than repeated navigation links.
    max_occurrences = 2
    occurrence_counts = {}
    kept = []
    for line in text.split('\n'):
        candidate = line.strip()
        if candidate not in patterns:
            # Not a navigation pattern: always keep.
            kept.append(line)
            continue
        seen = occurrence_counts.get(candidate, 0)
        if seen < max_occurrences:
            kept.append(line)
            occurrence_counts[candidate] = seen + 1
        # else: a repetitive navigation link — drop it.
    return '\n'.join(kept)

View File

@@ -0,0 +1,426 @@
"""
Cache utilities for performance optimization.
"""
import weakref
from collections import OrderedDict
from typing import Any, Dict, Optional, Callable, TypeVar, Generic
from functools import wraps
import time
import threading
from dataclasses import dataclass, field
from datetime import datetime, timedelta
T = TypeVar('T')
@dataclass
class CacheStats:
    """Statistics for cache performance monitoring."""
    hits: int = 0
    misses: int = 0
    evictions: int = 0
    total_time: float = 0.0
    last_reset: datetime = field(default_factory=datetime.now)

    @property
    def hit_rate(self) -> float:
        """Fraction of lookups served from cache (0.0 when untouched)."""
        lookups = self.hits + self.misses
        if lookups == 0:
            return 0.0
        return self.hits / lookups

    @property
    def avg_access_time(self) -> float:
        """Mean time spent per lookup in seconds (0.0 when untouched)."""
        lookups = self.hits + self.misses
        if lookups == 0:
            return 0.0
        return self.total_time / lookups

    def reset(self):
        """Zero all counters and stamp the reset time."""
        self.hits = 0
        self.misses = 0
        self.evictions = 0
        self.total_time = 0.0
        self.last_reset = datetime.now()
class LRUCache(Generic[T]):
    """
    Thread-safe LRU cache implementation.

    Used for caching expensive operations like style parsing
    and header detection results.
    """

    def __init__(self, max_size: int = 1000):
        """
        Initialize LRU cache.

        Args:
            max_size: Maximum number of items to cache
        """
        self.max_size = max_size
        self._cache: OrderedDict[str, T] = OrderedDict()
        self._lock = threading.RLock()  # re-entrant: safe for nested access
        self.stats = CacheStats()

    def get(self, key: str) -> Optional[T]:
        """
        Get item from cache, refreshing its recency on a hit.

        Args:
            key: Cache key

        Returns:
            Cached value or None if not found
        """
        started = time.time()
        with self._lock:
            try:
                value = self._cache[key]
            except KeyError:
                self.stats.misses += 1
                self.stats.total_time += time.time() - started
                return None
            # Move to end (most recently used)
            self._cache.move_to_end(key)
            self.stats.hits += 1
            self.stats.total_time += time.time() - started
            return value

    def put(self, key: str, value: T) -> None:
        """
        Put item in cache, evicting the least-recently-used entry when full.

        Args:
            key: Cache key
            value: Value to cache
        """
        with self._lock:
            if key in self._cache:
                # Refresh recency before overwriting the existing entry.
                self._cache.move_to_end(key)
            self._cache[key] = value
            # Evict oldest if over capacity (only possible on a new key).
            if len(self._cache) > self.max_size:
                self._cache.popitem(last=False)
                self.stats.evictions += 1

    def clear(self) -> None:
        """Clear all cached items."""
        with self._lock:
            self._cache.clear()

    def size(self) -> int:
        """Get current cache size."""
        with self._lock:
            return len(self._cache)
class WeakCache:
    """
    Weak reference cache for parsed nodes.

    Allows garbage collection of unused nodes while
    maintaining references to actively used ones.
    """

    def __init__(self):
        """Initialize weak cache."""
        self._cache: Dict[str, weakref.ref] = {}
        self._lock = threading.RLock()
        self.stats = CacheStats()

    def get(self, key: str) -> Optional[Any]:
        """
        Get item from cache.

        Args:
            key: Cache key

        Returns:
            Cached object, or None if absent or already collected
        """
        started = time.time()
        with self._lock:
            ref = self._cache.get(key)
            if ref is not None:
                target = ref()
                if target is not None:
                    self.stats.hits += 1
                    self.stats.total_time += time.time() - started
                    return target
                # Referent was garbage collected: drop the stale entry.
                del self._cache[key]
            self.stats.misses += 1
            self.stats.total_time += time.time() - started
            return None

    def put(self, key: str, value: Any) -> None:
        """
        Put item in cache with weak reference.

        Args:
            key: Cache key
            value: Object to cache (must support weak references)
        """
        with self._lock:
            self._cache[key] = weakref.ref(value)

    def clear(self) -> None:
        """Clear all cached references."""
        with self._lock:
            self._cache.clear()

    def cleanup(self) -> int:
        """
        Remove dead references.

        Returns:
            Number of references removed
        """
        with self._lock:
            dead = [k for k, ref in self._cache.items() if ref() is None]
            for k in dead:
                del self._cache[k]
            return len(dead)
class TimeBasedCache(Generic[T]):
    """
    Time-based expiring cache.

    Items expire after a specified duration.
    """

    def __init__(self, ttl_seconds: int = 3600):
        """
        Initialize time-based cache.

        Args:
            ttl_seconds: Time to live in seconds
        """
        self.ttl = timedelta(seconds=ttl_seconds)
        self._cache: Dict[str, tuple[T, datetime]] = {}
        self._lock = threading.RLock()
        self.stats = CacheStats()

    def get(self, key: str) -> Optional[T]:
        """
        Get item from cache if not expired.

        Args:
            key: Cache key

        Returns:
            Cached value or None if not found or expired
        """
        started = time.time()
        with self._lock:
            entry = self._cache.get(key)
            if entry is not None:
                value, stored_at = entry
                if datetime.now() - stored_at < self.ttl:
                    self.stats.hits += 1
                    self.stats.total_time += time.time() - started
                    return value
                # Entry outlived its TTL: evict and fall through to a miss.
                del self._cache[key]
                self.stats.evictions += 1
            self.stats.misses += 1
            self.stats.total_time += time.time() - started
            return None

    def put(self, key: str, value: T) -> None:
        """
        Put item in cache with timestamp.

        Args:
            key: Cache key
            value: Value to cache
        """
        with self._lock:
            self._cache[key] = (value, datetime.now())

    def clear(self) -> None:
        """Clear all cached items."""
        with self._lock:
            self._cache.clear()

    def cleanup(self) -> int:
        """
        Remove expired items.

        Returns:
            Number of items removed
        """
        with self._lock:
            cutoff = datetime.now()
            stale = [k for k, (_, stamp) in self._cache.items() if cutoff - stamp >= self.ttl]
            for k in stale:
                del self._cache[k]
                self.stats.evictions += 1
            return len(stale)
def cached(cache: LRUCache, key_func: Optional[Callable] = None):
    """
    Decorator for caching function results.

    Args:
        cache: Cache instance to use
        key_func: Function to generate cache key from arguments

    Returns:
        Decorated function
    """
    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            # Build the cache key: caller-supplied generator, or a default
            # derived from the function name and its arguments.
            if key_func:
                cache_key = key_func(*args, **kwargs)
            else:
                cache_key = f"{func.__name__}:{str(args)}:{str(sorted(kwargs.items()))}"

            # Serve from cache when possible (note: a cached None is
            # indistinguishable from a miss and will be recomputed).
            hit = cache.get(cache_key)
            if hit is not None:
                return hit

            # Compute, remember, return.
            computed = func(*args, **kwargs)
            cache.put(cache_key, computed)
            return computed
        return wrapper
    return decorator
class CacheManager:
    """
    Manages multiple caches for the parser.

    Provides centralized cache management and monitoring.
    """

    def __init__(self):
        """Initialize cache manager."""
        # One cache per expensive operation class.
        self.style_cache = LRUCache[dict](max_size=5000)     # style parsing
        self.header_cache = LRUCache[bool](max_size=2000)    # header detection
        self.pattern_cache = LRUCache[bool](max_size=10000)  # pattern matching
        self.node_cache = WeakCache()                        # node references
        self.regex_cache = LRUCache[Any](max_size=500)       # compiled regexes
        # Registry used by the management/monitoring helpers below.
        self._caches = {
            'style': self.style_cache,
            'header': self.header_cache,
            'pattern': self.pattern_cache,
            'node': self.node_cache,
            'regex': self.regex_cache
        }

    def get_stats(self) -> Dict[str, CacheStats]:
        """Get statistics for all caches."""
        return {
            name: cache.stats
            for name, cache in self._caches.items()
            if hasattr(cache, 'stats')
        }

    def reset_stats(self) -> None:
        """Reset statistics for all caches."""
        for cache in self._caches.values():
            if hasattr(cache, 'stats'):
                cache.stats.reset()

    def clear_all(self) -> None:
        """Clear all caches."""
        for cache in self._caches.values():
            cache.clear()

    def cleanup(self) -> Dict[str, int]:
        """
        Cleanup expired/dead entries in all caches.

        Returns:
            Number of entries cleaned up per cache
        """
        removed = {}
        # Only the weak cache currently supports cleanup.
        if hasattr(self.node_cache, 'cleanup'):
            removed['node'] = self.node_cache.cleanup()
        return removed

    def get_memory_usage(self) -> Dict[str, int]:
        """
        Estimate memory usage of caches.

        Returns:
            Approximate memory usage in bytes per cache
        """
        import sys
        usage = {}
        for name, cache in self._caches.items():
            if not hasattr(cache, '_cache'):
                continue
            # Rough estimation: shallow sizes of keys and values only.
            total = 0
            if isinstance(cache._cache, dict):
                for key, value in cache._cache.items():
                    total += sys.getsizeof(key)
                    # Fall back to a flat guess for exotic values.
                    total += sys.getsizeof(value) if hasattr(value, '__sizeof__') else 1000
            usage[name] = total
        return usage
# Global cache manager instance (created lazily)
_cache_manager = None


def get_cache_manager() -> CacheManager:
    """Return the process-wide CacheManager, creating it on first access."""
    global _cache_manager
    manager = _cache_manager
    if manager is None:
        manager = CacheManager()
        _cache_manager = manager
    return manager

View File

@@ -0,0 +1,277 @@
"""
Currency column merger for handling separated currency symbols in SEC filings.
"""
import re
from typing import List, Tuple
from edgar.documents.table_nodes import Cell
from edgar.documents.utils.table_matrix import TableMatrix, MatrixCell
class CurrencyColumnMerger:
    """
    Detects and merges currency symbol columns with their value columns.

    SEC filings often split currency values into two cells:
    - Cell 1: "$" (left-aligned)
    - Cell 2: "224.11" (right-aligned)

    This class detects this pattern and merges them into "$224.11"
    """

    # Common currency symbols.
    # NOTE(review): this set previously contained two empty strings where
    # non-ASCII symbols were lost in an encoding round-trip; restored here
    # as the euro and rupee signs -- confirm against file history.
    CURRENCY_SYMBOLS = {'$', '€', '£', '¥', '₹', 'Rs', 'USD', 'EUR', 'GBP'}

    # Pattern for numeric values (with commas, decimals)
    NUMERIC_PATTERN = re.compile(r'^[\d,]+\.?\d*$')

    def __init__(self, matrix: TableMatrix):
        """Initialize with a table matrix."""
        self.matrix = matrix
        # (symbol_col, value_col) pairs found by detect_currency_pairs()
        self.merge_pairs: List[Tuple[int, int]] = []

    def detect_currency_pairs(self) -> List[Tuple[int, int]]:
        """
        Detect column pairs that should be merged (currency symbol + value).

        Returns:
            List of (symbol_col, value_col) pairs to merge
        """
        pairs = []

        for col_idx in range(self.matrix.col_count - 1):
            if self._is_currency_column(col_idx):
                next_col = col_idx + 1
                if self._is_numeric_column(next_col):
                    # Check if they're consistently paired
                    if self._verify_pairing(col_idx, next_col):
                        pairs.append((col_idx, next_col))

        self.merge_pairs = pairs
        return pairs

    def _is_currency_column(self, col_idx: int) -> bool:
        """
        Check if a column contains only currency symbols.

        A currency column typically:
        - Contains only currency symbols or empty cells
        - Has very narrow width (1-3 characters)
        - Is left-aligned (though we check content, not style)
        """
        currency_count = 0
        empty_count = 0
        other_count = 0

        for row_idx in range(self.matrix.row_count):
            cell = self.matrix.matrix[row_idx][col_idx]
            if cell.original_cell and not cell.is_spanned:
                text = cell.original_cell.text().strip()

                # Skip header rows (first 2 rows typically)
                if row_idx < 2 and text and text not in self.CURRENCY_SYMBOLS:
                    continue

                if not text:
                    empty_count += 1
                elif text in self.CURRENCY_SYMBOLS:
                    # The previous code re-tested '$' and a short symbol
                    # list here; both were redundant subsets of
                    # CURRENCY_SYMBOLS and have been folded into this check.
                    currency_count += 1
                else:
                    other_count += 1

        # Column should be mostly currency symbols with some empty cells.
        # Header rows were excluded from the counts above.
        total_non_empty = currency_count + other_count
        if total_non_empty == 0:
            return False

        # Accept either: at least one symbol and nothing else non-currency,
        # or at least 60% of non-empty, non-header cells are symbols.
        return (currency_count >= 1 and other_count == 0) or \
               (currency_count >= 2 and currency_count / total_non_empty >= 0.6)

    def _is_numeric_column(self, col_idx: int) -> bool:
        """
        Check if a column contains numeric values.
        """
        numeric_count = 0
        non_empty_count = 0

        for row_idx in range(self.matrix.row_count):
            cell = self.matrix.matrix[row_idx][col_idx]
            if cell.original_cell and not cell.is_spanned:
                text = cell.original_cell.text().strip()

                # Skip header rows
                if row_idx < 2:
                    continue

                if text:
                    non_empty_count += 1
                    # Remove formatting and check if numeric
                    clean_text = text.replace(',', '').replace('%', '').replace('(', '').replace(')', '')
                    if self.NUMERIC_PATTERN.match(clean_text):
                        numeric_count += 1

        if non_empty_count == 0:
            return False

        # At least 60% should be numeric (lowered threshold)
        return numeric_count / non_empty_count >= 0.6

    def _verify_pairing(self, symbol_col: int, value_col: int) -> bool:
        """
        Verify that symbol and value columns are consistently paired.

        They should have content in the same rows (when symbol present, value present).
        """
        paired_rows = 0
        mismatched_rows = 0

        for row_idx in range(self.matrix.row_count):
            symbol_cell = self.matrix.matrix[row_idx][symbol_col]
            value_cell = self.matrix.matrix[row_idx][value_col]

            if symbol_cell.original_cell and value_cell.original_cell:
                symbol_text = symbol_cell.original_cell.text().strip()
                value_text = value_cell.original_cell.text().strip()

                # Check if they're paired (both have content or both empty)
                if symbol_text in self.CURRENCY_SYMBOLS and value_text:
                    paired_rows += 1
                elif not symbol_text and not value_text:
                    # Both empty is fine
                    pass
                elif symbol_text in self.CURRENCY_SYMBOLS and not value_text:
                    # Symbol without value - might be header
                    if row_idx < 2:  # Allow in headers
                        pass
                    else:
                        mismatched_rows += 1
                elif not symbol_text and value_text:
                    # Value without symbol - could be valid (continuation)
                    pass

        # Should have more paired than mismatched
        return paired_rows > mismatched_rows

    def apply_merges(self) -> 'TableMatrix':
        """
        Create a new matrix with currency columns merged.

        Returns:
            New TableMatrix with merged columns
        """
        if not self.merge_pairs:
            self.detect_currency_pairs()

        if not self.merge_pairs:
            # No merges needed
            return self.matrix

        # Calculate new column count (each merge removes one column)
        new_col_count = self.matrix.col_count - len(self.merge_pairs)

        # Create mapping from old to new columns
        old_to_new = {}
        merged_cols = set(pair[0] for pair in self.merge_pairs)  # Symbol columns to remove

        new_col = 0
        for old_col in range(self.matrix.col_count):
            if old_col in merged_cols:
                # This column will be merged with next, skip it
                continue
            old_to_new[old_col] = new_col
            new_col += 1

        # Create new matrix
        new_matrix = TableMatrix()
        new_matrix.row_count = self.matrix.row_count
        new_matrix.col_count = new_col_count
        # Preserve header-row bookkeeping; previously this was dropped and
        # the merged matrix always reported header_row_count == 0.
        new_matrix.header_row_count = self.matrix.header_row_count
        new_matrix.matrix = []

        # Build new matrix with merged cells
        for row_idx in range(self.matrix.row_count):
            new_row = [MatrixCell() for _ in range(new_col_count)]

            for old_col in range(self.matrix.col_count):
                # Check if this is a symbol column to merge
                merge_pair = next((pair for pair in self.merge_pairs if pair[0] == old_col), None)

                if merge_pair:
                    # Merge symbol with value
                    symbol_col, value_col = merge_pair
                    symbol_cell = self.matrix.matrix[row_idx][symbol_col]
                    value_cell = self.matrix.matrix[row_idx][value_col]

                    if value_cell.original_cell:
                        # Create merged cell
                        new_cell_content = self._merge_cell_content(symbol_cell, value_cell)
                        if new_cell_content:
                            # Create new merged cell
                            merged_cell = Cell(
                                content=new_cell_content,
                                colspan=value_cell.original_cell.colspan,
                                rowspan=value_cell.original_cell.rowspan,
                                is_header=value_cell.original_cell.is_header,
                                align=value_cell.original_cell.align
                            )
                            new_col_idx = old_to_new.get(value_col)
                            if new_col_idx is not None:
                                new_row[new_col_idx] = MatrixCell(
                                    original_cell=merged_cell,
                                    is_spanned=False,
                                    row_origin=row_idx,
                                    col_origin=new_col_idx
                                )
                elif old_col not in set(pair[1] for pair in self.merge_pairs):
                    # Regular column, not involved in merging
                    new_col_idx = old_to_new.get(old_col)
                    if new_col_idx is not None:
                        new_row[new_col_idx] = self.matrix.matrix[row_idx][old_col]

            new_matrix.matrix.append(new_row)

        return new_matrix

    def _merge_cell_content(self, symbol_cell: MatrixCell, value_cell: MatrixCell) -> str:
        """
        Merge symbol and value cell contents.

        Returns:
            Merged content like "$224.11" or original value if no symbol
        """
        value_text = value_cell.original_cell.text().strip() if value_cell.original_cell else ""
        symbol_text = symbol_cell.original_cell.text().strip() if symbol_cell.original_cell else ""

        if not value_text:
            return symbol_text  # Just return symbol if no value

        if symbol_text in self.CURRENCY_SYMBOLS:
            # Prepend the symbol directly, with no separating space. (The
            # old code special-cased '$' but both branches produced the
            # same result, so they were collapsed.)
            return f"{symbol_text}{value_text}"

        # No symbol, just return value
        return value_text

    def get_merge_summary(self) -> str:
        """Get a summary of merges to be applied."""
        if not self.merge_pairs:
            return "No currency column merges detected"

        summary = f"Currency merges detected: {len(self.merge_pairs)} pairs\n"
        for symbol_col, value_col in self.merge_pairs:
            summary += f"  • Column {symbol_col} ($) + Column {value_col} (value)\n"
        return summary

View File

@@ -0,0 +1,96 @@
"""
HTML utility functions for document parsing.
This module consolidates common HTML processing utilities used across
the parser, preprocessor, and simple parser implementations.
"""
import lxml.html
from typing import Optional
def remove_xml_declaration(html: str) -> str:
    """
    Remove XML declaration from HTML if present.

    SEC HTML documents sometimes include XML declarations like:
    <?xml version="1.0" encoding="UTF-8"?>

    These can interfere with HTML parsing and are safely removed since
    the encoding is handled separately by the parser.

    Args:
        html: HTML string that may contain XML declaration

    Returns:
        HTML string with XML declaration removed (if present)

    Examples:
        >>> html = '<?xml version="1.0"?><!DOCTYPE html><html>...'
        >>> remove_xml_declaration(html)
        '<!DOCTYPE html><html>...'

        >>> html = '<!DOCTYPE html><html>...'  # No XML declaration
        >>> remove_xml_declaration(html)
        '<!DOCTYPE html><html>...'
    """
    if html.strip().startswith('<?xml'):
        xml_end = html.find('?>')
        if xml_end == -1:
            # Malformed declaration with no '?>' terminator: previously
            # find() returned -1 and the slice at (-1 + 2) chopped the
            # first character. Leave the input untouched instead.
            return html
        return html[xml_end + 2:]
    return html
def create_lxml_parser(
    remove_blank_text: bool = True,
    remove_comments: bool = True,
    recover: bool = True,
    encoding: Optional[str] = 'utf-8'
) -> lxml.html.HTMLParser:
    """
    Create a configured lxml HTMLParser.

    Factory for the lxml HTMLParser settings shared across the document
    parsing system.

    Args:
        remove_blank_text: Remove blank text nodes between tags.
            Default True for cleaner tree structure.
        remove_comments: Remove HTML comments from parsed tree.
            Default True since comments are rarely needed.
        recover: Enable error recovery mode to handle malformed HTML.
            Default True since SEC filings often have HTML issues.
        encoding: Character encoding for the parser.
            Default 'utf-8'. Set to None to disable encoding handling.

    Returns:
        Configured lxml.html.HTMLParser instance

    Examples:
        >>> # Standard parser (removes whitespace and comments, recovers from errors)
        >>> parser = create_lxml_parser()

        >>> # Parser that preserves all content (for XBRL)
        >>> parser = create_lxml_parser(
        ...     remove_blank_text=False,
        ...     remove_comments=False
        ... )

        >>> # Parser without encoding (auto-detect)
        >>> parser = create_lxml_parser(encoding=None)

    Note:
        The recover=True setting is critical for SEC documents which
        often contain non-standard HTML structures.
    """
    if encoding is None:
        # No encoding argument at all: let lxml auto-detect.
        return lxml.html.HTMLParser(
            remove_blank_text=remove_blank_text,
            remove_comments=remove_comments,
            recover=recover,
        )
    return lxml.html.HTMLParser(
        remove_blank_text=remove_blank_text,
        remove_comments=remove_comments,
        recover=recover,
        encoding=encoding,
    )

View File

@@ -0,0 +1,375 @@
"""
Streaming parser for large HTML documents.
"""
import io
from typing import Dict, Any, TYPE_CHECKING
from lxml import etree
from lxml.html import HtmlElement
from edgar.documents.config import ParserConfig
from edgar.documents.exceptions import HTMLParsingError, DocumentTooLargeError
# Use TYPE_CHECKING to avoid circular imports
if TYPE_CHECKING:
from edgar.documents.document import Document, DocumentMetadata
from edgar.documents.nodes import DocumentNode, HeadingNode, ParagraphNode, TextNode, SectionNode, ContainerNode
from edgar.documents.table_nodes import TableNode
from edgar.documents.types import SemanticType
class StreamingParser:
    """
    Streaming parser for large HTML documents.

    Processes documents in chunks to minimize memory usage
    while maintaining parse quality.
    """

    # Chunk size for streaming (1MB)
    CHUNK_SIZE = 1024 * 1024

    # Maximum node buffer before flush
    MAX_NODE_BUFFER = 1000

    def __init__(self, config: ParserConfig, strategies: Dict[str, Any]):
        """
        Initialize streaming parser.

        Args:
            config: Parser configuration
            strategies: Parsing strategies to use
        """
        self.config = config
        self.strategies = strategies
        self._reset_state()

    def _reset_state(self):
        """Reset parser state so parse() can be called repeatedly."""
        # Import here to avoid circular import
        from edgar.documents.document import DocumentMetadata
        from edgar.documents.nodes import DocumentNode

        self.current_section = None
        self.node_buffer = []
        self.metadata = DocumentMetadata()
        self.root = DocumentNode()
        self.current_parent = self.root
        self.tag_stack = []
        self.text_buffer = []
        self.in_table = False
        self.table_buffer = []
        # Last seen <table> element; reset here so state from a previous
        # parse() call cannot leak into the next one (it was previously
        # only ever set in _start_table).
        self.table_elem = None
        self.bytes_processed = 0

    def parse(self, html: str) -> "Document":
        """
        Parse HTML in streaming mode.

        Args:
            html: HTML content to parse

        Returns:
            Parsed Document

        Raises:
            DocumentTooLargeError: If document exceeds size limit
            HTMLParsingError: If parsing fails
        """
        self._reset_state()

        # Store original HTML BEFORE parsing (needed for TOC-based section detection)
        original_html = html

        try:
            # Create streaming parser
            parser = etree.iterparse(
                io.BytesIO(html.encode('utf-8')),
                events=('start', 'end'),
                html=True,
                recover=True,
                encoding='utf-8'
            )

            # Process events
            for event, elem in parser:
                self._process_event(event, elem)

                # Check size limit
                self.bytes_processed += len(etree.tostring(elem, encoding='unicode', method='html'))
                if self.bytes_processed > self.config.max_document_size:
                    raise DocumentTooLargeError(self.bytes_processed, self.config.max_document_size)

                # Flush buffer if needed
                if len(self.node_buffer) >= self.MAX_NODE_BUFFER:
                    self._flush_buffer()

                # Clean up processed elements to save memory
                elem.clear()
                while elem.getprevious() is not None:
                    parent = elem.getparent()
                    if parent is not None:
                        del parent[0]
                    else:
                        break

            # Final flush
            self._flush_buffer()

            # Store original HTML in metadata for section detection (TOC analysis)
            self.metadata.original_html = original_html

            # Create document (import here to avoid circular import)
            from edgar.documents.document import Document
            document = Document(root=self.root, metadata=self.metadata)

            # Store config reference (required for section detection)
            document._config = self.config

            # Apply post-processing
            from edgar.documents.processors.postprocessor import DocumentPostprocessor
            postprocessor = DocumentPostprocessor(self.config)
            document = postprocessor.process(document)

            return document

        except etree.ParseError as e:
            raise HTMLParsingError(f"Streaming parse failed: {str(e)}")
        except Exception as e:
            if isinstance(e, (DocumentTooLargeError, HTMLParsingError)):
                raise
            raise HTMLParsingError(f"Unexpected error during streaming parse: {str(e)}")

    def _process_event(self, event: str, elem: HtmlElement):
        """Process a parse event."""
        if event == 'start':
            self._handle_start_tag(elem)
        elif event == 'end':
            self._handle_end_tag(elem)

    def _handle_start_tag(self, elem: HtmlElement):
        """Handle opening tag."""
        # Import node types at runtime to avoid circular imports
        from edgar.documents.nodes import ContainerNode

        tag = elem.tag.lower()

        # Track tag stack
        self.tag_stack.append(tag)

        # Extract metadata from early elements
        if tag == 'title' and elem.text:
            self._extract_title_metadata(elem.text)
        elif tag == 'meta':
            self._extract_meta_metadata(elem)

        # Handle specific tags
        if tag == 'body':
            # Create a container for body content
            body_container = ContainerNode(tag_name='body')
            self.root.add_child(body_container)
            self.current_parent = body_container
        elif tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
            self._start_heading(elem)
        elif tag == 'p':
            self._start_paragraph(elem)
        elif tag == 'table':
            self._start_table(elem)
        elif tag == 'section':
            self._start_section(elem)

    def _handle_end_tag(self, elem: HtmlElement):
        """Handle closing tag."""
        tag = elem.tag.lower()

        # Remove from tag stack
        if self.tag_stack and self.tag_stack[-1] == tag:
            self.tag_stack.pop()

        # Handle specific tags
        if tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
            self._end_heading(elem)
        elif tag == 'p':
            self._end_paragraph(elem)
        elif tag == 'table':
            self._end_table(elem)
        elif tag == 'section':
            self._end_section(elem)
        elif tag == 'body':
            # When body ends, flush any remaining nodes
            self._flush_buffer()

        # Handle text content
        if elem.text:
            self.text_buffer.append(elem.text.strip())
        if elem.tail:
            self.text_buffer.append(elem.tail.strip())

    def _start_heading(self, elem: HtmlElement):
        """Start processing a heading."""
        # Import node types at runtime to avoid circular imports.
        # SemanticType must also be imported here: the module-level import
        # is under TYPE_CHECKING only, so using it without this runtime
        # import raised NameError whenever a section header was detected.
        from edgar.documents.nodes import HeadingNode
        from edgar.documents.types import SemanticType

        level = int(elem.tag[1])
        text = self._get_text_content(elem)

        # Create heading node
        heading = HeadingNode(
            level=level,
            content=text
        )

        # Check if this is a section header
        if self.strategies.get('header_detection'):
            detector = self.strategies['header_detection']
            if detector.is_section_header(text, elem):
                heading.semantic_type = SemanticType.SECTION_HEADER

        self.node_buffer.append(heading)

    def _end_heading(self, elem: HtmlElement):
        """End processing a heading."""
        # Import node types at runtime to avoid circular imports
        from edgar.documents.nodes import HeadingNode

        # Get text content from element
        text = self._get_text_content(elem)
        if text and self.node_buffer and isinstance(self.node_buffer[-1], HeadingNode):
            self.node_buffer[-1].content = text

        # Clear any accumulated text buffer
        self.text_buffer.clear()

    def _start_paragraph(self, elem: HtmlElement):
        """Start processing a paragraph."""
        # Import node types at runtime to avoid circular imports
        from edgar.documents.nodes import ParagraphNode

        para = ParagraphNode()

        # Get style if present
        style_attr = elem.get('style')
        if style_attr and self.strategies.get('style_parser'):
            style_parser = self.strategies['style_parser']
            para.style = style_parser.parse(style_attr)

        self.node_buffer.append(para)

    def _end_paragraph(self, elem: HtmlElement):
        """End processing a paragraph."""
        # Import node types at runtime to avoid circular imports
        from edgar.documents.nodes import ParagraphNode, TextNode

        # Get text content from element
        text = self._get_text_content(elem)
        if text and self.node_buffer and isinstance(self.node_buffer[-1], ParagraphNode):
            text_node = TextNode(content=text)
            self.node_buffer[-1].add_child(text_node)

        # Clear any accumulated text buffer
        self.text_buffer.clear()

    def _start_table(self, elem: HtmlElement):
        """Start processing a table."""
        self.in_table = True
        self.table_buffer = []
        # Store table element for later processing
        self.table_elem = elem

    def _end_table(self, elem: HtmlElement):
        """End processing a table."""
        # Import node types at runtime to avoid circular imports
        from edgar.documents.table_nodes import TableNode

        self.in_table = False

        # Process table with table processor if available
        if self.strategies.get('table_processing'):
            processor = self.strategies['table_processing']
            table_node = processor.process(elem)
            if table_node:
                self.node_buffer.append(table_node)
        else:
            # Basic table node
            table = TableNode()
            self.node_buffer.append(table)

        self.table_buffer.clear()

    def _start_section(self, elem: HtmlElement):
        """Start processing a section."""
        # Import node types at runtime to avoid circular imports
        from edgar.documents.nodes import SectionNode

        section = SectionNode()

        # Get section attributes
        section_id = elem.get('id')
        if section_id:
            section.metadata['id'] = section_id

        section_class = elem.get('class')
        if section_class:
            section.metadata['class'] = section_class

        self.current_section = section
        self.node_buffer.append(section)

    def _end_section(self, elem: HtmlElement):
        """End processing a section."""
        self.current_section = None

    def _flush_buffer(self):
        """Flush node buffer to document tree."""
        for node in self.node_buffer:
            # Add to current parent (the open section, if any)
            if self.current_section:
                self.current_section.add_child(node)
            else:
                self.current_parent.add_child(node)

        self.node_buffer.clear()

    def _get_text_content(self, elem: HtmlElement) -> str:
        """Extract text content from element, recursing into children."""
        text_parts = []

        if elem.text:
            text_parts.append(elem.text.strip())

        for child in elem:
            child_text = self._get_text_content(child)
            if child_text:
                text_parts.append(child_text)
            if child.tail:
                text_parts.append(child.tail.strip())

        return ' '.join(text_parts)

    def _extract_title_metadata(self, title: str):
        """Extract metadata from title."""
        # Example: "APPLE INC - 10-K - 2023-09-30"
        parts = title.split(' - ')
        if len(parts) >= 2:
            self.metadata.company = parts[0].strip()
            self.metadata.form = parts[1].strip()
        if len(parts) >= 3:
            self.metadata.filing_date = parts[2].strip()

    def _extract_meta_metadata(self, elem: HtmlElement):
        """Extract metadata from meta tags."""
        name = elem.get('name', '').lower()
        content = elem.get('content', '')

        if name and content:
            if name == 'company':
                self.metadata.company = content
            elif name == 'filing-type':
                self.metadata.form = content
            elif name == 'cik':
                self.metadata.cik = content
            elif name == 'filing-date':
                self.metadata.filing_date = content
            elif name == 'accession-number':
                self.metadata.accession_number = content

View File

@@ -0,0 +1,858 @@
"""
Table matrix builder for handling complex colspan/rowspan structures.
"""
from dataclasses import dataclass
from typing import List, Optional
from edgar.documents.table_nodes import Cell, Row
@dataclass
class MatrixCell:
    """Cell in the matrix with reference to original cell"""
    # The source table Cell occupying this grid position; None for an
    # empty slot in the grid.
    original_cell: Optional[Cell] = None
    is_spanned: bool = False  # True if this is part of a colspan/rowspan
    row_origin: int = -1  # Original row index
    col_origin: int = -1  # Original column index
class TableMatrix:
"""
Build a 2D matrix representation of table with proper handling of merged cells.
This class converts a table with colspan/rowspan into a regular 2D grid
where each merged cell occupies multiple positions in the matrix.
"""
def __init__(self):
    """Initialize empty matrix"""
    # 2D grid of MatrixCell; a merged source cell occupies every grid
    # position it spans. Populated by build_from_rows().
    self.matrix: List[List[MatrixCell]] = []
    self.row_count = 0
    self.col_count = 0
    self.header_row_count = 0  # Track number of header rows
def build_from_rows(self, header_rows: List[List[Cell]], data_rows: List[Row]) -> 'TableMatrix':
    """
    Build matrix from header rows and data rows.

    Args:
        header_rows: List of header rows (each row is a list of Cells)
        data_rows: List of Row objects

    Returns:
        Self for chaining
    """
    # Remember how many of the leading rows are headers.
    self.header_row_count = len(header_rows)

    # Flatten headers and data into one ordered list of cell lists.
    all_rows = list(header_rows) + [row.cells for row in data_rows]
    if not all_rows:
        return self

    self.row_count = len(all_rows)

    # First pass: determine the actual column count (colspan-aware).
    self._calculate_dimensions(all_rows)

    # Initialize the grid, then place cells in a second pass.
    self.matrix = [
        [MatrixCell() for _ in range(self.col_count)]
        for _ in range(self.row_count)
    ]
    self._place_cells(all_rows)

    return self
def _calculate_dimensions(self, rows: List[List[Cell]]):
    """Calculate the actual dimensions considering colspan"""
    widest = 0
    for row_idx, row in enumerate(rows):
        col_pos = 0
        for cell in row:
            # Skip positions that may be claimed by a rowspan from above.
            while col_pos < widest and self._is_occupied(row_idx, col_pos):
                col_pos += 1
            # The cell occupies [col_pos, col_pos + colspan).
            col_pos += cell.colspan
            widest = max(widest, col_pos)
    self.col_count = widest
def _is_occupied(self, row: int, col: int) -> bool:
"""Check if a position is occupied by a cell from a previous row (rowspan)"""
if row == 0:
return False
# Check if any cell above has rowspan that reaches this position
for prev_row in range(row):
if prev_row < len(self.matrix) and col < len(self.matrix[prev_row]):
cell = self.matrix[prev_row][col]
if cell.original_cell and cell.row_origin == prev_row:
# Check if this cell's rowspan reaches current row
if prev_row + cell.original_cell.rowspan > row:
return True
return False
def _place_cells(self, rows: List[List[Cell]]):
    """Place cells in the matrix handling colspan and rowspan.

    Each source cell is written into every grid position it spans; the
    origin position has is_spanned=False, the rest is_spanned=True.
    Includes a narrow heuristic that right-shifts colspan=2 numeric
    values so they align with their '$'-prefixed neighbors.
    """
    for row_idx, row in enumerate(rows):
        col_pos = 0
        for cell_idx, cell in enumerate(row):
            # Find next available column position
            while col_pos < self.col_count and self.matrix[row_idx][col_pos].original_cell is not None:
                col_pos += 1
            if col_pos >= self.col_count:
                # Need to expand matrix
                self._expand_columns(col_pos + cell.colspan)
            # Special handling for cells with colspan > 1 containing numeric values
            # Only apply this logic for Table 15-style alignment issues
            # Check if this looks like a financial value that should be right-aligned
            cell_text = cell.text().strip()
            # Check for numeric values that need special alignment
            # This is specifically for cases like "167,045" that should align with "$167,045"
            has_comma_separator = ',' in cell_text
            # Guarded division: empty text yields ratio 0.
            digit_ratio = sum(c.isdigit() for c in cell_text) / len(cell_text) if cell_text else 0
            # Only apply special placement for colspan=2 numeric values in data rows
            # This handles Table 15's specific case without breaking Table 13
            is_special_numeric = (cell.colspan == 2 and  # Specifically colspan=2
                                  has_comma_separator and
                                  digit_ratio > 0.5 and  # More than 50% digits
                                  not cell_text.startswith('$') and
                                  not any(month in cell_text.lower() for month in
                                          ['jan', 'feb', 'mar', 'apr', 'may', 'jun',
                                           'jul', 'aug', 'sep', 'oct', 'nov', 'dec']) and
                                  row_idx > 1)  # Not a header row (allow for multi-row headers)
            if is_special_numeric:
                # Place empty cell at first position, content at second position
                # This is specifically for Table 15 alignment
                for r in range(cell.rowspan):
                    # First column of span: empty
                    if row_idx + r < self.row_count and col_pos < self.col_count:
                        self.matrix[row_idx + r][col_pos] = MatrixCell()
                    # Second column of span: the actual content
                    if row_idx + r < self.row_count and col_pos + 1 < self.col_count:
                        matrix_cell = MatrixCell(
                            original_cell=cell,
                            is_spanned=False,
                            row_origin=row_idx,
                            col_origin=col_pos + 1
                        )
                        self.matrix[row_idx + r][col_pos + 1] = matrix_cell
                    # Remaining columns of span: mark as spanned (though colspan=2 has no remaining)
                    for c in range(2, cell.colspan):
                        if row_idx + r < self.row_count and col_pos + c < self.col_count:
                            matrix_cell = MatrixCell(
                                original_cell=cell,
                                is_spanned=True,
                                row_origin=row_idx,
                                col_origin=col_pos + 1
                            )
                            self.matrix[row_idx + r][col_pos + c] = matrix_cell
            else:
                # Normal placement for other cells
                for r in range(cell.rowspan):
                    for c in range(cell.colspan):
                        if row_idx + r < self.row_count and col_pos + c < self.col_count:
                            matrix_cell = MatrixCell(
                                original_cell=cell,
                                is_spanned=(r > 0 or c > 0),
                                row_origin=row_idx,
                                col_origin=col_pos
                            )
                            self.matrix[row_idx + r][col_pos + c] = matrix_cell
            col_pos += cell.colspan
def _expand_columns(self, new_col_count: int):
    """Expand matrix to accommodate more columns"""
    extra = new_col_count - self.col_count
    if extra <= 0:
        # Already wide enough; nothing to do.
        return
    for row in self.matrix:
        row.extend(MatrixCell() for _ in range(extra))
    self.col_count = new_col_count
def get_actual_columns(self) -> int:
    """Get the actual number of data columns (excluding empty/spacing columns)"""
    def column_has_content(col_idx: int) -> bool:
        # A column counts if any origin (non-spanned) cell in it has
        # visible, non-whitespace text.
        for row_idx in range(self.row_count):
            slot = self.matrix[row_idx][col_idx]
            if slot.original_cell and not slot.is_spanned:
                text = slot.original_cell.text().strip()
                if text and text not in ('', ' ', '\xa0'):
                    return True
        return False

    return sum(1 for col_idx in range(self.col_count) if column_has_content(col_idx))
def get_column_widths(self) -> List[float]:
    """Estimate column widths based on content"""
    widths: List[float] = []
    for col_idx in range(self.col_count):
        texts = []
        for row_idx in range(self.row_count):
            slot = self.matrix[row_idx][col_idx]
            if slot.original_cell and not slot.is_spanned:
                content = slot.original_cell.text().strip()
                if content:
                    texts.append(content)
        # A column with no content at all is likely a spacing column
        # and gets width 0; otherwise use the longest cell text.
        widths.append(max(len(t) for t in texts) if texts else 0)
    return widths
def get_cell(self, row_idx: int, col_idx: int) -> Optional[Cell]:
    """
    Get a cell at specific position in the matrix.

    Args:
        row_idx: Row index
        col_idx: Column index

    Returns:
        Cell at position or None if out of bounds
    """
    in_bounds = 0 <= row_idx < self.row_count and 0 <= col_idx < self.col_count
    if not in_bounds:
        return None

    slot = self.matrix[row_idx][col_idx]
    # Empty grid positions are reported as an empty Cell, not None.
    return slot.original_cell if slot.original_cell else Cell("")
def get_expanded_row(self, row_idx: int) -> List[Optional[Cell]]:
    """
    Get a row with cells expanded to match column count.

    For cells with colspan > 1, the cell appears in the first position
    and None in subsequent positions.
    """
    if row_idx >= self.row_count:
        return []

    # Origin cells are kept; spanned copies and empty slots become None.
    return [
        slot.original_cell if (slot.original_cell and not slot.is_spanned) else None
        for slot in self.matrix[row_idx]
    ]
def get_data_columns(self) -> List[int]:
    """
    Get indices of columns that contain actual data (not spacing).
    Uses strategy similar to old parser - keeps single empty columns for spacing.
    Returns:
        List of column indices that contain data
    """
    # First, identify which columns are empty (no origin cell with
    # non-whitespace text anywhere in the column).
    empty_cols = []
    for col_idx in range(self.col_count):
        has_content = False
        for row_idx in range(self.row_count):
            cell = self.matrix[row_idx][col_idx]
            if cell.original_cell and not cell.is_spanned:
                text = cell.original_cell.text().strip()
                if text:
                    has_content = True
                    break
        if not has_content:
            empty_cols.append(col_idx)
    # Apply old parser's strategy
    cols_to_remove = set()
    # Remove leading empty columns
    for col in range(self.col_count):
        if col in empty_cols:
            cols_to_remove.add(col)
        else:
            break
    # Remove trailing empty columns
    for col in reversed(range(self.col_count)):
        if col in empty_cols:
            cols_to_remove.add(col)
        else:
            break
    # Remove consecutive empty columns in the middle (keep single empty cols for spacing)
    i = 0
    while i < self.col_count - 1:
        if i in empty_cols and (i + 1) in empty_cols:
            # Found consecutive empty columns; measure the run length.
            consecutive_count = 0
            j = i
            while j < self.col_count and j in empty_cols:
                consecutive_count += 1
                j += 1
            # Keep first empty column as spacer, remove the rest
            cols_to_remove.update(range(i + 1, i + consecutive_count))
            i = j
        else:
            i += 1
    # Return columns that are NOT in the removal set
    data_cols = [col for col in range(self.col_count) if col not in cols_to_remove]
    return data_cols
def filter_spacing_columns(self) -> 'TableMatrix':
    """
    Create a new matrix with spacing columns removed.

    Also handles colspan-generated duplicate columns and misalignment:
    fragments that SEC filings often render in their own cells ('$'
    before a value, a trailing ')' after '(123', a trailing '%') are
    merged back into the adjacent value column.

    Returns:
        New TableMatrix with only data columns, or ``self`` unchanged
        when no column would survive filtering.
    """
    # First pass: identify primary header columns (those with colspan > 1 headers)
    # and data columns
    primary_header_cols = set()
    all_header_cols = set()
    data_cols = set()
    # Find primary header columns (those that start a colspan).
    # Only the first 3 rows are inspected as candidate header rows.
    for row_idx in range(min(3, self.row_count)):
        for col_idx in range(self.col_count):
            cell = self.matrix[row_idx][col_idx]
            if cell.original_cell and not cell.is_spanned:
                if cell.original_cell.text().strip():
                    all_header_cols.add(col_idx)
                    # Check if this is a primary header (colspan > 1)
                    if cell.original_cell.colspan > 1:
                        primary_header_cols.add(col_idx)
    # If no primary headers found, use all headers as primary
    if not primary_header_cols:
        primary_header_cols = all_header_cols
    # Phase 1.5: Identify columns with header content
    # Any column with non-empty text in ANY header row must be preserved
    # This prevents legitimate header columns from being removed as "spacing"
    # Also preserve columns that are spanned by headers (colspan > 1)
    header_content_columns = set()
    for col_idx in range(self.col_count):
        for row_idx in range(self.header_row_count):
            cell = self.matrix[row_idx][col_idx]
            if cell.original_cell:
                # Check for original header cell with content
                if not cell.is_spanned:
                    text = cell.original_cell.text().strip()
                    if text:
                        header_content_columns.add(col_idx)
                        # Also add all columns spanned by this header
                        if cell.original_cell.colspan > 1:
                            for span_offset in range(1, cell.original_cell.colspan):
                                span_col = col_idx + span_offset
                                if span_col < self.col_count:
                                    header_content_columns.add(span_col)
                        break  # Found content, no need to check other header rows
                # Also preserve columns that are spanned (part of a colspan)
                elif cell.is_spanned:
                    # This column is part of a header's colspan
                    text = cell.original_cell.text().strip()
                    if text:
                        header_content_columns.add(col_idx)
    # Find columns with data (skip header rows)
    # Count actual header rows by checking for non-data content
    actual_header_rows = 0
    for row_idx in range(min(3, self.row_count)):
        has_numeric_data = False
        for col_idx in range(self.col_count):
            cell = self.matrix[row_idx][col_idx]
            if cell.original_cell and not cell.is_spanned:
                text = cell.original_cell.text().strip()
                # Check if it looks like numeric data (has commas or starts with $)
                # NOTE(review): due to operator precedence this evaluates as
                # ``(text and <comma+digit test>) or text == '$'`` — the net
                # effect matches the intended check, but confirm before editing.
                if text and (',' in text and any(c.isdigit() for c in text)) or text == '$':
                    has_numeric_data = True
                    break
        if has_numeric_data:
            break
        actual_header_rows += 1
    # Data rows start after the detected headers; at least row 1.
    data_start_row = max(1, actual_header_rows)
    # Track columns with significant data (not just isolated cells)
    col_data_count = {}
    for row_idx in range(data_start_row, self.row_count):
        for col_idx in range(self.col_count):
            cell = self.matrix[row_idx][col_idx]
            if cell.original_cell and not cell.is_spanned:
                if cell.original_cell.text().strip():
                    data_cols.add(col_idx)
                    col_data_count[col_idx] = col_data_count.get(col_idx, 0) + 1
    # Build initial list of columns to keep
    # Always include column 0 if it contains row labels
    cols_to_keep = set(primary_header_cols)
    # Add columns with header content (prevents removing legitimate headers)
    cols_to_keep.update(header_content_columns)
    # Identify misaligned data columns that need to be consolidated
    # These are data columns that are not primary header columns
    misaligned_data_cols = data_cols - primary_header_cols
    # Map misaligned data columns to their nearest column for consolidation
    # Only consolidate directly adjacent columns with specific patterns
    consolidation_map = {}
    # First pass: identify all potential consolidations
    potential_consolidations = {}
    for data_col in sorted(misaligned_data_cols):
        # Check if this column should be consolidated with an adjacent column
        # Check the column immediately before this one
        prev_col = data_col - 1
        # Sample some cells to see if consolidation makes sense
        # (at most 10 data rows are sampled).
        consolidation_type = None
        for row_idx in range(data_start_row, min(data_start_row + 10, self.row_count)):
            prev_cell = self.matrix[row_idx][prev_col] if prev_col >= 0 else None
            curr_cell = self.matrix[row_idx][data_col]
            if prev_cell and prev_cell.original_cell and curr_cell.original_cell:
                prev_text = prev_cell.original_cell.text().strip()
                curr_text = curr_cell.original_cell.text().strip()
                # Skip empty cells
                if not prev_text or not curr_text:
                    continue
                # Check for patterns that indicate consolidation
                if prev_text == '$' and curr_text and curr_text[0].isdigit():
                    consolidation_type = 'currency'
                    break
                elif prev_text.startswith('(') and curr_text == ')':
                    consolidation_type = 'parentheses'
                    break
                elif curr_text == '%' and prev_text and prev_text[-1].isdigit():
                    consolidation_type = 'percentage'
                    break
        if consolidation_type:
            potential_consolidations[data_col] = (prev_col, consolidation_type)
    # Second pass: resolve conflicts
    # If column Y is a target for consolidation from Y+1 (e.g., parentheses),
    # then don't consolidate Y into another column
    columns_needed_as_targets = set()
    for data_col, (target_col, cons_type) in potential_consolidations.items():
        if cons_type == 'parentheses':
            # This target column is needed for parentheses consolidation
            columns_needed_as_targets.add(target_col)
    # Build final consolidation map, skipping consolidations that would remove needed targets
    for data_col, (target_col, cons_type) in potential_consolidations.items():
        # Don't consolidate this column if it's needed as a target for parentheses
        if data_col in columns_needed_as_targets and cons_type != 'parentheses':
            continue
        # CRITICAL: Don't consolidate columns that have header content
        # This prevents legitimate header columns from being merged together
        if data_col in header_content_columns or target_col in header_content_columns:
            continue
        consolidation_map[data_col] = target_col
        # Debug: uncomment to see consolidation mapping
        # import os
        # if os.environ.get('DEBUG_TABLE_CONSOLIDATION'):
        #     print(f"Consolidating column {data_col} into {target_col}")
    # Special case: Keep data columns that are associated with header columns
    # This handles cases where headers span multiple columns but data is in specific columns
    for header_col in primary_header_cols:
        # Check if there's a data column immediately after the header column
        # This is common when headers span multiple columns
        for offset in range(1, 3):  # Check next 1-2 columns
            data_col = header_col + offset
            if data_col in data_cols and data_col not in cols_to_keep:
                # Check if this column has meaningful data
                has_data = False
                for row_idx in range(data_start_row, min(data_start_row + 5, self.row_count)):
                    cell = self.matrix[row_idx][data_col]
                    if cell.original_cell and not cell.is_spanned:
                        text = cell.original_cell.text().strip()
                        if text and text not in ['', '-', '', '']:
                            has_data = True
                            break
                if has_data:
                    cols_to_keep.add(data_col)
    # Keep data columns that have significant content but aren't near header columns
    # This includes columns with dates, text descriptions, etc.
    for col_idx in data_cols:
        if col_idx not in cols_to_keep:
            # Check if this column has important data
            has_important_data = False
            non_empty_count = 0
            text_samples = []
            for row_idx in range(data_start_row, min(data_start_row + 10, self.row_count)):
                cell = self.matrix[row_idx][col_idx]
                if cell.original_cell and not cell.is_spanned:
                    text = cell.original_cell.text().strip()
                    if text and text not in ['', '-', '', '']:
                        non_empty_count += 1
                        if len(text_samples) < 3:
                            text_samples.append(text)
                        # Check for important patterns
                        # Dates, years, text descriptions, etc.
                        if any([
                            len(text) > 3 and not text.replace(',', '').replace('.', '').isdigit(),  # Non-trivial text
                            any(month in text for month in ['January', 'February', 'March', 'April', 'May', 'June',
                                                            'July', 'August', 'September', 'October', 'November', 'December']),
                            any(month in text for month in ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                                                            'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']),
                            '20' in text and any(c.isdigit() for c in text),  # Likely contains year
                        ]):
                            has_important_data = True
            # Keep columns with consistent important data
            if has_important_data and non_empty_count >= 3:
                cols_to_keep.add(col_idx)
    # Special case: If we have very few primary headers but lots of data columns,
    # we might have a table where headers are in data rows (like years)
    # Keep columns that have significant financial data
    if len(primary_header_cols) <= 2 and len(data_cols) > 4:
        # Check for financial data patterns in columns
        for col_idx in data_cols:
            has_financial_data = False
            sample_count = 0
            # Sample a few cells from this column
            for row_idx in range(data_start_row, min(data_start_row + 5, self.row_count)):
                cell = self.matrix[row_idx][col_idx]
                if cell.original_cell and not cell.is_spanned:
                    text = cell.original_cell.text().strip()
                    if text:
                        sample_count += 1
                        # Check for financial patterns
                        if any([
                            text.startswith('(') and any(c.isdigit() for c in text),  # Negative numbers
                            text == ')' and col_idx > 0,  # Closing parenthesis
                            '$' in text,  # Currency
                            '%' in text,  # Percentages
                            text.replace(',', '').replace('.', '').isdigit(),  # Plain numbers
                            text in ['', '', '-', '*']  # Common placeholders
                        ]):
                            has_financial_data = True
                            break
            # Keep columns with financial data
            if has_financial_data and sample_count > 0:
                cols_to_keep.add(col_idx)
    # Check if column 0 contains row labels (non-empty cells in data rows)
    col_0_has_labels = False
    data_start_row = max(1, actual_header_rows)
    for row_idx in range(data_start_row, self.row_count):
        cell = self.matrix[row_idx][0]
        if cell.original_cell and not cell.is_spanned:
            text = cell.original_cell.text().strip()
            # A label is any multi-character text that isn't a number or a $ amount.
            if text and not text.isdigit() and not text.startswith('$') and len(text) > 1:
                col_0_has_labels = True
                break
    # Include column 0 if it has labels
    if col_0_has_labels:
        cols_to_keep.add(0)
    # Remove columns that will be consolidated into other columns
    # These columns' data will be merged into their target columns
    cols_to_remove = set(consolidation_map.keys())
    cols_to_keep = cols_to_keep - cols_to_remove
    cols_to_keep = sorted(cols_to_keep)
    # Create new matrix with consolidated columns
    if not cols_to_keep:
        return self
    new_matrix = TableMatrix()
    new_matrix.row_count = self.row_count
    new_matrix.col_count = len(cols_to_keep)
    new_matrix.header_row_count = self.header_row_count  # Preserve header row count
    new_matrix.matrix = []
    # Create mapping from old to new column indices
    old_to_new = {old_col: new_idx for new_idx, old_col in enumerate(cols_to_keep)}
    # Build new matrix with consolidation
    for row_idx in range(self.row_count):
        new_row = [MatrixCell() for _ in range(new_matrix.col_count)]
        # Track which cells we've already placed to handle colspan properly
        placed_origins = {}  # Maps (row_origin, col_origin) to new column index
        # First, copy cells from kept columns
        for old_col in sorted(cols_to_keep):
            if old_col not in old_to_new:
                continue
            new_col = old_to_new[old_col]
            cell = self.matrix[row_idx][old_col]
            if cell.original_cell:
                origin_key = (cell.row_origin, cell.col_origin)
                # Check if we've already placed this cell (due to colspan)
                if origin_key in placed_origins:
                    # This is a continuation of a colspan - mark as spanned
                    new_row[new_col] = MatrixCell(
                        original_cell=cell.original_cell,
                        is_spanned=True,  # Mark as spanned since it's part of a colspan
                        row_origin=cell.row_origin,
                        col_origin=placed_origins[origin_key]  # Point to the original placement
                    )
                else:
                    # First occurrence of this cell - place normally
                    new_row[new_col] = MatrixCell(
                        original_cell=cell.original_cell,
                        is_spanned=False,  # This is the primary cell
                        row_origin=cell.row_origin,
                        col_origin=new_col
                    )
                    placed_origins[origin_key] = new_col
        # Then, consolidate misaligned data into header columns
        for data_col, header_col in consolidation_map.items():
            if header_col in old_to_new:
                new_col = old_to_new[header_col]
                data_cell = self.matrix[row_idx][data_col] if data_col < len(self.matrix[row_idx]) else None
                # If data cell has content, merge it with header column
                if data_cell and data_cell.original_cell and not data_cell.is_spanned:
                    # Skip empty data cells
                    if not data_cell.original_cell.text().strip():
                        continue
                    # Check the original header column cell to see if it has content to merge
                    header_cell = self.matrix[row_idx][header_col]
                    # NOTE(review): existing_cell is never read below — candidate for removal.
                    existing_cell = new_row[new_col]
                    # Check if we need to merge (e.g., $ with value)
                    if header_cell.original_cell and header_cell.original_cell.text().strip():
                        existing_text = header_cell.original_cell.text().strip()
                        new_text = data_cell.original_cell.text().strip()
                        # Merge currency symbol with value OR value with percentage OR parentheses
                        if existing_text == '$' and new_text:
                            # Currency merge: $ + number
                            merged_text = f"${new_text}"
                            # Create new cell with merged content
                            merged_cell = Cell(
                                content=merged_text,
                                colspan=header_cell.original_cell.colspan,
                                rowspan=header_cell.original_cell.rowspan,
                                is_header=header_cell.original_cell.is_header,
                                align=data_cell.original_cell.align if hasattr(data_cell.original_cell, 'align') else None
                            )
                            new_row[new_col] = MatrixCell(
                                original_cell=merged_cell,
                                is_spanned=False,
                                row_origin=row_idx,
                                col_origin=new_col
                            )
                        elif new_text == ')' and existing_text.startswith('('):
                            # Parentheses merge: (number + )
                            merged_text = f"{existing_text})"
                            # Create new cell with merged content
                            merged_cell = Cell(
                                content=merged_text,
                                colspan=header_cell.original_cell.colspan,
                                rowspan=header_cell.original_cell.rowspan,
                                is_header=header_cell.original_cell.is_header,
                                align=data_cell.original_cell.align if hasattr(data_cell.original_cell, 'align') else None
                            )
                            new_row[new_col] = MatrixCell(
                                original_cell=merged_cell,
                                is_spanned=False,
                                row_origin=row_idx,
                                col_origin=new_col
                            )
                        elif new_text == '%' and existing_text:
                            # Percentage merge: number + %
                            merged_text = f"{existing_text}%"
                            # Create new cell with merged content
                            merged_cell = Cell(
                                content=merged_text,
                                colspan=header_cell.original_cell.colspan,
                                rowspan=header_cell.original_cell.rowspan,
                                is_header=header_cell.original_cell.is_header,
                                align=header_cell.original_cell.align if hasattr(header_cell.original_cell, 'align') else None
                            )
                            new_row[new_col] = MatrixCell(
                                original_cell=merged_cell,
                                is_spanned=False,
                                row_origin=row_idx,
                                col_origin=new_col
                            )
                        else:
                            # Just keep the data cell if can't merge
                            new_row[new_col] = MatrixCell(
                                original_cell=data_cell.original_cell,
                                is_spanned=False,
                                row_origin=row_idx,
                                col_origin=new_col
                            )
                    else:
                        # No existing content, just move the data
                        new_row[new_col] = MatrixCell(
                            original_cell=data_cell.original_cell,
                            is_spanned=False,
                            row_origin=row_idx,
                            col_origin=new_col
                        )
        new_matrix.matrix.append(new_row)
    return new_matrix
def to_cell_grid(self) -> List[List[Optional[Cell]]]:
    """
    Convert matrix to a simple 2D grid of cells.

    Spanned continuation positions and empty positions are reported
    as None; only origin cells appear in the grid.

    Returns:
        2D list where each position contains either a Cell or None
    """
    grid = []
    for r in range(self.row_count):
        row = [
            entry.original_cell if (entry.original_cell and not entry.is_spanned) else None
            for entry in (self.matrix[r][c] for c in range(self.col_count))
        ]
        grid.append(row)
    return grid
def debug_print(self):
    """Print matrix structure for debugging"""
    print(f"Matrix: {self.row_count}×{self.col_count}")
    for row_idx in range(self.row_count):
        # Render each position: "[text...]" for spanned continuations,
        # "text..." for origin cells, "___" for empty positions.
        parts = []
        for col_idx in range(self.col_count):
            entry = self.matrix[row_idx][col_idx]
            if entry.original_cell:
                snippet = entry.original_cell.text()[:10]
                parts.append(f"[{snippet}...]" if entry.is_spanned else f"{snippet}...")
            else:
                parts.append("___")
        print(f"Row {row_idx}: {' | '.join(parts)}")
class ColumnAnalyzer:
    """Analyze column structure to identify data vs spacing columns"""

    def __init__(self, matrix: 'TableMatrix'):
        """Initialize with a table matrix.

        Args:
            matrix: TableMatrix whose columns should be classified.
        """
        self.matrix = matrix

    def identify_spacing_columns(self) -> List[int]:
        """
        Identify columns used only for spacing.

        Returns:
            List of column indices that are spacing columns
        """
        # NOTE: a previous revision computed get_column_widths() and a
        # total width here and threaded them into _is_spacing_column,
        # which never used them; that dead computation was removed.
        return [
            col_idx
            for col_idx in range(self.matrix.col_count)
            if self._is_spacing_column(col_idx)
        ]

    def _is_spacing_column(self, col_idx: int) -> bool:
        """
        Check if a column is used for spacing.
        Only mark as spacing if column is completely empty.

        Criteria:
        - Column has absolutely no content across all rows
        """
        for row_idx in range(self.matrix.row_count):
            cell = self.matrix.matrix[row_idx][col_idx]
            if cell.original_cell and not cell.is_spanned:
                # Any non-whitespace text disqualifies the column.
                if cell.original_cell.text().strip():
                    return False
        # Column is completely empty
        return True

    def get_clean_column_indices(self) -> List[int]:
        """
        Get indices of non-spacing columns.

        Returns:
            List of column indices that contain actual data
        """
        spacing = set(self.identify_spacing_columns())
        return [i for i in range(self.matrix.col_count) if i not in spacing]

View File

@@ -0,0 +1,440 @@
"""
Table of Contents analyzer for SEC filings.
This module analyzes the TOC structure to map section names to anchor IDs,
enabling section extraction for API filings with generated anchor IDs.
"""
import re
from typing import Dict, List, Optional, Set, Tuple
from dataclasses import dataclass
from lxml import html as lxml_html
@dataclass
class TOCSection:
    """Represents a section found in the Table of Contents."""
    name: str  # Raw link text exactly as it appears in the TOC
    anchor_id: str  # Target anchor id (the href with the leading '#' stripped)
    normalized_name: str  # Canonical name, e.g. "Item 1A" or "Part II"
    section_type: str  # 'item', 'part', 'other'
    order: int  # Sort key (Item 1 -> 1000, Item 1A -> 1001, Part I -> 100, ...)
    part: Optional[str] = None  # NEW: "Part I", "Part II", or None for 10-K
class TOCAnalyzer:
    """
    Analyzes Table of Contents structure to map section names to anchor IDs.

    This enables section extraction for filings where anchor IDs are generated
    rather than semantic (like API filings vs local HTML files).
    """

    def __init__(self):
        # SEC section patterns for normalization.
        # Each entry is (regex, section_type); used by _is_section_link to
        # recognize link text that refers to a filing section.
        self.section_patterns = [
            (r'(?:item|part)\s+\d+[a-z]?', 'item'),
            (r'business', 'item'),
            (r'risk\s+factors?', 'item'),
            (r'properties', 'item'),
            (r'legal\s+proceedings', 'item'),
            (r'management.*discussion', 'item'),
            (r'md&a', 'item'),
            (r'financial\s+statements?', 'item'),
            (r'exhibits?', 'item'),
            (r'signatures?', 'item'),
            (r'part\s+[ivx]+', 'part'),
        ]

    def analyze_toc_structure(self, html_content: str) -> Dict[str, str]:
        """
        Analyze HTML content to extract section mappings from TOC.

        Args:
            html_content: Raw HTML content

        Returns:
            Dict mapping normalized section names to anchor IDs.
            Empty on any parsing failure (callers fall back to other methods).
        """
        section_mapping = {}
        try:
            # Handle XML declaration issues
            # (lxml rejects a declaration when parsing from a str).
            if html_content.startswith('<?xml'):
                html_content = re.sub(r'<\?xml[^>]*\?>', '', html_content, count=1)
            tree = lxml_html.fromstring(html_content)
            # Find all anchor links that could be TOC links
            anchor_links = tree.xpath('//a[@href]')
            toc_sections = []
            current_part = None  # Track current part context for 10-Q filings
            part_pattern = re.compile(r'^\s*Part\s+([IVX]+)\b', re.IGNORECASE)
            for link in anchor_links:
                href = link.get('href', '').strip()
                text = (link.text_content() or '').strip()
                # Check if this link or its row represents a part header
                # Part headers in 10-Q TOCs typically appear as separate rows: "Part I", "Part II"
                part_match = part_pattern.match(text)
                if part_match:
                    # Update current part context
                    current_part = f"Part {part_match.group(1).upper()}"
                    # Don't create a section for the part header itself
                    continue
                # Look for internal anchor links
                if href.startswith('#') and text:
                    anchor_id = href[1:]  # Remove #
                    # Try to find item number in preceding context (for table-based TOCs)
                    preceding_item = self._extract_preceding_item_label(link)
                    # Check if this looks like a section reference (check text, anchor ID, and context)
                    if self._is_section_link(text, anchor_id, preceding_item):
                        # Verify target exists
                        target_elements = tree.xpath(f'//*[@id="{anchor_id}"]')
                        if target_elements:
                            # Try to extract item number from: anchor ID > preceding context > text
                            normalized_name = self._normalize_section_name(text, anchor_id, preceding_item)
                            section_type, order = self._get_section_type_and_order(normalized_name)
                            toc_section = TOCSection(
                                name=text,
                                anchor_id=anchor_id,
                                normalized_name=normalized_name,
                                section_type=section_type,
                                order=order,
                                part=current_part  # Assign current part context
                            )
                            toc_sections.append(toc_section)
            # Build mapping prioritizing the most standard section names
            section_mapping = self._build_section_mapping(toc_sections)
        except Exception as e:
            # Return empty mapping on error - fallback to other methods
            # (deliberate best-effort: TOC analysis is one of several strategies)
            pass
        return section_mapping

    def _extract_preceding_item_label(self, link_element) -> str:
        """
        Extract item/part label from preceding context.

        Handles table-based TOCs where item number is in a separate cell:
        <td>Item 1.</td><td><a href="...">Business</a></td>

        Also handles nested structures like:
        <td>Item 1.</td><td><div><span><a href="...">Business</a></span></div></td>

        Args:
            link_element: The <a> element

        Returns:
            Item label like "Item 1", "Item 1A", "Part I" or empty string
        """
        try:
            # Traverse up to find the containing <td> or <th> (up to 5 levels)
            current = link_element
            td_element = None
            for _ in range(5):
                parent = current.getparent()
                if parent is None:
                    break
                if parent.tag in ['td', 'th']:
                    td_element = parent
                    break
                current = parent
            # If we found a <td>, check ALL preceding siblings in the row
            # This handles TOCs where item number is not in the immediately adjacent cell
            # Example: ['Business', 'I', '1', '5'] where '1' is the item number
            if td_element is not None:
                # Check all preceding siblings (rightmost to leftmost)
                prev_sibling = td_element.getprevious()
                while prev_sibling is not None:
                    if prev_sibling.tag in ['td', 'th']:
                        prev_text = (prev_sibling.text_content() or '').strip()
                        # Look for "Item X" or just "X" (bare number) pattern
                        # Match full format: "Item 1A"
                        item_match = re.match(r'(Item\s+\d+[A-Z]?)\.?\s*$', prev_text, re.IGNORECASE)
                        if item_match:
                            return item_match.group(1)
                        # Match bare item number: "1A" or "1" (only valid 10-K item numbers: 1-15)
                        # This prevents page numbers (50, 108, etc.) from being treated as items
                        bare_item_match = re.match(r'^([1-9]|1[0-5])([A-Z]?)\.?\s*$', prev_text, re.IGNORECASE)
                        if bare_item_match:
                            item_num = bare_item_match.group(1)
                            item_letter = bare_item_match.group(2)
                            return f"Item {item_num}{item_letter}"
                        # Match part: "Part I" or just "I"
                        part_match = re.match(r'(Part\s+[IVX]+)\.?\s*$', prev_text, re.IGNORECASE)
                        if part_match:
                            return part_match.group(1)
                        # Match bare part: "I", "II", etc.
                        bare_part_match = re.match(r'^([IVX]+)\.?\s*$', prev_text)
                        if bare_part_match:
                            return f"Part {bare_part_match.group(1)}"
                    prev_sibling = prev_sibling.getprevious()
            # Also check immediate parent's text for inline patterns (div/span structures)
            parent = link_element.getparent()
            if parent is not None and parent.tag in ['div', 'span', 'p']:
                if parent.text:
                    text_before = parent.text.strip()
                    item_match = re.search(r'(Item\s+\d+[A-Z]?)\.?\s*$', text_before, re.IGNORECASE)
                    if item_match:
                        return item_match.group(1)
                    part_match = re.search(r'(Part\s+[IVX]+)\.?\s*$', text_before, re.IGNORECASE)
                    if part_match:
                        return part_match.group(1)
        except Exception:
            # Best-effort extraction; any structural surprise yields "no label".
            pass
        return ''

    def _is_section_link(self, text: str, anchor_id: str = '', preceding_item: str = '') -> bool:
        """
        Check if link represents a section reference.

        Checks link text, anchor ID, and preceding context to handle cases where:
        - Text is descriptive (e.g., "Executive Compensation")
        - Anchor ID contains item number (e.g., "item_11_executive_compensation")
        - Item number is in preceding table cell (e.g., <td>Item 1.</td><td><a>Business</a></td>)

        Args:
            text: Link text
            anchor_id: Anchor ID from href (without #)
            preceding_item: Item/part label from preceding context (e.g., "Item 1A")

        Returns:
            True if this appears to be a section link
        """
        if not text:
            return False
        # First check if there's a preceding item label (table-based TOC)
        if preceding_item:
            return True
        # Then check anchor ID for item/part patterns (most reliable)
        if anchor_id:
            anchor_lower = anchor_id.lower()
            # Match patterns like: item_1, item_1a, item1, item1a, part_i, part_ii, etc.
            if re.search(r'item_?\d+[a-z]?', anchor_lower):
                return True
            if re.search(r'part_?[ivx]+', anchor_lower):
                return True
        # Then check text (with relaxed length limit for descriptive section names)
        if len(text) > 150:  # Increased from 100 to accommodate longer section titles
            return False
        # Check against known patterns
        for pattern, _ in self.section_patterns:
            if re.search(pattern, text, re.IGNORECASE):
                return True
        # Also consider links with section keywords
        # (stricter 100-char limit for this weaker keyword-only heuristic)
        if len(text) < 100 and any(keyword in text.lower() for keyword in
                                   ['item', 'part', 'business', 'risk', 'properties', 'legal',
                                    'compensation', 'ownership', 'governance', 'directors']):
            return True
        return False

    def _normalize_section_name(self, text: str, anchor_id: str = '', preceding_item: str = '') -> str:
        """
        Normalize section name for consistent lookup.

        Prioritizes:
        1. Preceding item label (table-based TOC)
        2. Anchor ID pattern
        3. Text-based normalization

        Args:
            text: Link text
            anchor_id: Anchor ID from href (without #)
            preceding_item: Item/part label from preceding context

        Returns:
            Normalized section name (e.g., "Item 1A", "Part II"),
            or the original text when no normalization rule applies.
        """
        text = text.strip()
        # HIGHEST PRIORITY: Use preceding item label if available (table-based TOC)
        if preceding_item:
            # Clean up and normalize the preceding item
            item_match = re.match(r'item\s+(\d+[a-z]?)', preceding_item, re.IGNORECASE)
            if item_match:
                return f"Item {item_match.group(1).upper()}"
            part_match = re.match(r'part\s+([ivx]+)', preceding_item, re.IGNORECASE)
            if part_match:
                return f"Part {part_match.group(1).upper()}"
        # SECOND PRIORITY: Try to extract from anchor ID
        if anchor_id:
            anchor_lower = anchor_id.lower()
            # Match item patterns: item_1a, item1a, item_1_business, etc.
            item_match = re.search(r'item_?(\d+[a-z]?)', anchor_lower)
            if item_match:
                item_num = item_match.group(1).upper()
                return f"Item {item_num}"
            # Match part patterns: part_i, part_ii, parti, partii, etc.
            part_match = re.search(r'part_?([ivx]+)', anchor_lower)
            if part_match:
                part_num = part_match.group(1).upper()
                return f"Part {part_num}"
        # THIRD PRIORITY: Text-based normalization
        # Handle common Item patterns in text
        item_match = re.match(r'item\s+(\d+[a-z]?)', text, re.IGNORECASE)
        if item_match:
            return f"Item {item_match.group(1).upper()}"
        # Handle Part patterns
        part_match = re.match(r'part\s+([ivx]+)', text, re.IGNORECASE)
        if part_match:
            return f"Part {part_match.group(1).upper()}"
        # Handle specific known sections by text
        text_lower = text.lower()
        if 'business' in text_lower and 'item' not in text_lower:
            return "Item 1"
        elif 'risk factors' in text_lower and 'item' not in text_lower:
            return "Item 1A"
        elif 'properties' in text_lower and 'item' not in text_lower:
            return "Item 2"
        elif 'legal proceedings' in text_lower and 'item' not in text_lower:
            return "Item 3"
        elif 'management' in text_lower and 'discussion' in text_lower:
            return "Item 7"
        elif 'financial statements' in text_lower:
            return "Item 8"
        elif 'exhibits' in text_lower:
            return "Item 15"
        return text  # Return as-is if no normalization applies

    def _get_section_type_and_order(self, text: str) -> Tuple[str, int]:
        """Get section type and order for sorting.

        Returns:
            (section_type, order) where order encodes items as
            item_number*1000 plus a letter offset (Item 1=1000, Item 1A=1001)
            and parts as roman_value*100 (Part I=100, Part II=200).
            Unrecognized text sorts last as ('other', 99999).
        """
        text_lower = text.lower()
        # Items
        item_match = re.search(r'item\s*(\d+)([a-z]?)', text_lower)
        if item_match:
            item_num = int(item_match.group(1))
            item_letter = item_match.group(2) or ''
            # Order: Item 1=1000, Item 1A=1001, Item 2=2000, etc.
            order = item_num * 1000 + (ord(item_letter.upper()) - ord('A') + 1 if item_letter else 0)
            return 'item', order
        # Parts
        part_match = re.search(r'part\s*([ivx]+)', text_lower)
        if part_match:
            part_roman = part_match.group(1)
            part_num = self._roman_to_int(part_roman)
            return 'part', part_num * 100  # Part I=100, Part II=200, etc.
        # Known sections without explicit item numbers
        if 'business' in text_lower:
            return 'item', 1000  # Item 1
        elif 'risk factors' in text_lower:
            return 'item', 1001  # Item 1A
        elif 'properties' in text_lower:
            return 'item', 2000  # Item 2
        elif 'legal proceedings' in text_lower:
            return 'item', 3000  # Item 3
        elif 'management' in text_lower and 'discussion' in text_lower:
            return 'item', 7000  # Item 7
        elif 'financial statements' in text_lower:
            return 'item', 8000  # Item 8
        elif 'exhibits' in text_lower:
            return 'item', 15000  # Item 15
        return 'other', 99999

    def _roman_to_int(self, roman: str) -> int:
        """Convert roman numerals to integers.

        Standard right-to-left algorithm: subtract a symbol when a larger
        one has already been seen to its right (e.g. IV -> 4).
        Unknown characters contribute 0.
        """
        roman_map = {'i': 1, 'v': 5, 'x': 10, 'l': 50, 'c': 100, 'd': 500, 'm': 1000}
        roman = roman.lower()
        result = 0
        prev = 0
        for char in reversed(roman):
            value = roman_map.get(char, 0)
            if value < prev:
                result -= value
            else:
                result += value
            prev = value
        return result

    def _build_section_mapping(self, toc_sections: List[TOCSection]) -> Dict[str, str]:
        """Build final section mapping, handling duplicates intelligently.

        For 10-Q filings with part context, generates part-aware section names
        like "part_i_item_1" and "part_ii_item_1" to distinguish sections
        with the same item number across different parts.
        """
        # Sort sections by order
        # (in-place; callers pass a list built locally, so this is safe)
        toc_sections.sort(key=lambda x: x.order)
        mapping = {}
        seen_names = set()
        for section in toc_sections:
            # Generate part-aware section name for 10-Q filings
            if section.part:
                # Convert "Part I" -> "part_i", "Part II" -> "part_ii"
                part_key = section.part.lower().replace(' ', '_')
                # Convert "Item 1" -> "item_1", "Item 1A" -> "item_1a"
                item_key = section.normalized_name.lower().replace(' ', '_')
                section_name = f"{part_key}_{item_key}"
            else:
                # 10-K filings: use normalized name as-is
                section_name = section.normalized_name
            # Skip if we already have this section (prefer first occurrence)
            if section_name in seen_names:
                continue
            mapping[section_name] = section.anchor_id
            seen_names.add(section_name)
        return mapping

    def get_section_suggestions(self, html_content: str) -> List[str]:
        """Get list of available sections that can be extracted."""
        mapping = self.analyze_toc_structure(html_content)
        return sorted(mapping.keys(), key=lambda x: self._get_section_type_and_order(x)[1])
def analyze_toc_for_sections(html_content: str) -> Dict[str, str]:
    """
    Convenience function to analyze TOC and return section mapping.

    Args:
        html_content: Raw HTML content

    Returns:
        Dict mapping section names to anchor IDs
    """
    # Delegate to a throwaway analyzer instance; it holds no useful state
    # beyond its pattern table, so there is nothing to keep around.
    return TOCAnalyzer().analyze_toc_structure(html_content)

Some files were not shown because too many files have changed in this diff Show More