Initial commit

This commit is contained in:
kdusek
2025-12-09 12:13:01 +01:00
commit 8e654ed209
13332 changed files with 2695056 additions and 0 deletions

View File

@@ -0,0 +1,51 @@
"""
Utility modules for HTML parsing.
"""
from edgar.documents.utils.cache import (
LRUCache,
WeakCache,
TimeBasedCache,
CacheManager,
get_cache_manager,
cached,
CacheStats
)
from edgar.documents.utils.streaming import (
StreamingParser
)
from edgar.documents.utils.table_matrix import (
TableMatrix,
ColumnAnalyzer,
MatrixCell
)
from edgar.documents.utils.currency_merger import (
CurrencyColumnMerger
)
# Note: CacheableMixin not exported to avoid circular imports
# Import directly: from edgar.documents.cache_mixin import CacheableMixin
from edgar.documents.utils.html_utils import (
remove_xml_declaration,
create_lxml_parser
)
# Note: table_utils not exported to avoid circular imports
# Import directly: from edgar.documents.utils.table_utils import process_table_matrix
__all__ = [
'LRUCache',
'WeakCache',
'TimeBasedCache',
'CacheManager',
'get_cache_manager',
'cached',
'CacheStats',
'StreamingParser',
'TableMatrix',
'ColumnAnalyzer',
'MatrixCell',
'CurrencyColumnMerger',
# 'CacheableMixin', # Not exported - import directly to avoid circular imports
'remove_xml_declaration',
'create_lxml_parser',
# 'process_table_matrix' # Not exported - import directly to avoid circular imports
]

View File

@@ -0,0 +1,205 @@
"""
Lightweight anchor analysis cache to avoid re-parsing HTML.
This provides a middle-ground approach that caches anchor analysis results
while minimizing memory overhead.
"""
import re
from typing import Dict, Set, Optional
from collections import Counter
import hashlib
import pickle
from pathlib import Path
class AnchorCache:
    """
    Cache for anchor link analysis results.

    Stores navigation patterns keyed by an MD5 hash of the HTML so the
    same document never has to be re-analyzed. A per-session in-memory
    dict sits in front of a persistent on-disk pickle store.
    """

    def __init__(self, cache_dir: Optional[Path] = None):
        """
        Initialize the cache.

        Args:
            cache_dir: Directory for the on-disk cache. Defaults to
                ~/.edgar_cache/anchors (created if missing).
        """
        self.cache_dir = cache_dir or Path.home() / '.edgar_cache' / 'anchors'
        self.cache_dir.mkdir(parents=True, exist_ok=True)
        self._memory_cache = {}  # In-memory cache for current session

    def _get_html_hash(self, html_content: str) -> str:
        """Return the MD5 hex digest of the HTML, used as the cache key."""
        # MD5 is fine here: the hash is a cache key, not a security boundary.
        return hashlib.md5(html_content.encode('utf-8')).hexdigest()

    def get_navigation_patterns(self, html_content: str) -> Optional[Set[str]]:
        """
        Get cached navigation patterns for HTML content.

        Args:
            html_content: HTML to analyze

        Returns:
            Set of navigation patterns or None if not cached
        """
        html_hash = self._get_html_hash(html_content)

        # Check in-memory cache first
        if html_hash in self._memory_cache:
            return self._memory_cache[html_hash]

        # Check disk cache.
        # NOTE(review): pickle.load trusts the cache directory; anyone who
        # can write ~/.edgar_cache can inject arbitrary objects.
        cache_file = self.cache_dir / f"{html_hash}.pkl"
        if cache_file.exists():
            try:
                with open(cache_file, 'rb') as f:
                    patterns = pickle.load(f)
                self._memory_cache[html_hash] = patterns
                return patterns
            except Exception:
                # FIX: was a bare `except:`, which also swallowed SystemExit
                # and KeyboardInterrupt. Treat any failure as a corrupted
                # cache file and remove it.
                cache_file.unlink(missing_ok=True)

        return None

    def cache_navigation_patterns(self, html_content: str, patterns: Set[str]) -> None:
        """
        Cache navigation patterns for HTML content.

        Args:
            html_content: HTML content
            patterns: Navigation patterns to cache
        """
        html_hash = self._get_html_hash(html_content)

        # Store in memory
        self._memory_cache[html_hash] = patterns

        # Store on disk; the cache is best-effort so write failures are
        # deliberately ignored (but no longer via a bare `except:`).
        try:
            cache_file = self.cache_dir / f"{html_hash}.pkl"
            with open(cache_file, 'wb') as f:
                pickle.dump(patterns, f)
        except (OSError, pickle.PicklingError):
            # Ignore cache write errors
            pass

    def clear_cache(self) -> None:
        """Clear in-memory entries and delete all on-disk cache files."""
        self._memory_cache.clear()
        for cache_file in self.cache_dir.glob("*.pkl"):
            cache_file.unlink(missing_ok=True)
# Global cache instance.
# NOTE(review): constructed at import time, which creates ~/.edgar_cache/anchors
# on disk as a side effect of AnchorCache.__init__; consider lazy creation if
# import-time I/O is undesirable.
_anchor_cache = AnchorCache()
def get_cached_navigation_patterns(html_content: str,
                                   force_analyze: bool = False) -> Set[str]:
    """
    Return navigation link texts for the given HTML, consulting the
    module-level anchor cache before doing any analysis.

    Args:
        html_content: HTML to analyze
        force_analyze: When True, skip the cache lookup and re-analyze.

    Returns:
        Set of navigation link texts to filter
    """
    if not force_analyze:
        hit = _anchor_cache.get_navigation_patterns(html_content)
        if hit is not None:
            return hit

    # Cache miss (or forced re-analysis): run the lightweight regex scan
    # and remember the result for subsequent calls.
    result = _analyze_navigation_minimal(html_content)
    _anchor_cache.cache_navigation_patterns(html_content, result)
    return result
def _analyze_navigation_minimal(html_content: str, min_frequency: int = 5) -> Set[str]:
"""
Minimal navigation analysis using regex instead of full HTML parsing.
This avoids BeautifulSoup overhead by using regex to find anchor patterns.
"""
patterns = set()
# Find all anchor links with regex (faster than BeautifulSoup)
anchor_pattern = re.compile(r'<a[^>]*href\s*=\s*["\']#([^"\']*)["\'][^>]*>(.*?)</a>',
re.IGNORECASE | re.DOTALL)
link_counts = Counter()
for match in anchor_pattern.finditer(html_content):
anchor_id = match.group(1).strip()
link_text = re.sub(r'<[^>]+>', '', match.group(2)).strip() # Remove HTML tags
link_text = ' '.join(link_text.split()) # Normalize whitespace
if link_text and len(link_text) < 100: # Reasonable link text length
link_counts[link_text] += 1
# Add frequently occurring links
for text, count in link_counts.items():
if count >= min_frequency:
patterns.add(text)
return patterns
def filter_with_cached_patterns(text: str, html_content: Optional[str] = None) -> str:
    """
    Filter text using cached navigation patterns.

    Preserves the first occurrences of each pattern (document structure
    headers) while dropping later repetitions (navigation links).

    Args:
        text: Text to filter
        html_content: HTML for pattern analysis. When omitted, a small set
            of common SEC navigation phrases is used instead.
            (FIX: annotation was `str = None`; it is genuinely optional.)

    Returns:
        Filtered text
    """
    if not text:
        return text

    # Get patterns (cached or analyze)
    if html_content:
        patterns = get_cached_navigation_patterns(html_content)
    else:
        # Fallback to common SEC patterns
        patterns = {
            'Table of Contents',
            'Index to Financial Statements',
            'Index to Exhibits'
        }

    if not patterns:
        return text

    # Smart filtering: allow the first few occurrences of each pattern
    # (likely document structure headers), suppress the rest (nav links).
    max_allowed_per_pattern = 2
    pattern_counts: Dict[str, int] = {}  # occurrences kept so far, per pattern
    filtered_lines = []

    for line in text.split('\n'):
        stripped_line = line.strip()
        if stripped_line in patterns:
            count = pattern_counts.get(stripped_line, 0)
            if count < max_allowed_per_pattern:
                # Keep this occurrence (likely a document structure header)
                filtered_lines.append(line)
                pattern_counts[stripped_line] = count + 1
            # else: drop this line (a repeated navigation link)
        else:
            # Not a navigation pattern, always keep
            filtered_lines.append(line)

    return '\n'.join(filtered_lines)

View File

@@ -0,0 +1,426 @@
"""
Cache utilities for performance optimization.
"""
import weakref
from collections import OrderedDict
from typing import Any, Dict, Optional, Callable, TypeVar, Generic
from functools import wraps
import time
import threading
from dataclasses import dataclass, field
from datetime import datetime, timedelta
T = TypeVar('T')
@dataclass
class CacheStats:
    """Runtime counters for one cache: hits, misses, evictions, timing."""
    hits: int = 0
    misses: int = 0
    evictions: int = 0
    total_time: float = 0.0
    last_reset: datetime = field(default_factory=datetime.now)

    @property
    def hit_rate(self) -> float:
        """Fraction of lookups served from cache (0.0 before any lookup)."""
        lookups = self.hits + self.misses
        if lookups == 0:
            return 0.0
        return self.hits / lookups

    @property
    def avg_access_time(self) -> float:
        """Mean wall-clock time per lookup (0.0 before any lookup)."""
        lookups = self.hits + self.misses
        if lookups == 0:
            return 0.0
        return self.total_time / lookups

    def reset(self):
        """Zero every counter and stamp the reset time."""
        self.hits = self.misses = self.evictions = 0
        self.total_time = 0.0
        self.last_reset = datetime.now()
class LRUCache(Generic[T]):
    """
    Thread-safe least-recently-used cache.

    Used for caching expensive operations such as style parsing and
    header detection results. All operations take an RLock, and every
    lookup updates the attached CacheStats.
    """

    def __init__(self, max_size: int = 1000):
        """
        Args:
            max_size: Maximum number of items held before eviction.
        """
        self.max_size = max_size
        self._cache: OrderedDict[str, T] = OrderedDict()
        self._lock = threading.RLock()
        self.stats = CacheStats()

    def get(self, key: str) -> Optional[T]:
        """
        Return the cached value for key, or None on a miss.

        A hit also promotes the entry to most-recently-used.
        """
        started = time.time()
        with self._lock:
            try:
                value = self._cache[key]
            except KeyError:
                self.stats.misses += 1
                self.stats.total_time += time.time() - started
                return None
            # Promote to most-recently-used position
            self._cache.move_to_end(key)
            self.stats.hits += 1
            self.stats.total_time += time.time() - started
            return value

    def put(self, key: str, value: T) -> None:
        """
        Insert or update an entry, evicting the least-recently-used
        entry when the cache grows past max_size.
        """
        with self._lock:
            already_present = key in self._cache
            self._cache[key] = value
            if already_present:
                self._cache.move_to_end(key)
            else:
                while len(self._cache) > self.max_size:
                    # Oldest entry sits at the front of the OrderedDict
                    self._cache.popitem(last=False)
                    self.stats.evictions += 1

    def clear(self) -> None:
        """Drop every cached entry."""
        with self._lock:
            self._cache.clear()

    def size(self) -> int:
        """Return the current number of cached entries."""
        with self._lock:
            return len(self._cache)
class WeakCache:
    """
    Weak-reference cache for parsed nodes.

    Entries never keep their values alive: once the rest of the program
    drops an object, its entry here goes dead and is pruned lazily (on
    lookup) or in bulk via cleanup().
    """

    def __init__(self):
        """Create an empty weak cache."""
        self._cache: Dict[str, weakref.ref] = {}
        self._lock = threading.RLock()
        self.stats = CacheStats()

    def get(self, key: str) -> Optional[Any]:
        """
        Look up a live object by key.

        Args:
            key: Cache key

        Returns:
            The cached object, or None if absent or already collected.
        """
        started = time.time()
        with self._lock:
            ref = self._cache.get(key)
            target = ref() if ref is not None else None
            if target is not None:
                self.stats.hits += 1
                self.stats.total_time += time.time() - started
                return target
            if ref is not None:
                # Referent was garbage collected; drop the stale entry.
                del self._cache[key]
            self.stats.misses += 1
            self.stats.total_time += time.time() - started
            return None

    def put(self, key: str, value: Any) -> None:
        """Store a weak reference to value under key (value must be weakref-able)."""
        with self._lock:
            self._cache[key] = weakref.ref(value)

    def clear(self) -> None:
        """Drop every stored reference."""
        with self._lock:
            self._cache.clear()

    def cleanup(self) -> int:
        """
        Prune entries whose referents have been collected.

        Returns:
            Number of entries removed.
        """
        with self._lock:
            stale = [k for k, ref in self._cache.items() if ref() is None]
            for k in stale:
                del self._cache[k]
            return len(stale)
class TimeBasedCache(Generic[T]):
    """
    Cache whose entries expire after a fixed time-to-live.

    Expired entries are evicted lazily on lookup, or in bulk via cleanup().
    """

    def __init__(self, ttl_seconds: int = 3600):
        """
        Args:
            ttl_seconds: Entry lifetime in seconds.
        """
        self.ttl = timedelta(seconds=ttl_seconds)
        self._cache: Dict[str, tuple[T, datetime]] = {}
        self._lock = threading.RLock()
        self.stats = CacheStats()

    def get(self, key: str) -> Optional[T]:
        """
        Return the cached value if present and younger than the TTL.

        Args:
            key: Cache key

        Returns:
            Cached value, or None when missing or expired.
        """
        started = time.time()
        with self._lock:
            entry = self._cache.get(key)
            if entry is not None:
                value, stored_at = entry
                if datetime.now() - stored_at < self.ttl:
                    self.stats.hits += 1
                    self.stats.total_time += time.time() - started
                    return value
                # Past its TTL: evict and fall through to a miss.
                del self._cache[key]
                self.stats.evictions += 1
            self.stats.misses += 1
            self.stats.total_time += time.time() - started
            return None

    def put(self, key: str, value: T) -> None:
        """Store value under key, stamped with the current time."""
        with self._lock:
            self._cache[key] = (value, datetime.now())

    def clear(self) -> None:
        """Remove every entry."""
        with self._lock:
            self._cache.clear()

    def cleanup(self) -> int:
        """
        Evict all expired entries in one pass.

        Returns:
            Number of entries removed.
        """
        with self._lock:
            now = datetime.now()
            expired = [
                k for k, (_, stored_at) in self._cache.items()
                if now - stored_at >= self.ttl
            ]
            for k in expired:
                del self._cache[k]
                self.stats.evictions += 1
            return len(expired)
def cached(cache: LRUCache, key_func: Optional[Callable] = None):
    """
    Decorator that memoizes a function's results in the given cache.

    Args:
        cache: Cache instance to use (anything with get/put).
        key_func: Optional callable mapping the call's arguments to a
            cache key; defaults to a string built from the function name
            and repr of the arguments.

    Returns:
        Decorated function.

    NOTE(review): cache.get returning None is treated as a miss, so a
    function that legitimately returns None is recomputed on every call.
    """
    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            # Build the cache key
            if key_func is not None:
                cache_key = key_func(*args, **kwargs)
            else:
                cache_key = f"{func.__name__}:{str(args)}:{str(sorted(kwargs.items()))}"

            # Serve from cache when possible
            hit = cache.get(cache_key)
            if hit is not None:
                return hit

            # Compute, remember, return
            value = func(*args, **kwargs)
            cache.put(cache_key, value)
            return value
        return wrapper
    return decorator
class CacheManager:
    """
    Owns every cache used by the parser and provides centralized
    statistics, clearing, and cleanup across all of them.
    """

    def __init__(self):
        """Create all parser caches and register them by name."""
        self.style_cache = LRUCache[dict](max_size=5000)     # parsed style strings
        self.header_cache = LRUCache[bool](max_size=2000)    # header detection results
        self.pattern_cache = LRUCache[bool](max_size=10000)  # pattern-match results
        self.node_cache = WeakCache()                        # parsed node references
        self.regex_cache = LRUCache[Any](max_size=500)       # compiled regexes

        # Registry used by the management helpers below.
        self._caches = {
            'style': self.style_cache,
            'header': self.header_cache,
            'pattern': self.pattern_cache,
            'node': self.node_cache,
            'regex': self.regex_cache
        }

    def get_stats(self) -> Dict[str, CacheStats]:
        """Return per-cache statistics for every cache exposing `stats`."""
        stats = {}
        for name, cache in self._caches.items():
            if hasattr(cache, 'stats'):
                stats[name] = cache.stats
        return stats

    def reset_stats(self) -> None:
        """Zero the statistics of every cache that tracks them."""
        for cache in self._caches.values():
            if hasattr(cache, 'stats'):
                cache.stats.reset()

    def clear_all(self) -> None:
        """Empty every managed cache."""
        for cache in self._caches.values():
            cache.clear()

    def cleanup(self) -> Dict[str, int]:
        """
        Drop dead/expired entries where a cache supports it.

        Returns:
            Mapping of cache name to number of entries removed.
        """
        removed = {}
        # Only the weak node cache supports cleanup today.
        if hasattr(self.node_cache, 'cleanup'):
            removed['node'] = self.node_cache.cleanup()
        return removed

    def get_memory_usage(self) -> Dict[str, int]:
        """
        Roughly estimate per-cache memory usage in bytes.

        Keys and sizeable values are measured with sys.getsizeof; values
        without __sizeof__ fall back to a flat 1000-byte guess.
        """
        import sys

        usage = {}
        for name, cache in self._caches.items():
            if not hasattr(cache, '_cache'):
                continue
            total = 0
            if isinstance(cache._cache, dict):
                for key, value in cache._cache.items():
                    total += sys.getsizeof(key)
                    if hasattr(value, '__sizeof__'):
                        total += sys.getsizeof(value)
                    else:
                        total += 1000  # Default estimate
            usage[name] = total
        return usage
# Global cache manager instance, lazily created by get_cache_manager().
_cache_manager = None
def get_cache_manager() -> CacheManager:
    """Get the global cache manager instance.

    Creates the singleton on first call. There is no locking around the
    check-then-create, so concurrent first calls could briefly construct
    two managers; only one ends up stored.
    """
    global _cache_manager
    if _cache_manager is None:
        _cache_manager = CacheManager()
    return _cache_manager

View File

@@ -0,0 +1,277 @@
"""
Currency column merger for handling separated currency symbols in SEC filings.
"""
import re
from typing import List, Tuple
from edgar.documents.table_nodes import Cell
from edgar.documents.utils.table_matrix import TableMatrix, MatrixCell
class CurrencyColumnMerger:
    """
    Detects and merges currency symbol columns with their value columns.

    SEC filings often split currency values into two cells:
    - Cell 1: "$" (left-aligned)
    - Cell 2: "224.11" (right-aligned)

    This class detects that pattern and merges the pair into "$224.11".

    Typical usage:
        merger = CurrencyColumnMerger(matrix)
        merged = merger.apply_merges()
    """

    # Common currency symbols.
    # FIX: the euro and rupee symbols had been corrupted into empty strings
    # by an encoding mishap, which made '' pass `text in CURRENCY_SYMBOLS`
    # checks in _verify_pairing and _merge_cell_content.
    CURRENCY_SYMBOLS = {'$', '€', '£', '¥', '₹', 'Rs', 'USD', 'EUR', 'GBP'}

    # Pattern for numeric values (with commas, decimals)
    NUMERIC_PATTERN = re.compile(r'^[\d,]+\.?\d*$')

    def __init__(self, matrix: TableMatrix):
        """Initialize with the table matrix whose columns will be analyzed."""
        self.matrix = matrix
        # (symbol_col, value_col) pairs found by detect_currency_pairs()
        self.merge_pairs: List[Tuple[int, int]] = []

    def detect_currency_pairs(self) -> List[Tuple[int, int]]:
        """
        Detect column pairs that should be merged (currency symbol + value).

        Returns:
            List of (symbol_col, value_col) pairs to merge
        """
        pairs = []

        for col_idx in range(self.matrix.col_count - 1):
            if self._is_currency_column(col_idx):
                next_col = col_idx + 1
                if self._is_numeric_column(next_col):
                    # Only merge when the two columns are consistently paired
                    if self._verify_pairing(col_idx, next_col):
                        pairs.append((col_idx, next_col))

        self.merge_pairs = pairs
        return pairs

    def _is_currency_column(self, col_idx: int) -> bool:
        """
        Check whether a column contains (almost) only currency symbols.

        A currency column typically:
        - Contains only currency symbols or empty cells
        - Has very narrow width (1-3 characters)
        - Is left-aligned (though we check content, not style)
        """
        currency_count = 0
        empty_count = 0
        other_count = 0

        for row_idx in range(self.matrix.row_count):
            cell = self.matrix.matrix[row_idx][col_idx]
            if cell.original_cell and not cell.is_spanned:
                text = cell.original_cell.text().strip()

                # Skip header rows (first 2 rows typically) unless the
                # header cell itself is a currency symbol
                if row_idx < 2 and text and text not in self.CURRENCY_SYMBOLS:
                    continue

                if not text:
                    empty_count += 1
                elif text in self.CURRENCY_SYMBOLS:
                    currency_count += 1
                else:
                    other_count += 1

        # Column should be mostly currency symbols with some empty cells;
        # header rows were excluded above.
        total_non_empty = currency_count + other_count
        if total_non_empty == 0:
            return False

        # Accept when every non-empty, non-header cell is a symbol, or when
        # at least 60% are (with a minimum of two symbols).
        return (currency_count >= 1 and other_count == 0) or \
               (currency_count >= 2 and currency_count / total_non_empty >= 0.6)

    def _is_numeric_column(self, col_idx: int) -> bool:
        """
        Check whether a column (excluding the first two header rows)
        contains mostly numeric values.
        """
        numeric_count = 0
        non_empty_count = 0

        for row_idx in range(self.matrix.row_count):
            cell = self.matrix.matrix[row_idx][col_idx]
            if cell.original_cell and not cell.is_spanned:
                # Skip header rows
                if row_idx < 2:
                    continue

                text = cell.original_cell.text().strip()
                if text:
                    non_empty_count += 1
                    # Strip thousands separators and accounting decoration,
                    # then test against the numeric pattern
                    clean_text = text.replace(',', '').replace('%', '').replace('(', '').replace(')', '')
                    if self.NUMERIC_PATTERN.match(clean_text):
                        numeric_count += 1

        if non_empty_count == 0:
            return False

        # At least 60% should be numeric
        return numeric_count / non_empty_count >= 0.6

    def _verify_pairing(self, symbol_col: int, value_col: int) -> bool:
        """
        Verify that symbol and value columns are consistently paired:
        when a symbol appears, a value should appear next to it
        (header rows excepted).
        """
        paired_rows = 0
        mismatched_rows = 0

        for row_idx in range(self.matrix.row_count):
            symbol_cell = self.matrix.matrix[row_idx][symbol_col]
            value_cell = self.matrix.matrix[row_idx][value_col]

            if symbol_cell.original_cell and value_cell.original_cell:
                symbol_text = symbol_cell.original_cell.text().strip()
                value_text = value_cell.original_cell.text().strip()

                if symbol_text in self.CURRENCY_SYMBOLS and value_text:
                    # Symbol and value together: a proper pair
                    paired_rows += 1
                elif symbol_text in self.CURRENCY_SYMBOLS and not value_text:
                    # Symbol without value - tolerated in header rows only
                    if row_idx >= 2:
                        mismatched_rows += 1
                # Both empty, or a value without a symbol (continuation
                # rows), are acceptable and counted as neither.

        # Should have more paired than mismatched
        return paired_rows > mismatched_rows

    def apply_merges(self) -> 'TableMatrix':
        """
        Create a new matrix with currency columns merged.

        Returns:
            New TableMatrix with merged columns, or the original matrix
            unchanged when no merges are needed.
        """
        if not self.merge_pairs:
            self.detect_currency_pairs()

        if not self.merge_pairs:
            # No merges needed
            return self.matrix

        # Each merge removes its (leading) symbol column
        new_col_count = self.matrix.col_count - len(self.merge_pairs)

        # Columns being merged away / receiving merged content
        merged_cols = set(pair[0] for pair in self.merge_pairs)  # symbol columns to drop
        value_cols = set(pair[1] for pair in self.merge_pairs)   # value columns receiving merges

        # Map each surviving old column index to its new position
        old_to_new = {}
        new_col = 0
        for old_col in range(self.matrix.col_count):
            if old_col in merged_cols:
                # This column will be merged into the next one; skip it
                continue
            old_to_new[old_col] = new_col
            new_col += 1

        # Create the new matrix shell
        new_matrix = TableMatrix()
        new_matrix.row_count = self.matrix.row_count
        new_matrix.col_count = new_col_count
        new_matrix.matrix = []

        # Build new matrix with merged cells
        for row_idx in range(self.matrix.row_count):
            new_row = [MatrixCell() for _ in range(new_col_count)]

            for old_col in range(self.matrix.col_count):
                # Is this a symbol column that merges into its neighbor?
                merge_pair = next((pair for pair in self.merge_pairs if pair[0] == old_col), None)

                if merge_pair:
                    # Merge symbol with value
                    symbol_col, value_col = merge_pair
                    symbol_cell = self.matrix.matrix[row_idx][symbol_col]
                    value_cell = self.matrix.matrix[row_idx][value_col]

                    if value_cell.original_cell:
                        # Produce merged content like "$224.11"
                        new_cell_content = self._merge_cell_content(symbol_cell, value_cell)
                        if new_cell_content:
                            # Carry span/header/align metadata from the value cell
                            merged_cell = Cell(
                                content=new_cell_content,
                                colspan=value_cell.original_cell.colspan,
                                rowspan=value_cell.original_cell.rowspan,
                                is_header=value_cell.original_cell.is_header,
                                align=value_cell.original_cell.align
                            )

                            new_col_idx = old_to_new.get(value_col)
                            if new_col_idx is not None:
                                new_row[new_col_idx] = MatrixCell(
                                    original_cell=merged_cell,
                                    is_spanned=False,
                                    row_origin=row_idx,
                                    col_origin=new_col_idx
                                )
                elif old_col not in value_cols:
                    # Regular column, not involved in merging: copy through
                    new_col_idx = old_to_new.get(old_col)
                    if new_col_idx is not None:
                        new_row[new_col_idx] = self.matrix.matrix[row_idx][old_col]

            new_matrix.matrix.append(new_row)

        return new_matrix

    def _merge_cell_content(self, symbol_cell: MatrixCell, value_cell: MatrixCell) -> str:
        """
        Merge symbol and value cell contents.

        Returns:
            Merged content like "$224.11"; the bare value when no symbol
            is recognized; or the bare symbol when there is no value.
        """
        value_text = value_cell.original_cell.text().strip() if value_cell.original_cell else ""
        symbol_text = symbol_cell.original_cell.text().strip() if symbol_cell.original_cell else ""

        if not value_text:
            return symbol_text  # Just return symbol if no value

        if symbol_text in self.CURRENCY_SYMBOLS:
            # Prefix the symbol directly, with no separator
            return f"{symbol_text}{value_text}"

        # No recognized symbol: just return the value
        return value_text

    def get_merge_summary(self) -> str:
        """Get a human-readable summary of the merges to be applied."""
        if not self.merge_pairs:
            return "No currency column merges detected"

        summary = f"Currency merges detected: {len(self.merge_pairs)} pairs\n"
        for symbol_col, value_col in self.merge_pairs:
            summary += f"  • Column {symbol_col} ($) + Column {value_col} (value)\n"
        return summary

View File

@@ -0,0 +1,96 @@
"""
HTML utility functions for document parsing.
This module consolidates common HTML processing utilities used across
the parser, preprocessor, and simple parser implementations.
"""
import lxml.html
from typing import Optional
def remove_xml_declaration(html: str) -> str:
    """
    Remove XML declaration from HTML if present.

    SEC HTML documents sometimes include XML declarations like:
        <?xml version="1.0" encoding="UTF-8"?>

    These can interfere with HTML parsing and are safely removed since
    the encoding is handled separately by the parser.

    Args:
        html: HTML string that may contain XML declaration

    Returns:
        HTML string with XML declaration removed (if present). Input with
        an unterminated declaration is returned unchanged.

    Examples:
        >>> remove_xml_declaration('<?xml version="1.0"?><!DOCTYPE html><html></html>')
        '<!DOCTYPE html><html></html>'
        >>> remove_xml_declaration('<!DOCTYPE html><html></html>')
        '<!DOCTYPE html><html></html>'
    """
    if html.strip().startswith('<?xml'):
        terminator = html.find('?>')
        # BUG FIX: previously a missing '?>' made find() return -1, and the
        # unconditional `html[-1 + 2:]` silently chopped off the first
        # character of the document. Now an unterminated declaration leaves
        # the input untouched.
        if terminator != -1:
            return html[terminator + 2:]
    return html
def create_lxml_parser(
    remove_blank_text: bool = True,
    remove_comments: bool = True,
    recover: bool = True,
    encoding: Optional[str] = 'utf-8'
) -> lxml.html.HTMLParser:
    """
    Build an lxml HTMLParser with the project's standard settings.

    This factory keeps parser configuration consistent across the parser,
    preprocessor, and simple-parser implementations.

    Args:
        remove_blank_text: Drop whitespace-only text nodes for a cleaner tree.
        remove_comments: Strip HTML comments from the parsed tree.
        recover: Keep parsing through malformed HTML.
        encoding: Character encoding passed to the parser. Pass None to let
            lxml auto-detect the encoding.

    Returns:
        Configured lxml.html.HTMLParser instance.

    Examples:
        >>> parser = create_lxml_parser()                  # standard settings
        >>> parser = create_lxml_parser(remove_blank_text=False,
        ...                             remove_comments=False)  # preserve all
        >>> parser = create_lxml_parser(encoding=None)     # auto-detect

    Note:
        recover=True is critical for SEC documents, which often contain
        non-standard HTML structures.
    """
    options = {
        'remove_blank_text': remove_blank_text,
        'remove_comments': remove_comments,
        'recover': recover,
    }
    # Omitting the key entirely leaves encoding handling to lxml
    if encoding is not None:
        options['encoding'] = encoding
    return lxml.html.HTMLParser(**options)

View File

@@ -0,0 +1,375 @@
"""
Streaming parser for large HTML documents.
"""
import io
from typing import Dict, Any, TYPE_CHECKING
from lxml import etree
from lxml.html import HtmlElement
from edgar.documents.config import ParserConfig
from edgar.documents.exceptions import HTMLParsingError, DocumentTooLargeError
# Use TYPE_CHECKING to avoid circular imports
if TYPE_CHECKING:
from edgar.documents.document import Document, DocumentMetadata
from edgar.documents.nodes import DocumentNode, HeadingNode, ParagraphNode, TextNode, SectionNode, ContainerNode
from edgar.documents.table_nodes import TableNode
from edgar.documents.types import SemanticType
class StreamingParser:
    """
    Streaming parser for large HTML documents.

    Processes documents in chunks to minimize memory usage
    while maintaining parse quality.
    """

    # Chunk size for streaming (1MB)
    CHUNK_SIZE = 1024 * 1024

    # Maximum number of buffered nodes before they are flushed into the
    # document tree (see _flush_buffer)
    MAX_NODE_BUFFER = 1000
def __init__(self, config: ParserConfig, strategies: Dict[str, Any]):
    """
    Initialize streaming parser.

    Args:
        config: Parser configuration
        strategies: Parsing strategies to use. Keys consulted elsewhere in
            this class: 'header_detection', 'style_parser',
            'table_processing'.
    """
    self.config = config
    self.strategies = strategies
    self._reset_state()
def _reset_state(self):
    """Reset all per-document parser state before a new parse."""
    # Import here to avoid circular import
    from edgar.documents.document import DocumentMetadata
    from edgar.documents.nodes import DocumentNode
    self.current_section = None      # SectionNode currently open, if any
    self.node_buffer = []            # parsed nodes awaiting _flush_buffer
    self.metadata = DocumentMetadata()
    self.root = DocumentNode()
    self.current_parent = self.root  # node new children attach to
    self.tag_stack = []              # names of currently open tags, innermost last
    self.text_buffer = []            # stray text fragments collected on end tags
    self.in_table = False            # True between <table> start and end events
    self.table_buffer = []
    self.bytes_processed = 0         # running serialized-size total for the size limit
def parse(self, html: str) -> "Document":
    """
    Parse HTML in streaming mode.

    Args:
        html: HTML content to parse

    Returns:
        Parsed Document

    Raises:
        DocumentTooLargeError: If document exceeds size limit
        HTMLParsingError: If parsing fails
    """
    self._reset_state()

    # Store original HTML BEFORE parsing (needed for TOC-based section detection)
    original_html = html

    try:
        # Create streaming parser over an in-memory byte stream
        parser = etree.iterparse(
            io.BytesIO(html.encode('utf-8')),
            events=('start', 'end'),
            html=True,
            recover=True,
            encoding='utf-8'
        )

        # Process events
        for event, elem in parser:
            self._process_event(event, elem)

            # Check size limit.
            # NOTE(review): this serializes each element (with its subtree)
            # on BOTH its start and end events, so bytes_processed
            # overcounts the real document size — confirm intent.
            self.bytes_processed += len(etree.tostring(elem, encoding='unicode', method='html'))
            if self.bytes_processed > self.config.max_document_size:
                raise DocumentTooLargeError(self.bytes_processed, self.config.max_document_size)

            # Flush buffer if needed
            if len(self.node_buffer) >= self.MAX_NODE_BUFFER:
                self._flush_buffer()

            # Clean up processed elements to save memory: clear this
            # element, then drop already-processed preceding siblings
            elem.clear()
            while elem.getprevious() is not None:
                parent = elem.getparent()
                if parent is not None:
                    del parent[0]
                else:
                    break

        # Final flush
        self._flush_buffer()

        # Store original HTML in metadata for section detection (TOC analysis)
        self.metadata.original_html = original_html

        # Create document (import here to avoid circular import)
        from edgar.documents.document import Document
        document = Document(root=self.root, metadata=self.metadata)

        # Store config reference (required for section detection)
        document._config = self.config

        # Apply post-processing
        from edgar.documents.processors.postprocessor import DocumentPostprocessor
        postprocessor = DocumentPostprocessor(self.config)
        document = postprocessor.process(document)

        return document

    except etree.ParseError as e:
        # NOTE(review): verify lxml's etree actually exposes ParseError
        # here — lxml syntax failures are typically etree.XMLSyntaxError.
        raise HTMLParsingError(f"Streaming parse failed: {str(e)}")
    except Exception as e:
        if isinstance(e, (DocumentTooLargeError, HTMLParsingError)):
            raise
        raise HTMLParsingError(f"Unexpected error during streaming parse: {str(e)}")
def _process_event(self, event: str, elem: HtmlElement):
    """Dispatch a single iterparse event to the matching tag handler."""
    dispatch = {
        'start': self._handle_start_tag,
        'end': self._handle_end_tag,
    }
    handler = dispatch.get(event)
    if handler is not None:
        handler(elem)
def _handle_start_tag(self, elem: HtmlElement):
    """Handle opening tag: track nesting, harvest metadata, open nodes."""
    # Import node types at runtime to avoid circular imports
    from edgar.documents.nodes import ContainerNode

    # NOTE(review): lxml comment/PI nodes have a non-string .tag; confirm
    # they never reach this handler with html=True/recover=True.
    tag = elem.tag.lower()

    # Track tag stack
    self.tag_stack.append(tag)

    # Extract metadata from early elements
    if tag == 'title' and elem.text:
        self._extract_title_metadata(elem.text)
    elif tag == 'meta':
        self._extract_meta_metadata(elem)

    # Handle specific tags
    if tag == 'body':
        # Create a container for body content; subsequent nodes attach here
        body_container = ContainerNode(tag_name='body')
        self.root.add_child(body_container)
        self.current_parent = body_container
    elif tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
        self._start_heading(elem)
    elif tag == 'p':
        self._start_paragraph(elem)
    elif tag == 'table':
        self._start_table(elem)
    elif tag == 'section':
        self._start_section(elem)
def _handle_end_tag(self, elem: HtmlElement):
    """Handle closing tag: close open constructs and collect stray text."""
    tag = elem.tag.lower()

    # Remove from tag stack (only when it matches the innermost open tag)
    if self.tag_stack and self.tag_stack[-1] == tag:
        self.tag_stack.pop()

    # Handle specific tags
    if tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
        self._end_heading(elem)
    elif tag == 'p':
        self._end_paragraph(elem)
    elif tag == 'table':
        self._end_table(elem)
    elif tag == 'section':
        self._end_section(elem)
    elif tag == 'body':
        # When body ends, flush any remaining nodes
        self._flush_buffer()

    # Handle text content.
    # NOTE(review): this runs AFTER the handlers above (which clear
    # text_buffer), so heading/paragraph text is re-buffered here, and the
    # buffer is never read in this file — confirm it is consumed elsewhere.
    if elem.text:
        self.text_buffer.append(elem.text.strip())
    if elem.tail:
        self.text_buffer.append(elem.tail.strip())
def _start_heading(self, elem: HtmlElement):
    """Open a HeadingNode for an <h1>-<h6> element and buffer it."""
    # Import at runtime to avoid circular imports.
    # FIX: SemanticType was imported only under TYPE_CHECKING at module
    # level, so the runtime reference below raised NameError; import it
    # here alongside HeadingNode.
    from edgar.documents.nodes import HeadingNode
    from edgar.documents.types import SemanticType

    level = int(elem.tag[1])  # 'h3' -> 3
    text = self._get_text_content(elem)

    # Create heading node
    heading = HeadingNode(
        level=level,
        content=text
    )

    # Check if this is a section header
    if self.strategies.get('header_detection'):
        detector = self.strategies['header_detection']
        if detector.is_section_header(text, elem):
            heading.semantic_type = SemanticType.SECTION_HEADER

    self.node_buffer.append(heading)
def _end_heading(self, elem: HtmlElement):
    """Finalize the buffered heading with the element's complete text."""
    from edgar.documents.nodes import HeadingNode  # runtime import: avoids circular import

    content = self._get_text_content(elem)
    buffer_tail = self.node_buffer[-1] if self.node_buffer else None
    if content and isinstance(buffer_tail, HeadingNode):
        buffer_tail.content = content

    # Discard stray text accumulated while inside the heading
    self.text_buffer.clear()
def _start_paragraph(self, elem: HtmlElement):
    """Open a ParagraphNode, parsing its inline style when a parser is configured."""
    from edgar.documents.nodes import ParagraphNode  # runtime import: avoids circular import

    node = ParagraphNode()

    # Attach parsed inline style, if any
    style_attr = elem.get('style')
    style_parser = self.strategies.get('style_parser')
    if style_attr and style_parser:
        node.style = style_parser.parse(style_attr)

    self.node_buffer.append(node)
def _end_paragraph(self, elem: HtmlElement):
    """Finish a paragraph by attaching its text as a child TextNode."""
    # Runtime import keeps module-level imports cycle-free.
    from edgar.documents.nodes import ParagraphNode, TextNode

    final_text = self._get_text_content(elem)
    buffer_tail = self.node_buffer[-1] if self.node_buffer else None
    if final_text and isinstance(buffer_tail, ParagraphNode):
        buffer_tail.add_child(TextNode(content=final_text))
    # Loose text collected while inside the paragraph is now stale.
    self.text_buffer.clear()
def _start_table(self, elem: HtmlElement):
    """Start processing a table."""
    # Enter table mode so nested content is routed to the table buffer.
    self.in_table = True
    self.table_buffer = []
    # Store table element for later processing
    self.table_elem = elem
def _end_table(self, elem: HtmlElement):
    """End processing a table."""
    # Import node types at runtime to avoid circular imports
    from edgar.documents.table_nodes import TableNode
    self.in_table = False
    # Process table with table processor if available
    if self.strategies.get('table_processing'):
        processor = self.strategies['table_processing']
        table_node = processor.process(elem)
        # NOTE(review): if the processor returns a falsy result the table is
        # dropped entirely (no fallback TableNode) -- confirm this is intended.
        if table_node:
            self.node_buffer.append(table_node)
    else:
        # Basic table node
        table = TableNode()
        self.node_buffer.append(table)
    self.table_buffer.clear()
def _start_section(self, elem: HtmlElement):
    """Begin a section by buffering a SectionNode and making it current."""
    # Runtime import keeps module-level imports cycle-free.
    from edgar.documents.nodes import SectionNode

    node = SectionNode()
    # Carry over identifying attributes so later stages can use them.
    for attr in ('id', 'class'):
        value = elem.get(attr)
        if value:
            node.metadata[attr] = value

    # Nodes flushed from now on nest under this section.
    self.current_section = node
    self.node_buffer.append(node)
def _end_section(self, elem: HtmlElement):
    """End processing a section."""
    # Subsequent flushed nodes attach to the top-level parent again.
    self.current_section = None
def _flush_buffer(self):
"""Flush node buffer to document tree."""
for node in self.node_buffer:
# Add to current parent
if self.current_section:
self.current_section.add_child(node)
else:
self.current_parent.add_child(node)
self.node_buffer.clear()
def _get_text_content(self, elem: HtmlElement) -> str:
    """Recursively collect an element's text content, space-joined."""
    pieces = []
    if elem.text:
        pieces.append(elem.text.strip())
    for child in elem:
        # Depth-first: the child's own text first, then its tail text.
        nested = self._get_text_content(child)
        if nested:
            pieces.append(nested)
        if child.tail:
            pieces.append(child.tail.strip())
    return ' '.join(pieces)
def _extract_title_metadata(self, title: str):
"""Extract metadata from title."""
# Example: "APPLE INC - 10-K - 2023-09-30"
parts = title.split(' - ')
if len(parts) >= 2:
self.metadata.company = parts[0].strip()
self.metadata.form = parts[1].strip()
if len(parts) >= 3:
self.metadata.filing_date = parts[2].strip()
def _extract_meta_metadata(self, elem: HtmlElement):
    """Copy recognized meta tag name/content pairs into metadata fields."""
    # Maps lower-cased meta-tag names onto metadata attribute names.
    field_for_name = {
        'company': 'company',
        'filing-type': 'form',
        'cik': 'cik',
        'filing-date': 'filing_date',
        'accession-number': 'accession_number',
    }
    name = elem.get('name', '').lower()
    content = elem.get('content', '')
    target = field_for_name.get(name)
    # Unrecognized names (or missing name/content) are ignored.
    if name and content and target:
        setattr(self.metadata, target, content)

View File

@@ -0,0 +1,858 @@
"""
Table matrix builder for handling complex colspan/rowspan structures.
"""
from dataclasses import dataclass
from typing import List, Optional
from edgar.documents.table_nodes import Cell, Row
@dataclass
class MatrixCell:
    """Cell in the matrix with reference to original cell"""
    # The source Cell this grid position maps back to; None for an empty slot.
    original_cell: Optional[Cell] = None
    is_spanned: bool = False  # True if this is part of a colspan/rowspan
    row_origin: int = -1  # Original row index
    col_origin: int = -1  # Original column index
class TableMatrix:
    """
    Build a 2D matrix representation of table with proper handling of merged cells.

    This class converts a table with colspan/rowspan into a regular 2D grid
    where each merged cell occupies multiple positions in the matrix.
    """

    def __init__(self):
        """Initialize empty matrix"""
        # Grid of MatrixCell entries, indexed as matrix[row][column].
        self.matrix: List[List[MatrixCell]] = []
        self.row_count = 0
        self.col_count = 0
        self.header_row_count = 0  # Track number of header rows
def build_from_rows(self, header_rows: List[List[Cell]], data_rows: List[Row]) -> 'TableMatrix':
    """
    Build matrix from header rows and data rows.

    Args:
        header_rows: List of header rows (each row is a list of Cells)
        data_rows: List of Row objects

    Returns:
        Self for chaining
    """
    # Remember how many of the leading rows are headers.
    self.header_row_count = len(header_rows)

    # Headers first, then the cell lists of each data row.
    combined = list(header_rows) + [row.cells for row in data_rows]
    if not combined:
        return self

    self.row_count = len(combined)

    # Pass 1: work out how many columns the grid needs (colspan-aware).
    self._calculate_dimensions(combined)

    # Allocate an empty grid of that size.
    self.matrix = [[MatrixCell() for _ in range(self.col_count)]
                   for _ in range(self.row_count)]

    # Pass 2: drop every cell into its grid position(s).
    self._place_cells(combined)

    return self
def _calculate_dimensions(self, rows: List[List[Cell]]):
    """Calculate the actual dimensions considering colspan"""
    max_cols = 0
    for row_idx, row in enumerate(rows):
        col_pos = 0
        for cell in row:
            # Skip positions that might be occupied by rowspan from above
            # NOTE(review): _is_occupied consults self.matrix, which has not
            # yet been (re)built at this point in build_from_rows -- confirm
            # whether this rowspan skip ever takes effect here.
            while col_pos < max_cols and self._is_occupied(row_idx, col_pos):
                col_pos += 1
            # This cell will occupy from col_pos to col_pos + colspan
            col_end = col_pos + cell.colspan
            max_cols = max(max_cols, col_end)
            col_pos = col_end
    self.col_count = max_cols
def _is_occupied(self, row: int, col: int) -> bool:
    """Check if a position is occupied by a cell from a previous row (rowspan)"""
    # Nothing can span into the first row.
    if row == 0:
        return False
    # Scan every row above for an originating cell whose rowspan reaches `row`.
    for prev_idx, grid_row in enumerate(self.matrix[:row]):
        if col >= len(grid_row):
            continue
        candidate = grid_row[col]
        origin = candidate.original_cell
        # Only cells that originate on that row count, not spanned copies.
        if origin and candidate.row_origin == prev_idx:
            if prev_idx + origin.rowspan > row:
                return True
    return False
def _place_cells(self, rows: List[List[Cell]]):
    """Place cells in the matrix handling colspan and rowspan"""
    for row_idx, row in enumerate(rows):
        col_pos = 0
        # NOTE: cell_idx is currently unused; kept for debugging context.
        for cell_idx, cell in enumerate(row):
            # Find next available column position
            while col_pos < self.col_count and self.matrix[row_idx][col_pos].original_cell is not None:
                col_pos += 1

            if col_pos >= self.col_count:
                # Need to expand matrix
                self._expand_columns(col_pos + cell.colspan)

            # Special handling for cells with colspan > 1 containing numeric values
            # Only apply this logic for Table 15-style alignment issues
            # Check if this looks like a financial value that should be right-aligned
            cell_text = cell.text().strip()

            # Check for numeric values that need special alignment
            # This is specifically for cases like "167,045" that should align with "$167,045"
            has_comma_separator = ',' in cell_text
            digit_ratio = sum(c.isdigit() for c in cell_text) / len(cell_text) if cell_text else 0

            # Only apply special placement for colspan=2 numeric values in data rows
            # This handles Table 15's specific case without breaking Table 13
            is_special_numeric = (cell.colspan == 2 and  # Specifically colspan=2
                                  has_comma_separator and
                                  digit_ratio > 0.5 and  # More than 50% digits
                                  not cell_text.startswith('$') and
                                  not any(month in cell_text.lower() for month in
                                          ['jan', 'feb', 'mar', 'apr', 'may', 'jun',
                                           'jul', 'aug', 'sep', 'oct', 'nov', 'dec']) and
                                  row_idx > 1)  # Not a header row (allow for multi-row headers)

            if is_special_numeric:
                # Place empty cell at first position, content at second position
                # This is specifically for Table 15 alignment
                for r in range(cell.rowspan):
                    # First column of span: empty
                    if row_idx + r < self.row_count and col_pos < self.col_count:
                        self.matrix[row_idx + r][col_pos] = MatrixCell()
                    # Second column of span: the actual content
                    if row_idx + r < self.row_count and col_pos + 1 < self.col_count:
                        matrix_cell = MatrixCell(
                            original_cell=cell,
                            is_spanned=False,
                            row_origin=row_idx,
                            col_origin=col_pos + 1
                        )
                        self.matrix[row_idx + r][col_pos + 1] = matrix_cell
                    # Remaining columns of span: mark as spanned (though colspan=2 has no remaining)
                    for c in range(2, cell.colspan):
                        if row_idx + r < self.row_count and col_pos + c < self.col_count:
                            matrix_cell = MatrixCell(
                                original_cell=cell,
                                is_spanned=True,
                                row_origin=row_idx,
                                col_origin=col_pos + 1
                            )
                            self.matrix[row_idx + r][col_pos + c] = matrix_cell
            else:
                # Normal placement for other cells: the origin position holds
                # the cell, every other covered position is a spanned copy.
                for r in range(cell.rowspan):
                    for c in range(cell.colspan):
                        if row_idx + r < self.row_count and col_pos + c < self.col_count:
                            matrix_cell = MatrixCell(
                                original_cell=cell,
                                is_spanned=(r > 0 or c > 0),
                                row_origin=row_idx,
                                col_origin=col_pos
                            )
                            self.matrix[row_idx + r][col_pos + c] = matrix_cell

            col_pos += cell.colspan
def _expand_columns(self, new_col_count: int):
    """Expand matrix to accommodate more columns"""
    extra = new_col_count - self.col_count
    # Already wide enough -- nothing to do.
    if extra <= 0:
        return
    # Pad every existing row with fresh empty cells.
    for grid_row in self.matrix:
        grid_row.extend(MatrixCell() for _ in range(extra))
    self.col_count = new_col_count
def get_actual_columns(self) -> int:
    """Get the actual number of data columns (excluding empty/spacing columns)"""
    populated = 0
    for col_idx in range(self.col_count):
        # A column counts as soon as one originating cell has real text.
        for row_idx in range(self.row_count):
            entry = self.matrix[row_idx][col_idx]
            if entry.original_cell and not entry.is_spanned:
                content = entry.original_cell.text().strip()
                if content and content not in ['', ' ', '\xa0']:
                    populated += 1
                    break
    return populated
def get_column_widths(self) -> List[float]:
    """Estimate column widths based on content"""
    widths = []
    for col_idx in range(self.col_count):
        longest = 0
        populated = 0
        for row_idx in range(self.row_count):
            entry = self.matrix[row_idx][col_idx]
            if entry.original_cell and not entry.is_spanned:
                content = entry.original_cell.text().strip()
                if content:
                    longest = max(longest, len(content))
                    populated += 1
        # A column with no content at all is likely pure spacing: width 0.
        widths.append(longest if populated else 0)
    return widths
def get_cell(self, row_idx: int, col_idx: int) -> Optional[Cell]:
    """
    Get a cell at specific position in the matrix.

    Args:
        row_idx: Row index
        col_idx: Column index

    Returns:
        Cell at position or None if out of bounds
    """
    in_bounds = 0 <= row_idx < self.row_count and 0 <= col_idx < self.col_count
    if not in_bounds:
        return None
    occupant = self.matrix[row_idx][col_idx].original_cell
    # Empty grid positions are surfaced as a blank Cell, never None.
    return occupant if occupant else Cell("")
def get_expanded_row(self, row_idx: int) -> List[Optional[Cell]]:
    """
    Get a row with cells expanded to match column count.

    For cells with colspan > 1, the cell appears in the first position
    and None in subsequent positions.
    """
    if row_idx >= self.row_count:
        return []
    row_cells: List[Optional[Cell]] = []
    for entry in self.matrix[row_idx]:
        # Origin cells pass through; spanned copies and gaps become None.
        if entry.original_cell and not entry.is_spanned:
            row_cells.append(entry.original_cell)
        else:
            row_cells.append(None)
    return row_cells
def get_data_columns(self) -> List[int]:
    """
    Get indices of columns that contain actual data (not spacing).
    Uses strategy similar to old parser - keeps single empty columns for spacing.

    Returns:
        List of column indices that contain data
    """
    # First, identify which columns are empty
    empty_cols = []
    for col_idx in range(self.col_count):
        has_content = False
        for row_idx in range(self.row_count):
            cell = self.matrix[row_idx][col_idx]
            if cell.original_cell and not cell.is_spanned:
                text = cell.original_cell.text().strip()
                if text:
                    has_content = True
                    break
        if not has_content:
            empty_cols.append(col_idx)

    # Apply old parser's strategy
    cols_to_remove = set()

    # Remove leading empty columns
    for col in range(self.col_count):
        if col in empty_cols:
            cols_to_remove.add(col)
        else:
            break

    # Remove trailing empty columns
    for col in reversed(range(self.col_count)):
        if col in empty_cols:
            cols_to_remove.add(col)
        else:
            break

    # Remove consecutive empty columns in the middle (keep single empty cols for spacing)
    i = 0
    while i < self.col_count - 1:
        if i in empty_cols and (i + 1) in empty_cols:
            # Found consecutive empty columns
            consecutive_count = 0
            j = i
            while j < self.col_count and j in empty_cols:
                consecutive_count += 1
                j += 1
            # Keep first empty column as spacer, remove the rest
            # (range excludes column i itself, so the spacer survives).
            cols_to_remove.update(range(i + 1, i + consecutive_count))
            i = j
        else:
            i += 1

    # Return columns that are NOT in the removal set
    data_cols = [col for col in range(self.col_count) if col not in cols_to_remove]
    return data_cols
def filter_spacing_columns(self) -> 'TableMatrix':
    """
    Create a new matrix with spacing columns removed.
    Also handles colspan-generated duplicate columns and misalignment.

    Returns:
        New TableMatrix with only data columns

    The method works in phases:
      1. classify columns (primary headers, header content, data);
      2. find adjacent-column "fragments" ($ / ( ... ) / %) to consolidate;
      3. decide which columns to keep; then
      4. rebuild a new matrix, merging consolidated fragments as it goes.
    """
    # First pass: identify primary header columns (those with colspan > 1 headers)
    # and data columns
    primary_header_cols = set()
    all_header_cols = set()
    data_cols = set()

    # Find primary header columns (those that start a colspan)
    for row_idx in range(min(3, self.row_count)):
        for col_idx in range(self.col_count):
            cell = self.matrix[row_idx][col_idx]
            if cell.original_cell and not cell.is_spanned:
                if cell.original_cell.text().strip():
                    all_header_cols.add(col_idx)
                    # Check if this is a primary header (colspan > 1)
                    if cell.original_cell.colspan > 1:
                        primary_header_cols.add(col_idx)

    # If no primary headers found, use all headers as primary
    if not primary_header_cols:
        primary_header_cols = all_header_cols

    # Phase 1.5: Identify columns with header content
    # Any column with non-empty text in ANY header row must be preserved
    # This prevents legitimate header columns from being removed as "spacing"
    # Also preserve columns that are spanned by headers (colspan > 1)
    header_content_columns = set()
    for col_idx in range(self.col_count):
        for row_idx in range(self.header_row_count):
            cell = self.matrix[row_idx][col_idx]
            if cell.original_cell:
                # Check for original header cell with content
                if not cell.is_spanned:
                    text = cell.original_cell.text().strip()
                    if text:
                        header_content_columns.add(col_idx)
                        # Also add all columns spanned by this header
                        if cell.original_cell.colspan > 1:
                            for span_offset in range(1, cell.original_cell.colspan):
                                span_col = col_idx + span_offset
                                if span_col < self.col_count:
                                    header_content_columns.add(span_col)
                        break  # Found content, no need to check other header rows
                # Also preserve columns that are spanned (part of a colspan)
                elif cell.is_spanned:
                    # This column is part of a header's colspan
                    text = cell.original_cell.text().strip()
                    if text:
                        header_content_columns.add(col_idx)

    # Find columns with data (skip header rows)
    # Count actual header rows by checking for non-data content
    actual_header_rows = 0
    for row_idx in range(min(3, self.row_count)):
        has_numeric_data = False
        for col_idx in range(self.col_count):
            cell = self.matrix[row_idx][col_idx]
            if cell.original_cell and not cell.is_spanned:
                text = cell.original_cell.text().strip()
                # Check if it looks like numeric data (has commas or starts with $)
                if text and (',' in text and any(c.isdigit() for c in text)) or text == '$':
                    has_numeric_data = True
                    break
        if has_numeric_data:
            break
        actual_header_rows += 1

    data_start_row = max(1, actual_header_rows)

    # Track columns with significant data (not just isolated cells)
    col_data_count = {}
    for row_idx in range(data_start_row, self.row_count):
        for col_idx in range(self.col_count):
            cell = self.matrix[row_idx][col_idx]
            if cell.original_cell and not cell.is_spanned:
                if cell.original_cell.text().strip():
                    data_cols.add(col_idx)
                    col_data_count[col_idx] = col_data_count.get(col_idx, 0) + 1

    # Build initial list of columns to keep
    # Always include column 0 if it contains row labels
    cols_to_keep = set(primary_header_cols)

    # Add columns with header content (prevents removing legitimate headers)
    cols_to_keep.update(header_content_columns)

    # Identify misaligned data columns that need to be consolidated
    # These are data columns that are not primary header columns
    misaligned_data_cols = data_cols - primary_header_cols

    # Map misaligned data columns to their nearest column for consolidation
    # Only consolidate directly adjacent columns with specific patterns
    consolidation_map = {}

    # First pass: identify all potential consolidations
    potential_consolidations = {}
    for data_col in sorted(misaligned_data_cols):
        # Check if this column should be consolidated with an adjacent column
        # Check the column immediately before this one
        prev_col = data_col - 1

        # Sample some cells to see if consolidation makes sense
        consolidation_type = None
        for row_idx in range(data_start_row, min(data_start_row + 10, self.row_count)):
            prev_cell = self.matrix[row_idx][prev_col] if prev_col >= 0 else None
            curr_cell = self.matrix[row_idx][data_col]

            if prev_cell and prev_cell.original_cell and curr_cell.original_cell:
                prev_text = prev_cell.original_cell.text().strip()
                curr_text = curr_cell.original_cell.text().strip()

                # Skip empty cells
                if not prev_text or not curr_text:
                    continue

                # Check for patterns that indicate consolidation
                if prev_text == '$' and curr_text and curr_text[0].isdigit():
                    consolidation_type = 'currency'
                    break
                elif prev_text.startswith('(') and curr_text == ')':
                    consolidation_type = 'parentheses'
                    break
                elif curr_text == '%' and prev_text and prev_text[-1].isdigit():
                    consolidation_type = 'percentage'
                    break

        if consolidation_type:
            potential_consolidations[data_col] = (prev_col, consolidation_type)

    # Second pass: resolve conflicts
    # If column Y is a target for consolidation from Y+1 (e.g., parentheses),
    # then don't consolidate Y into another column
    columns_needed_as_targets = set()
    for data_col, (target_col, cons_type) in potential_consolidations.items():
        if cons_type == 'parentheses':
            # This target column is needed for parentheses consolidation
            columns_needed_as_targets.add(target_col)

    # Build final consolidation map, skipping consolidations that would remove needed targets
    for data_col, (target_col, cons_type) in potential_consolidations.items():
        # Don't consolidate this column if it's needed as a target for parentheses
        if data_col in columns_needed_as_targets and cons_type != 'parentheses':
            continue
        # CRITICAL: Don't consolidate columns that have header content
        # This prevents legitimate header columns from being merged together
        if data_col in header_content_columns or target_col in header_content_columns:
            continue
        consolidation_map[data_col] = target_col
        # Debug: uncomment to see consolidation mapping
        # import os
        # if os.environ.get('DEBUG_TABLE_CONSOLIDATION'):
        #     print(f"Consolidating column {data_col} into {target_col}")

    # Special case: Keep data columns that are associated with header columns
    # This handles cases where headers span multiple columns but data is in specific columns
    for header_col in primary_header_cols:
        # Check if there's a data column immediately after the header column
        # This is common when headers span multiple columns
        for offset in range(1, 3):  # Check next 1-2 columns
            data_col = header_col + offset
            if data_col in data_cols and data_col not in cols_to_keep:
                # Check if this column has meaningful data
                has_data = False
                for row_idx in range(data_start_row, min(data_start_row + 5, self.row_count)):
                    cell = self.matrix[row_idx][data_col]
                    if cell.original_cell and not cell.is_spanned:
                        text = cell.original_cell.text().strip()
                        if text and text not in ['', '-', '', '']:
                            has_data = True
                            break
                if has_data:
                    cols_to_keep.add(data_col)

    # Keep data columns that have significant content but aren't near header columns
    # This includes columns with dates, text descriptions, etc.
    for col_idx in data_cols:
        if col_idx not in cols_to_keep:
            # Check if this column has important data
            has_important_data = False
            non_empty_count = 0
            text_samples = []

            for row_idx in range(data_start_row, min(data_start_row + 10, self.row_count)):
                cell = self.matrix[row_idx][col_idx]
                if cell.original_cell and not cell.is_spanned:
                    text = cell.original_cell.text().strip()
                    if text and text not in ['', '-', '', '']:
                        non_empty_count += 1
                        if len(text_samples) < 3:
                            text_samples.append(text)
                        # Check for important patterns
                        # Dates, years, text descriptions, etc.
                        if any([
                            len(text) > 3 and not text.replace(',', '').replace('.', '').isdigit(),  # Non-trivial text
                            any(month in text for month in ['January', 'February', 'March', 'April', 'May', 'June',
                                                            'July', 'August', 'September', 'October', 'November', 'December']),
                            any(month in text for month in ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                                                            'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']),
                            '20' in text and any(c.isdigit() for c in text),  # Likely contains year
                        ]):
                            has_important_data = True

            # Keep columns with consistent important data
            if has_important_data and non_empty_count >= 3:
                cols_to_keep.add(col_idx)

    # Special case: If we have very few primary headers but lots of data columns,
    # we might have a table where headers are in data rows (like years)
    # Keep columns that have significant financial data
    if len(primary_header_cols) <= 2 and len(data_cols) > 4:
        # Check for financial data patterns in columns
        for col_idx in data_cols:
            has_financial_data = False
            sample_count = 0

            # Sample a few cells from this column
            for row_idx in range(data_start_row, min(data_start_row + 5, self.row_count)):
                cell = self.matrix[row_idx][col_idx]
                if cell.original_cell and not cell.is_spanned:
                    text = cell.original_cell.text().strip()
                    if text:
                        sample_count += 1
                        # Check for financial patterns
                        if any([
                            text.startswith('(') and any(c.isdigit() for c in text),  # Negative numbers
                            text == ')' and col_idx > 0,  # Closing parenthesis
                            '$' in text,  # Currency
                            '%' in text,  # Percentages
                            text.replace(',', '').replace('.', '').isdigit(),  # Plain numbers
                            text in ['', '', '-', '*']  # Common placeholders
                        ]):
                            has_financial_data = True
                            break

            # Keep columns with financial data
            if has_financial_data and sample_count > 0:
                cols_to_keep.add(col_idx)

    # Check if column 0 contains row labels (non-empty cells in data rows)
    col_0_has_labels = False
    data_start_row = max(1, actual_header_rows)
    for row_idx in range(data_start_row, self.row_count):
        cell = self.matrix[row_idx][0]
        if cell.original_cell and not cell.is_spanned:
            text = cell.original_cell.text().strip()
            if text and not text.isdigit() and not text.startswith('$') and len(text) > 1:
                col_0_has_labels = True
                break

    # Include column 0 if it has labels
    if col_0_has_labels:
        cols_to_keep.add(0)

    # Remove columns that will be consolidated into other columns
    # These columns' data will be merged into their target columns
    cols_to_remove = set(consolidation_map.keys())
    cols_to_keep = cols_to_keep - cols_to_remove

    cols_to_keep = sorted(cols_to_keep)

    # Create new matrix with consolidated columns
    if not cols_to_keep:
        return self

    new_matrix = TableMatrix()
    new_matrix.row_count = self.row_count
    new_matrix.col_count = len(cols_to_keep)
    new_matrix.header_row_count = self.header_row_count  # Preserve header row count
    new_matrix.matrix = []

    # Create mapping from old to new column indices
    old_to_new = {old_col: new_idx for new_idx, old_col in enumerate(cols_to_keep)}

    # Build new matrix with consolidation
    for row_idx in range(self.row_count):
        new_row = [MatrixCell() for _ in range(new_matrix.col_count)]

        # Track which cells we've already placed to handle colspan properly
        placed_origins = {}  # Maps (row_origin, col_origin) to new column index

        # First, copy cells from kept columns
        for old_col in sorted(cols_to_keep):
            if old_col not in old_to_new:
                continue
            new_col = old_to_new[old_col]
            cell = self.matrix[row_idx][old_col]

            if cell.original_cell:
                origin_key = (cell.row_origin, cell.col_origin)

                # Check if we've already placed this cell (due to colspan)
                if origin_key in placed_origins:
                    # This is a continuation of a colspan - mark as spanned
                    new_row[new_col] = MatrixCell(
                        original_cell=cell.original_cell,
                        is_spanned=True,  # Mark as spanned since it's part of a colspan
                        row_origin=cell.row_origin,
                        col_origin=placed_origins[origin_key]  # Point to the original placement
                    )
                else:
                    # First occurrence of this cell - place normally
                    new_row[new_col] = MatrixCell(
                        original_cell=cell.original_cell,
                        is_spanned=False,  # This is the primary cell
                        row_origin=cell.row_origin,
                        col_origin=new_col
                    )
                    placed_origins[origin_key] = new_col

        # Then, consolidate misaligned data into header columns
        for data_col, header_col in consolidation_map.items():
            if header_col in old_to_new:
                new_col = old_to_new[header_col]
                data_cell = self.matrix[row_idx][data_col] if data_col < len(self.matrix[row_idx]) else None

                # If data cell has content, merge it with header column
                if data_cell and data_cell.original_cell and not data_cell.is_spanned:
                    # Skip empty data cells
                    if not data_cell.original_cell.text().strip():
                        continue
                    # Check the original header column cell to see if it has content to merge
                    header_cell = self.matrix[row_idx][header_col]
                    existing_cell = new_row[new_col]  # NOTE: currently unused

                    # Check if we need to merge (e.g., $ with value)
                    if header_cell.original_cell and header_cell.original_cell.text().strip():
                        existing_text = header_cell.original_cell.text().strip()
                        new_text = data_cell.original_cell.text().strip()

                        # Merge currency symbol with value OR value with percentage OR parentheses
                        if existing_text == '$' and new_text:
                            # Currency merge: $ + number
                            merged_text = f"${new_text}"
                            # Create new cell with merged content
                            merged_cell = Cell(
                                content=merged_text,
                                colspan=header_cell.original_cell.colspan,
                                rowspan=header_cell.original_cell.rowspan,
                                is_header=header_cell.original_cell.is_header,
                                align=data_cell.original_cell.align if hasattr(data_cell.original_cell, 'align') else None
                            )
                            new_row[new_col] = MatrixCell(
                                original_cell=merged_cell,
                                is_spanned=False,
                                row_origin=row_idx,
                                col_origin=new_col
                            )
                        elif new_text == ')' and existing_text.startswith('('):
                            # Parentheses merge: (number + )
                            merged_text = f"{existing_text})"
                            # Create new cell with merged content
                            merged_cell = Cell(
                                content=merged_text,
                                colspan=header_cell.original_cell.colspan,
                                rowspan=header_cell.original_cell.rowspan,
                                is_header=header_cell.original_cell.is_header,
                                align=data_cell.original_cell.align if hasattr(data_cell.original_cell, 'align') else None
                            )
                            new_row[new_col] = MatrixCell(
                                original_cell=merged_cell,
                                is_spanned=False,
                                row_origin=row_idx,
                                col_origin=new_col
                            )
                        elif new_text == '%' and existing_text:
                            # Percentage merge: number + %
                            merged_text = f"{existing_text}%"
                            # Create new cell with merged content
                            merged_cell = Cell(
                                content=merged_text,
                                colspan=header_cell.original_cell.colspan,
                                rowspan=header_cell.original_cell.rowspan,
                                is_header=header_cell.original_cell.is_header,
                                align=header_cell.original_cell.align if hasattr(header_cell.original_cell, 'align') else None
                            )
                            new_row[new_col] = MatrixCell(
                                original_cell=merged_cell,
                                is_spanned=False,
                                row_origin=row_idx,
                                col_origin=new_col
                            )
                        else:
                            # Just keep the data cell if can't merge
                            new_row[new_col] = MatrixCell(
                                original_cell=data_cell.original_cell,
                                is_spanned=False,
                                row_origin=row_idx,
                                col_origin=new_col
                            )
                    else:
                        # No existing content, just move the data
                        new_row[new_col] = MatrixCell(
                            original_cell=data_cell.original_cell,
                            is_spanned=False,
                            row_origin=row_idx,
                            col_origin=new_col
                        )

        new_matrix.matrix.append(new_row)

    return new_matrix
def to_cell_grid(self) -> List[List[Optional[Cell]]]:
    """
    Convert matrix to a simple 2D grid of cells.

    Returns:
        2D list where each position contains either a Cell or None
    """
    # Origin cells survive; spanned copies and empty slots collapse to None.
    return [
        [
            entry.original_cell
            if (entry.original_cell and not entry.is_spanned)
            else None
            for entry in grid_row
        ]
        for grid_row in self.matrix
    ]
def debug_print(self):
    """Print matrix structure for debugging"""
    print(f"Matrix: {self.row_count}×{self.col_count}")
    for row_idx in range(self.row_count):
        rendered = []
        for col_idx in range(self.col_count):
            entry = self.matrix[row_idx][col_idx]
            if entry.original_cell:
                snippet = entry.original_cell.text()[:10]
                # Brackets mark positions covered by a span continuation.
                rendered.append(f"[{snippet}...]" if entry.is_spanned else f"{snippet}...")
            else:
                rendered.append("___")
        print(f"Row {row_idx}: {' | '.join(rendered)}")
class ColumnAnalyzer:
    """Analyze column structure to identify data vs spacing columns"""

    def __init__(self, matrix: TableMatrix):
        """Initialize with a table matrix"""
        self.matrix = matrix

    def identify_spacing_columns(self) -> List[int]:
        """
        Identify columns used only for spacing.

        Returns:
            List of column indices that are spacing columns
        """
        widths = self.matrix.get_column_widths()
        total_width = sum(widths)
        # Collect every column the per-column check classifies as spacing.
        return [
            col_idx
            for col_idx in range(self.matrix.col_count)
            if self._is_spacing_column(col_idx, widths, total_width)
        ]

    def _is_spacing_column(self, col_idx: int, widths: List[float], total_width: float) -> bool:
        """
        Check if a column is used for spacing.
        Only mark as spacing if column is completely empty.

        Criteria:
        - Column has absolutely no content across all rows
        (widths/total_width are kept for signature compatibility; the
        current check is purely content-based.)
        """
        for row_idx in range(self.matrix.row_count):
            entry = self.matrix.matrix[row_idx][col_idx]
            if entry.original_cell and not entry.is_spanned:
                # Any text at all disqualifies the column from being spacing.
                if entry.original_cell.text().strip():
                    return False
        return True

    def get_clean_column_indices(self) -> List[int]:
        """
        Get indices of non-spacing columns.

        Returns:
            List of column indices that contain actual data
        """
        spacing = set(self.identify_spacing_columns())
        return [i for i in range(self.matrix.col_count) if i not in spacing]

View File

@@ -0,0 +1,440 @@
"""
Table of Contents analyzer for SEC filings.
This module analyzes the TOC structure to map section names to anchor IDs,
enabling section extraction for API filings with generated anchor IDs.
"""
import re
from typing import Dict, List, Optional, Set, Tuple
from dataclasses import dataclass
from lxml import html as lxml_html
@dataclass
class TOCSection:
    """Represents a section found in the Table of Contents."""
    name: str  # Link text exactly as it appeared in the TOC
    anchor_id: str  # Target anchor id (the href without the leading '#')
    normalized_name: str  # Canonical section name used to build the mapping
    section_type: str  # 'item', 'part', 'other'
    order: int  # Ordering key for the section (see _get_section_type_and_order)
    part: Optional[str] = None  # NEW: "Part I", "Part II", or None for 10-K
class TOCAnalyzer:
"""
Analyzes Table of Contents structure to map section names to anchor IDs.
This enables section extraction for filings where anchor IDs are generated
rather than semantic (like API filings vs local HTML files).
"""
def __init__(self):
    # SEC section patterns for normalization
    # (regex pattern, section type) pairs -- presumably consumed by the
    # normalization helpers (_normalize_section_name et al.), which are
    # outside this view; TODO confirm.
    self.section_patterns = [
        (r'(?:item|part)\s+\d+[a-z]?', 'item'),
        (r'business', 'item'),
        (r'risk\s+factors?', 'item'),
        (r'properties', 'item'),
        (r'legal\s+proceedings', 'item'),
        (r'management.*discussion', 'item'),
        (r'md&a', 'item'),
        (r'financial\s+statements?', 'item'),
        (r'exhibits?', 'item'),
        (r'signatures?', 'item'),
        (r'part\s+[ivx]+', 'part'),
    ]
def analyze_toc_structure(self, html_content: str) -> Dict[str, str]:
    """
    Analyze HTML content to extract section mappings from TOC.

    Args:
        html_content: Raw HTML content

    Returns:
        Dict mapping normalized section names to anchor IDs (empty when
        nothing could be parsed).
    """
    section_mapping = {}

    try:
        # Handle XML declaration issues: lxml cannot parse a unicode string
        # that still carries an <?xml ...?> declaration, so strip it first.
        if html_content.startswith('<?xml'):
            html_content = re.sub(r'<\?xml[^>]*\?>', '', html_content, count=1)

        tree = lxml_html.fromstring(html_content)

        # Find all anchor links that could be TOC links
        anchor_links = tree.xpath('//a[@href]')

        toc_sections = []
        current_part = None  # Track current part context for 10-Q filings
        part_pattern = re.compile(r'^\s*Part\s+([IVX]+)\b', re.IGNORECASE)

        for link in anchor_links:
            href = link.get('href', '').strip()
            text = (link.text_content() or '').strip()

            # Part headers in 10-Q TOCs typically appear as separate rows
            # ("Part I", "Part II"): they set context but are not sections.
            part_match = part_pattern.match(text)
            if part_match:
                current_part = f"Part {part_match.group(1).upper()}"
                continue

            # Only internal anchor links with visible text are candidates.
            if href.startswith('#') and text:
                anchor_id = href[1:]  # Remove '#'

                # Try to find item number in preceding context (for table-based TOCs)
                preceding_item = self._extract_preceding_item_label(link)

                # Check text, anchor ID and surrounding context for a section reference.
                if self._is_section_link(text, anchor_id, preceding_item):
                    # Only keep links whose target element actually exists.
                    target_elements = tree.xpath(f'//*[@id="{anchor_id}"]')
                    if target_elements:
                        # Item-number resolution priority:
                        # anchor ID > preceding context > link text.
                        normalized_name = self._normalize_section_name(text, anchor_id, preceding_item)
                        section_type, order = self._get_section_type_and_order(normalized_name)

                        toc_sections.append(TOCSection(
                            name=text,
                            anchor_id=anchor_id,
                            normalized_name=normalized_name,
                            section_type=section_type,
                            order=order,
                            part=current_part  # Assign current part context
                        ))

        # Build mapping prioritizing the most standard section names
        section_mapping = self._build_section_mapping(toc_sections)

    except Exception:
        # Best-effort: on any parse failure return an empty mapping so the
        # caller can fall back to other section-detection methods.
        pass

    return section_mapping
def _extract_preceding_item_label(self, link_element) -> str:
    """
    Pull an item/part label (e.g. "Item 1A", "Part I") from the context
    preceding a TOC link.

    Handles table-based TOCs where the label sits in a separate cell:
        <td>Item 1.</td><td><a href="...">Business</a></td>
    including nested wrappers such as:
        <td>Item 1.</td><td><div><span><a href="...">Business</a></span></div></td>

    Args:
        link_element: The <a> element (lxml) whose context is inspected.

    Returns:
        A label like "Item 1", "Item 1A", "Part I", or '' when none is found.
    """
    try:
        # Walk upward (at most 5 levels) looking for the enclosing table cell.
        cell = None
        node = link_element
        for _ in range(5):
            ancestor = node.getparent()
            if ancestor is None:
                break
            if ancestor.tag in ('td', 'th'):
                cell = ancestor
                break
            node = ancestor

        if cell is not None:
            # Scan every earlier cell in the row (nearest first). The label is
            # not always in the directly adjacent cell, e.g. rows shaped like
            # ['Business', 'I', '1', '5'] where '1' is the item number.
            sibling = cell.getprevious()
            while sibling is not None:
                if sibling.tag in ('td', 'th'):
                    cell_text = (sibling.text_content() or '').strip()
                    # Explicit "Item 1A"-style label.
                    m = re.match(r'(Item\s+\d+[A-Z]?)\.?\s*$', cell_text, re.IGNORECASE)
                    if m:
                        return m.group(1)
                    # Bare item number such as "1" or "1A"; restricted to 1-15
                    # so page numbers (50, 108, ...) are not mistaken for items.
                    m = re.match(r'^([1-9]|1[0-5])([A-Z]?)\.?\s*$', cell_text, re.IGNORECASE)
                    if m:
                        return f"Item {m.group(1)}{m.group(2)}"
                    # Explicit "Part I"-style label.
                    m = re.match(r'(Part\s+[IVX]+)\.?\s*$', cell_text, re.IGNORECASE)
                    if m:
                        return m.group(1)
                    # Bare uppercase roman numeral: "I", "II", ...
                    m = re.match(r'^([IVX]+)\.?\s*$', cell_text)
                    if m:
                        return f"Part {m.group(1)}"
                sibling = sibling.getprevious()

        # Fall back to inline text directly before the link (div/span/p wrappers).
        wrapper = link_element.getparent()
        if wrapper is not None and wrapper.tag in ('div', 'span', 'p') and wrapper.text:
            leading = wrapper.text.strip()
            m = re.search(r'(Item\s+\d+[A-Z]?)\.?\s*$', leading, re.IGNORECASE)
            if m:
                return m.group(1)
            m = re.search(r'(Part\s+[IVX]+)\.?\s*$', leading, re.IGNORECASE)
            if m:
                return m.group(1)
    except Exception:
        # Best-effort extraction: any structural surprise means "no label".
        pass
    return ''
def _is_section_link(self, text: str, anchor_id: str = '', preceding_item: str = '') -> bool:
"""
Check if link represents a section reference.
Checks link text, anchor ID, and preceding context to handle cases where:
- Text is descriptive (e.g., "Executive Compensation")
- Anchor ID contains item number (e.g., "item_11_executive_compensation")
- Item number is in preceding table cell (e.g., <td>Item 1.</td><td><a>Business</a></td>)
Args:
text: Link text
anchor_id: Anchor ID from href (without #)
preceding_item: Item/part label from preceding context (e.g., "Item 1A")
Returns:
True if this appears to be a section link
"""
if not text:
return False
# First check if there's a preceding item label (table-based TOC)
if preceding_item:
return True
# Then check anchor ID for item/part patterns (most reliable)
if anchor_id:
anchor_lower = anchor_id.lower()
# Match patterns like: item_1, item_1a, item1, item1a, part_i, part_ii, etc.
if re.search(r'item_?\d+[a-z]?', anchor_lower):
return True
if re.search(r'part_?[ivx]+', anchor_lower):
return True
# Then check text (with relaxed length limit for descriptive section names)
if len(text) > 150: # Increased from 100 to accommodate longer section titles
return False
# Check against known patterns
for pattern, _ in self.section_patterns:
if re.search(pattern, text, re.IGNORECASE):
return True
# Also consider links with section keywords
if len(text) < 100 and any(keyword in text.lower() for keyword in
['item', 'part', 'business', 'risk', 'properties', 'legal',
'compensation', 'ownership', 'governance', 'directors']):
return True
return False
def _normalize_section_name(self, text: str, anchor_id: str = '', preceding_item: str = '') -> str:
"""
Normalize section name for consistent lookup.
Prioritizes:
1. Preceding item label (table-based TOC)
2. Anchor ID pattern
3. Text-based normalization
Args:
text: Link text
anchor_id: Anchor ID from href (without #)
preceding_item: Item/part label from preceding context
Returns:
Normalized section name (e.g., "Item 1A", "Part II")
"""
text = text.strip()
# HIGHEST PRIORITY: Use preceding item label if available (table-based TOC)
if preceding_item:
# Clean up and normalize the preceding item
item_match = re.match(r'item\s+(\d+[a-z]?)', preceding_item, re.IGNORECASE)
if item_match:
return f"Item {item_match.group(1).upper()}"
part_match = re.match(r'part\s+([ivx]+)', preceding_item, re.IGNORECASE)
if part_match:
return f"Part {part_match.group(1).upper()}"
# SECOND PRIORITY: Try to extract from anchor ID
if anchor_id:
anchor_lower = anchor_id.lower()
# Match item patterns: item_1a, item1a, item_1_business, etc.
item_match = re.search(r'item_?(\d+[a-z]?)', anchor_lower)
if item_match:
item_num = item_match.group(1).upper()
return f"Item {item_num}"
# Match part patterns: part_i, part_ii, parti, partii, etc.
part_match = re.search(r'part_?([ivx]+)', anchor_lower)
if part_match:
part_num = part_match.group(1).upper()
return f"Part {part_num}"
# THIRD PRIORITY: Text-based normalization
# Handle common Item patterns in text
item_match = re.match(r'item\s+(\d+[a-z]?)', text, re.IGNORECASE)
if item_match:
return f"Item {item_match.group(1).upper()}"
# Handle Part patterns
part_match = re.match(r'part\s+([ivx]+)', text, re.IGNORECASE)
if part_match:
return f"Part {part_match.group(1).upper()}"
# Handle specific known sections by text
text_lower = text.lower()
if 'business' in text_lower and 'item' not in text_lower:
return "Item 1"
elif 'risk factors' in text_lower and 'item' not in text_lower:
return "Item 1A"
elif 'properties' in text_lower and 'item' not in text_lower:
return "Item 2"
elif 'legal proceedings' in text_lower and 'item' not in text_lower:
return "Item 3"
elif 'management' in text_lower and 'discussion' in text_lower:
return "Item 7"
elif 'financial statements' in text_lower:
return "Item 8"
elif 'exhibits' in text_lower:
return "Item 15"
return text # Return as-is if no normalization applies
def _get_section_type_and_order(self, text: str) -> Tuple[str, int]:
"""Get section type and order for sorting."""
text_lower = text.lower()
# Items
item_match = re.search(r'item\s*(\d+)([a-z]?)', text_lower)
if item_match:
item_num = int(item_match.group(1))
item_letter = item_match.group(2) or ''
# Order: Item 1=1000, Item 1A=1001, Item 2=2000, etc.
order = item_num * 1000 + (ord(item_letter.upper()) - ord('A') + 1 if item_letter else 0)
return 'item', order
# Parts
part_match = re.search(r'part\s*([ivx]+)', text_lower)
if part_match:
part_roman = part_match.group(1)
part_num = self._roman_to_int(part_roman)
return 'part', part_num * 100 # Part I=100, Part II=200, etc.
# Known sections without explicit item numbers
if 'business' in text_lower:
return 'item', 1000 # Item 1
elif 'risk factors' in text_lower:
return 'item', 1001 # Item 1A
elif 'properties' in text_lower:
return 'item', 2000 # Item 2
elif 'legal proceedings' in text_lower:
return 'item', 3000 # Item 3
elif 'management' in text_lower and 'discussion' in text_lower:
return 'item', 7000 # Item 7
elif 'financial statements' in text_lower:
return 'item', 8000 # Item 8
elif 'exhibits' in text_lower:
return 'item', 15000 # Item 15
return 'other', 99999
def _roman_to_int(self, roman: str) -> int:
"""Convert roman numerals to integers."""
roman_map = {'i': 1, 'v': 5, 'x': 10, 'l': 50, 'c': 100, 'd': 500, 'm': 1000}
roman = roman.lower()
result = 0
prev = 0
for char in reversed(roman):
value = roman_map.get(char, 0)
if value < prev:
result -= value
else:
result += value
prev = value
return result
def _build_section_mapping(self, toc_sections: List[TOCSection]) -> Dict[str, str]:
    """
    Build the final section-name -> anchor-id mapping from TOC entries.

    Entries are processed in canonical order (by their precomputed sort
    key) and the first occurrence of each name wins. For 10-Q filings
    that carry part context, part-aware names such as "part_i_item_1"
    and "part_ii_item_1" are generated so identical item numbers in
    different parts stay distinct.

    Args:
        toc_sections: TOC entries to map. The input list is NOT modified
            (previously this method sorted it in place, a surprising side
            effect for callers).

    Returns:
        Dict mapping normalized section names to anchor IDs.
    """
    mapping: Dict[str, str] = {}
    # sorted() rather than list.sort(): leaves the caller's list untouched.
    for section in sorted(toc_sections, key=lambda s: s.order):
        if section.part:
            # 10-Q filings: "Part I" -> "part_i", "Item 1A" -> "item_1a",
            # combined as "part_i_item_1a".
            part_key = section.part.lower().replace(' ', '_')
            item_key = section.normalized_name.lower().replace(' ', '_')
            name = f"{part_key}_{item_key}"
        else:
            # 10-K filings: use the normalized name as-is.
            name = section.normalized_name
        # Prefer the first occurrence of a duplicated section name; the
        # mapping itself records what has been seen (no separate set needed).
        if name not in mapping:
            mapping[name] = section.anchor_id
    return mapping
def get_section_suggestions(self, html_content: str) -> List[str]:
    """
    List the section names that can be extracted from this document,
    ordered by their canonical filing position (Item 1 before Item 1A,
    unknowns last).

    Args:
        html_content: Raw HTML content of the filing.

    Returns:
        Sorted list of extractable section names.
    """
    available = self.analyze_toc_structure(html_content)

    def sort_key(name: str) -> int:
        # Position component of (type, order) drives the ordering.
        return self._get_section_type_and_order(name)[1]

    # Iterating the dict yields its keys, same as mapping.keys().
    return sorted(available, key=sort_key)
def analyze_toc_for_sections(html_content: str) -> Dict[str, str]:
    """
    Convenience wrapper: analyze a filing's table of contents and return
    its section mapping.

    Args:
        html_content: Raw HTML content.

    Returns:
        Dict mapping section names to anchor IDs.
    """
    return TOCAnalyzer().analyze_toc_structure(html_content)

View File

@@ -0,0 +1,104 @@
"""
Table of Contents Link Filter
Removes repetitive "Table of Contents" anchor links from document text,
matching the behavior of the old parser.
"""
import re
from typing import List
def filter_toc_links(text: str) -> str:
    """
    Filter out repetitive navigation links from text.

    This replicates the old parser's behavior of removing repetitive
    navigation links that appear throughout SEC filings.

    Based on analysis of 12+ SEC filings across different companies:
    - Average of 47.9 "Table of Contents" links per filing (575 total found)
    - Oracle 10-K shows 230 "Index to Financial Statements" vs 83 in old parser
    - Safe to filter without losing legitimate content

    Lines removed (matched case-insensitively against the stripped line):
    - "Table of Contents"
    - "Index to Financial Statements"
    - "Index to Exhibits"

    Args:
        text: Input text to filter

    Returns:
        Text with navigation-link lines removed; all other lines are kept
        byte-for-byte (original whitespace included).
    """
    if not text:
        return text

    # One anchored alternation. re.IGNORECASE makes separate upper/lower-case
    # variants of each phrase redundant, so each phrase appears exactly once
    # (the original list duplicated them). Non-capturing group: only the
    # overall match matters. re.compile caches, so per-call compilation is cheap.
    nav_line = re.compile(
        r'^(?:Table of Contents'
        r'|Index to Financial Statements'
        r'|Index to Exhibits)$',
        re.IGNORECASE,
    )

    # Match against the stripped line but keep the original line untouched.
    kept = [line for line in text.split('\n') if not nav_line.match(line.strip())]
    return '\n'.join(kept)
def get_toc_link_stats(text: str) -> dict:
    """
    Collect statistics about navigation links in text for debugging/analysis.

    Args:
        text: Input text to analyze.

    Returns:
        Dict with per-pattern counts, up to five example matches per
        pattern, the combined match count, and the total line count.
        (For empty input a reduced dict without 'total_lines' is
        returned, matching historical behavior.)
    """
    if not text:
        return {'total_matches': 0, 'patterns': {}, 'examples': []}

    # All navigation patterns we filter, in reporting order.
    nav_patterns = {
        'Table of Contents': re.compile(r'^Table of Contents$', re.IGNORECASE),
        'Index to Financial Statements': re.compile(r'^Index to Financial Statements$', re.IGNORECASE),
        'Index to Exhibits': re.compile(r'^Index to Exhibits$', re.IGNORECASE),
    }

    lines = text.split('\n')
    per_pattern_counts = {}
    examples = []

    for label, regex in nav_patterns.items():
        hits = [
            {
                'line_num': idx + 1,
                'content': raw,
                'stripped': raw.strip(),
                'pattern': label,
            }
            for idx, raw in enumerate(lines)
            if regex.match(raw.strip())
        ]
        per_pattern_counts[label] = len(hits)
        examples.extend(hits[:5])  # cap at five examples per pattern

    return {
        'total_matches': sum(per_pattern_counts.values()),
        'patterns': per_pattern_counts,
        'examples': examples,
        'total_lines': len(lines),
    }