Initial commit

This commit is contained in:
kdusek
2025-12-09 12:13:01 +01:00
commit 8e654ed209
13332 changed files with 2695056 additions and 0 deletions

View File

@@ -0,0 +1,51 @@
"""
Utility modules for HTML parsing.
"""
from edgar.documents.utils.cache import (
LRUCache,
WeakCache,
TimeBasedCache,
CacheManager,
get_cache_manager,
cached,
CacheStats
)
from edgar.documents.utils.streaming import (
StreamingParser
)
from edgar.documents.utils.table_matrix import (
TableMatrix,
ColumnAnalyzer,
MatrixCell
)
from edgar.documents.utils.currency_merger import (
CurrencyColumnMerger
)
# Note: CacheableMixin not exported to avoid circular imports
# Import directly: from edgar.documents.cache_mixin import CacheableMixin
from edgar.documents.utils.html_utils import (
remove_xml_declaration,
create_lxml_parser
)
# Note: table_utils not exported to avoid circular imports
# Import directly: from edgar.documents.utils.table_utils import process_table_matrix
__all__ = [
'LRUCache',
'WeakCache',
'TimeBasedCache',
'CacheManager',
'get_cache_manager',
'cached',
'CacheStats',
'StreamingParser',
'TableMatrix',
'ColumnAnalyzer',
'MatrixCell',
'CurrencyColumnMerger',
# 'CacheableMixin', # Not exported - import directly to avoid circular imports
'remove_xml_declaration',
'create_lxml_parser',
# 'process_table_matrix' # Not exported - import directly to avoid circular imports
]

View File

@@ -0,0 +1,205 @@
"""
Lightweight anchor analysis cache to avoid re-parsing HTML.
This provides a middle-ground approach that caches anchor analysis results
while minimizing memory overhead.
"""
import re
from typing import Dict, Set, Optional
from collections import Counter
import hashlib
import pickle
from pathlib import Path
class AnchorCache:
    """
    Cache for anchor link analysis results.

    Stores navigation patterns keyed by an MD5 hash of the HTML so the
    same document never has to be re-analyzed. A per-session in-memory
    dict sits in front of a persistent on-disk pickle store.
    """

    def __init__(self, cache_dir: Optional[Path] = None):
        """
        Initialize the cache.

        Args:
            cache_dir: Directory for the on-disk cache. Defaults to
                ~/.edgar_cache/anchors (created if missing).
        """
        self.cache_dir = cache_dir or Path.home() / '.edgar_cache' / 'anchors'
        self.cache_dir.mkdir(parents=True, exist_ok=True)
        self._memory_cache = {}  # In-memory cache for current session

    def _get_html_hash(self, html_content: str) -> str:
        """Return the MD5 hex digest of the HTML, used as the cache key."""
        # MD5 is fine here: the hash is a cache key, not a security boundary.
        return hashlib.md5(html_content.encode('utf-8')).hexdigest()

    def get_navigation_patterns(self, html_content: str) -> Optional[Set[str]]:
        """
        Get cached navigation patterns for HTML content.

        Args:
            html_content: HTML to analyze

        Returns:
            Set of navigation patterns or None if not cached
        """
        html_hash = self._get_html_hash(html_content)

        # Check in-memory cache first
        if html_hash in self._memory_cache:
            return self._memory_cache[html_hash]

        # Check disk cache.
        # NOTE(review): pickle.load trusts the cache directory; anyone who
        # can write ~/.edgar_cache can inject arbitrary objects.
        cache_file = self.cache_dir / f"{html_hash}.pkl"
        if cache_file.exists():
            try:
                with open(cache_file, 'rb') as f:
                    patterns = pickle.load(f)
                self._memory_cache[html_hash] = patterns
                return patterns
            except Exception:
                # FIX: was a bare `except:`, which also swallowed SystemExit
                # and KeyboardInterrupt. Treat any failure as a corrupted
                # cache file and remove it.
                cache_file.unlink(missing_ok=True)

        return None

    def cache_navigation_patterns(self, html_content: str, patterns: Set[str]) -> None:
        """
        Cache navigation patterns for HTML content.

        Args:
            html_content: HTML content
            patterns: Navigation patterns to cache
        """
        html_hash = self._get_html_hash(html_content)

        # Store in memory
        self._memory_cache[html_hash] = patterns

        # Store on disk; the cache is best-effort so write failures are
        # deliberately ignored (but no longer via a bare `except:`).
        try:
            cache_file = self.cache_dir / f"{html_hash}.pkl"
            with open(cache_file, 'wb') as f:
                pickle.dump(patterns, f)
        except (OSError, pickle.PicklingError):
            # Ignore cache write errors
            pass

    def clear_cache(self) -> None:
        """Clear in-memory entries and delete all on-disk cache files."""
        self._memory_cache.clear()
        for cache_file in self.cache_dir.glob("*.pkl"):
            cache_file.unlink(missing_ok=True)
# Global cache instance.
# NOTE(review): constructed at import time, which creates ~/.edgar_cache/anchors
# on disk as a side effect of AnchorCache.__init__; consider lazy creation if
# import-time I/O is undesirable.
_anchor_cache = AnchorCache()
def get_cached_navigation_patterns(html_content: str,
                                   force_analyze: bool = False) -> Set[str]:
    """
    Return navigation link texts for the given HTML, consulting the
    module-level anchor cache before doing any analysis.

    Args:
        html_content: HTML to analyze
        force_analyze: When True, skip the cache lookup and re-analyze.

    Returns:
        Set of navigation link texts to filter
    """
    if not force_analyze:
        hit = _anchor_cache.get_navigation_patterns(html_content)
        if hit is not None:
            return hit

    # Cache miss (or forced re-analysis): run the lightweight regex scan
    # and remember the result for subsequent calls.
    result = _analyze_navigation_minimal(html_content)
    _anchor_cache.cache_navigation_patterns(html_content, result)
    return result
def _analyze_navigation_minimal(html_content: str, min_frequency: int = 5) -> Set[str]:
"""
Minimal navigation analysis using regex instead of full HTML parsing.
This avoids BeautifulSoup overhead by using regex to find anchor patterns.
"""
patterns = set()
# Find all anchor links with regex (faster than BeautifulSoup)
anchor_pattern = re.compile(r'<a[^>]*href\s*=\s*["\']#([^"\']*)["\'][^>]*>(.*?)</a>',
re.IGNORECASE | re.DOTALL)
link_counts = Counter()
for match in anchor_pattern.finditer(html_content):
anchor_id = match.group(1).strip()
link_text = re.sub(r'<[^>]+>', '', match.group(2)).strip() # Remove HTML tags
link_text = ' '.join(link_text.split()) # Normalize whitespace
if link_text and len(link_text) < 100: # Reasonable link text length
link_counts[link_text] += 1
# Add frequently occurring links
for text, count in link_counts.items():
if count >= min_frequency:
patterns.add(text)
return patterns
def filter_with_cached_patterns(text: str, html_content: Optional[str] = None) -> str:
    """
    Filter text using cached navigation patterns.

    Preserves the first occurrences of each pattern (document structure
    headers) while dropping later repetitions (navigation links).

    Args:
        text: Text to filter
        html_content: HTML for pattern analysis. When omitted, a small set
            of common SEC navigation phrases is used instead.
            (FIX: annotation was `str = None`; it is genuinely optional.)

    Returns:
        Filtered text
    """
    if not text:
        return text

    # Get patterns (cached or analyze)
    if html_content:
        patterns = get_cached_navigation_patterns(html_content)
    else:
        # Fallback to common SEC patterns
        patterns = {
            'Table of Contents',
            'Index to Financial Statements',
            'Index to Exhibits'
        }

    if not patterns:
        return text

    # Smart filtering: allow the first few occurrences of each pattern
    # (likely document structure headers), suppress the rest (nav links).
    max_allowed_per_pattern = 2
    pattern_counts: Dict[str, int] = {}  # occurrences kept so far, per pattern
    filtered_lines = []

    for line in text.split('\n'):
        stripped_line = line.strip()
        if stripped_line in patterns:
            count = pattern_counts.get(stripped_line, 0)
            if count < max_allowed_per_pattern:
                # Keep this occurrence (likely a document structure header)
                filtered_lines.append(line)
                pattern_counts[stripped_line] = count + 1
            # else: drop this line (a repeated navigation link)
        else:
            # Not a navigation pattern, always keep
            filtered_lines.append(line)

    return '\n'.join(filtered_lines)

View File

@@ -0,0 +1,426 @@
"""
Cache utilities for performance optimization.
"""
import weakref
from collections import OrderedDict
from typing import Any, Dict, Optional, Callable, TypeVar, Generic
from functools import wraps
import time
import threading
from dataclasses import dataclass, field
from datetime import datetime, timedelta
T = TypeVar('T')
@dataclass
class CacheStats:
    """Runtime counters for one cache: hits, misses, evictions, timing."""
    hits: int = 0
    misses: int = 0
    evictions: int = 0
    total_time: float = 0.0
    last_reset: datetime = field(default_factory=datetime.now)

    @property
    def hit_rate(self) -> float:
        """Fraction of lookups served from cache (0.0 before any lookup)."""
        lookups = self.hits + self.misses
        if lookups == 0:
            return 0.0
        return self.hits / lookups

    @property
    def avg_access_time(self) -> float:
        """Mean wall-clock time per lookup (0.0 before any lookup)."""
        lookups = self.hits + self.misses
        if lookups == 0:
            return 0.0
        return self.total_time / lookups

    def reset(self):
        """Zero every counter and stamp the reset time."""
        self.hits = self.misses = self.evictions = 0
        self.total_time = 0.0
        self.last_reset = datetime.now()
class LRUCache(Generic[T]):
    """
    Thread-safe least-recently-used cache.

    Used for caching expensive operations such as style parsing and
    header detection results. All operations take an RLock, and every
    lookup updates the attached CacheStats.
    """

    def __init__(self, max_size: int = 1000):
        """
        Args:
            max_size: Maximum number of items held before eviction.
        """
        self.max_size = max_size
        self._cache: OrderedDict[str, T] = OrderedDict()
        self._lock = threading.RLock()
        self.stats = CacheStats()

    def get(self, key: str) -> Optional[T]:
        """
        Return the cached value for key, or None on a miss.

        A hit also promotes the entry to most-recently-used.
        """
        started = time.time()
        with self._lock:
            try:
                value = self._cache[key]
            except KeyError:
                self.stats.misses += 1
                self.stats.total_time += time.time() - started
                return None
            # Promote to most-recently-used position
            self._cache.move_to_end(key)
            self.stats.hits += 1
            self.stats.total_time += time.time() - started
            return value

    def put(self, key: str, value: T) -> None:
        """
        Insert or update an entry, evicting the least-recently-used
        entry when the cache grows past max_size.
        """
        with self._lock:
            already_present = key in self._cache
            self._cache[key] = value
            if already_present:
                self._cache.move_to_end(key)
            else:
                while len(self._cache) > self.max_size:
                    # Oldest entry sits at the front of the OrderedDict
                    self._cache.popitem(last=False)
                    self.stats.evictions += 1

    def clear(self) -> None:
        """Drop every cached entry."""
        with self._lock:
            self._cache.clear()

    def size(self) -> int:
        """Return the current number of cached entries."""
        with self._lock:
            return len(self._cache)
class WeakCache:
    """
    Weak-reference cache for parsed nodes.

    Entries never keep their values alive: once the rest of the program
    drops an object, its entry here goes dead and is pruned lazily (on
    lookup) or in bulk via cleanup().
    """

    def __init__(self):
        """Create an empty weak cache."""
        self._cache: Dict[str, weakref.ref] = {}
        self._lock = threading.RLock()
        self.stats = CacheStats()

    def get(self, key: str) -> Optional[Any]:
        """
        Look up a live object by key.

        Args:
            key: Cache key

        Returns:
            The cached object, or None if absent or already collected.
        """
        started = time.time()
        with self._lock:
            ref = self._cache.get(key)
            target = ref() if ref is not None else None
            if target is not None:
                self.stats.hits += 1
                self.stats.total_time += time.time() - started
                return target
            if ref is not None:
                # Referent was garbage collected; drop the stale entry.
                del self._cache[key]
            self.stats.misses += 1
            self.stats.total_time += time.time() - started
            return None

    def put(self, key: str, value: Any) -> None:
        """Store a weak reference to value under key (value must be weakref-able)."""
        with self._lock:
            self._cache[key] = weakref.ref(value)

    def clear(self) -> None:
        """Drop every stored reference."""
        with self._lock:
            self._cache.clear()

    def cleanup(self) -> int:
        """
        Prune entries whose referents have been collected.

        Returns:
            Number of entries removed.
        """
        with self._lock:
            stale = [k for k, ref in self._cache.items() if ref() is None]
            for k in stale:
                del self._cache[k]
            return len(stale)
class TimeBasedCache(Generic[T]):
    """
    Cache whose entries expire after a fixed time-to-live.

    Expired entries are evicted lazily on lookup, or in bulk via cleanup().
    """

    def __init__(self, ttl_seconds: int = 3600):
        """
        Args:
            ttl_seconds: Entry lifetime in seconds.
        """
        self.ttl = timedelta(seconds=ttl_seconds)
        self._cache: Dict[str, tuple[T, datetime]] = {}
        self._lock = threading.RLock()
        self.stats = CacheStats()

    def get(self, key: str) -> Optional[T]:
        """
        Return the cached value if present and younger than the TTL.

        Args:
            key: Cache key

        Returns:
            Cached value, or None when missing or expired.
        """
        started = time.time()
        with self._lock:
            entry = self._cache.get(key)
            if entry is not None:
                value, stored_at = entry
                if datetime.now() - stored_at < self.ttl:
                    self.stats.hits += 1
                    self.stats.total_time += time.time() - started
                    return value
                # Past its TTL: evict and fall through to a miss.
                del self._cache[key]
                self.stats.evictions += 1
            self.stats.misses += 1
            self.stats.total_time += time.time() - started
            return None

    def put(self, key: str, value: T) -> None:
        """Store value under key, stamped with the current time."""
        with self._lock:
            self._cache[key] = (value, datetime.now())

    def clear(self) -> None:
        """Remove every entry."""
        with self._lock:
            self._cache.clear()

    def cleanup(self) -> int:
        """
        Evict all expired entries in one pass.

        Returns:
            Number of entries removed.
        """
        with self._lock:
            now = datetime.now()
            expired = [
                k for k, (_, stored_at) in self._cache.items()
                if now - stored_at >= self.ttl
            ]
            for k in expired:
                del self._cache[k]
                self.stats.evictions += 1
            return len(expired)
def cached(cache: LRUCache, key_func: Optional[Callable] = None):
    """
    Decorator that memoizes a function's results in the given cache.

    Args:
        cache: Cache instance to use (anything with get/put).
        key_func: Optional callable mapping the call's arguments to a
            cache key; defaults to a string built from the function name
            and repr of the arguments.

    Returns:
        Decorated function.

    NOTE(review): cache.get returning None is treated as a miss, so a
    function that legitimately returns None is recomputed on every call.
    """
    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            # Build the cache key
            if key_func is not None:
                cache_key = key_func(*args, **kwargs)
            else:
                cache_key = f"{func.__name__}:{str(args)}:{str(sorted(kwargs.items()))}"

            # Serve from cache when possible
            hit = cache.get(cache_key)
            if hit is not None:
                return hit

            # Compute, remember, return
            value = func(*args, **kwargs)
            cache.put(cache_key, value)
            return value
        return wrapper
    return decorator
class CacheManager:
    """
    Owns every cache used by the parser and provides centralized
    statistics, clearing, and cleanup across all of them.
    """

    def __init__(self):
        """Create all parser caches and register them by name."""
        self.style_cache = LRUCache[dict](max_size=5000)     # parsed style strings
        self.header_cache = LRUCache[bool](max_size=2000)    # header detection results
        self.pattern_cache = LRUCache[bool](max_size=10000)  # pattern-match results
        self.node_cache = WeakCache()                        # parsed node references
        self.regex_cache = LRUCache[Any](max_size=500)       # compiled regexes

        # Registry used by the management helpers below.
        self._caches = {
            'style': self.style_cache,
            'header': self.header_cache,
            'pattern': self.pattern_cache,
            'node': self.node_cache,
            'regex': self.regex_cache
        }

    def get_stats(self) -> Dict[str, CacheStats]:
        """Return per-cache statistics for every cache exposing `stats`."""
        stats = {}
        for name, cache in self._caches.items():
            if hasattr(cache, 'stats'):
                stats[name] = cache.stats
        return stats

    def reset_stats(self) -> None:
        """Zero the statistics of every cache that tracks them."""
        for cache in self._caches.values():
            if hasattr(cache, 'stats'):
                cache.stats.reset()

    def clear_all(self) -> None:
        """Empty every managed cache."""
        for cache in self._caches.values():
            cache.clear()

    def cleanup(self) -> Dict[str, int]:
        """
        Drop dead/expired entries where a cache supports it.

        Returns:
            Mapping of cache name to number of entries removed.
        """
        removed = {}
        # Only the weak node cache supports cleanup today.
        if hasattr(self.node_cache, 'cleanup'):
            removed['node'] = self.node_cache.cleanup()
        return removed

    def get_memory_usage(self) -> Dict[str, int]:
        """
        Roughly estimate per-cache memory usage in bytes.

        Keys and sizeable values are measured with sys.getsizeof; values
        without __sizeof__ fall back to a flat 1000-byte guess.
        """
        import sys

        usage = {}
        for name, cache in self._caches.items():
            if not hasattr(cache, '_cache'):
                continue
            total = 0
            if isinstance(cache._cache, dict):
                for key, value in cache._cache.items():
                    total += sys.getsizeof(key)
                    if hasattr(value, '__sizeof__'):
                        total += sys.getsizeof(value)
                    else:
                        total += 1000  # Default estimate
            usage[name] = total
        return usage
# Global cache manager instance, lazily created by get_cache_manager().
_cache_manager = None
def get_cache_manager() -> CacheManager:
    """Get the global cache manager instance.

    Creates the singleton on first call. There is no locking around the
    check-then-create, so concurrent first calls could briefly construct
    two managers; only one ends up stored.
    """
    global _cache_manager
    if _cache_manager is None:
        _cache_manager = CacheManager()
    return _cache_manager

View File

@@ -0,0 +1,277 @@
"""
Currency column merger for handling separated currency symbols in SEC filings.
"""
import re
from typing import List, Tuple
from edgar.documents.table_nodes import Cell
from edgar.documents.utils.table_matrix import TableMatrix, MatrixCell
class CurrencyColumnMerger:
    """
    Detects and merges currency symbol columns with their value columns.

    SEC filings often split currency values into two cells:
    - Cell 1: "$" (left-aligned)
    - Cell 2: "224.11" (right-aligned)

    This class detects that pattern and merges the pair into "$224.11".

    Typical usage:
        merger = CurrencyColumnMerger(matrix)
        merged = merger.apply_merges()
    """

    # Common currency symbols.
    # FIX: the euro and rupee symbols had been corrupted into empty strings
    # by an encoding mishap, which made '' pass `text in CURRENCY_SYMBOLS`
    # checks in _verify_pairing and _merge_cell_content.
    CURRENCY_SYMBOLS = {'$', '€', '£', '¥', '₹', 'Rs', 'USD', 'EUR', 'GBP'}

    # Pattern for numeric values (with commas, decimals)
    NUMERIC_PATTERN = re.compile(r'^[\d,]+\.?\d*$')

    def __init__(self, matrix: TableMatrix):
        """Initialize with the table matrix whose columns will be analyzed."""
        self.matrix = matrix
        # (symbol_col, value_col) pairs found by detect_currency_pairs()
        self.merge_pairs: List[Tuple[int, int]] = []

    def detect_currency_pairs(self) -> List[Tuple[int, int]]:
        """
        Detect column pairs that should be merged (currency symbol + value).

        Returns:
            List of (symbol_col, value_col) pairs to merge
        """
        pairs = []

        for col_idx in range(self.matrix.col_count - 1):
            if self._is_currency_column(col_idx):
                next_col = col_idx + 1
                if self._is_numeric_column(next_col):
                    # Only merge when the two columns are consistently paired
                    if self._verify_pairing(col_idx, next_col):
                        pairs.append((col_idx, next_col))

        self.merge_pairs = pairs
        return pairs

    def _is_currency_column(self, col_idx: int) -> bool:
        """
        Check whether a column contains (almost) only currency symbols.

        A currency column typically:
        - Contains only currency symbols or empty cells
        - Has very narrow width (1-3 characters)
        - Is left-aligned (though we check content, not style)
        """
        currency_count = 0
        empty_count = 0
        other_count = 0

        for row_idx in range(self.matrix.row_count):
            cell = self.matrix.matrix[row_idx][col_idx]
            if cell.original_cell and not cell.is_spanned:
                text = cell.original_cell.text().strip()

                # Skip header rows (first 2 rows typically) unless the
                # header cell itself is a currency symbol
                if row_idx < 2 and text and text not in self.CURRENCY_SYMBOLS:
                    continue

                if not text:
                    empty_count += 1
                elif text in self.CURRENCY_SYMBOLS:
                    currency_count += 1
                else:
                    other_count += 1

        # Column should be mostly currency symbols with some empty cells;
        # header rows were excluded above.
        total_non_empty = currency_count + other_count
        if total_non_empty == 0:
            return False

        # Accept when every non-empty, non-header cell is a symbol, or when
        # at least 60% are (with a minimum of two symbols).
        return (currency_count >= 1 and other_count == 0) or \
               (currency_count >= 2 and currency_count / total_non_empty >= 0.6)

    def _is_numeric_column(self, col_idx: int) -> bool:
        """
        Check whether a column (excluding the first two header rows)
        contains mostly numeric values.
        """
        numeric_count = 0
        non_empty_count = 0

        for row_idx in range(self.matrix.row_count):
            cell = self.matrix.matrix[row_idx][col_idx]
            if cell.original_cell and not cell.is_spanned:
                # Skip header rows
                if row_idx < 2:
                    continue

                text = cell.original_cell.text().strip()
                if text:
                    non_empty_count += 1
                    # Strip thousands separators and accounting decoration,
                    # then test against the numeric pattern
                    clean_text = text.replace(',', '').replace('%', '').replace('(', '').replace(')', '')
                    if self.NUMERIC_PATTERN.match(clean_text):
                        numeric_count += 1

        if non_empty_count == 0:
            return False

        # At least 60% should be numeric
        return numeric_count / non_empty_count >= 0.6

    def _verify_pairing(self, symbol_col: int, value_col: int) -> bool:
        """
        Verify that symbol and value columns are consistently paired:
        when a symbol appears, a value should appear next to it
        (header rows excepted).
        """
        paired_rows = 0
        mismatched_rows = 0

        for row_idx in range(self.matrix.row_count):
            symbol_cell = self.matrix.matrix[row_idx][symbol_col]
            value_cell = self.matrix.matrix[row_idx][value_col]

            if symbol_cell.original_cell and value_cell.original_cell:
                symbol_text = symbol_cell.original_cell.text().strip()
                value_text = value_cell.original_cell.text().strip()

                if symbol_text in self.CURRENCY_SYMBOLS and value_text:
                    # Symbol and value together: a proper pair
                    paired_rows += 1
                elif symbol_text in self.CURRENCY_SYMBOLS and not value_text:
                    # Symbol without value - tolerated in header rows only
                    if row_idx >= 2:
                        mismatched_rows += 1
                # Both empty, or a value without a symbol (continuation
                # rows), are acceptable and counted as neither.

        # Should have more paired than mismatched
        return paired_rows > mismatched_rows

    def apply_merges(self) -> 'TableMatrix':
        """
        Create a new matrix with currency columns merged.

        Returns:
            New TableMatrix with merged columns, or the original matrix
            unchanged when no merges are needed.
        """
        if not self.merge_pairs:
            self.detect_currency_pairs()

        if not self.merge_pairs:
            # No merges needed
            return self.matrix

        # Each merge removes its (leading) symbol column
        new_col_count = self.matrix.col_count - len(self.merge_pairs)

        # Columns being merged away / receiving merged content
        merged_cols = set(pair[0] for pair in self.merge_pairs)  # symbol columns to drop
        value_cols = set(pair[1] for pair in self.merge_pairs)   # value columns receiving merges

        # Map each surviving old column index to its new position
        old_to_new = {}
        new_col = 0
        for old_col in range(self.matrix.col_count):
            if old_col in merged_cols:
                # This column will be merged into the next one; skip it
                continue
            old_to_new[old_col] = new_col
            new_col += 1

        # Create the new matrix shell
        new_matrix = TableMatrix()
        new_matrix.row_count = self.matrix.row_count
        new_matrix.col_count = new_col_count
        new_matrix.matrix = []

        # Build new matrix with merged cells
        for row_idx in range(self.matrix.row_count):
            new_row = [MatrixCell() for _ in range(new_col_count)]

            for old_col in range(self.matrix.col_count):
                # Is this a symbol column that merges into its neighbor?
                merge_pair = next((pair for pair in self.merge_pairs if pair[0] == old_col), None)

                if merge_pair:
                    # Merge symbol with value
                    symbol_col, value_col = merge_pair
                    symbol_cell = self.matrix.matrix[row_idx][symbol_col]
                    value_cell = self.matrix.matrix[row_idx][value_col]

                    if value_cell.original_cell:
                        # Produce merged content like "$224.11"
                        new_cell_content = self._merge_cell_content(symbol_cell, value_cell)
                        if new_cell_content:
                            # Carry span/header/align metadata from the value cell
                            merged_cell = Cell(
                                content=new_cell_content,
                                colspan=value_cell.original_cell.colspan,
                                rowspan=value_cell.original_cell.rowspan,
                                is_header=value_cell.original_cell.is_header,
                                align=value_cell.original_cell.align
                            )

                            new_col_idx = old_to_new.get(value_col)
                            if new_col_idx is not None:
                                new_row[new_col_idx] = MatrixCell(
                                    original_cell=merged_cell,
                                    is_spanned=False,
                                    row_origin=row_idx,
                                    col_origin=new_col_idx
                                )
                elif old_col not in value_cols:
                    # Regular column, not involved in merging: copy through
                    new_col_idx = old_to_new.get(old_col)
                    if new_col_idx is not None:
                        new_row[new_col_idx] = self.matrix.matrix[row_idx][old_col]

            new_matrix.matrix.append(new_row)

        return new_matrix

    def _merge_cell_content(self, symbol_cell: MatrixCell, value_cell: MatrixCell) -> str:
        """
        Merge symbol and value cell contents.

        Returns:
            Merged content like "$224.11"; the bare value when no symbol
            is recognized; or the bare symbol when there is no value.
        """
        value_text = value_cell.original_cell.text().strip() if value_cell.original_cell else ""
        symbol_text = symbol_cell.original_cell.text().strip() if symbol_cell.original_cell else ""

        if not value_text:
            return symbol_text  # Just return symbol if no value

        if symbol_text in self.CURRENCY_SYMBOLS:
            # Prefix the symbol directly, with no separator
            return f"{symbol_text}{value_text}"

        # No recognized symbol: just return the value
        return value_text

    def get_merge_summary(self) -> str:
        """Get a human-readable summary of the merges to be applied."""
        if not self.merge_pairs:
            return "No currency column merges detected"

        summary = f"Currency merges detected: {len(self.merge_pairs)} pairs\n"
        for symbol_col, value_col in self.merge_pairs:
            summary += f"  • Column {symbol_col} ($) + Column {value_col} (value)\n"
        return summary

View File

@@ -0,0 +1,96 @@
"""
HTML utility functions for document parsing.
This module consolidates common HTML processing utilities used across
the parser, preprocessor, and simple parser implementations.
"""
import lxml.html
from typing import Optional
def remove_xml_declaration(html: str) -> str:
    """
    Remove XML declaration from HTML if present.

    SEC HTML documents sometimes include XML declarations like:
        <?xml version="1.0" encoding="UTF-8"?>

    These can interfere with HTML parsing and are safely removed since
    the encoding is handled separately by the parser.

    Args:
        html: HTML string that may contain XML declaration

    Returns:
        HTML string with XML declaration removed (if present). Input with
        an unterminated declaration is returned unchanged.

    Examples:
        >>> remove_xml_declaration('<?xml version="1.0"?><!DOCTYPE html><html></html>')
        '<!DOCTYPE html><html></html>'
        >>> remove_xml_declaration('<!DOCTYPE html><html></html>')
        '<!DOCTYPE html><html></html>'
    """
    if html.strip().startswith('<?xml'):
        terminator = html.find('?>')
        # BUG FIX: previously a missing '?>' made find() return -1, and the
        # unconditional `html[-1 + 2:]` silently chopped off the first
        # character of the document. Now an unterminated declaration leaves
        # the input untouched.
        if terminator != -1:
            return html[terminator + 2:]
    return html
def create_lxml_parser(
    remove_blank_text: bool = True,
    remove_comments: bool = True,
    recover: bool = True,
    encoding: Optional[str] = 'utf-8'
) -> lxml.html.HTMLParser:
    """
    Build an lxml HTMLParser with the project's standard settings.

    This factory keeps parser configuration consistent across the parser,
    preprocessor, and simple-parser implementations.

    Args:
        remove_blank_text: Drop whitespace-only text nodes for a cleaner tree.
        remove_comments: Strip HTML comments from the parsed tree.
        recover: Keep parsing through malformed HTML.
        encoding: Character encoding passed to the parser. Pass None to let
            lxml auto-detect the encoding.

    Returns:
        Configured lxml.html.HTMLParser instance.

    Examples:
        >>> parser = create_lxml_parser()                  # standard settings
        >>> parser = create_lxml_parser(remove_blank_text=False,
        ...                             remove_comments=False)  # preserve all
        >>> parser = create_lxml_parser(encoding=None)     # auto-detect

    Note:
        recover=True is critical for SEC documents, which often contain
        non-standard HTML structures.
    """
    options = {
        'remove_blank_text': remove_blank_text,
        'remove_comments': remove_comments,
        'recover': recover,
    }
    # Omitting the key entirely leaves encoding handling to lxml
    if encoding is not None:
        options['encoding'] = encoding
    return lxml.html.HTMLParser(**options)

View File

@@ -0,0 +1,375 @@
"""
Streaming parser for large HTML documents.
"""
import io
from typing import Dict, Any, TYPE_CHECKING
from lxml import etree
from lxml.html import HtmlElement
from edgar.documents.config import ParserConfig
from edgar.documents.exceptions import HTMLParsingError, DocumentTooLargeError
# Use TYPE_CHECKING to avoid circular imports
if TYPE_CHECKING:
from edgar.documents.document import Document, DocumentMetadata
from edgar.documents.nodes import DocumentNode, HeadingNode, ParagraphNode, TextNode, SectionNode, ContainerNode
from edgar.documents.table_nodes import TableNode
from edgar.documents.types import SemanticType
class StreamingParser:
    """
    Streaming parser for large HTML documents.

    Processes documents in chunks to minimize memory usage
    while maintaining parse quality.
    """

    # Chunk size for streaming (1MB)
    CHUNK_SIZE = 1024 * 1024

    # Maximum number of buffered nodes before they are flushed into the
    # document tree (see _flush_buffer)
    MAX_NODE_BUFFER = 1000
def __init__(self, config: ParserConfig, strategies: Dict[str, Any]):
    """
    Initialize streaming parser.

    Args:
        config: Parser configuration
        strategies: Parsing strategies to use. Keys consulted elsewhere in
            this class: 'header_detection', 'style_parser',
            'table_processing'.
    """
    self.config = config
    self.strategies = strategies
    self._reset_state()
def _reset_state(self):
    """Reset all per-document parser state before a new parse."""
    # Import here to avoid circular import
    from edgar.documents.document import DocumentMetadata
    from edgar.documents.nodes import DocumentNode
    self.current_section = None      # SectionNode currently open, if any
    self.node_buffer = []            # parsed nodes awaiting _flush_buffer
    self.metadata = DocumentMetadata()
    self.root = DocumentNode()
    self.current_parent = self.root  # node new children attach to
    self.tag_stack = []              # names of currently open tags, innermost last
    self.text_buffer = []            # stray text fragments collected on end tags
    self.in_table = False            # True between <table> start and end events
    self.table_buffer = []
    self.bytes_processed = 0         # running serialized-size total for the size limit
def parse(self, html: str) -> "Document":
    """
    Parse HTML in streaming mode.

    Args:
        html: HTML content to parse

    Returns:
        Parsed Document

    Raises:
        DocumentTooLargeError: If document exceeds size limit
        HTMLParsingError: If parsing fails
    """
    self._reset_state()

    # Store original HTML BEFORE parsing (needed for TOC-based section detection)
    original_html = html

    try:
        # Create streaming parser over an in-memory byte stream
        parser = etree.iterparse(
            io.BytesIO(html.encode('utf-8')),
            events=('start', 'end'),
            html=True,
            recover=True,
            encoding='utf-8'
        )

        # Process events
        for event, elem in parser:
            self._process_event(event, elem)

            # Check size limit.
            # NOTE(review): this serializes each element (with its subtree)
            # on BOTH its start and end events, so bytes_processed
            # overcounts the real document size — confirm intent.
            self.bytes_processed += len(etree.tostring(elem, encoding='unicode', method='html'))
            if self.bytes_processed > self.config.max_document_size:
                raise DocumentTooLargeError(self.bytes_processed, self.config.max_document_size)

            # Flush buffer if needed
            if len(self.node_buffer) >= self.MAX_NODE_BUFFER:
                self._flush_buffer()

            # Clean up processed elements to save memory: clear this
            # element, then drop already-processed preceding siblings
            elem.clear()
            while elem.getprevious() is not None:
                parent = elem.getparent()
                if parent is not None:
                    del parent[0]
                else:
                    break

        # Final flush
        self._flush_buffer()

        # Store original HTML in metadata for section detection (TOC analysis)
        self.metadata.original_html = original_html

        # Create document (import here to avoid circular import)
        from edgar.documents.document import Document
        document = Document(root=self.root, metadata=self.metadata)

        # Store config reference (required for section detection)
        document._config = self.config

        # Apply post-processing
        from edgar.documents.processors.postprocessor import DocumentPostprocessor
        postprocessor = DocumentPostprocessor(self.config)
        document = postprocessor.process(document)

        return document

    except etree.ParseError as e:
        # NOTE(review): verify lxml's etree actually exposes ParseError
        # here — lxml syntax failures are typically etree.XMLSyntaxError.
        raise HTMLParsingError(f"Streaming parse failed: {str(e)}")
    except Exception as e:
        if isinstance(e, (DocumentTooLargeError, HTMLParsingError)):
            raise
        raise HTMLParsingError(f"Unexpected error during streaming parse: {str(e)}")
def _process_event(self, event: str, elem: HtmlElement):
    """Dispatch a single iterparse event to the matching tag handler."""
    dispatch = {
        'start': self._handle_start_tag,
        'end': self._handle_end_tag,
    }
    handler = dispatch.get(event)
    if handler is not None:
        handler(elem)
def _handle_start_tag(self, elem: HtmlElement):
    """Handle opening tag: track nesting, harvest metadata, open nodes."""
    # Import node types at runtime to avoid circular imports
    from edgar.documents.nodes import ContainerNode

    # NOTE(review): lxml comment/PI nodes have a non-string .tag; confirm
    # they never reach this handler with html=True/recover=True.
    tag = elem.tag.lower()

    # Track tag stack
    self.tag_stack.append(tag)

    # Extract metadata from early elements
    if tag == 'title' and elem.text:
        self._extract_title_metadata(elem.text)
    elif tag == 'meta':
        self._extract_meta_metadata(elem)

    # Handle specific tags
    if tag == 'body':
        # Create a container for body content; subsequent nodes attach here
        body_container = ContainerNode(tag_name='body')
        self.root.add_child(body_container)
        self.current_parent = body_container
    elif tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
        self._start_heading(elem)
    elif tag == 'p':
        self._start_paragraph(elem)
    elif tag == 'table':
        self._start_table(elem)
    elif tag == 'section':
        self._start_section(elem)
def _handle_end_tag(self, elem: HtmlElement):
    """Handle closing tag: close open constructs and collect stray text."""
    tag = elem.tag.lower()

    # Remove from tag stack (only when it matches the innermost open tag)
    if self.tag_stack and self.tag_stack[-1] == tag:
        self.tag_stack.pop()

    # Handle specific tags
    if tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
        self._end_heading(elem)
    elif tag == 'p':
        self._end_paragraph(elem)
    elif tag == 'table':
        self._end_table(elem)
    elif tag == 'section':
        self._end_section(elem)
    elif tag == 'body':
        # When body ends, flush any remaining nodes
        self._flush_buffer()

    # Handle text content.
    # NOTE(review): this runs AFTER the handlers above (which clear
    # text_buffer), so heading/paragraph text is re-buffered here, and the
    # buffer is never read in this file — confirm it is consumed elsewhere.
    if elem.text:
        self.text_buffer.append(elem.text.strip())
    if elem.tail:
        self.text_buffer.append(elem.tail.strip())
def _start_heading(self, elem: HtmlElement):
    """Open a HeadingNode for an <h1>-<h6> element and buffer it."""
    # Import at runtime to avoid circular imports.
    # FIX: SemanticType was imported only under TYPE_CHECKING at module
    # level, so the runtime reference below raised NameError; import it
    # here alongside HeadingNode.
    from edgar.documents.nodes import HeadingNode
    from edgar.documents.types import SemanticType

    level = int(elem.tag[1])  # 'h3' -> 3
    text = self._get_text_content(elem)

    # Create heading node
    heading = HeadingNode(
        level=level,
        content=text
    )

    # Check if this is a section header
    if self.strategies.get('header_detection'):
        detector = self.strategies['header_detection']
        if detector.is_section_header(text, elem):
            heading.semantic_type = SemanticType.SECTION_HEADER

    self.node_buffer.append(heading)
def _end_heading(self, elem: HtmlElement):
    """Finalize the buffered heading with the element's complete text."""
    from edgar.documents.nodes import HeadingNode  # runtime import: avoids circular import

    content = self._get_text_content(elem)
    buffer_tail = self.node_buffer[-1] if self.node_buffer else None
    if content and isinstance(buffer_tail, HeadingNode):
        buffer_tail.content = content

    # Discard stray text accumulated while inside the heading
    self.text_buffer.clear()
def _start_paragraph(self, elem: HtmlElement):
    """Open a ParagraphNode, parsing its inline style when a parser is configured."""
    from edgar.documents.nodes import ParagraphNode  # runtime import: avoids circular import

    node = ParagraphNode()

    # Attach parsed inline style, if any
    style_attr = elem.get('style')
    style_parser = self.strategies.get('style_parser')
    if style_attr and style_parser:
        node.style = style_parser.parse(style_attr)

    self.node_buffer.append(node)
def _end_paragraph(self, elem: HtmlElement):
    """Finish a paragraph by attaching its text as a child TextNode."""
    # Runtime import keeps module-level imports cycle-free.
    from edgar.documents.nodes import ParagraphNode, TextNode

    final_text = self._get_text_content(elem)
    buffer_tail = self.node_buffer[-1] if self.node_buffer else None
    if final_text and isinstance(buffer_tail, ParagraphNode):
        buffer_tail.add_child(TextNode(content=final_text))
    # Loose text collected while inside the paragraph is now stale.
    self.text_buffer.clear()
def _start_table(self, elem: HtmlElement):
    """Start processing a table."""
    # Enter table mode so nested content is routed to the table buffer.
    self.in_table = True
    self.table_buffer = []
    # Store table element for later processing
    self.table_elem = elem
def _end_table(self, elem: HtmlElement):
    """End processing a table."""
    # Import node types at runtime to avoid circular imports
    from edgar.documents.table_nodes import TableNode
    self.in_table = False
    # Process table with table processor if available
    if self.strategies.get('table_processing'):
        processor = self.strategies['table_processing']
        table_node = processor.process(elem)
        # NOTE(review): if the processor returns a falsy result the table is
        # dropped entirely (no fallback TableNode) -- confirm this is intended.
        if table_node:
            self.node_buffer.append(table_node)
    else:
        # Basic table node
        table = TableNode()
        self.node_buffer.append(table)
    self.table_buffer.clear()
def _start_section(self, elem: HtmlElement):
    """Begin a section by buffering a SectionNode and making it current."""
    # Runtime import keeps module-level imports cycle-free.
    from edgar.documents.nodes import SectionNode

    node = SectionNode()
    # Carry over identifying attributes so later stages can use them.
    for attr in ('id', 'class'):
        value = elem.get(attr)
        if value:
            node.metadata[attr] = value

    # Nodes flushed from now on nest under this section.
    self.current_section = node
    self.node_buffer.append(node)
def _end_section(self, elem: HtmlElement):
    """End processing a section."""
    # Subsequent flushed nodes attach to the top-level parent again.
    self.current_section = None
def _flush_buffer(self):
"""Flush node buffer to document tree."""
for node in self.node_buffer:
# Add to current parent
if self.current_section:
self.current_section.add_child(node)
else:
self.current_parent.add_child(node)
self.node_buffer.clear()
def _get_text_content(self, elem: HtmlElement) -> str:
    """Recursively collect an element's text content, space-joined."""
    pieces = []
    if elem.text:
        pieces.append(elem.text.strip())
    for child in elem:
        # Depth-first: the child's own text first, then its tail text.
        nested = self._get_text_content(child)
        if nested:
            pieces.append(nested)
        if child.tail:
            pieces.append(child.tail.strip())
    return ' '.join(pieces)
def _extract_title_metadata(self, title: str):
"""Extract metadata from title."""
# Example: "APPLE INC - 10-K - 2023-09-30"
parts = title.split(' - ')
if len(parts) >= 2:
self.metadata.company = parts[0].strip()
self.metadata.form = parts[1].strip()
if len(parts) >= 3:
self.metadata.filing_date = parts[2].strip()
def _extract_meta_metadata(self, elem: HtmlElement):
    """Copy recognized meta tag name/content pairs into metadata fields."""
    # Maps lower-cased meta-tag names onto metadata attribute names.
    field_for_name = {
        'company': 'company',
        'filing-type': 'form',
        'cik': 'cik',
        'filing-date': 'filing_date',
        'accession-number': 'accession_number',
    }
    name = elem.get('name', '').lower()
    content = elem.get('content', '')
    target = field_for_name.get(name)
    # Unrecognized names (or missing name/content) are ignored.
    if name and content and target:
        setattr(self.metadata, target, content)

View File

@@ -0,0 +1,858 @@
"""
Table matrix builder for handling complex colspan/rowspan structures.
"""
from dataclasses import dataclass
from typing import List, Optional
from edgar.documents.table_nodes import Cell, Row
@dataclass
class MatrixCell:
    """Cell in the matrix with reference to original cell"""
    # The source Cell this grid position maps back to; None for an empty slot.
    original_cell: Optional[Cell] = None
    is_spanned: bool = False  # True if this is part of a colspan/rowspan
    row_origin: int = -1  # Original row index
    col_origin: int = -1  # Original column index
class TableMatrix:
    """
    Build a 2D matrix representation of table with proper handling of merged cells.

    This class converts a table with colspan/rowspan into a regular 2D grid
    where each merged cell occupies multiple positions in the matrix.
    """

    def __init__(self):
        """Initialize empty matrix"""
        # Grid of MatrixCell entries, indexed as matrix[row][column].
        self.matrix: List[List[MatrixCell]] = []
        self.row_count = 0
        self.col_count = 0
        self.header_row_count = 0  # Track number of header rows
def build_from_rows(self, header_rows: List[List[Cell]], data_rows: List[Row]) -> 'TableMatrix':
    """
    Build matrix from header rows and data rows.

    Args:
        header_rows: List of header rows (each row is a list of Cells)
        data_rows: List of Row objects

    Returns:
        Self for chaining
    """
    # Remember how many of the leading rows are headers.
    self.header_row_count = len(header_rows)

    # Headers first, then the cell lists of each data row.
    combined = list(header_rows) + [row.cells for row in data_rows]
    if not combined:
        return self

    self.row_count = len(combined)

    # Pass 1: work out how many columns the grid needs (colspan-aware).
    self._calculate_dimensions(combined)

    # Allocate an empty grid of that size.
    self.matrix = [[MatrixCell() for _ in range(self.col_count)]
                   for _ in range(self.row_count)]

    # Pass 2: drop every cell into its grid position(s).
    self._place_cells(combined)

    return self
def _calculate_dimensions(self, rows: List[List[Cell]]):
    """Calculate the actual dimensions considering colspan"""
    max_cols = 0
    for row_idx, row in enumerate(rows):
        col_pos = 0
        for cell in row:
            # Skip positions that might be occupied by rowspan from above
            # NOTE(review): _is_occupied consults self.matrix, which has not
            # yet been (re)built at this point in build_from_rows -- confirm
            # whether this rowspan skip ever takes effect here.
            while col_pos < max_cols and self._is_occupied(row_idx, col_pos):
                col_pos += 1
            # This cell will occupy from col_pos to col_pos + colspan
            col_end = col_pos + cell.colspan
            max_cols = max(max_cols, col_end)
            col_pos = col_end
    self.col_count = max_cols
def _is_occupied(self, row: int, col: int) -> bool:
    """Check if a position is occupied by a cell from a previous row (rowspan)"""
    # Nothing can span into the first row.
    if row == 0:
        return False
    # Scan every row above for an originating cell whose rowspan reaches `row`.
    for prev_idx, grid_row in enumerate(self.matrix[:row]):
        if col >= len(grid_row):
            continue
        candidate = grid_row[col]
        origin = candidate.original_cell
        # Only cells that originate on that row count, not spanned copies.
        if origin and candidate.row_origin == prev_idx:
            if prev_idx + origin.rowspan > row:
                return True
    return False
def _place_cells(self, rows: List[List[Cell]]):
    """Place cells in the matrix handling colspan and rowspan"""
    for row_idx, row in enumerate(rows):
        col_pos = 0
        # NOTE: cell_idx is currently unused; kept for debugging context.
        for cell_idx, cell in enumerate(row):
            # Find next available column position
            while col_pos < self.col_count and self.matrix[row_idx][col_pos].original_cell is not None:
                col_pos += 1

            if col_pos >= self.col_count:
                # Need to expand matrix
                self._expand_columns(col_pos + cell.colspan)

            # Special handling for cells with colspan > 1 containing numeric values
            # Only apply this logic for Table 15-style alignment issues
            # Check if this looks like a financial value that should be right-aligned
            cell_text = cell.text().strip()

            # Check for numeric values that need special alignment
            # This is specifically for cases like "167,045" that should align with "$167,045"
            has_comma_separator = ',' in cell_text
            digit_ratio = sum(c.isdigit() for c in cell_text) / len(cell_text) if cell_text else 0

            # Only apply special placement for colspan=2 numeric values in data rows
            # This handles Table 15's specific case without breaking Table 13
            is_special_numeric = (cell.colspan == 2 and  # Specifically colspan=2
                                  has_comma_separator and
                                  digit_ratio > 0.5 and  # More than 50% digits
                                  not cell_text.startswith('$') and
                                  not any(month in cell_text.lower() for month in
                                          ['jan', 'feb', 'mar', 'apr', 'may', 'jun',
                                           'jul', 'aug', 'sep', 'oct', 'nov', 'dec']) and
                                  row_idx > 1)  # Not a header row (allow for multi-row headers)

            if is_special_numeric:
                # Place empty cell at first position, content at second position
                # This is specifically for Table 15 alignment
                for r in range(cell.rowspan):
                    # First column of span: empty
                    if row_idx + r < self.row_count and col_pos < self.col_count:
                        self.matrix[row_idx + r][col_pos] = MatrixCell()
                    # Second column of span: the actual content
                    if row_idx + r < self.row_count and col_pos + 1 < self.col_count:
                        matrix_cell = MatrixCell(
                            original_cell=cell,
                            is_spanned=False,
                            row_origin=row_idx,
                            col_origin=col_pos + 1
                        )
                        self.matrix[row_idx + r][col_pos + 1] = matrix_cell
                    # Remaining columns of span: mark as spanned (though colspan=2 has no remaining)
                    for c in range(2, cell.colspan):
                        if row_idx + r < self.row_count and col_pos + c < self.col_count:
                            matrix_cell = MatrixCell(
                                original_cell=cell,
                                is_spanned=True,
                                row_origin=row_idx,
                                col_origin=col_pos + 1
                            )
                            self.matrix[row_idx + r][col_pos + c] = matrix_cell
            else:
                # Normal placement for other cells: the origin position holds
                # the cell, every other covered position is a spanned copy.
                for r in range(cell.rowspan):
                    for c in range(cell.colspan):
                        if row_idx + r < self.row_count and col_pos + c < self.col_count:
                            matrix_cell = MatrixCell(
                                original_cell=cell,
                                is_spanned=(r > 0 or c > 0),
                                row_origin=row_idx,
                                col_origin=col_pos
                            )
                            self.matrix[row_idx + r][col_pos + c] = matrix_cell

            col_pos += cell.colspan
def _expand_columns(self, new_col_count: int):
    """Expand matrix to accommodate more columns"""
    extra = new_col_count - self.col_count
    # Already wide enough -- nothing to do.
    if extra <= 0:
        return
    # Pad every existing row with fresh empty cells.
    for grid_row in self.matrix:
        grid_row.extend(MatrixCell() for _ in range(extra))
    self.col_count = new_col_count
def get_actual_columns(self) -> int:
    """Get the actual number of data columns (excluding empty/spacing columns)"""
    populated = 0
    for col_idx in range(self.col_count):
        # A column counts as soon as one originating cell has real text.
        for row_idx in range(self.row_count):
            entry = self.matrix[row_idx][col_idx]
            if entry.original_cell and not entry.is_spanned:
                content = entry.original_cell.text().strip()
                if content and content not in ['', ' ', '\xa0']:
                    populated += 1
                    break
    return populated
def get_column_widths(self) -> List[float]:
    """Estimate column widths based on content"""
    widths = []
    for col_idx in range(self.col_count):
        longest = 0
        populated = 0
        for row_idx in range(self.row_count):
            entry = self.matrix[row_idx][col_idx]
            if entry.original_cell and not entry.is_spanned:
                content = entry.original_cell.text().strip()
                if content:
                    longest = max(longest, len(content))
                    populated += 1
        # A column with no content at all is likely pure spacing: width 0.
        widths.append(longest if populated else 0)
    return widths
def get_cell(self, row_idx: int, col_idx: int) -> Optional[Cell]:
    """
    Get a cell at specific position in the matrix.

    Args:
        row_idx: Row index
        col_idx: Column index

    Returns:
        Cell at position or None if out of bounds
    """
    in_bounds = 0 <= row_idx < self.row_count and 0 <= col_idx < self.col_count
    if not in_bounds:
        return None
    occupant = self.matrix[row_idx][col_idx].original_cell
    # Empty grid positions are surfaced as a blank Cell, never None.
    return occupant if occupant else Cell("")
def get_expanded_row(self, row_idx: int) -> List[Optional[Cell]]:
    """
    Get a row with cells expanded to match column count.

    For cells with colspan > 1, the cell appears in the first position
    and None in subsequent positions.
    """
    if row_idx >= self.row_count:
        return []
    row_cells: List[Optional[Cell]] = []
    for entry in self.matrix[row_idx]:
        # Origin cells pass through; spanned copies and gaps become None.
        if entry.original_cell and not entry.is_spanned:
            row_cells.append(entry.original_cell)
        else:
            row_cells.append(None)
    return row_cells
def get_data_columns(self) -> List[int]:
    """
    Get indices of columns that contain actual data (not spacing).
    Uses strategy similar to old parser - keeps single empty columns for spacing.

    Returns:
        List of column indices that contain data
    """
    # First, identify which columns are empty
    empty_cols = []
    for col_idx in range(self.col_count):
        has_content = False
        for row_idx in range(self.row_count):
            cell = self.matrix[row_idx][col_idx]
            if cell.original_cell and not cell.is_spanned:
                text = cell.original_cell.text().strip()
                if text:
                    has_content = True
                    break
        if not has_content:
            empty_cols.append(col_idx)

    # Apply old parser's strategy
    cols_to_remove = set()

    # Remove leading empty columns
    for col in range(self.col_count):
        if col in empty_cols:
            cols_to_remove.add(col)
        else:
            break

    # Remove trailing empty columns
    for col in reversed(range(self.col_count)):
        if col in empty_cols:
            cols_to_remove.add(col)
        else:
            break

    # Remove consecutive empty columns in the middle (keep single empty cols for spacing)
    i = 0
    while i < self.col_count - 1:
        if i in empty_cols and (i + 1) in empty_cols:
            # Found consecutive empty columns
            consecutive_count = 0
            j = i
            while j < self.col_count and j in empty_cols:
                consecutive_count += 1
                j += 1
            # Keep first empty column as spacer, remove the rest
            # (range excludes column i itself, so the spacer survives).
            cols_to_remove.update(range(i + 1, i + consecutive_count))
            i = j
        else:
            i += 1

    # Return columns that are NOT in the removal set
    data_cols = [col for col in range(self.col_count) if col not in cols_to_remove]
    return data_cols
def filter_spacing_columns(self) -> 'TableMatrix':
    """
    Create a new matrix with spacing columns removed.
    Also handles colspan-generated duplicate columns and misalignment.

    Returns:
        New TableMatrix with only data columns

    The method works in phases:
      1. classify columns (primary headers, header content, data);
      2. find adjacent-column "fragments" ($ / ( ... ) / %) to consolidate;
      3. decide which columns to keep; then
      4. rebuild a new matrix, merging consolidated fragments as it goes.
    """
    # First pass: identify primary header columns (those with colspan > 1 headers)
    # and data columns
    primary_header_cols = set()
    all_header_cols = set()
    data_cols = set()

    # Find primary header columns (those that start a colspan)
    for row_idx in range(min(3, self.row_count)):
        for col_idx in range(self.col_count):
            cell = self.matrix[row_idx][col_idx]
            if cell.original_cell and not cell.is_spanned:
                if cell.original_cell.text().strip():
                    all_header_cols.add(col_idx)
                    # Check if this is a primary header (colspan > 1)
                    if cell.original_cell.colspan > 1:
                        primary_header_cols.add(col_idx)

    # If no primary headers found, use all headers as primary
    if not primary_header_cols:
        primary_header_cols = all_header_cols

    # Phase 1.5: Identify columns with header content
    # Any column with non-empty text in ANY header row must be preserved
    # This prevents legitimate header columns from being removed as "spacing"
    # Also preserve columns that are spanned by headers (colspan > 1)
    header_content_columns = set()
    for col_idx in range(self.col_count):
        for row_idx in range(self.header_row_count):
            cell = self.matrix[row_idx][col_idx]
            if cell.original_cell:
                # Check for original header cell with content
                if not cell.is_spanned:
                    text = cell.original_cell.text().strip()
                    if text:
                        header_content_columns.add(col_idx)
                        # Also add all columns spanned by this header
                        if cell.original_cell.colspan > 1:
                            for span_offset in range(1, cell.original_cell.colspan):
                                span_col = col_idx + span_offset
                                if span_col < self.col_count:
                                    header_content_columns.add(span_col)
                        break  # Found content, no need to check other header rows
                # Also preserve columns that are spanned (part of a colspan)
                elif cell.is_spanned:
                    # This column is part of a header's colspan
                    text = cell.original_cell.text().strip()
                    if text:
                        header_content_columns.add(col_idx)

    # Find columns with data (skip header rows)
    # Count actual header rows by checking for non-data content
    actual_header_rows = 0
    for row_idx in range(min(3, self.row_count)):
        has_numeric_data = False
        for col_idx in range(self.col_count):
            cell = self.matrix[row_idx][col_idx]
            if cell.original_cell and not cell.is_spanned:
                text = cell.original_cell.text().strip()
                # Check if it looks like numeric data (has commas or starts with $)
                if text and (',' in text and any(c.isdigit() for c in text)) or text == '$':
                    has_numeric_data = True
                    break
        if has_numeric_data:
            break
        actual_header_rows += 1

    data_start_row = max(1, actual_header_rows)

    # Track columns with significant data (not just isolated cells)
    col_data_count = {}
    for row_idx in range(data_start_row, self.row_count):
        for col_idx in range(self.col_count):
            cell = self.matrix[row_idx][col_idx]
            if cell.original_cell and not cell.is_spanned:
                if cell.original_cell.text().strip():
                    data_cols.add(col_idx)
                    col_data_count[col_idx] = col_data_count.get(col_idx, 0) + 1

    # Build initial list of columns to keep
    # Always include column 0 if it contains row labels
    cols_to_keep = set(primary_header_cols)

    # Add columns with header content (prevents removing legitimate headers)
    cols_to_keep.update(header_content_columns)

    # Identify misaligned data columns that need to be consolidated
    # These are data columns that are not primary header columns
    misaligned_data_cols = data_cols - primary_header_cols

    # Map misaligned data columns to their nearest column for consolidation
    # Only consolidate directly adjacent columns with specific patterns
    consolidation_map = {}

    # First pass: identify all potential consolidations
    potential_consolidations = {}
    for data_col in sorted(misaligned_data_cols):
        # Check if this column should be consolidated with an adjacent column
        # Check the column immediately before this one
        prev_col = data_col - 1

        # Sample some cells to see if consolidation makes sense
        consolidation_type = None
        for row_idx in range(data_start_row, min(data_start_row + 10, self.row_count)):
            prev_cell = self.matrix[row_idx][prev_col] if prev_col >= 0 else None
            curr_cell = self.matrix[row_idx][data_col]

            if prev_cell and prev_cell.original_cell and curr_cell.original_cell:
                prev_text = prev_cell.original_cell.text().strip()
                curr_text = curr_cell.original_cell.text().strip()

                # Skip empty cells
                if not prev_text or not curr_text:
                    continue

                # Check for patterns that indicate consolidation
                if prev_text == '$' and curr_text and curr_text[0].isdigit():
                    consolidation_type = 'currency'
                    break
                elif prev_text.startswith('(') and curr_text == ')':
                    consolidation_type = 'parentheses'
                    break
                elif curr_text == '%' and prev_text and prev_text[-1].isdigit():
                    consolidation_type = 'percentage'
                    break

        if consolidation_type:
            potential_consolidations[data_col] = (prev_col, consolidation_type)

    # Second pass: resolve conflicts
    # If column Y is a target for consolidation from Y+1 (e.g., parentheses),
    # then don't consolidate Y into another column
    columns_needed_as_targets = set()
    for data_col, (target_col, cons_type) in potential_consolidations.items():
        if cons_type == 'parentheses':
            # This target column is needed for parentheses consolidation
            columns_needed_as_targets.add(target_col)

    # Build final consolidation map, skipping consolidations that would remove needed targets
    for data_col, (target_col, cons_type) in potential_consolidations.items():
        # Don't consolidate this column if it's needed as a target for parentheses
        if data_col in columns_needed_as_targets and cons_type != 'parentheses':
            continue
        # CRITICAL: Don't consolidate columns that have header content
        # This prevents legitimate header columns from being merged together
        if data_col in header_content_columns or target_col in header_content_columns:
            continue
        consolidation_map[data_col] = target_col
        # Debug: uncomment to see consolidation mapping
        # import os
        # if os.environ.get('DEBUG_TABLE_CONSOLIDATION'):
        #     print(f"Consolidating column {data_col} into {target_col}")

    # Special case: Keep data columns that are associated with header columns
    # This handles cases where headers span multiple columns but data is in specific columns
    for header_col in primary_header_cols:
        # Check if there's a data column immediately after the header column
        # This is common when headers span multiple columns
        for offset in range(1, 3):  # Check next 1-2 columns
            data_col = header_col + offset
            if data_col in data_cols and data_col not in cols_to_keep:
                # Check if this column has meaningful data
                has_data = False
                for row_idx in range(data_start_row, min(data_start_row + 5, self.row_count)):
                    cell = self.matrix[row_idx][data_col]
                    if cell.original_cell and not cell.is_spanned:
                        text = cell.original_cell.text().strip()
                        if text and text not in ['', '-', '', '']:
                            has_data = True
                            break
                if has_data:
                    cols_to_keep.add(data_col)

    # Keep data columns that have significant content but aren't near header columns
    # This includes columns with dates, text descriptions, etc.
    for col_idx in data_cols:
        if col_idx not in cols_to_keep:
            # Check if this column has important data
            has_important_data = False
            non_empty_count = 0
            text_samples = []

            for row_idx in range(data_start_row, min(data_start_row + 10, self.row_count)):
                cell = self.matrix[row_idx][col_idx]
                if cell.original_cell and not cell.is_spanned:
                    text = cell.original_cell.text().strip()
                    if text and text not in ['', '-', '', '']:
                        non_empty_count += 1
                        if len(text_samples) < 3:
                            text_samples.append(text)
                        # Check for important patterns
                        # Dates, years, text descriptions, etc.
                        if any([
                            len(text) > 3 and not text.replace(',', '').replace('.', '').isdigit(),  # Non-trivial text
                            any(month in text for month in ['January', 'February', 'March', 'April', 'May', 'June',
                                                            'July', 'August', 'September', 'October', 'November', 'December']),
                            any(month in text for month in ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                                                            'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']),
                            '20' in text and any(c.isdigit() for c in text),  # Likely contains year
                        ]):
                            has_important_data = True

            # Keep columns with consistent important data
            if has_important_data and non_empty_count >= 3:
                cols_to_keep.add(col_idx)

    # Special case: If we have very few primary headers but lots of data columns,
    # we might have a table where headers are in data rows (like years)
    # Keep columns that have significant financial data
    if len(primary_header_cols) <= 2 and len(data_cols) > 4:
        # Check for financial data patterns in columns
        for col_idx in data_cols:
            has_financial_data = False
            sample_count = 0

            # Sample a few cells from this column
            for row_idx in range(data_start_row, min(data_start_row + 5, self.row_count)):
                cell = self.matrix[row_idx][col_idx]
                if cell.original_cell and not cell.is_spanned:
                    text = cell.original_cell.text().strip()
                    if text:
                        sample_count += 1
                        # Check for financial patterns
                        if any([
                            text.startswith('(') and any(c.isdigit() for c in text),  # Negative numbers
                            text == ')' and col_idx > 0,  # Closing parenthesis
                            '$' in text,  # Currency
                            '%' in text,  # Percentages
                            text.replace(',', '').replace('.', '').isdigit(),  # Plain numbers
                            text in ['', '', '-', '*']  # Common placeholders
                        ]):
                            has_financial_data = True
                            break

            # Keep columns with financial data
            if has_financial_data and sample_count > 0:
                cols_to_keep.add(col_idx)

    # Check if column 0 contains row labels (non-empty cells in data rows)
    col_0_has_labels = False
    data_start_row = max(1, actual_header_rows)
    for row_idx in range(data_start_row, self.row_count):
        cell = self.matrix[row_idx][0]
        if cell.original_cell and not cell.is_spanned:
            text = cell.original_cell.text().strip()
            if text and not text.isdigit() and not text.startswith('$') and len(text) > 1:
                col_0_has_labels = True
                break

    # Include column 0 if it has labels
    if col_0_has_labels:
        cols_to_keep.add(0)

    # Remove columns that will be consolidated into other columns
    # These columns' data will be merged into their target columns
    cols_to_remove = set(consolidation_map.keys())
    cols_to_keep = cols_to_keep - cols_to_remove

    cols_to_keep = sorted(cols_to_keep)

    # Create new matrix with consolidated columns
    if not cols_to_keep:
        return self

    new_matrix = TableMatrix()
    new_matrix.row_count = self.row_count
    new_matrix.col_count = len(cols_to_keep)
    new_matrix.header_row_count = self.header_row_count  # Preserve header row count
    new_matrix.matrix = []

    # Create mapping from old to new column indices
    old_to_new = {old_col: new_idx for new_idx, old_col in enumerate(cols_to_keep)}

    # Build new matrix with consolidation
    for row_idx in range(self.row_count):
        new_row = [MatrixCell() for _ in range(new_matrix.col_count)]

        # Track which cells we've already placed to handle colspan properly
        placed_origins = {}  # Maps (row_origin, col_origin) to new column index

        # First, copy cells from kept columns
        for old_col in sorted(cols_to_keep):
            if old_col not in old_to_new:
                continue
            new_col = old_to_new[old_col]
            cell = self.matrix[row_idx][old_col]

            if cell.original_cell:
                origin_key = (cell.row_origin, cell.col_origin)

                # Check if we've already placed this cell (due to colspan)
                if origin_key in placed_origins:
                    # This is a continuation of a colspan - mark as spanned
                    new_row[new_col] = MatrixCell(
                        original_cell=cell.original_cell,
                        is_spanned=True,  # Mark as spanned since it's part of a colspan
                        row_origin=cell.row_origin,
                        col_origin=placed_origins[origin_key]  # Point to the original placement
                    )
                else:
                    # First occurrence of this cell - place normally
                    new_row[new_col] = MatrixCell(
                        original_cell=cell.original_cell,
                        is_spanned=False,  # This is the primary cell
                        row_origin=cell.row_origin,
                        col_origin=new_col
                    )
                    placed_origins[origin_key] = new_col

        # Then, consolidate misaligned data into header columns
        for data_col, header_col in consolidation_map.items():
            if header_col in old_to_new:
                new_col = old_to_new[header_col]
                data_cell = self.matrix[row_idx][data_col] if data_col < len(self.matrix[row_idx]) else None

                # If data cell has content, merge it with header column
                if data_cell and data_cell.original_cell and not data_cell.is_spanned:
                    # Skip empty data cells
                    if not data_cell.original_cell.text().strip():
                        continue
                    # Check the original header column cell to see if it has content to merge
                    header_cell = self.matrix[row_idx][header_col]
                    existing_cell = new_row[new_col]  # NOTE: currently unused

                    # Check if we need to merge (e.g., $ with value)
                    if header_cell.original_cell and header_cell.original_cell.text().strip():
                        existing_text = header_cell.original_cell.text().strip()
                        new_text = data_cell.original_cell.text().strip()

                        # Merge currency symbol with value OR value with percentage OR parentheses
                        if existing_text == '$' and new_text:
                            # Currency merge: $ + number
                            merged_text = f"${new_text}"
                            # Create new cell with merged content
                            merged_cell = Cell(
                                content=merged_text,
                                colspan=header_cell.original_cell.colspan,
                                rowspan=header_cell.original_cell.rowspan,
                                is_header=header_cell.original_cell.is_header,
                                align=data_cell.original_cell.align if hasattr(data_cell.original_cell, 'align') else None
                            )
                            new_row[new_col] = MatrixCell(
                                original_cell=merged_cell,
                                is_spanned=False,
                                row_origin=row_idx,
                                col_origin=new_col
                            )
                        elif new_text == ')' and existing_text.startswith('('):
                            # Parentheses merge: (number + )
                            merged_text = f"{existing_text})"
                            # Create new cell with merged content
                            merged_cell = Cell(
                                content=merged_text,
                                colspan=header_cell.original_cell.colspan,
                                rowspan=header_cell.original_cell.rowspan,
                                is_header=header_cell.original_cell.is_header,
                                align=data_cell.original_cell.align if hasattr(data_cell.original_cell, 'align') else None
                            )
                            new_row[new_col] = MatrixCell(
                                original_cell=merged_cell,
                                is_spanned=False,
                                row_origin=row_idx,
                                col_origin=new_col
                            )
                        elif new_text == '%' and existing_text:
                            # Percentage merge: number + %
                            merged_text = f"{existing_text}%"
                            # Create new cell with merged content
                            merged_cell = Cell(
                                content=merged_text,
                                colspan=header_cell.original_cell.colspan,
                                rowspan=header_cell.original_cell.rowspan,
                                is_header=header_cell.original_cell.is_header,
                                align=header_cell.original_cell.align if hasattr(header_cell.original_cell, 'align') else None
                            )
                            new_row[new_col] = MatrixCell(
                                original_cell=merged_cell,
                                is_spanned=False,
                                row_origin=row_idx,
                                col_origin=new_col
                            )
                        else:
                            # Just keep the data cell if can't merge
                            new_row[new_col] = MatrixCell(
                                original_cell=data_cell.original_cell,
                                is_spanned=False,
                                row_origin=row_idx,
                                col_origin=new_col
                            )
                    else:
                        # No existing content, just move the data
                        new_row[new_col] = MatrixCell(
                            original_cell=data_cell.original_cell,
                            is_spanned=False,
                            row_origin=row_idx,
                            col_origin=new_col
                        )

        new_matrix.matrix.append(new_row)

    return new_matrix
def to_cell_grid(self) -> List[List[Optional[Cell]]]:
    """
    Convert matrix to a simple 2D grid of cells.

    Returns:
        2D list where each position contains either a Cell or None
    """
    # Origin cells survive; spanned copies and empty slots collapse to None.
    return [
        [
            entry.original_cell
            if (entry.original_cell and not entry.is_spanned)
            else None
            for entry in grid_row
        ]
        for grid_row in self.matrix
    ]
def debug_print(self):
    """Print matrix structure for debugging"""
    print(f"Matrix: {self.row_count}×{self.col_count}")
    for row_idx in range(self.row_count):
        rendered = []
        for col_idx in range(self.col_count):
            entry = self.matrix[row_idx][col_idx]
            if entry.original_cell:
                snippet = entry.original_cell.text()[:10]
                # Brackets mark positions covered by a span continuation.
                rendered.append(f"[{snippet}...]" if entry.is_spanned else f"{snippet}...")
            else:
                rendered.append("___")
        print(f"Row {row_idx}: {' | '.join(rendered)}")
class ColumnAnalyzer:
    """Analyze column structure to identify data vs spacing columns"""

    def __init__(self, matrix: TableMatrix):
        """Initialize with a table matrix"""
        self.matrix = matrix

    def identify_spacing_columns(self) -> List[int]:
        """
        Identify columns used only for spacing.

        Returns:
            List of column indices that are spacing columns
        """
        widths = self.matrix.get_column_widths()
        total_width = sum(widths)
        # Collect every column the per-column check classifies as spacing.
        return [
            col_idx
            for col_idx in range(self.matrix.col_count)
            if self._is_spacing_column(col_idx, widths, total_width)
        ]

    def _is_spacing_column(self, col_idx: int, widths: List[float], total_width: float) -> bool:
        """
        Check if a column is used for spacing.
        Only mark as spacing if column is completely empty.

        Criteria:
        - Column has absolutely no content across all rows
        (widths/total_width are kept for signature compatibility; the
        current check is purely content-based.)
        """
        for row_idx in range(self.matrix.row_count):
            entry = self.matrix.matrix[row_idx][col_idx]
            if entry.original_cell and not entry.is_spanned:
                # Any text at all disqualifies the column from being spacing.
                if entry.original_cell.text().strip():
                    return False
        return True

    def get_clean_column_indices(self) -> List[int]:
        """
        Get indices of non-spacing columns.

        Returns:
            List of column indices that contain actual data
        """
        spacing = set(self.identify_spacing_columns())
        return [i for i in range(self.matrix.col_count) if i not in spacing]

View File

@@ -0,0 +1,440 @@
"""
Table of Contents analyzer for SEC filings.
This module analyzes the TOC structure to map section names to anchor IDs,
enabling section extraction for API filings with generated anchor IDs.
"""
import re
from typing import Dict, List, Optional, Set, Tuple
from dataclasses import dataclass
from lxml import html as lxml_html
@dataclass
class TOCSection:
    """Represents a section found in the Table of Contents."""
    name: str  # Link text exactly as it appeared in the TOC
    anchor_id: str  # Target anchor id (the href without the leading '#')
    normalized_name: str  # Canonical section name used to build the mapping
    section_type: str  # 'item', 'part', 'other'
    order: int  # Ordering key for the section (see _get_section_type_and_order)
    part: Optional[str] = None  # NEW: "Part I", "Part II", or None for 10-K
class TOCAnalyzer:
"""
Analyzes Table of Contents structure to map section names to anchor IDs.
This enables section extraction for filings where anchor IDs are generated
rather than semantic (like API filings vs local HTML files).
"""
def __init__(self):
    # SEC section patterns for normalization
    # (regex pattern, section type) pairs -- presumably consumed by the
    # normalization helpers (_normalize_section_name et al.), which are
    # outside this view; TODO confirm.
    self.section_patterns = [
        (r'(?:item|part)\s+\d+[a-z]?', 'item'),
        (r'business', 'item'),
        (r'risk\s+factors?', 'item'),
        (r'properties', 'item'),
        (r'legal\s+proceedings', 'item'),
        (r'management.*discussion', 'item'),
        (r'md&a', 'item'),
        (r'financial\s+statements?', 'item'),
        (r'exhibits?', 'item'),
        (r'signatures?', 'item'),
        (r'part\s+[ivx]+', 'part'),
    ]
def analyze_toc_structure(self, html_content: str) -> Dict[str, str]:
    """
    Analyze HTML content to extract section mappings from TOC.

    Args:
        html_content: Raw HTML content

    Returns:
        Dict mapping normalized section names to anchor IDs (empty when
        nothing could be parsed).
    """
    section_mapping = {}

    try:
        # Handle XML declaration issues: lxml cannot parse a unicode string
        # that still carries an <?xml ...?> declaration, so strip it first.
        if html_content.startswith('<?xml'):
            html_content = re.sub(r'<\?xml[^>]*\?>', '', html_content, count=1)

        tree = lxml_html.fromstring(html_content)

        # Find all anchor links that could be TOC links
        anchor_links = tree.xpath('//a[@href]')

        toc_sections = []
        current_part = None  # Track current part context for 10-Q filings
        part_pattern = re.compile(r'^\s*Part\s+([IVX]+)\b', re.IGNORECASE)

        for link in anchor_links:
            href = link.get('href', '').strip()
            text = (link.text_content() or '').strip()

            # Part headers in 10-Q TOCs typically appear as separate rows
            # ("Part I", "Part II"): they set context but are not sections.
            part_match = part_pattern.match(text)
            if part_match:
                current_part = f"Part {part_match.group(1).upper()}"
                continue

            # Only internal anchor links with visible text are candidates.
            if href.startswith('#') and text:
                anchor_id = href[1:]  # Remove '#'

                # Try to find item number in preceding context (for table-based TOCs)
                preceding_item = self._extract_preceding_item_label(link)

                # Check text, anchor ID and surrounding context for a section reference.
                if self._is_section_link(text, anchor_id, preceding_item):
                    # Only keep links whose target element actually exists.
                    target_elements = tree.xpath(f'//*[@id="{anchor_id}"]')
                    if target_elements:
                        # Item-number resolution priority:
                        # anchor ID > preceding context > link text.
                        normalized_name = self._normalize_section_name(text, anchor_id, preceding_item)
                        section_type, order = self._get_section_type_and_order(normalized_name)

                        toc_sections.append(TOCSection(
                            name=text,
                            anchor_id=anchor_id,
                            normalized_name=normalized_name,
                            section_type=section_type,
                            order=order,
                            part=current_part  # Assign current part context
                        ))

        # Build mapping prioritizing the most standard section names
        section_mapping = self._build_section_mapping(toc_sections)

    except Exception:
        # Best-effort: on any parse failure return an empty mapping so the
        # caller can fall back to other section-detection methods.
        pass

    return section_mapping
def _extract_preceding_item_label(self, link_element) -> str:
    """
    Pull an item/part label (e.g. "Item 1A", "Part I") from the context
    preceding a TOC link.

    Handles table-based TOCs where the label sits in a separate cell:
        <td>Item 1.</td><td><a href="...">Business</a></td>
    including nested wrappers such as:
        <td>Item 1.</td><td><div><span><a href="...">Business</a></span></div></td>

    Args:
        link_element: The <a> element (lxml) whose context is inspected.

    Returns:
        A label like "Item 1", "Item 1A", "Part I", or '' when none is found.
    """
    try:
        # Walk upward (at most 5 levels) looking for the enclosing table cell.
        cell = None
        node = link_element
        for _ in range(5):
            ancestor = node.getparent()
            if ancestor is None:
                break
            if ancestor.tag in ('td', 'th'):
                cell = ancestor
                break
            node = ancestor

        if cell is not None:
            # Scan every earlier cell in the row (nearest first). The label is
            # not always in the directly adjacent cell, e.g. rows shaped like
            # ['Business', 'I', '1', '5'] where '1' is the item number.
            sibling = cell.getprevious()
            while sibling is not None:
                if sibling.tag in ('td', 'th'):
                    cell_text = (sibling.text_content() or '').strip()
                    # Explicit "Item 1A"-style label.
                    m = re.match(r'(Item\s+\d+[A-Z]?)\.?\s*$', cell_text, re.IGNORECASE)
                    if m:
                        return m.group(1)
                    # Bare item number such as "1" or "1A"; restricted to 1-15
                    # so page numbers (50, 108, ...) are not mistaken for items.
                    m = re.match(r'^([1-9]|1[0-5])([A-Z]?)\.?\s*$', cell_text, re.IGNORECASE)
                    if m:
                        return f"Item {m.group(1)}{m.group(2)}"
                    # Explicit "Part I"-style label.
                    m = re.match(r'(Part\s+[IVX]+)\.?\s*$', cell_text, re.IGNORECASE)
                    if m:
                        return m.group(1)
                    # Bare uppercase roman numeral: "I", "II", ...
                    m = re.match(r'^([IVX]+)\.?\s*$', cell_text)
                    if m:
                        return f"Part {m.group(1)}"
                sibling = sibling.getprevious()

        # Fall back to inline text directly before the link (div/span/p wrappers).
        wrapper = link_element.getparent()
        if wrapper is not None and wrapper.tag in ('div', 'span', 'p') and wrapper.text:
            leading = wrapper.text.strip()
            m = re.search(r'(Item\s+\d+[A-Z]?)\.?\s*$', leading, re.IGNORECASE)
            if m:
                return m.group(1)
            m = re.search(r'(Part\s+[IVX]+)\.?\s*$', leading, re.IGNORECASE)
            if m:
                return m.group(1)
    except Exception:
        # Best-effort extraction: any structural surprise means "no label".
        pass
    return ''
def _is_section_link(self, text: str, anchor_id: str = '', preceding_item: str = '') -> bool:
"""
Check if link represents a section reference.
Checks link text, anchor ID, and preceding context to handle cases where:
- Text is descriptive (e.g., "Executive Compensation")
- Anchor ID contains item number (e.g., "item_11_executive_compensation")
- Item number is in preceding table cell (e.g., <td>Item 1.</td><td><a>Business</a></td>)
Args:
text: Link text
anchor_id: Anchor ID from href (without #)
preceding_item: Item/part label from preceding context (e.g., "Item 1A")
Returns:
True if this appears to be a section link
"""
if not text:
return False
# First check if there's a preceding item label (table-based TOC)
if preceding_item:
return True
# Then check anchor ID for item/part patterns (most reliable)
if anchor_id:
anchor_lower = anchor_id.lower()
# Match patterns like: item_1, item_1a, item1, item1a, part_i, part_ii, etc.
if re.search(r'item_?\d+[a-z]?', anchor_lower):
return True
if re.search(r'part_?[ivx]+', anchor_lower):
return True
# Then check text (with relaxed length limit for descriptive section names)
if len(text) > 150: # Increased from 100 to accommodate longer section titles
return False
# Check against known patterns
for pattern, _ in self.section_patterns:
if re.search(pattern, text, re.IGNORECASE):
return True
# Also consider links with section keywords
if len(text) < 100 and any(keyword in text.lower() for keyword in
['item', 'part', 'business', 'risk', 'properties', 'legal',
'compensation', 'ownership', 'governance', 'directors']):
return True
return False
def _normalize_section_name(self, text: str, anchor_id: str = '', preceding_item: str = '') -> str:
"""
Normalize section name for consistent lookup.
Prioritizes:
1. Preceding item label (table-based TOC)
2. Anchor ID pattern
3. Text-based normalization
Args:
text: Link text
anchor_id: Anchor ID from href (without #)
preceding_item: Item/part label from preceding context
Returns:
Normalized section name (e.g., "Item 1A", "Part II")
"""
text = text.strip()
# HIGHEST PRIORITY: Use preceding item label if available (table-based TOC)
if preceding_item:
# Clean up and normalize the preceding item
item_match = re.match(r'item\s+(\d+[a-z]?)', preceding_item, re.IGNORECASE)
if item_match:
return f"Item {item_match.group(1).upper()}"
part_match = re.match(r'part\s+([ivx]+)', preceding_item, re.IGNORECASE)
if part_match:
return f"Part {part_match.group(1).upper()}"
# SECOND PRIORITY: Try to extract from anchor ID
if anchor_id:
anchor_lower = anchor_id.lower()
# Match item patterns: item_1a, item1a, item_1_business, etc.
item_match = re.search(r'item_?(\d+[a-z]?)', anchor_lower)
if item_match:
item_num = item_match.group(1).upper()
return f"Item {item_num}"
# Match part patterns: part_i, part_ii, parti, partii, etc.
part_match = re.search(r'part_?([ivx]+)', anchor_lower)
if part_match:
part_num = part_match.group(1).upper()
return f"Part {part_num}"
# THIRD PRIORITY: Text-based normalization
# Handle common Item patterns in text
item_match = re.match(r'item\s+(\d+[a-z]?)', text, re.IGNORECASE)
if item_match:
return f"Item {item_match.group(1).upper()}"
# Handle Part patterns
part_match = re.match(r'part\s+([ivx]+)', text, re.IGNORECASE)
if part_match:
return f"Part {part_match.group(1).upper()}"
# Handle specific known sections by text
text_lower = text.lower()
if 'business' in text_lower and 'item' not in text_lower:
return "Item 1"
elif 'risk factors' in text_lower and 'item' not in text_lower:
return "Item 1A"
elif 'properties' in text_lower and 'item' not in text_lower:
return "Item 2"
elif 'legal proceedings' in text_lower and 'item' not in text_lower:
return "Item 3"
elif 'management' in text_lower and 'discussion' in text_lower:
return "Item 7"
elif 'financial statements' in text_lower:
return "Item 8"
elif 'exhibits' in text_lower:
return "Item 15"
return text # Return as-is if no normalization applies
def _get_section_type_and_order(self, text: str) -> Tuple[str, int]:
"""Get section type and order for sorting."""
text_lower = text.lower()
# Items
item_match = re.search(r'item\s*(\d+)([a-z]?)', text_lower)
if item_match:
item_num = int(item_match.group(1))
item_letter = item_match.group(2) or ''
# Order: Item 1=1000, Item 1A=1001, Item 2=2000, etc.
order = item_num * 1000 + (ord(item_letter.upper()) - ord('A') + 1 if item_letter else 0)
return 'item', order
# Parts
part_match = re.search(r'part\s*([ivx]+)', text_lower)
if part_match:
part_roman = part_match.group(1)
part_num = self._roman_to_int(part_roman)
return 'part', part_num * 100 # Part I=100, Part II=200, etc.
# Known sections without explicit item numbers
if 'business' in text_lower:
return 'item', 1000 # Item 1
elif 'risk factors' in text_lower:
return 'item', 1001 # Item 1A
elif 'properties' in text_lower:
return 'item', 2000 # Item 2
elif 'legal proceedings' in text_lower:
return 'item', 3000 # Item 3
elif 'management' in text_lower and 'discussion' in text_lower:
return 'item', 7000 # Item 7
elif 'financial statements' in text_lower:
return 'item', 8000 # Item 8
elif 'exhibits' in text_lower:
return 'item', 15000 # Item 15
return 'other', 99999
def _roman_to_int(self, roman: str) -> int:
"""Convert roman numerals to integers."""
roman_map = {'i': 1, 'v': 5, 'x': 10, 'l': 50, 'c': 100, 'd': 500, 'm': 1000}
roman = roman.lower()
result = 0
prev = 0
for char in reversed(roman):
value = roman_map.get(char, 0)
if value < prev:
result -= value
else:
result += value
prev = value
return result
def _build_section_mapping(self, toc_sections: List[TOCSection]) -> Dict[str, str]:
    """
    Build the final section-name -> anchor-id mapping from TOC entries.

    Entries are processed in canonical order (by their precomputed sort
    key) and the first occurrence of each name wins. For 10-Q filings
    that carry part context, part-aware names such as "part_i_item_1"
    and "part_ii_item_1" are generated so identical item numbers in
    different parts stay distinct.

    Args:
        toc_sections: TOC entries to map. The input list is NOT modified
            (previously this method sorted it in place, a surprising side
            effect for callers).

    Returns:
        Dict mapping normalized section names to anchor IDs.
    """
    mapping: Dict[str, str] = {}
    # sorted() rather than list.sort(): leaves the caller's list untouched.
    for section in sorted(toc_sections, key=lambda s: s.order):
        if section.part:
            # 10-Q filings: "Part I" -> "part_i", "Item 1A" -> "item_1a",
            # combined as "part_i_item_1a".
            part_key = section.part.lower().replace(' ', '_')
            item_key = section.normalized_name.lower().replace(' ', '_')
            name = f"{part_key}_{item_key}"
        else:
            # 10-K filings: use the normalized name as-is.
            name = section.normalized_name
        # Prefer the first occurrence of a duplicated section name; the
        # mapping itself records what has been seen (no separate set needed).
        if name not in mapping:
            mapping[name] = section.anchor_id
    return mapping
def get_section_suggestions(self, html_content: str) -> List[str]:
    """
    List the section names that can be extracted from this document,
    ordered by their canonical filing position (Item 1 before Item 1A,
    unknowns last).

    Args:
        html_content: Raw HTML content of the filing.

    Returns:
        Sorted list of extractable section names.
    """
    available = self.analyze_toc_structure(html_content)

    def sort_key(name: str) -> int:
        # Position component of (type, order) drives the ordering.
        return self._get_section_type_and_order(name)[1]

    # Iterating the dict yields its keys, same as mapping.keys().
    return sorted(available, key=sort_key)
def analyze_toc_for_sections(html_content: str) -> Dict[str, str]:
    """
    Convenience wrapper: analyze a filing's table of contents and return
    its section mapping.

    Args:
        html_content: Raw HTML content.

    Returns:
        Dict mapping section names to anchor IDs.
    """
    return TOCAnalyzer().analyze_toc_structure(html_content)

View File

@@ -0,0 +1,104 @@
"""
Table of Contents Link Filter
Removes repetitive "Table of Contents" anchor links from document text,
matching the behavior of the old parser.
"""
import re
from typing import List
def filter_toc_links(text: str) -> str:
    """
    Filter out repetitive navigation links from text.

    This replicates the old parser's behavior of removing repetitive
    navigation links that appear throughout SEC filings.

    Based on analysis of 12+ SEC filings across different companies:
    - Average of 47.9 "Table of Contents" links per filing (575 total found)
    - Oracle 10-K shows 230 "Index to Financial Statements" vs 83 in old parser
    - Safe to filter without losing legitimate content

    Lines removed (matched case-insensitively against the stripped line):
    - "Table of Contents"
    - "Index to Financial Statements"
    - "Index to Exhibits"

    Args:
        text: Input text to filter

    Returns:
        Text with navigation-link lines removed; all other lines are kept
        byte-for-byte (original whitespace included).
    """
    if not text:
        return text

    # One anchored alternation. re.IGNORECASE makes separate upper/lower-case
    # variants of each phrase redundant, so each phrase appears exactly once
    # (the original list duplicated them). Non-capturing group: only the
    # overall match matters. re.compile caches, so per-call compilation is cheap.
    nav_line = re.compile(
        r'^(?:Table of Contents'
        r'|Index to Financial Statements'
        r'|Index to Exhibits)$',
        re.IGNORECASE,
    )

    # Match against the stripped line but keep the original line untouched.
    kept = [line for line in text.split('\n') if not nav_line.match(line.strip())]
    return '\n'.join(kept)
def get_toc_link_stats(text: str) -> dict:
    """
    Collect statistics about navigation links in text for debugging/analysis.

    Args:
        text: Input text to analyze.

    Returns:
        Dict with per-pattern counts, up to five example matches per
        pattern, the combined match count, and the total line count.
        (For empty input a reduced dict without 'total_lines' is
        returned, matching historical behavior.)
    """
    if not text:
        return {'total_matches': 0, 'patterns': {}, 'examples': []}

    # All navigation patterns we filter, in reporting order.
    nav_patterns = {
        'Table of Contents': re.compile(r'^Table of Contents$', re.IGNORECASE),
        'Index to Financial Statements': re.compile(r'^Index to Financial Statements$', re.IGNORECASE),
        'Index to Exhibits': re.compile(r'^Index to Exhibits$', re.IGNORECASE),
    }

    lines = text.split('\n')
    per_pattern_counts = {}
    examples = []

    for label, regex in nav_patterns.items():
        hits = [
            {
                'line_num': idx + 1,
                'content': raw,
                'stripped': raw.strip(),
                'pattern': label,
            }
            for idx, raw in enumerate(lines)
            if regex.match(raw.strip())
        ]
        per_pattern_counts[label] = len(hits)
        examples.extend(hits[:5])  # cap at five examples per pattern

    return {
        'total_matches': sum(per_pattern_counts.values()),
        'patterns': per_pattern_counts,
        'examples': examples,
        'total_lines': len(lines),
    }