Initial commit
This commit is contained in:
@@ -0,0 +1,51 @@
|
||||
"""
|
||||
Utility modules for HTML parsing.
|
||||
"""
|
||||
|
||||
from edgar.documents.utils.cache import (
|
||||
LRUCache,
|
||||
WeakCache,
|
||||
TimeBasedCache,
|
||||
CacheManager,
|
||||
get_cache_manager,
|
||||
cached,
|
||||
CacheStats
|
||||
)
|
||||
from edgar.documents.utils.streaming import (
|
||||
StreamingParser
|
||||
)
|
||||
from edgar.documents.utils.table_matrix import (
|
||||
TableMatrix,
|
||||
ColumnAnalyzer,
|
||||
MatrixCell
|
||||
)
|
||||
from edgar.documents.utils.currency_merger import (
|
||||
CurrencyColumnMerger
|
||||
)
|
||||
# Note: CacheableMixin not exported to avoid circular imports
|
||||
# Import directly: from edgar.documents.cache_mixin import CacheableMixin
|
||||
from edgar.documents.utils.html_utils import (
|
||||
remove_xml_declaration,
|
||||
create_lxml_parser
|
||||
)
|
||||
# Note: table_utils not exported to avoid circular imports
|
||||
# Import directly: from edgar.documents.utils.table_utils import process_table_matrix
|
||||
|
||||
__all__ = [
|
||||
'LRUCache',
|
||||
'WeakCache',
|
||||
'TimeBasedCache',
|
||||
'CacheManager',
|
||||
'get_cache_manager',
|
||||
'cached',
|
||||
'CacheStats',
|
||||
'StreamingParser',
|
||||
'TableMatrix',
|
||||
'ColumnAnalyzer',
|
||||
'MatrixCell',
|
||||
'CurrencyColumnMerger',
|
||||
# 'CacheableMixin', # Not exported - import directly to avoid circular imports
|
||||
'remove_xml_declaration',
|
||||
'create_lxml_parser',
|
||||
# 'process_table_matrix' # Not exported - import directly to avoid circular imports
|
||||
]
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,205 @@
|
||||
"""
|
||||
Lightweight anchor analysis cache to avoid re-parsing HTML.
|
||||
|
||||
This provides a middle-ground approach that caches anchor analysis results
|
||||
while minimizing memory overhead.
|
||||
"""
|
||||
import re
|
||||
from typing import Dict, Set, Optional
|
||||
from collections import Counter
|
||||
import hashlib
|
||||
import pickle
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
class AnchorCache:
    """
    Cache for anchor link analysis results.

    Stores navigation patterns keyed by an MD5 hash of the HTML content.
    Lookups go through two levels: an in-memory dict for the current
    session, then pickle files under ``cache_dir`` for persistence
    across sessions.
    """

    def __init__(self, cache_dir: Optional[Path] = None):
        """
        Initialize the cache.

        Args:
            cache_dir: Directory for the on-disk cache. Defaults to
                ``~/.edgar_cache/anchors``; created if missing.
        """
        self.cache_dir = cache_dir or Path.home() / '.edgar_cache' / 'anchors'
        self.cache_dir.mkdir(parents=True, exist_ok=True)
        # In-memory cache for the current session (html hash -> patterns).
        self._memory_cache: Dict[str, Set[str]] = {}

    def _get_html_hash(self, html_content: str) -> str:
        """Get hash of HTML content for caching."""
        # MD5 is used purely as a cache key, not for security.
        return hashlib.md5(html_content.encode('utf-8')).hexdigest()

    def get_navigation_patterns(self, html_content: str) -> Optional[Set[str]]:
        """
        Get cached navigation patterns for HTML content.

        Args:
            html_content: HTML to analyze

        Returns:
            Set of navigation patterns or None if not cached
        """
        html_hash = self._get_html_hash(html_content)

        # Check in-memory cache first
        if html_hash in self._memory_cache:
            return self._memory_cache[html_hash]

        # Check disk cache
        cache_file = self.cache_dir / f"{html_hash}.pkl"
        if cache_file.exists():
            try:
                with open(cache_file, 'rb') as f:
                    patterns = pickle.load(f)
                    self._memory_cache[html_hash] = patterns
                    return patterns
            except (OSError, pickle.PickleError, EOFError, AttributeError):
                # Fixed: was a bare `except:` which also swallowed
                # KeyboardInterrupt/SystemExit. A corrupted or unreadable
                # cache file is removed and treated as a miss.
                # NOTE(review): pickle is only loaded from the local cache
                # directory this class itself writes; do not point cache_dir
                # at untrusted data.
                cache_file.unlink(missing_ok=True)

        return None

    def cache_navigation_patterns(self, html_content: str, patterns: Set[str]) -> None:
        """
        Cache navigation patterns for HTML content.

        Args:
            html_content: HTML content
            patterns: Navigation patterns to cache
        """
        html_hash = self._get_html_hash(html_content)

        # Store in memory
        self._memory_cache[html_hash] = patterns

        # Best-effort write-through to disk; failures are non-fatal.
        try:
            cache_file = self.cache_dir / f"{html_hash}.pkl"
            with open(cache_file, 'wb') as f:
                pickle.dump(patterns, f)
        except (OSError, pickle.PicklingError):
            # Ignore cache write errors (e.g. read-only filesystem, disk full).
            pass

    def clear_cache(self) -> None:
        """Clear all cached data, both in memory and on disk."""
        self._memory_cache.clear()
        for cache_file in self.cache_dir.glob("*.pkl"):
            cache_file.unlink(missing_ok=True)
|
||||
|
||||
|
||||
# Global cache instance
_anchor_cache = AnchorCache()


def get_cached_navigation_patterns(html_content: str,
                                   force_analyze: bool = False) -> Set[str]:
    """
    Get navigation patterns with caching.

    Args:
        html_content: HTML to analyze
        force_analyze: Force re-analysis even if cached

    Returns:
        Set of navigation link texts to filter
    """
    # Serve from the cache unless the caller explicitly wants a re-analysis.
    if not force_analyze:
        hit = _anchor_cache.get_navigation_patterns(html_content)
        if hit is not None:
            return hit

    # Cache miss (or forced): run the lightweight regex-based analysis,
    # then write the result through for subsequent calls.
    result = _analyze_navigation_minimal(html_content)
    _anchor_cache.cache_navigation_patterns(html_content, result)
    return result
|
||||
|
||||
|
||||
def _analyze_navigation_minimal(html_content: str, min_frequency: int = 5) -> Set[str]:
|
||||
"""
|
||||
Minimal navigation analysis using regex instead of full HTML parsing.
|
||||
|
||||
This avoids BeautifulSoup overhead by using regex to find anchor patterns.
|
||||
"""
|
||||
patterns = set()
|
||||
|
||||
# Find all anchor links with regex (faster than BeautifulSoup)
|
||||
anchor_pattern = re.compile(r'<a[^>]*href\s*=\s*["\']#([^"\']*)["\'][^>]*>(.*?)</a>',
|
||||
re.IGNORECASE | re.DOTALL)
|
||||
|
||||
link_counts = Counter()
|
||||
|
||||
for match in anchor_pattern.finditer(html_content):
|
||||
anchor_id = match.group(1).strip()
|
||||
link_text = re.sub(r'<[^>]+>', '', match.group(2)).strip() # Remove HTML tags
|
||||
link_text = ' '.join(link_text.split()) # Normalize whitespace
|
||||
|
||||
if link_text and len(link_text) < 100: # Reasonable link text length
|
||||
link_counts[link_text] += 1
|
||||
|
||||
# Add frequently occurring links
|
||||
for text, count in link_counts.items():
|
||||
if count >= min_frequency:
|
||||
patterns.add(text)
|
||||
|
||||
return patterns
|
||||
|
||||
|
||||
def filter_with_cached_patterns(text: str, html_content: Optional[str] = None) -> str:
    """
    Filter text using cached navigation patterns.

    Preserves the first few occurrences of each pattern (document structure
    headers) while filtering out later repetitions (navigation links).

    Args:
        text: Text to filter
        html_content: HTML for pattern analysis (optional). When omitted, a
            small set of common SEC navigation phrases is used instead.
            (Fixed: annotation was ``str`` with a ``None`` default.)

    Returns:
        Filtered text
    """
    if not text:
        return text

    # Get patterns (cached or analyze)
    if html_content:
        patterns = get_cached_navigation_patterns(html_content)
    else:
        # Fallback to common SEC patterns
        patterns = {
            'Table of Contents',
            'Index to Financial Statements',
            'Index to Exhibits'
        }

    if not patterns:
        return text

    # Allow up to this many occurrences of each pattern; later occurrences
    # are assumed to be repeated navigation links and dropped.
    max_allowed_per_pattern = 2

    filtered_lines = []
    pattern_counts: Dict[str, int] = {}  # occurrences kept so far, per pattern

    for line in text.split('\n'):
        stripped_line = line.strip()

        if stripped_line in patterns:
            seen = pattern_counts.get(stripped_line, 0)
            if seen < max_allowed_per_pattern:
                # Keep this occurrence (likely a document structure header).
                filtered_lines.append(line)
                pattern_counts[stripped_line] = seen + 1
            # else: skip this line (repetitive navigation link)
        else:
            # Not a navigation pattern, always keep
            filtered_lines.append(line)

    return '\n'.join(filtered_lines)
|
||||
426
venv/lib/python3.10/site-packages/edgar/documents/utils/cache.py
Normal file
426
venv/lib/python3.10/site-packages/edgar/documents/utils/cache.py
Normal file
@@ -0,0 +1,426 @@
|
||||
"""
|
||||
Cache utilities for performance optimization.
|
||||
"""
|
||||
|
||||
import weakref
|
||||
from collections import OrderedDict
|
||||
from typing import Any, Dict, Optional, Callable, TypeVar, Generic
|
||||
from functools import wraps
|
||||
import time
|
||||
import threading
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime, timedelta
|
||||
|
||||
T = TypeVar('T')
|
||||
|
||||
|
||||
@dataclass
class CacheStats:
    """Statistics for cache performance monitoring."""
    hits: int = 0
    misses: int = 0
    evictions: int = 0
    total_time: float = 0.0
    last_reset: datetime = field(default_factory=datetime.now)

    @property
    def hit_rate(self) -> float:
        """Fraction of lookups served from cache (0.0 when no lookups yet)."""
        lookups = self.hits + self.misses
        if lookups == 0:
            return 0.0
        return self.hits / lookups

    @property
    def avg_access_time(self) -> float:
        """Mean time spent per lookup, hits and misses combined."""
        lookups = self.hits + self.misses
        if lookups == 0:
            return 0.0
        return self.total_time / lookups

    def reset(self):
        """Zero every counter and stamp the reset time."""
        self.hits = 0
        self.misses = 0
        self.evictions = 0
        self.total_time = 0.0
        self.last_reset = datetime.now()
|
||||
|
||||
|
||||
class LRUCache(Generic[T]):
    """
    Thread-safe LRU cache.

    Caches results of expensive operations such as style parsing and
    header detection. Once the cache holds more than ``max_size`` items
    the least-recently-used entry is evicted. All operations are guarded
    by a reentrant lock.
    """

    def __init__(self, max_size: int = 1000):
        """
        Initialize LRU cache.

        Args:
            max_size: Maximum number of items to cache
        """
        self.max_size = max_size
        self._cache: OrderedDict[str, T] = OrderedDict()
        self._lock = threading.RLock()
        self.stats = CacheStats()

    def get(self, key: str) -> Optional[T]:
        """
        Look up *key*, refreshing its recency on a hit.

        Args:
            key: Cache key

        Returns:
            Cached value, or None when absent.
        """
        started = time.time()

        with self._lock:
            try:
                value = self._cache[key]
            except KeyError:
                self.stats.misses += 1
                self.stats.total_time += time.time() - started
                return None
            # A hit makes the entry most-recently-used.
            self._cache.move_to_end(key)
            self.stats.hits += 1
            self.stats.total_time += time.time() - started
            return value

    def put(self, key: str, value: T) -> None:
        """
        Store *value* under *key*, evicting the oldest entry if over capacity.

        Args:
            key: Cache key
            value: Value to cache
        """
        with self._lock:
            already_present = key in self._cache
            if already_present:
                # Refresh recency before overwriting.
                self._cache.move_to_end(key)
            self._cache[key] = value

            if not already_present and len(self._cache) > self.max_size:
                # Drop the least-recently-used entry.
                self._cache.popitem(last=False)
                self.stats.evictions += 1

    def clear(self) -> None:
        """Drop every cached entry."""
        with self._lock:
            self._cache.clear()

    def size(self) -> int:
        """Number of entries currently cached."""
        with self._lock:
            return len(self._cache)
|
||||
|
||||
|
||||
class WeakCache:
    """
    Cache holding weak references to parsed nodes.

    Entries do not keep their objects alive: once an object has been
    garbage collected its slot is discarded lazily on the next lookup,
    or eagerly via cleanup().
    """

    def __init__(self):
        """Initialize weak cache."""
        self._cache: Dict[str, weakref.ref] = {}
        self._lock = threading.RLock()
        self.stats = CacheStats()

    def get(self, key: str) -> Optional[Any]:
        """
        Look up *key*.

        Args:
            key: Cache key

        Returns:
            The cached object, or None when absent or already collected.
        """
        started = time.time()

        with self._lock:
            ref = self._cache.get(key)
            if ref is not None:
                target = ref()
                if target is None:
                    # Referent was garbage collected; drop the stale slot.
                    del self._cache[key]
                else:
                    self.stats.hits += 1
                    self.stats.total_time += time.time() - started
                    return target

            self.stats.misses += 1
            self.stats.total_time += time.time() - started
            return None

    def put(self, key: str, value: Any) -> None:
        """
        Store a weak reference to *value* under *key*.

        Args:
            key: Cache key
            value: Object to cache (must support weak references)
        """
        with self._lock:
            self._cache[key] = weakref.ref(value)

    def clear(self) -> None:
        """Drop all cached references."""
        with self._lock:
            self._cache.clear()

    def cleanup(self) -> int:
        """
        Purge references whose objects have been collected.

        Returns:
            Number of dead references removed.
        """
        with self._lock:
            stale = [k for k, ref in self._cache.items() if ref() is None]
            for k in stale:
                del self._cache[k]
            return len(stale)
|
||||
|
||||
|
||||
class TimeBasedCache(Generic[T]):
    """
    Cache whose entries expire after a fixed time-to-live.
    """

    def __init__(self, ttl_seconds: int = 3600):
        """
        Initialize time-based cache.

        Args:
            ttl_seconds: Time to live in seconds
        """
        self.ttl = timedelta(seconds=ttl_seconds)
        self._cache: Dict[str, tuple[T, datetime]] = {}
        self._lock = threading.RLock()
        self.stats = CacheStats()

    def get(self, key: str) -> Optional[T]:
        """
        Fetch *key* if present and still fresh.

        Args:
            key: Cache key

        Returns:
            Cached value, or None when absent or expired.
        """
        started = time.time()

        with self._lock:
            entry = self._cache.get(key)
            if entry is not None:
                value, stored_at = entry
                if datetime.now() - stored_at < self.ttl:
                    self.stats.hits += 1
                    self.stats.total_time += time.time() - started
                    return value
                # Stale: evict eagerly so it is never served again.
                del self._cache[key]
                self.stats.evictions += 1

            self.stats.misses += 1
            self.stats.total_time += time.time() - started
            return None

    def put(self, key: str, value: T) -> None:
        """
        Store *value* under *key*, stamped with the current time.

        Args:
            key: Cache key
            value: Value to cache
        """
        with self._lock:
            self._cache[key] = (value, datetime.now())

    def clear(self) -> None:
        """Drop every cached entry."""
        with self._lock:
            self._cache.clear()

    def cleanup(self) -> int:
        """
        Purge expired entries.

        Returns:
            Number of entries removed.
        """
        with self._lock:
            cutoff = datetime.now()
            stale = [
                k for k, (_, stored_at) in self._cache.items()
                if cutoff - stored_at >= self.ttl
            ]
            for k in stale:
                del self._cache[k]
                self.stats.evictions += 1
            return len(stale)
|
||||
|
||||
|
||||
def cached(cache: LRUCache, key_func: Optional[Callable] = None):
    """
    Decorator that memoizes a function's results in *cache*.

    Args:
        cache: Cache instance to use
        key_func: Function to generate cache key from arguments

    Returns:
        Decorated function

    Note:
        A cached value of None is indistinguishable from a miss, so
        functions that return None are recomputed on every call.
    """
    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            # Derive the cache key, either via the caller-supplied key
            # function or from the function name plus its arguments.
            if key_func is not None:
                key = key_func(*args, **kwargs)
            else:
                key = f"{func.__name__}:{str(args)}:{str(sorted(kwargs.items()))}"

            hit = cache.get(key)
            if hit is not None:
                return hit

            # Miss: compute, store, return.
            result = func(*args, **kwargs)
            cache.put(key, result)
            return result

        return wrapper

    return decorator
|
||||
|
||||
|
||||
class CacheManager:
    """
    Central registry of the parser's caches.

    Owns one cache per concern (styles, headers, patterns, nodes,
    compiled regexes) and offers aggregate statistics, reset, clear
    and cleanup across all of them.
    """

    def __init__(self):
        """Initialize cache manager."""
        self.style_cache = LRUCache[dict](max_size=5000)     # parsed style strings
        self.header_cache = LRUCache[bool](max_size=2000)    # header-detection verdicts
        self.pattern_cache = LRUCache[bool](max_size=10000)  # pattern-match results
        self.node_cache = WeakCache()                        # parsed node references
        self.regex_cache = LRUCache[Any](max_size=500)       # compiled regexes

        # Registry used by the bulk operations below.
        self._caches = {
            'style': self.style_cache,
            'header': self.header_cache,
            'pattern': self.pattern_cache,
            'node': self.node_cache,
            'regex': self.regex_cache
        }

    def get_stats(self) -> Dict[str, CacheStats]:
        """Get statistics for all caches."""
        collected = {}
        for name, cache in self._caches.items():
            if hasattr(cache, 'stats'):
                collected[name] = cache.stats
        return collected

    def reset_stats(self) -> None:
        """Reset statistics for all caches."""
        for cache in self._caches.values():
            if hasattr(cache, 'stats'):
                cache.stats.reset()

    def clear_all(self) -> None:
        """Clear all caches."""
        for cache in self._caches.values():
            cache.clear()

    def cleanup(self) -> Dict[str, int]:
        """
        Cleanup expired/dead entries in all caches.

        Returns:
            Number of entries cleaned up per cache
        """
        counts = {}
        # Only the weak cache currently supports cleanup.
        if hasattr(self.node_cache, 'cleanup'):
            counts['node'] = self.node_cache.cleanup()
        return counts

    def get_memory_usage(self) -> Dict[str, int]:
        """
        Estimate memory usage of caches.

        Returns:
            Approximate memory usage in bytes per cache
        """
        import sys

        usage = {}
        for name, cache in self._caches.items():
            if not hasattr(cache, '_cache'):
                continue
            # Rough estimation only: sys.getsizeof is shallow and does not
            # follow references into contained objects.
            total = 0
            if isinstance(cache._cache, dict):
                for key, value in cache._cache.items():
                    total += sys.getsizeof(key)
                    if hasattr(value, '__sizeof__'):
                        total += sys.getsizeof(value)
                    else:
                        total += 1000  # fallback estimate
            usage[name] = total

        return usage
|
||||
|
||||
|
||||
# Global cache manager instance (created lazily on first access)
_cache_manager = None


def get_cache_manager() -> CacheManager:
    """Return the process-wide CacheManager, creating it on first use."""
    global _cache_manager
    if _cache_manager is None:
        _cache_manager = CacheManager()
    return _cache_manager
|
||||
@@ -0,0 +1,277 @@
|
||||
"""
|
||||
Currency column merger for handling separated currency symbols in SEC filings.
|
||||
"""
|
||||
|
||||
import re
|
||||
from typing import List, Tuple
|
||||
|
||||
from edgar.documents.table_nodes import Cell
|
||||
from edgar.documents.utils.table_matrix import TableMatrix, MatrixCell
|
||||
|
||||
|
||||
class CurrencyColumnMerger:
    """
    Detects and merges currency symbol columns with their value columns.

    SEC filings often split currency values into two cells:
    - Cell 1: "$" (left-aligned)
    - Cell 2: "224.11" (right-aligned)

    This class detects this pattern and merges them into "$224.11"
    """

    # Common currency symbols
    CURRENCY_SYMBOLS = {'$', '€', '£', '¥', '₹', 'Rs', 'USD', 'EUR', 'GBP'}

    # Pattern for numeric values (with commas, decimals)
    NUMERIC_PATTERN = re.compile(r'^[\d,]+\.?\d*$')

    def __init__(self, matrix: TableMatrix):
        """Initialize with a table matrix."""
        self.matrix = matrix
        # (symbol_col, value_col) pairs filled in by detect_currency_pairs().
        self.merge_pairs: List[Tuple[int, int]] = []

    def detect_currency_pairs(self) -> List[Tuple[int, int]]:
        """
        Detect column pairs that should be merged (currency symbol + value).

        A pair is a currency-symbol column immediately followed by a
        numeric column, where the two columns have content in the same
        rows. The result is also stored on ``self.merge_pairs``.

        Returns:
            List of (symbol_col, value_col) pairs to merge
        """
        pairs = []

        # Only look at adjacent column pairs (symbol must be directly
        # left of its value column).
        for col_idx in range(self.matrix.col_count - 1):
            if self._is_currency_column(col_idx):
                next_col = col_idx + 1
                if self._is_numeric_column(next_col):
                    # Check if they're consistently paired
                    if self._verify_pairing(col_idx, next_col):
                        pairs.append((col_idx, next_col))

        self.merge_pairs = pairs
        return pairs

    def _is_currency_column(self, col_idx: int) -> bool:
        """
        Check if a column contains only currency symbols.

        A currency column typically:
        - Contains only currency symbols or empty cells
        - Has very narrow width (1-3 characters)
        - Is left-aligned (though we check content, not style)
        """
        currency_count = 0
        empty_count = 0   # NOTE(review): tallied but never read below
        other_count = 0
        header_rows = 0   # NOTE(review): tallied but never read below

        for row_idx in range(self.matrix.row_count):
            cell = self.matrix.matrix[row_idx][col_idx]
            # Only count cells that physically exist here (skip spans).
            if cell.original_cell and not cell.is_spanned:
                text = cell.original_cell.text().strip()

                # Skip header rows (first 2 rows typically)
                if row_idx < 2 and text and not text in self.CURRENCY_SYMBOLS:
                    header_rows += 1
                    continue

                if not text:
                    empty_count += 1
                elif text in self.CURRENCY_SYMBOLS or text == '$':
                    currency_count += 1
                elif len(text) <= 3 and text in ['$', '€', '£', '¥']:
                    # NOTE(review): unreachable — these symbols are all in
                    # CURRENCY_SYMBOLS, so the previous branch already
                    # caught them.
                    currency_count += 1
                else:
                    other_count += 1

        # Column should be mostly currency symbols with some empty cells
        # Exclude header rows from the calculation
        total_non_empty = currency_count + other_count
        if total_non_empty == 0:
            return False

        # At least 60% of non-empty, non-header cells should be currency symbols
        # Lower threshold since we're excluding headers
        # Also accept if there's at least 1 currency symbol and no other non-currency content
        return (currency_count >= 1 and other_count == 0) or \
               (currency_count >= 2 and currency_count / total_non_empty >= 0.6)

    def _is_numeric_column(self, col_idx: int) -> bool:
        """
        Check if a column contains numeric values.

        At least 60% of non-empty, non-header cells must look numeric
        after stripping commas, percent signs and parentheses.
        """
        numeric_count = 0
        non_empty_count = 0

        for row_idx in range(self.matrix.row_count):
            cell = self.matrix.matrix[row_idx][col_idx]
            if cell.original_cell and not cell.is_spanned:
                text = cell.original_cell.text().strip()

                # Skip header rows
                if row_idx < 2:
                    continue

                if text:
                    non_empty_count += 1
                    # Remove formatting and check if numeric
                    # (parentheses commonly denote negatives in filings)
                    clean_text = text.replace(',', '').replace('%', '').replace('(', '').replace(')', '')
                    if self.NUMERIC_PATTERN.match(clean_text):
                        numeric_count += 1

        if non_empty_count == 0:
            return False

        # At least 60% should be numeric (lowered threshold)
        return numeric_count / non_empty_count >= 0.6

    def _verify_pairing(self, symbol_col: int, value_col: int) -> bool:
        """
        Verify that symbol and value columns are consistently paired.

        They should have content in the same rows (when symbol present, value present).
        """
        paired_rows = 0
        mismatched_rows = 0

        for row_idx in range(self.matrix.row_count):
            symbol_cell = self.matrix.matrix[row_idx][symbol_col]
            value_cell = self.matrix.matrix[row_idx][value_col]

            if symbol_cell.original_cell and value_cell.original_cell:
                symbol_text = symbol_cell.original_cell.text().strip()
                value_text = value_cell.original_cell.text().strip()

                # Check if they're paired (both have content or both empty)
                if symbol_text in self.CURRENCY_SYMBOLS and value_text:
                    paired_rows += 1
                elif not symbol_text and not value_text:
                    # Both empty is fine
                    pass
                elif symbol_text in self.CURRENCY_SYMBOLS and not value_text:
                    # Symbol without value - might be header
                    if row_idx < 2:  # Allow in headers
                        pass
                    else:
                        mismatched_rows += 1
                elif not symbol_text and value_text:
                    # Value without symbol - could be valid (continuation)
                    pass

        # Should have more paired than mismatched
        return paired_rows > mismatched_rows

    def apply_merges(self) -> 'TableMatrix':
        """
        Create a new matrix with currency columns merged.

        Runs detection first if it has not been done yet. When nothing
        needs merging the ORIGINAL matrix object is returned unchanged;
        otherwise a freshly built TableMatrix (one column narrower per
        merge pair) is returned and ``self.matrix`` is left untouched.

        Returns:
            New TableMatrix with merged columns
        """
        if not self.merge_pairs:
            self.detect_currency_pairs()

        if not self.merge_pairs:
            # No merges needed
            return self.matrix

        # Calculate new column count (each merge removes one column)
        new_col_count = self.matrix.col_count - len(self.merge_pairs)

        # Create mapping from old to new columns
        old_to_new = {}
        merged_cols = set(pair[0] for pair in self.merge_pairs)  # Symbol columns to remove

        new_col = 0
        for old_col in range(self.matrix.col_count):
            if old_col in merged_cols:
                # This column will be merged with next, skip it
                continue
            old_to_new[old_col] = new_col
            new_col += 1

        # Create new matrix
        new_matrix = TableMatrix()
        new_matrix.row_count = self.matrix.row_count
        new_matrix.col_count = new_col_count
        new_matrix.matrix = []

        # Build new matrix with merged cells
        for row_idx in range(self.matrix.row_count):
            # Pre-fill the row with empty placeholder cells.
            new_row = [MatrixCell() for _ in range(new_col_count)]

            for old_col in range(self.matrix.col_count):
                # Check if this is a symbol column to merge
                merge_pair = next((pair for pair in self.merge_pairs if pair[0] == old_col), None)

                if merge_pair:
                    # Merge symbol with value
                    symbol_col, value_col = merge_pair
                    symbol_cell = self.matrix.matrix[row_idx][symbol_col]
                    value_cell = self.matrix.matrix[row_idx][value_col]

                    if value_cell.original_cell:
                        # Create merged cell
                        new_cell_content = self._merge_cell_content(symbol_cell, value_cell)
                        if new_cell_content:
                            # Create new merged cell; span/alignment metadata
                            # is carried over from the VALUE cell.
                            merged_cell = Cell(
                                content=new_cell_content,
                                colspan=value_cell.original_cell.colspan,
                                rowspan=value_cell.original_cell.rowspan,
                                is_header=value_cell.original_cell.is_header,
                                align=value_cell.original_cell.align
                            )

                            new_col_idx = old_to_new.get(value_col)
                            if new_col_idx is not None:
                                new_row[new_col_idx] = MatrixCell(
                                    original_cell=merged_cell,
                                    is_spanned=False,
                                    row_origin=row_idx,
                                    col_origin=new_col_idx
                                )

                elif old_col not in set(pair[1] for pair in self.merge_pairs):
                    # Regular column, not involved in merging
                    new_col_idx = old_to_new.get(old_col)
                    if new_col_idx is not None:
                        new_row[new_col_idx] = self.matrix.matrix[row_idx][old_col]

            new_matrix.matrix.append(new_row)

        return new_matrix

    def _merge_cell_content(self, symbol_cell: MatrixCell, value_cell: MatrixCell) -> str:
        """
        Merge symbol and value cell contents.

        Returns:
            Merged content like "$224.11" or original value if no symbol
        """
        value_text = value_cell.original_cell.text().strip() if value_cell.original_cell else ""
        symbol_text = symbol_cell.original_cell.text().strip() if symbol_cell.original_cell else ""

        if not value_text:
            return symbol_text  # Just return symbol if no value

        if symbol_text in self.CURRENCY_SYMBOLS:
            # Merge symbol with value (no space for $, others may vary)
            if symbol_text == '$':
                return f"${value_text}"
            else:
                return f"{symbol_text}{value_text}"
        else:
            # No symbol, just return value
            return value_text

    def get_merge_summary(self) -> str:
        """Get a summary of merges to be applied."""
        if not self.merge_pairs:
            return "No currency column merges detected"

        summary = f"Currency merges detected: {len(self.merge_pairs)} pairs\n"
        for symbol_col, value_col in self.merge_pairs:
            summary += f"  • Column {symbol_col} ($) + Column {value_col} (value)\n"

        return summary
|
||||
@@ -0,0 +1,96 @@
|
||||
"""
|
||||
HTML utility functions for document parsing.
|
||||
|
||||
This module consolidates common HTML processing utilities used across
|
||||
the parser, preprocessor, and simple parser implementations.
|
||||
"""
|
||||
|
||||
import lxml.html
|
||||
from typing import Optional
|
||||
|
||||
|
||||
def remove_xml_declaration(html: str) -> str:
    """
    Remove XML declaration from HTML if present.

    SEC HTML documents sometimes include XML declarations like:
    <?xml version="1.0" encoding="UTF-8"?>

    These can interfere with HTML parsing and are safely removed since
    the encoding is handled separately by the parser.

    Args:
        html: HTML string that may contain XML declaration

    Returns:
        HTML string with XML declaration removed (if present). Input with
        a malformed declaration (no closing ``?>``) is returned unchanged.

    Examples:
        >>> html = '<?xml version="1.0"?><!DOCTYPE html><html>...'
        >>> remove_xml_declaration(html)
        '<!DOCTYPE html><html>...'

        >>> html = '<!DOCTYPE html><html>...'  # No XML declaration
        >>> remove_xml_declaration(html)
        '<!DOCTYPE html><html>...'
    """
    if html.lstrip().startswith('<?xml'):
        xml_end = html.find('?>')
        # Fixed: previously a missing '?>' made find() return -1 and the
        # function returned html[1:], silently corrupting the input.
        if xml_end != -1:
            return html[xml_end + 2:]
    return html
|
||||
|
||||
|
||||
def create_lxml_parser(
    remove_blank_text: bool = True,
    remove_comments: bool = True,
    recover: bool = True,
    encoding: Optional[str] = 'utf-8'
) -> lxml.html.HTMLParser:
    """
    Build an lxml HTMLParser with the project's standard settings.

    Centralises parser configuration so the main parser, preprocessor and
    simple parser all use identical lxml settings.

    Args:
        remove_blank_text: Drop whitespace-only text nodes between tags
            (default True for a cleaner tree).
        remove_comments: Drop HTML comments from the parsed tree
            (default True; comments are rarely needed).
        recover: Recover from malformed HTML instead of failing
            (default True — SEC filings often have HTML issues).
        encoding: Character encoding for the parser. Default 'utf-8';
            pass None to skip explicit encoding handling.

    Returns:
        Configured lxml.html.HTMLParser instance

    Examples:
        >>> parser = create_lxml_parser()  # standard settings
        >>> parser = create_lxml_parser(remove_blank_text=False,
        ...                             remove_comments=False)  # preserve all (XBRL)
        >>> parser = create_lxml_parser(encoding=None)  # auto-detect encoding

    Note:
        recover=True is critical for SEC documents, which often contain
        non-standard HTML structures.
    """
    options = dict(
        remove_blank_text=remove_blank_text,
        remove_comments=remove_comments,
        recover=recover,
    )
    # Encoding is passed through only when explicitly requested.
    if encoding is not None:
        options['encoding'] = encoding

    return lxml.html.HTMLParser(**options)
|
||||
@@ -0,0 +1,375 @@
|
||||
"""
|
||||
Streaming parser for large HTML documents.
|
||||
"""
|
||||
|
||||
import io
|
||||
from typing import Dict, Any, TYPE_CHECKING
|
||||
|
||||
from lxml import etree
|
||||
from lxml.html import HtmlElement
|
||||
|
||||
from edgar.documents.config import ParserConfig
|
||||
from edgar.documents.exceptions import HTMLParsingError, DocumentTooLargeError
|
||||
|
||||
# Use TYPE_CHECKING to avoid circular imports
|
||||
if TYPE_CHECKING:
|
||||
from edgar.documents.document import Document, DocumentMetadata
|
||||
from edgar.documents.nodes import DocumentNode, HeadingNode, ParagraphNode, TextNode, SectionNode, ContainerNode
|
||||
from edgar.documents.table_nodes import TableNode
|
||||
from edgar.documents.types import SemanticType
|
||||
|
||||
|
||||
class StreamingParser:
    """
    Streaming parser for large HTML documents.

    Processes documents via lxml's iterparse event stream to minimize
    memory usage while maintaining parse quality.
    """

    # Chunk size for streaming (1MB)
    CHUNK_SIZE = 1024 * 1024

    # Maximum number of buffered nodes before they are flushed to the tree
    MAX_NODE_BUFFER = 1000

    def __init__(self, config: ParserConfig, strategies: Dict[str, Any]):
        """
        Initialize streaming parser.

        Args:
            config: Parser configuration
            strategies: Parsing strategies to use. Recognised keys:
                'header_detection', 'style_parser', 'table_processing'.
        """
        self.config = config
        self.strategies = strategies
        self._reset_state()

    def _reset_state(self):
        """Reset all mutable parser state ahead of a new parse."""
        # Import here to avoid circular import
        from edgar.documents.document import DocumentMetadata
        from edgar.documents.nodes import DocumentNode

        self.current_section = None
        self.node_buffer = []
        self.metadata = DocumentMetadata()
        self.root = DocumentNode()
        self.current_parent = self.root
        self.tag_stack = []
        self.text_buffer = []
        self.in_table = False
        self.table_buffer = []
        self.bytes_processed = 0

    def parse(self, html: str) -> "Document":
        """
        Parse HTML in streaming mode.

        Args:
            html: HTML content to parse

        Returns:
            Parsed Document

        Raises:
            DocumentTooLargeError: If document exceeds size limit
            HTMLParsingError: If parsing fails
        """
        self._reset_state()

        # Store original HTML BEFORE parsing (needed for TOC-based section detection)
        original_html = html

        try:
            # Create streaming parser over an in-memory byte stream
            parser = etree.iterparse(
                io.BytesIO(html.encode('utf-8')),
                events=('start', 'end'),
                html=True,
                recover=True,
                encoding='utf-8'
            )

            # Process events
            for event, elem in parser:
                self._process_event(event, elem)

                # Enforce size limit.
                # NOTE(review): serialising every element re-counts child
                # content, so this over-estimates document size — confirm
                # intent before tightening the limit.
                self.bytes_processed += len(etree.tostring(elem, encoding='unicode', method='html'))
                if self.bytes_processed > self.config.max_document_size:
                    raise DocumentTooLargeError(self.bytes_processed, self.config.max_document_size)

                # Flush buffer if needed
                if len(self.node_buffer) >= self.MAX_NODE_BUFFER:
                    self._flush_buffer()

                # Clean up processed elements to save memory
                elem.clear()
                while elem.getprevious() is not None:
                    parent = elem.getparent()
                    if parent is not None:
                        del parent[0]
                    else:
                        break

            # Final flush
            self._flush_buffer()

            # Store original HTML in metadata for section detection (TOC analysis)
            self.metadata.original_html = original_html

            # Create document (import here to avoid circular import)
            from edgar.documents.document import Document
            document = Document(root=self.root, metadata=self.metadata)

            # Store config reference (required for section detection)
            document._config = self.config

            # Apply post-processing
            from edgar.documents.processors.postprocessor import DocumentPostprocessor
            postprocessor = DocumentPostprocessor(self.config)
            document = postprocessor.process(document)

            return document

        except etree.ParseError as e:
            raise HTMLParsingError(f"Streaming parse failed: {str(e)}")
        except Exception as e:
            # Let our own exceptions propagate untouched
            if isinstance(e, (DocumentTooLargeError, HTMLParsingError)):
                raise
            raise HTMLParsingError(f"Unexpected error during streaming parse: {str(e)}")

    def _process_event(self, event: str, elem: HtmlElement):
        """Dispatch a single iterparse event to the start/end handlers."""
        if event == 'start':
            self._handle_start_tag(elem)
        elif event == 'end':
            self._handle_end_tag(elem)

    def _handle_start_tag(self, elem: HtmlElement):
        """Handle an opening tag: track state, collect metadata, open nodes."""
        # Import node types at runtime to avoid circular imports
        from edgar.documents.nodes import ContainerNode

        tag = elem.tag.lower()

        # Track tag stack
        self.tag_stack.append(tag)

        # Extract metadata from early elements
        if tag == 'title' and elem.text:
            self._extract_title_metadata(elem.text)
        elif tag == 'meta':
            self._extract_meta_metadata(elem)

        # Handle specific tags
        if tag == 'body':
            # Create a container for body content
            body_container = ContainerNode(tag_name='body')
            self.root.add_child(body_container)
            self.current_parent = body_container
        elif tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
            self._start_heading(elem)
        elif tag == 'p':
            self._start_paragraph(elem)
        elif tag == 'table':
            self._start_table(elem)
        elif tag == 'section':
            self._start_section(elem)

    def _handle_end_tag(self, elem: HtmlElement):
        """Handle a closing tag: finalise the matching structural node."""
        tag = elem.tag.lower()

        # Remove from tag stack
        if self.tag_stack and self.tag_stack[-1] == tag:
            self.tag_stack.pop()

        # Handle specific tags
        if tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
            self._end_heading(elem)
        elif tag == 'p':
            self._end_paragraph(elem)
        elif tag == 'table':
            self._end_table(elem)
        elif tag == 'section':
            self._end_section(elem)
        elif tag == 'body':
            # When body ends, flush any remaining nodes
            self._flush_buffer()

        # Accumulate loose text, skipping whitespace-only fragments
        if elem.text:
            stripped = elem.text.strip()
            if stripped:
                self.text_buffer.append(stripped)
        if elem.tail:
            stripped = elem.tail.strip()
            if stripped:
                self.text_buffer.append(stripped)

    def _start_heading(self, elem: HtmlElement):
        """Start processing a heading element (h1-h6)."""
        # Import at runtime to avoid circular imports.
        # BUG FIX: SemanticType was previously imported only under
        # TYPE_CHECKING, so referencing it below raised NameError at runtime
        # whenever a header-detection strategy was configured.
        from edgar.documents.nodes import HeadingNode
        from edgar.documents.types import SemanticType

        level = int(elem.tag[1])
        text = self._get_text_content(elem)

        # Create heading node
        heading = HeadingNode(
            level=level,
            content=text
        )

        # Check if this is a section header
        if self.strategies.get('header_detection'):
            detector = self.strategies['header_detection']
            if detector.is_section_header(text, elem):
                heading.semantic_type = SemanticType.SECTION_HEADER

        self.node_buffer.append(heading)

    def _end_heading(self, elem: HtmlElement):
        """Finish a heading: refresh its text now the subtree is complete."""
        # Import node types at runtime to avoid circular imports
        from edgar.documents.nodes import HeadingNode

        # Re-extract text: at the 'start' event the element may not yet
        # have contained all of its children.
        text = self._get_text_content(elem)
        if text and self.node_buffer and isinstance(self.node_buffer[-1], HeadingNode):
            self.node_buffer[-1].content = text

        # Clear any accumulated text buffer
        self.text_buffer.clear()

    def _start_paragraph(self, elem: HtmlElement):
        """Start processing a paragraph element."""
        # Import node types at runtime to avoid circular imports
        from edgar.documents.nodes import ParagraphNode

        para = ParagraphNode()

        # Get style if present
        style_attr = elem.get('style')
        if style_attr and self.strategies.get('style_parser'):
            style_parser = self.strategies['style_parser']
            para.style = style_parser.parse(style_attr)

        self.node_buffer.append(para)

    def _end_paragraph(self, elem: HtmlElement):
        """Finish a paragraph: attach its text content as a child node."""
        # Import node types at runtime to avoid circular imports
        from edgar.documents.nodes import ParagraphNode, TextNode

        # Get text content from element
        text = self._get_text_content(elem)
        if text and self.node_buffer and isinstance(self.node_buffer[-1], ParagraphNode):
            text_node = TextNode(content=text)
            self.node_buffer[-1].add_child(text_node)

        # Clear any accumulated text buffer
        self.text_buffer.clear()

    def _start_table(self, elem: HtmlElement):
        """Start processing a table."""
        self.in_table = True
        self.table_buffer = []

        # Store table element for later processing
        self.table_elem = elem

    def _end_table(self, elem: HtmlElement):
        """Finish a table: delegate to the table-processing strategy if any."""
        # Import node types at runtime to avoid circular imports
        from edgar.documents.table_nodes import TableNode

        self.in_table = False

        # Process table with table processor if available
        if self.strategies.get('table_processing'):
            processor = self.strategies['table_processing']
            table_node = processor.process(elem)
            if table_node:
                self.node_buffer.append(table_node)
        else:
            # No strategy configured: emit a bare table node as a placeholder
            table = TableNode()
            self.node_buffer.append(table)

        self.table_buffer.clear()

    def _start_section(self, elem: HtmlElement):
        """Start a <section>, capturing its id/class attributes as metadata."""
        # Import node types at runtime to avoid circular imports
        from edgar.documents.nodes import SectionNode

        section = SectionNode()

        # Get section attributes
        section_id = elem.get('id')
        if section_id:
            section.metadata['id'] = section_id

        section_class = elem.get('class')
        if section_class:
            section.metadata['class'] = section_class

        self.current_section = section
        self.node_buffer.append(section)

    def _end_section(self, elem: HtmlElement):
        """End a section: subsequent nodes attach to the parent again."""
        self.current_section = None

    def _flush_buffer(self):
        """Move buffered nodes into the document tree."""
        for node in self.node_buffer:
            # Nodes created inside a <section> belong to that section
            if self.current_section:
                self.current_section.add_child(node)
            else:
                self.current_parent.add_child(node)

        self.node_buffer.clear()

    def _get_text_content(self, elem: HtmlElement) -> str:
        """Recursively extract whitespace-normalised text from an element."""
        text_parts = []

        # BUG FIX: empty strings were previously appended after strip(),
        # which produced doubled spaces in the joined result.
        if elem.text:
            stripped = elem.text.strip()
            if stripped:
                text_parts.append(stripped)

        for child in elem:
            child_text = self._get_text_content(child)
            if child_text:
                text_parts.append(child_text)
            if child.tail:
                stripped = child.tail.strip()
                if stripped:
                    text_parts.append(stripped)

        return ' '.join(text_parts)

    def _extract_title_metadata(self, title: str):
        """Parse company/form/date from a title like 'APPLE INC - 10-K - 2023-09-30'."""
        parts = title.split(' - ')
        if len(parts) >= 2:
            self.metadata.company = parts[0].strip()
            self.metadata.form = parts[1].strip()
        if len(parts) >= 3:
            self.metadata.filing_date = parts[2].strip()

    def _extract_meta_metadata(self, elem: HtmlElement):
        """Copy recognised <meta name=... content=...> values into metadata."""
        name = elem.get('name', '').lower()
        content = elem.get('content', '')

        if name and content:
            if name == 'company':
                self.metadata.company = content
            elif name == 'filing-type':
                self.metadata.form = content
            elif name == 'cik':
                self.metadata.cik = content
            elif name == 'filing-date':
                self.metadata.filing_date = content
            elif name == 'accession-number':
                self.metadata.accession_number = content
||||
@@ -0,0 +1,858 @@
|
||||
"""
|
||||
Table matrix builder for handling complex colspan/rowspan structures.
|
||||
"""
|
||||
|
||||
from dataclasses import dataclass
|
||||
from typing import List, Optional
|
||||
|
||||
from edgar.documents.table_nodes import Cell, Row
|
||||
|
||||
|
||||
@dataclass
class MatrixCell:
    """
    One position in the expanded table grid.

    Holds a reference back to the original Cell; a merged
    (colspan/rowspan) cell appears at several grid positions, all
    pointing to the same original Cell.
    """
    original_cell: Optional[Cell] = None  # None for empty grid positions
    is_spanned: bool = False  # True if this is part of a colspan/rowspan
    row_origin: int = -1  # Original row index
    col_origin: int = -1  # Original column index
|
||||
|
||||
|
||||
class TableMatrix:
|
||||
"""
|
||||
Build a 2D matrix representation of table with proper handling of merged cells.
|
||||
|
||||
This class converts a table with colspan/rowspan into a regular 2D grid
|
||||
where each merged cell occupies multiple positions in the matrix.
|
||||
"""
|
||||
|
||||
def __init__(self):
    """Initialize an empty matrix (populate via build_from_rows)."""
    # Grid of MatrixCell; merged cells occupy multiple positions.
    self.matrix: List[List[MatrixCell]] = []
    self.row_count = 0
    self.col_count = 0
    self.header_row_count = 0  # Track number of header rows
|
||||
|
||||
def build_from_rows(self, header_rows: List[List[Cell]], data_rows: List[Row]) -> 'TableMatrix':
    """
    Populate the matrix from header rows and data rows.

    Args:
        header_rows: Header rows, each given as a list of Cells
        data_rows: Data rows as Row objects

    Returns:
        Self for chaining
    """
    # Remember how many of the leading rows came from the header.
    self.header_row_count = len(header_rows)

    # Flatten headers and data into a single list of cell-lists.
    combined = list(header_rows) + [row.cells for row in data_rows]

    if not combined:
        return self

    self.row_count = len(combined)

    # Pass 1: work out how many columns the table really needs.
    self._calculate_dimensions(combined)

    # Allocate an empty grid of the computed size.
    self.matrix = [
        [MatrixCell() for _ in range(self.col_count)]
        for _ in range(self.row_count)
    ]

    # Pass 2: drop each cell into its grid positions.
    self._place_cells(combined)

    return self
|
||||
|
||||
def _calculate_dimensions(self, rows: List[List[Cell]]):
    """
    Compute self.col_count as the widest row, accounting for colspan.

    NOTE(review): _is_occupied consults self.matrix, but build_from_rows
    calls this method before the matrix is allocated, so the occupancy
    check below is typically a no-op at this stage — confirm whether
    rowspans should contribute to the width calculation here.
    """
    max_cols = 0

    for row_idx, row in enumerate(rows):
        col_pos = 0
        for cell in row:
            # Skip positions that might be occupied by rowspan from above
            while col_pos < max_cols and self._is_occupied(row_idx, col_pos):
                col_pos += 1

            # This cell will occupy from col_pos to col_pos + colspan
            col_end = col_pos + cell.colspan
            max_cols = max(max_cols, col_end)
            col_pos = col_end

    self.col_count = max_cols
|
||||
|
||||
def _is_occupied(self, row: int, col: int) -> bool:
    """Return True if (row, col) is covered by a rowspan originating above."""
    if row == 0:
        # Nothing can span down into the very first row.
        return False

    # Scan every earlier row for an origin cell whose rowspan reaches us.
    for earlier in range(row):
        if earlier >= len(self.matrix) or col >= len(self.matrix[earlier]):
            continue
        occupant = self.matrix[earlier][col]
        # Only origin cells (placed on their own row) can span downward.
        if occupant.original_cell and occupant.row_origin == earlier:
            if earlier + occupant.original_cell.rowspan > row:
                return True
    return False
|
||||
|
||||
def _place_cells(self, rows: List[List[Cell]]):
    """
    Place cells into the matrix, expanding colspan/rowspan into multiple
    grid positions. A cell's origin position gets is_spanned=False; every
    other position it covers gets is_spanned=True.
    """
    for row_idx, row in enumerate(rows):
        col_pos = 0

        for cell_idx, cell in enumerate(row):
            # Find next available column position
            while col_pos < self.col_count and self.matrix[row_idx][col_pos].original_cell is not None:
                col_pos += 1

            if col_pos >= self.col_count:
                # Need to expand matrix
                self._expand_columns(col_pos + cell.colspan)

            # Special handling for cells with colspan > 1 containing numeric values.
            # Only apply this logic for Table 15-style alignment issues:
            # check if this looks like a financial value that should be right-aligned.
            cell_text = cell.text().strip()

            # Check for numeric values that need special alignment.
            # This is specifically for cases like "167,045" that should align with "$167,045".
            has_comma_separator = ',' in cell_text
            digit_ratio = sum(c.isdigit() for c in cell_text) / len(cell_text) if cell_text else 0

            # Only apply special placement for colspan=2 numeric values in data rows.
            # This handles Table 15's specific case without breaking Table 13.
            # The month-name check excludes date strings like "Jan 1, 2023".
            is_special_numeric = (cell.colspan == 2 and  # Specifically colspan=2
                                  has_comma_separator and
                                  digit_ratio > 0.5 and  # More than 50% digits
                                  not cell_text.startswith('$') and
                                  not any(month in cell_text.lower() for month in
                                          ['jan', 'feb', 'mar', 'apr', 'may', 'jun',
                                           'jul', 'aug', 'sep', 'oct', 'nov', 'dec']) and
                                  row_idx > 1)  # Not a header row (allow for multi-row headers)

            if is_special_numeric:
                # Place empty cell at first position, content at second position.
                # This is specifically for Table 15 alignment.
                for r in range(cell.rowspan):
                    # First column of span: empty
                    if row_idx + r < self.row_count and col_pos < self.col_count:
                        self.matrix[row_idx + r][col_pos] = MatrixCell()

                    # Second column of span: the actual content
                    if row_idx + r < self.row_count and col_pos + 1 < self.col_count:
                        matrix_cell = MatrixCell(
                            original_cell=cell,
                            is_spanned=False,
                            row_origin=row_idx,
                            col_origin=col_pos + 1
                        )
                        self.matrix[row_idx + r][col_pos + 1] = matrix_cell

                    # Remaining columns of span: mark as spanned (though colspan=2 has no remaining)
                    for c in range(2, cell.colspan):
                        if row_idx + r < self.row_count and col_pos + c < self.col_count:
                            matrix_cell = MatrixCell(
                                original_cell=cell,
                                is_spanned=True,
                                row_origin=row_idx,
                                col_origin=col_pos + 1
                            )
                            self.matrix[row_idx + r][col_pos + c] = matrix_cell
            else:
                # Normal placement for other cells: fill the full
                # rowspan x colspan rectangle, origin at (row_idx, col_pos).
                for r in range(cell.rowspan):
                    for c in range(cell.colspan):
                        if row_idx + r < self.row_count and col_pos + c < self.col_count:
                            matrix_cell = MatrixCell(
                                original_cell=cell,
                                is_spanned=(r > 0 or c > 0),
                                row_origin=row_idx,
                                col_origin=col_pos
                            )
                            self.matrix[row_idx + r][col_pos + c] = matrix_cell

            col_pos += cell.colspan
|
||||
|
||||
def _expand_columns(self, new_col_count: int):
    """Widen every row to new_col_count columns (no-op if already wide enough)."""
    extra = new_col_count - self.col_count
    if extra <= 0:
        return

    # Pad each existing row with fresh empty cells.
    for row in self.matrix:
        row.extend(MatrixCell() for _ in range(extra))

    self.col_count = new_col_count
|
||||
|
||||
def get_actual_columns(self) -> int:
    """Count columns that carry real content (ignores pure spacing columns)."""
    count = 0

    for col in range(self.col_count):
        for row in range(self.row_count):
            entry = self.matrix[row][col]
            # Only origin cells count; spanned positions repeat their origin.
            if not entry.original_cell or entry.is_spanned:
                continue
            content = entry.original_cell.text().strip()
            # Treat whitespace-only / nbsp-only cells as empty.
            if content and content not in ('', ' ', '\xa0'):
                count += 1
                break

    return count
|
||||
|
||||
def get_column_widths(self) -> List[float]:
    """Estimate each column's width as its longest cell text (0 for spacing columns)."""
    widths: List[float] = []

    for col in range(self.col_count):
        longest = 0
        populated = 0

        for row in range(self.row_count):
            entry = self.matrix[row][col]
            if entry.original_cell and not entry.is_spanned:
                content = entry.original_cell.text().strip()
                if content:
                    longest = max(longest, len(content))
                    populated += 1

        # A column with no content at all is treated as spacing (width 0).
        widths.append(0 if populated == 0 else longest)

    return widths
|
||||
|
||||
def get_cell(self, row_idx: int, col_idx: int) -> Optional[Cell]:
|
||||
"""
|
||||
Get a cell at specific position in the matrix.
|
||||
|
||||
Args:
|
||||
row_idx: Row index
|
||||
col_idx: Column index
|
||||
|
||||
Returns:
|
||||
Cell at position or None if out of bounds
|
||||
"""
|
||||
if row_idx >= self.row_count or col_idx >= self.col_count or row_idx < 0 or col_idx < 0:
|
||||
return None
|
||||
|
||||
matrix_cell = self.matrix[row_idx][col_idx]
|
||||
|
||||
# Return the original cell
|
||||
if matrix_cell.original_cell:
|
||||
return matrix_cell.original_cell
|
||||
|
||||
# Return empty cell for empty positions
|
||||
return Cell("")
|
||||
|
||||
def get_expanded_row(self, row_idx: int) -> List[Optional[Cell]]:
|
||||
"""
|
||||
Get a row with cells expanded to match column count.
|
||||
|
||||
For cells with colspan > 1, the cell appears in the first position
|
||||
and None in subsequent positions.
|
||||
"""
|
||||
if row_idx >= self.row_count:
|
||||
return []
|
||||
|
||||
expanded = []
|
||||
for col_idx in range(self.col_count):
|
||||
matrix_cell = self.matrix[row_idx][col_idx]
|
||||
if matrix_cell.original_cell:
|
||||
if not matrix_cell.is_spanned:
|
||||
# This is the origin cell
|
||||
expanded.append(matrix_cell.original_cell)
|
||||
else:
|
||||
# This is a spanned position
|
||||
expanded.append(None)
|
||||
else:
|
||||
# Empty cell
|
||||
expanded.append(None)
|
||||
|
||||
return expanded
|
||||
|
||||
def get_data_columns(self) -> List[int]:
    """
    Return indices of columns worth keeping.

    Mirrors the old parser's behaviour: leading and trailing empty columns
    are dropped outright, while interior runs of empty columns are
    collapsed to a single spacer column.

    Returns:
        List of column indices that contain data (plus single spacers)
    """
    # Which columns have no visible content at all?
    empty_cols = set()
    for col_idx in range(self.col_count):
        column_is_empty = True
        for row_idx in range(self.row_count):
            entry = self.matrix[row_idx][col_idx]
            if entry.original_cell and not entry.is_spanned:
                if entry.original_cell.text().strip():
                    column_is_empty = False
                    break
        if column_is_empty:
            empty_cols.add(col_idx)

    removed = set()

    # Strip empty columns from the left edge...
    for col in range(self.col_count):
        if col not in empty_cols:
            break
        removed.add(col)

    # ...and from the right edge.
    for col in reversed(range(self.col_count)):
        if col not in empty_cols:
            break
        removed.add(col)

    # Collapse interior runs of empties: keep the first as a spacer.
    col = 0
    while col < self.col_count - 1:
        if col in empty_cols and (col + 1) in empty_cols:
            run_end = col
            while run_end < self.col_count and run_end in empty_cols:
                run_end += 1
            removed.update(range(col + 1, run_end))
            col = run_end
        else:
            col += 1

    # Everything not marked for removal survives.
    return [c for c in range(self.col_count) if c not in removed]
|
||||
|
||||
    def filter_spacing_columns(self) -> 'TableMatrix':
        """
        Create a new matrix with spacing columns removed.

        Also handles colspan-generated duplicate columns and misalignment.
        The method runs in phases:
          1. classify columns (primary headers, header content, data),
          2. detect misaligned fragments ($ / ( ) / %) to consolidate,
          3. decide which columns to keep,
          4. rebuild a new TableMatrix, merging consolidated fragments.

        Returns:
            New TableMatrix with only data columns (returns self unchanged
            if no columns would be kept)
        """
        # First pass: identify primary header columns (those with colspan > 1 headers)
        # and data columns
        primary_header_cols = set()
        all_header_cols = set()
        data_cols = set()

        # Find primary header columns (those that start a colspan)
        for row_idx in range(min(3, self.row_count)):
            for col_idx in range(self.col_count):
                cell = self.matrix[row_idx][col_idx]
                if cell.original_cell and not cell.is_spanned:
                    if cell.original_cell.text().strip():
                        all_header_cols.add(col_idx)
                        # Check if this is a primary header (colspan > 1)
                        if cell.original_cell.colspan > 1:
                            primary_header_cols.add(col_idx)

        # If no primary headers found, use all headers as primary
        if not primary_header_cols:
            primary_header_cols = all_header_cols

        # Phase 1.5: Identify columns with header content
        # Any column with non-empty text in ANY header row must be preserved
        # This prevents legitimate header columns from being removed as "spacing"
        # Also preserve columns that are spanned by headers (colspan > 1)
        header_content_columns = set()
        for col_idx in range(self.col_count):
            for row_idx in range(self.header_row_count):
                cell = self.matrix[row_idx][col_idx]
                if cell.original_cell:
                    # Check for original header cell with content
                    if not cell.is_spanned:
                        text = cell.original_cell.text().strip()
                        if text:
                            header_content_columns.add(col_idx)
                            # Also add all columns spanned by this header
                            if cell.original_cell.colspan > 1:
                                for span_offset in range(1, cell.original_cell.colspan):
                                    span_col = col_idx + span_offset
                                    if span_col < self.col_count:
                                        header_content_columns.add(span_col)
                            break  # Found content, no need to check other header rows
                    # Also preserve columns that are spanned (part of a colspan)
                    elif cell.is_spanned:
                        # This column is part of a header's colspan
                        text = cell.original_cell.text().strip()
                        if text:
                            header_content_columns.add(col_idx)

        # Find columns with data (skip header rows)
        # Count actual header rows by checking for non-data content
        actual_header_rows = 0
        for row_idx in range(min(3, self.row_count)):
            has_numeric_data = False
            for col_idx in range(self.col_count):
                cell = self.matrix[row_idx][col_idx]
                if cell.original_cell and not cell.is_spanned:
                    text = cell.original_cell.text().strip()
                    # Check if it looks like numeric data (has commas or starts with $)
                    # NOTE(review): operator precedence makes this
                    # (text and A) or text == '$' — presumably intended; confirm.
                    if text and (',' in text and any(c.isdigit() for c in text)) or text == '$':
                        has_numeric_data = True
                        break
            if has_numeric_data:
                break
            actual_header_rows += 1

        # At least one header row is always assumed.
        data_start_row = max(1, actual_header_rows)

        # Track columns with significant data (not just isolated cells)
        col_data_count = {}
        for row_idx in range(data_start_row, self.row_count):
            for col_idx in range(self.col_count):
                cell = self.matrix[row_idx][col_idx]
                if cell.original_cell and not cell.is_spanned:
                    if cell.original_cell.text().strip():
                        data_cols.add(col_idx)
                        col_data_count[col_idx] = col_data_count.get(col_idx, 0) + 1

        # Build initial list of columns to keep
        # Always include column 0 if it contains row labels
        cols_to_keep = set(primary_header_cols)

        # Add columns with header content (prevents removing legitimate headers)
        cols_to_keep.update(header_content_columns)

        # Identify misaligned data columns that need to be consolidated
        # These are data columns that are not primary header columns
        misaligned_data_cols = data_cols - primary_header_cols

        # Map misaligned data columns to their nearest column for consolidation
        # Only consolidate directly adjacent columns with specific patterns
        consolidation_map = {}

        # First pass: identify all potential consolidations
        potential_consolidations = {}
        for data_col in sorted(misaligned_data_cols):
            # Check if this column should be consolidated with an adjacent column
            # Check the column immediately before this one
            prev_col = data_col - 1

            # Sample some cells to see if consolidation makes sense
            consolidation_type = None

            for row_idx in range(data_start_row, min(data_start_row + 10, self.row_count)):
                prev_cell = self.matrix[row_idx][prev_col] if prev_col >= 0 else None
                curr_cell = self.matrix[row_idx][data_col]

                if prev_cell and prev_cell.original_cell and curr_cell.original_cell:
                    prev_text = prev_cell.original_cell.text().strip()
                    curr_text = curr_cell.original_cell.text().strip()

                    # Skip empty cells
                    if not prev_text or not curr_text:
                        continue

                    # Check for patterns that indicate consolidation
                    if prev_text == '$' and curr_text and curr_text[0].isdigit():
                        consolidation_type = 'currency'
                        break
                    elif prev_text.startswith('(') and curr_text == ')':
                        consolidation_type = 'parentheses'
                        break
                    elif curr_text == '%' and prev_text and prev_text[-1].isdigit():
                        consolidation_type = 'percentage'
                        break

            if consolidation_type:
                potential_consolidations[data_col] = (prev_col, consolidation_type)

        # Second pass: resolve conflicts
        # If column Y is a target for consolidation from Y+1 (e.g., parentheses),
        # then don't consolidate Y into another column
        columns_needed_as_targets = set()
        for data_col, (target_col, cons_type) in potential_consolidations.items():
            if cons_type == 'parentheses':
                # This target column is needed for parentheses consolidation
                columns_needed_as_targets.add(target_col)

        # Build final consolidation map, skipping consolidations that would remove needed targets
        for data_col, (target_col, cons_type) in potential_consolidations.items():
            # Don't consolidate this column if it's needed as a target for parentheses
            if data_col in columns_needed_as_targets and cons_type != 'parentheses':
                continue

            # CRITICAL: Don't consolidate columns that have header content
            # This prevents legitimate header columns from being merged together
            if data_col in header_content_columns or target_col in header_content_columns:
                continue

            consolidation_map[data_col] = target_col
            # Debug: uncomment to see consolidation mapping
            # import os
            # if os.environ.get('DEBUG_TABLE_CONSOLIDATION'):
            #     print(f"Consolidating column {data_col} into {target_col}")

        # Special case: Keep data columns that are associated with header columns
        # This handles cases where headers span multiple columns but data is in specific columns
        for header_col in primary_header_cols:
            # Check if there's a data column immediately after the header column
            # This is common when headers span multiple columns
            for offset in range(1, 3):  # Check next 1-2 columns
                data_col = header_col + offset
                if data_col in data_cols and data_col not in cols_to_keep:
                    # Check if this column has meaningful data
                    has_data = False
                    for row_idx in range(data_start_row, min(data_start_row + 5, self.row_count)):
                        cell = self.matrix[row_idx][data_col]
                        if cell.original_cell and not cell.is_spanned:
                            text = cell.original_cell.text().strip()
                            if text and text not in ['', '-', '—', '–']:
                                has_data = True
                                break
                    if has_data:
                        cols_to_keep.add(data_col)

        # Keep data columns that have significant content but aren't near header columns
        # This includes columns with dates, text descriptions, etc.
        for col_idx in data_cols:
            if col_idx not in cols_to_keep:
                # Check if this column has important data
                has_important_data = False
                non_empty_count = 0
                text_samples = []

                for row_idx in range(data_start_row, min(data_start_row + 10, self.row_count)):
                    cell = self.matrix[row_idx][col_idx]
                    if cell.original_cell and not cell.is_spanned:
                        text = cell.original_cell.text().strip()
                        if text and text not in ['', '-', '—', '–']:
                            non_empty_count += 1
                            if len(text_samples) < 3:
                                text_samples.append(text)

                            # Check for important patterns
                            # Dates, years, text descriptions, etc.
                            if any([
                                len(text) > 3 and not text.replace(',', '').replace('.', '').isdigit(),  # Non-trivial text
                                any(month in text for month in ['January', 'February', 'March', 'April', 'May', 'June',
                                                                 'July', 'August', 'September', 'October', 'November', 'December']),
                                any(month in text for month in ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                                                                 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']),
                                '20' in text and any(c.isdigit() for c in text),  # Likely contains year
                            ]):
                                has_important_data = True

                # Keep columns with consistent important data
                if has_important_data and non_empty_count >= 3:
                    cols_to_keep.add(col_idx)

        # Special case: If we have very few primary headers but lots of data columns,
        # we might have a table where headers are in data rows (like years)
        # Keep columns that have significant financial data
        if len(primary_header_cols) <= 2 and len(data_cols) > 4:
            # Check for financial data patterns in columns
            for col_idx in data_cols:
                has_financial_data = False
                sample_count = 0

                # Sample a few cells from this column
                for row_idx in range(data_start_row, min(data_start_row + 5, self.row_count)):
                    cell = self.matrix[row_idx][col_idx]
                    if cell.original_cell and not cell.is_spanned:
                        text = cell.original_cell.text().strip()
                        if text:
                            sample_count += 1
                            # Check for financial patterns
                            if any([
                                text.startswith('(') and any(c.isdigit() for c in text),  # Negative numbers
                                text == ')' and col_idx > 0,  # Closing parenthesis
                                '$' in text,  # Currency
                                '%' in text,  # Percentages
                                text.replace(',', '').replace('.', '').isdigit(),  # Plain numbers
                                text in ['—', '–', '-', '*']  # Common placeholders
                            ]):
                                has_financial_data = True
                                break

                # Keep columns with financial data
                if has_financial_data and sample_count > 0:
                    cols_to_keep.add(col_idx)

        # Check if column 0 contains row labels (non-empty cells in data rows)
        col_0_has_labels = False
        data_start_row = max(1, actual_header_rows)
        for row_idx in range(data_start_row, self.row_count):
            cell = self.matrix[row_idx][0]
            if cell.original_cell and not cell.is_spanned:
                text = cell.original_cell.text().strip()
                # Labels are multi-char text that is neither a number nor a $ amount
                if text and not text.isdigit() and not text.startswith('$') and len(text) > 1:
                    col_0_has_labels = True
                    break

        # Include column 0 if it has labels
        if col_0_has_labels:
            cols_to_keep.add(0)

        # Remove columns that will be consolidated into other columns
        # These columns' data will be merged into their target columns
        cols_to_remove = set(consolidation_map.keys())
        cols_to_keep = cols_to_keep - cols_to_remove

        cols_to_keep = sorted(cols_to_keep)

        # Create new matrix with consolidated columns
        if not cols_to_keep:
            return self

        new_matrix = TableMatrix()
        new_matrix.row_count = self.row_count
        new_matrix.col_count = len(cols_to_keep)
        new_matrix.header_row_count = self.header_row_count  # Preserve header row count
        new_matrix.matrix = []

        # Create mapping from old to new column indices
        old_to_new = {old_col: new_idx for new_idx, old_col in enumerate(cols_to_keep)}

        # Build new matrix with consolidation
        for row_idx in range(self.row_count):
            new_row = [MatrixCell() for _ in range(new_matrix.col_count)]

            # Track which cells we've already placed to handle colspan properly
            placed_origins = {}  # Maps (row_origin, col_origin) to new column index

            # First, copy cells from kept columns
            for old_col in sorted(cols_to_keep):
                if old_col not in old_to_new:
                    continue
                new_col = old_to_new[old_col]
                cell = self.matrix[row_idx][old_col]
                if cell.original_cell:
                    origin_key = (cell.row_origin, cell.col_origin)

                    # Check if we've already placed this cell (due to colspan)
                    if origin_key in placed_origins:
                        # This is a continuation of a colspan - mark as spanned
                        new_row[new_col] = MatrixCell(
                            original_cell=cell.original_cell,
                            is_spanned=True,  # Mark as spanned since it's part of a colspan
                            row_origin=cell.row_origin,
                            col_origin=placed_origins[origin_key]  # Point to the original placement
                        )
                    else:
                        # First occurrence of this cell - place normally
                        new_row[new_col] = MatrixCell(
                            original_cell=cell.original_cell,
                            is_spanned=False,  # This is the primary cell
                            row_origin=cell.row_origin,
                            col_origin=new_col
                        )
                        placed_origins[origin_key] = new_col

            # Then, consolidate misaligned data into header columns
            for data_col, header_col in consolidation_map.items():
                if header_col in old_to_new:
                    new_col = old_to_new[header_col]
                    data_cell = self.matrix[row_idx][data_col] if data_col < len(self.matrix[row_idx]) else None


                    # If data cell has content, merge it with header column
                    if data_cell and data_cell.original_cell and not data_cell.is_spanned:
                        # Skip empty data cells
                        if not data_cell.original_cell.text().strip():
                            continue
                        # Check the original header column cell to see if it has content to merge
                        header_cell = self.matrix[row_idx][header_col]
                        existing_cell = new_row[new_col]

                        # Check if we need to merge (e.g., $ with value)
                        if header_cell.original_cell and header_cell.original_cell.text().strip():
                            existing_text = header_cell.original_cell.text().strip()
                            new_text = data_cell.original_cell.text().strip()


                            # Merge currency symbol with value OR value with percentage OR parentheses
                            if existing_text == '$' and new_text:
                                # Currency merge: $ + number
                                merged_text = f"${new_text}"
                                # Create new cell with merged content
                                merged_cell = Cell(
                                    content=merged_text,
                                    colspan=header_cell.original_cell.colspan,
                                    rowspan=header_cell.original_cell.rowspan,
                                    is_header=header_cell.original_cell.is_header,
                                    align=data_cell.original_cell.align if hasattr(data_cell.original_cell, 'align') else None
                                )
                                new_row[new_col] = MatrixCell(
                                    original_cell=merged_cell,
                                    is_spanned=False,
                                    row_origin=row_idx,
                                    col_origin=new_col
                                )
                            elif new_text == ')' and existing_text.startswith('('):
                                # Parentheses merge: (number + )
                                merged_text = f"{existing_text})"
                                # Create new cell with merged content
                                merged_cell = Cell(
                                    content=merged_text,
                                    colspan=header_cell.original_cell.colspan,
                                    rowspan=header_cell.original_cell.rowspan,
                                    is_header=header_cell.original_cell.is_header,
                                    align=data_cell.original_cell.align if hasattr(data_cell.original_cell, 'align') else None
                                )
                                new_row[new_col] = MatrixCell(
                                    original_cell=merged_cell,
                                    is_spanned=False,
                                    row_origin=row_idx,
                                    col_origin=new_col
                                )
                            elif new_text == '%' and existing_text:
                                # Percentage merge: number + %
                                merged_text = f"{existing_text}%"
                                # Create new cell with merged content
                                merged_cell = Cell(
                                    content=merged_text,
                                    colspan=header_cell.original_cell.colspan,
                                    rowspan=header_cell.original_cell.rowspan,
                                    is_header=header_cell.original_cell.is_header,
                                    align=header_cell.original_cell.align if hasattr(header_cell.original_cell, 'align') else None
                                )
                                new_row[new_col] = MatrixCell(
                                    original_cell=merged_cell,
                                    is_spanned=False,
                                    row_origin=row_idx,
                                    col_origin=new_col
                                )
                            else:
                                # Just keep the data cell if can't merge
                                new_row[new_col] = MatrixCell(
                                    original_cell=data_cell.original_cell,
                                    is_spanned=False,
                                    row_origin=row_idx,
                                    col_origin=new_col
                                )
                        else:
                            # No existing content, just move the data
                            new_row[new_col] = MatrixCell(
                                original_cell=data_cell.original_cell,
                                is_spanned=False,
                                row_origin=row_idx,
                                col_origin=new_col
                            )

            new_matrix.matrix.append(new_row)

        return new_matrix
|
||||
|
||||
def to_cell_grid(self) -> List[List[Optional[Cell]]]:
|
||||
"""
|
||||
Convert matrix to a simple 2D grid of cells.
|
||||
|
||||
Returns:
|
||||
2D list where each position contains either a Cell or None
|
||||
"""
|
||||
grid = []
|
||||
|
||||
for row_idx in range(self.row_count):
|
||||
row = []
|
||||
for col_idx in range(self.col_count):
|
||||
matrix_cell = self.matrix[row_idx][col_idx]
|
||||
if matrix_cell.original_cell and not matrix_cell.is_spanned:
|
||||
row.append(matrix_cell.original_cell)
|
||||
else:
|
||||
row.append(None)
|
||||
grid.append(row)
|
||||
|
||||
return grid
|
||||
|
||||
def debug_print(self):
|
||||
"""Print matrix structure for debugging"""
|
||||
print(f"Matrix: {self.row_count}×{self.col_count}")
|
||||
|
||||
for row_idx in range(self.row_count):
|
||||
row_str = []
|
||||
for col_idx in range(self.col_count):
|
||||
cell = self.matrix[row_idx][col_idx]
|
||||
if cell.original_cell:
|
||||
text = cell.original_cell.text()[:10]
|
||||
if cell.is_spanned:
|
||||
row_str.append(f"[{text}...]")
|
||||
else:
|
||||
row_str.append(f"{text}...")
|
||||
else:
|
||||
row_str.append("___")
|
||||
print(f"Row {row_idx}: {' | '.join(row_str)}")
|
||||
|
||||
|
||||
class ColumnAnalyzer:
    """Analyze column structure to identify data vs spacing columns."""

    def __init__(self, matrix: TableMatrix):
        """Initialize with the TableMatrix to analyze."""
        self.matrix = matrix

    def identify_spacing_columns(self) -> List[int]:
        """
        Identify columns used only for spacing.

        Note: the previous implementation computed column widths via
        get_column_widths() and passed them to _is_spacing_column, which
        ignored them entirely; that dead work has been removed.

        Returns:
            List of column indices that are spacing columns
        """
        return [col_idx for col_idx in range(self.matrix.col_count)
                if self._is_spacing_column(col_idx)]

    def _is_spacing_column(self, col_idx: int) -> bool:
        """
        Check if a column is used for spacing.

        Only marks a column as spacing if it is completely empty: no
        non-spanned cell in any row contains any text.
        """
        for row_idx in range(self.matrix.row_count):
            cell = self.matrix.matrix[row_idx][col_idx]
            if cell.original_cell and not cell.is_spanned:
                # Any text at all disqualifies the column as spacing.
                if cell.original_cell.text().strip():
                    return False
        return True

    def get_clean_column_indices(self) -> List[int]:
        """
        Get indices of non-spacing columns.

        Returns:
            List of column indices that contain actual data
        """
        spacing = set(self.identify_spacing_columns())
        return [i for i in range(self.matrix.col_count) if i not in spacing]
|
||||
@@ -0,0 +1,440 @@
|
||||
"""
|
||||
Table of Contents analyzer for SEC filings.
|
||||
|
||||
This module analyzes the TOC structure to map section names to anchor IDs,
|
||||
enabling section extraction for API filings with generated anchor IDs.
|
||||
"""
|
||||
import re
|
||||
from typing import Dict, List, Optional, Set, Tuple
|
||||
from dataclasses import dataclass
|
||||
from lxml import html as lxml_html
|
||||
|
||||
|
||||
@dataclass
class TOCSection:
    """Represents a section found in the Table of Contents."""
    name: str  # raw link text as it appeared in the TOC
    anchor_id: str  # target anchor id (the href without the leading '#')
    normalized_name: str  # canonical form, e.g. "Item 1A" or "Part II"
    section_type: str  # 'item', 'part', 'other'
    order: int  # sort key (Item 1 = 1000, Item 1A = 1001, Part I = 100, ...)
    part: Optional[str] = None  # NEW: "Part I", "Part II", or None for 10-K
|
||||
|
||||
|
||||
class TOCAnalyzer:
|
||||
"""
|
||||
Analyzes Table of Contents structure to map section names to anchor IDs.
|
||||
|
||||
This enables section extraction for filings where anchor IDs are generated
|
||||
rather than semantic (like API filings vs local HTML files).
|
||||
"""
|
||||
|
||||
    def __init__(self):
        """Build the regex patterns used to recognize SEC section links."""
        # SEC section patterns for normalization.
        # Each entry is (regex, section_type); matched case-insensitively
        # against link text when deciding whether a link is a section link.
        self.section_patterns = [
            (r'(?:item|part)\s+\d+[a-z]?', 'item'),
            (r'business', 'item'),
            (r'risk\s+factors?', 'item'),
            (r'properties', 'item'),
            (r'legal\s+proceedings', 'item'),
            (r'management.*discussion', 'item'),
            (r'md&a', 'item'),
            (r'financial\s+statements?', 'item'),
            (r'exhibits?', 'item'),
            (r'signatures?', 'item'),
            (r'part\s+[ivx]+', 'part'),
        ]
|
||||
|
||||
def analyze_toc_structure(self, html_content: str) -> Dict[str, str]:
|
||||
"""
|
||||
Analyze HTML content to extract section mappings from TOC.
|
||||
|
||||
Args:
|
||||
html_content: Raw HTML content
|
||||
|
||||
Returns:
|
||||
Dict mapping normalized section names to anchor IDs
|
||||
"""
|
||||
section_mapping = {}
|
||||
|
||||
try:
|
||||
# Handle XML declaration issues
|
||||
if html_content.startswith('<?xml'):
|
||||
html_content = re.sub(r'<\?xml[^>]*\?>', '', html_content, count=1)
|
||||
|
||||
tree = lxml_html.fromstring(html_content)
|
||||
|
||||
# Find all anchor links that could be TOC links
|
||||
anchor_links = tree.xpath('//a[@href]')
|
||||
|
||||
toc_sections = []
|
||||
current_part = None # Track current part context for 10-Q filings
|
||||
part_pattern = re.compile(r'^\s*Part\s+([IVX]+)\b', re.IGNORECASE)
|
||||
|
||||
for link in anchor_links:
|
||||
href = link.get('href', '').strip()
|
||||
text = (link.text_content() or '').strip()
|
||||
|
||||
# Check if this link or its row represents a part header
|
||||
# Part headers in 10-Q TOCs typically appear as separate rows: "Part I", "Part II"
|
||||
part_match = part_pattern.match(text)
|
||||
if part_match:
|
||||
# Update current part context
|
||||
current_part = f"Part {part_match.group(1).upper()}"
|
||||
# Don't create a section for the part header itself
|
||||
continue
|
||||
|
||||
# Look for internal anchor links
|
||||
if href.startswith('#') and text:
|
||||
anchor_id = href[1:] # Remove #
|
||||
|
||||
# Try to find item number in preceding context (for table-based TOCs)
|
||||
preceding_item = self._extract_preceding_item_label(link)
|
||||
|
||||
# Check if this looks like a section reference (check text, anchor ID, and context)
|
||||
if self._is_section_link(text, anchor_id, preceding_item):
|
||||
# Verify target exists
|
||||
target_elements = tree.xpath(f'//*[@id="{anchor_id}"]')
|
||||
if target_elements:
|
||||
# Try to extract item number from: anchor ID > preceding context > text
|
||||
normalized_name = self._normalize_section_name(text, anchor_id, preceding_item)
|
||||
section_type, order = self._get_section_type_and_order(normalized_name)
|
||||
|
||||
toc_section = TOCSection(
|
||||
name=text,
|
||||
anchor_id=anchor_id,
|
||||
normalized_name=normalized_name,
|
||||
section_type=section_type,
|
||||
order=order,
|
||||
part=current_part # Assign current part context
|
||||
)
|
||||
toc_sections.append(toc_section)
|
||||
|
||||
# Build mapping prioritizing the most standard section names
|
||||
section_mapping = self._build_section_mapping(toc_sections)
|
||||
|
||||
except Exception as e:
|
||||
# Return empty mapping on error - fallback to other methods
|
||||
pass
|
||||
|
||||
return section_mapping
|
||||
|
||||
    def _extract_preceding_item_label(self, link_element) -> str:
        """
        Extract item/part label from preceding context.

        Handles table-based TOCs where item number is in a separate cell:
        <td>Item 1.</td><td><a href="...">Business</a></td>

        Also handles nested structures like:
        <td>Item 1.</td><td><div><span><a href="...">Business</a></span></div></td>

        Args:
            link_element: The <a> element (an lxml element)

        Returns:
            Item label like "Item 1", "Item 1A", "Part I" or empty string
        """
        try:
            # Traverse up to find the containing <td> or <th> (up to 5 levels)
            current = link_element
            td_element = None

            for _ in range(5):
                parent = current.getparent()
                if parent is None:
                    break

                if parent.tag in ['td', 'th']:
                    td_element = parent
                    break

                current = parent

            # If we found a <td>, check ALL preceding siblings in the row
            # This handles TOCs where item number is not in the immediately adjacent cell
            # Example: ['Business', 'I', '1', '5'] where '1' is the item number
            if td_element is not None:
                # Check all preceding siblings (rightmost to leftmost)
                prev_sibling = td_element.getprevious()
                while prev_sibling is not None:
                    if prev_sibling.tag in ['td', 'th']:
                        prev_text = (prev_sibling.text_content() or '').strip()

                        # Look for "Item X" or just "X" (bare number) pattern
                        # Match full format: "Item 1A"
                        item_match = re.match(r'(Item\s+\d+[A-Z]?)\.?\s*$', prev_text, re.IGNORECASE)
                        if item_match:
                            return item_match.group(1)

                        # Match bare item number: "1A" or "1" (only valid 10-K item numbers: 1-15)
                        # This prevents page numbers (50, 108, etc.) from being treated as items
                        bare_item_match = re.match(r'^([1-9]|1[0-5])([A-Z]?)\.?\s*$', prev_text, re.IGNORECASE)
                        if bare_item_match:
                            item_num = bare_item_match.group(1)
                            item_letter = bare_item_match.group(2)
                            return f"Item {item_num}{item_letter}"

                        # Match part: "Part I" or just "I"
                        part_match = re.match(r'(Part\s+[IVX]+)\.?\s*$', prev_text, re.IGNORECASE)
                        if part_match:
                            return part_match.group(1)

                        # Match bare part: "I", "II", etc. (case-sensitive on purpose:
                        # lowercase "i"/"v" in a cell is more likely prose than a part number)
                        bare_part_match = re.match(r'^([IVX]+)\.?\s*$', prev_text)
                        if bare_part_match:
                            return f"Part {bare_part_match.group(1)}"

                    prev_sibling = prev_sibling.getprevious()

            # Also check immediate parent's text for inline patterns (div/span structures);
            # parent.text only covers text BEFORE the link, which is what we want.
            parent = link_element.getparent()
            if parent is not None and parent.tag in ['div', 'span', 'p']:
                if parent.text:
                    text_before = parent.text.strip()
                    item_match = re.search(r'(Item\s+\d+[A-Z]?)\.?\s*$', text_before, re.IGNORECASE)
                    if item_match:
                        return item_match.group(1)

                    part_match = re.search(r'(Part\s+[IVX]+)\.?\s*$', text_before, re.IGNORECASE)
                    if part_match:
                        return part_match.group(1)

        except Exception:
            # Best-effort: malformed trees simply yield no label.
            pass

        return ''
|
||||
|
||||
def _is_section_link(self, text: str, anchor_id: str = '', preceding_item: str = '') -> bool:
|
||||
"""
|
||||
Check if link represents a section reference.
|
||||
|
||||
Checks link text, anchor ID, and preceding context to handle cases where:
|
||||
- Text is descriptive (e.g., "Executive Compensation")
|
||||
- Anchor ID contains item number (e.g., "item_11_executive_compensation")
|
||||
- Item number is in preceding table cell (e.g., <td>Item 1.</td><td><a>Business</a></td>)
|
||||
|
||||
Args:
|
||||
text: Link text
|
||||
anchor_id: Anchor ID from href (without #)
|
||||
preceding_item: Item/part label from preceding context (e.g., "Item 1A")
|
||||
|
||||
Returns:
|
||||
True if this appears to be a section link
|
||||
"""
|
||||
if not text:
|
||||
return False
|
||||
|
||||
# First check if there's a preceding item label (table-based TOC)
|
||||
if preceding_item:
|
||||
return True
|
||||
|
||||
# Then check anchor ID for item/part patterns (most reliable)
|
||||
if anchor_id:
|
||||
anchor_lower = anchor_id.lower()
|
||||
# Match patterns like: item_1, item_1a, item1, item1a, part_i, part_ii, etc.
|
||||
if re.search(r'item_?\d+[a-z]?', anchor_lower):
|
||||
return True
|
||||
if re.search(r'part_?[ivx]+', anchor_lower):
|
||||
return True
|
||||
|
||||
# Then check text (with relaxed length limit for descriptive section names)
|
||||
if len(text) > 150: # Increased from 100 to accommodate longer section titles
|
||||
return False
|
||||
|
||||
# Check against known patterns
|
||||
for pattern, _ in self.section_patterns:
|
||||
if re.search(pattern, text, re.IGNORECASE):
|
||||
return True
|
||||
|
||||
# Also consider links with section keywords
|
||||
if len(text) < 100 and any(keyword in text.lower() for keyword in
|
||||
['item', 'part', 'business', 'risk', 'properties', 'legal',
|
||||
'compensation', 'ownership', 'governance', 'directors']):
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
def _normalize_section_name(self, text: str, anchor_id: str = '', preceding_item: str = '') -> str:
|
||||
"""
|
||||
Normalize section name for consistent lookup.
|
||||
|
||||
Prioritizes:
|
||||
1. Preceding item label (table-based TOC)
|
||||
2. Anchor ID pattern
|
||||
3. Text-based normalization
|
||||
|
||||
Args:
|
||||
text: Link text
|
||||
anchor_id: Anchor ID from href (without #)
|
||||
preceding_item: Item/part label from preceding context
|
||||
|
||||
Returns:
|
||||
Normalized section name (e.g., "Item 1A", "Part II")
|
||||
"""
|
||||
text = text.strip()
|
||||
|
||||
# HIGHEST PRIORITY: Use preceding item label if available (table-based TOC)
|
||||
if preceding_item:
|
||||
# Clean up and normalize the preceding item
|
||||
item_match = re.match(r'item\s+(\d+[a-z]?)', preceding_item, re.IGNORECASE)
|
||||
if item_match:
|
||||
return f"Item {item_match.group(1).upper()}"
|
||||
|
||||
part_match = re.match(r'part\s+([ivx]+)', preceding_item, re.IGNORECASE)
|
||||
if part_match:
|
||||
return f"Part {part_match.group(1).upper()}"
|
||||
|
||||
# SECOND PRIORITY: Try to extract from anchor ID
|
||||
if anchor_id:
|
||||
anchor_lower = anchor_id.lower()
|
||||
|
||||
# Match item patterns: item_1a, item1a, item_1_business, etc.
|
||||
item_match = re.search(r'item_?(\d+[a-z]?)', anchor_lower)
|
||||
if item_match:
|
||||
item_num = item_match.group(1).upper()
|
||||
return f"Item {item_num}"
|
||||
|
||||
# Match part patterns: part_i, part_ii, parti, partii, etc.
|
||||
part_match = re.search(r'part_?([ivx]+)', anchor_lower)
|
||||
if part_match:
|
||||
part_num = part_match.group(1).upper()
|
||||
return f"Part {part_num}"
|
||||
|
||||
# THIRD PRIORITY: Text-based normalization
|
||||
# Handle common Item patterns in text
|
||||
item_match = re.match(r'item\s+(\d+[a-z]?)', text, re.IGNORECASE)
|
||||
if item_match:
|
||||
return f"Item {item_match.group(1).upper()}"
|
||||
|
||||
# Handle Part patterns
|
||||
part_match = re.match(r'part\s+([ivx]+)', text, re.IGNORECASE)
|
||||
if part_match:
|
||||
return f"Part {part_match.group(1).upper()}"
|
||||
|
||||
# Handle specific known sections by text
|
||||
text_lower = text.lower()
|
||||
if 'business' in text_lower and 'item' not in text_lower:
|
||||
return "Item 1"
|
||||
elif 'risk factors' in text_lower and 'item' not in text_lower:
|
||||
return "Item 1A"
|
||||
elif 'properties' in text_lower and 'item' not in text_lower:
|
||||
return "Item 2"
|
||||
elif 'legal proceedings' in text_lower and 'item' not in text_lower:
|
||||
return "Item 3"
|
||||
elif 'management' in text_lower and 'discussion' in text_lower:
|
||||
return "Item 7"
|
||||
elif 'financial statements' in text_lower:
|
||||
return "Item 8"
|
||||
elif 'exhibits' in text_lower:
|
||||
return "Item 15"
|
||||
|
||||
return text # Return as-is if no normalization applies
|
||||
|
||||
def _get_section_type_and_order(self, text: str) -> Tuple[str, int]:
|
||||
"""Get section type and order for sorting."""
|
||||
text_lower = text.lower()
|
||||
|
||||
# Items
|
||||
item_match = re.search(r'item\s*(\d+)([a-z]?)', text_lower)
|
||||
if item_match:
|
||||
item_num = int(item_match.group(1))
|
||||
item_letter = item_match.group(2) or ''
|
||||
# Order: Item 1=1000, Item 1A=1001, Item 2=2000, etc.
|
||||
order = item_num * 1000 + (ord(item_letter.upper()) - ord('A') + 1 if item_letter else 0)
|
||||
return 'item', order
|
||||
|
||||
# Parts
|
||||
part_match = re.search(r'part\s*([ivx]+)', text_lower)
|
||||
if part_match:
|
||||
part_roman = part_match.group(1)
|
||||
part_num = self._roman_to_int(part_roman)
|
||||
return 'part', part_num * 100 # Part I=100, Part II=200, etc.
|
||||
|
||||
# Known sections without explicit item numbers
|
||||
if 'business' in text_lower:
|
||||
return 'item', 1000 # Item 1
|
||||
elif 'risk factors' in text_lower:
|
||||
return 'item', 1001 # Item 1A
|
||||
elif 'properties' in text_lower:
|
||||
return 'item', 2000 # Item 2
|
||||
elif 'legal proceedings' in text_lower:
|
||||
return 'item', 3000 # Item 3
|
||||
elif 'management' in text_lower and 'discussion' in text_lower:
|
||||
return 'item', 7000 # Item 7
|
||||
elif 'financial statements' in text_lower:
|
||||
return 'item', 8000 # Item 8
|
||||
elif 'exhibits' in text_lower:
|
||||
return 'item', 15000 # Item 15
|
||||
|
||||
return 'other', 99999
|
||||
|
||||
def _roman_to_int(self, roman: str) -> int:
|
||||
"""Convert roman numerals to integers."""
|
||||
roman_map = {'i': 1, 'v': 5, 'x': 10, 'l': 50, 'c': 100, 'd': 500, 'm': 1000}
|
||||
roman = roman.lower()
|
||||
result = 0
|
||||
prev = 0
|
||||
|
||||
for char in reversed(roman):
|
||||
value = roman_map.get(char, 0)
|
||||
if value < prev:
|
||||
result -= value
|
||||
else:
|
||||
result += value
|
||||
prev = value
|
||||
|
||||
return result
|
||||
|
||||
def _build_section_mapping(self, toc_sections: List[TOCSection]) -> Dict[str, str]:
|
||||
"""Build final section mapping, handling duplicates intelligently.
|
||||
|
||||
For 10-Q filings with part context, generates part-aware section names
|
||||
like "part_i_item_1" and "part_ii_item_1" to distinguish sections
|
||||
with the same item number across different parts.
|
||||
"""
|
||||
# Sort sections by order
|
||||
toc_sections.sort(key=lambda x: x.order)
|
||||
|
||||
mapping = {}
|
||||
seen_names = set()
|
||||
|
||||
for section in toc_sections:
|
||||
# Generate part-aware section name for 10-Q filings
|
||||
if section.part:
|
||||
# Convert "Part I" -> "part_i", "Part II" -> "part_ii"
|
||||
part_key = section.part.lower().replace(' ', '_')
|
||||
# Convert "Item 1" -> "item_1", "Item 1A" -> "item_1a"
|
||||
item_key = section.normalized_name.lower().replace(' ', '_')
|
||||
section_name = f"{part_key}_{item_key}"
|
||||
else:
|
||||
# 10-K filings: use normalized name as-is
|
||||
section_name = section.normalized_name
|
||||
|
||||
# Skip if we already have this section (prefer first occurrence)
|
||||
if section_name in seen_names:
|
||||
continue
|
||||
|
||||
mapping[section_name] = section.anchor_id
|
||||
seen_names.add(section_name)
|
||||
|
||||
return mapping
|
||||
|
||||
def get_section_suggestions(self, html_content: str) -> List[str]:
    """Get list of available sections that can be extracted."""
    section_map = self.analyze_toc_structure(html_content)
    # Order suggestions by their document position (Item 1, Item 1A, ...).
    return sorted(
        section_map,
        key=lambda name: self._get_section_type_and_order(name)[1],
    )
|
||||
|
||||
|
||||
def analyze_toc_for_sections(html_content: str) -> Dict[str, str]:
    """
    Convenience function to analyze TOC and return section mapping.

    Args:
        html_content: Raw HTML content

    Returns:
        Dict mapping section names to anchor IDs
    """
    return TOCAnalyzer().analyze_toc_structure(html_content)
|
||||
@@ -0,0 +1,104 @@
|
||||
"""
|
||||
Table of Contents Link Filter
|
||||
|
||||
Removes repetitive "Table of Contents" anchor links from document text,
|
||||
matching the behavior of the old parser.
|
||||
"""
|
||||
import re
|
||||
from typing import List
|
||||
|
||||
|
||||
# Compiled once at import time instead of on every call. The original
# pattern list contained upper- and mixed-case duplicates that are
# redundant under re.IGNORECASE, so each heading appears exactly once.
_NAV_LINK_RE = re.compile(
    r'^(?:Table of Contents'
    r'|Index to Financial Statements'
    r'|Index to Exhibits)$',
    re.IGNORECASE,
)


def filter_toc_links(text: str) -> str:
    """
    Filter out repetitive navigation links from text.

    This replicates the old parser's behavior of removing repetitive
    navigation links that appear throughout SEC filings.

    Based on analysis of 12+ SEC filings across different companies:
    - Average of 47.9 "Table of Contents" links per filing (575 total found)
    - Oracle 10-K shows 230 "Index to Financial Statements" vs 83 in old parser
    - Safe to filter without losing legitimate content

    Patterns filtered (case-insensitive, whole line after stripping):
    - "Table of Contents"
    - "Index to Financial Statements"
    - "Index to Exhibits"

    Args:
        text: Input text to filter

    Returns:
        Text with navigation links removed
    """
    if not text:
        return text

    # Keep every line whose stripped content is not a navigation heading;
    # kept lines retain their original (unstripped) form.
    kept = [
        line for line in text.split('\n')
        if not _NAV_LINK_RE.match(line.strip())
    ]
    return '\n'.join(kept)
|
||||
|
||||
|
||||
def get_toc_link_stats(text: str) -> dict:
    """
    Get statistics about navigation links in text for debugging/analysis.

    Args:
        text: Input text to analyze

    Returns:
        Dict with counts and examples of navigation patterns
    """
    if not text:
        return {'total_matches': 0, 'patterns': {}, 'examples': []}

    # The same navigation headings the filter removes, matched whole-line.
    nav_patterns = {
        'Table of Contents': re.compile(r'^Table of Contents$', re.IGNORECASE),
        'Index to Financial Statements': re.compile(r'^Index to Financial Statements$', re.IGNORECASE),
        'Index to Exhibits': re.compile(r'^Index to Exhibits$', re.IGNORECASE),
    }

    lines = text.split('\n')
    counts = {}
    examples = []

    for name, regex in nav_patterns.items():
        hits = [
            {
                'line_num': idx + 1,
                'content': raw,
                'stripped': raw.strip(),
                'pattern': name,
            }
            for idx, raw in enumerate(lines)
            if regex.match(raw.strip())
        ]
        counts[name] = len(hits)
        # Cap at five examples per pattern to keep the report small.
        examples.extend(hits[:5])

    return {
        'total_matches': sum(counts.values()),
        'patterns': counts,
        'examples': examples,
        'total_lines': len(lines),
    }
|
||||
Reference in New Issue
Block a user