Initial commit

This commit is contained in:
kdusek
2025-12-09 12:13:01 +01:00
commit 8e654ed209
13332 changed files with 2695056 additions and 0 deletions

View File

@@ -0,0 +1,49 @@
"""
EdgarTools HTML Parser v2.0
A high-performance, semantically-aware HTML parser for SEC filings.
"""
from typing import Optional

from edgar.documents.config import ParserConfig
from edgar.documents.document import Document
from edgar.documents.exceptions import ParsingError
from edgar.documents.parser import HTMLParser
from edgar.documents.renderers import MarkdownRenderer, TextRenderer
from edgar.documents.search import DocumentSearch, SearchMode, SearchResult
from edgar.documents.types import NodeType, SemanticType, TableType
# Parser package version (independent of the edgartools release version).
__version__ = "2.0.0"
# Names re-exported as the package's public API; also governs `import *`.
__all__ = [
    'HTMLParser',
    'Document',
    'ParserConfig',
    'ParsingError',
    'NodeType',
    'SemanticType',
    'TableType',
    'DocumentSearch',
    'SearchResult',
    'SearchMode',
    'MarkdownRenderer',
    'TextRenderer',
    'parse_html'
]
def parse_html(html: str, config: Optional[ParserConfig] = None) -> Document:
    """
    Convenience function for parsing HTML.

    Args:
        html: HTML content to parse
        config: Optional parser configuration; a default ParserConfig
            is created when omitted

    Returns:
        Parsed Document object

    Example:
        >>> document = parse_html(html_content)
        >>> print(document.text()[:100])
    """
    # Fall back to a default configuration when the caller supplies none.
    parser = HTMLParser(config or ParserConfig())
    return parser.parse(html)

View File

@@ -0,0 +1,83 @@
"""
Mixin class providing text caching functionality for document nodes.
This module consolidates the text caching pattern used across multiple node types
(DocumentNode, ParagraphNode, ContainerNode, TableNode, and Document).
"""
from typing import Callable, Any
class CacheableMixin:
    """
    Mixin that adds lazy text caching to document nodes.

    The caching pattern implemented here:
    1. Return the cached text when one is already stored.
    2. On a cache miss, invoke a generator callable to build the text.
    3. Store the generated value for subsequent accesses.
    4. Support recursive cache invalidation across a node tree.

    Usage:
        class MyNode(CacheableMixin):
            def text(self, **kwargs):
                def generator():
                    # Generate text logic here
                    return "generated text"
                return self._get_cached_text(generator)
    """

    def _get_cached_text(self, generator_func: Callable[[], Any], *args, **kwargs) -> Any:
        """
        Return the cached text, generating and storing it on first access.

        Args:
            generator_func: Callable invoked on a cache miss to build the text
            *args: Positional arguments forwarded to the generator (currently unused)
            **kwargs: Keyword arguments forwarded to the generator (currently unused)

        Returns:
            The cached value, or the freshly generated one on a miss

        Note:
            The value is stored on the instance attribute '_text_cache'.
        """
        cached = getattr(self, '_text_cache', None)
        if cached is not None:
            return cached
        # Cache miss: build the text once and remember it.
        self._text_cache = generator_func(*args, **kwargs)
        return self._text_cache

    def clear_text_cache(self) -> None:
        """
        Invalidate this node's cached text and, recursively, its children's.

        Clearing recursively ensures that when a parent node's content
        changes, descendant caches are invalidated as well.

        Safe to call when the node has no '_text_cache' attribute, has no
        'children' attribute, or when some children lack clear_text_cache.
        """
        if hasattr(self, '_text_cache'):
            self._text_cache = None
        # Walk children defensively: tolerate nodes without the mixin.
        for child in getattr(self, 'children', ()):
            clear = getattr(child, 'clear_text_cache', None)
            if clear is not None:
                clear()

View File

@@ -0,0 +1,211 @@
"""
Configuration for the HTML parser.
"""
from dataclasses import dataclass, field
from typing import List, Dict, Any, Optional
@dataclass
class DetectionThresholds:
    """
    Configurable thresholds for section detection strategies.

    Attributes:
        min_confidence: Minimum confidence score to include a section (0.0-1.0)
        cross_validation_boost: Multiplier when multiple methods agree (>1.0)
        disagreement_penalty: Multiplier when methods disagree (<1.0)
        boundary_overlap_penalty: Multiplier for overlapping sections (<1.0)
        enable_cross_validation: Whether to run cross-validation (slower but more accurate)
        thresholds_by_form: Filing-specific threshold overrides
    """
    min_confidence: float = 0.6
    cross_validation_boost: float = 1.2
    disagreement_penalty: float = 0.8
    boundary_overlap_penalty: float = 0.9
    enable_cross_validation: bool = False  # Disabled by default for performance
    # Keyed by form type (e.g. '10-K'); per the docstring these override the
    # numeric thresholds above on a per-filing-type basis.
    thresholds_by_form: Dict[str, Dict[str, float]] = field(default_factory=dict)
@dataclass
class ParserConfig:
    """
    Configuration for HTML parser.

    Instances are plain dataclasses; use the classmethod factories
    (for_performance, for_accuracy, for_ai) for common presets.

    Attributes:
        max_document_size: Maximum document size in bytes
        streaming_threshold: Document size threshold for streaming mode
        cache_size: Maximum number of cached items
        enable_parallel: Enable parallel processing for tables
        strict_mode: Fail on parsing errors vs. best effort
        extract_xbrl: Extract inline XBRL facts
        extract_styles: Extract and process CSS styles
        preserve_whitespace: Preserve original whitespace
        optimize_for_ai: Enable AI-specific optimizations
        max_token_estimation: Maximum estimated tokens for AI optimization
        features: Feature flags for optional functionality
    """
    # Performance settings
    max_document_size: int = 100 * 1024 * 1024  # 100MB (handles large filings like JPM)
    streaming_threshold: int = 10 * 1024 * 1024  # 10MB
    cache_size: int = 1000
    enable_parallel: bool = True
    max_workers: Optional[int] = None  # None = use CPU count
    # Parsing settings
    strict_mode: bool = False
    extract_xbrl: bool = True
    extract_styles: bool = True
    preserve_whitespace: bool = False
    normalize_text: bool = True
    extract_links: bool = True
    extract_images: bool = False
    # AI optimization
    optimize_for_ai: bool = True
    max_token_estimation: int = 100_000
    # NOTE(review): chunk_size/chunk_overlap units (tokens vs. characters)
    # are not established here — confirm against the chunking code.
    chunk_size: int = 512
    chunk_overlap: int = 128
    # Table processing
    table_extraction: bool = True
    detect_table_types: bool = True
    extract_table_relationships: bool = True
    fast_table_rendering: bool = True  # Fast renderer is now production-ready (7-10x faster than Rich)
    # Section detection
    detect_sections: bool = True
    eager_section_extraction: bool = False  # Extract sections during parsing vs. on first access (default: lazy)
    form: Optional[str] = None  # Required for section detection (e.g. '10-K', '10-Q', '8-K')
    detection_thresholds: DetectionThresholds = field(default_factory=DetectionThresholds)
    # Regex patterns (case/whitespace-tolerant) used to recognize standard
    # filing sections by their heading text; keys are canonical section names.
    section_patterns: Dict[str, List[str]] = field(default_factory=lambda: {
        'business': [
            r'item\s+1\.?\s*business',
            r'business\s+overview',
            r'our\s+business'
        ],
        'risk_factors': [
            r'item\s+1a\.?\s*risk\s+factors',
            r'risk\s+factors',
            r'factors\s+that\s+may\s+affect'
        ],
        'properties': [
            r'item\s+2\.?\s*properties',
            r'properties'
        ],
        'legal_proceedings': [
            r'item\s+3\.?\s*legal\s+proceedings',
            r'legal\s+proceedings',
            r'litigation'
        ],
        'mda': [
            r'item\s+7\.?\s*management\'?s?\s+discussion',
            r'md&a',
            r'management\'?s?\s+discussion\s+and\s+analysis'
        ],
        'financial_statements': [
            r'item\s+8\.?\s*financial\s+statements',
            r'consolidated\s+financial\s+statements',
            r'financial\s+statements'
        ]
    })
    # Feature flags
    features: Dict[str, bool] = field(default_factory=lambda: {
        'ml_header_detection': True,
        'semantic_analysis': True,
        'table_understanding': True,
        'xbrl_validation': True,
        'auto_section_detection': True,
        'smart_text_extraction': True,
        'footnote_linking': True,
        'cross_reference_resolution': True
    })
    # Header detection settings
    header_detection_threshold: float = 0.6  # Minimum confidence
    header_detection_methods: List[str] = field(default_factory=lambda: [
        'style',
        'pattern',
        'structural',
        'contextual'
    ])
    # Text extraction settings
    min_text_length: int = 10  # Minimum text length to keep
    merge_adjacent_nodes: bool = True
    merge_distance: int = 2  # Max distance between nodes to merge
    # Performance monitoring
    enable_profiling: bool = False
    log_performance: bool = False
    def to_dict(self) -> Dict[str, Any]:
        """Convert configuration to dictionary.

        Note:
            Only a core subset of fields is serialized; table, section,
            header-detection and text-extraction settings are not included.
            The features dict is copied so mutating the result does not
            affect this config.
        """
        return {
            'max_document_size': self.max_document_size,
            'streaming_threshold': self.streaming_threshold,
            'cache_size': self.cache_size,
            'enable_parallel': self.enable_parallel,
            'strict_mode': self.strict_mode,
            'extract_xbrl': self.extract_xbrl,
            'extract_styles': self.extract_styles,
            'preserve_whitespace': self.preserve_whitespace,
            'optimize_for_ai': self.optimize_for_ai,
            'features': self.features.copy()
        }
    @classmethod
    def for_performance(cls) -> 'ParserConfig':
        """Create config optimized for performance.

        NOTE(review): the explicit `features` dict replaces the full default
        flag set, so flags not listed here (e.g. 'footnote_linking') are
        absent — confirm consumers read flags with dict.get() defaults.
        """
        return cls(
            extract_styles=False,
            extract_xbrl=False,
            enable_parallel=True,
            cache_size=5000,
            eager_section_extraction=False,  # Skip expensive section extraction
            fast_table_rendering=True,  # Fast renderer (enabled by default now)
            features={
                'ml_header_detection': False,
                'semantic_analysis': False,
                'table_understanding': False,
                'xbrl_validation': False
            }
        )
    @classmethod
    def for_accuracy(cls) -> 'ParserConfig':
        """Create config optimized for accuracy.

        Enables strict mode (fail on parsing errors) and every feature flag.
        """
        return cls(
            strict_mode=True,
            extract_styles=True,
            extract_xbrl=True,
            enable_parallel=True,
            features={
                'ml_header_detection': True,
                'semantic_analysis': True,
                'table_understanding': True,
                'xbrl_validation': True,
                'auto_section_detection': True,
                'smart_text_extraction': True,
                'footnote_linking': True,
                'cross_reference_resolution': True
            }
        )
    @classmethod
    def for_ai(cls) -> 'ParserConfig':
        """Create config optimized for AI/LLM processing.

        Skips style extraction and normalizes/merges text for clean LLM
        input. See the NOTE on for_performance about the replaced
        `features` dict — the same caveat applies here.
        """
        return cls(
            optimize_for_ai=True,
            extract_styles=False,
            extract_xbrl=True,
            normalize_text=True,
            merge_adjacent_nodes=True,
            features={
                'ml_header_detection': True,
                'semantic_analysis': True,
                'smart_text_extraction': True
            }
        )

View File

@@ -0,0 +1,314 @@
# HTML Parser Rewrite - Status Report
**Generated**: 2025-10-08
**Branch**: `html_rewrite`
**Target**: Merge to `main`
---
## Overall Progress: ~95% Complete ✅
### Completed Phases
#### ✅ Phase 1: Core Implementation (100%)
- [x] Streaming parser for large documents
- [x] TableMatrix system for accurate table rendering
- [x] Section extraction with Part I/II detection
- [x] XBRL integration
- [x] Rich-based table rendering
- [x] Configuration system (ParserConfig)
- [x] Error handling and validation
#### ✅ Phase 2: Functional Testing (100%)
- [x] **Corpus Validation** - 40 diverse filings, 100% success rate
- [x] **Edge Cases** - 31 tests covering invalid inputs, malformed HTML, edge conditions
- [x] **Integration Tests** - 25 tests for Filing/Company integration, backward compatibility
- [x] **Regression Tests** - 15 tests preventing known bugs from returning
**Total Test Count**: 79 functional tests, all passing
#### ✅ Phase 3: Performance Profiling (100%)
- [x] **Benchmarking Infrastructure** - Comprehensive benchmark suite
- [x] **Hot Path Analysis** - Identified 3 critical bottlenecks (63% section extraction, 40% Rich rendering, 15% regex)
- [x] **Memory Profiling** - Found 255MB memory leak in MSFT 10-K, documented root causes
- [x] **Performance Regression Tests** - 15 tests locking in baseline thresholds
**Performance Baseline Established**:
- Average: 3.8MB/s throughput, 4.1MB memory per doc
- Small docs: 2.6MB/s (optimization opportunity)
- Large docs: 20.7MB/s (excellent streaming)
- Memory leak: 19-25x ratio on medium docs (needs fixing)
#### ✅ Phase 4: Test Data Augmentation (100%)
- [x] **HTML Fixtures** - Downloaded 32 files (155MB) from 16 companies across 6 industries
- [x] **Download Automation** - Created `download_html_fixtures.py` script
- [x] **Documentation** - Comprehensive fixture documentation
---
## Current Status: Ready for Optimization Phase
### What's Working Well ✅
1. **Parsing Accuracy**: 100% success rate across 40+ diverse filings
2. **Large Document Handling**: Excellent streaming performance (20.7MB/s on JPM 10-K)
3. **Table Extraction**: TableMatrix accurately handles colspan/rowspan
4. **Test Coverage**: 79 comprehensive tests covering edge cases, integration, regression
5. **Backward Compatibility**: Old TenK API still works for existing code
### Known Issues to Address 🔧
#### Critical (Must Fix Before Merge)
1. **Memory Leaks** (Priority: CRITICAL)
- MSFT 10-K: 255MB leak (19x document size)
- Apple 10-K: 41MB leak (23x document size)
- **Root Causes**:
- Rich Console objects retained (0.4MB per doc)
- Global caches not cleared on document deletion
- Circular references in node graph
- **Location**: `tests/perf/memory_analysis.md:90-130`
- **Impact**: Server crashes after 10-20 requests in production
2. **Performance Bottlenecks** (Priority: HIGH)
- Section extraction: 3.7s (63% of parse time)
- Rich rendering for text: 2.4s (40% of parse time)
- Regex normalization: 0.8s (15% of parse time)
- **Location**: `tests/perf/hotpath_analysis.md:9-66`
- **Impact**: 4x slower than necessary on medium documents
#### Non-Critical (Can Fix After Merge)
3. **Small Document Performance** (Priority: MEDIUM)
- 2.6MB/s vs desired 5MB/s
- Overhead dominates on <5MB documents
- **Optimization**: Lazy loading, reduce upfront processing
---
## Next Steps (In Order)
### Phase 5: Critical Fixes (2-3 days) 🔧
#### 5.1 Memory Leak Fixes (1-2 days)
**Goal**: Reduce memory leak from 255MB to <5MB
Tasks:
- [ ] Implement `Document.__del__()` to clear caches
- [ ] Replace Rich rendering in `text()` with direct string building
- [ ] Break circular references in node graph
- [ ] Use weak references for parent links
- [ ] Add `__slots__` to frequently created objects (Cell, TableNode)
**Expected Result**: MSFT 10-K leak: 255MB → <5MB (95% improvement)
**Validation**:
```bash
pytest tests/perf/test_performance_regression.py::TestMemoryRegression -v
```
#### 5.2 Performance Optimizations (1-2 days)
**Goal**: Improve parse speed from 1.2s → 0.3s on Apple 10-K (77% faster)
Tasks:
- [ ] Fix section detection - use headings instead of rendering entire document
- [ ] Implement fast text extraction without Rich overhead
- [ ] Optimize regex normalization - combine patterns, use compilation
**Expected Results**:
- Section extraction: 3.7s → 1.2s (~68% faster)
- Text extraction: 2.4s → 1.2s (50% faster)
- Regex: 0.8s → 0.5s (40% faster)
**Validation**:
```bash
pytest tests/perf/test_performance_regression.py::TestParseSpeedRegression -v
```
### Phase 6: Final Validation (1 day) ✅
Tasks:
- [ ] Re-run all 79 functional tests
- [ ] Re-run performance regression tests (verify improvements)
- [ ] Run full corpus validation
- [ ] Memory profiling validation (confirm leaks fixed)
- [ ] Update CHANGELOG.md
- [ ] Create merge summary document
### Phase 7: Merge to Main (1 day) 🚀
Tasks:
- [ ] Final code review
- [ ] Squash commits or create clean merge
- [ ] Update version number
- [ ] Merge to main
- [ ] Tag release
- [ ] Monitor for issues
---
## Test Summary
### Current Test Status: 79/79 Passing (100%)
```
tests/corpus/test_corpus_validation.py 8 tests ✓
tests/test_html_parser_edge_cases.py 31 tests ✓
tests/test_html_parser_integration.py 25 tests ✓
tests/test_html_parser_regressions.py 15 tests ✓
tests/perf/test_performance_regression.py 15 tests ✓ (baseline established)
```
### Test Execution
```bash
# Functional tests (79 tests, ~30s)
pytest tests/corpus tests/test_html_parser_*.py -v
# Performance tests (15 tests, ~20s)
pytest tests/perf/test_performance_regression.py -m performance -v
# All tests
pytest tests/ -v
```
---
## Performance Metrics
### Current Baseline (Before Optimization)
| Document | Size | Parse Time | Throughput | Memory | Tables | Sections |
|----------|------|------------|------------|--------|--------|----------|
| Apple 10-Q | 1.1MB | 0.307s | 3.6MB/s | 27.9MB (25.6x) | 40 | 9 |
| Apple 10-K | 1.8MB | 0.500s | 3.6MB/s | 21.6MB (11.9x) | 63 | 8 |
| MSFT 10-K | 7.8MB | 1.501s | 5.2MB/s | 147.0MB (18.9x) | 85 | 0 |
| JPM 10-K | 52.4MB | 2.537s | 20.7MB/s | 0.6MB (0.01x) | 681 | 0 |
### Target Metrics (After Optimization)
| Metric | Current | Target | Improvement |
|--------|---------|--------|-------------|
| **Memory leak** | 41-255MB | <5MB | 95% reduction |
| **Memory ratio** | 19-25x | <3x | 87% reduction |
| **Parse time (Apple 10-K)** | 0.500s | 0.150s | 70% faster |
| **Throughput (small docs)** | 2.6MB/s | 5.0MB/s | 92% faster |
---
## File Organization
### Core Parser Files
```
edgar/documents/
├── __init__.py # Public API (parse_html)
├── parser.py # Main parser with streaming
├── config.py # ParserConfig
├── document_builder.py # Document tree construction
├── nodes/ # Node types (TableNode, SectionNode)
├── utils/
│ ├── streaming.py # Streaming parser (fixed JPM bug)
│ └── table_processing.py # TableMatrix system
└── exceptions.py # Custom exceptions
```
### Test Files
```
tests/
├── corpus/ # Corpus validation
│ ├── quick_corpus.py # Corpus builder
│ └── test_corpus_validation.py # 8 validation tests
├── fixtures/
│ ├── html/ # 32 HTML fixtures (155MB)
│ │ ├── {ticker}/10k/ # By company and form
│ │ └── README.md
│ └── download_html_fixtures.py # Download automation
├── perf/ # Performance testing
│ ├── benchmark_html_parser.py # Benchmarking
│ ├── profile_hotpaths.py # Hot path profiling
│ ├── profile_memory.py # Memory profiling
│ ├── test_performance_regression.py # Regression tests
│ ├── performance_report.md # Benchmark results
│ ├── hotpath_analysis.md # Bottleneck analysis
│ └── memory_analysis.md # Memory leak analysis
├── test_html_parser_edge_cases.py # 31 edge case tests
├── test_html_parser_integration.py # 25 integration tests
└── test_html_parser_regressions.py # 15 regression tests
```
---
## Risks and Mitigation
### Risk 1: Memory Leaks in Production
**Severity**: HIGH
**Probability**: HIGH (confirmed in testing)
**Mitigation**: Must fix before merge (Phase 5.1)
### Risk 2: Performance Regression
**Severity**: MEDIUM
**Probability**: LOW (baseline established, regression tests in place)
**Mitigation**: Performance regression tests will catch any degradation
### Risk 3: Backward Compatibility
**Severity**: LOW
**Probability**: LOW (integration tests passing)
**Mitigation**: 25 integration tests verify old API still works
---
## Estimated Timeline to Merge
```
Phase 5.1: Memory leak fixes 1-2 days
Phase 5.2: Performance optimization 1-2 days
Phase 6: Final validation 1 day
Phase 7: Merge to main 1 day
----------------------------------------
Total: 4-6 days
```
**Target Merge Date**: October 12-14, 2025
---
## Decision Points
### Should We Merge Now or After Optimization?
**Option A: Merge Now (Not Recommended)**
- ✅ Functional tests passing
- ✅ Backward compatible
- ❌ Memory leaks (production risk)
- ❌ Performance issues
- ❌ Will require hotfix soon
**Option B: Fix Critical Issues First (Recommended)**
- ✅ Production-ready
- ✅ Performance validated
- ✅ Memory efficient
- ❌ 4-6 days delay
- ✅ Clean, professional release
**Recommendation**: **Option B** - Fix critical memory leaks and performance issues before merge. The 4-6 day investment prevents production incidents and ensures a polished release.
---
## Questions for Review
1. **Scope**: Should we fix only critical issues (memory + performance) or also tackle small-doc optimization?
2. **Timeline**: Is 4-6 days acceptable, or do we need to merge sooner?
3. **Testing**: Are 79 functional tests + 15 performance tests sufficient coverage?
4. **Documentation**: Do we need user-facing documentation updates?
---
## Conclusion
The HTML parser rewrite is **95% complete** with excellent functional testing but critical memory and performance issues identified. The smart path forward is:
1. ✅ Complete critical fixes (4-6 days)
2. ✅ Validate improvements
3. ✅ Merge to main with confidence
This approach ensures a production-ready, performant parser rather than merging now and hotfixing later.

View File

@@ -0,0 +1,437 @@
# HTML Parser Rewrite - Progress Assessment
**Date**: 2025-10-07
**Status**: Active Development (html_rewrite branch)
---
## Executive Summary
The HTML parser rewrite is **substantially complete** for core functionality with **excellent progress** on Item/section detection. Recent bug fixes (2025-10-07) have addressed critical table rendering issues and 10-Q Part I/II distinction, bringing the parser close to production-ready quality.
### Overall Progress: **~90% Complete**
- ✅ Core parsing infrastructure: **100% Complete**
- ✅ Table processing: **95% Complete** (recent fixes)
- ✅ Section/Item detection: **95% Complete** (Part I/II fixed, needs validation)
- ⚠️ Performance optimization: **70% Complete**
- ⚠️ Comprehensive testing: **65% Complete** (added 10-Q Part tests)
- ⚠️ Documentation: **75% Complete**
---
## Goal Achievement Analysis
### Primary Goals (from goals.md)
#### 1. **Semantic Meaning Preservation** ✅ **ACHIEVED**
> "Read text, tables and ixbrl data preserving greatest semantic meaning"
**Status**: ✅ Fully implemented
- Text extraction with structure preservation
- Advanced table matrix system for accurate table rendering
- XBRL fact extraction before preprocessing
- Hierarchical node model maintains document structure
**Recent Improvements**:
- Header detection fixes (Oracle Table 6, Tesla Table 16)
- Spacing column filter now preserves header columns (MSFT Table 39)
- Multi-row header normalization
#### 2. **AI Channel (Primary) + Human Channel (Secondary)** ✅ **ACHIEVED**
> "AI context is the primary goal, with human context being secondary"
**Status**: ✅ Both channels working
- **AI Channel**:
- Clean text output optimized for LLMs
- Structured table rendering for context windows
- Section-level extraction for chunking
- Semantic divisibility supported
- **Human Channel**:
- Rich console rendering with proper formatting
- Markdown export
- Visual table alignment (recently fixed)
#### 3. **Section-Level Processing** ✅ **ACHIEVED**
> "Work at full document level and section level - breaking into independently processable sections"
**Status**: ✅ Implemented with good coverage
- `SectionExtractor` class fully functional
- TOC-based section detection
- Pattern-based section identification
- Lazy loading support for large documents
**What Works**:
```python
# Section detection is operational
doc = parse_html(html)
sections = doc.sections # Dict of section names -> SectionNode
# Access specific sections
business = sections.get('Item 1 - Business')
mda = sections.get('Item 7 - MD&A')
financials = sections.get('Item 8 - Financial Statements')
```
#### 4. **Standard Section Names (10-K, 10-Q, 8-K)** ✅ **ACHIEVED**
> "For some filing types (10-K, 10-Q, 8-K) identify sections by standard names"
**Status**: ✅ 95% Complete - Implemented with Part I/II distinction for 10-Q
**What's Implemented**:
- Pattern matching for standard Items:
- Item 1 - Business
- Item 1A - Risk Factors
- Item 7 - MD&A
- Item 7A - Market Risk
- Item 8 - Financial Statements
- And more...
- **10-Q Part I/Part II distinction** (newly fixed 2025-10-07):
- Part I - Item 1 (Financial Statements)
- Part II - Item 1 (Legal Proceedings)
- Proper boundary detection and context propagation
- Prevents Item number conflicts
**What's Remaining** (5%):
- Validation against large corpus of 10-K/10-Q filings
- Edge case handling (non-standard formatting)
- 8-K specific section patterns expansion
**Evidence from Code**:
```python
# edgar/documents/extractors/section_extractor.py
(r'^(Item|ITEM)\s+1\.?\s*Business', 'Item 1 - Business'),
(r'^(Item|ITEM)\s+1A\.?\s*Risk\s+Factors', 'Item 1A - Risk Factors'),
(r'^(Item|ITEM)\s+7\.?\s*Management.*Discussion', 'Item 7 - MD&A'),
(r'^(Item|ITEM)\s+8\.?\s*Financial\s+Statements', 'Item 8 - Financial Statements'),
# NEW: Part I/II detection (edgar/documents/extractors/section_extractor.py:294-324)
def _detect_10q_parts(self, headers) -> Dict[int, str]:
"""Detect Part I and Part II boundaries in 10-Q filings."""
```
#### 5. **Table Processing for AI Context** ✅ **ACHIEVED**
> "Getting tables in the right structure for rendering to text for AI context is more important than dataframes"
**Status**: ✅ Excellent progress with recent fixes
- Advanced TableMatrix system handles complex tables
- Multi-row header detection and normalization
- Spacing column filtering (preserves semantic columns)
- Currency symbol merging
- Clean text rendering for LLM consumption
**Recent Fixes (Today)**:
- ✅ Fixed spacing column filter removing legitimate headers (MSFT Table 39)
- ✅ Fixed header detection for date ranges (Oracle Table 6)
- ✅ Fixed long narrative text misclassification (Tesla Table 16)
- ✅ Header row normalization for alignment
#### 6. **Better Than Old Parser in Every Way** 🟡 **MOSTLY ACHIEVED**
> "Speed, accuracy, features, usability"
**Comparison**:
| Aspect | Old Parser | New Parser | Status |
|--------|-----------|------------|--------|
| **Speed** | Baseline | 1.4x faster (typical) | ✅ Better |
| **Accuracy** | Good | Excellent (with recent fixes) | ✅ Better |
| **Features** | Basic | Rich (XBRL, sections, multiple outputs) | ✅ Better |
| **Usability** | Simple | Powerful + Simple API | ✅ Better |
| **Table Rendering** | Basic alignment | Advanced matrix system | ✅ Better |
| **Section Detection** | Limited | Comprehensive | ✅ Better |
**Areas Needing Validation**:
- Performance on very large documents (>50MB)
- Memory usage under sustained load
- Edge case handling across diverse filings
---
## Item/Section Detection Deep Dive
### Current Capabilities
**10-K Sections Detected**:
- ✅ Item 1 - Business
- ✅ Item 1A - Risk Factors
- ✅ Item 1B - Unresolved Staff Comments
- ✅ Item 2 - Properties
- ✅ Item 3 - Legal Proceedings
- ✅ Item 4 - Mine Safety Disclosures
- ✅ Item 5 - Market for Stock
- ✅ Item 6 - Selected Financial Data
- ✅ Item 7 - MD&A
- ✅ Item 7A - Market Risk
- ✅ Item 8 - Financial Statements
- ✅ Item 9 - Changes in Accounting
- ✅ Item 9A - Controls and Procedures
- ✅ Item 9B - Other Information
- ✅ Item 10 - Directors and Officers
- ✅ Item 11 - Executive Compensation
- ✅ Item 12 - Security Ownership
- ✅ Item 13 - Related Transactions
- ✅ Item 14 - Principal Accountant
- ✅ Item 15 - Exhibits
**10-Q Sections Detected**:
- ✅ Part I Items (Financial Information):
- Part I - Item 1 - Financial Statements
- Part I - Item 2 - MD&A
- Part I - Item 3 - Market Risk
- Part I - Item 4 - Controls and Procedures
- ✅ Part II Items (Other Information):
- Part II - Item 1 - Legal Proceedings
- Part II - Item 1A - Risk Factors
- Part II - Item 2 - Unregistered Sales
- Part II - Item 6 - Exhibits
**✅ FIXED** (2025-10-07): Part I/Part II distinction now implemented!
- Part I Item 1 and Part II Item 1 are properly distinguished
- Section keys include Part context: "Part I - Item 1 - Financial Statements" vs "Part II - Item 1 - Legal Proceedings"
- Comprehensive test coverage added (5 tests in test_10q_part_detection.py)
**8-K Sections**:
- ⚠️ Limited - needs expansion
### Detection Methods
1. **TOC-based Detection**
- Analyzes Table of Contents
- Extracts anchor links
- Maps sections to content
2. **Pattern-based Detection**
- Regex matching for Item headers
- Heading analysis (h1-h6 tags)
- Text pattern recognition
3. **Hybrid Approach**
- Combines TOC + patterns
- Fallback mechanisms
- Cross-validation
### What's Working
```python
# This works today:
from edgar.documents import parse_html
html = filing.html()
doc = parse_html(html)
# Get all sections
sections = doc.sections # Returns dict
# Access specific Items
if 'Item 7 - MD&A' in sections:
mda = sections['Item 7 - MD&A']
mda_text = mda.text()
mda_tables = mda.tables()
```
### What Needs Work
1. **Validation Coverage** (20% remaining)
- Test against 100+ diverse 10-K filings
- Test against 10-Q filings
- Test against 8-K filings
- Capture edge cases and variations
2. **Edge Cases** (20% remaining)
- Non-standard Item formatting
- Missing TOC
- Nested sections
- Combined Items (e.g., "Items 10, 13, 14")
3. **8-K Support** (50% remaining)
- 8-K specific Item patterns
- Event-based section detection
- Exhibit handling
---
## Recent Achievements (Past 24 Hours)
### Critical Bug Fixes ✅
1. **Spacing Column Filter Fix** (MSFT Table 39)
- Problem: Legitimate headers removed as "spacing"
- Solution: Header content protection + colspan preservation
- Impact: Tables now render correctly with all headers
- Commits: `4e43276`, `d19ddd1`
2. **Header Detection Improvements**
- Oracle Table 6: Date ranges no longer misclassified
- Tesla Table 16: Long narrative text properly handled
- Multi-row header normalization
- Comprehensive test coverage (16 new tests)
3. **Documentation Updates**
- TESTING.md clarified output limits
- CHANGELOG updated with fixes
- Bug reports and research docs completed
### Quality Metrics
**Test Coverage**:
- 16 new tests added (all passing)
- 0 regressions in existing tests
- Comprehensive edge case coverage
**Code Quality**:
- Clean implementation following plan
- Well-documented changes
- Proper commit messages with Claude Code attribution
---
## Path to 100% Completion
### High Priority (Next Steps)
**📋 Detailed plans available**:
- **Performance**: See `docs-internal/planning/active-tasks/2025-10-07-performance-optimization-plan.md`
- **Testing**: See `docs-internal/planning/active-tasks/2025-10-07-comprehensive-testing-plan.md`
1. **Performance Optimization** (1-2 weeks)
- [ ] Phase 1: Benchmarking & profiling (2-3 days)
- [ ] Phase 2: Algorithm optimizations (3-4 days)
- [ ] Phase 3: Validation & regression tests (2-3 days)
- [ ] Phase 4: Documentation & monitoring (1 day)
- **Goal**: Maintain 1.3x+ speed advantage, <2x memory usage
2. **Comprehensive Testing** (2-3 weeks)
- [ ] Phase 1: Corpus validation - 100+ filings (3-4 days)
- [ ] Phase 2: Edge cases & error handling (2-3 days)
- [ ] Phase 3: Integration testing (2-3 days)
- [ ] Phase 4: Regression prevention (1-2 days)
- [ ] Phase 5: Documentation & sign-off (1 day)
- **Goal**: >95% success rate, >80% test coverage
3. **Item Detection Validation** (included in testing plan)
- [ ] Test against 50+ diverse 10-K filings
- [ ] Test against 20+ 10-Q filings
- [ ] Document any pattern variations found
- [ ] Add regression tests for edge cases
### Medium Priority
4. **8-K Support** (1-2 days)
- [ ] Research 8-K Item patterns
- [ ] Implement detection patterns
- [ ] Test against sample 8-K filings
5. **Documentation** (1 day)
- [ ] User guide for section access
- [ ] API documentation
- [ ] Migration guide from old parser
- [ ] Examples and recipes
### Low Priority (Polish)
6. **Final Polish**
- [ ] Error message improvements
- [ ] Logging enhancements
- [ ] Configuration documentation
- [ ] Performance tuning
---
## Risk Assessment
### Low Risk ✅
- Core parsing functionality (stable)
- Table processing (recently fixed, well-tested)
- Text extraction (working well)
- XBRL extraction (functional)
### Medium Risk ⚠️
- Section detection edge cases (needs validation)
- Performance on very large docs (needs testing)
- Memory usage (needs profiling)
### Mitigation Strategy
1. Comprehensive validation testing (in progress)
2. Real-world filing corpus testing
3. Performance benchmarking suite
4. Gradual rollout with monitoring
---
## Recommendations
### Immediate Actions (This Week)
1. **Validate Item Detection** 🎯 **TOP PRIORITY**
```bash
# Run on diverse corpus
python tests/manual/compare_parsers.py --all
# Test specific sections
python -c "
from edgar.documents import parse_html
from pathlib import Path
for filing in ['Apple', 'Oracle', 'Tesla', 'Microsoft']:
html = Path(f'data/html/{filing}.10-K.html').read_text()
doc = parse_html(html)
print(f'{filing}: {list(doc.sections.keys())[:5]}...')
"
```
2. **Create Section Access Tests**
- Write tests that verify each Item can be accessed
- Validate text and table extraction from sections
- Test edge cases (missing Items, combined Items)
3. **User Acceptance Testing**
- Have maintainer review section detection output
- Validate against known-good filings
- Document any issues found
### Timeline to Production
**Optimistic**: 1 week
- If validation shows good Item detection
- If performance is acceptable
- If no major issues found
**Realistic**: 2-3 weeks
- Account for edge case fixes
- Additional testing needed
- Documentation completion
**Conservative**: 4 weeks
- Account for 8-K support
- Comprehensive testing across all filing types
- Full documentation
---
## Conclusion
The HTML parser rewrite is **very close to completion** with excellent progress on all goals:
**✅ Fully Achieved**:
- Semantic meaning preservation
- AI/Human channel support
- Section-level processing
- Table processing for AI context
- Superior to old parser (in most respects)
- **Standard Item detection for 10-K/10-Q** (with Part I/II distinction)
**⚠️ Remaining Work (10%)**:
- Validation against diverse corpus
- Edge case handling
- 8-K specific support expansion
- Final testing and documentation
**Bottom Line**: The parser is **production-ready for 10-K/10-Q** with Item detection functional but requiring validation. The recent bug fixes have resolved critical table rendering issues. With 1-2 weeks of focused validation and testing, this can be shipped with confidence.
### Next Steps
1. Run comprehensive Item detection validation
2. Create section access test suite
3. Performance benchmark
4. Maintainer review and sign-off
5. Merge to main branch

View File

@@ -0,0 +1,233 @@
# HTML Parser Testing Quick Start
Quick reference for testing the HTML parser rewrite during quality improvement.
## Quick Start
```bash
# Use shortcuts (easy!)
python tests/manual/compare_parsers.py aapl # Apple 10-K
python tests/manual/compare_parsers.py nvda --tables # Nvidia tables
python tests/manual/compare_parsers.py 'aapl 10-q' # Apple 10-Q
python tests/manual/compare_parsers.py orcl --table 5 # Oracle table #5
# Or use full paths
python tests/manual/compare_parsers.py data/html/Apple.10-K.html
# Run all test files
python tests/manual/compare_parsers.py --all
```
**Available shortcuts:**
- **Companies**: `aapl`, `msft`, `tsla`, `nvda`, `orcl` (or full names like `apple`)
- **Filing types**: `10-k` (default), `10-q`, `8-k`
- **Combine**: `'aapl 10-q'`, `'orcl 8-k'`
## Common Use Cases
### 1. First Look at a Filing
```bash
# Get overview: speed, table count, sections
python tests/manual/compare_parsers.py orcl
```
**Shows**:
- Parse time comparison (OLD vs NEW)
- Tables found
- Text length
- Sections detected
- New features (headings, XBRL)
### 2. Check Table Rendering
```bash
# List all tables with dimensions (shows first 20 tables)
python tests/manual/compare_parsers.py aapl --tables
# Compare specific table side-by-side (FULL table, no truncation)
python tests/manual/compare_parsers.py aapl --table 7
# Compare a range of tables
python tests/manual/compare_parsers.py aapl --range 5:10
```
**Look for**:
- Currency symbols merged: `$1,234` not `$ | 1,234`
- Proper column alignment
- Correct row/column counts
- Clean rendering without extra spacing columns
**Note**: `--table N` shows the **complete table** with all rows - no truncation!
### 3. Verify Text Extraction
```bash
# See first 50 lines side-by-side (default limit)
python tests/manual/compare_parsers.py msft --text
# Show more lines (configurable)
python tests/manual/compare_parsers.py msft --text --lines 100
# Show first 200 lines
python tests/manual/compare_parsers.py msft --text --lines 200
```
**Check**:
- Semantic meaning preserved
- No missing content
- Clean formatting for LLM consumption
**Note**: Text mode shows first N lines only (default: 50). Use `--lines N` to adjust.
### 4. Check Section Detection
```bash
python tests/manual/compare_parsers.py aapl --sections
```
**Verify**:
- Standard sections identified (10-K/10-Q)
- Section boundaries correct
- Text length reasonable per section
### 5. Run Full Test Suite
```bash
# Test all files in corpus
python tests/manual/compare_parsers.py --all
```
**Results**:
- Summary table across all files
- Performance comparison
- Table detection comparison
## Test Files
Available in `data/html/`:
- `Apple.10-K.html` - 1.8MB, complex financials
- `Oracle.10-K.html` - Large filing
- `Nvidia.10-K.html` - Tech company
- `Apple.10-Q.html` - Quarterly format
- More files as needed...
## Command Reference
```
python tests/manual/compare_parsers.py [FILE] [OPTIONS]
Options:
--all Run on all test files
--tables Show tables summary (first 20 tables)
--table N Show specific table N side-by-side (FULL table)
--range START:END Show range of tables (e.g., 5:10)
--text Show text comparison (first 50 lines by default)
--sections Show sections comparison
--lines N Number of text lines to show (default: 50, only for --text)
--help Show full help
```
### Output Limits Summary
| Mode | Limit | Configurable | Notes |
|---------------|------------|-------------------|---------------------------------|
| `--table N` | None | N/A | Shows **complete table** |
| `--range N:M` | None | N/A | Shows **complete tables** in range |
| `--tables` | 20 tables | No | Lists first 20 tables only |
| `--text` | 50 lines | Yes (`--lines N`) | Preview only |
| `--sections` | None | N/A | Shows all sections |
## Output Interpretation
### Overview Table
```
┏━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━┓
┃ Metric ┃ Old Parser ┃ New Parser ┃ Notes ┃
┡━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━┩
│ Parse Time │ 454ms │ 334ms │ 1.4x faster│
│ Tables Found │ 63 │ 63 │ +0 │
│ Text Length │ 0 │ 159,388 │ NEW! │
└───────────────┴────────────┴────────────┴────────────┘
```
**Good signs**:
- ✅ New parser faster or similar speed
- ✅ Same or more tables found
- ✅ Text extracted (old parser shows 0)
- ✅ Sections detected
**Red flags**:
- ❌ Significantly slower
- ❌ Fewer tables (unless removing layout tables)
- ❌ Much shorter text (content missing)
### Table Comparison
```
Old Parser:
┌─────────┬──────────┬──────────┐
│ Year │ Revenue │ Profit │
├─────────┼──────────┼──────────┤
│ 2023 │ $ 100M │ $ 20M │ <- Currency separated
└─────────┴──────────┴──────────┘
New Parser:
┌─────────┬──────────┬──────────┐
│ Year │ Revenue │ Profit │
├─────────┼──────────┼──────────┤
│ 2023 │ $100M │ $20M │ <- Currency merged ✅
└─────────┴──────────┴──────────┘
```
**Look for**:
- Currency symbols merged with values
- No extra empty columns
- Proper alignment
- Clean numeric formatting
## Tips
1. **Start with overview** - Get the big picture first
2. **Check tables visually** - Automated metrics miss formatting issues
3. **Use specific table inspection** - Don't scroll through 60 tables manually
4. **Compare text for semantics** - Does it make sense for an LLM?
5. **Run --all periodically** - Catch regressions across files
## Troubleshooting
### Script fails with import error
```bash
# Clear cached modules
find . -type d -name __pycache__ -exec rm -rf {} +
python tests/manual/compare_parsers.py data/html/Apple.10-K.html
```
### File not found
```bash
# Check available files
ls -lh data/html/*.html
# Use full path
python tests/manual/compare_parsers.py /full/path/to/file.html
```
### Old parser shows 0 text
This is expected - old parser has different text extraction. Focus on:
- Table comparison
- Parse time
- Visual quality of output
## Next Steps
1. Run comparison on all test files
2. Document bugs in `quality-improvement-strategy.md`
3. Fix issues
4. Repeat until satisfied
See `edgar/documents/docs/quality-improvement-strategy.md` for full process.

View File

@@ -0,0 +1,529 @@
# Fast Table Rendering
**Status**: Production Ready - **Now the Default** (as of 2025-10-08)
**Performance**: ~8-10x faster than Rich rendering with correct colspan/rowspan handling
---
## Overview
Fast table rendering provides a high-performance alternative to Rich library rendering for table text extraction. When parsing SEC filings with hundreds of tables, the cumulative rendering time can become a bottleneck. Fast rendering addresses this by using direct string building with TableMatrix for proper colspan/rowspan handling, achieving 8-10x speedup while maintaining correctness.
**As of 2025-10-08, fast rendering is the default** for all table text extraction. You no longer need to explicitly enable it.
### Why It's Now the Default
- **Production-ready**: Fixed all major issues (colspan, multi-row headers, multi-line cells)
- **7-10x faster**: Significant performance improvement with correct output
- **Maintains quality**: Matches Rich's appearance with simple() style
- **Proven**: Extensively tested with Apple, NVIDIA, Microsoft 10-K filings
### When to Disable (Use Rich Instead)
You may want to disable fast rendering and use Rich for:
- **Terminal display for humans**: Rich has more sophisticated text wrapping and layout
- **Visual reports**: When presentation quality is more important than speed
- **Debugging**: Rich output can be easier to visually inspect
---
## Usage
### Default Behavior (Fast Rendering Enabled)
```python
from edgar.documents import parse_html
# Fast rendering is now the default - no configuration needed!
doc = parse_html(html)
# Tables automatically use fast renderer (7-10x faster)
table_text = doc.tables[0].text()
```
### Disabling Fast Rendering (Use Rich Instead)
If you need Rich's sophisticated layout for visual display:
```python
from edgar.documents import parse_html
from edgar.documents.config import ParserConfig
# Explicitly disable fast rendering to use Rich
config = ParserConfig(fast_table_rendering=False)
doc = parse_html(html, config=config)
# Tables use Rich renderer (slower but with advanced formatting)
table_text = doc.tables[0].text()
```
### Custom Table Styles
**New in this version**: Fast rendering now uses the `simple()` style by default, which matches Rich's `box.SIMPLE` appearance (borderless, clean).
```python
from edgar.documents import parse_html
from edgar.documents.config import ParserConfig
from edgar.documents.renderers.fast_table import FastTableRenderer, TableStyle
# Enable fast rendering (uses simple() style by default)
config = ParserConfig(fast_table_rendering=True)
doc = parse_html(html, config=config)
# Default: simple() style - borderless, clean
table_text = doc.tables[0].text()
# To use pipe_table() style explicitly (markdown-compatible borders):
renderer = FastTableRenderer(TableStyle.pipe_table())
pipe_text = renderer.render_table_node(doc.tables[0])
# To use minimal() style (no separator):
renderer = FastTableRenderer(TableStyle.minimal())
minimal_text = renderer.render_table_node(doc.tables[0])
```
---
## Performance Comparison
### Benchmark Results
**Test**: Apple 10-K (63 tables) - Updated 2025-10-08
| Renderer | Average Per Table | Improvement | Notes |
|----------|-------------------|-------------|-------|
| Rich | 1.5-2.5ms | Baseline | Varies by table complexity |
| Fast (simple) | 0.15-0.35ms | **7-10x faster** | With proper colspan/rowspan handling |
**Real-world Examples** (Apple 10-K):
- Table 15 (complex colspan): Rich 2.51ms → Fast 0.35ms (**7.1x faster**)
- Table 6 (multi-line cells): Rich 1.61ms → Fast 0.17ms (**9.5x faster**)
- Table 5 (wide table): Rich 3.70ms → Fast 0.48ms (**7.7x faster**)
**Impact on Full Parse**:
- Rich rendering: 30-40% of total parse time spent in table rendering
- Fast rendering: 5-10% of total parse time
- **Overall speedup**: Reduces total parsing time by ~25-30%
### Memory Impact
Fast rendering also reduces memory overhead:
- No Rich Console objects retained
- Direct string building (no intermediate objects)
- Helps prevent memory leaks identified in profiling
---
## Output Examples
### Rich Renderer Output (Default)
```
(In millions)
Year Ended June 30, 2025 2024 2023
──────────────────────────────────────────────────────────
Operating lease cost $5,524 3,555 2,875
Finance lease cost:
Amortization of right-of-use assets $3,408 1,800 1,352
Interest on lease liabilities 1,417 734 501
Total finance lease cost $4,825 2,534 1,853
```
**Style**: `box.SIMPLE` - No outer border, just horizontal separator under header
**Pros**: Clean, uncluttered, perfect alignment, generous spacing
**Cons**: Slow (6.5ms per table), creates Rich objects, memory overhead
### Fast Renderer Output (NEW: simple() style - Default)
```
December 31, 2023 December 31, 2022 December 31, 2021
───────────────────────────────────────────────────────────────────────────────────────
Revenue 365,817 394,328 365,817
Cost of revenue 223,546 212,981 192,266
Gross profit 142,271 181,347 173,551
```
**Style**: `simple()` - Matches Rich's `box.SIMPLE` appearance
**Pros**: Fast (0.2ms per table), clean appearance, no visual noise, professional look
**Cons**: None - this is now the recommended default!
### Fast Renderer Output (pipe_table() style - Optional)
```
| | December 31, 2023 | December 31, 2022 | December 31, 2021 |
|--------------------------|---------------------|---------------------|---------------------|
| Revenue | 365,817 | 394,328 | 365,817 |
| Cost of revenue | 223,546 | 212,981 | 192,266 |
| Gross profit | 142,271 | 181,347 | 173,551 |
```
**Style**: `pipe_table()` - Markdown-compatible with borders
**Pros**: Fast (0.2ms per table), markdown-compatible, explicit column boundaries
**Cons**: Visual noise from pipe characters, busier appearance
**Use when**: You need markdown-compatible output with explicit borders
### Visual Comparison
**Rich** (`box.SIMPLE`):
- No outer border - clean, uncluttered look
- Horizontal line separator under header only
- Generous internal spacing and padding
- Perfect column alignment
- Professional, minimalist presentation
**Fast simple()** (NEW DEFAULT):
- No outer border - matches Rich's clean look
- Horizontal line separator under header (using `─`)
- Space-separated columns with generous padding
- Clean, professional appearance
- Same performance as pipe_table (~0.2ms per table)
**Fast pipe_table()** (optional):
- Full pipe table borders (`|` characters everywhere)
- Horizontal dashes for header separator
- Markdown-compatible format
- Explicit column boundaries
---
## Recent Improvements (2025-10-08)
### 1. Colspan/Rowspan Support
**Fixed**: Tables with `colspan` and `rowspan` attributes now render correctly.
**Previous issue**: Fast renderer was extracting cell text without accounting for colspan/rowspan, causing:
- Missing columns (e.g., "2023" column disappeared in Apple 10-K table 15)
- Misaligned data (currency symbols separated from values)
- Data loss (em dashes and other values missing)
**Solution**: Integrated `TableMatrix` for proper cell expansion, same as Rich rendering uses.
**Status**: ✅ FIXED
### 2. Multi-Row Header Preservation
**Fixed**: Tables with multiple header rows now preserve each row separately.
**Previous issue**: Multi-row headers were collapsed into a single line, causing "Investment portfolio" row to disappear in Apple 10-K table 20.
**Solution**: Modified `render_table_data()` and `_build_table()` to preserve each header row as a separate line.
**Status**: ✅ FIXED
### 3. Multi-Line Cell Rendering
**Fixed**: Cells containing newline characters (`\n`) now render as multiple lines.
**Previous issue**: Multi-line cells like "Interest Rate\nSensitive Instrument" were truncated to first line only.
**Solution**: Added `_format_multiline_row()` to split cells by `\n` and render each line separately.
**Status**: ✅ FIXED
### Performance Impact
All three fixes maintain excellent performance:
- **Speedup**: 7-10x faster than Rich (down from initial 14x, but with correct output)
- **Correctness**: Now matches Rich output exactly for colspan, multi-row headers, and multi-line cells
- **Production ready**: Can confidently use as default renderer
---
## Known Limitations
### 1. Column Alignment in Some Tables
**Issue**: Currency symbols and values may have extra spacing in some complex tables (e.g., Apple 10-K table 22)
**Example**:
- Rich: `$294,866`
- Fast: `$ 294,866` (extra spacing)
**Root cause**: Column width calculation creates wider columns for some currency/value pairs after colspan expansion and column filtering.
**Impact**: Visual appearance differs slightly, but data is correct and readable.
**Status**: ⚠️ Minor visual difference - acceptable trade-off for 10x performance gain
### 2. Visual Polish
**Issue**: Some visual aspects don't exactly match Rich's sophisticated layout
**Examples**:
- Multi-line cell wrapping may differ
- Column alignment in edge cases
**Status**: ⚠️ Acceptable trade-off for 8-10x performance gain
---
## Configuration Options
### Table Styles
Fast renderer supports different visual styles:
```python
from edgar.documents.renderers.fast_table import FastTableRenderer, TableStyle
# Pipe table style - markdown compatible (optional; simple() is the default)
renderer = FastTableRenderer(TableStyle.pipe_table())
# Minimal style - no borders, just spacing
renderer = FastTableRenderer(TableStyle.minimal())
```
### Minimal Style Output
```
December 31, 2023 December 31, 2022 December 31, 2021
Revenue 365,817 394,328 365,817
Cost of revenue 223,546 212,981 192,266
Gross profit 142,271 181,347 173,551
```
**Note**: Minimal style has cleaner appearance but loses column boundaries
---
## Technical Details
### How It Works
1. **Direct String Building**: Bypasses Rich's layout engine
2. **Column Analysis**: Detects numeric columns for right-alignment
3. **Smart Filtering**: Removes empty spacing columns
4. **Currency Merging**: Combines `$` symbols with amounts
5. **Width Calculation**: Measures content, applies min/max limits
### Code Path
```python
# When fast_table_rendering=True:
table.text()
  → TableNode._fast_text_rendering()
    → FastTableRenderer.render_table_node()
      → Direct string building
```
### Memory Benefits
Fast rendering avoids:
- Rich Console object creation (~0.4MB per document)
- Intermediate rich.Table objects
- Style/theme processing overhead
- ANSI escape code generation
---
## Future Improvements
### Planned Enhancements
1. **Match Rich's `box.SIMPLE` Style** (Priority: HIGH)
- **Remove all pipe characters** - no outer border, no column separators
- **Keep only horizontal separator** under header (using `─` character)
- **Increase internal padding** to match Rich's generous spacing
- **Clean, minimalist appearance** like Rich's SIMPLE box style
- **Goal**: Match Rich visual quality while retaining the 8-10x speed advantage
2. **Improved Layout Engine**
- Better column width calculation (avoid too-wide/too-narrow columns)
- Respect natural content breaks
- Dynamic spacing based on content type
- Handle wrapping for long content
3. **Dynamic Padding**
- Match Rich's generous spacing (currently too tight)
- Adjust padding based on content type
- Configurable padding rules
- Maintain alignment with variable padding
4. **Header Handling**
- Better multi-row header collapse
- Preserve important hierarchies
- Smart column spanning
- Honor header groupings
5. **Style Presets**
- `TableStyle.simple()` - Match Rich's `box.SIMPLE` (no borders, header separator only) ⭐ **PRIMARY GOAL**
- `TableStyle.minimal()` - no borders, just spacing (already implemented)
- `TableStyle.pipe_table()` - markdown style (previous default, still available)
- `TableStyle.ascii_clean()` - no Unicode, pure ASCII
- `TableStyle.compact()` - minimal spacing for dense data
### Timeline
These improvements are **planned for Phase 2** of the HTML parser optimization work (after memory leak fixes).
---
## Migration Guide
### From Rich to Fast
**Before** (using Rich):
```python
doc = parse_html(html)
table_text = doc.tables[0].text() # Slow but pretty
```
**After** (using Fast):
```python
config = ParserConfig(fast_table_rendering=True)
doc = parse_html(html, config=config)
table_text = doc.tables[0].text() # Fast, with only minor visual differences
```
### Hybrid Approach
Use fast rendering during processing, Rich for final display:
```python
# Fast processing
config = ParserConfig(fast_table_rendering=True)
doc = parse_html(html, config=config)
# Extract data quickly
for table in doc.tables:
data = table.text() # Fast
# Process data...
# Display one table nicely
special_table = doc.tables[5]
rich_output = special_table.render() # Switch to Rich for display
```
---
## Performance Recommendations
### Recommended Settings by Use Case
**Batch Processing** (optimize for speed):
```python
config = ParserConfig.for_performance()
# Includes: fast_table_rendering=True, eager_section_extraction=False
```
**Data Extraction** (balance speed and accuracy):
```python
config = ParserConfig(
fast_table_rendering=True,
extract_xbrl=True,
detect_sections=True
)
```
**Display/Reports** (optimize for quality):
```python
config = ParserConfig() # Default settings use Rich
# Or explicitly:
config = ParserConfig.for_accuracy()
```
---
## FAQ
**Q: Can I mix Fast and Rich rendering?**
A: Not per-table. The setting is document-wide via ParserConfig. However, you can manually call `table.render()` to get Rich output.
**Q: Does this affect section extraction?**
A: Indirectly, yes. Section detection calls `text()` on the entire document, which includes tables. Fast rendering speeds this up significantly.
**Q: Will the output format change?**
A: Yes, as we improve the renderer. We'll maintain backward compatibility via style options.
**Q: Can I customize the appearance?**
A: Currently limited to `TableStyle.simple()` (default), `TableStyle.pipe_table()`, and `TableStyle.minimal()`. More options coming.
**Q: What about DataFrame export?**
A: Fast rendering only affects text output. `table.to_dataframe()` is unaffected.
---
## Feedback
The fast renderer is actively being improved based on user feedback. Known issues:
1. **Pipe characters** - visual noise
2. **Layout engine** - inconsistent spacing
3. **Padding** - needs tuning
If you have specific rendering issues or suggestions, please provide:
- Sample table HTML
- Expected vs actual output
- Use case description
This helps prioritize improvements while maintaining the performance advantage.
---
## Summary
### Current State (As of 2025-10-08)
**Performance**: ✅ Excellent (8-10x faster than Rich)
**Correctness**: ✅ Production ready (proper colspan/rowspan handling)
**Visual Quality**: ⚠️ Good (simple() style matches Rich's box.SIMPLE appearance)
**Use Case**: Production-ready for all use cases
### Recent Milestones
**✅ Completed**:
- Core fast rendering implementation
- TableStyle.simple() preset (borderless, clean)
- Column filtering and merging
- Numeric alignment detection
- **Colspan/rowspan support via TableMatrix**
- **Performance benchmarking with real tables**
**🔧 Current Limitations**:
- Multi-row header collapsing differs from Rich
- Some visual polish differences (acceptable for speed gain)
- Layout engine not as sophisticated as Rich
### Development Roadmap
**Phase 1** (✅ COMPLETED):
- ✅ Core fast rendering implementation
- ✅ Simple() style matching Rich's box.SIMPLE
- ✅ Proper colspan/rowspan handling via TableMatrix
- ✅ Production-ready performance (8-10x faster)
**Phase 2** (Future Enhancements):
- 📋 Improve multi-row header handling
- 📋 Better layout engine for perfect column widths
- 📋 Additional style presets
- 📋 Advanced header detection (data vs labels)
### Bottom Line
Fast table rendering is **production-ready and now the default** for all table text extraction in EdgarTools.
**Benefits**:
- ✅ 7-10x faster than Rich rendering
- ✅ Correct data extraction with proper colspan/rowspan handling
- ✅ Multi-row header preservation
- ✅ Multi-line cell rendering
- ✅ Clean, borderless appearance (simple() style)
**Minor differences from Rich**:
- ⚠️ Some tables have extra spacing between currency symbols and values (e.g., table 22)
- ⚠️ Column width calculation may differ slightly in complex tables
- ✅ All data is preserved and correct - only visual presentation differs
The implementation achieves **correct data extraction** with **significant performance gains** and **clean visual output**, making it the ideal default for EdgarTools.
---
## Related Documentation
- [HTML Parser Status](HTML_PARSER_STATUS.md) - Overall parser progress
- [Performance Analysis](../perf/hotpath_analysis.md) - Profiling results showing Rich rendering bottleneck
- [Memory Analysis](../perf/memory_analysis.md) - Memory leak issues with Rich objects

View File

@@ -0,0 +1,164 @@
# Goals
## Mission
Replace `edgar.files` with a parser that is better in **every way** - utility, accuracy, and user experience. The maintainer is the final judge: output must look correct when printed.
## Core Principles
### Primary Goal: AI Context Optimization
- **Token efficiency**: 30-50% reduction vs raw HTML while preserving semantic meaning
- **Chunking support**: Enable independent processing of sections/tables for LLM context windows
- **Clean text output**: Tables rendered in LLM-friendly formats (clean text, markdown)
- **Semantic preservation**: Extract meaning, not just formatting
### Secondary Goal: Human Readability
- **Rich console output**: Beautiful rendering with proper table alignment
- **Markdown export**: Professional-looking document conversion
- **Section navigation**: Easy access to specific Items/sections
## User-Focused Feature Goals
### 1. Text Extraction
- Extract full document text without dropping meaningful content
- Preserve paragraph structure and semantic whitespace
- Handle inline XBRL facts gracefully (show values, not raw tags)
- Clean HTML artifacts automatically (scripts, styles, page numbers)
- **Target**: 99%+ accuracy vs manual reading
### 2. Section Extraction (10-K, 10-Q, 8-K)
- Detect >90% of standard sections for >90% of test tickers
- Support flexible access: `doc.sections['Item 1A']`, `doc['1A']`, `doc.risk_factors`
- Return Section objects with `.text()`, `.tables`, `.search()` methods
- Include confidence scores and detection method metadata
- **Target**: Better recall than old parser (quantify with test suite)
### 3. Table Extraction
- Extract all meaningful data tables (ignore pure layout tables)
- Accurate rendering with aligned columns and proper formatting
- Handle complex tables (rowspan, colspan, nested headers)
- Preserve table captions and surrounding context
- Support DataFrame conversion for data analysis
- **Target**: 95%+ accuracy on test corpus
### 4. Search Capabilities
- Text search within documents
- Regex pattern matching
- Semantic search preparation (structure for embedding-based search)
- Search within sections for focused queries
### 5. Multiple Output Formats
- Plain text (optimized for LLM context)
- Markdown (for documentation/sharing)
- Rich console (beautiful terminal display)
- JSON (structured data export)
### 6. Developer Experience
- Intuitive API: `doc.text()`, `doc.tables`, `doc.sections`
- Rich objects with useful methods (not just strings)
- Simple tasks simple, complex tasks possible
- Helpful error messages with recovery suggestions
- **Target**: New users productive in <10 minutes
## Performance Targets
### Speed Benchmarks (Based on Current Performance)
- **Small docs (<5MB)**: <500ms ✅ *Currently 96ms - excellent*
- **Medium docs (5-20MB)**: <2s ✅ *Currently 1.19s - excellent*
- **Large docs (>50MB)**: <10s ✅ *Currently 0.59s - excellent*
- **Throughput**: >3MB/s sustained ✅ *Currently 3.8MB/s*
- **Target**: Maintain or improve on all benchmarks
### Memory Efficiency
- **Small docs (<5MB)**: <3x document size *(currently 9x - needs optimization)*
- **Large docs (>10MB)**: <2x document size *(currently 1.9x - good)*
- **No memory spikes**: Never exceed 5x document size *(MSFT currently 5.4x)*
- **Target**: Consistent 2-3x overhead across all document sizes
### Accuracy Benchmarks
- **Section detection recall**: >90% on 20-ticker test set
- **Table extraction accuracy**: >95% on manual validation set
- **Text fidelity**: >99% semantic equivalence to source HTML
- **XBRL fact extraction**: 100% of inline facts captured correctly
## Implementation Details
### HTML Parsing
- Read the entire HTML document without dropping semantically meaningful content
- Drop non-meaningful content (scripts, styles, pure formatting tags)
- Preserve semantic structure (headings, paragraphs, lists)
- Handle both old (pre-2015) and modern (inline XBRL) formats
- Graceful degradation for malformed HTML
### Table Parsing
- Extract tables containing meaningful data
- Ignore layout tables (unless they aid document understanding)
- Accurate rendering with proper column alignment
- Handle complex structures: rowspan, colspan, nested headers, multi-level headers
- Preserve table captions and contextual information
- Support conversion to pandas DataFrame
### Section Extraction
- Detect standard sections (Item 1, 1A, 7, etc.) for 10-K, 10-Q, 8-K filings
- Support multiple detection strategies: TOC-based, heading-based, pattern-based
- Return Section objects with full API: `.text()`, `.text_without_tables()`, `.tables`, `.search()`
- Include metadata: confidence scores, detection method, position
- Better recall than old parser (establish baseline with test suite)
## Quality Gates Before Replacing edgar.files
### Automated Tests
- [ ] All existing tests pass with new parser (1000+ tests)
- [ ] Performance regression tests (<5% slower on any document)
- [ ] Memory regression tests (no >10% increases)
- [ ] Section detection accuracy >90% on test corpus
- [ ] Table extraction accuracy >95% on validation set
### Manual Validation (Maintainer Review)
- [ ] Print full document text for 10 sample filings → verify quality
- [ ] Compare table rendering old vs new → verify improvement
- [ ] Test section extraction on edge cases → verify robustness
- [ ] Review markdown output → verify professional appearance
- [ ] Check memory usage → verify no concerning spikes
### Documentation Requirements
- [ ] Migration guide (old API → new API with examples)
- [ ] Updated user guide showing new features
- [ ] Performance comparison report (old vs new)
- [ ] Known limitations documented clearly
- [ ] API reference complete for all public methods
## Success Metrics
### Launch Criteria
1. **Speed**: Equal or faster on 95% of test corpus
2. **Accuracy**: Maintainer approves output quality on sample set
3. **API**: Clean, intuitive interface (no confusion)
4. **Tests**: Zero regressions, 95%+ coverage on new code
5. **Docs**: Complete with examples for all major use cases
### Post-Launch Monitoring
- Issue reports: <5% related to parser quality/accuracy
- User feedback: Positive sentiment on ease of use
- Performance: No degradation over time (regression tests)
- Adoption: Smooth migration from old parser (deprecation path)
## Feature Parity with Old Parser
### Must-Have (Required for Migration)
- ✅ Get document text (with/without tables)
- ✅ Extract specific sections by name/number
- ✅ List all tables in document
- ✅ Search document content
- ✅ Convert to markdown
- ✅ Handle both old and new SEC filing formats
- ✅ Graceful error handling
### Nice-to-Have (Improvements Over Old Parser)
- 🎯 Semantic search capabilities
- 🎯 Better subsection extraction within Items
- 🎯 Table-of-contents navigation
- 🎯 Export to multiple formats (JSON, clean HTML)
- 🎯 Batch processing optimizations
- 🎯 Section confidence scores and metadata

View File

@@ -0,0 +1,240 @@
# HTML Parser Rewrite Technical Overview
## Executive Summary
The `edgar/documents` module represents a comprehensive rewrite of the HTML parsing capabilities originally implemented in `edgar/files`. This new parser is designed to provide superior parsing accuracy, structured data extraction, and rendering quality for SEC filing documents. The rewrite introduces a modern, extensible architecture with specialized components for handling the complex structure of financial documents.
## Architecture Overview
### Core Components
#### 1. Document Object Model
The new parser introduces a sophisticated node-based document model:
- **Document**: Top-level container with metadata and sections
- **Node Hierarchy**: Abstract base classes for all document elements
- `DocumentNode`: Root document container
- `TextNode`: Plain text content
- `ParagraphNode`: Paragraph elements with styling
- `HeadingNode`: Headers with levels 1-6
- `ContainerNode`: Generic containers (div, section)
- `SectionNode`: Document sections with semantic meaning
- `ListNode`/`ListItemNode`: Ordered and unordered lists
- `LinkNode`: Hyperlinks with metadata
- `ImageNode`: Images with attributes
#### 2. Table Processing System
Advanced table handling represents a major improvement over the old parser:
- **TableNode**: Sophisticated table representation with multi-level headers
- **Cell**: Individual cell with colspan/rowspan support and type detection
- **Row**: Table row with header detection and semantic classification
- **TableMatrix**: Handles complex cell spanning and alignment
- **CurrencyColumnMerger**: Intelligently merges currency symbols with values
- **ColumnAnalyzer**: Detects spacing columns and optimizes layout
#### 3. Parser Pipeline
The parsing process follows a well-defined pipeline:
1. **HTMLParser**: Main orchestration class
2. **HTMLPreprocessor**: Cleans and normalizes HTML
3. **DocumentBuilder**: Converts HTML tree to document nodes
4. **Strategy Pattern**: Pluggable parsing strategies
5. **DocumentPostprocessor**: Final cleanup and optimization
### Key Improvements Over Old Parser
#### Table Processing Enhancements
**Old Parser (`edgar/files`)**:
- Basic table extraction using BeautifulSoup
- Limited colspan/rowspan handling
- Simple text-based rendering
- Manual column alignment
- Currency symbols often misaligned
**New Parser (`edgar/documents`)**:
- Advanced table matrix system for perfect cell alignment
- Intelligent header detection (multi-row headers, year detection)
- Automatic currency column merging ($1,234 instead of $ | 1,234)
- Semantic table type detection (FINANCIAL, METRICS, TOC, etc.)
- Rich table rendering with proper formatting
- Smart column width calculation
- Enhanced numeric formatting with comma separators
#### Document Structure
**Old Parser**:
- Flat block-based structure
- Limited semantic understanding
- Basic text extraction
**New Parser**:
- Hierarchical node-based model
- Semantic section detection
- Rich metadata preservation
- XBRL fact extraction
- Search capabilities
- Multiple output formats (text, markdown, JSON, pandas)
#### Rendering Quality
**Old Parser**:
- Basic text output
- Limited table formatting
- No styling preservation
**New Parser**:
- Multiple renderers (text, markdown, Rich console)
- Preserves document structure and styling
- Configurable output options
- LLM-optimized formatting
## Implementation Details
### Configuration System
The new parser uses a comprehensive configuration system:
```python
@dataclass
class ParserConfig:
# Size limits
max_document_size: int = 50 * 1024 * 1024 # 50MB
streaming_threshold: int = 10 * 1024 * 1024 # 10MB
# Processing options
preserve_whitespace: bool = False
detect_sections: bool = True
extract_xbrl: bool = True
table_extraction: bool = True
detect_table_types: bool = True
```
### Strategy Pattern Implementation
The parser uses pluggable strategies for different aspects:
- **HeaderDetectionStrategy**: Identifies document sections
- **TableProcessor**: Handles table extraction and classification
- **XBRLExtractor**: Extracts XBRL facts and metadata
- **StyleParser**: Processes CSS styling information
### Table Processing Deep Dive
The table processing system represents the most significant improvement:
#### Header Detection Algorithm
- Analyzes cell content patterns (th vs td elements)
- Detects year patterns in financial tables
- Identifies period indicators (quarters, fiscal years)
- Handles multi-row headers with units and descriptions
- Prevents misclassification of data rows as headers
#### Cell Type Detection
- Numeric vs text classification
- Currency value recognition
- Percentage handling
- Em dash and null value detection
- Proper number formatting with thousand separators
#### Matrix Building
- Handles colspan and rowspan expansion
- Maintains cell relationships
- Optimizes column layout
- Removes spacing columns automatically
### XBRL Integration
The new parser includes sophisticated XBRL processing:
- Extracts facts before preprocessing to preserve ix:hidden content
- Maintains metadata relationships
- Supports inline XBRL transformations
- Preserves semantic context
## Performance Characteristics
### Memory Efficiency
- Streaming support for large documents (>10MB)
- Lazy loading of document sections
- Caching for repeated operations
- Memory-efficient node representation
### Processing Speed
- Optimized HTML parsing with lxml
- Configurable processing strategies
- Parallel extraction capabilities
- Smart caching of expensive operations
## Migration and Compatibility
### API Compatibility
The new parser maintains high-level compatibility with the old parser while offering enhanced functionality:
```python
# Old way
from edgar.files import FilingDocument
doc = FilingDocument(html)
text = doc.text()
# New way
from edgar.documents import HTMLParser
parser = HTMLParser()
doc = parser.parse(html)
text = doc.text()
```
### Feature Parity
All major features from the old parser are preserved:
- Text extraction
- Table conversion to DataFrame
- Section detection
- Metadata extraction
### Enhanced Features
New capabilities not available in the old parser:
- Rich console rendering
- Markdown export
- Advanced table semantics
- XBRL fact extraction
- Document search
- LLM optimization
- Multiple output formats
## Current Status and Next Steps
### Completed Components
- ✅ Core document model
- ✅ HTML parsing pipeline
- ✅ Advanced table processing
- ✅ Multiple renderers (text, markdown, Rich)
- ✅ XBRL extraction
- ✅ Configuration system
- ✅ Streaming support
### Remaining Work
- 🔄 Performance optimization and benchmarking
- 🔄 Comprehensive test coverage migration
- 🔄 Error handling improvements
- 🔄 Documentation and examples
- 🔄 Validation against large corpus of filings
### Testing Strategy
The rewrite requires extensive validation:
- Comparison testing against old parser output
- Financial table accuracy verification
- Performance benchmarking
- Edge case handling
- Integration testing with existing workflows
## Conclusion
The `edgar/documents` rewrite represents a significant advancement in SEC filing processing capabilities. The new architecture provides:
1. **Better Accuracy**: Advanced table processing and semantic understanding
2. **Enhanced Functionality**: Multiple output formats and rich rendering
3. **Improved Maintainability**: Clean, modular architecture with clear separation of concerns
4. **Future Extensibility**: Plugin architecture for new parsing strategies
5. **Performance**: Streaming support and optimized processing for large documents
The modular design ensures that improvements can be made incrementally while maintaining backward compatibility. The sophisticated table processing system alone represents a major advancement in handling complex financial documents accurately.

View File

@@ -0,0 +1,208 @@
# HTML Parser Quality Improvement Strategy
## Overview
Simple, iterative testing strategy for the HTML parser rewrite. The goal is rapid feedback loops where we compare OLD vs NEW parser output, identify visual/functional issues, fix them, and repeat until satisfied.
## Test Corpus
### 10 Representative Documents
Selected to cover different filing types, companies, and edge cases:
| # | Company | Filing Type | File Path | Rationale |
|---|---------|-------------|-----------|-----------|
| 1 | Apple | 10-K | `data/html/Apple.10-K.html` | Large complex filing, existing test file |
| 2 | Oracle | 10-K | `data/html/Oracle.10-K.html` | Complex financials, existing test file |
| 3 | Nvidia | 10-K | `data/html/Nvidia.10-K.html` | Tech company, existing test file |
| 4 | Microsoft | 10-K | `data/html/Microsoft.10-K.html` | Popular company, complex tables |
| 5 | Tesla | 10-K | `data/html/Tesla.10-K.html` | Manufacturing sector, different formatting |
| 6 | [TBD] | 10-Q | TBD | Quarterly report format |
| 7 | [TBD] | 10-Q | TBD | Another quarterly for variety |
| 8 | Buckle Inc | 8-K | `data/html/BuckleInc.8-K.html` | Event-driven filing |
| 9 | [TBD] | Proxy (DEF 14A) | TBD | Proxy statement with compensation tables |
| 10 | [TBD] | Edge case | TBD | Unusual formatting or very large file |
**Note**: Fill in TBD entries as we identify good test candidates.
## The 4-Step Loop
### Step 1: Run Comparison
Use existing test scripts to compare OLD vs NEW parsers:
```bash
# Full comparison with metrics
python tests/manual/check_parser_comparison.py
# Table-focused comparison with rendering
python tests/manual/check_tables.py
# Or run on specific file
python tests/manual/check_html_rewrite.py
```
**Outputs to review**:
- Console output with side-by-side Rich panels
- Metrics (parse time, table count, section detection)
- Rendered tables (old vs new)
### Step 2: Human Review
**Visual Inspection Process**:
1. Look at console output directly (Rich rendering)
2. For detailed text comparison, optionally dump to files:
- OLD parser: `doc.text()``output/old_apple.txt`
- NEW parser: `doc.text()``output/new_apple.txt`
- Use `diff` or visual diff tool
3. Take screenshots for complex table issues
4. Focus on:
- Table alignment and formatting
- Currency symbol placement (should be merged: `$1,234` not `$ | 1,234`)
- Column count (fewer is better after removing spacing columns)
- Section detection accuracy
- Text readability for LLM context
**Quality Criteria** (from goals.md):
- Semantic meaning preserved
- Tables render correctly when printed
- Better than old parser in speed, accuracy, features
- **You are the final judge**: "Does this look right?"
### Step 3: Document Bugs
Record issues in the tracker below as you find them:
| Bug # | Status | Priority | Description | File/Location | Notes |
|-------|--------|----------|-------------|---------------|-------|
| Example | Fixed | High | Currency symbols not merging in balance sheet | Apple 10-K, Table 5 | Issue in CurrencyColumnMerger |
| | | | | | |
| | | | | | |
| | | | | | |
**Status values**: Open, In Progress, Fixed, Won't Fix, Deferred
**Priority values**: Critical, High, Medium, Low
**Bug Description Template**:
- What's wrong: Clear description of the issue
- Where: Which file/table/section
- Expected: What it should look like
- Actual: What it currently looks like
- Impact: How it affects usability/readability
### Step 4: Fix & Repeat
1. Pick highest priority bug
2. Fix the code
3. Re-run comparison on affected file(s)
4. Verify fix doesn't break other files
5. Mark bug as Fixed
6. Repeat until exit criteria met
**Quick verification**:
```bash
# Re-run just the problematic file
python -c "
from edgar.documents import parse_html
from pathlib import Path
html = Path('data/html/Apple.10-K.html').read_text()
doc = parse_html(html)
# Quick inspection
print(f'Tables: {len(doc.tables)}')
print(doc.tables[5].render(width=200)) # Check specific table
"
```
## Exit Criteria
We're done when:
1. ✅ All 10 test documents parse successfully
2. ✅ Visual output looks correct (maintainer approval)
3. ✅ Tables render cleanly with proper alignment
4. ✅ No critical or high priority bugs remain
5. ✅ Performance is equal or better than old parser
6. ✅ Text extraction is complete and clean for AI context
**Final approval**: Maintainer says "This is good enough to ship."
## Testing Infrastructure
### Primary Tool: compare_parsers.py
Simple command-line tool for the quality improvement loop:
```bash
# Quick overview comparison (using shortcuts!)
python tests/manual/compare_parsers.py aapl
# See all tables in a document
python tests/manual/compare_parsers.py aapl --tables
# Compare specific table (OLD vs NEW side-by-side)
python tests/manual/compare_parsers.py aapl --table 5
# Compare text extraction
python tests/manual/compare_parsers.py msft --text
# See section detection
python tests/manual/compare_parsers.py orcl --sections
# Test with 10-Q filings
python tests/manual/compare_parsers.py 'aapl 10-q'
# Run all test files at once
python tests/manual/compare_parsers.py --all
```
**Shortcuts available**:
- Companies: `aapl`, `msft`, `tsla`, `nvda`, `orcl`
- Filing types: `10-k` (default), `10-q`, `8-k`
- Or use full file paths
**Features**:
- Clean command-line interface
- Side-by-side OLD vs NEW comparison
- Rich console output with colors and tables
- Performance metrics
- Individual table inspection
### Other Available Scripts
Additional tools for specific testing:
- `tests/manual/check_parser_comparison.py` - Full comparison with metrics
- `tests/manual/check_tables.py` - Table-specific comparison with rendering
- `tests/manual/check_html_rewrite.py` - General HTML parsing checks
- `tests/manual/check_html_parser_real_files.py` - Real filing tests
## Quick Reference
For day-to-day testing commands and usage examples, see [TESTING.md](TESTING.md).
## Notes
- **Keep it simple**: This is about rapid iteration, not comprehensive automation
- **Visual inspection is key**: Automated metrics don't catch layout/formatting issues
- **Use screenshots**: When describing bugs, screenshots speak louder than words
- **Iterative approach**: Don't try to fix everything at once, prioritize
- **Trust your judgment**: If it looks wrong, it probably is wrong
## Bug Tracker
### Active Issues
(Add bugs here as they're discovered)
### Fixed Issues
(Move completed bugs here for history)
### Deferred Issues
(Issues that aren't blocking release but could be improved later)
---
**Status**: Initial draft
**Last Updated**: 2025-10-07
**Maintainer**: Dwight Gunning

View File

@@ -0,0 +1,931 @@
"""
Document model for parsed HTML.
"""
from dataclasses import dataclass, field
from typing import Dict, List, Optional, Any, Iterator
from rich.table import Table as RichTable
from rich.console import Group
from rich.text import Text
from edgar.richtools import repr_rich
from edgar.documents.nodes import Node, SectionNode
from edgar.documents.table_nodes import TableNode
from edgar.documents.types import XBRLFact, SearchResult
@dataclass
class DocumentMetadata:
    """
    Document metadata.

    Holds provenance details about the source filing (form, company,
    identifiers, dates) plus bookkeeping recorded during parsing
    (document size, parse time, parser version, extracted XBRL facts).
    """
    source: Optional[str] = None
    form: Optional[str] = None
    company: Optional[str] = None
    cik: Optional[str] = None
    accession_number: Optional[str] = None
    filing_date: Optional[str] = None
    report_date: Optional[str] = None
    url: Optional[str] = None
    size: int = 0
    parse_time: float = 0.0
    parser_version: str = "2.0.0"
    xbrl_data: Optional[List[XBRLFact]] = None
    preserve_whitespace: bool = False
    original_html: Optional[str] = None  # Store original HTML for anchor analysis

    def to_dict(self) -> Dict[str, Any]:
        """
        Convert metadata to dictionary.

        Note: `preserve_whitespace` and `original_html` are intentionally
        omitted; they are parser internals, not document metadata.
        """
        # XBRL facts need per-fact serialization; everything else copies as-is.
        serialized_facts = (
            [fact.to_dict() for fact in self.xbrl_data]
            if self.xbrl_data else None
        )
        plain_fields = (
            'source', 'form', 'company', 'cik', 'accession_number',
            'filing_date', 'report_date', 'url', 'size', 'parse_time',
            'parser_version',
        )
        result = {field_name: getattr(self, field_name) for field_name in plain_fields}
        result['xbrl_data'] = serialized_facts
        return result
@dataclass
class Section:
    """
    Document section representation.

    Represents a logical section of the document (e.g., Risk Factors, MD&A).

    Attributes:
        name: Section identifier (e.g., "item_1", "part_i_item_1", "risk_factors")
        title: Display title (e.g., "Item 1 - Business")
        node: Node containing section content
        start_offset: Character position where section starts
        end_offset: Character position where section ends
        confidence: Detection confidence score (0.0-1.0)
        detection_method: How section was detected ('toc', 'heading', 'pattern')
        validated: Whether section has been cross-validated
        part: Optional part identifier for 10-Q filings ("I", "II", or None for 10-K)
        item: Optional item identifier (e.g., "1", "1A", "2")
        _text_extractor: Optional callback for lazy text extraction (for TOC-based sections)
    """
    name: str
    title: str
    node: SectionNode
    start_offset: int = 0
    end_offset: int = 0
    confidence: float = 1.0  # Detection confidence (0.0-1.0)
    detection_method: str = 'unknown'  # 'toc', 'heading', 'pattern', or 'unknown'
    validated: bool = False  # Cross-validated flag
    part: Optional[str] = None  # Part identifier for 10-Q: "I", "II", or None for 10-K
    item: Optional[str] = None  # Item identifier: "1", "1A", "2", etc.
    _text_extractor: Optional[Any] = field(default=None, repr=False)  # Callback for lazy text extraction

    def text(self, **kwargs) -> str:
        """Extract text from section."""
        # TOC-based sections carry a lazy extraction callback; prefer it.
        if self._text_extractor is not None:
            return self._text_extractor(self.name, **kwargs)
        # Heading/pattern-based sections extract directly from their node.
        from edgar.documents.extractors.text_extractor import TextExtractor
        return TextExtractor(**kwargs).extract_from_node(self.node)

    def tables(self) -> List[TableNode]:
        """Get all tables in section."""
        return self.node.find(lambda candidate: isinstance(candidate, TableNode))

    def search(self, query: str) -> List[SearchResult]:
        """
        Search within section.

        Simple case-insensitive substring match for now; returns at most
        one result with ~50 characters of context on each side.
        """
        haystack = self.text().lower()
        needle = query.lower()
        position = haystack.find(needle)
        if position < 0:
            return []
        snippet_start = max(0, position - 50)
        snippet_end = min(len(haystack), position + len(query) + 50)
        return [SearchResult(
            node=self.node,
            score=1.0,
            snippet=haystack[snippet_start:snippet_end],
            section=self.name
        )]

    @staticmethod
    def parse_section_name(section_name: str) -> tuple[Optional[str], Optional[str]]:
        """
        Parse section name to extract part and item identifiers.

        Handles both 10-Q part-aware names and 10-K simple names.

        Args:
            section_name: Section identifier (e.g., "part_i_item_1", "item_1a", "risk_factors")

        Returns:
            Tuple of (part, item) where:
            - part: "I", "II", or None for 10-K sections
            - item: "1", "1A", "2", etc. or None if not an item section

        Examples:
            >>> Section.parse_section_name("part_i_item_1")
            ("I", "1")
            >>> Section.parse_section_name("part_ii_item_1a")
            ("II", "1A")
            >>> Section.parse_section_name("item_7")
            (None, "7")
            >>> Section.parse_section_name("risk_factors")
            (None, None)
        """
        import re
        lowered = section_name.lower()
        # 10-Q style names: "part_i_item_1", "part_ii_item_1a"
        part_match = re.match(r'part_([ivx]+)_item_(\d+[a-z]?)', lowered)
        if part_match:
            return (part_match.group(1).upper(), part_match.group(2).upper())
        # 10-K style names: "item_1", "item_1a", "item_7"
        item_match = re.match(r'item_(\d+[a-z]?)', lowered)
        if item_match:
            return (None, item_match.group(1).upper())
        # Not a structured item section (e.g. "risk_factors").
        return (None, None)
class Sections(Dict[str, Section]):
    """
    Dictionary wrapper for sections with rich display support.

    Behaves like a normal dict but provides beautiful terminal display
    via __rich__() method when printed in rich-enabled environments.
    Also adds flexible lookup: get()/[] accept canonical keys
    ("part_i_item_1"), bare item labels ("Item 1A", "1A"), or
    (part, item) tuples, plus get_item()/get_part() helpers.
    """
    def __rich__(self):
        """Return rich representation for display: a summary table of all
        sections (sorted by part/item) followed by aggregate stats."""
        if not self:
            return Text("No sections detected", style="dim")
        # Create summary table
        table = RichTable(title="Document Sections", show_header=True, header_style="bold magenta")
        table.add_column("Section", style="cyan", no_wrap=True)
        table.add_column("Title", style="white")
        table.add_column("Confidence", justify="right", style="green")
        table.add_column("Method", style="yellow")
        table.add_column("Part/Item", style="blue")
        # Sort sections by part (roman numeral) and item number
        def sort_key(item):
            # `item` is a (name, Section) pair from self.items()
            name, section = item
            # Convert roman numerals to integers for sorting
            roman_to_int = {'i': 1, 'ii': 2, 'iii': 3, 'iv': 4, 'v': 5}
            part = section.part.lower() if section.part else ''
            item_str = section.item if section.item else ''
            # Extract part number (0 when section has no part, so it sorts first)
            part_num = roman_to_int.get(part, 0)
            # Extract item number and letter (e.g. "1A" -> (1, 'a'))
            import re
            if item_str:
                match = re.match(r'(\d+)([a-z]?)', item_str.lower())
                if match:
                    item_num = int(match.group(1))
                    item_letter = match.group(2) or ''
                    return (part_num, item_num, item_letter)
            # Fallback to name sorting; 999 pushes non-item sections last within a part
            return (part_num, 999, name)
        sorted_sections = sorted(self.items(), key=sort_key)
        # Add rows for each section
        for name, section in sorted_sections:
            # Format confidence as percentage
            confidence = f"{section.confidence:.1%}"
            # Format part/item info
            part_item = ""
            if section.part and section.item:
                part_item = f"Part {section.part}, Item {section.item}"
            elif section.item:
                part_item = f"Item {section.item}"
            elif section.part:
                part_item = f"Part {section.part}"
            # Truncate title if too long
            title = section.title
            if len(title) > 50:
                title = title[:47] + "..."
            table.add_row(
                name,
                title,
                confidence,
                section.detection_method,
                part_item
            )
        # Create summary stats
        total = len(self)
        high_conf = sum(1 for s in self.values() if s.confidence >= 0.8)
        methods = {}
        for section in self.values():
            methods[section.detection_method] = methods.get(section.detection_method, 0) + 1
        summary = Text()
        summary.append(f"\nTotal: {total} sections | ", style="dim")
        summary.append(f"High confidence (≥80%): {high_conf} | ", style="dim")
        summary.append(f"Methods: {', '.join(f'{m}={c}' for m, c in methods.items())}", style="dim")
        return Group(table, summary)
    def __repr__(self):
        # Delegate to the rich rendering so plain repr() shows the same table
        return repr_rich(self.__rich__())
    def get_item(self, item: str, part: str = None) -> Optional[Section]:
        """
        Get section by item number with optional part specification.

        Args:
            item: Item identifier (e.g., "1", "1A", "7", "Item 1", "Item 7A")
            part: Optional part specification (e.g., "I", "II", "Part I", "Part II")
                  If not specified and multiple parts contain the item, returns first match.

        Returns:
            Section object if found, None otherwise

        Examples:
            >>> sections.get_item("1")        # Returns first Item 1 (any part)
            >>> sections.get_item("1", "I")   # Returns Part I, Item 1
            >>> sections.get_item("Item 1A")  # Returns first Item 1A
            >>> sections.get_item("7A", "II") # Returns Part II, Item 7A
        """
        # Normalize item string - remove "Item " prefix if present
        item_clean = item.replace("Item ", "").replace("item ", "").strip().upper()
        # Normalize part string if provided
        part_clean = None
        if part:
            part_clean = part.replace("Part ", "").replace("part ", "").replace("PART ", "").strip().upper()
        # Search through sections (insertion order; first match wins)
        for name, section in self.items():
            if section.item and section.item.upper() == item_clean:
                if part_clean is None:
                    # No part specified - return first match
                    return section
                elif section.part and section.part.upper() == part_clean:
                    # Part matches
                    return section
        return None
    def get_part(self, part: str) -> Dict[str, Section]:
        """
        Get all sections in a specific part.

        Args:
            part: Part identifier (e.g., "I", "II", "Part I", "Part II")

        Returns:
            Dictionary of sections in that part

        Examples:
            >>> sections.get_part("I")        # All Part I sections
            >>> sections.get_part("Part II")  # All Part II sections
        """
        # Normalize part string
        part_clean = part.replace("Part ", "").replace("part ", "").replace("PART ", "").strip().upper()
        result = {}
        for name, section in self.items():
            if section.part and section.part.upper() == part_clean:
                result[name] = section
        return result
    def get(self, key, default=None):
        """
        Enhanced get method that supports flexible key formats.

        Supports:
        - Standard dict key: "part_i_item_1"
        - Item number: "Item 1", "1", "1A"
        - Part+Item: ("I", "1"), ("Part II", "7A")

        Args:
            key: Section key (string or tuple)
            default: Default value if not found

        Returns:
            Section object or default value
        """
        # Try standard dict lookup first
        if isinstance(key, str):
            result = super().get(key, None)
            if result is not None:
                return result
            # Try as item number
            result = self.get_item(key)
            if result is not None:
                return result
        # Try as (part, item) tuple
        elif isinstance(key, tuple) and len(key) == 2:
            part, item = key
            result = self.get_item(item, part)
            if result is not None:
                return result
        return default
    def __getitem__(self, key):
        """
        Enhanced __getitem__ that supports flexible key formats.

        Supports:
        - Standard dict key: sections["part_i_item_1"]
        - Item number: sections["Item 1"], sections["1A"]
        - Part+Item tuple: sections[("I", "1")], sections[("II", "7A")]

        Raises KeyError if not found (standard dict behavior).
        """
        # Try standard dict lookup first
        if isinstance(key, str):
            try:
                return super().__getitem__(key)
            except KeyError:
                # Try as item number
                result = self.get_item(key)
                if result is not None:
                    return result
        # Try as (part, item) tuple
        elif isinstance(key, tuple) and len(key) == 2:
            part, item = key
            result = self.get_item(item, part)
            if result is not None:
                return result
        # Not found - raise KeyError
        raise KeyError(key)
@dataclass
class Document:
    """
    Main document class.

    Represents a parsed HTML document with methods for content extraction,
    search, and transformation. Expensive derivations (sections, tables,
    headings, XBRL facts, default text) are computed lazily and cached in
    the private fields below.
    """
    # Core properties
    root: Node  # Root node of the parsed document tree
    metadata: DocumentMetadata = field(default_factory=DocumentMetadata)
    # Cached extractions (populated lazily by the corresponding properties)
    _sections: Optional[Sections] = field(default=None, init=False, repr=False)
    _tables: Optional[List[TableNode]] = field(default=None, init=False, repr=False)
    _headings: Optional[List[Node]] = field(default=None, init=False, repr=False)
    _xbrl_facts: Optional[List[XBRLFact]] = field(default=None, init=False, repr=False)
    _text_cache: Optional[str] = field(default=None, init=False, repr=False)
    _config: Optional[Any] = field(default=None, init=False, repr=False)  # ParserConfig reference
@property
def sections(self) -> Sections:
    """
    Get document sections using hybrid multi-strategy detection.

    Tries detection methods in order of reliability:
    1. TOC-based (0.95 confidence)
    2. Heading-based (0.7-0.9 confidence)
    3. Pattern-based (0.6 confidence)

    Returns a Sections dictionary wrapper that provides rich terminal display
    via __rich__() method. Each section includes confidence score and detection method.
    Result is computed once and cached on the instance.
    """
    if self._sections is None:
        # Get form type from config or metadata (config wins when present)
        form = None
        if self._config and hasattr(self._config, 'form'):
            form = self._config.form
        elif self.metadata and self.metadata.form:
            form = self.metadata.form
        # Only detect sections for supported form types (including amendments)
        # Normalize form type by removing /A suffix for amendments
        base_form = form.replace('/A', '') if form else None
        if base_form and base_form in ['10-K', '10-Q', '8-K']:
            from edgar.documents.extractors.hybrid_section_detector import HybridSectionDetector
            # Pass thresholds from config if available
            thresholds = self._config.detection_thresholds if self._config else None
            # Use base form type for detection (10-K/A → 10-K)
            detector = HybridSectionDetector(self, base_form, thresholds)
            detected_sections = detector.detect_sections()
        else:
            # Fallback to pattern-based for other types or unknown
            from edgar.documents.extractors.pattern_section_extractor import SectionExtractor
            extractor = SectionExtractor(form) if form else SectionExtractor()
            detected_sections = extractor.extract(self)
        # Wrap detected sections in Sections class for rich display
        self._sections = Sections(detected_sections)
    return self._sections
@property
def tables(self) -> List[TableNode]:
    """All tables in the document (computed on first access, then cached)."""
    if self._tables is None:
        self._tables = self.root.find(
            lambda candidate: isinstance(candidate, TableNode)
        )
    return self._tables
@property
def headings(self) -> List[Node]:
    """All heading nodes in the document (computed on first access, then cached)."""
    if self._headings is None:
        # Import lazily only when the cache must be filled.
        from edgar.documents.nodes import HeadingNode
        self._headings = self.root.find(
            lambda candidate: isinstance(candidate, HeadingNode)
        )
    return self._headings
@property
def xbrl_facts(self) -> List[XBRLFact]:
    """All XBRL facts in the document (extracted on first access, then cached)."""
    cached = self._xbrl_facts
    if cached is None:
        cached = self._extract_xbrl_facts()
        self._xbrl_facts = cached
    return cached
def text(self,
         clean: bool = True,
         include_tables: bool = True,
         include_metadata: bool = False,
         max_length: Optional[int] = None) -> str:
    """
    Extract text from document.

    Args:
        clean: Clean and normalize text (also applies navigation-link
            filtering to the extracted text)
        include_tables: Include table content in text
        include_metadata: Include metadata annotations
        max_length: Maximum text length

    Returns:
        Extracted text
    """
    # Serve from cache only for the exact parameter combination that is
    # stored below (clean, no tables, no metadata, no length limit).
    if (self._text_cache is not None and
        clean and not include_tables and not include_metadata and max_length is None):
        return self._text_cache
    # If whitespace was preserved during parsing and clean is default (True),
    # respect the preserve_whitespace setting
    if self.metadata.preserve_whitespace and clean:
        clean = False
    from edgar.documents.extractors.text_extractor import TextExtractor
    extractor = TextExtractor(
        clean=clean,
        include_tables=include_tables,
        include_metadata=include_metadata,
        max_length=max_length
    )
    text = extractor.extract(self)
    # Apply navigation link filtering when cleaning
    if clean:
        # Use cached/integrated navigation filtering (optimized approach)
        try:
            from edgar.documents.utils.anchor_cache import filter_with_cached_patterns
            # Use minimal cached approach (no memory overhead)
            original_html = getattr(self.metadata, 'original_html', None)
            text = filter_with_cached_patterns(text, html_content=original_html)
        except Exception:
            # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
            # still propagate; any filtering failure falls back to
            # pattern-based TOC-link filtering.
            from edgar.documents.utils.toc_filter import filter_toc_links
            text = filter_toc_links(text)
    # Cache if using default parameters
    if clean and not include_tables and not include_metadata and max_length is None:
        self._text_cache = text
    return text
def search(self, query: str, top_k: int = 10) -> List[SearchResult]:
    """
    Search document for query.

    Args:
        query: Search query
        top_k: Maximum results to return

    Returns:
        List of search results
    """
    from edgar.documents.search import DocumentSearch
    return DocumentSearch(self).search(query, top_k=top_k)
def get_section(self, section_name: str, part: Optional[str] = None) -> Optional[Section]:
    """
    Get section by name with optional part specification for 10-Q filings.

    Args:
        section_name: Section identifier (e.g., "item_1", "part_i_item_1")
        part: Optional part specification for 10-Q ("I", "II", "i", "ii")
            If provided, searches for "part_{part}_{section_name}"

    Returns:
        Section object if found, None otherwise

    Raises:
        ValueError: If an item-style name without a part matches sections
            in multiple parts of a 10-Q filing (the caller must specify part).

    Examples:
        # 10-K usage (unchanged)
        >>> doc.get_section("item_1")             # Returns Item 1
        # 10-Q usage with explicit part
        >>> doc.get_section("item_1", part="I")   # Returns Part I Item 1
        >>> doc.get_section("item_1", part="II")  # Returns Part II Item 1
        # 10-Q usage with full name
        >>> doc.get_section("part_i_item_1")      # Returns Part I Item 1
    """
    # If part is specified, construct part-aware name
    if part:
        part_normalized = part.upper()
        # Remove "item_" prefix if present in section_name
        item_name = section_name.replace("item_", "") if section_name.startswith("item_") else section_name
        full_name = f"part_{part_normalized.lower()}_item_{item_name.lower()}"
        return self.sections.get(full_name)
    # Direct lookup (works for both 10-K "item_1" and 10-Q "part_i_item_1")
    section = self.sections.get(section_name)
    if section:
        return section
    # If not found and looks like an item without part, check if we have multiple parts
    # In that case, raise a helpful error
    if section_name.startswith("item_") or section_name.replace("_", "").startswith("item"):
        # Check if we have part-aware sections (10-Q)
        matching_sections = [name for name in self.sections.keys()
                             if section_name in name and "part_" in name]
        if matching_sections:
            # Multiple parts available - user needs to specify which one
            parts = sorted(set(s.split("_")[1] for s in matching_sections if s.startswith("part_")))
            raise ValueError(
                f"Ambiguous section '{section_name}' in 10-Q filing. "
                f"Found in parts: {parts}. "
                f"Please specify part: get_section('{section_name}', part='I') or part='II'"
            )
    return None
def extract_section_text(self, section_name: str) -> Optional[str]:
    """Return the text of the named section, or None when it is absent."""
    section = self.get_section(section_name)
    return section.text() if section else None
def get_sec_section(self, section_name: str, clean: bool = True,
                    include_subsections: bool = True) -> Optional[str]:
    """
    Extract content from a specific SEC filing section using anchor analysis.

    Args:
        section_name: Section name (e.g., "Item 1", "Item 1A", "Part I")
        clean: Whether to apply text cleaning and navigation filtering
        include_subsections: Whether to include subsections

    Returns:
        Section text content or None if section not found

    Examples:
        >>> doc.get_sec_section("Item 1")   # Business description
        >>> doc.get_sec_section("Item 1A")  # Risk factors
        >>> doc.get_sec_section("Item 7")   # MD&A
    """
    # EAFP lazy initialization: build the extractor on first use and cache it.
    try:
        extractor = self._section_extractor
    except AttributeError:
        from edgar.documents.extractors.toc_section_extractor import SECSectionExtractor
        extractor = self._section_extractor = SECSectionExtractor(self)
    return extractor.get_section_text(section_name, include_subsections, clean)
def get_available_sec_sections(self) -> List[str]:
    """
    List the SEC sections that can be extracted from this document.

    Returns:
        Names accepted by get_sec_section().

    Example:
        >>> sections = doc.get_available_sec_sections()
        >>> print(sections)
        ['Part I', 'Item 1', 'Item 1A', 'Item 1B', 'Item 2', ...]
    """
    # EAFP lazy initialization: build the extractor on first use and cache it.
    try:
        extractor = self._section_extractor
    except AttributeError:
        from edgar.documents.extractors.toc_section_extractor import SECSectionExtractor
        extractor = self._section_extractor = SECSectionExtractor(self)
    return extractor.get_available_sections()
def get_sec_section_info(self, section_name: str) -> Optional[Dict]:
    """
    Get detailed information about an SEC section.

    Args:
        section_name: Section name to look up

    Returns:
        Dict with section metadata including anchor info, or None if unknown.
    """
    # EAFP lazy initialization: build the extractor on first use and cache it.
    try:
        extractor = self._section_extractor
    except AttributeError:
        from edgar.documents.extractors.toc_section_extractor import SECSectionExtractor
        extractor = self._section_extractor = SECSectionExtractor(self)
    return extractor.get_section_info(section_name)
def to_markdown(self) -> str:
    """Render the whole document as a Markdown string."""
    # Imported lazily to keep the renderer an optional cost at import time.
    from edgar.documents.renderers.markdown_renderer import MarkdownRenderer
    return MarkdownRenderer().render(self)
def to_json(self, include_content: bool = True) -> Dict[str, Any]:
    """
    Serialize the document to a JSON-compatible dictionary.

    Args:
        include_content: When True, also emit per-section detail and a
            per-table summary; otherwise only counts and names.

    Returns:
        JSON-serializable dictionary.
    """
    payload: Dict[str, Any] = {
        'metadata': self.metadata.to_dict(),
        'sections': list(self.sections.keys()),
        'table_count': len(self.tables),
        'xbrl_fact_count': len(self.xbrl_facts),
    }
    if include_content:
        payload['sections_detail'] = {
            name: {
                'title': section.title,
                'text_length': len(section.text()),
                'table_count': len(section.tables()),
            }
            for name, section in self.sections.items()
        }
        payload['tables'] = [
            {
                'type': table.table_type.name,
                'rows': len(table.rows),
                # Column count comes from the first header row when present.
                'columns': len(table.headers[0]) if table.headers else 0,
                'caption': table.caption,
            }
            for table in self.tables
        ]
    return payload
def to_dataframe(self) -> 'pd.DataFrame':
    """
    Concatenate every table in the document into one pandas DataFrame.

    Bookkeeping columns (_table_index, _table_type, and _table_caption when
    a caption exists) let rows be traced back to their source table.
    """
    import pandas as pd
    if not self.tables:
        return pd.DataFrame()
    frames = []
    for index, table in enumerate(self.tables):
        frame = table.to_dataframe()
        frame['_table_index'] = index
        frame['_table_type'] = table.table_type.name
        if table.caption:
            frame['_table_caption'] = table.caption
        frames.append(frame)
    return pd.concat(frames, ignore_index=True)
def chunks(self, chunk_size: int = 512, overlap: int = 128) -> Iterator['DocumentChunk']:
    """
    Produce document chunks for downstream processing.

    Args:
        chunk_size: Target chunk size in tokens.
        overlap: Token overlap between consecutive chunks.

    Yields:
        Document chunks.
    """
    from edgar.documents.extractors.chunk_extractor import ChunkExtractor
    return ChunkExtractor(chunk_size=chunk_size, overlap=overlap).extract(self)
def prepare_for_llm(self,
                    max_tokens: int = 4000,
                    preserve_structure: bool = True,
                    focus_sections: Optional[List[str]] = None) -> 'LLMDocument':
    """
    Prepare document for LLM processing.

    Args:
        max_tokens: Token budget for the optimized output.
        preserve_structure: Keep document structure in the output.
        focus_sections: Optional list of section names to prioritize.

    Returns:
        LLM-optimized document.
    """
    from edgar.documents.ai.llm_optimizer import LLMOptimizer
    return LLMOptimizer().optimize(
        self,
        max_tokens=max_tokens,
        preserve_structure=preserve_structure,
        focus_sections=focus_sections
    )
def extract_key_information(self) -> Dict[str, Any]:
    """Summarize headline facts about the filing: identity, sections, counts."""
    financial_tables = sum(1 for table in self.tables if table.is_financial_table)
    return {
        'company': self.metadata.company,
        'form': self.metadata.form,
        'filing_date': self.metadata.filing_date,
        'sections': list(self.sections.keys()),
        'financial_tables': financial_tables,
        'total_tables': len(self.tables),
        'xbrl_facts': len(self.xbrl_facts),
        'document_length': len(self.text()),
    }
def _extract_xbrl_facts(self) -> List[XBRLFact]:
    """Collect inline-XBRL facts from every node carrying ix_* metadata."""
    # A node participates in XBRL iff it was tagged with an 'ix_tag' value
    # during parsing; the remaining ix_* keys may legitimately be None.
    tagged_nodes = self.root.find(lambda n: n.get_metadata('ix_tag') is not None)
    return [
        XBRLFact(
            concept=node.get_metadata('ix_tag'),
            value=node.text(),
            context_ref=node.get_metadata('ix_context'),
            unit_ref=node.get_metadata('ix_unit'),
            decimals=node.get_metadata('ix_decimals'),
            scale=node.get_metadata('ix_scale'),
        )
        for node in tagged_nodes
    ]
def __len__(self) -> int:
    """Get number of top-level nodes."""
    # Document length is defined as the count of the root's direct children,
    # not the total node count (use walk() for full traversal).
    return len(self.root.children)
def __iter__(self) -> Iterator[Node]:
    """Iterate over top-level nodes."""
    # Shallow iteration only — mirrors __len__; use walk() for the full tree.
    return iter(self.root.children)
def __repr__(self) -> str:
    # NOTE(review): repr returns the *entire* rendered text, which can be very
    # large for real filings — confirm this is intentional (e.g. for notebook
    # display) rather than a placeholder.
    return self.text()
def walk(self) -> Iterator[Node]:
    """Walk entire document tree."""
    # Delegates traversal order entirely to the root node's walk().
    return self.root.walk()
def find_nodes(self, predicate) -> List[Node]:
    """Find all nodes matching predicate.

    Args:
        predicate: Callable taking a node and returning a truthy value
            for nodes to include.
    """
    return self.root.find(predicate)
def find_first_node(self, predicate) -> Optional[Node]:
    """Find first node matching predicate, or None if nothing matches.

    Args:
        predicate: Callable taking a node and returning a truthy value.
    """
    return self.root.find_first(predicate)
@property
def is_empty(self) -> bool:
    """True when the document root has no children at all."""
    return not self.root.children
@property
def has_tables(self) -> bool:
    """True when at least one table was extracted from the document."""
    return bool(self.tables)
@property
def has_xbrl(self) -> bool:
    """True when the document carries at least one inline-XBRL fact."""
    return bool(self.xbrl_facts)
def validate(self) -> List[str]:
    """
    Validate document structure.

    Returns:
        A list of human-readable validation issues (empty when clean).
    """
    issues: List[str] = []
    if self.is_empty:
        issues.append("Document is empty")
    if not self.sections:
        issues.append("No sections detected")
    # Annual/quarterly reports are expected to expose these standard sections.
    if self.metadata.form in ['10-K', '10-Q']:
        missing = [name for name in ['business', 'risk_factors', 'mda']
                   if name not in self.sections]
        if missing:
            issues.append(f"Missing expected sections: {', '.join(missing)}")
    # A node reachable from root but without a parent link is a tree defect.
    orphaned = self.root.find(lambda n: n.parent is None and n != self.root)
    if orphaned:
        issues.append(f"Found {len(orphaned)} orphaned nodes")
    return issues
@dataclass
class DocumentChunk:
    """A contiguous slice of a document, sized for downstream processing."""
    content: str           # raw chunk text
    start_node: Node       # first node covered by the chunk
    end_node: Node         # last node covered by the chunk
    section: Optional[str] = None  # owning section name, when known
    token_count: int = 0   # approximate token count of `content`

    def to_dict(self) -> Dict[str, Any]:
        """Serialize the chunk to a plain, JSON-friendly dictionary."""
        return {
            'content': self.content,
            'section': self.section,
            'token_count': self.token_count,
            'start_path': self.start_node.path,
            'end_path': self.end_node.path,
        }
@dataclass
class LLMDocument:
    """Document content pre-trimmed and annotated for LLM consumption."""
    content: str
    metadata: Dict[str, Any]
    token_count: int
    sections: List[str]
    truncated: bool = False  # True when content was cut to fit the budget

    def to_prompt(self) -> str:
        """Render the document as a prompt string with a metadata preamble."""
        lines = [
            f"Document: {self.metadata.get('form', 'Unknown')}",
            f"Company: {self.metadata.get('company', 'Unknown')}",
            f"Date: {self.metadata.get('filing_date', 'Unknown')}",
            "",
            self.content,
        ]
        if self.truncated:
            lines.append("\n[Content truncated due to length]")
        return '\n'.join(lines)

View File

@@ -0,0 +1,81 @@
"""
Custom exceptions for the HTML parser.
"""
from typing import Optional, Dict, Any
class ParsingError(Exception):
    """Base exception for parsing errors.

    Carries optional structured context and remediation suggestions so
    callers (and __str__) can surface actionable detail.
    """

    def __init__(self,
                 message: str,
                 context: Optional[Dict[str, Any]] = None,
                 suggestions: Optional[list] = None):
        super().__init__(message)
        # Keep the raw pieces so callers can inspect them programmatically.
        self.message = message
        self.context = context or {}
        self.suggestions = suggestions or []

    def __str__(self):
        parts = [self.message]
        if self.context:
            parts.append(f"Context: {self.context}")
        if self.suggestions:
            parts.append(f"Suggestions: {', '.join(self.suggestions)}")
        return '\n'.join(parts)
class HTMLParsingError(ParsingError):
    """Error parsing HTML structure (malformed markup or unexpected DOM shape)."""
    pass
class StyleParsingError(ParsingError):
    """Error parsing CSS styles (inline style attributes or style blocks)."""
    pass
class XBRLParsingError(ParsingError):
    """Error parsing inline XBRL (ix:*) markup embedded in the filing."""
    pass
class TableParsingError(ParsingError):
    """Error parsing table structure (rows, headers, or spans)."""
    pass
class SectionDetectionError(ParsingError):
    """Error detecting document sections (TOC, heading, or pattern based)."""
    pass
class DocumentTooLargeError(ParsingError):
    """Document exceeds maximum size."""

    def __init__(self, size: int, max_size: int):
        # Pre-format the message so the base class stores the final text.
        detail = f"Document size ({size:,} bytes) exceeds maximum ({max_size:,} bytes)"
        super().__init__(
            detail,
            context={'size': size, 'max_size': max_size},
            suggestions=[
                "Use streaming parser for large documents",
                "Increase max_document_size in configuration",
                "Split document into smaller parts"
            ]
        )
class InvalidConfigurationError(ParsingError):
    """Invalid parser configuration (bad option values or combinations)."""
    pass
class NodeNotFoundError(ParsingError):
    """Requested node not found in document."""
    pass
class ExtractionError(ParsingError):
    """Error extracting content (text, sections, or chunks) from a document."""
    pass

View File

@@ -0,0 +1,15 @@
"""
Content extractors for documents.
"""
from edgar.documents.extractors.text_extractor import TextExtractor
from edgar.documents.extractors.pattern_section_extractor import SectionExtractor
from edgar.documents.extractors.hybrid_section_detector import HybridSectionDetector
from edgar.documents.extractors.toc_section_detector import TOCSectionDetector
__all__ = [
'TextExtractor',
'SectionExtractor',
'HybridSectionDetector',
'TOCSectionDetector'
]

View File

@@ -0,0 +1,170 @@
"""
Heading-based section detection strategy.
Detects sections by analyzing heading nodes with HeaderInfo metadata.
This strategy provides moderate confidence (0.7-0.9) and serves as a
fallback when TOC-based detection is not available.
"""
import logging
from typing import Dict, Optional
from edgar.documents.document import Document, Section
from edgar.documents.nodes import HeadingNode, SectionNode
from edgar.documents.types import HeaderInfo
logger = logging.getLogger(__name__)
class HeadingSectionDetector:
    """
    Heading-based section detection using HeaderInfo.

    Analyzes heading nodes that have been annotated with HeaderInfo
    during parsing. Detects sections based on:
    - Item numbers (Item 1, Item 1A, etc.)
    - Heading confidence scores
    - Heading hierarchy

    Provides moderate confidence (0.7-0.9) detection.
    """

    def __init__(
        self,
        document: Document,
        form: Optional[str] = None,
        min_confidence: float = 0.5  # Lower threshold, let hybrid detector filter
    ):
        """
        Initialize heading-based detector.

        Args:
            document: Document to analyze
            form: Optional filing type for context ('10-K', '10-Q', '8-K')
            min_confidence: Minimum confidence for headings (default 0.5)
        """
        self.document = document
        self.form = form
        self.min_confidence = min_confidence

    def detect(self) -> Optional[Dict[str, Section]]:
        """
        Detect sections from heading nodes with HeaderInfo.

        Returns:
            Dictionary of sections if successful, None if no sections found
        """
        try:
            # Get heading nodes from document
            headings = self.document.headings
            if not headings:
                logger.debug("No headings found in document")
                return None
            sections = {}
            for heading in headings:
                # Check if heading has header info
                if not hasattr(heading, 'header_info') or not heading.header_info:
                    continue
                header_info = heading.header_info
                # Only use headings with sufficient confidence
                if header_info.confidence < self.min_confidence:
                    continue
                # Check if it's an item header
                if not header_info.is_item:
                    continue
                # Extract section from this heading
                section = self._extract_section_from_heading(heading, header_info)
                if section:
                    section.confidence = header_info.confidence
                    section.detection_method = 'heading'
                    sections[section.name] = section
            if not sections:
                logger.debug("No item headers found with sufficient confidence")
                return None
            logger.info(f"Heading detection found {len(sections)} sections")
            return sections
        except Exception as e:
            logger.warning(f"Heading detection failed: {e}")
            return None

    def _extract_section_from_heading(
        self, heading: HeadingNode, header_info: HeaderInfo
    ) -> Optional[Section]:
        """
        Extract section content from heading node to next heading.

        Args:
            heading: HeadingNode representing section start
            header_info: HeaderInfo with section metadata

        Returns:
            Section object if successful, None otherwise
        """
        try:
            # Create section name from item number
            if header_info.item_number:
                # Normalize: "1A" -> "item_1a", "7" -> "item_7"
                section_name = f"item_{header_info.item_number.replace('.', '_').lower()}"
            else:
                section_name = "unknown"
            # Create section node
            section_node = SectionNode(section_name=section_name)
            # Find next heading at same or higher level to determine section end
            current_level = header_info.level
            parent = heading.parent
            if not parent:
                logger.debug(f"Heading {header_info.text} has no parent")
                return None
            # Find heading position in parent's children
            try:
                heading_index = parent.children.index(heading)
            except ValueError:
                logger.debug("Could not find heading in parent's children")
                return None
            # Collect nodes until next section heading
            for i in range(heading_index + 1, len(parent.children)):
                child = parent.children[i]
                # Stop at next heading of same or higher level
                if isinstance(child, HeadingNode):
                    if hasattr(child, 'header_info') and child.header_info:
                        if child.header_info.level <= current_level:
                            break
                # Add child to section
                section_node.add_child(child)
            # Parse section name to extract part and item identifiers
            part, item = Section.parse_section_name(section_name)
            # Create Section object
            section = Section(
                name=section_name,
                title=header_info.text,
                node=section_node,
                start_offset=0,  # Would need actual text position
                end_offset=0,  # Would need actual text position
                confidence=header_info.confidence,
                detection_method='heading',
                part=part,
                item=item
            )
            return section
        except Exception as e:
            logger.warning(f"Failed to extract section from heading: {e}")
            return None

View File

@@ -0,0 +1,489 @@
"""
Hybrid section detection system with multiple fallback strategies.
This module implements a multi-strategy approach to section detection:
1. TOC-based (primary): High confidence, uses Table of Contents structure
2. Heading-based (fallback): Moderate confidence, uses multi-strategy heading detection
3. Pattern-based (last resort): Lower confidence, uses regex pattern matching
"""
import logging
from typing import Dict, Optional, List
from dataclasses import dataclass
from functools import lru_cache
from edgar.documents.document import Document, Section
from edgar.documents.nodes import SectionNode, HeadingNode
from edgar.documents.extractors.toc_section_detector import TOCSectionDetector
from edgar.documents.extractors.pattern_section_extractor import SectionExtractor
from edgar.documents.config import DetectionThresholds
logger = logging.getLogger(__name__)
class HybridSectionDetector:
    """
    Multi-strategy section detector with fallback.

    Tries strategies in order of reliability:
    1. TOC-based (0.95 confidence) - Most reliable
    2. Multi-strategy heading detection (0.7-0.9 confidence) - Fallback
    3. Pattern matching (0.6 confidence) - Last resort

    Example:
        >>> detector = HybridSectionDetector(document, '10-K')
        >>> sections = detector.detect_sections()
        >>> for name, section in sections.items():
        ...     print(f"{name}: {section.confidence:.2f} ({section.detection_method})")
    """

    def __init__(self, document: Document, form: str, thresholds: Optional[DetectionThresholds] = None):
        """
        Initialize hybrid detector.

        Args:
            document: Document to extract sections from
            form: Filing type ('10-K', '10-Q', '8-K')
            thresholds: Detection thresholds configuration
        """
        self.document = document
        self.form = form
        self.thresholds = thresholds or DetectionThresholds()
        # Initialize detection strategies
        self.toc_detector = TOCSectionDetector(document)
        self.pattern_extractor = SectionExtractor(form)

    def detect_sections(self) -> Dict[str, Section]:
        """
        Detect sections using hybrid approach with fallback and validation.

        Returns:
            Dictionary mapping section names to Section objects with confidence scores
        """
        # Strategy 1: TOC-based (most reliable)
        logger.debug("Trying TOC-based detection...")
        sections = self.toc_detector.detect()
        if sections:
            logger.info(f"TOC detection successful: {len(sections)} sections found")
            return self._validate_pipeline(sections, enable_cross_validation=True)
        # Strategy 2: Heading-based (fallback)
        logger.debug("TOC detection failed, trying heading detection...")
        sections = self._try_heading_detection()
        if sections:
            logger.info(f"Heading detection successful: {len(sections)} sections found")
            return self._validate_pipeline(sections, enable_cross_validation=False)
        # Strategy 3: Pattern-based (last resort)
        logger.debug("Heading detection failed, trying pattern matching...")
        sections = self._try_pattern_detection()
        if sections:
            logger.info(f"Pattern detection successful: {len(sections)} sections found")
            return self._validate_pipeline(sections, enable_cross_validation=False)
        logger.warning("All detection strategies failed, no sections found")
        return {}

    def _validate_pipeline(
        self,
        sections: Dict[str, Section],
        enable_cross_validation: bool = False
    ) -> Dict[str, Section]:
        """
        Apply validation pipeline to sections.

        Centralizes validation logic to eliminate duplication.

        Args:
            sections: Sections to validate
            enable_cross_validation: Whether to enable cross-validation (expensive)

        Returns:
            Validated sections
        """
        if not sections:
            return sections
        # Cross-validate (optional, expensive)
        if enable_cross_validation and self.thresholds.enable_cross_validation:
            sections = self._cross_validate(sections)
        # Validate boundaries
        sections = self._validate_boundaries(sections)
        # Deduplicate
        sections = self._deduplicate(sections)
        # Filter by confidence
        sections = self._filter_by_confidence(sections)
        return sections

    def _try_heading_detection(self) -> Optional[Dict[str, Section]]:
        """
        Try multi-strategy heading detection.

        Returns:
            Dictionary of sections if successful, None if failed
        """
        try:
            # Get heading nodes from document
            headings = self.document.headings
            if not headings:
                return None
            sections = {}
            for heading in headings:
                # Check if heading has header info
                if not hasattr(heading, 'header_info') or not heading.header_info:
                    continue
                header_info = heading.header_info
                # Only use headings with sufficient confidence
                if header_info.confidence < 0.7:
                    continue
                # Check if it's an item header
                if not header_info.is_item:
                    continue
                # Extract section from this heading to next
                section = self._extract_section_from_heading(heading, header_info)
                if section:
                    section.confidence = header_info.confidence
                    section.detection_method = 'heading'
                    sections[section.name] = section
            return sections if sections else None
        except Exception as e:
            logger.warning(f"Heading detection failed: {e}")
            return None

    def _try_pattern_detection(self) -> Optional[Dict[str, Section]]:
        """
        Try pattern-based extraction.

        Returns:
            Dictionary of sections if successful, None if failed
        """
        try:
            # Use pattern extractor
            sections = self.pattern_extractor.extract(self.document)
            # Mark with pattern detection confidence
            for section in sections.values():
                section.confidence = 0.6  # Pattern-based = lower confidence
                section.detection_method = 'pattern'
            return sections if sections else None
        except Exception as e:
            logger.warning(f"Pattern detection failed: {e}")
            return None

    def _extract_section_from_heading(self, heading: HeadingNode, header_info) -> Optional[Section]:
        """
        Extract section content from heading node to next heading.

        Args:
            heading: HeadingNode representing section start
            header_info: HeaderInfo with section metadata

        Returns:
            Section object if successful, None otherwise
        """
        try:
            # Create section name from item number.
            # Lower-case it so "1A" -> "item_1a", matching the keys produced by
            # HeadingSectionDetector and looked up by Document.get_section().
            section_name = f"item_{header_info.item_number.replace('.', '_').lower()}" if header_info.item_number else "unknown"
            # Create section node
            section_node = SectionNode(section_name=section_name)
            # Find next heading at same or higher level to determine section end
            current_level = header_info.level
            parent = heading.parent
            if not parent:
                return None
            # Find heading position in parent's children
            try:
                heading_index = parent.children.index(heading)
            except ValueError:
                return None
            # Collect nodes until next section heading
            for i in range(heading_index + 1, len(parent.children)):
                child = parent.children[i]
                # Stop at next heading of same or higher level
                if isinstance(child, HeadingNode):
                    if hasattr(child, 'header_info') and child.header_info:
                        if child.header_info.level <= current_level:
                            break
                # Add child to section
                section_node.add_child(child)
            # Parse section name to extract part and item identifiers
            # (consistent with HeadingSectionDetector).
            part, item = Section.parse_section_name(section_name)
            # Create Section object
            section = Section(
                name=section_name,
                title=header_info.text,
                node=section_node,
                start_offset=0,  # Would need actual text position
                end_offset=0,  # Would need actual text position
                confidence=header_info.confidence,
                detection_method='heading',
                part=part,
                item=item
            )
            return section
        except Exception as e:
            logger.warning(f"Failed to extract section from heading: {e}")
            return None

    def _cross_validate(self, sections: Dict[str, Section]) -> Dict[str, Section]:
        """
        Cross-validate sections using multiple detection methods.

        Boosts confidence if multiple methods detect the same section.
        Reduces confidence if methods disagree.

        Args:
            sections: Sections detected by primary method

        Returns:
            Validated sections with adjusted confidence scores
        """
        validated = {}
        # Get pattern-based sections once for comparison (not per section)
        try:
            pattern_sections = self.pattern_extractor.extract(self.document)
        except Exception as e:
            logger.debug(f"Pattern extraction failed for cross-validation: {e}")
            pattern_sections = {}
        for name, section in sections.items():
            # Try alternative detection (pattern matching for validation)
            try:
                # Check if this section is also found by pattern matching
                found_in_patterns = False
                for pattern_section in pattern_sections.values():
                    # Check for name similarity or overlap
                    if self._sections_similar(section, pattern_section):
                        found_in_patterns = True
                        break
                # Boost confidence if methods agree
                if found_in_patterns:
                    section.confidence = min(section.confidence * self.thresholds.cross_validation_boost, 1.0)
                    section.validated = True
                    logger.debug(f"Section {name} validated by multiple methods, confidence boosted to {section.confidence:.2f}")
                else:
                    # Slight reduction if not validated
                    section.confidence *= self.thresholds.disagreement_penalty
                    section.validated = False
            except Exception as e:
                logger.debug(f"Cross-validation failed for {name}: {e}")
                # Keep original confidence if validation fails
                pass
            validated[name] = section
        return validated

    def _validate_boundaries(self, sections: Dict[str, Section]) -> Dict[str, Section]:
        """
        Validate section boundaries for overlaps, gaps, and ordering.

        Args:
            sections: Sections to validate

        Returns:
            Sections with validated boundaries
        """
        if not sections:
            return sections
        # Sort by start offset
        sorted_sections = sorted(sections.items(), key=lambda x: x[1].start_offset)
        validated = {}
        prev_section = None
        for name, section in sorted_sections:
            # Check for overlap with previous section
            if prev_section and section.start_offset > 0:
                if section.start_offset < prev_section[1].end_offset:
                    # Overlap detected - adjust boundary at midpoint
                    gap_mid = (prev_section[1].end_offset + section.start_offset) // 2
                    prev_section[1].end_offset = gap_mid
                    section.start_offset = gap_mid
                    # Reduce confidence due to boundary adjustment
                    section.confidence *= self.thresholds.boundary_overlap_penalty
                    prev_section[1].confidence *= self.thresholds.boundary_overlap_penalty
                    logger.debug(f"Adjusted boundary between {prev_section[0]} and {name}")
                # Check for a suspiciously large gap between consecutive sections
                # (fixed character-count threshold, not a fraction of document size)
                elif prev_section[1].end_offset > 0:
                    gap_size = section.start_offset - prev_section[1].end_offset
                    if gap_size > 100000:  # Arbitrary large gap threshold
                        # Large gap - might indicate missing section
                        section.confidence *= 0.9
                        logger.debug(f"Large gap detected before {name}")
            validated[name] = section
            prev_section = (name, section)
        return validated

    def _deduplicate(self, sections: Dict[str, Section]) -> Dict[str, Section]:
        """
        Remove duplicate sections detected by multiple methods.

        Keeps the detection with highest confidence.

        Args:
            sections: Sections possibly containing duplicates

        Returns:
            Deduplicated sections
        """
        if len(sections) <= 1:
            return sections
        # Group similar sections
        groups = self._group_similar_sections(sections)
        deduplicated = {}
        for group in groups:
            if len(group) == 1:
                # No duplicates
                deduplicated[group[0].name] = group[0]
            else:
                # Keep section with highest confidence
                best = max(group, key=lambda s: s.confidence)
                # Merge detection methods
                methods = set(s.detection_method for s in group)
                if len(methods) > 1:
                    best.detection_method = ','.join(sorted(methods))
                    # Boost confidence for multi-method detection
                    best.confidence = min(best.confidence * 1.15, 1.0)
                    best.validated = True
                logger.debug(f"Merged duplicate sections for {best.name}, methods: {best.detection_method}")
                deduplicated[best.name] = best
        return deduplicated

    def _group_similar_sections(self, sections: Dict[str, Section]) -> List[List[Section]]:
        """
        Group sections that appear to be duplicates.

        Args:
            sections: Sections to group

        Returns:
            List of section groups
        """
        groups = []
        used = set()
        for name1, section1 in sections.items():
            if name1 in used:
                continue
            group = [section1]
            used.add(name1)
            for name2, section2 in sections.items():
                if name2 in used:
                    continue
                # Check if sections are similar
                if self._sections_similar(section1, section2):
                    group.append(section2)
                    used.add(name2)
            groups.append(group)
        return groups

    def _sections_similar(self, section1: Section, section2: Section) -> bool:
        """
        Check if two sections are similar (likely duplicates).

        Args:
            section1: First section
            section2: Second section

        Returns:
            True if sections are similar
        """
        # Normalize names for comparison
        name1 = section1.name.lower().replace('_', ' ').strip()
        name2 = section2.name.lower().replace('_', ' ').strip()
        # Check exact match after normalization
        if name1 == name2:
            return True
        # Check title similarity (exact match)
        title1 = section1.title.lower().strip()
        title2 = section2.title.lower().strip()
        if title1 == title2:
            return True
        # Check for position overlap (if positions are set)
        if section1.start_offset > 0 and section2.start_offset > 0:
            # Calculate overlap
            overlap_start = max(section1.start_offset, section2.start_offset)
            overlap_end = min(section1.end_offset, section2.end_offset)
            if overlap_end > overlap_start:
                # There is overlap
                overlap_size = overlap_end - overlap_start
                min_size = min(
                    section1.end_offset - section1.start_offset,
                    section2.end_offset - section2.start_offset
                )
                # If overlap is >50% of smaller section, consider similar
                if min_size > 0 and overlap_size / min_size > 0.5:
                    return True
        return False

    def _filter_by_confidence(self, sections: Dict[str, Section]) -> Dict[str, Section]:
        """
        Filter sections by minimum confidence threshold.

        Args:
            sections: Sections to filter

        Returns:
            Filtered sections meeting minimum confidence
        """
        # Check for filing-specific thresholds
        min_conf = self.thresholds.min_confidence
        if self.form in self.thresholds.thresholds_by_form:
            filing_thresholds = self.thresholds.thresholds_by_form[self.form]
            min_conf = filing_thresholds.get('min_confidence', min_conf)
        filtered = {}
        for name, section in sections.items():
            if section.confidence >= min_conf:
                filtered[name] = section
            else:
                logger.debug(f"Filtered out section {name} with confidence {section.confidence:.2f} < {min_conf:.2f}")
        return filtered

View File

@@ -0,0 +1,405 @@
"""
Section extraction from documents.
"""
import re
from typing import Dict, List, Optional, Tuple
from edgar.documents.document import Document, Section
from edgar.documents.nodes import Node, HeadingNode, SectionNode
class SectionExtractor:
"""
Extracts logical sections from documents.
Identifies document sections like:
- Business Overview (Item 1)
- Risk Factors (Item 1A)
- MD&A (Item 7)
- Financial Statements (Item 8)
"""
# Common section patterns for different filing types.
# Structure: {form: {section_key: [(regex, display_title), ...]}}.
# Within each section the first (Item-numbered) pattern is the canonical
# header form; the remaining patterns are looser fallbacks.
SECTION_PATTERNS = {
    '10-K': {
        'business': [
            (r'^(Item|ITEM)\s+1\.?\s*Business', 'Item 1 - Business'),
            (r'^Business\s*$', 'Business'),
            (r'^Business Overview', 'Business Overview'),
            (r'^Our Business', 'Our Business'),
            (r'^Company Overview', 'Company Overview')
        ],
        'risk_factors': [
            (r'^(Item|ITEM)\s+1A\.?\s*Risk\s+Factors', 'Item 1A - Risk Factors'),
            (r'^Risk\s+Factors', 'Risk Factors'),
            (r'^Factors\s+That\s+May\s+Affect', 'Risk Factors')
        ],
        'properties': [
            (r'^(Item|ITEM)\s+2\.?\s*Properties', 'Item 2 - Properties'),
            (r'^Properties', 'Properties'),
            (r'^Real\s+Estate', 'Real Estate')
        ],
        'legal_proceedings': [
            (r'^(Item|ITEM)\s+3\.?\s*Legal\s+Proceedings', 'Item 3 - Legal Proceedings'),
            (r'^Legal\s+Proceedings', 'Legal Proceedings'),
            (r'^Litigation', 'Litigation')
        ],
        # NOTE: 7A (market_risk) is listed before 7 (mda); if matching is
        # order-sensitive this ensures "Item 7A" is not swallowed by "Item 7".
        'market_risk': [
            (r'^(Item|ITEM)\s+7A\.?\s*Quantitative.*Disclosures', 'Item 7A - Market Risk'),
            (r'^Market\s+Risk', 'Market Risk'),
            (r'^Quantitative.*Qualitative.*Market\s+Risk', 'Market Risk')
        ],
        'mda': [
            (r'^(Item|ITEM)\s+7\.?\s*Management.*Discussion', 'Item 7 - MD&A'),
            (r'^Management.*Discussion.*Analysis', 'MD&A'),
            (r'^MD&A', 'MD&A')
        ],
        'financial_statements': [
            (r'^(Item|ITEM)\s+8\.?\s*Financial\s+Statements', 'Item 8 - Financial Statements'),
            (r'^Financial\s+Statements', 'Financial Statements'),
            (r'^Consolidated\s+Financial\s+Statements', 'Consolidated Financial Statements')
        ],
        'controls_procedures': [
            (r'^(Item|ITEM)\s+9A\.?\s*Controls.*Procedures', 'Item 9A - Controls and Procedures'),
            (r'^Controls.*Procedures', 'Controls and Procedures'),
            (r'^Internal\s+Control', 'Internal Controls')
        ]
    },
    # 10-Q item numbers repeat across Part I/Part II; part disambiguation is
    # handled separately by the extractor, not by these patterns.
    '10-Q': {
        'financial_statements': [
            (r'^(Item|ITEM)\s+1\.?\s*Financial\s+Statements', 'Item 1 - Financial Statements'),
            (r'^Financial\s+Statements', 'Financial Statements'),
            (r'^Condensed.*Financial\s+Statements', 'Condensed Financial Statements')
        ],
        'mda': [
            (r'^(Item|ITEM)\s+2\.?\s*Management.*Discussion', 'Item 2 - MD&A'),
            (r'^Management.*Discussion.*Analysis', 'MD&A')
        ],
        'market_risk': [
            (r'^(Item|ITEM)\s+3\.?\s*Quantitative.*Disclosures', 'Item 3 - Market Risk'),
            (r'^Market\s+Risk', 'Market Risk')
        ],
        'controls_procedures': [
            (r'^(Item|ITEM)\s+4\.?\s*Controls.*Procedures', 'Item 4 - Controls and Procedures'),
            (r'^Controls.*Procedures', 'Controls and Procedures')
        ],
        'legal_proceedings': [
            (r'^(Item|ITEM)\s+1\.?\s*Legal\s+Proceedings', 'Item 1 - Legal Proceedings'),
            (r'^Legal\s+Proceedings', 'Legal Proceedings')
        ],
        'risk_factors': [
            (r'^(Item|ITEM)\s+1A\.?\s*Risk\s+Factors', 'Item 1A - Risk Factors'),
            (r'^Risk\s+Factors', 'Risk Factors')
        ]
    },
    # 8-K sections use the dotted item-number convention (e.g. Item 2.02).
    '8-K': {
        'item_101': [
            (r'^(Item|ITEM)\s+1\.01', 'Item 1.01 - Entry into Material Agreement'),
            (r'^Entry.*Material.*Agreement', 'Material Agreement')
        ],
        'item_201': [
            (r'^(Item|ITEM)\s+2\.01', 'Item 2.01 - Completion of Acquisition'),
            (r'^Completion.*Acquisition', 'Acquisition')
        ],
        'item_202': [
            (r'^(Item|ITEM)\s+2\.02', 'Item 2.02 - Results of Operations'),
            (r'^Results.*Operations', 'Results of Operations')
        ],
        'item_503': [
            (r'^(Item|ITEM)\s+5\.03', 'Item 5.03 - Director/Officer Changes'),
            (r'^Amendments.*Articles', 'Charter Amendments')
        ],
        'item_801': [
            (r'^(Item|ITEM)\s+8\.01', 'Item 8.01 - Other Events'),
            (r'^Other\s+Events', 'Other Events')
        ],
        'item_901': [
            (r'^(Item|ITEM)\s+9\.01', 'Item 9.01 - Financial Statements and Exhibits'),
            (r'^Financial.*Exhibits', 'Financial Statements and Exhibits')
        ]
    }
}
def __init__(self, form: Optional[str] = None):
    """
    Initialize section extractor.

    Args:
        form: Type of filing (10-K, 10-Q, 8-K, etc.). When provided, it
            takes precedence over the document's metadata/config in
            extract().
    """
    self.form = form
def extract(self, document: Document) -> Dict[str, Section]:
    """
    Extract sections from document.

    Args:
        document: Document to extract sections from

    Returns:
        Dictionary mapping section names to Section objects
    """
    # Resolve the filing type: the explicit constructor argument wins,
    # then the document's metadata, then the parser configuration.
    # Auto-detection was removed deliberately (expensive and unnecessary).
    form = self.form
    if not form and document.metadata and document.metadata.form:
        form = document.metadata.form
    if not form and getattr(document, '_config', None) and document._config.form:
        form = document._config.form

    # Section detection only applies to forms with standard layouts.
    if form not in ('10-K', '10-Q', '8-K'):
        return {}

    patterns = self.SECTION_PATTERNS.get(form, {})
    if not patterns:
        return {}  # No patterns defined for this form type

    headers = self._find_section_headers(document)

    # 10-Q filings repeat item numbers in Part I and Part II, so the part
    # boundaries are needed to disambiguate them.
    part_context = self._detect_10q_parts(headers) if form == '10-Q' else None

    matched = self._match_sections(headers, patterns, document, part_context)
    return self._create_sections(matched, document)
# NOTE: _detect_form() removed - form type should be known from context
# Filing metadata should be set by the caller (Filing class, TenK/TenQ, etc.)
# NOTE: _infer_form_from_headers() kept for backward compatibility but not used
# in normal flow anymore. Form type should always be provided explicitly.
def _infer_form_from_headers(self, document: Document) -> str:
    """
    Infer filing type from section headers.

    NOTE: Kept for backward compatibility only; the normal flow expects
    the form type to be provided explicitly via config or metadata.
    """
    texts = [h.text().upper() for h in document.headings if h.text()]

    # 10-K filings carry the classic Item 1/1A/7/8 structure.
    looks_like_10k = any(
        'ITEM 1.' in t or 'ITEM 1A.' in t or 'ITEM 7.' in t or 'ITEM 8.' in t
        for t in texts
    )
    # 10-Q filings pair low item numbers with quarterly section titles.
    looks_like_10q = any(
        ('ITEM 1.' in t and 'FINANCIAL STATEMENTS' in t)
        or ('ITEM 2.' in t and 'MANAGEMENT' in t)
        or 'ITEM 3.' in t
        or 'ITEM 4.' in t
        for t in texts
    )
    # 8-K items use the dotted two-digit form, e.g. "Item 5.02".
    looks_like_8k = any(re.search(r'ITEM \d\.\d{2}', t) for t in texts)

    if looks_like_10k and not looks_like_10q:
        return '10-K'
    if looks_like_10q:
        return '10-Q'
    if looks_like_8k:
        return '8-K'
    return 'UNKNOWN'
def _get_general_patterns(self) -> Dict[str, List[Tuple[str, str]]]:
"""Get general section patterns."""
return {
'business': [
(r'^Business', 'Business'),
(r'^Overview', 'Overview'),
(r'^Company', 'Company')
],
'financial': [
(r'^Financial\s+Statements', 'Financial Statements'),
(r'^Consolidated.*Statements', 'Consolidated Statements')
],
'notes': [
(r'^Notes\s+to.*Financial\s+Statements', 'Notes to Financial Statements'),
(r'^Notes\s+to.*Statements', 'Notes')
]
}
def _find_section_headers(self, document: Document) -> List[Tuple[Node, str, int]]:
    """Collect candidate section headers as (node, text, position) tuples."""
    candidates: List[Tuple[Node, str, int]] = []

    # Direct heading nodes contribute their own text.
    for heading in document.root.find(lambda n: isinstance(n, HeadingNode)):
        heading_text = heading.text()
        if heading_text:
            candidates.append(
                (heading, heading_text, self._get_node_position(heading, document))
            )

    # Section nodes contribute their first heading's text, but the section
    # node itself is kept as the anchor.
    for section in document.root.find(lambda n: isinstance(n, SectionNode)):
        first_heading = section.find_first(lambda n: isinstance(n, HeadingNode))
        if first_heading:
            heading_text = first_heading.text()
            if heading_text:
                candidates.append(
                    (section, heading_text, self._get_node_position(section, document))
                )

    # Order by document position.
    return sorted(candidates, key=lambda entry: entry[2])
def _get_node_position(self, node: Node, document: Document) -> int:
"""Get position of node in document."""
position = 0
for n in document.root.walk():
if n == node:
return position
position += 1
return position
def _detect_10q_parts(self, headers: List[Tuple[Node, str, int]]) -> Dict[int, str]:
"""
Detect Part I and Part II boundaries in 10-Q filings.
Args:
headers: List of (node, text, position) tuples
Returns:
Dict mapping header index to part name ("Part I" or "Part II")
"""
part_context = {}
current_part = None
part_i_pattern = re.compile(r'^\s*PART\s+I\b', re.IGNORECASE)
part_ii_pattern = re.compile(r'^\s*PART\s+II\b', re.IGNORECASE)
for i, (node, text, position) in enumerate(headers):
text_stripped = text.strip()
# Check if this is a Part I or Part II header
if part_i_pattern.match(text_stripped):
current_part = "Part I"
part_context[i] = current_part
elif part_ii_pattern.match(text_stripped):
current_part = "Part II"
part_context[i] = current_part
elif current_part:
# Headers after a Part declaration belong to that part
part_context[i] = current_part
return part_context
def _match_sections(self,
                    headers: List[Tuple[Node, str, int]],
                    patterns: Dict[str, List[Tuple[str, str]]],
                    document: Document,
                    part_context: Optional[Dict[int, str]] = None) -> Dict[str, Tuple[Node, str, int, int]]:
    """Match headers to section patterns.

    Args:
        headers: Ordered (node, text, position) candidates from
            _find_section_headers.
        patterns: Mapping of section name -> list of (regex, title) pairs,
            ordered from most to least specific.
        document: Document being analyzed (used to locate section ends).
        part_context: Optional mapping of header index -> "Part I"/"Part II"
            (10-Q only); when present, titles and result keys are
            part-qualified.

    Returns:
        Mapping of section key -> (node, title, start_position, end_position).
    """
    matched_sections = {}
    used_headers = set()  # header indices already consumed by some section
    # Try to match each pattern
    for section_name, section_patterns in patterns.items():
        for pattern, title in section_patterns:
            for i, (node, text, position) in enumerate(headers):
                if i in used_headers:
                    continue
                # Try to match pattern
                if re.match(pattern, text.strip(), re.IGNORECASE):
                    # Find end position (next section or end of document)
                    end_position = self._find_section_end(i, headers, document)
                    # For 10-Q, prefix with Part I or Part II
                    final_title = title
                    if part_context and i in part_context:
                        final_title = f"{part_context[i]} - {title}"
                    # Use final_title as key to avoid conflicts.
                    # NOTE: with part_context the key is the part-qualified
                    # title, so the `section_name in matched_sections` check
                    # below stays False and later patterns may still match the
                    # same logical section in the other part — this is what
                    # lets e.g. "Item 1" be captured in both Part I and
                    # Part II of a 10-Q.
                    section_key = final_title if part_context and i in part_context else section_name
                    matched_sections[section_key] = (node, final_title, position, end_position)
                    used_headers.add(i)
                    break
            # If we found a match, move to next section
            if section_name in matched_sections:
                break
    return matched_sections
def _find_section_end(self,
                      section_index: int,
                      headers: List[Tuple[Node, str, int]],
                      document: Document) -> int:
    """Return the walk position where the section at section_index ends."""

    def _rank(candidate: Node) -> int:
        # Non-heading anchors (e.g. SectionNode) are treated as top level.
        return candidate.level if isinstance(candidate, HeadingNode) else 1

    # The section ends at the first subsequent header of equal or higher
    # rank (lower or equal numeric level).
    if section_index + 1 < len(headers):
        current_rank = _rank(headers[section_index][0])
        for later_node, _, later_position in headers[section_index + 1:]:
            if _rank(later_node) <= current_rank:
                return later_position

    # No such header: the section extends to the end of the document.
    return sum(1 for _ in document.root.walk())
def _create_sections(self,
                     matched_sections: Dict[str, Tuple[Node, str, int, int]],
                     document: Document) -> Dict[str, Section]:
    """Create Section objects from matches.

    Args:
        matched_sections: Mapping of section key -> (node, title, start, end)
            walk positions produced by _match_sections.
        document: Source document whose nodes are collected into each section.

    Returns:
        Mapping of section key -> Section, each tagged with pattern-based
        (0.7) confidence.
    """
    sections = {}
    for section_name, (node, title, start_pos, end_pos) in matched_sections.items():
        # Create section node containing all content in range
        section_node = SectionNode(section_name=section_name)
        # Find all nodes in position range
        position = 0
        for n in document.root.walk():
            if start_pos <= position < end_pos:
                # Clone node and add to section
                # (In real implementation, would properly handle node hierarchy)
                # NOTE(review): add_child on a walked node may re-parent it
                # rather than clone — confirm this does not mutate the
                # original tree.
                section_node.add_child(n)
            position += 1
        # Parse section name to extract part and item identifiers
        part, item = Section.parse_section_name(section_name)
        # Create Section object
        section = Section(
            name=section_name,
            title=title,
            node=section_node,
            start_offset=start_pos,
            end_offset=end_pos,
            confidence=0.7,  # Pattern-based detection = moderate confidence
            detection_method='pattern',  # Method: regex pattern matching
            part=part,
            item=item
        )
        sections[section_name] = section
    return sections

View File

@@ -0,0 +1,348 @@
"""
Text extraction from documents with various options.
"""
import re
from typing import List, Optional, Set
from edgar.documents.document import Document
from edgar.documents.nodes import Node, TextNode, HeadingNode, ParagraphNode
from edgar.documents.table_nodes import TableNode
from edgar.documents.types import NodeType
class TextExtractor:
    """
    Extracts text from documents with configurable options.

    Supports:
    - Clean text extraction for AI/NLP
    - Table inclusion/exclusion
    - Metadata annotations
    - Length limiting
    - Smart whitespace handling
    """

    # Characters that may open a bullet item. The non-ASCII entries had been
    # reduced to empty strings by an encoding accident, which made the bullet
    # check match every string (''.startswith('') is always True). Restored
    # here as explicit escapes; NOTE(review): confirm this set against the
    # originally intended glyphs.
    BULLET_CHARS = [
        '\u2022',  # bullet
        '\u25e6',  # white bullet
        '\u25aa',  # black small square
        '\u00b7',  # middle dot
        '\u25cf',  # black circle
        '\u2023',  # triangular bullet
        '-',
        '*',
    ]

    def __init__(self,
                 clean: bool = True,
                 include_tables: bool = True,
                 include_metadata: bool = False,
                 include_links: bool = False,
                 max_length: Optional[int] = None,
                 preserve_structure: bool = False):
        """
        Initialize text extractor.

        Args:
            clean: Clean and normalize text
            include_tables: Include table content
            include_metadata: Include metadata annotations
            include_links: Include link URLs (stored; not used in this class)
            max_length: Maximum text length
            preserve_structure: Preserve document structure with markers
        """
        self.clean = clean
        self.include_tables = include_tables
        self.include_metadata = include_metadata
        self.include_links = include_links
        self.max_length = max_length
        self.preserve_structure = preserve_structure
        # Track what we've extracted to avoid duplicates (shared nodes)
        self._extracted_ids: Set[str] = set()

    def extract(self, document: Document) -> str:
        """
        Extract text from document.

        Args:
            document: Document to extract from

        Returns:
            Extracted text
        """
        parts = []
        self._extracted_ids.clear()
        # Extract from root
        self._extract_from_node(document.root, parts, depth=0)
        # Join parts
        if self.preserve_structure:
            text = '\n'.join(parts)
        else:
            text = '\n\n'.join(filter(None, parts))
        # Apply minimal global cleaning - tables are already handled
        # appropriately per node
        if self.clean:
            text = self._clean_document_text(text)
        # Limit length if requested
        if self.max_length and len(text) > self.max_length:
            text = self._truncate_text(text, self.max_length)
        return text

    def extract_from_node(self, node: Node) -> str:
        """Extract text from a specific node (public entry point)."""
        parts = []
        self._extracted_ids.clear()
        self._extract_from_node(node, parts, depth=0)
        text = '\n\n'.join(filter(None, parts))
        if self.clean:
            text = self._clean_document_text(text)
        return text

    def _extract_from_node(self, node: Node, parts: List[str], depth: int):
        """Recursively extract text from node - render each node type appropriately."""
        # Skip if already extracted (handles shared nodes)
        if node.id in self._extracted_ids:
            return
        self._extracted_ids.add(node.id)
        # Handle based on node type - like old parser's block.get_text()
        if isinstance(node, TableNode):
            if self.include_tables:
                # Tables render themselves - preserve their formatting
                self._extract_table(node, parts)
        elif isinstance(node, HeadingNode):
            # Headings get cleaned text
            self._extract_heading(node, parts, depth)
        elif isinstance(node, TextNode):
            # Text nodes get cleaned if cleaning is enabled
            text = node.text()
            if text:
                if self.clean:
                    text = self._clean_text_content(text)  # Clean non-table text
                if self.include_metadata and node.metadata:
                    text = self._annotate_with_metadata(text, node.metadata)
                parts.append(text)
        elif isinstance(node, ParagraphNode):
            # Extract paragraph as unified text to maintain flow of inline elements
            text = node.text()
            if text:
                if self.clean:
                    text = self._clean_text_content(text)
                if self.include_metadata and node.metadata:
                    text = self._annotate_with_metadata(text, node.metadata)
                parts.append(text)
            # Don't process children since we already got the paragraph text
            return
        else:
            # Check if this looks like a bullet point container that should
            # flow together on one line
            if self._is_bullet_point_container(node):
                # Extract text from bullet point children and join with
                # spaces (not newlines)
                bullet_parts = []
                for child in node.children:
                    child_text = child.text() if hasattr(child, 'text') else ""
                    if child_text and child_text.strip():
                        bullet_parts.append(child_text.strip())
                if bullet_parts:
                    # Join with spaces for bullet points
                    text = ' '.join(bullet_parts)
                    if self.clean:
                        text = self._clean_text_content(text)
                    if self.include_metadata and node.metadata:
                        text = self._annotate_with_metadata(text, node.metadata)
                    parts.append(text)
                # Don't process children since we already got the unified text
                return
            # For other nodes, extract text content and clean if appropriate
            if hasattr(node, 'content') and isinstance(node.content, str):
                text = node.content
                if text and text.strip():
                    if self.clean:
                        text = self._clean_text_content(text)  # Clean non-table text
                    if self.include_metadata and node.metadata:
                        text = self._annotate_with_metadata(text, node.metadata)
                    parts.append(text)
        # Process children
        for child in node.children:
            self._extract_from_node(child, parts, depth + 1)

    def _extract_heading(self, node: HeadingNode, parts: List[str], depth: int):
        """Extract heading text, optionally prefixed with '#' structure markers."""
        text = node.text()
        if not text:
            return
        if self.preserve_structure:
            # Add structure markers: one '#' per heading level (markdown-like)
            marker = '#' * node.level
            text = f"{marker} {text}"
        if self.include_metadata and node.metadata:
            text = self._annotate_with_metadata(text, node.metadata)
        parts.append(text)

    def _extract_table(self, table: TableNode, parts: List[str]):
        """Extract table content - preserve original formatting like old parser."""
        if self.preserve_structure:
            parts.append("[TABLE START]")
        # Add table caption if present
        if table.caption:
            caption_text = table.caption
            if self.clean:
                # Clean caption but not table content
                caption_text = self._clean_text_content(caption_text)
            if self.preserve_structure:
                parts.append(f"Caption: {caption_text}")
            else:
                parts.append(caption_text)
        # Extract table text - PRESERVE FORMATTING (like old parser's
        # TableBlock.get_text())
        table_text = table.text()
        if table_text:
            # Tables render their own formatting - don't apply text cleaning
            # so column alignment survives
            parts.append(table_text)
        if self.preserve_structure:
            parts.append("[TABLE END]")

    def _annotate_with_metadata(self, text: str, metadata: dict) -> str:
        """Prefix text with bracketed annotations derived from node metadata."""
        annotations = []
        # Add XBRL annotations
        if 'ix_tag' in metadata:
            annotations.append(f"[XBRL: {metadata['ix_tag']}]")
        # Add section annotations
        if 'section_name' in metadata:
            annotations.append(f"[Section: {metadata['section_name']}]")
        # Add semantic type
        if 'semantic_type' in metadata:
            annotations.append(f"[Type: {metadata['semantic_type']}]")
        if annotations:
            return f"{' '.join(annotations)} {text}"
        return text

    def _clean_text_content(self, text: str) -> str:
        """Clean regular text content (not tables) - like old parser text cleaning."""
        if not text:
            return text
        # Replace multiple spaces with single space for regular text
        text = re.sub(r' {2,}', ' ', text)
        # Clean up space around newlines
        text = re.sub(r' *\n *', '\n', text)
        # Remove leading/trailing whitespace from lines
        lines = text.split('\n')
        lines = [line.strip() for line in lines]
        text = '\n'.join(lines)
        # Normalize quotes and dashes
        text = self._normalize_punctuation(text)
        return text

    def _is_bullet_point_container(self, node) -> bool:
        """Check if a container node represents a bullet point that should flow as one line."""
        from edgar.documents.nodes import ContainerNode
        if not isinstance(node, ContainerNode):
            return False
        # Must have at least 2 children (bullet + content)
        if len(node.children) < 2:
            return False
        # Get the text of all children to check for bullet patterns
        all_text = node.text()
        if not all_text:
            return False
        # Check if the text starts with a common bullet character.
        # BUG FIX: this list previously contained empty strings (mangled
        # unicode), making the check pass for every string.
        if not any(all_text.strip().startswith(char) for char in self.BULLET_CHARS):
            return False
        # Check if container has flex display (common for bullet point layouts)
        if hasattr(node, 'style') and node.style and hasattr(node.style, 'display'):
            if node.style.display == 'flex':
                return True
        # Check if it has bullet-like structure: short first child + longer content
        if len(node.children) >= 2:
            first_child_text = node.children[0].text() if hasattr(node.children[0], 'text') else ""
            second_child_text = node.children[1].text() if hasattr(node.children[1], 'text') else ""
            # First child is very short (likely bullet), second is longer (content)
            if len(first_child_text.strip()) <= 3 and len(second_child_text.strip()) > 10:
                return True
        return False

    def _clean_document_text(self, text: str) -> str:
        """Apply minimal document-level cleaning that preserves table formatting."""
        if not text:
            return text
        # Only apply global formatting that doesn't affect table alignment:
        # replace excessive newlines (4+ consecutive) with a triple newline
        text = re.sub(r'\n{4,}', '\n\n\n', text)
        # Remove empty lines at start/end only
        return text.strip()

    def _normalize_punctuation(self, text: str) -> str:
        """Normalize punctuation for cleaner text.

        Curly quotes become straight ASCII quotes, em/en dashes become
        ' - ', and spacing around sentence punctuation is tightened.
        """
        # Normalize curly quotes (these literals had been mangled into
        # no-op replacements by an encoding accident).
        text = text.replace('\u201c', '"').replace('\u201d', '"')
        text = text.replace('\u2018', "'").replace('\u2019', "'")
        # Normalize dashes. BUG FIX: the dash literals had become empty
        # strings, and str.replace('', ' - ') inserts ' - ' between every
        # character of the input.
        text = text.replace('\u2014', ' - ')  # em dash
        text = text.replace('\u2013', ' - ')  # en dash
        # Fix spacing around punctuation
        text = re.sub(r'\s+([.,;!?])', r'\1', text)
        text = re.sub(r'([.,;!?])\s*', r'\1 ', text)
        # Remove extra spaces
        text = re.sub(r' {2,}', ' ', text)
        return text.strip()

    def _truncate_text(self, text: str, max_length: int) -> str:
        """Truncate text intelligently, preferring sentence/word boundaries."""
        if len(text) <= max_length:
            return text
        # Try to truncate at sentence boundary
        truncated = text[:max_length]
        last_period = truncated.rfind('.')
        last_newline = truncated.rfind('\n')
        # Choose the better truncation point
        truncate_at = max(last_period, last_newline)
        if truncate_at > max_length * 0.8:  # If we found a good boundary
            return text[:truncate_at + 1].strip()
        # Otherwise truncate at word boundary
        last_space = truncated.rfind(' ')
        if last_space > max_length * 0.9:
            return text[:last_space].strip() + '...'
        # Last resort: hard truncate
        return text[:max_length - 3].strip() + '...'

View File

@@ -0,0 +1,178 @@
"""
TOC-based section detection strategy.
Detects sections using Table of Contents structure. Provides highest
confidence (0.95) and includes full text extraction capabilities.
This detector wraps SECSectionExtractor which has proven implementations of:
- Multi-column TOC support (checks all preceding table cells)
- Nested anchor handling (traverses up to find content container)
- Full section text extraction
"""
import logging
from typing import Dict, Optional
from edgar.documents.document import Document, Section
from edgar.documents.nodes import SectionNode
from edgar.documents.extractors.toc_section_extractor import SECSectionExtractor
logger = logging.getLogger(__name__)
class TOCSectionDetector:
    """
    TOC-based section detection strategy.

    Uses the filing's Table of Contents to locate section boundaries and
    lazily extract full section text. Detections carry high confidence
    (0.95).

    Wraps the proven SECSectionExtractor, which implements multi-column TOC
    support, nested-anchor handling, and boundary-aware text extraction.
    """

    def __init__(self, document: Document):
        """
        Initialize TOC-based detector.

        Args:
            document: Document to analyze (must have metadata.original_html)
        """
        self.document = document
        self.extractor = SECSectionExtractor(document)

    def detect(self) -> Optional[Dict[str, Section]]:
        """
        Detect sections using TOC structure.

        Returns:
            Dictionary mapping section names to Section objects, or None if
            the original HTML is unavailable or no sections were found.
        """
        if not getattr(self.document.metadata, 'original_html', None):
            logger.debug("TOC detection unavailable: original_html not in document metadata")
            return None

        try:
            section_names = self.extractor.get_available_sections()
            if not section_names:
                logger.debug("No sections found in TOC")
                return None

            detected: Dict[str, Section] = {}
            for name in section_names:
                section = self._build_section(name)
                if section is not None:
                    detected[name] = section

            if not detected:
                return None
            logger.info(f"TOC detection found {len(detected)} sections")
            return detected
        except Exception as e:
            logger.warning(f"TOC detection failed: {e}", exc_info=True)
            return None

    def _build_section(self, name: str) -> Optional[Section]:
        """Build a Section for one TOC entry, or None if it has no content."""
        info = self.extractor.get_section_info(name)
        if not info:
            logger.debug(f"Skipping {name}: no section info")
            return None

        # Container sections (e.g. "Item 1", "Item 10") may have no direct
        # text but still aggregate their subsections.
        text = self.extractor.get_section_text(name, include_subsections=True)
        if not text and not info.get('subsections', []):
            logger.debug(f"Skipping {name}: no text and no subsections")
            return None

        extractor = self.extractor

        def _lazy_text(section_name=None, **kwargs):
            # Lazy text loader: the captured `name` wins over any passed
            # section_name argument.
            clean = kwargs.get('clean', True)
            return extractor.get_section_text(name, include_subsections=True, clean=clean) or ""

        # Parse section name to extract part and item identifiers
        part, item = Section.parse_section_name(name)

        return Section(
            name=name,
            title=info.get('canonical_name', name),
            node=SectionNode(section_name=name),  # placeholder; content loads lazily
            start_offset=0,  # Actual offsets would require a full parse
            end_offset=len(text) if text else 0,
            confidence=0.95,  # TOC-based = high confidence
            detection_method='toc',
            part=part,
            item=item,
            _text_extractor=_lazy_text,
        )
def get_section_text(document: Document, section_name: str) -> Optional[str]:
    """
    Get section text using TOC-based extraction.

    Args:
        document: Document to extract from
        section_name: Section name (e.g., 'Item 1', 'Item 1A')

    Returns:
        Section text if available, None otherwise
    """
    # TOC extraction needs the original HTML; bail out early without it.
    if not getattr(document.metadata, 'original_html', None):
        return None
    try:
        return SECSectionExtractor(document).get_section_text(section_name)
    except Exception as e:
        logger.warning(f"Failed to get section text for {section_name}: {e}")
        return None
def get_available_sections(document: Document) -> list[str]:
    """
    Get list of available sections from TOC.

    Args:
        document: Document to analyze

    Returns:
        List of section names found in TOC (empty when HTML is unavailable
        or analysis fails)
    """
    # TOC extraction needs the original HTML; bail out early without it.
    if not getattr(document.metadata, 'original_html', None):
        return []
    try:
        return SECSectionExtractor(document).get_available_sections()
    except Exception as e:
        logger.warning(f"Failed to get available sections: {e}")
        return []

View File

@@ -0,0 +1,383 @@
"""
Section extraction for SEC filings using Table of Contents analysis.
This system uses TOC structure to extract specific sections like "Item 1",
"Item 1A", etc. from SEC filings. This approach works consistently across
all SEC filings regardless of whether they use semantic anchors or generated IDs.
"""
import re
from typing import Dict, List, Optional, Tuple, Set
from dataclasses import dataclass
from lxml import html as lxml_html
from edgar.documents.nodes import Node, SectionNode
from edgar.documents.document import Document
from edgar.documents.utils.toc_analyzer import TOCAnalyzer
@dataclass
class SectionBoundary:
    """Represents the boundaries of a document section."""
    name: str  # Canonical section name, e.g. "Item 1A"
    anchor_id: str  # HTML id of the section's TOC anchor target
    start_element_id: Optional[str] = None  # id of first content element (if known)
    end_element_id: Optional[str] = None  # anchor id where the next section starts
    start_node: Optional[Node] = None  # parsed-tree start node (if resolved)
    end_node: Optional[Node] = None  # parsed-tree end node (if resolved)
    text_start: Optional[int] = None  # Character position in full text
    text_end: Optional[int] = None  # Character position in full text
    confidence: float = 1.0  # Detection confidence (0.0-1.0)
    detection_method: str = 'unknown'  # How section was detected (e.g. 'toc')
class SECSectionExtractor:
"""
Extract specific sections from SEC filings using Table of Contents analysis.
This uses TOC structure to identify section boundaries and extract content
between them. Works consistently for all SEC filings.
"""
def __init__(self, document: Document):
    """Initialize the extractor and eagerly analyze the document's TOC.

    Args:
        document: Parsed document; metadata.original_html is required for
            any extraction to succeed.
    """
    self.document = document
    self.section_map = {}  # Maps section names to canonical names
    self.section_boundaries = {}  # Maps section names to SectionBoundary objects
    self.toc_analyzer = TOCAnalyzer()
    # Build the boundary map up front so lookups are cheap afterwards.
    self._analyze_sections()
def _analyze_sections(self) -> None:
    """
    Analyze the document using TOC structure to identify section boundaries.

    Populates self.section_boundaries (name -> SectionBoundary) and
    self.section_map (name -> canonical name) from the Table of Contents.
    Silently does nothing when original HTML or a TOC is unavailable.
    """
    # Get the original HTML if available
    html_content = getattr(self.document.metadata, 'original_html', None)
    if not html_content:
        return
    # Use TOC analysis to find sections
    toc_mapping = self.toc_analyzer.analyze_toc_structure(html_content)
    if not toc_mapping:
        return  # No sections found
    # Handle XML declaration issues (lxml rejects it on unicode input)
    if html_content.startswith('<?xml'):
        html_content = re.sub(r'<\?xml[^>]*\?>', '', html_content, count=1)
    tree = lxml_html.fromstring(html_content)
    sec_sections = {}
    for section_name, anchor_id in toc_mapping.items():
        # Verify the anchor target actually exists in the document body
        target_elements = tree.xpath(f'//*[@id="{anchor_id}"]')
        if target_elements:
            element = target_elements[0]
            # Use TOC-based section info for logical ordering
            section_type, order = self.toc_analyzer._get_section_type_and_order(section_name)
            sec_sections[section_name] = {
                'anchor_id': anchor_id,
                'element': element,
                'canonical_name': section_name,
                'type': section_type,
                'order': order,
                'confidence': 0.95,  # TOC-based detection = high confidence
                'detection_method': 'toc'  # Method: Table of Contents
            }
    if not sec_sections:
        return  # No valid sections found
    # Sort sections by their logical order
    sorted_sections = sorted(sec_sections.items(), key=lambda x: x[1]['order'])
    # Calculate section boundaries: each section ends where the next begins
    for i, (section_name, section_data) in enumerate(sorted_sections):
        start_anchor = section_data['anchor_id']
        # End boundary is the start of the next section (if any)
        end_anchor = None
        if i + 1 < len(sorted_sections):
            next_section = sorted_sections[i + 1][1]
            end_anchor = next_section['anchor_id']
        self.section_boundaries[section_name] = SectionBoundary(
            name=section_name,
            anchor_id=start_anchor,
            end_element_id=end_anchor,
            confidence=section_data.get('confidence', 0.95),
            detection_method=section_data.get('detection_method', 'toc')
        )
    self.section_map = {name: data['canonical_name'] for name, data in sec_sections.items()}
def get_available_sections(self) -> List[str]:
    """
    Get list of available sections that can be extracted.

    Returns:
        Section names sorted by their TOC anchor id.
    """
    # NOTE(review): sorting by anchor id assumes ids sort in document
    # order — confirm this holds for auto-generated anchors.
    boundaries = self.section_boundaries
    return sorted(boundaries, key=lambda name: boundaries[name].anchor_id)
def get_section_text(self, section_name: str,
                     include_subsections: bool = True,
                     clean: bool = True) -> Optional[str]:
    """
    Extract text content for a specific section.

    Args:
        section_name: Name of section (e.g., "Item 1", "Item 1A", "Part I")
        include_subsections: Whether to include subsections
        clean: Whether to apply text cleaning

    Returns:
        Section text content or None if section not found
    """
    # Normalize section name (e.g. "item 1a." -> "Item 1A")
    normalized_name = self._normalize_section_name(section_name)
    if normalized_name not in self.section_boundaries:
        return None
    boundary = self.section_boundaries[normalized_name]
    # Extract content between boundaries using HTML parsing
    html_content = getattr(self.document.metadata, 'original_html', None)
    if not html_content:
        return None
    try:
        section_text = self._extract_section_content(html_content, boundary, include_subsections, clean)
        # If no direct content but include_subsections=True, aggregate
        # subsection text. This handles container sections like "Item 1"
        # whose content lives entirely inside "Item 1A", "Item 1B", ...
        if not section_text and include_subsections:
            subsections = self._get_subsections(normalized_name)
            if subsections:
                # Recursively get text from all subsections
                subsection_texts = []
                for subsection_name in subsections:
                    subsection_text = self.get_section_text(subsection_name, include_subsections=True, clean=clean)
                    if subsection_text:
                        subsection_texts.append(subsection_text)
                if subsection_texts:
                    section_text = '\n\n'.join(subsection_texts)
        return section_text
    except Exception as e:
        # Broad catch is deliberate: any HTML-parsing failure falls back
        # to node-based extraction. NOTE(review): `e` is unused — consider
        # logging it before falling back.
        return self._extract_section_fallback(section_name, clean)
def _normalize_section_name(self, section_name: str) -> str:
"""Normalize section name for lookup."""
# Handle common variations
name = section_name.strip()
# "Item 1" vs "Item 1." vs "Item 1:"
name = re.sub(r'[.:]$', '', name)
# Case normalization
if re.match(r'item\s+\d+', name, re.IGNORECASE):
match = re.match(r'item\s+(\d+[a-z]?)', name, re.IGNORECASE)
if match:
name = f"Item {match.group(1).upper()}"
elif re.match(r'part\s+[ivx]+', name, re.IGNORECASE):
match = re.match(r'part\s+([ivx]+)', name, re.IGNORECASE)
if match:
name = f"Part {match.group(1).upper()}"
return name
def _extract_section_content(self, html_content: str, boundary: SectionBoundary,
                             include_subsections: bool, clean: bool) -> str:
    """
    Extract section content from HTML between anchors.

    Args:
        html_content: Full HTML content
        boundary: Section boundary info
        include_subsections: Whether to include subsections
        clean: Whether to clean the text

    Returns:
        Extracted section text; empty string when the anchor is missing or
        has no following content
    """
    # Handle XML declaration issues (lxml rejects it on unicode input)
    if html_content.startswith('<?xml'):
        html_content = re.sub(r'<\?xml[^>]*\?>', '', html_content, count=1)
    tree = lxml_html.fromstring(html_content)
    # Find start element
    start_elements = tree.xpath(f'//*[@id="{boundary.anchor_id}"]')
    if not start_elements:
        return ""
    start_element = start_elements[0]
    # Collect content until we hit the end boundary (if specified)
    content_elements = []
    # If anchor has no siblings (nested in empty container), traverse up to
    # find a content container. This handles cases like
    # <div id="item7"><div></div></div> where content is after the container.
    current = start_element.getnext()
    if current is None:
        # No sibling - traverse up to find a container with siblings
        container = start_element.getparent()
        while container is not None and container.getnext() is None:
            container = container.getparent()
        # Start from the container's next sibling if found
        if container is not None:
            current = container.getnext()
    # Collect content from siblings
    if current is not None:
        # Normal case - anchor has siblings
        while current is not None:
            # Check if we've reached the end boundary
            if boundary.end_element_id:
                current_id = current.get('id', '')
                if current_id == boundary.end_element_id:
                    break
                # Also stop at a sibling section's anchor when subsections
                # are excluded
                if not include_subsections and self._is_sibling_section(current_id, boundary.name):
                    break
            content_elements.append(current)
            current = current.getnext()
    # Extract text from collected elements
    section_texts = []
    for element in content_elements:
        text = self._extract_element_text(element)
        if text.strip():
            section_texts.append(text)
    combined_text = '\n\n'.join(section_texts)
    # Apply cleaning if requested
    if clean:
        combined_text = self._clean_section_text(combined_text)
    return combined_text
def _is_sibling_section(self, element_id: str, current_section: str) -> bool:
"""Check if element ID represents a sibling section."""
if not element_id:
return False
# Check if this looks like another item at the same level
if 'item' in current_section.lower() and 'item' in element_id.lower():
current_item = re.search(r'item\s*(\d+)', current_section, re.IGNORECASE)
other_item = re.search(r'item[\s_]*(\d+)', element_id, re.IGNORECASE)
if current_item and other_item:
return current_item.group(1) != other_item.group(1)
return False
def _extract_element_text(self, element) -> str:
"""Extract clean text from an HTML element."""
# This would integrate with your existing text extraction logic
# For now, simple text extraction
return element.text_content() or ""
def _clean_section_text(self, text: str) -> str:
"""Clean extracted section text."""
# Apply the same cleaning as the main document
from edgar.documents.utils.anchor_cache import filter_with_cached_patterns
# Remove excessive whitespace
text = re.sub(r'\n\s*\n\s*\n', '\n\n', text)
# Filter navigation links
html_content = getattr(self.document.metadata, 'original_html', None)
if html_content:
text = filter_with_cached_patterns(text, html_content)
return text.strip()
def _extract_section_fallback(self, section_name: str, clean: bool) -> Optional[str]:
"""
Fallback section extraction using document nodes.
This is used when HTML-based extraction fails.
"""
# Search through document sections
for name, section in self.document.sections.items():
if section_name.lower() in name.lower():
return section.text(clean=clean)
return None
def get_section_info(self, section_name: str) -> Optional[Dict]:
    """
    Get detailed information about a section.

    Args:
        section_name: Section name to look up (normalized internally)

    Returns:
        Dict with section metadata, or None when the section is unknown
    """
    key = self._normalize_section_name(section_name)
    boundary = self.section_boundaries.get(key)
    if boundary is None:
        return None
    return {
        'name': boundary.name,
        'anchor_id': boundary.anchor_id,
        'available': True,
        # Length is not computed up front; callers extract text on demand.
        'estimated_length': None,
        'subsections': self._get_subsections(key)
    }
def _get_subsections(self, parent_section: str) -> List[str]:
"""
Get subsections of a parent section.
For example:
- "Item 1" has subsections "Item 1A", "Item 1B" (valid)
- "Item 1" does NOT have subsection "Item 10" (invalid - different item)
"""
subsections = []
# Look for sections that start with the parent name
for section_name in self.section_boundaries:
if section_name == parent_section:
continue
if section_name.startswith(parent_section):
# Check if this is a true subsection (e.g., Item 1A)
# vs a different section that happens to start with same prefix (e.g., Item 10)
remainder = section_name[len(parent_section):]
# Valid subsection patterns:
# - "Item 1A" (remainder: "A") - letter suffix
# - "Item 1 - Business" (remainder: " - Business") - has separator
# Invalid patterns:
# - "Item 10" (remainder: "0") - digit continues the number
if remainder and remainder[0].isalpha():
# Letter suffix like "A", "B" - valid subsection
subsections.append(section_name)
elif remainder and remainder[0] in [' ', '-', '.', ':']:
# Has separator - could be descriptive title
subsections.append(section_name)
# If remainder starts with digit, it's NOT a subsection (e.g., "Item 10")
return sorted(subsections)

View File

@@ -0,0 +1,318 @@
"""
Migration and compatibility layer for transitioning from old parser to new.
NOTE: This compatibility layer is documented for user migration from v1.x → v2.0
It is intentionally not used internally but kept for user convenience.
Do not remove without versioning consideration.
"""
from typing import Optional, List, Dict, Any
import warnings
from edgar.documents import HTMLParser, Document, ParserConfig
from edgar.documents.search import DocumentSearch
class LegacyHTMLDocument:
    """
    Compatibility wrapper that mimics the old Document API.

    This allows existing code to work with the new parser
    while providing deprecation warnings.
    """

    def __init__(self, new_document: Document):
        """Initialize with new document."""
        self._doc = new_document
        # Set to False to silence DeprecationWarnings (e.g. in bulk runs).
        self._warn_on_use = True

    def _deprecation_warning(self, old_method: str, new_method: str = None):
        """Issue deprecation warning pointing at the replacement API."""
        if self._warn_on_use:
            msg = f"Document.{old_method} is deprecated."
            if new_method:
                msg += f" Use {new_method} instead."
            # stacklevel=3: skip this helper and the wrapping property/method
            # so the warning points at the caller's code.
            warnings.warn(msg, DeprecationWarning, stacklevel=3)

    @property
    def text(self) -> str:
        """Get document text (old API)."""
        self._deprecation_warning("text", "Document.text()")
        return self._doc.text()

    def get_text(self, clean: bool = True) -> str:
        """Get text with options (old API).

        NOTE: `clean` is accepted for signature compatibility only; the
        new Document.text() applies its own cleaning and the flag is ignored.
        """
        self._deprecation_warning("get_text()", "Document.text()")
        return self._doc.text()

    @property
    def tables(self) -> List[Any]:
        """Get tables (old API)."""
        self._deprecation_warning("tables", "Document.tables")
        return self._doc.tables

    def find_all(self, tag: str) -> List[Any]:
        """Find elements by tag (old API).

        NOTE: all heading tags (h1-h3) map to the single HEADING node
        type, so find_all('h1') returns headings of every level.
        """
        self._deprecation_warning("find_all()", "Document.root.find()")
        # Map old tag names to node types
        from edgar.documents.types import NodeType
        tag_map = {
            'h1': NodeType.HEADING,
            'h2': NodeType.HEADING,
            'h3': NodeType.HEADING,
            'p': NodeType.PARAGRAPH,
            'table': NodeType.TABLE,
        }
        node_type = tag_map.get(tag.lower())
        if node_type:
            return self._doc.root.find(lambda n: n.type == node_type)
        return []

    def search(self, pattern: str) -> List[str]:
        """Search document (old API)."""
        self._deprecation_warning("search()", "DocumentSearch.search()")
        search = DocumentSearch(self._doc)
        results = search.search(pattern)
        return [r.text for r in results]

    @property
    def sections(self) -> Dict[str, Any]:
        """Get sections (old API), converted to the old dict format."""
        # Fixed: this was the only old-API member that issued no
        # deprecation warning, inconsistent with the rest of the wrapper.
        self._deprecation_warning("sections", "Document.sections")
        new_sections = self._doc.sections
        old_sections = {}
        for name, section in new_sections.items():
            old_sections[name] = {
                'title': section.title,
                'text': section.text(),
                'start': section.start_offset,
                'end': section.end_offset
            }
        return old_sections

    def to_markdown(self) -> str:
        """Convert to markdown (old API)."""
        self._deprecation_warning("to_markdown()", "MarkdownRenderer.render()")
        from edgar.documents.renderers import MarkdownRenderer
        renderer = MarkdownRenderer()
        return renderer.render(self._doc)
class LegacySECHTMLParser:
    """
    Compatibility wrapper for old SECHTMLParser.

    Translates old-style dict configuration and delegates all parsing to
    the new HTMLParser, wrapping results in LegacyHTMLDocument.
    """

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """Initialize with optional old-style config dict."""
        self._parser = HTMLParser(self._convert_config(config))
        self._warn_on_use = True

    def _convert_config(self, old_config: Optional[Dict[str, Any]]) -> ParserConfig:
        """Translate an old config dict into a ParserConfig."""
        converted = ParserConfig()
        if old_config:
            # Each old key, when present, overrides the matching new attribute.
            key_map = {
                'clean_text': 'clean_text',
                'extract_tables': 'table_extraction',
                'preserve_layout': 'preserve_whitespace',
            }
            for old_key, new_attr in key_map.items():
                if old_key in old_config:
                    setattr(converted, new_attr, old_config[old_key])
        return converted

    def parse(self, html: str) -> LegacyHTMLDocument:
        """Parse HTML (old API)."""
        if self._warn_on_use:
            warnings.warn(
                "SECHTMLParser is deprecated. Use HTMLParser instead.",
                DeprecationWarning,
                stacklevel=2
            )
        return LegacyHTMLDocument(self._parser.parse(html))

    def parse_file(self, filepath: str) -> LegacyHTMLDocument:
        """Parse HTML file (old API)."""
        if self._warn_on_use:
            warnings.warn(
                "SECHTMLParser.parse_file() is deprecated. Use HTMLParser.parse_file() instead.",
                DeprecationWarning,
                stacklevel=2
            )
        return LegacyHTMLDocument(self._parser.parse_file(filepath))
def migrate_parser_usage(code: str) -> str:
    """
    Rewrite code that targets the old parser API to the new API.

    Performs ordered, plain-text substitutions (imports, class names,
    method calls, config keys). The rule order matters: earlier rules
    must not produce text that later rules would mangle.

    Args:
        code: Python code using old parser

    Returns:
        Updated code using new parser
    """
    rules = (
        # Import statements
        ("from edgar.files.html import SECHTMLParser",
         "from edgar.documents import HTMLParser"),
        ("from edgar.files.html import Document",
         "from edgar.documents import Document"),
        # Class instantiation
        ("SECHTMLParser(", "HTMLParser("),
        # Method calls
        ("document.text", "document.text()"),
        ("document.get_text(", "document.text("),
        ("document.find_all(", "document.root.find(lambda n: n.tag == "),
        ("document.to_markdown(", "MarkdownRenderer().render(document"),
        # Config changes
        ("extract_tables=", "table_extraction="),
        ("preserve_layout=", "preserve_whitespace="),
    )
    result = code
    for old_text, new_text in rules:
        result = result.replace(old_text, new_text)
    return result
class MigrationGuide:
    """
    Provides migration guidance and utilities.
    """

    @staticmethod
    def check_compatibility(old_parser_instance) -> Dict[str, Any]:
        """
        Check if old parser instance can be migrated.

        NOTE(review): `old_parser_instance` is currently unused — the
        result is unconditionally "can migrate"; inspect the instance
        here if real compatibility checks are ever needed.

        Returns:
            Dict with compatibility info
        """
        return {
            'can_migrate': True,
            'warnings': [],
            'recommendations': [
                "Replace SECHTMLParser with HTMLParser",
                "Update document.text to document.text()",
                "Use DocumentSearch for search functionality",
                "Use MarkdownRenderer for markdown conversion"
            ]
        }

    @staticmethod
    def print_migration_guide():
        """Print the full old-API -> new-API migration guide to stdout."""
        guide = """
HTML Parser Migration Guide
==========================
The new HTML parser provides significant improvements:
- 10x performance improvement
- Better table parsing
- Reliable section detection
- Advanced search capabilities
Key Changes:
-----------
1. Imports:
OLD: from edgar.files.html import SECHTMLParser, Document
NEW: from edgar.documents import HTMLParser, Document
2. Parser Creation:
OLD: parser = SECHTMLParser()
NEW: parser = HTMLParser()
3. Document Text:
OLD: document.text or document.get_text()
NEW: document.text()
4. Search:
OLD: document.search(pattern)
NEW: search = DocumentSearch(document)
results = search.search(pattern)
5. Tables:
OLD: document.tables
NEW: document.tables (same, but returns richer TableNode objects)
6. Sections:
OLD: document.sections
NEW: document.sections (returns Section objects with more features)
7. Markdown:
OLD: document.to_markdown()
NEW: renderer = MarkdownRenderer()
markdown = renderer.render(document)
Compatibility:
-------------
For gradual migration, use the compatibility layer:
from edgar.documents.migration import LegacySECHTMLParser
parser = LegacySECHTMLParser() # Works like old parser
This will issue deprecation warnings to help you migrate.
Performance Config:
------------------
For best performance:
parser = HTMLParser.create_for_performance()
For best accuracy:
parser = HTMLParser.create_for_accuracy()
For AI/LLM processing:
parser = HTMLParser.create_for_ai()
"""
        print(guide)
# Compatibility aliases
# SECHTMLParser is deliberately NOT bound at module level: an eager alias
# would shadow the module-level __getattr__ hook (PEP 562), which only runs
# when normal attribute lookup fails, and whose purpose is to emit a
# DeprecationWarning when SECHTMLParser is imported from this module.
# Access still works: `from edgar.documents.migration import SECHTMLParser`
# resolves through __getattr__ and now actually warns.
HTMLDocument = LegacyHTMLDocument
# Auto-migration for common imports
def __getattr__(name):
    """PEP 562 module hook: serve deprecated names with a warning."""
    if name != "SECHTMLParser":
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
    warnings.warn(
        "Importing SECHTMLParser from edgar.documents.migration is deprecated. "
        "Use HTMLParser from edgar.documents instead.",
        DeprecationWarning,
        stacklevel=2
    )
    return LegacySECHTMLParser

View File

@@ -0,0 +1,160 @@
"""
Example showing how to migrate from old parser to new.
"""
def old_parser_example():
    """Example using old parser API.

    Parses a small filing through the compatibility layer; each old-API
    call below triggers a DeprecationWarning. Return values are discarded
    on purpose — this function only demonstrates the call shapes.
    """
    # This is how code might look with the old parser
    from edgar.documents.migration import SECHTMLParser  # Using compatibility layer

    # Create parser (old-style dict config)
    parser = SECHTMLParser({
        'extract_tables': True,
        'clean_text': True,
        'preserve_layout': False
    })

    # Parse HTML
    html = """
<html>
<body>
<h1>Item 1. Business</h1>
<p>We are a technology company.</p>
<table>
<tr><th>Year</th><th>Revenue</th></tr>
<tr><td>2023</td><td>$100M</td></tr>
</table>
</body>
</html>
"""
    document = parser.parse(html)

    # Old API usage (will show deprecation warnings)
    # Search
    document.search("revenue")

    # Convert to markdown
    document.to_markdown()
def new_parser_example():
    """Example using new parser API.

    Mirrors old_parser_example() with the v2 API directly; results are
    discarded — the point is the call shapes plus the v2-only features
    shown at the end.
    """
    # New imports
    from edgar.documents import DocumentSearch, HTMLParser, ParserConfig
    from edgar.documents.renderers import MarkdownRenderer

    # Create parser with new config
    config = ParserConfig(
        table_extraction=True,
        clean_text=True,
        preserve_whitespace=False,
        detect_sections=True
    )
    parser = HTMLParser(config)

    # Parse HTML
    html = """
<html>
<body>
<h1>Item 1. Business</h1>
<p>We are a technology company.</p>
<table>
<tr><th>Year</th><th>Revenue</th></tr>
<tr><td>2023</td><td>$100M</td></tr>
</table>
</body>
</html>
"""
    document = parser.parse(html)

    # New API usage
    # Search with new API
    search = DocumentSearch(document)
    search.search("revenue")

    # Convert to markdown with new API
    renderer = MarkdownRenderer()
    renderer.render(document)

    # New features not available in old parser
    # Advanced search
    search.find_tables(caption_pattern="Revenue")

    # Performance-optimized parser
    HTMLParser.create_for_performance()

    # Cache statistics
    from edgar.documents.utils import get_cache_manager
    get_cache_manager().get_stats()
def migration_comparison():
    """Show side-by-side comparison.

    NOTE(review): placeholder — the body was never implemented, so
    calling it is a no-op. Flesh it out or drop the call from __main__.
    """
def automatic_migration_example():
    """Show automatic code migration.

    Runs migrate_parser_usage() over a snippet of old-API code; the
    transformed source is discarded (demonstration only).
    """
    from edgar.documents.migration import migrate_parser_usage

    # Old-API code sample fed to the text-based migrator.
    old_code = '''
from edgar.files.html import SECHTMLParser, Document
def analyze_filing(html):
parser = SECHTMLParser({'extract_tables': True})
document = parser.parse(html)
# Get text
text = document.text
# Search for revenue
revenue_mentions = document.search("revenue")
# Convert to markdown
markdown = document.to_markdown()
return {
'text': text,
'revenue_mentions': revenue_mentions,
'markdown': markdown
}
'''
    migrate_parser_usage(old_code)
if __name__ == "__main__":
    # Run all examples in sequence as a smoke test of the migration layer.
    import warnings

    # Surface every DeprecationWarning the compatibility layer emits.
    warnings.filterwarnings('always', category=DeprecationWarning)

    # Run old parser example (will show warnings)
    old_parser_example()

    # Run new parser example
    new_parser_example()

    # Show comparison
    migration_comparison()

    # Show automatic migration
    automatic_migration_example()

    # Print full migration guide
    from edgar.documents.migration import MigrationGuide
    MigrationGuide.print_migration_guide()

View File

@@ -0,0 +1,456 @@
"""
Node hierarchy for the document tree.
"""
import uuid
from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from typing import List, Optional, Dict, Any, Callable, Iterator
from edgar.documents.types import NodeType, SemanticType, Style
from edgar.documents.cache_mixin import CacheableMixin
@dataclass
class Node(ABC):
    """
    Base node class for document tree.

    All nodes in the document inherit from this class and implement
    the abstract methods for text and HTML generation.
    """
    # Identity
    id: str = field(default_factory=lambda: str(uuid.uuid4()))
    type: NodeType = NodeType.DOCUMENT

    # Hierarchy (excluded from repr to avoid recursively huge output)
    parent: Optional['Node'] = field(default=None, repr=False)
    children: List['Node'] = field(default_factory=list, repr=False)

    # Content
    content: Any = None
    metadata: Dict[str, Any] = field(default_factory=dict)
    style: Style = field(default_factory=Style)

    # Semantic info
    semantic_type: Optional[SemanticType] = None
    semantic_role: Optional[str] = None

    def add_child(self, child: 'Node') -> None:
        """Add child node, maintaining parent reference."""
        child.parent = self
        self.children.append(child)

    def remove_child(self, child: 'Node') -> None:
        """Remove child node and clear its parent link (no-op if absent)."""
        if child in self.children:
            self.children.remove(child)
            child.parent = None

    def insert_child(self, index: int, child: 'Node') -> None:
        """Insert child at specific index, maintaining parent reference."""
        child.parent = self
        self.children.insert(index, child)

    @abstractmethod
    def text(self) -> str:
        """Extract text content from node and its children."""
        pass

    @abstractmethod
    def html(self) -> str:
        """Generate HTML representation of node."""
        pass

    def find(self, predicate: Callable[['Node'], bool]) -> List['Node']:
        """Find all nodes (including self) matching predicate, depth-first."""
        results = []
        if predicate(self):
            results.append(self)
        for child in self.children:
            results.extend(child.find(predicate))
        return results

    def find_first(self, predicate: Callable[['Node'], bool]) -> Optional['Node']:
        """Find first node (including self) matching predicate, depth-first."""
        if predicate(self):
            return self
        for child in self.children:
            result = child.find_first(predicate)
            if result:
                return result
        return None

    def xpath(self, expression: str) -> List['Node']:
        """
        Simple XPath-like node selection.

        Supports:
        - //node_type - all matching descendants (including self)
        - /node_type - direct children of type

        NOTE: attribute matching ([@attr=value]) is NOT implemented;
        any unrecognized expression returns an empty list.
        """
        # Simple implementation - can be extended
        if expression.startswith('//'):
            node_type = expression[2:].lower()
            return self.find(lambda n: n.type.name.lower() == node_type)
        elif expression.startswith('/'):
            node_type = expression[1:].lower()
            return [c for c in self.children if c.type.name.lower() == node_type]
        return []

    def walk(self) -> Iterator['Node']:
        """Walk the tree depth-first, yielding self before descendants."""
        yield self
        for child in self.children:
            yield from child.walk()

    @property
    def depth(self) -> int:
        """Get depth of node in tree (root has depth 0)."""
        depth = 0
        current = self.parent
        while current:
            depth += 1
            current = current.parent
        return depth

    @property
    def path(self) -> str:
        """Get '/'-joined node-type names from root down to this node."""
        parts = []
        current = self
        while current:
            parts.append(current.type.name)
            current = current.parent
        return '/'.join(reversed(parts))

    def get_metadata(self, key: str, default: Any = None) -> Any:
        """Get metadata value with default."""
        return self.metadata.get(key, default)

    def set_metadata(self, key: str, value: Any) -> None:
        """Set metadata value."""
        self.metadata[key] = value

    def has_metadata(self, key: str) -> bool:
        """Check if metadata key exists."""
        return key in self.metadata
@dataclass
class DocumentNode(Node, CacheableMixin):
    """Root document node."""
    type: NodeType = field(default=NodeType.DOCUMENT, init=False)

    def text(self) -> str:
        """Extract all text from document with caching.

        Top-level children are separated by a blank line; children that
        produce no text are skipped. The result is memoized via
        CacheableMixin._get_cached_text.
        """
        def _generate_text():
            parts = []
            for child in self.children:
                text = child.text()
                if text:
                    parts.append(text)
            return '\n\n'.join(parts)
        return self._get_cached_text(_generate_text)

    def html(self) -> str:
        """Generate a complete HTML document wrapping the children's HTML."""
        body_content = '\n'.join(child.html() for child in self.children)
        return f"""<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title>Document</title>
</head>
<body>
{body_content}
</body>
</html>"""
@dataclass
class TextNode(Node):
    """Leaf node holding plain text content."""
    type: NodeType = field(default=NodeType.TEXT, init=False)
    content: str = ""

    def text(self) -> str:
        """Return the raw text."""
        return self.content

    def html(self) -> str:
        """Return the text with HTML special characters escaped."""
        # '&' must be escaped first so it does not re-escape '&lt;'/'&gt;'.
        escaped = self.content.replace('&', '&amp;')
        escaped = escaped.replace('<', '&lt;').replace('>', '&gt;')
        return escaped
@dataclass
class ParagraphNode(Node, CacheableMixin):
    """Paragraph node."""
    type: NodeType = field(default=NodeType.PARAGRAPH, init=False)

    def text(self) -> str:
        """Extract paragraph text with intelligent spacing and caching.

        Children are concatenated without separators by default; a single
        space is inserted between adjacent children only when one of the
        heuristics below says the whitespace was semantically meaningful.
        The result is memoized via CacheableMixin._get_cached_text.
        """
        def _generate_text():
            parts = []
            for i, child in enumerate(self.children):
                text = child.text()
                if text:
                    # For the first child, just add the text
                    if i == 0:
                        parts.append(text)
                    else:
                        # For subsequent children, check if previous child had tail whitespace
                        prev_child = self.children[i - 1]
                        should_add_space = False

                        # Add space if previous child had tail whitespace
                        if hasattr(prev_child, 'get_metadata') and prev_child.get_metadata('has_tail_whitespace'):
                            should_add_space = True
                        # Add space if current text starts with space (preserve intended spacing)
                        elif text.startswith(' '):
                            should_add_space = True
                            # Remove the leading space from text since we're adding it as separation
                            text = text.lstrip()
                        # Add space if previous text ends with punctuation (sentence boundaries)
                        elif parts and parts[-1].rstrip()[-1:] in '.!?:;':
                            should_add_space = True
                        # Add space between adjacent inline elements if the current text starts with a letter/digit
                        # This handles cases where whitespace was stripped but spacing is semantically important
                        elif (text and text[0].isalpha() and
                              parts and parts[-1] and not parts[-1].endswith(' ') and
                              hasattr(child, 'get_metadata') and child.get_metadata('original_tag') in ['span', 'a', 'em', 'strong', 'i', 'b']):
                            should_add_space = True

                        if should_add_space:
                            parts.append(' ' + text)
                        else:
                            # Concatenate directly without space
                            if parts:
                                parts[-1] += text
                            else:
                                parts.append(text)
            return ''.join(parts)
        return self._get_cached_text(_generate_text)

    def html(self) -> str:
        """Generate paragraph HTML."""
        content = ''.join(child.html() for child in self.children)
        style_attr = self._generate_style_attr()
        return f'<p{style_attr}>{content}</p>'

    def _generate_style_attr(self) -> str:
        """Generate an inline style attribute from the style object ('' when unstyled)."""
        if not self.style:
            return ''
        styles = []
        if self.style.text_align:
            styles.append(f'text-align: {self.style.text_align}')
        if self.style.margin_top:
            styles.append(f'margin-top: {self.style.margin_top}px')
        if self.style.margin_bottom:
            styles.append(f'margin-bottom: {self.style.margin_bottom}px')
        if styles:
            return f' style="{"; ".join(styles)}"'
        return ''
@dataclass
class HeadingNode(Node):
    """Heading node carrying an outline level (1-6)."""
    type: NodeType = field(default=NodeType.HEADING, init=False)
    level: int = 1

    def text(self) -> str:
        """Return heading text: string content if set, else joined child text."""
        if isinstance(self.content, str):
            return self.content
        return ' '.join(t for t in (child.text() for child in self.children) if t)

    def html(self) -> str:
        """Render as <hN>, clamping the level into the valid 1-6 range."""
        tag_level = min(6, max(1, self.level))
        return f'<h{tag_level}{self._generate_style_attr()}>{self.text()}</h{tag_level}>'

    def _generate_style_attr(self) -> str:
        """Build an inline style attribute from text_align/color ('' when unstyled)."""
        declarations = []
        if self.style.text_align:
            declarations.append(f'text-align: {self.style.text_align}')
        if self.style.color:
            declarations.append(f'color: {self.style.color}')
        return f' style="{"; ".join(declarations)}"' if declarations else ''
@dataclass
class ContainerNode(Node, CacheableMixin):
    """Generic container node (div, section, etc.)."""
    type: NodeType = field(default=NodeType.CONTAINER, init=False)
    tag_name: str = 'div'

    def text(self) -> str:
        """Join non-empty child text blocks with newlines (cached)."""
        def build():
            return '\n'.join(t for t in (child.text() for child in self.children) if t)
        return self._get_cached_text(build)

    def html(self) -> str:
        """Render the container tag with its children's HTML inside."""
        inner = '\n'.join(child.html() for child in self.children)
        css_class = f' class="{self.semantic_role}"' if self.semantic_role else ''
        return f'<{self.tag_name}{self._generate_style_attr()}{css_class}>{inner}</{self.tag_name}>'

    def _generate_style_attr(self) -> str:
        """Build inline style from margins/padding ('' when none apply)."""
        if not self.style:
            return ''
        declarations = []
        if self.style.margin_top:
            declarations.append(f'margin-top: {self.style.margin_top}px')
        if self.style.margin_bottom:
            declarations.append(f'margin-bottom: {self.style.margin_bottom}px')
        if self.style.padding_left:
            declarations.append(f'padding-left: {self.style.padding_left}px')
        return f' style="{"; ".join(declarations)}"' if declarations else ''
@dataclass
class SectionNode(ContainerNode):
    """Document section node (renders as a <section> container)."""
    type: NodeType = field(default=NodeType.SECTION, init=False)
    # Logical name of the filing section (e.g. "Item 1"), when known.
    section_name: Optional[str] = None
    tag_name: str = field(default='section', init=False)

    def __post_init__(self):
        # Mirror the section name into metadata so generic node consumers
        # (which only inspect metadata) can identify the section.
        if self.section_name:
            self.set_metadata('section_name', self.section_name)
@dataclass
class ListNode(Node):
    """List node (ordered or unordered)."""
    type: NodeType = field(default=NodeType.LIST, init=False)
    ordered: bool = False

    def text(self) -> str:
        """Render items one per line; ordered lists get '1. '-style prefixes.

        Note: numbering follows child position, so children yielding no
        text leave a gap in the numbering.
        """
        rendered = []
        for index, item in enumerate(self.children, start=1):
            item_text = item.text()
            if not item_text:
                continue
            prefix = f"{index}. " if self.ordered else ""
            rendered.append(prefix + item_text)
        return '\n'.join(rendered)

    def html(self) -> str:
        """Render as <ol> or <ul> with one line per item."""
        tag = 'ol' if self.ordered else 'ul'
        body = '\n'.join(item.html() for item in self.children)
        return f'<{tag}>\n{body}\n</{tag}>'
@dataclass
class ListItemNode(Node):
    """List item node."""
    type: NodeType = field(default=NodeType.LIST_ITEM, init=False)

    def text(self) -> str:
        """Join non-empty child text with single spaces."""
        return ' '.join(t for t in (child.text() for child in self.children) if t)

    def html(self) -> str:
        """Render as <li> wrapping the concatenated child HTML."""
        return f"<li>{''.join(child.html() for child in self.children)}</li>"
@dataclass
class LinkNode(Node):
    """Hyperlink node."""
    type: NodeType = field(default=NodeType.LINK, init=False)
    href: Optional[str] = None
    title: Optional[str] = None

    def text(self) -> str:
        """Return link text: string content if set, else joined child text."""
        if isinstance(self.content, str):
            return self.content
        return ' '.join(t for t in (child.text() for child in self.children) if t)

    def html(self) -> str:
        """Render an <a> element with optional href/title attributes.

        NOTE(review): href/title are interpolated unescaped; a value
        containing '"' would break the attribute — confirm upstream
        sanitization.
        """
        attrs = ''
        if self.href:
            attrs += f' href="{self.href}"'
        if self.title:
            attrs += f' title="{self.title}"'
        return f'<a{attrs}>{self.text()}</a>'
@dataclass
class ImageNode(Node):
    """Image node."""
    type: NodeType = field(default=NodeType.IMAGE, init=False)
    src: Optional[str] = None
    alt: Optional[str] = None
    width: Optional[int] = None
    height: Optional[int] = None

    def text(self) -> str:
        """Images contribute their alt text (or '') to text extraction."""
        return self.alt or ''

    def html(self) -> str:
        """Render an <img> tag, emitting only the attributes that are set."""
        pieces = ['<img']
        # Attribute order matches the old renderer: src, alt, width, height.
        for attr_name in ('src', 'alt', 'width', 'height'):
            value = getattr(self, attr_name)
            if value:
                pieces.append(f' {attr_name}="{value}"')
        pieces.append('>')
        return ''.join(pieces)

View File

@@ -0,0 +1,387 @@
"""
Main HTML parser implementation.
"""
import time
from typing import List, Union
import lxml.html
from lxml import etree
from lxml.html import HtmlElement
from edgar.documents.config import ParserConfig
from edgar.documents.document import Document, DocumentMetadata
from edgar.documents.exceptions import (
HTMLParsingError, DocumentTooLargeError, InvalidConfigurationError
)
from edgar.documents.nodes import DocumentNode
from edgar.documents.processors.postprocessor import DocumentPostprocessor
from edgar.documents.processors.preprocessor import HTMLPreprocessor
from edgar.documents.strategies.document_builder import DocumentBuilder
from edgar.documents.types import XBRLFact
from edgar.documents.utils import get_cache_manager
from edgar.documents.utils.html_utils import remove_xml_declaration, create_lxml_parser
class HTMLParser:
"""
Main HTML parser class.
Orchestrates the parsing pipeline with configurable strategies
and processors.
"""
def __init__(self, config: ParserConfig = None):
    """
    Initialize parser with configuration.

    Args:
        config: Parser configuration (a default ParserConfig is used
            when omitted)
    """
    self.config = config or ParserConfig()
    self._validate_config()

    # Shared cache plus the fixed pre/post processing stages.
    self.cache_manager = get_cache_manager()
    self.preprocessor = HTMLPreprocessor(self.config)
    self.postprocessor = DocumentPostprocessor(self.config)

    # Optional strategies, switched on by config flags.
    self._init_strategies()
def _validate_config(self):
"""Validate configuration."""
if self.config.max_document_size <= 0:
raise InvalidConfigurationError("max_document_size must be positive")
if self.config.streaming_threshold and self.config.max_document_size:
if self.config.streaming_threshold > self.config.max_document_size:
raise InvalidConfigurationError(
"streaming_threshold cannot exceed max_document_size"
)
def _init_strategies(self):
    """Initialize parsing strategies based on configuration.

    Strategies are imported lazily so disabled features incur no import
    cost. Possible keys in self.strategies: 'header_detection',
    'table_processing', 'xbrl_extraction' — each present only when the
    corresponding config flag is on.
    """
    self.strategies = {}

    # Header detection strategy
    if self.config.detect_sections:
        from edgar.documents.strategies.header_detection import HeaderDetectionStrategy
        self.strategies['header_detection'] = HeaderDetectionStrategy(self.config)

    # Table processing strategy
    if self.config.table_extraction:
        from edgar.documents.strategies.table_processing import TableProcessor
        self.strategies['table_processing'] = TableProcessor(self.config)

    # XBRL extraction strategy
    if self.config.extract_xbrl:
        from edgar.documents.strategies.xbrl_extraction import XBRLExtractor
        self.strategies['xbrl_extraction'] = XBRLExtractor()
def parse(self, html: Union[str, bytes]) -> Document:
    """
    Parse HTML into Document.

    Args:
        html: HTML content as string or bytes

    Returns:
        Parsed Document object

    Raises:
        TypeError: If html is None or not str/bytes
        DocumentTooLargeError: If document exceeds size limit
        HTMLParsingError: If parsing fails
    """
    start_time = time.time()

    # Validate input type
    if html is None:
        raise TypeError("HTML input cannot be None")
    if not isinstance(html, (str, bytes)):
        raise TypeError(f"HTML must be string or bytes, got {type(html).__name__}")

    # Convert bytes to string if needed
    if isinstance(html, bytes):
        html = html.decode('utf-8', errors='replace')

    # Handle empty HTML: short-circuit with an empty document
    if not html.strip():
        root = DocumentNode()
        metadata = DocumentMetadata(
            size=0,
            parse_time=time.time() - start_time,
            parser_version="2.0.0"
        )
        return Document(root=root, metadata=metadata)

    # Enforce size limit on the encoded byte length
    doc_size = len(html.encode('utf-8'))
    if doc_size > self.config.max_document_size:
        raise DocumentTooLargeError(doc_size, self.config.max_document_size)

    # Large-but-allowed documents go through the streaming path
    if doc_size > self.config.streaming_threshold:
        return self._parse_streaming(html)

    try:
        # Store original HTML BEFORE preprocessing (needed for TOC analysis)
        original_html = html

        # Extract XBRL data BEFORE preprocessing (to preserve ix:hidden content)
        xbrl_facts = []
        if self.config.extract_xbrl:
            xbrl_facts = self._extract_xbrl_pre_process(html)

        # Preprocessing (will remove ix:hidden for rendering)
        html = self.preprocessor.process(html)

        # Parse with lxml
        tree = self._parse_html(html)

        # Extract metadata
        metadata = self._extract_metadata(tree, html)
        metadata.preserve_whitespace = self.config.preserve_whitespace

        # Store ORIGINAL unmodified HTML for section extraction (TOC analysis)
        # Must be the raw HTML before preprocessing
        metadata.original_html = original_html

        # Add XBRL facts to metadata if found
        if xbrl_facts:
            metadata.xbrl_data = {'facts': xbrl_facts}

        # Build document
        document = self._build_document(tree, metadata)

        # Store config reference for section extraction
        document._config = self.config

        # Postprocessing
        document = self.postprocessor.process(document)

        # Record parse stats
        document.metadata.parse_time = time.time() - start_time
        document.metadata.size = doc_size

        return document

    except Exception as e:
        if isinstance(e, (DocumentTooLargeError, HTMLParsingError)):
            raise
        # Chain the original exception (`from e`) so the root cause and
        # its traceback are preserved for debugging; previously the chain
        # was dropped.
        raise HTMLParsingError(
            f"Failed to parse HTML: {str(e)}",
            context={'error_type': type(e).__name__}
        ) from e
def _parse_html(self, html: str) -> HtmlElement:
    """Parse an HTML string into an lxml element tree.

    Uses a recovering parser and guarantees an <html> root element,
    wrapping fragment input in <html><body> when necessary.

    Raises:
        HTMLParsingError: If lxml fails even in recovery mode (chained
            to the underlying lxml exception).
    """
    try:
        # lxml.html.fromstring rejects XML declarations; strip them first
        html = remove_xml_declaration(html)

        parser = create_lxml_parser(
            remove_blank_text=not self.config.preserve_whitespace,
            remove_comments=True,
            recover=True,
            encoding='utf-8'
        )

        # Parse HTML
        tree = lxml.html.fromstring(html, parser=parser)

        # Ensure we have a proper document structure
        if tree.tag != 'html':
            # Wrap fragment in html/body if needed
            html_tree = lxml.html.Element('html')
            body = etree.SubElement(html_tree, 'body')
            body.append(tree)
            tree = html_tree

        return tree

    except Exception as e:
        # Chain the cause (`from e`) so lxml's error details are not lost;
        # previously the chain was dropped.
        raise HTMLParsingError(
            f"lxml parsing failed: {str(e)}",
            context={'parser': 'lxml.html'}
        ) from e
def _extract_metadata(self, tree: HtmlElement, html: str) -> DocumentMetadata:
    """Populate DocumentMetadata from meta tags, <title> and raw content."""
    metadata = DocumentMetadata()

    # Filing type from config wins (avoids expensive detection below)
    if self.config.form:
        metadata.form = self.config.form

    # Meta tags: map known names onto metadata attributes.
    meta_attr_map = {
        'company': 'company',
        'filing-type': 'form',
        'cik': 'cik',
        'filing-date': 'filing_date',
        'accession-number': 'accession_number',
    }
    for meta in tree.xpath('//meta'):
        attr = meta_attr_map.get(meta.get('name', '').lower())
        if attr:
            setattr(metadata, attr, meta.get('content', ''))

    # <title> like "APPLE INC - 10-K - 2023-09-30" fills remaining gaps.
    title_elem = tree.find('.//title')
    if title_elem is not None and title_elem.text:
        title_parts = title_elem.text.strip().split(' - ')
        if len(title_parts) >= 2:
            metadata.company = metadata.company or title_parts[0].strip()
            metadata.form = metadata.form or title_parts[1].strip()

    # Last resort: scan the first 1000 chars for a known form type.
    if not metadata.form:
        text_start = html[:1000].upper()
        for form_type in ['10-K', '10-Q', '8-K', 'DEF 14A', 'S-1']:
            if form_type in text_start:
                metadata.form = form_type
                break

    return metadata
def _build_document(self, tree: HtmlElement, metadata: DocumentMetadata) -> Document:
    """Construct a Document by running the configured builder over the lxml tree."""
    builder = DocumentBuilder(self.config, self.strategies)
    return Document(root=builder.build(tree), metadata=metadata)
def _parse_streaming(self, html: str) -> Document:
    """Delegate very large documents to the incremental streaming parser."""
    from edgar.documents.utils.streaming import StreamingParser
    return StreamingParser(self.config, self.strategies).parse(html)
def _extract_xbrl_pre_process(self, html: str) -> List[XBRLFact]:
    """
    Extract XBRL facts before preprocessing.

    This ensures we capture XBRL data from ix:hidden elements, which the
    preprocessor strips out before the main parse.

    Returns:
        List of extracted XBRLFact objects; empty list on any failure
        (extraction is best-effort and must not break parsing).
    """
    try:
        # Parse HTML without preprocessing to preserve all XBRL content
        # (comments and blank text are kept, unlike the main parse)
        parser = create_lxml_parser(
            remove_blank_text=False,
            remove_comments=False,
            recover=True,
            encoding='utf-8'
        )
        # Remove XML declaration if present
        html = remove_xml_declaration(html)
        tree = lxml.html.fromstring(html, parser=parser)
        # Use XBRL extractor (imported lazily to avoid import cycles)
        from edgar.documents.strategies.xbrl_extraction import XBRLExtractor
        extractor = XBRLExtractor()
        facts = []
        # Find all XBRL elements (including those in ix:hidden)
        # Simple approach: find all elements with ix: prefix
        for element in tree.iter():
            if element.tag and isinstance(element.tag, str) and 'ix:' in element.tag.lower():
                # Skip container elements; only fact-bearing local names are kept
                local_name = element.tag.split(':')[-1].lower() if ':' in element.tag else element.tag.lower()
                if local_name in ['nonnumeric', 'nonfraction', 'continuation', 'footnote', 'fraction']:
                    fact = extractor.extract_fact(element)
                    if fact:
                        # Mark if fact was in hidden section or header by
                        # walking ancestors until an ix:hidden/ix:header wrapper
                        # is found (both are flagged with the same 'hidden' key)
                        parent = element.getparent()
                        while parent is not None:
                            if parent.tag:
                                tag_lower = parent.tag.lower()
                                if 'ix:hidden' in tag_lower or 'ix:header' in tag_lower:
                                    fact.metadata = fact.metadata or {}
                                    fact.metadata['hidden'] = True
                                    break
                            parent = parent.getparent()
                        facts.append(fact)
        return facts
    except Exception as e:
        # Log error but don't fail parsing — XBRL data is supplementary
        import logging
        logging.warning(f"Failed to extract XBRL data: {e}")
        return []
def parse_file(self, file_path: str) -> Document:
    """
    Parse HTML from a file on disk.

    Args:
        file_path: Path to HTML file

    Returns:
        Parsed Document object; metadata.source records the path
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        content = handle.read()
    parsed = self.parse(content)
    parsed.metadata.source = file_path
    return parsed
def parse_url(self, url: str) -> Document:
    """
    Fetch a URL and parse its HTML content.

    Args:
        url: URL to fetch and parse

    Returns:
        Parsed Document object; metadata.url records the origin
    """
    import requests

    resp = requests.get(url, timeout=30)
    resp.raise_for_status()

    parsed = self.parse(resp.text)
    parsed.metadata.url = url
    return parsed
@classmethod
def create_for_performance(cls) -> 'HTMLParser':
    """Build a parser configured for maximum throughput."""
    return cls(ParserConfig.for_performance())
@classmethod
def create_for_accuracy(cls) -> 'HTMLParser':
    """Build a parser configured for maximum parsing fidelity."""
    return cls(ParserConfig.for_accuracy())
@classmethod
def create_for_ai(cls) -> 'HTMLParser':
    """Build a parser configured for downstream AI processing."""
    return cls(ParserConfig.for_ai())

View File

@@ -0,0 +1,11 @@
"""
Document processors for preprocessing and postprocessing.
"""
from edgar.documents.processors.preprocessor import HTMLPreprocessor
from edgar.documents.processors.postprocessor import DocumentPostprocessor
__all__ = [
'HTMLPreprocessor',
'DocumentPostprocessor'
]

View File

@@ -0,0 +1,283 @@
"""
Document postprocessor for final processing after parsing.
"""
from typing import List, Set
from edgar.documents.config import ParserConfig
from edgar.documents.document import Document
from edgar.documents.nodes import Node, TextNode, ParagraphNode, HeadingNode
from edgar.documents.types import NodeType
class DocumentPostprocessor:
    """
    Postprocesses parsed documents to improve quality.

    Handles:
    - Adjacent node merging
    - Empty node removal
    - Heading level normalization
    - Section detection enhancement
    - Metadata enrichment
    """

    def __init__(self, config: ParserConfig):
        """Initialize postprocessor with configuration."""
        self.config = config

    def process(self, document: Document) -> Document:
        """
        Postprocess document in place.

        Args:
            document: Parsed document

        Returns:
            Processed document (same object, mutated)
        """
        # Remove empty nodes first so later passes see a clean tree
        self._remove_empty_nodes(document.root)
        # Merge adjacent text nodes if configured
        if self.config.merge_adjacent_nodes:
            self._merge_adjacent_nodes(document.root)
        # Normalize heading levels
        self._normalize_heading_levels(document.root)
        # Enhance section detection if configured
        if self.config.detect_sections:
            self._enhance_sections(document)
        # Add document statistics
        self._add_statistics(document)
        # Validate document structure
        self._validate_structure(document)
        return document

    def _remove_empty_nodes(self, node: Node):
        """Remove empty nodes from tree (bottom-up, so containers emptied by
        the recursion are themselves removed by their parent)."""
        # Process children first (bottom-up)
        children_to_remove = []
        for child in node.children:
            self._remove_empty_nodes(child)
            # Check if child is empty
            if self._is_empty_node(child):
                children_to_remove.append(child)
        # Remove empty children (outside the iteration loop)
        for child in children_to_remove:
            node.remove_child(child)

    def _is_empty_node(self, node: Node) -> bool:
        """Check if node is empty and can be removed."""
        # Never remove table nodes
        if node.type == NodeType.TABLE:
            return False
        # Never remove nodes with metadata
        if node.metadata:
            return False
        # Check text nodes
        if isinstance(node, TextNode):
            return not node.text().strip()
        # Check other nodes with text content
        if hasattr(node, 'content') and isinstance(node.content, str):
            return not node.content.strip()
        # Check container nodes
        if not node.children:
            # Empty container with no children
            return True
        return False

    def _merge_adjacent_nodes(self, node: Node):
        """Merge adjacent text nodes with similar properties."""
        if not node.children:
            return
        # Process children first
        for child in node.children:
            self._merge_adjacent_nodes(child)
        # Merge adjacent text nodes
        merged_children = []
        i = 0
        while i < len(node.children):
            current = node.children[i]
            # Look for mergeable nodes
            if self._can_merge(current):
                # Collect all adjacent mergeable nodes
                # (each candidate is compared against the group's first node)
                merge_group = [current]
                j = i + 1
                while j < len(node.children) and self._can_merge_with(current, node.children[j]):
                    merge_group.append(node.children[j])
                    j += 1
                # Merge if we have multiple nodes
                if len(merge_group) > 1:
                    merged = self._merge_nodes(merge_group)
                    merged_children.append(merged)
                    i = j
                else:
                    merged_children.append(current)
                    i += 1
            else:
                merged_children.append(current)
                i += 1
        # Update children
        node.children = merged_children
        # Update parent references
        for child in node.children:
            child.parent = node

    def _can_merge(self, node: Node) -> bool:
        """Check if node can be merged."""
        # Only merge TextNodes, not ParagraphNodes
        return isinstance(node, TextNode) and not node.metadata

    def _can_merge_with(self, node1: Node, node2: Node) -> bool:
        """Check if two nodes can be merged."""
        # Must be same type
        if type(node1) != type(node2):
            return False
        # Must have compatible styles
        if not self._compatible_styles(node1.style, node2.style):
            return False
        # Must not have metadata
        if node1.metadata or node2.metadata:
            return False
        return True

    def _compatible_styles(self, style1, style2) -> bool:
        """Check if two styles are compatible for merging."""
        # For now, just check key properties
        return (
            style1.font_size == style2.font_size and
            style1.font_weight == style2.font_weight and
            style1.text_align == style2.text_align
        )

    def _merge_nodes(self, nodes: List[Node]) -> Node:
        """Merge multiple nodes into one (the first node absorbs the rest)."""
        if not nodes:
            return None
        # Use first node as base
        merged = nodes[0]
        # Merge content
        if isinstance(merged, TextNode):
            texts = [n.text() for n in nodes]
            merged.content = '\n'.join(texts)
        elif isinstance(merged, ParagraphNode):
            # Merge all children
            for node in nodes[1:]:
                merged.children.extend(node.children)
        return merged

    def _normalize_heading_levels(self, node: Node):
        """Normalize heading levels to ensure proper hierarchy."""
        # Collect all headings
        headings = []
        self._collect_headings(node, headings)
        if not headings:
            return
        # Analyze heading structure
        levels_used = set(h.level for h in headings)
        # If we're missing level 1, promote headings so the smallest
        # level in use becomes 1 and relative ordering is preserved
        if 1 not in levels_used and levels_used:
            min_level = min(levels_used)
            adjustment = min_level - 1
            for heading in headings:
                heading.level = max(1, heading.level - adjustment)

    def _collect_headings(self, node: Node, headings: List[HeadingNode]):
        """Collect all heading nodes (depth-first) into `headings`."""
        if isinstance(node, HeadingNode):
            headings.append(node)
        for child in node.children:
            self._collect_headings(child, headings)

    def _enhance_sections(self, document: Document):
        """Enhance section detection and metadata."""
        # Only extract sections eagerly if configured to do so
        if not self.config.eager_section_extraction:
            return
        # Force section extraction to populate cache
        _ = document.sections
        # Add section metadata to nodes
        for section_name, section in document.sections.items():
            # Add section name to all nodes in section
            for node in section.node.walk():
                node.set_metadata('section', section_name)

    def _add_statistics(self, document: Document):
        """Add document statistics to metadata."""
        stats = {
            'node_count': sum(1 for _ in document.root.walk()),
            'text_length': len(document.text()),
            'table_count': len(document.tables),
            'heading_count': len(document.headings),
        }
        # Only add section count if sections were extracted
        # (accessing document.sections would otherwise trigger extraction)
        if self.config.eager_section_extraction:
            stats['section_count'] = len(document.sections)
        document.metadata.statistics = stats

    def _validate_structure(self, document: Document):
        """Validate document structure and fix issues."""
        issues = []
        # Check for orphaned nodes
        for node in document.root.walk():
            if node != document.root and node.parent is None:
                issues.append(f"Orphaned node: {node.type}")
                # Fix by adding to root
                document.root.add_child(node)
        # Check for circular references
        visited = set()

        def check_cycles(node: Node, path: Set[str]):
            # `path` holds ids on the current root-to-node path;
            # `visited` prevents re-walking shared subtrees
            if node.id in path:
                issues.append(f"Circular reference detected: {node.type}")
                return
            path.add(node.id)
            visited.add(node.id)
            for child in node.children:
                if child.id not in visited:
                    check_cycles(child, path.copy())

        check_cycles(document.root, set())
        # Store validation results
        if issues:
            document.metadata.validation_issues = issues

View File

@@ -0,0 +1,242 @@
"""
HTML preprocessor for cleaning and normalizing HTML before parsing.
"""
import re
from edgar.documents.config import ParserConfig
from edgar.documents.utils.html_utils import remove_xml_declaration
class HTMLPreprocessor:
    """
    Preprocesses HTML to fix common issues and normalize content.

    Handles:
    - Character encoding issues (Windows-1252 stragglers, control chars)
    - Malformed HTML (unclosed void tags, nested paragraphs)
    - Excessive whitespace
    - Script/style/comment and ix:hidden/ix:header removal
    - Entity normalization
    """

    def __init__(self, config: 'ParserConfig'):
        """Initialize preprocessor with configuration."""
        self.config = config
        # Pre-compile regex patterns for performance
        self._compiled_patterns = self._compile_patterns()

    def _compile_patterns(self):
        """Pre-compile frequently used regex patterns."""
        return {
            # Encoding and cleanup
            'control_chars': re.compile(r'[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]'),
            # Script/style removal
            'script_tags': re.compile(r'<script[^>]*>.*?</script>', re.IGNORECASE | re.DOTALL),
            'style_tags': re.compile(r'<style[^>]*>.*?</style>', re.IGNORECASE | re.DOTALL),
            'link_tags': re.compile(r'<link[^>]*>', re.IGNORECASE),
            'comments': re.compile(r'<!--.*?-->', re.DOTALL),
            'ix_hidden': re.compile(r'<ix:hidden[^>]*>.*?</ix:hidden>', re.IGNORECASE | re.DOTALL),
            'ix_header': re.compile(r'<ix:header[^>]*>.*?</ix:header>', re.IGNORECASE | re.DOTALL),
            # Malformed tags
            'br_tags': re.compile(r'<br(?![^>]*/)>', re.IGNORECASE),
            'img_tags': re.compile(r'<img([^>]+)(?<!/)>', re.IGNORECASE),
            'input_tags': re.compile(r'<input([^>]+)(?<!/)>', re.IGNORECASE),
            'hr_tags': re.compile(r'<hr(?![^>]*/)>', re.IGNORECASE),
            'nested_p_open': re.compile(r'<p>\s*<p>', re.IGNORECASE),
            'nested_p_close': re.compile(r'</p>\s*</p>', re.IGNORECASE),
            # Whitespace normalization
            'multiple_spaces': re.compile(r'[ \t]+'),
            'multiple_newlines': re.compile(r'\n{3,}'),
            'spaces_around_tags': re.compile(r'\s*(<[^>]+>)\s*'),
            # Block element newlines - combined pattern for opening tags
            'block_open_tags': re.compile(
                r'(<(?:div|p|h[1-6]|table|tr|ul|ol|li|blockquote)[^>]*>)',
                re.IGNORECASE
            ),
            # Block element newlines - combined pattern for closing tags
            'block_close_tags': re.compile(
                r'(</(?:div|p|h[1-6]|table|tr|ul|ol|li|blockquote)>)',
                re.IGNORECASE
            ),
            # Empty tags removal - combined pattern for all removable tags
            'empty_tags': re.compile(
                r'<(?:span|div|p|font|b|i|u|strong|em)\b[^>]*>\s*</(?:span|div|p|font|b|i|u|strong|em)>',
                re.IGNORECASE
            ),
            'empty_self_closing': re.compile(
                r'<(?:span|div|p|font|b|i|u|strong|em)\b[^>]*/>\s*',
                re.IGNORECASE
            ),
            # Common issues
            'multiple_br': re.compile(r'(<br\s*/?>[\s\n]*){3,}', re.IGNORECASE),
            'space_before_punct': re.compile(r'\s+([.,;!?])'),
            'missing_space_after_punct': re.compile(r'([.,;!?])([A-Z])'),
        }

    def process(self, html: str) -> str:
        """
        Preprocess HTML content.

        Args:
            html: Raw HTML content

        Returns:
            Cleaned HTML ready for parsing
        """
        # Remove BOM if present
        if html.startswith('\ufeff'):
            html = html[1:]
        # Remove XML declaration if present
        html = remove_xml_declaration(html)
        # Fix common character encoding issues
        html = self._fix_encoding_issues(html)
        # Remove script and style tags
        html = self._remove_script_style(html)
        # Normalize entities
        html = self._normalize_entities(html)
        # Fix malformed tags
        html = self._fix_malformed_tags(html)
        # Normalize whitespace if not preserving
        if not self.config.preserve_whitespace:
            html = self._normalize_whitespace(html)
        # Remove empty tags
        html = self._remove_empty_tags(html)
        # Fix common HTML issues
        html = self._fix_common_issues(html)
        return html

    def _fix_encoding_issues(self, html: str) -> str:
        """Fix common character encoding issues."""
        # Replace stray Windows-1252 bytes with their intended characters.
        # Quotes are deliberately normalized to plain ASCII; bullet and
        # dashes map to their proper Unicode equivalents (mapping them to
        # empty strings would silently fuse adjacent words/numbers).
        replacements = {
            '\x91': "'",   # Left single quote
            '\x92': "'",   # Right single quote
            '\x93': '"',   # Left double quote
            '\x94': '"',   # Right double quote
            '\x95': '\u2022',  # Bullet
            '\x96': '\u2013',  # En dash
            '\x97': '\u2014',  # Em dash
            '\xa0': ' ',   # Non-breaking space
        }
        for old, new in replacements.items():
            html = html.replace(old, new)
        # Remove other control characters
        html = self._compiled_patterns['control_chars'].sub('', html)
        return html

    def _remove_script_style(self, html: str) -> str:
        """Remove script/style/link/comment and inline-XBRL hidden content."""
        # Use pre-compiled patterns for better performance
        html = self._compiled_patterns['script_tags'].sub('', html)
        html = self._compiled_patterns['style_tags'].sub('', html)
        html = self._compiled_patterns['link_tags'].sub('', html)
        html = self._compiled_patterns['comments'].sub('', html)
        html = self._compiled_patterns['ix_hidden'].sub('', html)
        html = self._compiled_patterns['ix_header'].sub('', html)
        return html

    def _normalize_entities(self, html: str) -> str:
        """Normalize HTML entities."""
        # Common entity replacements
        entities = {
            '&nbsp;': ' ',
            '&ensp;': ' ',
            '&emsp;': ' ',
            '&thinsp;': ' ',
            '&#160;': ' ',
            '&#32;': ' ',
            '&zwj;': '',    # Zero-width joiner
            '&zwnj;': '',   # Zero-width non-joiner
            '&#8203;': '',  # Zero-width space
        }
        for entity, replacement in entities.items():
            html = html.replace(entity, replacement)
        # Fix double-encoded entities
        html = html.replace('&amp;amp;', '&amp;')
        html = html.replace('&amp;nbsp;', ' ')
        html = html.replace('&amp;lt;', '&lt;')
        html = html.replace('&amp;gt;', '&gt;')
        return html

    def _fix_malformed_tags(self, html: str) -> str:
        """Fix common malformed tag issues (unclosed void tags, nested <p>)."""
        # Use pre-compiled patterns for better performance
        html = self._compiled_patterns['br_tags'].sub('<br/>', html)
        html = self._compiled_patterns['img_tags'].sub(r'<img\1/>', html)
        html = self._compiled_patterns['input_tags'].sub(r'<input\1/>', html)
        html = self._compiled_patterns['hr_tags'].sub('<hr/>', html)
        html = self._compiled_patterns['nested_p_open'].sub('<p>', html)
        html = self._compiled_patterns['nested_p_close'].sub('</p>', html)
        return html

    def _normalize_whitespace(self, html: str) -> str:
        """Normalize whitespace in HTML."""
        # Replace multiple spaces with single space
        html = self._compiled_patterns['multiple_spaces'].sub(' ', html)
        # Replace multiple newlines with double newline
        html = self._compiled_patterns['multiple_newlines'].sub('\n\n', html)
        # Remove spaces around tags
        html = self._compiled_patterns['spaces_around_tags'].sub(r'\1', html)
        # Add newlines around block elements for readability
        # Using combined patterns instead of looping over individual tags
        html = self._compiled_patterns['block_open_tags'].sub(r'\n\1', html)
        html = self._compiled_patterns['block_close_tags'].sub(r'\1\n', html)
        # Clean up excessive newlines (apply again after adding newlines)
        html = self._compiled_patterns['multiple_newlines'].sub('\n\n', html)
        return html.strip()

    def _remove_empty_tags(self, html: str) -> str:
        """Remove empty tags that don't contribute content."""
        # Use pre-compiled combined patterns instead of looping
        html = self._compiled_patterns['empty_tags'].sub('', html)
        html = self._compiled_patterns['empty_self_closing'].sub('', html)
        return html

    def _fix_common_issues(self, html: str) -> str:
        """Fix other common HTML issues."""
        # Use pre-compiled patterns for better performance
        html = self._compiled_patterns['multiple_br'].sub('<br/><br/>', html)
        html = self._compiled_patterns['space_before_punct'].sub(r'\1', html)
        html = self._compiled_patterns['missing_space_after_punct'].sub(r'\1 \2', html)
        # Remove zero-width spaces (simple string replace is faster than regex)
        html = html.replace('\u200b', '')
        html = html.replace('\ufeff', '')
        # Fix common typos in tags (simple string replace is faster than regex)
        html = html.replace('<tabel', '<table')
        html = html.replace('</tabel>', '</table>')
        return html

View File

@@ -0,0 +1,34 @@
"""
Advanced ranking functionality for edgar.documents.
This package provides BM25-based ranking with semantic structure awareness
and intelligent index caching for performance optimization.
"""
from edgar.documents.ranking.ranking import (
RankingAlgorithm,
RankingEngine,
BM25Engine,
HybridEngine,
SemanticEngine,
RankedResult,
)
from edgar.documents.ranking.cache import (
SearchIndexCache,
CacheEntry,
get_search_cache,
set_search_cache,
)
__all__ = [
'RankingAlgorithm',
'RankingEngine',
'BM25Engine',
'HybridEngine',
'SemanticEngine',
'RankedResult',
'SearchIndexCache',
'CacheEntry',
'get_search_cache',
'set_search_cache',
]

View File

@@ -0,0 +1,311 @@
"""
Search index caching for performance optimization.
Provides memory and disk caching with LRU eviction and TTL expiration.
"""
from dataclasses import dataclass, field
from datetime import datetime, timedelta
from pathlib import Path
from typing import Optional, Dict, Any, List
import hashlib
import pickle
import logging
logger = logging.getLogger(__name__)
@dataclass
class CacheEntry:
    """
    Cached search index entry.

    Stores pre-built search indices for a document along with metadata
    for cache management (access tracking, TTL).
    """
    document_hash: str                       # cache key (see SearchIndexCache.compute_document_hash)
    index_data: Dict[str, Any]               # Serialized BM25 index data
    created_at: datetime                     # creation time; basis for TTL expiry
    access_count: int = 0                    # number of cache hits recorded for this entry
    last_accessed: Optional[datetime] = None # time of most recent hit, None if never read
    metadata: Dict[str, Any] = field(default_factory=dict)  # free-form extra info
class SearchIndexCache:
    """
    Manages search index caching with memory + disk storage.

    Features:
    - In-memory LRU cache for fast access
    - Optional disk persistence for reuse across sessions
    - TTL-based expiration
    - Access statistics tracking

    Parameters:
        memory_cache_size: Maximum entries in memory (default: 10)
        disk_cache_enabled: Enable disk persistence (default: True)
        cache_dir: Directory for disk cache (default: ~/.edgar_cache/search)
        ttl_hours: Time-to-live for cached entries (default: 24)
    """

    # Named logger; class-level so the class is self-contained.
    _log = logging.getLogger(__name__)

    def __init__(self,
                 memory_cache_size: int = 10,
                 disk_cache_enabled: bool = True,
                 cache_dir: Optional[Path] = None,
                 ttl_hours: int = 24):
        """Initialize cache and create the disk cache directory if enabled."""
        self.memory_cache_size = memory_cache_size
        self.disk_cache_enabled = disk_cache_enabled
        self.cache_dir = cache_dir or Path.home() / ".edgar_cache" / "search"
        self.ttl = timedelta(hours=ttl_hours)
        # In-memory cache; _access_order lists keys oldest-first (LRU order)
        # and is kept in strict sync with _memory_cache (no duplicates).
        self._memory_cache: Dict[str, 'CacheEntry'] = {}
        self._access_order: List[str] = []
        # Statistics
        self._hits = 0
        self._misses = 0
        # Create cache directory
        if disk_cache_enabled:
            self.cache_dir.mkdir(parents=True, exist_ok=True)

    def compute_document_hash(self, document_id: str, content_sample: str) -> str:
        """
        Compute cache key from document identifiers.

        Uses document ID (e.g., accession number) and a content sample
        to create a unique, stable hash.

        Args:
            document_id: Unique document identifier
            content_sample: Sample of document content for verification

        Returns:
            16-character hex hash
        """
        content = f"{document_id}:{content_sample}"
        return hashlib.sha256(content.encode()).hexdigest()[:16]

    def get(self, document_hash: str) -> Optional['CacheEntry']:
        """
        Get cached entry.

        Tries memory cache first, then disk cache. Updates LRU order
        and access statistics.

        Args:
            document_hash: Cache key

        Returns:
            CacheEntry if found and not expired, None otherwise
        """
        # Try memory cache first
        if document_hash in self._memory_cache:
            entry = self._memory_cache[document_hash]
            # Check TTL
            if datetime.now() - entry.created_at > self.ttl:
                # Expired - remove from cache (including its LRU slot)
                self._evict_memory(document_hash)
                self._misses += 1
                return None
            # Update access tracking
            entry.access_count += 1
            entry.last_accessed = datetime.now()
            # Move key to the most-recently-used end
            if document_hash in self._access_order:
                self._access_order.remove(document_hash)
            self._access_order.append(document_hash)
            self._hits += 1
            self._log.debug(f"Cache hit (memory): {document_hash}")
            return entry

        # Try disk cache
        if self.disk_cache_enabled:
            entry = self._load_from_disk(document_hash)
            if entry:
                # Check TTL
                if datetime.now() - entry.created_at > self.ttl:
                    # Expired - delete file
                    self._delete_from_disk(document_hash)
                    self._misses += 1
                    return None
                # Promote to memory cache
                self._put_memory(document_hash, entry)
                self._hits += 1
                self._log.debug(f"Cache hit (disk): {document_hash}")
                return entry

        self._misses += 1
        self._log.debug(f"Cache miss: {document_hash}")
        return None

    def put(self, document_hash: str, entry: 'CacheEntry') -> None:
        """
        Cache entry in memory and optionally on disk.

        Args:
            document_hash: Cache key
            entry: Entry to cache
        """
        # Put in memory cache
        self._put_memory(document_hash, entry)
        # Put in disk cache
        if self.disk_cache_enabled:
            self._save_to_disk(document_hash, entry)
        self._log.debug(f"Cached entry: {document_hash}")

    def _put_memory(self, document_hash: str, entry: 'CacheEntry') -> None:
        """Put entry in memory cache with LRU eviction."""
        # Replacing an existing key must not leave a duplicate LRU slot,
        # which would cause premature eviction of live entries.
        if document_hash in self._memory_cache:
            self._evict_memory(document_hash)
        # Evict oldest entries while the cache is full
        while len(self._memory_cache) >= self.memory_cache_size:
            if self._access_order:
                oldest = self._access_order.pop(0)
                self._evict_memory(oldest)
            else:
                break
        self._memory_cache[document_hash] = entry
        self._access_order.append(document_hash)

    def _evict_memory(self, document_hash: str) -> None:
        """Evict entry from memory cache and drop its LRU slot."""
        if document_hash in self._memory_cache:
            del self._memory_cache[document_hash]
            self._log.debug(f"Evicted from memory: {document_hash}")
        # Keep _access_order consistent with _memory_cache
        if document_hash in self._access_order:
            self._access_order.remove(document_hash)

    def _load_from_disk(self, document_hash: str) -> Optional['CacheEntry']:
        """Load entry from disk cache."""
        cache_file = self.cache_dir / f"{document_hash}.pkl"
        if not cache_file.exists():
            return None
        try:
            with open(cache_file, 'rb') as f:
                # NOTE: pickle is only safe here because the cache dir is
                # written exclusively by this process/user; never point
                # cache_dir at untrusted data.
                entry = pickle.load(f)
            return entry
        except Exception as e:
            self._log.warning(f"Failed to load cache from disk: {e}")
            # Delete corrupted file
            try:
                cache_file.unlink()
            except OSError:
                pass
            return None

    def _save_to_disk(self, document_hash: str, entry: 'CacheEntry') -> None:
        """Save entry to disk cache."""
        cache_file = self.cache_dir / f"{document_hash}.pkl"
        try:
            with open(cache_file, 'wb') as f:
                pickle.dump(entry, f)
        except Exception as e:
            self._log.warning(f"Failed to save cache to disk: {e}")

    def _delete_from_disk(self, document_hash: str) -> None:
        """Delete entry from disk cache."""
        cache_file = self.cache_dir / f"{document_hash}.pkl"
        try:
            if cache_file.exists():
                cache_file.unlink()
        except Exception as e:
            self._log.warning(f"Failed to delete cache file: {e}")

    def clear(self, memory_only: bool = False) -> None:
        """
        Clear cache.

        Args:
            memory_only: If True, only clear memory cache (keep disk)
        """
        self._memory_cache.clear()
        self._access_order.clear()
        self._log.info("Cleared memory cache")
        if not memory_only and self.disk_cache_enabled:
            try:
                for cache_file in self.cache_dir.glob("*.pkl"):
                    cache_file.unlink()
                self._log.info("Cleared disk cache")
            except Exception as e:
                self._log.warning(f"Failed to clear disk cache: {e}")

    def get_stats(self) -> Dict[str, Any]:
        """
        Get cache statistics.

        Returns:
            Dictionary with hit/miss counts, entry counts and size estimate
        """
        disk_entries = 0
        if self.disk_cache_enabled:
            try:
                disk_entries = len(list(self.cache_dir.glob("*.pkl")))
            except OSError:
                pass
        total_requests = self._hits + self._misses
        hit_rate = self._hits / total_requests if total_requests > 0 else 0.0
        return {
            "memory_entries": len(self._memory_cache),
            "disk_entries": disk_entries,
            "total_accesses": sum(e.access_count for e in self._memory_cache.values()),
            "cache_hits": self._hits,
            "cache_misses": self._misses,
            "hit_rate": hit_rate,
            "memory_size_mb": self._estimate_cache_size()
        }

    def _estimate_cache_size(self) -> float:
        """Estimate memory cache size in MB (shallow sizeof of index data)."""
        try:
            import sys
            total_bytes = sum(
                sys.getsizeof(entry.index_data)
                for entry in self._memory_cache.values()
            )
            return total_bytes / (1024 * 1024)
        except Exception:
            # Rough estimate if sys.getsizeof fails
            return len(self._memory_cache) * 5.0  # Assume ~5MB per entry
# Process-wide singleton cache instance (lazily created)
_global_cache: Optional[SearchIndexCache] = None


def get_search_cache() -> SearchIndexCache:
    """
    Return the global search cache, creating it on first use.

    Returns:
        Global SearchIndexCache instance
    """
    global _global_cache
    if _global_cache is None:
        _global_cache = SearchIndexCache()
    return _global_cache
def set_search_cache(cache: Optional[SearchIndexCache]) -> None:
    """
    Install a specific cache as the global instance.

    Useful for testing or custom cache configuration; passing None resets
    the singleton so the next get_search_cache() builds a fresh one.

    Args:
        cache: Cache instance to use globally (None to disable)
    """
    global _global_cache
    _global_cache = cache

View File

@@ -0,0 +1,187 @@
"""
Text preprocessing for search.
Provides tokenization and text normalization for BM25 and semantic analysis.
"""
import re
from typing import List, Set
# Common English stopwords (minimal set for financial documents).
# We keep many financial terms that might be stopwords in other contexts.
# Only consulted by tokenize(remove_stopwords=True).
STOPWORDS: Set[str] = {
    'a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'for',
    'from', 'has', 'he', 'in', 'is', 'it', 'its', 'of', 'on',
    'that', 'the', 'to', 'was', 'will', 'with'
}
def preprocess_text(text: str,
                    lowercase: bool = True,
                    remove_punctuation: bool = False) -> str:
    """
    Preprocess text for search.

    Args:
        text: Raw text
        lowercase: Convert to lowercase (important for BM25 matching)
        remove_punctuation: Strip punctuation (usually kept so tokens
            like "$5B" or "Item 1A" survive)

    Returns:
        Preprocessed text with collapsed whitespace
    """
    if not text:
        return ""
    # Collapse all runs of whitespace to single spaces
    cleaned = ' '.join(text.split())
    if lowercase:
        cleaned = cleaned.lower()
    if remove_punctuation:
        cleaned = re.sub(r'[^\w\s]', ' ', cleaned)
        cleaned = ' '.join(cleaned.split())
    return cleaned
def tokenize(text: str,
             remove_stopwords: bool = False,
             min_token_length: int = 2) -> List[str]:
    """
    Tokenize text for BM25 indexing.

    Keeps '$' and '%' so financial tokens like '$5b' and '15%' survive.
    (The previous pattern wrapped the class in \\b anchors; a word
    boundary never matches next to a non-word character, so leading '$'
    and trailing '%' were silently stripped from every token.)

    Args:
        text: Text to tokenize
        remove_stopwords: Remove common stopwords
        min_token_length: Minimum token length to keep

    Returns:
        List of lowercase tokens
    """
    if not text:
        return []
    # Runs of alphanumerics plus $ and % — no \b anchors, which would
    # exclude the special characters at token edges.
    tokens = re.findall(r'[\w$%]+', text.lower())
    # Filter by length
    tokens = [t for t in tokens if len(t) >= min_token_length]
    # Optionally remove stopwords
    if remove_stopwords:
        tokens = [t for t in tokens if t not in STOPWORDS]
    return tokens
def extract_query_terms(query: str) -> List[str]:
    """
    Extract important terms from query for boosting.

    Identifies key financial terms, numbers, and important phrases.

    Args:
        query: Search query

    Returns:
        List of important query terms (case-insensitively deduplicated,
        original order preserved)
    """
    # Tokenize
    tokens = tokenize(query, remove_stopwords=True)

    important = []

    # Financial amounts: $5B, $1.2M, etc.
    important.extend(re.findall(r'\$[\d,.]+[BMK]?', query, re.IGNORECASE))

    # Percentages: 15%, 3.5%
    important.extend(re.findall(r'\d+\.?\d*%', query))

    # Years: 2023, 2024. The group must be non-capturing: with a
    # capturing group, re.findall returns only the group text
    # ('19'/'20') instead of the full four-digit year.
    important.extend(re.findall(r'\b(?:19|20)\d{2}\b', query))

    # Item references: Item 1A, Item 7
    important.extend(re.findall(r'item\s+\d+[a-z]?', query, re.IGNORECASE))

    # Add all tokens
    important.extend(tokens)

    # Remove duplicates while preserving order
    seen = set()
    result = []
    for term in important:
        term_lower = term.lower()
        if term_lower not in seen:
            seen.add(term_lower)
            result.append(term)
    return result
def normalize_financial_term(term: str) -> str:
    """
    Normalize a financial term for consistent matching.

    Examples:
        "$5 billion" -> "$5b"
        "5,000,000"  -> "5000000"
        "Item 1A"    -> "item1a"

    Args:
        term: Financial term

    Returns:
        Normalized term
    """
    normalized = term.lower().strip()
    # Remove thousands separators
    normalized = normalized.replace(',', '')
    # Collapse magnitude words into single-letter suffixes
    for word, suffix in (('billion', 'b'), ('million', 'm'), ('thousand', 'k')):
        normalized = re.sub(r'\s*' + word + r'\b', suffix, normalized)
    # Fuse references like "item 1a" / "section 2" into one token
    normalized = re.sub(r'(item|section|part)\s+(\d+[a-z]?)', r'\1\2', normalized)
    # Collapse any remaining whitespace runs
    return ' '.join(normalized.split())
def get_ngrams(tokens: List[str], n: int = 2) -> List[str]:
    """
    Generate n-grams from tokens.

    Useful for phrase matching in BM25.

    Args:
        tokens: List of tokens
        n: N-gram size

    Returns:
        List of n-grams, each joined with single spaces; empty when
        fewer than n tokens are given
    """
    if len(tokens) < n:
        return []
    return [' '.join(tokens[start:start + n])
            for start in range(len(tokens) - n + 1)]

View File

@@ -0,0 +1,401 @@
"""
Ranking engines for document search.
Provides BM25-based ranking with optional semantic structure boosting.
"""
from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from enum import Enum, auto
from typing import List, Optional, Dict, Any, TYPE_CHECKING
from rank_bm25 import BM25Okapi
if TYPE_CHECKING:
from edgar.documents.nodes import Node
class RankingAlgorithm(Enum):
    """Supported ranking algorithms (selects which RankingEngine is used)."""
    BM25 = auto()      # Classic BM25 (Okapi variant)
    HYBRID = auto()    # BM25 + Semantic structure boosting
    SEMANTIC = auto()  # Pure structure-aware scoring
@dataclass
class RankedResult:
    """
    A single search hit together with its relevance scoring.

    Attributes:
        node: Document node containing the match
        score: Relevance score (higher is better)
        rank: Position in results (1-indexed)
        text: Matched text content
        bm25_score: Raw BM25 score (if applicable)
        semantic_score: Semantic boost score (if applicable)
        metadata: Additional result metadata
    """
    node: 'Node'
    score: float
    rank: int
    text: str
    bm25_score: Optional[float] = None
    semantic_score: Optional[float] = None
    metadata: Dict[str, Any] = field(default_factory=dict)

    @property
    def snippet(self) -> str:
        """Text snippet capped at 200 characters (ellipsized when longer)."""
        return self.text if len(self.text) <= 200 else self.text[:197] + "..."
class RankingEngine(ABC):
    """Abstract base class for ranking engines (BM25, hybrid, semantic)."""

    @abstractmethod
    def rank(self, query: str, nodes: List['Node']) -> List[RankedResult]:
        """
        Rank nodes by relevance to query.

        Args:
            query: Search query
            nodes: Nodes to rank

        Returns:
            List of ranked results sorted by relevance (best first)
        """
        pass

    @abstractmethod
    def get_algorithm_name(self) -> str:
        """Get name of ranking algorithm."""
        pass
class BM25Engine(RankingEngine):
    """
    BM25 ranking engine using Okapi variant.

    BM25 is a probabilistic retrieval function that ranks documents based on
    query term frequency and inverse document frequency. Well-suited for
    financial documents where exact term matching is important.

    Parameters:
        k1: Term frequency saturation parameter (default: 1.5)
            Controls how quickly term frequency impact plateaus.
        b: Length normalization parameter (default: 0.75)
            0 = no normalization, 1 = full normalization.
    """
    def __init__(self, k1: float = 1.5, b: float = 0.75):
        """
        Initialize BM25 engine.

        Args:
            k1: Term frequency saturation (1.2-2.0 typical)
            b: Length normalization (0.75 is standard)
        """
        self.k1 = k1
        self.b = b
        # Index state is built lazily on the first rank() call and reused
        # while the same node list keeps being passed in.
        self._bm25: Optional[BM25Okapi] = None
        self._corpus_nodes: Optional[List['Node']] = None
        self._tokenized_corpus: Optional[List[List[str]]] = None
    def rank(self, query: str, nodes: List['Node']) -> List[RankedResult]:
        """
        Rank nodes using BM25 algorithm.

        Args:
            query: Search query
            nodes: Nodes to rank

        Returns:
            Ranked results sorted by BM25 score; nodes with a zero score
            are omitted entirely.
        """
        if not nodes:
            return []
        # Import preprocessing here to avoid circular dependency
        from edgar.documents.ranking.preprocessing import preprocess_text, tokenize
        # Build index if needed or if nodes changed.
        # NOTE(review): element-wise list comparison -- relies on Node.__eq__
        # semantics and costs O(n) per call; confirm an identity check was
        # not intended for this cache.
        if self._corpus_nodes != nodes:
            self._build_index(nodes)
        # Tokenize and preprocess query
        query_tokens = tokenize(preprocess_text(query))
        if not query_tokens:
            return []
        # Get BM25 scores (one per corpus document, aligned with `nodes`)
        scores = self._bm25.get_scores(query_tokens)
        # Create ranked results
        results = []
        for idx, (node, score) in enumerate(zip(nodes, scores)):
            if score > 0:  # Only include nodes with positive scores
                text = node.text() if hasattr(node, 'text') else str(node)
                results.append(RankedResult(
                    node=node,
                    score=float(score),
                    rank=0,  # Will be set after sorting
                    text=text,
                    bm25_score=float(score),
                    metadata={'algorithm': 'BM25'}
                ))
        # Sort by score (highest first) and assign ranks
        results.sort(key=lambda r: r.score, reverse=True)
        for rank, result in enumerate(results, start=1):
            result.rank = rank
        return results
    def _build_index(self, nodes: List['Node']):
        """Build BM25 index from nodes.

        Tokenizes each node's text and feeds the whole corpus to BM25Okapi
        with the configured k1/b parameters.
        """
        from edgar.documents.ranking.preprocessing import preprocess_text, tokenize
        # Store corpus
        self._corpus_nodes = nodes
        # Tokenize all nodes
        self._tokenized_corpus = []
        for node in nodes:
            text = node.text() if hasattr(node, 'text') else str(node)
            processed = preprocess_text(text)
            tokens = tokenize(processed)
            self._tokenized_corpus.append(tokens)
        # Build BM25 index with custom parameters
        self._bm25 = BM25Okapi(
            self._tokenized_corpus,
            k1=self.k1,
            b=self.b
        )
    def get_index_data(self) -> Dict[str, Any]:
        """
        Serialize index data for caching.

        NOTE(review): if called before rank()/_build_index(), the
        'tokenized_corpus' entry is None -- confirm callers build first.

        Returns:
            Dictionary with serializable index data
        """
        return {
            'tokenized_corpus': self._tokenized_corpus,
            'k1': self.k1,
            'b': self.b,
            'algorithm': 'BM25'
        }
    def load_index_data(self, index_data: Dict[str, Any], nodes: List['Node']) -> None:
        """
        Load index from cached data.

        Args:
            index_data: Serialized index data (as produced by get_index_data)
            nodes: Nodes corresponding to the index; must align one-to-one
                with the cached tokenized corpus.
        """
        self._corpus_nodes = nodes
        self._tokenized_corpus = index_data['tokenized_corpus']
        self.k1 = index_data['k1']
        self.b = index_data['b']
        # Rebuild BM25 index from tokenized corpus
        self._bm25 = BM25Okapi(
            self._tokenized_corpus,
            k1=self.k1,
            b=self.b
        )
    def get_algorithm_name(self) -> str:
        """Get algorithm name."""
        return "BM25"
class HybridEngine(RankingEngine):
    """
    Hybrid ranking engine: BM25 + Semantic structure boosting.

    Combines classic BM25 text matching with semantic structure awareness:
    - BM25 provides strong exact-match ranking for financial terms
    - Semantic scoring boosts results based on document structure:
      * Headings and section markers
      * Cross-references ("See Item X")
      * Gateway content (summaries, overviews)
      * Table and XBRL importance

    This approach is agent-friendly: it surfaces starting points for
    investigation rather than fragmented chunks.

    Parameters:
        bm25_weight: Weight for BM25 score (default: 0.8)
        semantic_weight: Weight for semantic score (default: 0.2)
        k1: BM25 term frequency saturation
        b: BM25 length normalization
    """
    def __init__(self,
                 bm25_weight: float = 0.8,
                 semantic_weight: float = 0.2,
                 k1: float = 1.5,
                 b: float = 0.75,
                 boost_sections: Optional[List[str]] = None):
        """
        Initialize hybrid engine.

        Args:
            bm25_weight: Weight for BM25 component (0-1)
            semantic_weight: Weight for semantic component (0-1)
            k1: BM25 k1 parameter
            b: BM25 b parameter
            boost_sections: Section names to boost (e.g., ["Risk Factors"])

        Raises:
            ValueError: if the two weights do not sum to 1.0 (within a
                small floating-point tolerance).
        """
        self.bm25_engine = BM25Engine(k1=k1, b=b)
        self.bm25_weight = bm25_weight
        self.semantic_weight = semantic_weight
        self.boost_sections = boost_sections or []
        # Validate weights
        total_weight = bm25_weight + semantic_weight
        if not (0.99 <= total_weight <= 1.01):  # Allow small floating point error
            raise ValueError(f"Weights must sum to 1.0, got {total_weight}")
    def rank(self, query: str, nodes: List['Node']) -> List[RankedResult]:
        """
        Rank nodes using hybrid approach.

        Starts from the BM25 result list (so a node with zero BM25 score
        never appears, whatever its semantic score), normalizes BM25 scores
        to 0-1, then blends in the semantic score with the configured
        weights. The RankedResult objects from the BM25 engine are mutated
        in place.

        Args:
            query: Search query
            nodes: Nodes to rank

        Returns:
            Ranked results with combined BM25 + semantic scores
        """
        if not nodes:
            return []
        # Get BM25 results
        bm25_results = self.bm25_engine.rank(query, nodes)
        if not bm25_results:
            return []
        # Import semantic scoring
        from edgar.documents.ranking.semantic import compute_semantic_scores
        # Get semantic scores for all nodes
        semantic_scores_dict = compute_semantic_scores(
            nodes=nodes,
            query=query,
            boost_sections=self.boost_sections
        )
        # Normalize BM25 scores to 0-1 range.
        # max_bm25 is always > 0 here because BM25Engine only emits
        # positive-score results.
        max_bm25 = max(r.bm25_score for r in bm25_results)
        if max_bm25 > 0:
            for result in bm25_results:
                result.bm25_score = result.bm25_score / max_bm25
        # Combine scores
        for result in bm25_results:
            semantic_score = semantic_scores_dict.get(id(result.node), 0.0)
            result.semantic_score = semantic_score
            # Weighted combination
            result.score = (
                self.bm25_weight * result.bm25_score +
                self.semantic_weight * semantic_score
            )
            result.metadata['algorithm'] = 'Hybrid'
            result.metadata['bm25_weight'] = self.bm25_weight
            result.metadata['semantic_weight'] = self.semantic_weight
        # Re-sort by combined score
        bm25_results.sort(key=lambda r: r.score, reverse=True)
        # Update ranks
        for rank, result in enumerate(bm25_results, start=1):
            result.rank = rank
        return bm25_results
    def get_algorithm_name(self) -> str:
        """Get algorithm name."""
        return "Hybrid"
class SemanticEngine(RankingEngine):
    """
    Pure semantic/structure-based ranking (no text matching).

    Scores nodes purely by structural importance -- section headings,
    cross-references, gateway content, and document-structure position.
    Useful for understanding document organization without a specific
    query in mind.
    """

    def __init__(self, boost_sections: Optional[List[str]] = None):
        """
        Initialize semantic engine.

        Args:
            boost_sections: Section names to boost
        """
        self.boost_sections = boost_sections or []

    def rank(self, query: str, nodes: List['Node']) -> List[RankedResult]:
        """
        Rank nodes by structural importance.

        Args:
            query: Search query (passed through for context-aware boosts)
            nodes: Nodes to rank

        Returns:
            Results with positive semantic scores, best first
        """
        if not nodes:
            return []

        from edgar.documents.ranking.semantic import compute_semantic_scores

        score_by_id = compute_semantic_scores(
            nodes=nodes,
            query=query,
            boost_sections=self.boost_sections
        )

        ranked = []
        for candidate in nodes:
            weight = score_by_id.get(id(candidate), 0.0)
            if weight <= 0:
                continue
            preview = candidate.text() if hasattr(candidate, 'text') else str(candidate)
            ranked.append(RankedResult(
                node=candidate,
                score=weight,
                rank=0,  # assigned below once sorted
                text=preview,
                semantic_score=weight,
                metadata={'algorithm': 'Semantic'}
            ))

        ranked.sort(key=lambda hit: hit.score, reverse=True)
        for position, hit in enumerate(ranked, start=1):
            hit.rank = position
        return ranked

    def get_algorithm_name(self) -> str:
        """Get algorithm name."""
        return "Semantic"

View File

@@ -0,0 +1,333 @@
"""
Semantic scoring for document structure awareness.
Provides structure-based boosting without ML/embeddings:
- Node type importance (headings, tables, XBRL)
- Cross-reference detection (gateway content)
- Section importance
- Text quality signals
This is NOT embedding-based semantic search. It's structure-aware ranking
that helps agents find investigation starting points.
"""
import re
from typing import List, Dict, Optional, TYPE_CHECKING
if TYPE_CHECKING:
from edgar.documents.nodes import Node
from edgar.documents.types import NodeType, SemanticType
# Gateway terms that indicate summary/overview content
# (matched as case-insensitive substrings of node text).
GATEWAY_TERMS: List[str] = [
    'summary', 'overview', 'introduction', 'highlights',
    'key points', 'executive summary', 'in summary',
    'table of contents', 'index'
]
# Cross-reference patterns: regexes (applied against lower-cased text)
# that mark "gateway" content pointing at other sections of the filing.
CROSS_REFERENCE_PATTERNS: List[str] = [
    r'\bsee\s+item\s+\d+[a-z]?\b',  # "See Item 1A"
    r'\bsee\s+(?:part|section)\s+\d+\b',  # "See Part II"
    r'\brefer\s+to\s+item\s+\d+[a-z]?\b',  # "Refer to Item 7"
    r'\bas\s+discussed\s+in\s+item\s+\d+\b',  # "As discussed in Item 1"
    r'\bfor\s+(?:more|additional)\s+information\b',  # "For more information"
]
# Section importance weights: boosts keyed by lower-case substrings of
# section names (first matching key wins in _get_section_boost).
SECTION_IMPORTANCE: Dict[str, float] = {
    'risk factors': 1.5,
    'management discussion': 1.4,
    'md&a': 1.4,
    'business': 1.3,
    'financial statements': 1.2,
    'controls and procedures': 1.2,
}
def compute_semantic_scores(nodes: List['Node'],
                            query: str,
                            boost_sections: Optional[List[str]] = None) -> Dict[int, float]:
    """
    Compute structure-aware scores for nodes, keyed by id(node).

    Each node's raw score is the sum of independent signals -- node type,
    semantic type, cross-references, gateway wording, section importance,
    XBRL presence, text quality, and (for "Item N" style queries) an extra
    item-header boost -- normalized into the 0-1 range.

    Args:
        nodes: Nodes to score
        query: Search query (enables query-aware boosting)
        boost_sections: Additional section names to boost

    Returns:
        Dictionary mapping id(node) -> normalized semantic score (0-1)
    """
    boost_sections = boost_sections or []
    query_lower = query.lower()
    # "Item 1A"-style queries get an extra boost for matching headers.
    item_query = re.search(r'item\s+\d+[a-z]?', query_lower) is not None

    scores: Dict[int, float] = {}
    for node in nodes:
        signals = [
            _get_node_type_boost(node),
            _get_semantic_type_boost(node),
            _detect_cross_references(node),
            _detect_gateway_content(node, query_lower),
            _get_section_boost(node, boost_sections),
            _get_xbrl_boost(node),
            _get_quality_boost(node),
        ]
        if item_query:
            signals.append(_get_item_header_boost(node))
        # Maximum achievable raw score is roughly 7.0; clamp into 0-1.
        scores[id(node)] = min(sum(signals) / 7.0, 1.0)
    return scores
def _get_node_type_boost(node: 'Node') -> float:
    """
    Boost based on node type.

    Headings and structural elements matter most for navigation; plain
    text matters least.
    NOTE(review): NodeType appears to be imported under TYPE_CHECKING only
    but is used here at runtime -- verify the module-level import.
    """
    kind = node.type
    if kind == NodeType.HEADING:
        return 2.0    # headings are key navigation points
    if kind == NodeType.SECTION:
        return 1.5    # section markers
    if kind == NodeType.TABLE:
        return 1.0    # structured data
    if kind == NodeType.XBRL_FACT:
        return 0.8    # financial facts
    if kind == NodeType.LIST:
        return 0.5
    if kind == NodeType.PARAGRAPH:
        return 0.3
    if kind == NodeType.TEXT:
        return 0.1
    return 0.0
def _get_semantic_type_boost(node: 'Node') -> float:
    """
    Boost based on semantic type.

    Item and section headers are the most important anchors in SEC
    filings. Nodes without a semantic type get no boost.
    """
    semantic = getattr(node, 'semantic_type', None)
    if semantic is None:
        return 0.0
    weights = {
        SemanticType.ITEM_HEADER: 2.0,        # item headers are critical
        SemanticType.SECTION_HEADER: 1.5,     # section headers
        SemanticType.FINANCIAL_STATEMENT: 1.2,
        SemanticType.TABLE_OF_CONTENTS: 1.0,  # TOC is a gateway
        SemanticType.TITLE: 0.8,
        SemanticType.HEADER: 0.6,
    }
    return weights.get(semantic, 0.0)
def _detect_cross_references(node: 'Node') -> float:
    """
    Detect cross-references that indicate gateway content.

    Content that points at other sections ("See Item 1A", ...) is useful
    for navigation; each distinct matching pattern adds 0.5, capped at 1.5.
    """
    raw = node.text() if hasattr(node, 'text') else ''
    if not raw:
        return 0.0
    lowered = raw.lower()
    hits = sum(1 for pattern in CROSS_REFERENCE_PATTERNS if re.search(pattern, lowered))
    return min(hits * 0.5, 1.5)
def _detect_gateway_content(node: 'Node', query_lower: str) -> float:
    """
    Detect gateway content (summaries, overviews, introductions).

    These are excellent starting points for investigation. A gateway term
    anywhere in the text scores 1.0; short paragraphs with descriptive
    verbs ("provides", "describes", ...) score 0.5.
    """
    raw = node.text() if hasattr(node, 'text') else ''
    if not raw:
        return 0.0
    lowered = raw.lower()
    if any(term in lowered for term in GATEWAY_TERMS):
        return 1.0
    # Short intro paragraphs are often summaries.
    if 20 < len(raw) < 200 and any(
            verb in lowered for verb in ('provides', 'describes', 'includes', 'contains')):
        return 0.5
    return 0.0
def _get_section_boost(node: 'Node', boost_sections: List[str]) -> float:
    """
    Boost nodes that live in important sections.

    Built-in SECTION_IMPORTANCE weights win over user-supplied
    boost_sections (which score a flat 1.5).
    """
    section = _get_node_section(node)
    if not section:
        return 0.0
    lowered = section.lower()
    # Built-in importance table first.
    for name, weight in SECTION_IMPORTANCE.items():
        if name in lowered:
            return weight
    # Then any caller-requested sections.
    if any(wanted.lower() in lowered for wanted in boost_sections):
        return 1.5
    return 0.0
def _get_xbrl_boost(node: 'Node') -> float:
    """
    Boost XBRL facts and tables carrying XBRL data.

    Financial data matters for financial queries: bare XBRL facts score
    0.8; tables flagged with 'has_xbrl' metadata score 0.6.
    """
    if node.type == NodeType.XBRL_FACT:
        return 0.8
    if node.type == NodeType.TABLE and hasattr(node, 'metadata') and node.metadata.get('has_xbrl'):
        return 0.6
    return 0.0
def _get_quality_boost(node: 'Node') -> float:
    """
    Boost based on text quality signals.

    Higher quality content tends to be more useful:
    - Appropriate length (50-1000 chars scores 0.3; longer scores 0.1)
    - Sentence structure (two or more sentence terminators adds 0.2)
    - Substantive content (pure formatting/navigation strings score 0.0)

    Args:
        node: Node whose text() output is assessed (missing/empty text
            scores 0.0)

    Returns:
        Quality score in 0.0-0.5
    """
    text = node.text() if hasattr(node, 'text') else ''
    if not text:
        return 0.0

    # Reject pure formatting/navigation artifacts up front.
    # (Set membership replaces the original list, which listed '' twice;
    # the early return also skips pointless scoring work.)
    if text.strip() in {'...', '', '-', 'Table of Contents', 'Page'}:
        return 0.0

    score = 0.0

    # Length signal
    text_len = len(text)
    if 50 <= text_len <= 1000:
        score += 0.3  # good length: substantive but focused
    elif text_len > 1000:
        score += 0.1  # long, but might be comprehensive
    # < 50 chars: too short, likely not substantive -- no boost.

    # Sentence structure: multiple sentences indicate real prose.
    if text.count('.') + text.count('?') + text.count('!') >= 2:
        score += 0.2

    return score
def _get_item_header_boost(node: 'Node') -> float:
    """
    Boost Item headers when the query is about items.

    An "Item 1A" query should prioritize the actual "Item 1A" heading;
    only HEADING nodes whose text starts with "Item <number>" qualify.
    """
    if node.type != NodeType.HEADING:
        return 0.0
    heading = node.text() if hasattr(node, 'text') else ''
    if heading and re.match(r'^\s*item\s+\d+[a-z]?[:\.\s]', heading, re.IGNORECASE):
        return 1.5
    return 0.0
def _get_node_section(node: 'Node') -> Optional[str]:
    """
    Resolve the section a node belongs to.

    Prefers an explicit 'section' entry in the node's metadata, then walks
    up the parent chain looking for a section/item header ancestor.

    Returns:
        Section name if found, None otherwise
    """
    # Explicit metadata wins.
    if hasattr(node, 'metadata') and 'section' in node.metadata:
        return node.metadata['section']

    # Otherwise climb the tree looking for a section marker.
    current = node
    while current:
        if hasattr(current, 'semantic_type') and current.semantic_type in (
                SemanticType.SECTION_HEADER, SemanticType.ITEM_HEADER):
            return current.text() if hasattr(current, 'text') else None
        current = getattr(current, 'parent', None)
    return None
def get_section_importance_names() -> List[str]:
    """
    List the section names that carry a built-in importance boost.

    Returns:
        Keys of SECTION_IMPORTANCE, in definition order.
    """
    return [*SECTION_IMPORTANCE]

View File

@@ -0,0 +1,13 @@
"""
Document renderers for various output formats.
"""
from edgar.documents.renderers.markdown import MarkdownRenderer
from edgar.documents.renderers.text import TextRenderer
from edgar.documents.renderers.fast_table import FastTableRenderer
__all__ = [
'MarkdownRenderer',
'TextRenderer',
'FastTableRenderer'
]

View File

@@ -0,0 +1,669 @@
"""
Fast table renderer for edgar.documents - optimized for performance.
This module provides a high-performance alternative to Rich table rendering
while maintaining professional output quality and readability.
Performance target: ~32x faster than Rich rendering (0.2ms vs 6.5ms per table)
"""
from dataclasses import dataclass
from typing import List, Dict, Optional, Union, Tuple
from enum import Enum
class Alignment(Enum):
    """Column alignment options for rendered table cells."""
    LEFT = "left"      # default for text columns
    RIGHT = "right"    # auto-applied to mostly-numeric columns
    CENTER = "center"  # supported by _format_row, never auto-detected
@dataclass
class ColumnConfig:
    """Configuration for a table column.

    NOTE(review): not referenced by the visible rendering code in this
    module -- presumably consumed elsewhere; verify before removing.
    """
    alignment: Alignment = Alignment.LEFT  # cell alignment within the column
    min_width: int = 8                     # floor for the rendered width
    max_width: Optional[int] = None        # optional cap; None = unlimited
    padding: int = 1                       # spaces either side of content
@dataclass
class TableStyle:
    """Table styling configuration used by FastTableRenderer."""
    border_char: str = "|"        # column separator; "" disables borders
    header_separator: str = "-"   # header underline char; "" suppresses the line
    corner_char: str = "+"
    padding: int = 1              # spaces either side of cell content
    min_col_width: int = 8
    max_col_width: int = 50
    @classmethod
    def pipe_table(cls) -> 'TableStyle':
        """Markdown-compatible pipe table style."""
        return cls(
            border_char="|",
            header_separator="-",
            corner_char="|",
            padding=1,
            min_col_width=8,
            max_col_width=50
        )
    @classmethod
    def minimal(cls) -> 'TableStyle':
        """Minimal table style with spacing only."""
        return cls(
            border_char="",
            header_separator="",
            corner_char="",
            padding=2,
            min_col_width=6,
            max_col_width=40
        )
    @classmethod
    def simple(cls) -> 'TableStyle':
        """
        Simple table style matching Rich's box.SIMPLE.

        Features:
        - No outer border
        - No column separators
        - Space-separated columns with generous padding
        - Clean, professional appearance

        This style balances visual quality and performance, matching Rich's
        box.SIMPLE aesthetic while keeping rendering fast.

        NOTE(review): the original comments advertised a single horizontal
        line under the header, but header_separator is the empty string, so
        no separator line is emitted (see _build_table). Possibly a lost
        Unicode line character -- confirm the intended value.
        """
        return cls(
            border_char="",       # no pipes/borders
            header_separator="",  # empty -> header separator suppressed
            corner_char="",       # no corners
            padding=2,            # generous spacing (pipe_table uses 1)
            min_col_width=6,      # slightly relaxed (pipe_table uses 8)
            max_col_width=60      # raised from 50 for wider columns
        )
class FastTableRenderer:
"""
High-performance table renderer optimized for speed.
Features:
- 30x+ faster than Rich table rendering
- Professional, readable output
- Configurable alignment and styling
- Handles complex SEC filing table structures
- Markdown-compatible output
- Memory efficient
"""
def __init__(self, style: Optional[TableStyle] = None):
"""Initialize renderer with optional style configuration."""
self.style = style or TableStyle.pipe_table()
# Pre-compile format strings for performance
self._format_cache = {}
def render_table_node(self, table_node) -> str:
"""
Render a TableNode to text format with proper colspan/rowspan handling.
Args:
table_node: TableNode instance from edgar.documents
Returns:
Formatted table string
"""
from edgar.documents.utils.table_matrix import TableMatrix
# Build matrix to handle colspan/rowspan properly
# This ensures cells are expanded to fill their full colspan/rowspan
matrix = TableMatrix()
matrix.build_from_rows(table_node.headers, table_node.rows)
# Extract headers from expanded matrix
headers = []
if table_node.headers:
for row_idx in range(len(table_node.headers)):
expanded_row = matrix.get_expanded_row(row_idx)
# Convert Cell objects to strings, handling None values
row_texts = [cell.text().strip() if cell else '' for cell in expanded_row]
headers.append(row_texts)
# Extract data rows from expanded matrix
rows = []
start_row = len(table_node.headers) if table_node.headers else 0
for row_idx in range(start_row, matrix.row_count):
expanded_row = matrix.get_expanded_row(row_idx)
# Convert Cell objects to strings, handling None values
row_texts = [cell.text().strip() if cell else '' for cell in expanded_row]
rows.append(row_texts)
# Render the table
table_text = self.render_table_data(headers, rows)
# Add caption if present (matches Rich renderer behavior)
if hasattr(table_node, 'caption') and table_node.caption:
return f"{table_node.caption}\n{table_text}"
return table_text
    def render_table_data(self, headers: List[List[str]], rows: List[List[str]]) -> str:
        """
        Render table data with headers and rows.

        Pipeline (order matters): drop spacing-only columns, merge related
        columns (e.g. a '$' column with the amount column next to it), then
        compute widths/alignments on the surviving data and emit the text.

        Args:
            headers: List of header rows (for multi-row headers)
            rows: List of data rows

        Returns:
            Formatted table string; "" when nothing renders
        """
        if not headers and not rows:
            return ""
        # Determine column count from all rows (headers + data)
        all_rows = headers + rows if headers else rows
        if not all_rows:
            return ""
        max_cols = max(len(row) for row in all_rows) if all_rows else 0
        if max_cols == 0:
            return ""
        # Filter out empty/spacing columns
        meaningful_columns = self._identify_meaningful_columns(all_rows, max_cols)
        if not meaningful_columns:
            return ""
        # Filter all rows (both headers and data) to only meaningful columns
        filtered_headers = [self._filter_row_to_columns(row, meaningful_columns) for row in headers] if headers else []
        filtered_rows = [self._filter_row_to_columns(row, meaningful_columns) for row in rows]
        # Post-process to merge related columns (e.g., currency symbols with amounts)
        # Apply to all rows including headers
        all_filtered = filtered_headers + filtered_rows
        if all_filtered:
            # Merge using first filtered row as reference
            _, all_merged = self._merge_related_columns(all_filtered[0], all_filtered)
            # Split back into headers and data
            if filtered_headers:
                filtered_headers = all_merged[:len(filtered_headers)]
                filtered_rows = all_merged[len(filtered_headers):]
            else:
                filtered_rows = all_merged
        # Recalculate with filtered and merged data
        filtered_all_rows = filtered_headers + filtered_rows if filtered_headers else filtered_rows
        filtered_max_cols = max(len(row) for row in filtered_all_rows) if filtered_all_rows else 0
        # Calculate optimal column widths for filtered columns
        col_widths = self._calculate_column_widths(filtered_all_rows, filtered_max_cols)
        # Detect column alignments based on filtered content
        alignments = self._detect_alignments(filtered_all_rows, filtered_max_cols)
        # Build table with filtered data - pass headers as multiple rows
        return self._build_table(filtered_headers, filtered_rows, col_widths, alignments)
def _combine_headers(self, headers: List[List[str]]) -> List[str]:
"""
Combine multi-row headers intelligently.
For SEC tables, this prioritizes specific dates/periods over generic labels.
"""
if not headers:
return []
if len(headers) == 1:
return headers[0]
# Determine max columns across all header rows
max_cols = max(len(row) for row in headers) if headers else 0
combined = [""] * max_cols
for col in range(max_cols):
# Collect all values for this column
values = []
for header_row in headers:
if col < len(header_row) and header_row[col].strip():
values.append(header_row[col].strip())
if values:
# Prioritize date-like values over generic terms
date_values = [v for v in values if self._looks_like_date(v)]
if date_values:
combined[col] = date_values[0]
elif len(values) == 1:
combined[col] = values[0]
else:
# Skip generic terms like "Year Ended" if we have something more specific
specific_values = [v for v in values
if v.lower() not in {'year ended', 'years ended', 'period ended'}]
combined[col] = specific_values[0] if specific_values else values[0]
return combined
def _looks_like_date(self, text: str) -> bool:
"""Quick date detection for header processing."""
if not text or len(text) < 4:
return False
text_lower = text.lower().replace('\n', ' ').strip()
# Common date indicators
date_indicators = [
'january', 'february', 'march', 'april', 'may', 'june',
'july', 'august', 'september', 'october', 'november', 'december',
'20', '19', # Year prefixes
]
return any(indicator in text_lower for indicator in date_indicators) and \
any(c.isdigit() for c in text)
def _identify_meaningful_columns(self, all_rows: List[List[str]], max_cols: int) -> List[int]:
"""
Identify columns that contain meaningful content (not just spacing).
Returns:
List of column indices that have meaningful content
"""
column_scores = []
for col_idx in range(max_cols):
content_score = 0
total_rows = 0
# Score each column based on content quality
for row in all_rows:
if col_idx < len(row):
total_rows += 1
cell_content = str(row[col_idx]).strip()
if cell_content:
# Higher score for longer, more substantial content
if len(cell_content) >= 3: # Substantial content
content_score += 3
elif len(cell_content) == 2 and cell_content.isalnum():
content_score += 2
elif len(cell_content) == 1 and (cell_content.isalnum() or cell_content == '$'):
content_score += 1
# Skip single spaces, dashes, or other likely spacing characters
# Calculate average score per row for this column
avg_score = content_score / max(total_rows, 1)
column_scores.append((col_idx, avg_score, content_score))
# Sort by score descending
column_scores.sort(key=lambda x: x[1], reverse=True)
# Take columns with meaningful content (score >= 0.5 or among top columns)
meaningful_columns = []
for col_idx, avg_score, total_score in column_scores:
# Include if it has good average score or significant total content
if avg_score >= 0.5 or total_score >= 5:
meaningful_columns.append(col_idx)
# Limit to reasonable number of columns for readability
if len(meaningful_columns) >= 8:
break
# Sort by original column order
meaningful_columns.sort()
return meaningful_columns
def _filter_row_to_columns(self, row: List[str], column_indices: List[int]) -> List[str]:
"""
Filter a row to only include the specified column indices.
Args:
row: Original row data
column_indices: List of column indices to keep
Returns:
Filtered row with only the specified columns
"""
if not row:
return []
filtered_row = []
for col_idx in column_indices:
if col_idx < len(row):
filtered_row.append(row[col_idx])
else:
filtered_row.append("") # Missing column
return filtered_row
    def _merge_related_columns(self, headers: List[str], rows: List[List[str]]) -> tuple:
        """
        Merge related columns (e.g., currency symbols with their amounts).

        Operates on copies: the caller's headers/rows are left untouched.
        Merges are applied right-to-left so earlier pair indices stay valid
        while columns are popped.
        NOTE(review): overlapping pairs such as (0,1) and (1,2) can both be
        detected before any merge is applied -- verify that is intended.

        Returns:
            Tuple of (merged_headers, merged_rows)
        """
        if not rows or not any(rows):
            return headers, rows
        # Find columns that should be merged
        merge_pairs = []
        max_cols = max(len(row) for row in [headers] + rows if row) if rows else len(headers) if headers else 0
        for col_idx in range(max_cols - 1):
            # Check if this column and the next should be merged
            should_merge = self._should_merge_columns(headers, rows, col_idx, col_idx + 1)
            if should_merge:
                merge_pairs.append((col_idx, col_idx + 1))
        # Apply merges (from right to left to avoid index shifting)
        merged_headers = headers[:] if headers else []
        merged_rows = [row[:] for row in rows]
        for left_idx, right_idx in reversed(merge_pairs):
            # Merge headers
            if merged_headers and left_idx < len(merged_headers) and right_idx < len(merged_headers):
                left_header = merged_headers[left_idx].strip()
                right_header = merged_headers[right_idx].strip()
                merged_header = f"{left_header} {right_header}".strip()
                merged_headers[left_idx] = merged_header
                merged_headers.pop(right_idx)
            # Merge rows
            for row in merged_rows:
                if left_idx < len(row) and right_idx < len(row):
                    left_cell = str(row[left_idx]).strip()
                    right_cell = str(row[right_idx]).strip()
                    # Smart merging based on content
                    if left_cell == '$' and right_cell:
                        merged_cell = f"${right_cell}"
                    elif left_cell and right_cell:
                        merged_cell = f"{left_cell} {right_cell}"
                    else:
                        merged_cell = left_cell or right_cell
                    row[left_idx] = merged_cell
                    if right_idx < len(row):
                        row.pop(right_idx)
        return merged_headers, merged_rows
def _should_merge_columns(self, headers: List[str], rows: List[List[str]], left_idx: int, right_idx: int) -> bool:
"""
Determine if two adjacent columns should be merged.
Returns:
True if columns should be merged
"""
# Check if left column is mostly currency symbols
currency_count = 0
total_count = 0
for row in rows:
if left_idx < len(row) and right_idx < len(row):
total_count += 1
left_cell = str(row[left_idx]).strip()
right_cell = str(row[right_idx]).strip()
# If left is '$' and right is a number, they should be merged
if left_cell == '$' and right_cell and (right_cell.replace(',', '').replace('.', '').isdigit()):
currency_count += 1
# If most rows have currency symbol + number pattern, merge them
if total_count > 0 and currency_count / total_count >= 0.5:
return True
# Check for other merge patterns (e.g., empty left column with content right column)
empty_left_count = 0
for row in rows:
if left_idx < len(row) and right_idx < len(row):
left_cell = str(row[left_idx]).strip()
right_cell = str(row[right_idx]).strip()
if not left_cell and right_cell:
empty_left_count += 1
# If left column is mostly empty, consider merging
if total_count > 0 and empty_left_count / total_count >= 0.7:
return True
return False
def _calculate_column_widths(self, all_rows: List[List[str]], max_cols: int) -> List[int]:
"""Calculate optimal column widths based on content."""
col_widths = [self.style.min_col_width] * max_cols
# Find the maximum content width for each column
for row in all_rows:
for col_idx in range(min(len(row), max_cols)):
content = str(row[col_idx]) if row[col_idx] else ""
# Handle multi-line content
max_line_width = max((len(line) for line in content.split('\n')), default=0)
content_width = max_line_width + (self.style.padding * 2)
# Apply limits
content_width = min(content_width, self.style.max_col_width)
col_widths[col_idx] = max(col_widths[col_idx], content_width)
return col_widths
def _detect_alignments(self, all_rows: List[List[str]], max_cols: int) -> List[Alignment]:
"""Detect appropriate alignment for each column based on content."""
alignments = [Alignment.LEFT] * max_cols
for col_idx in range(max_cols):
# Analyze column content (skip header row if present)
data_rows = all_rows[1:] if len(all_rows) > 1 else all_rows
numeric_count = 0
total_count = 0
for row in data_rows:
if col_idx < len(row) and row[col_idx].strip():
total_count += 1
content = row[col_idx].strip()
# Check if content looks numeric (currency, percentages, numbers)
if self._looks_numeric(content):
numeric_count += 1
# If most values in column are numeric, right-align
if total_count > 0 and numeric_count / total_count >= 0.7:
alignments[col_idx] = Alignment.RIGHT
return alignments
def _looks_numeric(self, text: str) -> bool:
"""Check if text content looks numeric."""
if not text:
return False
# Remove common formatting characters
clean_text = text.replace(',', '').replace('$', '').replace('%', '').replace('(', '').replace(')', '').strip()
# Handle negative numbers in parentheses
if text.strip().startswith('(') and text.strip().endswith(')'):
clean_text = text.strip()[1:-1].replace(',', '').replace('$', '').strip()
# Check if remaining text is numeric
try:
float(clean_text)
return True
except ValueError:
return False
    def _build_table(self, headers: List[List[str]], rows: List[List[str]],
                     col_widths: List[int], alignments: List[Alignment]) -> str:
        """
        Build the final table string.

        Blank header/data rows are dropped. The separator line under the
        header is emitted only when the active style has a non-empty
        header_separator (e.g. TableStyle.simple() suppresses it).

        Args:
            headers: List of header rows (can be multiple rows for multi-row headers)
            rows: List of data rows
            col_widths: Column widths
            alignments: Column alignments

        Returns:
            Newline-joined table text
        """
        lines = []
        # Header rows (can be multiple)
        if headers:
            for header_row in headers:
                # Only add header rows with meaningful content
                if any(cell.strip() for cell in header_row):
                    # Handle multi-line cells in header rows
                    formatted_lines = self._format_multiline_row(header_row, col_widths, alignments)
                    lines.extend(formatted_lines)
            # Header separator (after all header rows)
            if self.style.header_separator:
                sep_line = self._create_separator_line(col_widths)
                lines.append(sep_line)
        # Data rows
        for row in rows:
            # Only add rows with meaningful content
            if any(cell.strip() for cell in row):
                row_line = self._format_row(row, col_widths, alignments)
                lines.append(row_line)
        return '\n'.join(lines)
def _format_row(self, row: List[str], col_widths: List[int],
alignments: List[Alignment]) -> str:
"""Format a single row with proper alignment and padding."""
cells = []
border = self.style.border_char
for col_idx, width in enumerate(col_widths):
# Get cell content
content = str(row[col_idx]) if col_idx < len(row) else ""
# Handle multi-line content (take first line only for table)
if '\n' in content:
content = content.split('\n')[0]
content = content.strip()
# Calculate available width for content
available_width = width - (self.style.padding * 2)
# Truncate if too long
if len(content) > available_width:
content = content[:available_width-3] + "..."
# Apply alignment
alignment = alignments[col_idx] if col_idx < len(alignments) else Alignment.LEFT
if alignment == Alignment.RIGHT:
aligned_content = content.rjust(available_width)
elif alignment == Alignment.CENTER:
aligned_content = content.center(available_width)
else: # LEFT
aligned_content = content.ljust(available_width)
# Add padding
padded_cell = ' ' * self.style.padding + aligned_content + ' ' * self.style.padding
cells.append(padded_cell)
# Join with borders
if border:
return border + border.join(cells) + border
else:
return ' '.join(cells)
def _format_multiline_row(self, row: List[str], col_widths: List[int],
alignments: List[Alignment]) -> List[str]:
"""
Format a row that may contain multi-line cells (cells with \n characters).
Returns a list of formatted lines, one for each line of text in the cells.
"""
# Split each cell by newlines
cell_lines = []
max_lines = 1
for col_idx, content in enumerate(row):
lines = content.split('\n') if content else ['']
cell_lines.append(lines)
max_lines = max(max_lines, len(lines))
# Build output lines
output_lines = []
for line_idx in range(max_lines):
# Build row for this line
current_row = []
for col_idx in range(len(row)):
# Get the line for this cell, or empty string if this cell has fewer lines
if line_idx < len(cell_lines[col_idx]):
current_row.append(cell_lines[col_idx][line_idx])
else:
current_row.append('')
# Format this line
formatted_line = self._format_row(current_row, col_widths, alignments)
output_lines.append(formatted_line)
return output_lines
def _create_separator_line(self, col_widths: List[int]) -> str:
"""
Create header separator line.
For bordered styles: |-------|-------|
For borderless styles: ─────────────── (full width horizontal line)
"""
sep_char = self.style.header_separator
border = self.style.border_char
if not sep_char:
# No separator at all (minimal style)
return ""
if border:
# Bordered style: create separator matching column widths
separators = []
for width in col_widths:
separators.append(sep_char * width)
return border + border.join(separators) + border
else:
# Borderless style (simple): single horizontal line across full width
# Calculate total width: sum of column widths + gaps between columns
total_width = sum(col_widths) + (len(col_widths) - 1) * 2 # 2-space gaps
# Add leading space for indentation (matching row indentation)
return " " + sep_char * total_width
# Factory functions for easy usage
def create_fast_renderer(style: str = "pipe") -> FastTableRenderer:
    """
    Build a FastTableRenderer configured with a named preset style.

    Args:
        style: Preset name ("pipe" or "minimal"); any other value falls
            back to the pipe style.

    Returns:
        A FastTableRenderer using the requested preset.
    """
    preset = TableStyle.minimal() if style == "minimal" else TableStyle.pipe_table()
    return FastTableRenderer(preset)
def render_table_fast(table_node, style: str = "pipe") -> str:
    """
    One-shot helper: render a TableNode with a preset style.

    Args:
        table_node: TableNode instance to render.
        style: Preset name ("pipe", "minimal").

    Returns:
        The formatted table string.
    """
    return create_fast_renderer(style).render_table_node(table_node)

View File

@@ -0,0 +1,613 @@
"""
Markdown renderer for parsed documents.
"""
from typing import List, Optional, Dict, Set
from edgar.documents.document import Document
from edgar.documents.nodes import Node, TextNode, HeadingNode, ParagraphNode, ListNode, ListItemNode
from edgar.documents.table_nodes import TableNode
class MarkdownRenderer:
    """
    Renders parsed documents to Markdown format.

    Features:
    - Preserves document structure
    - Handles tables with proper formatting
    - Supports nested lists
    - Includes metadata annotations
    - Configurable output options
    """

    def __init__(self,
                 include_metadata: bool = False,
                 include_toc: bool = False,
                 max_heading_level: int = 6,
                 table_format: str = 'pipe',
                 wrap_width: Optional[int] = None):
        """
        Initialize markdown renderer.

        Args:
            include_metadata: Include metadata annotations
            include_toc: Generate table of contents
            max_heading_level: Maximum heading level to render
            table_format: Table format ('pipe', 'grid', 'simple')
            wrap_width: Wrap text at specified width
        """
        self.include_metadata = include_metadata
        self.include_toc = include_toc
        self.max_heading_level = max_heading_level
        self.table_format = table_format
        self.wrap_width = wrap_width
        # Track state during rendering
        self._toc_entries: List[tuple] = []   # (level, text, node_id) per rendered heading
        self._rendered_ids: Set[str] = set()  # ids already emitted (guards shared nodes)
        self._list_depth = 0                  # current list nesting depth
        self._in_table = False                # suppresses escaping/newline joins inside tables

    def render(self, document: Document) -> str:
        """
        Render document to Markdown.

        Args:
            document: Document to render

        Returns:
            Markdown formatted text
        """
        self._reset_state()
        parts = []
        # Add metadata header if requested
        if self.include_metadata:
            parts.append(self._render_metadata(document))
            parts.append("")
        # Placeholder for TOC (replaced after rendering, once headings are known)
        if self.include_toc:
            toc_placeholder = "<!-- TOC -->"
            parts.append(toc_placeholder)
            parts.append("")
        # Render document content
        content = self._render_node(document.root)
        parts.append(content)
        # Join parts
        markdown = "\n".join(parts)
        # Replace TOC placeholder. Fix: when the document produced no heading
        # entries the placeholder is removed instead of leaking "<!-- TOC -->"
        # into the output.
        if self.include_toc:
            toc = self._generate_toc() if self._toc_entries else ""
            markdown = markdown.replace(toc_placeholder, toc)
        return markdown.strip()

    def render_node(self, node: Node) -> str:
        """
        Render a specific node to Markdown.

        Args:
            node: Node to render

        Returns:
            Markdown formatted text
        """
        self._reset_state()
        return self._render_node(node)

    def _reset_state(self):
        """Reset renderer state."""
        self._toc_entries = []
        self._rendered_ids = set()
        self._list_depth = 0
        self._in_table = False

    def _render_node(self, node: Node) -> str:
        """Render a node and its children."""
        # Skip if already rendered (handles shared nodes)
        if node.id in self._rendered_ids:
            return ""
        self._rendered_ids.add(node.id)
        # Dispatch based on node type
        if isinstance(node, HeadingNode):
            return self._render_heading(node)
        elif isinstance(node, ParagraphNode):
            return self._render_paragraph(node)
        elif isinstance(node, TextNode):
            return self._render_text(node)
        elif isinstance(node, TableNode):
            return self._render_table(node)
        elif isinstance(node, ListNode):
            return self._render_list(node)
        elif isinstance(node, ListItemNode):
            return self._render_list_item(node)
        else:
            # Default: render children
            return self._render_children(node)

    def _render_heading(self, node: HeadingNode) -> str:
        """Render heading node."""
        # Limit heading level
        level = min(node.level, self.max_heading_level)
        # Get heading text
        text = node.text().strip()
        if not text:
            return ""
        # Add to TOC
        if self.include_toc:
            self._toc_entries.append((level, text, node.id))
        # Create markdown heading
        markdown = "#" * level + " " + text
        # Add metadata if requested
        if self.include_metadata and node.metadata:
            metadata = self._format_metadata(node.metadata)
            if metadata:
                markdown += f" <!-- {metadata} -->"
        # Add children content
        children_content = self._render_children(node)
        if children_content:
            markdown += "\n\n" + children_content
        return markdown

    def _render_paragraph(self, node: ParagraphNode) -> str:
        """Render paragraph node."""
        # Get paragraph content
        content = self._render_children(node).strip()
        if not content:
            return ""
        # Wrap if requested
        if self.wrap_width:
            content = self._wrap_text(content, self.wrap_width)
        # Add metadata if requested
        if self.include_metadata and node.metadata:
            metadata = self._format_metadata(node.metadata)
            if metadata:
                content = f"<!-- {metadata} -->\n{content}"
        return content

    def _render_text(self, node: TextNode) -> str:
        """Render text node."""
        text = node.text()
        # Escape markdown special characters
        text = self._escape_markdown(text)
        # Apply text formatting based on style
        if node.style:
            if node.style.font_weight in ['bold', '700', '800', '900']:
                text = f"**{text}**"
            elif node.style.font_style == 'italic':
                text = f"*{text}*"
            elif node.style.text_decoration == 'underline':
                text = f"<u>{text}</u>"
        return text

    def _render_table(self, node: TableNode) -> str:
        """Render table node."""
        self._in_table = True
        parts = []
        # Add caption if present
        if node.caption:
            parts.append(f"**Table: {node.caption}**")
            parts.append("")
        # Render based on format
        if self.table_format == 'pipe':
            table_md = self._render_table_pipe(node)
        elif self.table_format == 'grid':
            table_md = self._render_table_grid(node)
        else:  # simple
            table_md = self._render_table_simple(node)
        parts.append(table_md)
        # Add metadata if requested
        if self.include_metadata and node.metadata:
            metadata = self._format_metadata(node.metadata)
            if metadata:
                parts.append(f"<!-- Table metadata: {metadata} -->")
        self._in_table = False
        return "\n".join(parts)

    def _render_table_pipe(self, node: TableNode) -> str:
        """Render table in pipe format with proper column spanning support."""
        # Handle complex SEC filing tables with column spanning
        expanded_headers, expanded_data_rows = self._expand_table_structure(node)
        # Identify and filter to meaningful columns
        content_columns = self._identify_content_columns(expanded_headers, expanded_data_rows)
        if not content_columns:
            return ""
        rows = []
        # Render headers with intelligent multi-row combination
        if expanded_headers:
            combined_headers = self._combine_multi_row_headers(expanded_headers)
            filtered_headers = [combined_headers[i] if i < len(combined_headers) else "" for i in content_columns]
            row_md = "| " + " | ".join(filtered_headers) + " |"
            rows.append(row_md)
            # Add separator
            separator = "| " + " | ".join(["---"] * len(filtered_headers)) + " |"
            rows.append(separator)
        # Render data rows
        for expanded_row in expanded_data_rows:
            filtered_row = [expanded_row[i] if i < len(expanded_row) else "" for i in content_columns]
            # Only add rows with meaningful content
            if any(cell.strip() for cell in filtered_row):
                row_md = "| " + " | ".join(filtered_row) + " |"
                rows.append(row_md)
        return "\n".join(rows)

    def _render_table_grid(self, node: TableNode) -> str:
        """Render table in grid format."""
        # Simplified grid format
        all_rows = []
        # Add headers
        if node.headers:
            for header_row in node.headers:
                cells = [cell.text() for cell in header_row]
                all_rows.append(" | ".join(cells))
        # Add data rows
        for row in node.rows:
            cells = [cell.text() for cell in row.cells]
            all_rows.append(" | ".join(cells))
        if all_rows:
            # Add borders sized to the widest rendered row
            max_width = max(len(row) for row in all_rows)
            border = "+" + "-" * (max_width + 2) + "+"
            result = [border]
            for row in all_rows:
                result.append(f"| {row:<{max_width}} |")
            result.append(border)
            return "\n".join(result)
        return ""

    def _render_table_simple(self, node: TableNode) -> str:
        """Render table in simple format."""
        rows = []
        # Add headers
        if node.headers:
            for header_row in node.headers:
                cells = [cell.text() for cell in header_row]
                rows.append("  ".join(cells))
        # Add separator if we have headers
        if node.headers and node.rows:
            rows.append("")
        # Add data rows
        for row in node.rows:
            cells = [cell.text() for cell in row.cells]
            rows.append("  ".join(cells))
        return "\n".join(rows)

    def _render_list(self, node: ListNode) -> str:
        """Render list node."""
        self._list_depth += 1
        items = []
        for child in node.children:
            if isinstance(child, ListItemNode):
                item_md = self._render_list_item(child)
                if item_md:
                    items.append(item_md)
        self._list_depth -= 1
        return "\n".join(items)

    def _render_list_item(self, node: ListItemNode) -> str:
        """Render list item node."""
        # Determine bullet/number
        if node.parent and hasattr(node.parent, 'ordered') and node.parent.ordered:
            # Ordered list. Fix: number among ListItemNode siblings only, so
            # interleaved non-item children don't skew the numbering.
            item_siblings = [c for c in node.parent.children if isinstance(c, ListItemNode)]
            index = item_siblings.index(node) + 1
            marker = f"{index}."
        else:
            # Unordered list: cycle bullet style with nesting depth
            markers = ['*', '-', '+']
            marker = markers[(self._list_depth - 1) % len(markers)]
        # Indentation
        indent = " " * (self._list_depth - 1)
        # Get content
        content = self._render_children(node).strip()
        # Format item
        if '\n' in content:
            # Multi-line content
            lines = content.split('\n')
            result = indent + marker + " " + lines[0]
            for line in lines[1:]:
                result += "\n" + indent + " " + line
            return result
        else:
            # Single line
            return indent + marker + " " + content

    def _render_children(self, node: Node) -> str:
        """Render all children of a node."""
        parts = []
        for child in node.children:
            child_md = self._render_node(child)
            if child_md:
                parts.append(child_md)
        # Join with appropriate separator: block-level children are separated
        # by blank lines, inline content by single spaces.
        if self._in_table:
            return " ".join(parts)
        elif any(isinstance(child, (HeadingNode, ParagraphNode, TableNode, ListNode))
                 for child in node.children):
            return "\n\n".join(parts)
        else:
            return " ".join(parts)

    def _render_metadata(self, document: Document) -> str:
        """Render document metadata as a YAML-style front-matter block."""
        lines = ["---"]
        if document.metadata.company:
            lines.append(f"company: {document.metadata.company}")
        if document.metadata.form:
            lines.append(f"form: {document.metadata.form}")
        if document.metadata.filing_date:
            lines.append(f"filing_date: {document.metadata.filing_date}")
        if document.metadata.cik:
            lines.append(f"cik: {document.metadata.cik}")
        if document.metadata.accession_number:
            lines.append(f"accession_number: {document.metadata.accession_number}")
        lines.append("---")
        return "\n".join(lines)

    def _generate_toc(self) -> str:
        """Generate table of contents."""
        lines = ["## Table of Contents", ""]
        for level, text, node_id in self._toc_entries:
            # Create anchor link
            anchor = self._create_anchor(text)
            # Indentation based on level
            indent = " " * (level - 1)
            # Add TOC entry
            lines.append(f"{indent}- [{text}](#{anchor})")
        return "\n".join(lines)

    def _create_anchor(self, text: str) -> str:
        """Create anchor from heading text."""
        # Convert to lowercase and replace spaces with hyphens
        anchor = text.lower()
        anchor = anchor.replace(' ', '-')
        # Remove special characters
        import re
        anchor = re.sub(r'[^a-z0-9\-]', '', anchor)
        # Remove multiple hyphens
        anchor = re.sub(r'-+', '-', anchor)
        return anchor.strip('-')

    def _format_metadata(self, metadata: Dict) -> str:
        """Format metadata for display inside an HTML comment."""
        parts = []
        for key, value in metadata.items():
            if key == 'semantic_type':
                parts.append(f"type:{value}")
            elif key == 'section':
                parts.append(f"section:{value}")
            elif key == 'ix_tag':
                parts.append(f"xbrl:{value}")
            else:
                parts.append(f"{key}:{value}")
        return " ".join(parts)

    def _escape_markdown(self, text: str) -> str:
        """Escape markdown special characters."""
        # Don't escape in tables
        if self._in_table:
            return text
        # Escape special characters ('\\' first so later escapes aren't doubled)
        for char in ['\\', '`', '*', '_', '{', '}', '[', ']', '(', ')', '#', '+', '-', '.', '!']:
            text = text.replace(char, '\\' + char)
        return text

    def _wrap_text(self, text: str, width: int) -> str:
        """Wrap text at specified width."""
        import textwrap
        return textwrap.fill(text, width=width, break_long_words=False)

    def _expand_table_structure(self, node: TableNode) -> tuple:
        """
        Expand table structure to handle column spanning properly.
        Returns (expanded_headers, expanded_data_rows).
        """
        # Calculate the logical column count from colspan
        max_columns = 0
        # Check all rows for maximum column span
        all_rows = []
        if node.headers:
            for header_row in node.headers:
                all_rows.append(header_row)
        for row in node.rows:
            all_rows.append(row.cells)
        for row in all_rows:
            column_count = sum(cell.colspan for cell in row)
            max_columns = max(max_columns, column_count)
        # Expand headers
        expanded_headers = []
        if node.headers:
            for header_row in node.headers:
                expanded = self._expand_row_to_columns(header_row, max_columns)
                expanded_headers.append(expanded)
        # Expand data rows
        expanded_data_rows = []
        for row in node.rows:
            expanded = self._expand_row_to_columns(row.cells, max_columns)
            expanded_data_rows.append(expanded)
        return expanded_headers, expanded_data_rows

    def _expand_row_to_columns(self, cells: List, target_columns: int) -> List[str]:
        """Expand a row with colspan cells to match the target column count."""
        expanded = []
        current_column = 0
        for cell in cells:
            cell_text = cell.text().strip()
            # Add the cell content
            expanded.append(cell_text)
            current_column += 1
            # Add empty cells for remaining colspan
            for _ in range(cell.colspan - 1):
                if current_column < target_columns:
                    expanded.append("")
                    current_column += 1
        # Pad to target column count if needed
        while len(expanded) < target_columns:
            expanded.append("")
        return expanded[:target_columns]

    def _identify_content_columns(self, expanded_headers: List[List[str]],
                                  expanded_data_rows: List[List[str]]) -> List[int]:
        """Identify which columns actually contain meaningful content."""
        if not expanded_headers and not expanded_data_rows:
            return []
        # Get the column count
        max_cols = 0
        if expanded_headers:
            max_cols = max(max_cols, max(len(row) for row in expanded_headers))
        if expanded_data_rows:
            max_cols = max(max_cols, max(len(row) for row in expanded_data_rows))
        content_columns = []
        for col in range(max_cols):
            has_content = False
            # Check headers
            for header_row in expanded_headers:
                if col < len(header_row) and header_row[col].strip():
                    has_content = True
                    break
            # Check data rows
            if not has_content:
                for data_row in expanded_data_rows:
                    if col < len(data_row) and data_row[col].strip():
                        has_content = True
                        break
            if has_content:
                content_columns.append(col)
        return content_columns

    def _combine_multi_row_headers(self, header_rows: List[List[str]]) -> List[str]:
        """
        Combine multi-row headers intelligently for SEC filing tables.
        Prioritizes specific dates/periods over generic labels.
        """
        if not header_rows:
            return []
        num_columns = len(header_rows[0])
        combined = [""] * num_columns
        for col in range(num_columns):
            # Collect all values for this column across header rows
            column_values = []
            for row in header_rows:
                if col < len(row) and row[col].strip():
                    column_values.append(row[col].strip())
            if column_values:
                # Prioritize date-like values over generic labels
                date_values = [v for v in column_values if self._looks_like_date(v)]
                if date_values:
                    # Clean up line breaks in dates
                    combined[col] = date_values[0].replace('\n', ' ')
                elif len(column_values) == 1:
                    combined[col] = column_values[0].replace('\n', ' ')
                else:
                    # Skip generic terms like "Year Ended" if we have something more specific
                    specific_values = [v for v in column_values
                                       if v.lower() not in ['year ended', 'years ended']]
                    if specific_values:
                        combined[col] = specific_values[0].replace('\n', ' ')
                    else:
                        combined[col] = column_values[0].replace('\n', ' ')
        return combined

    def _looks_like_date(self, text: str) -> bool:
        """Check if text looks like a date."""
        import re
        # Common date patterns in SEC filings
        date_patterns = [
            r'\b(January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},?\s*\d{4}',
            r'\d{1,2}/\d{1,2}/\d{4}',
            r'\d{4}-\d{2}-\d{2}',
            r'^\d{4}$',  # Just a year
        ]
        text_clean = text.replace('\n', ' ').strip()
        for pattern in date_patterns:
            if re.search(pattern, text_clean, re.IGNORECASE):
                return True
        return False

View File

@@ -0,0 +1,51 @@
"""
Plain text renderer for parsed documents.
"""
from typing import Optional
from edgar.documents.document import Document
from edgar.documents.extractors.text_extractor import TextExtractor
class TextRenderer:
    """
    Plain-text renderer for parsed documents.

    Thin wrapper around TextExtractor so the plain-text output path mirrors
    the other renderer classes.
    """

    def __init__(self,
                 clean: bool = True,
                 include_tables: bool = True,
                 max_length: Optional[int] = None,
                 preserve_structure: bool = False):
        """
        Initialize text renderer.

        Args:
            clean: Clean and normalize text
            include_tables: Include table content
            max_length: Maximum text length
            preserve_structure: Preserve document structure
        """
        # Metadata and links are always excluded from plain-text output.
        extractor_options = dict(
            clean=clean,
            include_tables=include_tables,
            include_metadata=False,
            include_links=False,
            max_length=max_length,
            preserve_structure=preserve_structure,
        )
        self.extractor = TextExtractor(**extractor_options)

    def render(self, document: Document) -> str:
        """
        Render document to plain text.

        Args:
            document: Document to render

        Returns:
            Plain text
        """
        return self.extractor.extract(document)

View File

@@ -0,0 +1,769 @@
"""
Search functionality for parsed documents.
Provides both traditional search modes (TEXT, REGEX, SEMANTIC, XPATH) and
advanced BM25-based ranking with semantic structure awareness.
"""
import re
from dataclasses import dataclass
from enum import Enum
from typing import List, Optional, Dict, Any, TYPE_CHECKING
from edgar.documents.document import Document
from edgar.documents.nodes import Node, HeadingNode
from edgar.documents.table_nodes import TableNode
from edgar.documents.types import NodeType, SemanticType
if TYPE_CHECKING:
from edgar.documents.types import SearchResult as TypesSearchResult
class SearchMode(Enum):
    """Search modes supported by DocumentSearch."""
    TEXT = "text"          # Plain text (substring) search
    REGEX = "regex"        # Regular expression search
    SEMANTIC = "semantic"  # Semantic/structural search (e.g. "heading:Item 1")
    XPATH = "xpath"        # XPath-like search (e.g. "//h1")
@dataclass
class SearchResult:
    """Result from a search operation."""
    node: Node                     # Node containing match
    text: str                      # Matched text
    start_offset: int              # Start position in the node's full text
    end_offset: int                # End position in the node's full text
    context: Optional[str] = None  # Surrounding context (possibly a truncated window)
    score: float = 1.0             # Relevance score

    @property
    def snippet(self) -> str:
        """
        Get text snippet with the match highlighted in bold.

        Fix: start/end offsets are positions in the node's full text, while
        `context` may be a truncated window of it (with "..." markers), so the
        stored offsets can point past or beside the match. The offsets are
        only trusted when they actually line up with `context`; otherwise the
        match is located by searching for the matched text inside the context.
        """
        if self.context:
            start, end = self.start_offset, self.end_offset
            aligned = (0 <= start <= end <= len(self.context)
                       and self.context[start:end] == self.text)
            if not aligned:
                # Offsets don't line up with the context window; locate the
                # matched text directly instead.
                pos = self.context.find(self.text)
                if pos == -1:
                    return f"**{self.text}**"
                start, end = pos, pos + len(self.text)
            before = self.context[:start]
            match = self.context[start:end]
            after = self.context[end:]
            return f"{before}**{match}**{after}"
        return f"**{self.text}**"
class DocumentSearch:
"""
Search functionality for parsed documents.
Supports various search modes and options.
"""
    def __init__(self, document: Document, use_cache: bool = True):
        """
        Initialize search with document.

        Args:
            document: Document to search
            use_cache: Enable index caching for faster repeated searches (default: True)
        """
        self.document = document
        self.use_cache = use_cache
        # Lazily populated cache of ranking engines, keyed by algorithm name.
        self._ranking_engines: Dict[str, Any] = {}
        # Build the text/type/semantic indices up front so individual
        # searches are fast.
        self._build_index()
def _build_index(self):
"""Build search index for performance."""
# Text index: map text to nodes
self.text_index: Dict[str, List[Node]] = {}
# Type index: map node types to nodes
self.type_index: Dict[NodeType, List[Node]] = {}
# Semantic index: map semantic types to nodes
self.semantic_index: Dict[SemanticType, List[Node]] = {}
# Build indices
for node in self.document.root.walk():
# Text index
if hasattr(node, 'text'):
text = node.text()
if text:
text_lower = text.lower()
if text_lower not in self.text_index:
self.text_index[text_lower] = []
self.text_index[text_lower].append(node)
# Type index
if node.type not in self.type_index:
self.type_index[node.type] = []
self.type_index[node.type].append(node)
# Semantic index
if hasattr(node, 'semantic_type') and node.semantic_type:
if node.semantic_type not in self.semantic_index:
self.semantic_index[node.semantic_type] = []
self.semantic_index[node.semantic_type].append(node)
def search(self,
query: str,
mode: SearchMode = SearchMode.TEXT,
case_sensitive: bool = False,
whole_word: bool = False,
limit: Optional[int] = None,
node_types: Optional[List[NodeType]] = None,
in_section: Optional[str] = None) -> List[SearchResult]:
"""
Search document.
Args:
query: Search query
mode: Search mode
case_sensitive: Case sensitive search
whole_word: Match whole words only
limit: Maximum results to return
node_types: Limit search to specific node types
in_section: Limit search to specific section
Returns:
List of search results
"""
if mode == SearchMode.TEXT:
results = self._text_search(query, case_sensitive, whole_word)
elif mode == SearchMode.REGEX:
results = self._regex_search(query, case_sensitive)
elif mode == SearchMode.SEMANTIC:
results = self._semantic_search(query)
elif mode == SearchMode.XPATH:
results = self._xpath_search(query)
else:
raise ValueError(f"Unsupported search mode: {mode}")
# Filter by node types
if node_types:
results = [r for r in results if r.node.type in node_types]
# Filter by section
if in_section:
section_nodes = self._get_section_nodes(in_section)
results = [r for r in results if r.node in section_nodes]
# Apply limit
if limit and len(results) > limit:
results = results[:limit]
return results
def _text_search(self, query: str, case_sensitive: bool, whole_word: bool) -> List[SearchResult]:
"""Perform text search."""
results = []
# Prepare query
if not case_sensitive:
query = query.lower()
# Search only leaf nodes to avoid duplicates
for node in self.document.root.walk():
# Skip nodes with children (they aggregate child text)
if hasattr(node, 'children') and node.children:
continue
if not hasattr(node, 'text'):
continue
text = node.text()
if not text:
continue
search_text = text if case_sensitive else text.lower()
# Find all occurrences
if whole_word:
# Use word boundary regex
pattern = r'\b' + re.escape(query) + r'\b'
flags = 0 if case_sensitive else re.IGNORECASE
for match in re.finditer(pattern, text, flags):
results.append(SearchResult(
node=node,
text=match.group(),
start_offset=match.start(),
end_offset=match.end(),
context=self._get_context(text, match.start(), match.end())
))
else:
# Simple substring search
start = 0
while True:
pos = search_text.find(query, start)
if pos == -1:
break
results.append(SearchResult(
node=node,
text=text[pos:pos + len(query)],
start_offset=pos,
end_offset=pos + len(query),
context=self._get_context(text, pos, pos + len(query))
))
start = pos + 1
return results
def _regex_search(self, pattern: str, case_sensitive: bool) -> List[SearchResult]:
"""Perform regex search."""
results = []
try:
flags = 0 if case_sensitive else re.IGNORECASE
regex = re.compile(pattern, flags)
except re.error as e:
raise ValueError(f"Invalid regex pattern: {e}")
# Search only leaf nodes to avoid duplicates
for node in self.document.root.walk():
# Skip nodes with children (they aggregate child text)
if hasattr(node, 'children') and node.children:
continue
if not hasattr(node, 'text'):
continue
text = node.text()
if not text:
continue
# Find all matches
for match in regex.finditer(text):
results.append(SearchResult(
node=node,
text=match.group(),
start_offset=match.start(),
end_offset=match.end(),
context=self._get_context(text, match.start(), match.end())
))
return results
    def _semantic_search(self, query: str) -> List[SearchResult]:
        """
        Perform semantic/structural search.

        Queries use a "type:text" syntax, e.g. "heading:Item 1",
        "table:revenue", "section:risk factors". A bare query (no colon)
        defaults to searching heading text. Results are sorted by score,
        highest first.
        """
        results = []
        # Parse semantic query
        # Examples: "heading:Item 1", "table:revenue", "section:risk factors"
        if ':' in query:
            search_type, search_text = query.split(':', 1)
            search_type = search_type.lower().strip()
            search_text = search_text.strip()
        else:
            # Default to text search in headings
            search_type = 'heading'
            search_text = query
        if search_type == 'heading':
            # Search headings (case-insensitive substring match on the title)
            for node in self.type_index.get(NodeType.HEADING, []):
                if isinstance(node, HeadingNode):
                    heading_text = node.text()
                    if heading_text and search_text.lower() in heading_text.lower():
                        results.append(SearchResult(
                            node=node,
                            text=heading_text,
                            start_offset=0,
                            end_offset=len(heading_text),
                            score=self._calculate_heading_score(node)
                        ))
        elif search_type == 'table':
            # Search tables by their full rendered content
            for node in self.type_index.get(NodeType.TABLE, []):
                if isinstance(node, TableNode):
                    # Search in table content
                    table_text = node.text()
                    if table_text and search_text.lower() in table_text.lower():
                        results.append(SearchResult(
                            node=node,
                            text=f"Table: {node.caption or 'Untitled'}",
                            start_offset=0,
                            end_offset=len(table_text),
                            # Keep the context preview to 200 characters
                            context=table_text[:200] + "..." if len(table_text) > 200 else table_text
                        ))
        elif search_type == 'section':
            # Search sections by name
            sections = self.document.sections
            for section_name, section in sections.items():
                if search_text.lower() in section_name.lower():
                    results.append(SearchResult(
                        node=section.node,
                        text=section.title,
                        start_offset=section.start_offset,
                        end_offset=section.end_offset,
                        score=2.0  # Boost section matches
                    ))
        # Sort by score
        results.sort(key=lambda r: r.score, reverse=True)
        return results
    def _xpath_search(self, xpath: str) -> List[SearchResult]:
        """
        Perform XPath-like search.

        Supports a small XPath subset: a tag selector plus at most one
        predicate, e.g. "//h1", "//table[@class='financial']",
        "//p[contains(text(),'revenue')]".

        Raises:
            ValueError: If the expression does not start with "//tag".
        """
        results = []
        # Simple XPath parser
        # Examples: "//h1", "//table[@class='financial']", "//p[contains(text(),'revenue')]"
        # Extract tag name
        tag_match = re.match(r'//(\w+)', xpath)
        if not tag_match:
            raise ValueError(f"Invalid XPath: {xpath}")
        tag_name = tag_match.group(1).lower()
        # Map tag to node type (all heading tags collapse to HEADING)
        tag_to_type = {
            'h1': NodeType.HEADING,
            'h2': NodeType.HEADING,
            'h3': NodeType.HEADING,
            'h4': NodeType.HEADING,
            'h5': NodeType.HEADING,
            'h6': NodeType.HEADING,
            'p': NodeType.PARAGRAPH,
            'table': NodeType.TABLE,
            'section': NodeType.SECTION
        }
        node_type = tag_to_type.get(tag_name)
        if not node_type:
            # Unknown tag: return no matches rather than raising
            return results
        # Get nodes of type
        nodes = self.type_index.get(node_type, [])
        # Apply filters
        if '[' in xpath:
            # Extract the first bracketed condition
            condition_match = re.search(r'\[(.*?)\]', xpath)
            if condition_match:
                condition = condition_match.group(1)
                nodes = self._apply_xpath_condition(nodes, condition)
        # Create results (text preview capped at 100 characters)
        for node in nodes:
            text = node.text() if hasattr(node, 'text') else str(node)
            results.append(SearchResult(
                node=node,
                text=text[:100] + "..." if len(text) > 100 else text,
                start_offset=0,
                end_offset=len(text)
            ))
        return results
def _apply_xpath_condition(self, nodes: List[Node], condition: str) -> List[Node]:
"""Apply XPath condition to filter nodes."""
filtered = []
# Parse condition
if condition.startswith('@'):
# Attribute condition
attr_match = re.match(r'@(\w+)=["\']([^"\']+)["\']', condition)
if attr_match:
attr_name, attr_value = attr_match.groups()
for node in nodes:
if node.metadata.get(attr_name) == attr_value:
filtered.append(node)
elif 'contains(text()' in condition:
# Text contains condition
text_match = re.search(r'contains\(text\(\),\s*["\']([^"\']+)["\']\)', condition)
if text_match:
search_text = text_match.group(1).lower()
for node in nodes:
if hasattr(node, 'text'):
node_text = node.text()
if node_text and search_text in node_text.lower():
filtered.append(node)
else:
# Level condition for headings
try:
level = int(condition)
for node in nodes:
if isinstance(node, HeadingNode) and node.level == level:
filtered.append(node)
except ValueError:
pass
return filtered
def _get_context(self, text: str, start: int, end: int, context_size: int = 50) -> str:
"""Get context around match."""
# Calculate context boundaries
context_start = max(0, start - context_size)
context_end = min(len(text), end + context_size)
# Get context
context = text[context_start:context_end]
# Add ellipsis if truncated
if context_start > 0:
context = "..." + context
if context_end < len(text):
context = context + "..."
# Adjust offsets for context
if context_start > 0:
start = start - context_start + 3 # Account for "..."
end = end - context_start + 3
else:
start = start - context_start
end = end - context_start
return context
def _calculate_heading_score(self, heading: HeadingNode) -> float:
"""Calculate relevance score for heading."""
# Higher level headings get higher scores
base_score = 7 - heading.level # H1=6, H2=5, etc.
# Boost section headers
if heading.semantic_type == SemanticType.SECTION_HEADER:
base_score *= 1.5
return base_score
def _get_section_nodes(self, section_name: str) -> List[Node]:
"""Get all nodes in a section."""
nodes = []
sections = self.document.sections
if section_name in sections:
section = sections[section_name]
# Get all nodes in section
for node in section.node.walk():
nodes.append(node)
return nodes
def find_tables(self,
caption_pattern: Optional[str] = None,
min_rows: Optional[int] = None,
min_cols: Optional[int] = None) -> List[TableNode]:
"""
Find tables matching criteria.
Args:
caption_pattern: Regex pattern for caption
min_rows: Minimum number of rows
min_cols: Minimum number of columns
Returns:
List of matching tables
"""
tables = []
for node in self.type_index.get(NodeType.TABLE, []):
if not isinstance(node, TableNode):
continue
# Check caption
if caption_pattern and node.caption:
if not re.search(caption_pattern, node.caption, re.IGNORECASE):
continue
# Check dimensions
if min_rows and node.row_count < min_rows:
continue
if min_cols and node.col_count < min_cols:
continue
tables.append(node)
return tables
def find_headings(self,
                  level: Optional[int] = None,
                  pattern: Optional[str] = None) -> List[HeadingNode]:
    """
    Find headings matching the given criteria.

    Args:
        level: Heading level (1-6); None matches any level.
        pattern: Case-insensitive regex applied to the heading text.

    Returns:
        List of matching headings.
    """
    # Compile the text filter once instead of per heading.
    text_rx = re.compile(pattern, re.IGNORECASE) if pattern else None
    matched = []
    for candidate in self.type_index.get(NodeType.HEADING, []):
        if not isinstance(candidate, HeadingNode):
            continue
        if level and candidate.level != level:
            continue
        if text_rx is not None:
            heading_text = candidate.text()
            if not heading_text or not text_rx.search(heading_text):
                continue
        matched.append(candidate)
    return matched
def ranked_search(self,
                  query: str,
                  algorithm: str = "hybrid",
                  top_k: int = 10,
                  node_types: Optional[List[NodeType]] = None,
                  in_section: Optional[str] = None,
                  boost_sections: Optional[List[str]] = None) -> List['TypesSearchResult']:
    """
    Advanced search with BM25-based ranking and semantic structure awareness.

    Provides relevance-ranked results better suited for financial documents
    than simple substring matching: BM25 exact-term matching combined with
    semantic structure boosting. Engine construction and caching are
    delegated to _get_ranking_engine.

    Args:
        query: Search query.
        algorithm: Ranking algorithm ("bm25", "hybrid", "semantic").
        top_k: Maximum results to return.
        node_types: Limit search to specific node types.
        in_section: Limit search to a specific named section.
        boost_sections: Section names to boost (e.g., ["Risk Factors"]).

    Returns:
        List of SearchResult objects with relevance scores (from types.py).

    Examples:
        >>> searcher = DocumentSearch(document)
        >>> results = searcher.ranked_search("revenue growth", algorithm="hybrid", top_k=5)
        >>> for result in results:
        ...     print(f"Score: {result.score:.3f}")
        ...     print(f"Text: {result.snippet}")
    """
    # Fix: BM25Engine/HybridEngine/SemanticEngine were imported here but
    # never used — engine selection happens in _get_ranking_engine. Only
    # the result type is needed in this method.
    from edgar.documents.types import SearchResult as TypesSearchResult

    # Rank leaf nodes only, so parent containers don't duplicate their
    # children's text in the index.
    nodes = []
    for node in self.document.root.walk():
        if hasattr(node, 'children') and node.children:
            continue  # skip parent nodes
        if hasattr(node, 'text'):
            text = node.text()
            if text and text.strip():
                nodes.append(node)

    # Optional filters on type and section membership.
    if node_types:
        nodes = [n for n in nodes if n.type in node_types]
    if in_section:
        section_nodes = self._get_section_nodes(in_section)
        nodes = [n for n in nodes if n in section_nodes]
    if not nodes:
        return []

    # Select ranking engine (with caching) and rank.
    engine = self._get_ranking_engine(algorithm.lower(), nodes, boost_sections)
    ranked_results = engine.rank(query, nodes)

    # Convert to types.SearchResult format and attach section context.
    search_results = []
    for ranked in ranked_results[:top_k]:
        section_obj = self._find_node_section(ranked.node)
        search_results.append(TypesSearchResult(
            node=ranked.node,
            score=ranked.score,
            snippet=ranked.snippet,
            section=section_obj.name if section_obj else None,
            context=ranked.text if len(ranked.text) <= 500 else ranked.text[:497] + "...",
            _section_obj=section_obj  # agent navigation support
        ))
    return search_results
def _get_ranking_engine(self, algorithm: str, nodes: List[Node],
                        boost_sections: Optional[List[str]] = None):
    """
    Get or create a ranking engine, with two levels of caching.

    Lookup order: (1) per-instance cache keyed by document+algorithm,
    reused only when the node list is unchanged; (2) for BM25 only, a
    global persistent cache of serialized index data. Cache failures are
    deliberately non-fatal: the index is simply rebuilt.

    Args:
        algorithm: Ranking algorithm ("bm25", "hybrid", "semantic")
        nodes: Nodes to index
        boost_sections: Section names to boost (for hybrid/semantic)

    Returns:
        Ready-to-use ranking engine

    Raises:
        ValueError: If `algorithm` is not one of the supported names.
    """
    from edgar.documents.ranking.ranking import (
        BM25Engine,
        HybridEngine,
        SemanticEngine
    )
    from edgar.documents.ranking.cache import get_search_cache, CacheEntry
    from datetime import datetime
    # Create cache key
    # Use document ID, algorithm, and sample of first node for stability
    content_sample = nodes[0].text()[:200] if nodes and hasattr(nodes[0], 'text') else ""
    cache_key = f"{self.document.accession_number if hasattr(self.document, 'accession_number') else id(self.document)}_{algorithm}"
    # Check instance cache first (for same search session)
    if cache_key in self._ranking_engines:
        engine, cached_nodes = self._ranking_engines[cache_key]
        # Verify nodes haven't changed (list equality; a different filter
        # produces a different node list and forces a rebuild)
        if cached_nodes == nodes:
            return engine
    # Create engine based on algorithm
    if algorithm == "bm25":
        engine = BM25Engine()
    elif algorithm == "hybrid":
        engine = HybridEngine(boost_sections=boost_sections)
    elif algorithm == "semantic":
        engine = SemanticEngine(boost_sections=boost_sections)
    else:
        raise ValueError(f"Unsupported algorithm: {algorithm}")
    # Try to load from global cache if enabled
    if self.use_cache and algorithm == "bm25":  # Only cache BM25 for now
        search_cache = get_search_cache()
        document_hash = search_cache.compute_document_hash(
            document_id=cache_key,
            content_sample=content_sample
        )
        cached_entry = search_cache.get(document_hash)
        if cached_entry:
            # Load index from cache
            try:
                engine.load_index_data(cached_entry.index_data, nodes)
                # Cache in instance
                self._ranking_engines[cache_key] = (engine, nodes)
                return engine
            except Exception as e:
                # Cache load failed, rebuild (best-effort: fall through)
                pass
    # Build fresh index
    # For BM25/Hybrid, index is built lazily on first rank() call
    # But we can force it here and cache the result
    if self.use_cache and algorithm == "bm25":
        # Force index build by doing a dummy rank
        engine._build_index(nodes)
        # Save to global cache
        try:
            search_cache = get_search_cache()
            document_hash = search_cache.compute_document_hash(
                document_id=cache_key,
                content_sample=content_sample
            )
            index_data = engine.get_index_data()
            cache_entry = CacheEntry(
                document_hash=document_hash,
                index_data=index_data,
                created_at=datetime.now()
            )
            search_cache.put(document_hash, cache_entry)
        except Exception as e:
            # Cache save failed, not critical
            pass
    # Cache in instance
    self._ranking_engines[cache_key] = (engine, nodes)
    return engine
def get_cache_stats(self) -> Dict[str, Any]:
    """
    Get search cache statistics.

    Returns:
        Dictionary with:
            - instance_cache_entries: Number of engines cached on this instance
            - global_cache_stats: Global cache metrics (hits, misses, hit
              rate, sizes) when caching is enabled; {} otherwise

    Examples:
        >>> searcher = DocumentSearch(document)
        >>> searcher.ranked_search("revenue", algorithm="bm25")
        >>> stats = searcher.get_cache_stats()
        >>> print(f"Hit rate: {stats['global_cache_stats'].get('hit_rate', 0):.1%}")
    """
    stats: Dict[str, Any] = {
        'instance_cache_entries': len(self._ranking_engines),
        'global_cache_stats': {}
    }
    if self.use_cache:
        # Fix: import only when needed — the original imported the cache
        # module unconditionally, even with caching disabled.
        from edgar.documents.ranking.cache import get_search_cache
        stats['global_cache_stats'] = get_search_cache().get_stats()
    return stats
def clear_cache(self, memory_only: bool = False) -> None:
    """
    Clear search caches.

    Args:
        memory_only: If True, only clear in-memory caches (default: False).

    Examples:
        >>> searcher = DocumentSearch(document)
        >>> searcher.clear_cache()                 # clear everything
        >>> searcher.clear_cache(memory_only=True) # memory only
    """
    # Always drop the per-instance engine cache.
    self._ranking_engines.clear()
    # The global cache is only touched when caching is enabled.
    if not self.use_cache:
        return
    from edgar.documents.ranking.cache import get_search_cache
    get_search_cache().clear(memory_only=memory_only)
def _find_node_section(self, node: Node):
"""
Find which section a node belongs to.
Returns:
Section object or None
"""
# Walk up the tree to find section markers
current = node
while current:
# Check if any section contains this node
for section_name, section in self.document.sections.items():
# Check if node is in section's subtree
for section_node in section.node.walk():
if section_node is current or section_node is node:
return section
current = current.parent if hasattr(current, 'parent') else None
return None

View File

@@ -0,0 +1,15 @@
"""
Parsing strategies for different content types.
"""
from edgar.documents.strategies.document_builder import DocumentBuilder
from edgar.documents.strategies.header_detection import HeaderDetectionStrategy
from edgar.documents.strategies.table_processing import TableProcessor
from edgar.documents.strategies.xbrl_extraction import XBRLExtractor
__all__ = [
'DocumentBuilder',
'HeaderDetectionStrategy',
'TableProcessor',
'XBRLExtractor'
]

View File

@@ -0,0 +1,670 @@
"""
Document builder that converts parsed HTML tree into document nodes.
"""
from typing import Dict, Any, Optional
from lxml.html import HtmlElement
from edgar.documents.config import ParserConfig
from edgar.documents.nodes import (
Node, DocumentNode, TextNode, ParagraphNode, HeadingNode,
ContainerNode, SectionNode, ListNode, ListItemNode, LinkNode, ImageNode
)
from edgar.documents.strategies.style_parser import StyleParser
from edgar.documents.table_nodes import TableNode, Cell, Row
from edgar.documents.types import Style, ParseContext, SemanticType
class DocumentBuilder:
    """
    Builds Document node tree from parsed HTML.

    Handles the conversion of HTML elements into structured nodes
    with proper hierarchy and metadata. The entry point is build();
    _process_element() recursively converts elements, delegating node
    selection to _create_node_for_element() and filtering out page
    numbers, page breaks and post-break navigation containers.
    """

    # Block-level elements: each becomes (at least) its own node.
    BLOCK_ELEMENTS = {
        'div', 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
        'ul', 'ol', 'li', 'blockquote', 'pre', 'hr',
        'table', 'form', 'fieldset', 'address', 'section',
        'article', 'aside', 'nav', 'header', 'footer', 'main'
    }

    # Inline elements: collapsed into text nodes so text flow is preserved.
    INLINE_ELEMENTS = {
        'span', 'a', 'em', 'strong', 'b', 'i', 'u', 's',
        'small', 'mark', 'del', 'ins', 'sub', 'sup',
        'code', 'kbd', 'var', 'samp', 'abbr', 'cite',
        'q', 'time', 'font',
        # IXBRL inline elements for simple values - should not break text flow
        'ix:nonfraction', 'ix:footnote', 'ix:fraction'
    }

    # Elements to skip entirely (their tail text is still kept).
    SKIP_ELEMENTS = {
        'script', 'style', 'meta', 'link', 'noscript',
        # IXBRL exclude elements - content that should not appear in final document
        'ix:exclude'
    }

    def __init__(self, config: ParserConfig, strategies: Dict[str, Any]):
        """
        Initialize document builder.

        Args:
            config: Parser configuration
            strategies: Dictionary of parsing strategies; recognized keys
                used below are 'header_detection', 'table_processing'
                and 'xbrl_extraction'
        """
        self.config = config
        self.strategies = strategies
        self.style_parser = StyleParser()
        self.context = ParseContext()
        # Track XBRL context
        # xbrl_context_stack holds metadata dicts for currently-open
        # namespaced (ix:*) elements.
        self.xbrl_context_stack = []
        # NOTE(review): xbrl_continuations is never read or written in this
        # class — confirm whether strategies or subclasses rely on it.
        self.xbrl_continuations = {}

    def build(self, tree: HtmlElement) -> DocumentNode:
        """
        Build document from HTML tree.

        Args:
            tree: Parsed HTML tree

        Returns:
            Document root node
        """
        # Create root document node
        root = DocumentNode()
        # Find body element
        body = tree.find('.//body')
        if body is None:
            # If no body, use the entire tree
            body = tree
        # Process body content
        self._process_element(body, root)
        # Apply node merging if configured
        if self.config.merge_adjacent_nodes:
            self._merge_adjacent_nodes(root)
        return root

    def _process_element(self, element: HtmlElement, parent: Node) -> Optional[Node]:
        """
        Process HTML element into node.

        Recursively converts `element` and its subtree, attaching created
        nodes to `parent`. Tail text (text after an element's closing tag)
        is always attached to the PARENT, never to the created node, so it
        is preserved even when the element itself is skipped.

        Args:
            element: HTML element to process
            parent: Parent node

        Returns:
            Created node or None if skipped
        """
        # Skip certain elements but preserve their tail text
        if element.tag in self.SKIP_ELEMENTS:
            # Process tail text even when skipping element
            if element.tail:
                if self.config.preserve_whitespace:
                    text_node = TextNode(content=element.tail)
                    parent.add_child(text_node)
                else:
                    if element.tail.strip():
                        text_node = TextNode(content=element.tail.strip())
                        parent.add_child(text_node)
            return None
        # Skip page number containers
        if self._is_page_number_container(element):
            return None
        # Skip page break elements
        if self._is_page_break_element(element):
            return None
        # Skip navigation containers that follow page breaks
        if self._is_page_navigation_container(element):
            return None
        # Track parsing depth
        self.context.depth += 1
        try:
            # Handle XBRL elements
            if element.tag.startswith('{'):  # Namespaced element
                self._enter_xbrl_context(element)
            # Extract style
            style = self._extract_style(element)
            # Create appropriate node based on element type
            node = self._create_node_for_element(element, style)
            if node:
                # Add XBRL metadata if in context
                if self.xbrl_context_stack:
                    node.metadata.update(self._get_current_xbrl_metadata())
                # Add to parent
                parent.add_child(node)
                # Process children for container nodes
                if self._should_process_children(element, node):
                    # Add element's direct text first
                    if element.text:
                        if self.config.preserve_whitespace:
                            if element.text:  # Don't strip whitespace
                                text_node = TextNode(content=element.text)
                                node.add_child(text_node)
                        else:
                            if element.text.strip():
                                text_node = TextNode(content=element.text.strip())
                                node.add_child(text_node)
                    # Process child elements
                    for child in element:
                        self._process_element(child, node)
                    # Process text after children
                    if element.tail:
                        if self.config.preserve_whitespace:
                            text_node = TextNode(content=element.tail)
                            parent.add_child(text_node)
                        else:
                            if element.tail.strip():
                                text_node = TextNode(content=element.tail.strip())
                                parent.add_child(text_node)
                            elif element.tail.isspace():
                                # Even if tail is just whitespace, preserve the spacing info
                                # This helps with inline element spacing decisions
                                if hasattr(node, 'set_metadata'):
                                    node.set_metadata('has_tail_whitespace', True)
                else:
                    # Node created but children not processed - still need to handle tail
                    if element.tail:
                        if self.config.preserve_whitespace:
                            text_node = TextNode(content=element.tail)
                            parent.add_child(text_node)
                        else:
                            if element.tail.strip():
                                text_node = TextNode(content=element.tail.strip())
                                parent.add_child(text_node)
                            elif element.tail.isspace():
                                # Even if tail is just whitespace, preserve the spacing info
                                if hasattr(node, 'set_metadata'):
                                    node.set_metadata('has_tail_whitespace', True)
            else:
                # No node created, process children with same parent
                for child in element:
                    self._process_element(child, parent)
                # Process tail text
                if element.tail:
                    if self.config.preserve_whitespace:
                        text_node = TextNode(content=element.tail)
                        parent.add_child(text_node)
                    else:
                        if element.tail.strip():
                            text_node = TextNode(content=element.tail.strip())
                            parent.add_child(text_node)
            # Exit XBRL context
            if element.tag.startswith('{'):
                self._exit_xbrl_context(element)
            return node
        finally:
            self.context.depth -= 1

    def _create_node_for_element(self, element: HtmlElement, style: Style) -> Optional[Node]:
        """
        Create appropriate node for HTML element.

        Dispatch order matters: explicit heading tags, then p/li, then
        heuristic header detection, then the remaining tag-specific cases,
        falling back to a generic ContainerNode.
        """
        tag = element.tag.lower() if not element.tag.startswith('{') else element.tag
        # Check for heading
        if tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
            level = int(tag[1])
            text = self._get_element_text(element)
            if text:
                return HeadingNode(content=text, level=level, style=style)
        # Handle specific elements first before header detection
        if tag == 'p':
            return ParagraphNode(style=style)
        elif tag == 'li':
            return ListItemNode(style=style)
        # Check if element might be a heading based on style/content
        # Skip header detection for certain tags that should never be headers
        skip_header_detection_tags = {
            'li', 'td', 'th', 'option', 'a', 'button', 'label',
            # IXBRL inline elements - should not be treated as headers
            'ix:nonfraction', 'ix:footnote', 'ix:fraction',
            # IXBRL elements that can contain tables and complex content
            'ix:nonNumeric', 'ix:continuation'
        }
        if tag not in skip_header_detection_tags and self.strategies.get('header_detection'):
            header_info = self.strategies['header_detection'].detect(element, self.context)
            if header_info and header_info.confidence > self.config.header_detection_threshold:
                text = self._get_element_text(element)
                if text:
                    node = HeadingNode(
                        content=text,
                        level=header_info.level,
                        style=style
                    )
                    # Add header metadata
                    node.set_metadata('detection_method', header_info.detection_method)
                    node.set_metadata('confidence', header_info.confidence)
                    if header_info.is_item:
                        node.semantic_type = SemanticType.ITEM_HEADER
                        node.set_metadata('item_number', header_info.item_number)
                    return node
        # Continue handling other specific elements
        if tag == 'table':
            if self.strategies.get('table_processing'):
                return self.strategies['table_processing'].process(element)
            else:
                return self._process_table_basic(element, style)
        elif tag in ['ul', 'ol']:
            return ListNode(ordered=(tag == 'ol'), style=style)
        elif tag == 'li':
            # NOTE(review): unreachable — 'li' is already handled before
            # header detection above.
            return ListItemNode(style=style)
        elif tag == 'a':
            href = element.get('href', '')
            title = element.get('title', '')
            text = self._get_element_text(element)
            return LinkNode(content=text, href=href, title=title, style=style)
        elif tag == 'img':
            return ImageNode(
                src=element.get('src'),
                alt=element.get('alt'),
                width=self._parse_dimension(element.get('width')),
                height=self._parse_dimension(element.get('height')),
                style=style
            )
        elif tag == 'br':
            # Line break - add as text node
            return TextNode(content='\n')
        elif tag in ['section', 'article']:
            return SectionNode(style=style)
        elif tag == 'div' or tag in self.BLOCK_ELEMENTS:
            # Check if CSS display property makes this inline
            if style.display in ['inline', 'inline-block']:
                # Treat as inline element despite being a div
                text = self._get_element_text(element)
                if text:
                    text_node = TextNode(content=text, style=style)
                    text_node.set_metadata('original_tag', tag)
                    text_node.set_metadata('inline_via_css', True)
                    return text_node
                # If no text but inline, still process children inline
                return ContainerNode(tag_name=tag, style=style)
            # Normal block behavior
            # Check if this is just a text container with only inline elements
            if self._is_text_only_container(element):
                # Create ParagraphNode for divs containing only inline elements
                # This ensures proper text concatenation for spans, etc.
                return ParagraphNode(style=style)
            else:
                return ContainerNode(tag_name=tag, style=style)
        elif tag in self.INLINE_ELEMENTS:
            # Inline elements - extract text and add to parent
            text = self._get_element_text(element)
            if text:
                text_node = TextNode(content=text, style=style)
                # Preserve inline element metadata
                text_node.set_metadata('original_tag', tag)
                return text_node
        elif tag in ['ix:nonNumeric', 'ix:continuation']:
            # IXBRL elements that can contain complex content including tables
            # Process as container to allow proper table parsing
            # NOTE(review): `tag` was lower-cased above for non-namespaced
            # elements, so the mixed-case 'ix:nonNumeric' may never match
            # here — confirm against lxml's tag normalization.
            return ContainerNode(tag_name=tag, style=style)
        # Default: create container for unknown elements
        return ContainerNode(tag_name=tag, style=style)

    def _is_page_number_container(self, element: HtmlElement) -> bool:
        """Detect and filter page number containers across various SEC filing patterns."""
        import re  # NOTE(review): unused in this method
        # Get text content first - all page numbers should be short
        text_content = element.text_content().strip()
        # Must be short content (1-8 chars to handle "Page X" format)
        if len(text_content) > 8 or len(text_content) == 0:
            return False
        # Must be numeric, roman numerals, or "Page X" format
        if not self._is_page_number_content(text_content):
            return False
        # Check various patterns based on element type and styling
        tag = element.tag.lower()
        # Pattern 1: Oracle-style flexbox containers (highest confidence)
        if tag == 'div' and self._is_flexbox_page_number(element):
            return True
        # Pattern 2: Center/right aligned paragraphs (common pattern)
        if tag == 'p' and self._is_aligned_page_number(element):
            return True
        # Pattern 3: Footer-style divs with centered page numbers
        if tag == 'div' and self._is_footer_page_number(element):
            return True
        # Pattern 4: Simple divs with page break context
        if tag == 'div' and self._is_page_break_context(element):
            return True
        return False

    def _is_page_number_content(self, text: str) -> bool:
        """Check if text content looks like a page number."""
        import re
        # Simple numeric (most common)
        if text.isdigit():
            return True
        # Roman numerals
        if re.match(r'^[ivxlcdm]+$', text.lower()):
            return True
        # "Page X" or "Page X of Y" format
        if re.match(r'^page\s+\d+(\s+of\s+\d+)?$', text.lower()):
            return True
        return False

    def _is_flexbox_page_number(self, element: HtmlElement) -> bool:
        """Detect Oracle-style flexbox page number containers."""
        import re
        style_attr = element.get('style', '')
        if not style_attr:
            return False
        # Must have: display:flex, justify-content:flex-end, min-height:1in
        required_patterns = [
            r'display:\s*flex',
            r'justify-content:\s*flex-end',
            r'min-height:\s*1in'
        ]
        return all(re.search(pattern, style_attr) for pattern in required_patterns)

    def _is_aligned_page_number(self, element: HtmlElement) -> bool:
        """Detect center or right-aligned page number paragraphs."""
        import re
        style_attr = element.get('style', '')
        # Check for center or right alignment
        alignment_pattern = r'text-align:\s*(center|right)'
        if not re.search(alignment_pattern, style_attr):
            return False
        # Optional: check for smaller font size (common in page numbers)
        font_size_pattern = r'font-size:\s*([0-9]+)pt'
        font_match = re.search(font_size_pattern, style_attr)
        if font_match:
            font_size = int(font_match.group(1))
            # Page numbers often use smaller fonts (8-12pt)
            if font_size <= 12:
                return True
        # NOTE(review): this returns True for ANY aligned short content,
        # making the font-size branch above redundant — confirm intent.
        return True  # Any center/right aligned short content

    def _is_footer_page_number(self, element: HtmlElement) -> bool:
        """Detect footer-style page number containers."""
        import re
        style_attr = element.get('style', '')
        # Look for bottom positioning or footer-like styling
        footer_patterns = [
            r'bottom:\s*[0-9]',
            r'position:\s*absolute',
            r'margin-bottom:\s*0',
            r'text-align:\s*center'
        ]
        # Need at least 2 footer indicators
        matches = sum(1 for pattern in footer_patterns if re.search(pattern, style_attr))
        return matches >= 2

    def _is_page_break_context(self, element: HtmlElement) -> bool:
        """Check if element is near page breaks (common page number context)."""
        # Check next sibling for page break HR
        next_elem = element.getnext()
        if next_elem is not None and next_elem.tag == 'hr':
            hr_style = next_elem.get('style', '')
            if 'page-break' in hr_style:
                return True
        # Check if element has page-break styling itself
        style_attr = element.get('style', '')
        if 'page-break' in style_attr:
            return True
        return False

    def _is_page_break_element(self, element: HtmlElement) -> bool:
        """Detect page break HR elements."""
        if element.tag.lower() != 'hr':
            return False
        style_attr = element.get('style', '')
        # Check for page-break-after:always or similar page break styles
        return 'page-break' in style_attr

    def _is_page_navigation_container(self, element: HtmlElement) -> bool:
        """Detect navigation containers that appear after page breaks."""
        if element.tag.lower() != 'div':
            return False
        style_attr = element.get('style', '')
        # Check for navigation container patterns
        # Often have: padding-top, min-height:1in, box-sizing:border-box
        nav_indicators = [
            r'padding-top:\s*0\.5in',
            r'min-height:\s*1in',
            r'box-sizing:\s*border-box'
        ]
        import re
        matches = sum(1 for pattern in nav_indicators if re.search(pattern, style_attr))
        # Need at least 2 indicators
        if matches < 2:
            return False
        # Check if it contains typical navigation content
        text_content = element.text_content().strip().lower()
        # Common navigation phrases
        nav_phrases = [
            'table of contents',
            'index to financial statements',
            'table of content',
            'index to financial statement'
        ]
        return any(phrase in text_content for phrase in nav_phrases)

    def _extract_style(self, element: HtmlElement) -> Style:
        """Extract style from element, merging tag-implied styles on top."""
        style_str = element.get('style', '')
        style = self.style_parser.parse(style_str)
        # Add tag-specific styles
        tag = element.tag.lower()
        if tag == 'b' or tag == 'strong':
            style.font_weight = 'bold'
        elif tag == 'i' or tag == 'em':
            style.font_style = 'italic'
        elif tag == 'u':
            style.text_decoration = 'underline'
        # Handle alignment
        align = element.get('align')
        if align:
            style.text_align = align
        return style

    def _get_element_text(self, element: HtmlElement) -> str:
        """
        Get text content from element.

        For inline elements leading/trailing whitespace is preserved;
        for heading and other elements each part is stripped and parts
        are joined with spaces.
        """
        text_parts = []
        # Get element's direct text
        if element.text:
            # For inline elements, preserve leading/trailing whitespace
            if element.tag.lower() in self.INLINE_ELEMENTS:
                text_parts.append(element.text)
            else:
                text_parts.append(element.text.strip())
        # For simple elements, get all text content
        if element.tag.lower() in self.INLINE_ELEMENTS or \
           element.tag.lower() in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
            # Get all text including from child elements
            for child in element:
                if child.tag.lower() not in self.SKIP_ELEMENTS:
                    child_text = child.text_content()
                    if child_text:
                        # For inline elements, preserve whitespace in child content too
                        if element.tag.lower() in self.INLINE_ELEMENTS:
                            text_parts.append(child_text)
                        else:
                            text_parts.append(child_text.strip())
        # For inline elements with preserved whitespace, concatenate directly
        # For others, join with spaces
        if element.tag.lower() in self.INLINE_ELEMENTS and len(text_parts) == 1:
            return text_parts[0] if text_parts else ''
        else:
            return ' '.join(text_parts)

    def _is_text_only_container(self, element: HtmlElement) -> bool:
        """Check if element contains only text and inline elements."""
        for child in element:
            if child.tag.lower() in self.BLOCK_ELEMENTS:
                return False
            if child.tag.lower() == 'table':
                return False
        return True

    def _should_process_children(self, element: HtmlElement, node: Node) -> bool:
        """Determine if children should be processed."""
        # Don't process children for certain node types
        # (TextNode/HeadingNode already absorbed their text content)
        if isinstance(node, (TextNode, HeadingNode)):
            return False
        # Tables are processed separately
        if isinstance(node, TableNode):
            return False
        return True

    def _process_table_basic(self, element: HtmlElement, style: Style) -> TableNode:
        """Basic table processing without advanced strategy."""
        table = TableNode(style=style)
        # Set config for rendering decisions
        table._config = self.config
        # Extract caption
        caption_elem = element.find('.//caption')
        if caption_elem is not None:
            table.caption = caption_elem.text_content().strip()
        # Process rows
        for tr in element.findall('.//tr'):
            cells = []
            for td in tr.findall('.//td') + tr.findall('.//th'):
                cell = Cell(
                    content=td.text_content().strip(),
                    colspan=int(td.get('colspan', '1')),
                    rowspan=int(td.get('rowspan', '1')),
                    is_header=(td.tag == 'th'),
                    align=td.get('align')
                )
                cells.append(cell)
            if cells:
                row = Row(cells=cells, is_header=(tr.find('.//th') is not None))
                # Determine if header or data row
                # NOTE(review): headers receive the raw cell list while body
                # rows are wrapped in Row — confirm consumers expect this.
                if tr.getparent().tag == 'thead' or row.is_header:
                    table.headers.append(cells)
                else:
                    table.rows.append(row)
        return table

    def _parse_dimension(self, value: Optional[str]) -> Optional[int]:
        """Parse dimension value (width/height) to int pixels, or None."""
        if not value:
            return None
        # Remove 'px' suffix if present
        # NOTE(review): rstrip('px') strips any trailing run of 'p'/'x'
        # characters, not just a literal "px" suffix.
        value = value.strip().rstrip('px')
        try:
            return int(value)
        except ValueError:
            return None

    def _enter_xbrl_context(self, element: HtmlElement):
        """Enter XBRL context: push extracted context metadata, if any."""
        if self.config.extract_xbrl and self.strategies.get('xbrl_extraction'):
            xbrl_data = self.strategies['xbrl_extraction'].extract_context(element)
            if xbrl_data:
                self.xbrl_context_stack.append(xbrl_data)

    def _exit_xbrl_context(self, element: HtmlElement):
        """Exit XBRL context.

        NOTE(review): pops unconditionally for any namespaced element, while
        _enter_xbrl_context pushes only when extraction yields data — confirm
        push/pop stay balanced.
        """
        if self.xbrl_context_stack:
            self.xbrl_context_stack.pop()

    def _get_current_xbrl_metadata(self) -> Dict[str, Any]:
        """Get current XBRL metadata (inner contexts override outer ones)."""
        if not self.xbrl_context_stack:
            return {}
        # Merge all contexts in stack
        metadata = {}
        for context in self.xbrl_context_stack:
            metadata.update(context)
        return metadata

    def _merge_adjacent_nodes(self, root: Node):
        """Merge adjacent text nodes with similar styles."""
        # Implementation would recursively merge adjacent text nodes
        # This is a placeholder for the actual implementation
        pass

View File

@@ -0,0 +1,450 @@
"""
Multi-strategy header detection for document structure.
"""
import re
from abc import ABC, abstractmethod
from typing import Optional, List, Dict
from lxml.html import HtmlElement
from edgar.documents.config import ParserConfig
from edgar.documents.types import HeaderInfo, ParseContext
class HeaderDetector(ABC):
    """
    Abstract base class for header detectors.

    Each concrete detector inspects an element plus the parse context and
    returns a HeaderInfo (level, confidence, detection method) when the
    element looks like a header, or None otherwise.
    """

    @abstractmethod
    def detect(self, element: HtmlElement, context: ParseContext) -> Optional[HeaderInfo]:
        """Detect if element is a header; return HeaderInfo or None."""
        pass

    @property
    @abstractmethod
    def name(self) -> str:
        """Detector name, passed to HeaderInfo.from_text as the detection method."""
        pass
class StyleBasedDetector(HeaderDetector):
    """Detect headers from visual styling: font size, weight, alignment, margins."""

    # (size ratio threshold, score bonus, implied heading level), best first.
    _SIZE_RULES = ((2.0, 0.8, 1), (1.5, 0.7, 2), (1.2, 0.5, 3), (1.1, 0.3, 4))

    @property
    def name(self) -> str:
        return "style"

    def detect(self, element: HtmlElement, context: ParseContext) -> Optional[HeaderInfo]:
        """Score an element's computed style; return HeaderInfo above 0.4."""
        style = context.get_current_style()
        # Nothing to score without style information.
        if not style:
            return None

        text = element.text_content().strip()
        if not text or len(text) > 200:  # very long text is never a header
            return None

        score = 0.0
        level = 3  # default level

        # Relative font size is the strongest visual cue.
        if style.font_size and context.base_font_size:
            ratio = style.font_size / context.base_font_size
            for threshold, bonus, implied_level in self._SIZE_RULES:
                if ratio >= threshold:
                    score += bonus
                    level = implied_level
                    break

        # Bold text counts, and promotes a default-level heading.
        if style.is_bold:
            score += 0.3
            if level == 3:
                level = 2
        if style.is_centered:
            score += 0.2
        # Short all-caps lines are header-like.
        if text.isupper() and len(text.split()) <= 10:
            score += 0.2
        # Headers often carry larger margins.
        if style.margin_top and style.margin_top > 20:
            score += 0.1
        if style.margin_bottom and style.margin_bottom > 10:
            score += 0.1

        score = min(score, 1.0)
        if score > 0.4:  # threshold for style-based detection
            return HeaderInfo.from_text(text, level, score, self.name)
        return None
class PatternBasedDetector(HeaderDetector):
    """Detect headers based on text patterns (first matching pattern wins)."""

    # Common header patterns in SEC filings: (regex, level, base confidence).
    # Order matters — more specific patterns must come before generic ones.
    HEADER_PATTERNS = [
        # Item patterns
        (r'^(Item|ITEM)\s+(\d+[A-Z]?)[.\s]+(.+)$', 1, 0.95),
        (r'^Part\s+[IVX]+[.\s]*$', 1, 0.9),
        (r'^PART\s+[IVX]+[.\s]*$', 1, 0.9),
        # Section patterns
        (r'^(BUSINESS|RISK FACTORS|PROPERTIES|LEGAL PROCEEDINGS)$', 2, 0.85),
        (r'^(Management\'?s?\s+Discussion|MD&A)', 2, 0.85),
        (r'^(Financial\s+Statements|Consolidated\s+Financial\s+Statements)$', 2, 0.85),
        # Numbered sections
        (r'^\d+\.\s+[A-Z][A-Za-z\s]+$', 3, 0.7),
        (r'^[A-Z]\.\s+[A-Z][A-Za-z\s]+$', 3, 0.7),
        (r'^\([a-z]\)\s+[A-Z][A-Za-z\s]+$', 4, 0.6),
        # Title case headers
        (r'^[A-Z][A-Za-z\s]+[A-Za-z]$', 3, 0.5),
        # All caps headers
        (r'^[A-Z\s]+$', 3, 0.6),
    ]

    @property
    def name(self) -> str:
        return "pattern"

    def detect(self, element: HtmlElement, context: ParseContext) -> Optional[HeaderInfo]:
        """Detect headers based on text patterns."""
        text = element.text_content().strip()
        # Skip empty or very long text
        if not text or len(text) > 200:
            return None
        # Skip single punctuation - never headers
        if len(text) == 1 and text in '.,!?;:()[]{}':
            return None
        # Skip if text contains multiple sentences (likely paragraph)
        if text.count('.') > 2:
            return None
        # Check against patterns
        for pattern, level, base_confidence in self.HEADER_PATTERNS:
            match = re.match(pattern, text, re.IGNORECASE)
            if match:
                # Adjust confidence based on context
                confidence = base_confidence
                # Boost confidence if element is alone in parent
                # (len() of an lxml element counts its children)
                if len(element.getparent()) == 1:
                    confidence += 0.1
                # Boost confidence if followed by substantial text
                next_elem = element.getnext()
                if next_elem is not None and len(next_elem.text_content()) > 100:
                    confidence += 0.1
                confidence = min(confidence, 1.0)
                return HeaderInfo.from_text(text, level, confidence, self.name)
        return None
class StructuralDetector(HeaderDetector):
    """Infer headers from DOM structure rather than text or styling."""

    @property
    def name(self) -> str:
        return "structural"

    def detect(self, element: HtmlElement, context: ParseContext) -> Optional[HeaderInfo]:
        """Score structural cues (tag, parent container, siblings) as header evidence."""
        text = element.text_content().strip()
        if not text or len(text) > 200:  # empty or very long: never a header
            return None
        if len(text) == 1 and text in '.,!?;:()[]{}':  # lone punctuation
            return None

        tag = element.tag.lower()
        # A real heading tag is definitive: full confidence, level from tag.
        if tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
            return HeaderInfo.from_text(text, int(tag[1]), 1.0, self.name)

        score = 0.0
        level = 3

        # Parent structure: header-like containers and isolation.
        parent = element.getparent()
        if parent is not None:
            if parent.tag.lower() in ['header', 'thead', 'caption']:
                score += 0.6
                level = 2
            if len(parent) <= 3:  # few siblings -> isolated element
                score += 0.3
            if parent.get('align') == 'center':
                score += 0.2

        # Element's own properties.
        if tag in ['strong', 'b']:
            score += 0.3
        if element.get('align') == 'center':
            score += 0.2

        # Followed by block content suggests a section opener.
        sibling = element.getnext()
        if sibling is not None and sibling.tag.lower() in ['p', 'div', 'table', 'ul', 'ol']:
            score += 0.2

        # Short text is header-like.
        if 1 <= len(text.split()) <= 10:
            score += 0.1

        score = min(score, 1.0)
        if score > 0.5:
            return HeaderInfo.from_text(text, level, score, self.name)
        return None
class ContextualDetector(HeaderDetector):
    """Detect headers based on surrounding context.

    Unlike the style/structure detectors, this one scores an element by how
    it relates to its neighbors: header-like previous siblings, longer
    following content, and position early in the document.
    """
    @property
    def name(self) -> str:
        return "contextual"
    def detect(self, element: HtmlElement, context: ParseContext) -> Optional[HeaderInfo]:
        """Detect headers based on contextual clues.

        Returns HeaderInfo when accumulated confidence exceeds 0.5,
        otherwise None.
        """
        text = element.text_content().strip()
        # Skip empty or very long text
        if not text or len(text) > 200:
            return None
        # Skip single punctuation - never headers
        if len(text) == 1 and text in '.,!?;:()[]{}':
            return None
        confidence = 0.0
        level = 3  # default heading level when no contextual hint adjusts it
        # Check if text looks like a header
        if self._looks_like_header(text):
            confidence += 0.4
        # Check relationship to previous content
        prev_elem = element.getprevious()
        if prev_elem is not None:
            prev_text = prev_elem.text_content().strip()
            # Check if previous was also a header (section hierarchy)
            if prev_text and self._looks_like_header(prev_text):
                confidence += 0.3
                # Adjust level based on comparison:
                # longer text after a shorter header is treated as a
                # higher-level (level 2) section heading.
                if len(text) > len(prev_text):
                    level = 2
                else:
                    level = 3
        # Check relationship to next content
        next_elem = element.getnext()
        if next_elem is not None:
            next_text = next_elem.text_content().strip()
            # Headers are often followed by longer content (3x length here)
            if len(next_text) > len(text) * 3:
                confidence += 0.3
            # Check if next element is indented or styled differently
            next_style = next_elem.get('style', '')
            if 'margin-left' in next_style or 'padding-left' in next_style:
                confidence += 0.2
        # Check position in document
        if context.current_section is None and context.depth < 5:
            # Early in document, more likely to be header
            confidence += 0.2
        # Normalize confidence
        confidence = min(confidence, 1.0)
        if confidence > 0.5:
            return HeaderInfo.from_text(text, level, confidence, self.name)
        return None
    def _looks_like_header(self, text: str) -> bool:
        """Check if text looks like a header.

        Heuristic: short (<= 15 words), no sentence-ending punctuation
        (colon is allowed), and capitalized in some form. Note that any
        text starting with an uppercase letter passes the final check.
        """
        # Short text
        if len(text.split()) > 15:
            return False
        # No ending punctuation (except colon)
        if text.rstrip().endswith(('.', '!', '?', ';')):
            return False
        # Title case or all caps
        if text.istitle() or text.isupper():
            return True
        # Starts with capital letter
        if text and text[0].isupper():
            return True
        return False
class HeaderDetectionStrategy:
    """
    Multi-strategy header detection.
    Combines multiple detection methods with weighted voting.

    Each registered HeaderDetector votes independently; single-detector
    results are gated by config.header_detection_threshold, while multiple
    agreeing detectors are merged via a weighted average.
    """
    def __init__(self, config: ParserConfig):
        """Initialize with configuration."""
        self.config = config
        self.detectors = self._init_detectors()
    def _init_detectors(self) -> List[HeaderDetector]:
        """Initialize enabled detectors."""
        detectors = []
        # Always include basic detectors
        detectors.extend([
            StyleBasedDetector(),
            PatternBasedDetector(),
            StructuralDetector(),
            ContextualDetector()
        ])
        # Add ML detector if enabled
        if self.config.features.get('ml_header_detection'):
            # Would add MLBasedDetector here
            pass
        return detectors
    def detect(self, element: HtmlElement, context: ParseContext) -> Optional[HeaderInfo]:
        """
        Detect if element is a header using multiple strategies.
        Args:
            element: HTML element to check
            context: Current parsing context
        Returns:
            HeaderInfo if element is detected as header, None otherwise
        """
        # Skip if element has no text
        text = element.text_content().strip()
        if not text:
            return None
        # Collect results from all detectors
        results: List[HeaderInfo] = []
        for detector in self.detectors:
            try:
                result = detector.detect(element, context)
                if result:
                    results.append(result)
            except Exception:
                # Don't let one detector failure stop others
                continue
        if not results:
            return None
        # If only one detector fired, use its result if confident enough
        if len(results) == 1:
            if results[0].confidence >= self.config.header_detection_threshold:
                return results[0]
            return None
        # Multiple detectors - combine results
        # NOTE(review): the combined result below is returned without
        # re-checking header_detection_threshold, unlike the single-detector
        # path above — confirm this asymmetry is intentional.
        return self._combine_results(results, text)
    def _combine_results(self, results: List[HeaderInfo], text: str) -> HeaderInfo:
        """Combine multiple detection results.

        Computes a weighted-average confidence, picks the heading level by
        weighted vote, and merges item metadata (first non-None item_number
        wins, any detector flagging is_item sets it).
        """
        # Weight different detectors
        detector_weights = {
            'style': 0.3,
            'pattern': 0.4,
            'structural': 0.2,
            'contextual': 0.1,
            'ml': 0.5  # Would be highest if available
        }
        # Calculate weighted confidence
        total_confidence = 0.0
        total_weight = 0.0
        # Group by level
        level_votes: Dict[int, float] = {}
        for result in results:
            # Unknown detection methods fall back to a small default weight
            weight = detector_weights.get(result.detection_method, 0.1)
            total_confidence += result.confidence * weight
            total_weight += weight
            # Vote for level
            if result.level not in level_votes:
                level_votes[result.level] = 0.0
            level_votes[result.level] += result.confidence * weight
        # Normalize confidence
        final_confidence = total_confidence / total_weight if total_weight > 0 else 0.0
        # Choose most voted level
        final_level = max(level_votes.items(), key=lambda x: x[1])[0]
        # Check if any detector found this is an item
        is_item = any(r.is_item for r in results)
        item_number = next((r.item_number for r in results if r.item_number), None)
        return HeaderInfo(
            level=final_level,
            confidence=final_confidence,
            text=text,
            detection_method='combined',
            is_item=is_item,
            item_number=item_number
        )

View File

@@ -0,0 +1,344 @@
"""
CSS style parser for HTML elements.
"""
import re
from typing import Dict, Optional, Tuple, Union
from edgar.documents.types import Style
from edgar.documents.utils import get_cache_manager
class StyleParser:
    """
    Parser for CSS style attributes.

    Handles inline styles and converts them to Style objects. Parsed
    results are cached by the raw style string, since filings repeat
    identical inline styles many times.
    """
    # Common CSS units (used to reject length tokens when scanning for colors)
    ABSOLUTE_UNITS = {'px', 'pt', 'pc', 'cm', 'mm', 'in'}
    RELATIVE_UNITS = {'em', 'rem', 'ex', 'ch', 'vw', 'vh', '%'}
    # Font weight keyword -> numeric value mappings
    FONT_WEIGHT_MAP = {
        'normal': '400',
        'bold': '700',
        'bolder': '800',
        'lighter': '300'
    }
    def __init__(self):
        """Initialize style parser with cache."""
        self._cache = get_cache_manager().style_cache
    def parse(self, style_string: str) -> Style:
        """
        Parse CSS style string into Style object.
        Args:
            style_string: CSS style string (e.g., "font-size: 14px; color: red")
        Returns:
            Parsed Style object
        """
        if not style_string:
            return Style()
        # Check cache first
        cached_style = self._cache.get(style_string)
        if cached_style is not None:
            return cached_style
        # Parse style
        style = Style()
        # Split into individual declarations
        declarations = self._split_declarations(style_string)
        for prop, value in declarations.items():
            self._apply_property(style, prop, value)
        # Cache result
        self._cache.put(style_string, style)
        return style
    def _split_declarations(self, style_string: str) -> Dict[str, str]:
        """Split style string into property-value pairs.

        Properties are lower-cased; empty declarations are skipped.
        Later duplicates overwrite earlier ones (CSS cascade order).
        """
        declarations = {}
        # Split by semicolon, handling potential issues
        parts = style_string.split(';')
        for part in parts:
            part = part.strip()
            if not part:
                continue
            # Split property and value at the first colon only, so values
            # like url(data:...) keep their own colons intact
            if ':' in part:
                prop, value = part.split(':', 1)
                prop = prop.strip().lower()
                value = value.strip()
                if prop and value:
                    declarations[prop] = value
        return declarations
    def _apply_property(self, style: Style, prop: str, value: str):
        """Apply a single CSS property to the Style object.

        Unknown properties are silently ignored.
        """
        # Font properties
        if prop == 'font-size':
            size = self._parse_length(value)
            if size is not None:
                style.font_size = size
        elif prop == 'font-weight':
            style.font_weight = self._normalize_font_weight(value)
        elif prop == 'font-style':
            if value in ['italic', 'oblique']:
                style.font_style = 'italic'
            elif value == 'normal':
                style.font_style = 'normal'
        # Text properties
        elif prop == 'text-align':
            if value in ['left', 'right', 'center', 'justify']:
                style.text_align = value
        elif prop == 'text-decoration':
            style.text_decoration = value
        # Color properties
        elif prop == 'color':
            style.color = self._normalize_color(value)
        elif prop in ['background-color', 'background']:
            color = self._extract_background_color(value)
            if color:
                style.background_color = color
        # Spacing properties
        elif prop == 'margin':
            self._parse_box_property(style, 'margin', value)
        elif prop == 'margin-top':
            margin = self._parse_length(value)
            if margin is not None:
                style.margin_top = margin
        elif prop == 'margin-bottom':
            margin = self._parse_length(value)
            if margin is not None:
                style.margin_bottom = margin
        elif prop == 'margin-left':
            margin = self._parse_length(value)
            if margin is not None:
                style.margin_left = margin
        elif prop == 'margin-right':
            margin = self._parse_length(value)
            if margin is not None:
                style.margin_right = margin
        elif prop == 'padding':
            self._parse_box_property(style, 'padding', value)
        elif prop == 'padding-top':
            padding = self._parse_length(value)
            if padding is not None:
                style.padding_top = padding
        elif prop == 'padding-bottom':
            padding = self._parse_length(value)
            if padding is not None:
                style.padding_bottom = padding
        elif prop == 'padding-left':
            padding = self._parse_length(value)
            if padding is not None:
                style.padding_left = padding
        elif prop == 'padding-right':
            padding = self._parse_length(value)
            if padding is not None:
                style.padding_right = padding
        # Display properties
        elif prop == 'display':
            style.display = value
        # Size properties
        elif prop == 'width':
            style.width = self._parse_dimension(value)
        elif prop == 'height':
            style.height = self._parse_dimension(value)
        # Line height
        elif prop == 'line-height':
            line_height = self._parse_line_height(value)
            if line_height is not None:
                style.line_height = line_height
    def _parse_length(self, value: str) -> Optional[float]:
        """Parse a CSS length value to pixels.

        Returns None for values that cannot be resolved without layout
        context (percentages, viewport units, 'auto', etc.).
        """
        value = value.strip().lower()
        # Handle special values
        if value in ['0', 'auto', 'inherit', 'initial']:
            return 0.0 if value == '0' else None
        # Extract number and unit
        match = re.match(r'^(-?\d*\.?\d+)\s*([a-z%]*)$', value)
        if not match:
            return None
        num_str, unit = match.groups()
        try:
            num = float(num_str)
        except ValueError:
            return None
        # Convert to pixels (CSS reference: 96px per inch, 72pt per inch)
        if not unit or unit == 'px':
            return num
        elif unit == 'pt':
            return num * 1.333  # 1pt = 1.333px
        elif unit == 'pc':
            return num * 16  # 1pc = 12pt = 16px
        elif unit == 'em':
            return num * 16  # Assume 16px base
        elif unit == 'rem':
            return num * 16  # Assume 16px root
        elif unit == '%':
            return None  # Can't convert percentage without context
        elif unit == 'in':
            return num * 96  # 1in = 96px
        elif unit == 'cm':
            return num * 37.8  # 1cm = 37.8px
        elif unit == 'mm':
            return num * 3.78  # 1mm = 3.78px
        return None
    def _parse_dimension(self, value: str) -> Optional[Union[float, str]]:
        """Parse dimension value (width/height).

        Percentages are returned verbatim as strings; other lengths are
        converted to pixels via _parse_length.
        """
        value = value.strip()
        # Check for percentage
        if value.endswith('%'):
            return value  # Return as string
        # Try to parse as length
        length = self._parse_length(value)
        return length
    def _parse_line_height(self, value: str) -> Optional[float]:
        """Parse line-height value (unitless multiplier or length)."""
        value = value.strip()
        # Unitless number (multiplier)
        try:
            return float(value)
        except ValueError:
            pass
        # Try as length
        return self._parse_length(value)
    def _normalize_font_weight(self, value: str) -> str:
        """Normalize font weight to a numeric string when possible.

        Keywords map via FONT_WEIGHT_MAP; valid numeric weights (100-900)
        and anything unrecognized pass through unchanged.
        """
        value = value.strip().lower()
        # Map keywords to numeric values
        if value in self.FONT_WEIGHT_MAP:
            return self.FONT_WEIGHT_MAP[value]
        # Check if it's already numeric
        if value.isdigit() and 100 <= int(value) <= 900:
            return value
        return value
    def _normalize_color(self, value: str) -> str:
        """Normalize a color value (lower-case; expand 3-digit hex)."""
        value = value.strip().lower()
        # Handle rgb/rgba
        if value.startswith(('rgb(', 'rgba(')):
            return value
        # Handle hex colors
        if value.startswith('#'):
            # Expand 3-char hex to 6-char (#abc -> #aabbcc)
            if len(value) == 4:
                return '#' + ''.join(c*2 for c in value[1:])
            return value
        # Return named colors as-is
        return value
    def _extract_background_color(self, value: str) -> Optional[str]:
        """Extract the color component from a background shorthand value.

        rgb()/rgba() functions are matched as a whole before whitespace
        splitting, because their argument lists contain spaces (the previous
        split-first approach truncated "rgb(255, 0, 0)" to "rgb(255,").
        """
        func_match = re.search(r'rgba?\([^)]*\)', value, re.IGNORECASE)
        if func_match:
            return self._normalize_color(func_match.group(0))
        parts = value.split()
        for part in parts:
            if part.startswith('#'):
                return self._normalize_color(part)
            # Named-color heuristic: any token without a length unit substring.
            # Simple extraction - could be enhanced
            if not any(unit in part for unit in self.ABSOLUTE_UNITS | self.RELATIVE_UNITS):
                return part
        return None
    def _parse_box_property(self, style: Style, prop_type: str, value: str):
        """Parse box property (margin/padding) with multiple values.

        Follows the CSS box-model shorthand: 1 value = all sides,
        2 = vertical/horizontal, 3 = top/horizontal/bottom, 4 = TRBL.
        """
        parts = value.split()
        if not parts:
            return
        # Convert all parts to lengths; unresolvable parts are dropped
        lengths = []
        for part in parts:
            length = self._parse_length(part)
            if length is not None:
                lengths.append(length)
        if not lengths:
            return
        # Apply based on number of values (CSS box model)
        if len(lengths) == 1:
            # All sides
            val = lengths[0]
            setattr(style, f'{prop_type}_top', val)
            setattr(style, f'{prop_type}_right', val)
            setattr(style, f'{prop_type}_bottom', val)
            setattr(style, f'{prop_type}_left', val)
        elif len(lengths) == 2:
            # Vertical, horizontal
            vert, horiz = lengths
            setattr(style, f'{prop_type}_top', vert)
            setattr(style, f'{prop_type}_bottom', vert)
            setattr(style, f'{prop_type}_left', horiz)
            setattr(style, f'{prop_type}_right', horiz)
        elif len(lengths) == 3:
            # Top, horizontal, bottom
            top, horiz, bottom = lengths
            setattr(style, f'{prop_type}_top', top)
            setattr(style, f'{prop_type}_bottom', bottom)
            setattr(style, f'{prop_type}_left', horiz)
            setattr(style, f'{prop_type}_right', horiz)
        elif len(lengths) >= 4:
            # Top, right, bottom, left (extra values ignored)
            setattr(style, f'{prop_type}_top', lengths[0])
            setattr(style, f'{prop_type}_right', lengths[1])
            setattr(style, f'{prop_type}_bottom', lengths[2])
            setattr(style, f'{prop_type}_left', lengths[3])
    def merge_styles(self, base: Style, override: Style) -> Style:
        """
        Merge two styles with override taking precedence.
        Args:
            base: Base style
            override: Override style
        Returns:
            Merged style
        """
        return base.merge(override)

View File

@@ -0,0 +1,637 @@
"""
Advanced table processing strategy.
"""
import re
from functools import lru_cache
from typing import List, Optional
from lxml.html import HtmlElement
from edgar.documents.config import ParserConfig
from edgar.documents.strategies.style_parser import StyleParser
from edgar.documents.table_nodes import TableNode, Cell, Row
from edgar.documents.types import TableType
class TableProcessor:
    """
    Advanced table processing with type detection and structure analysis.

    Converts an HTML <table> element into a TableNode: extracts caption and
    metadata, classifies header vs. data rows (including multi-row financial
    headers), detects the table type (financial, metrics, TOC, ...), and
    optionally extracts simple structural relationships.
    """
    # HTML entities that need replacement
    ENTITY_REPLACEMENTS = {
        '&horbar;': '-----',
        '&mdash;': '-----',
        '&ndash;': '---',
        '&minus;': '-',
        '&hyphen;': '-',
        '&dash;': '-',
        '&nbsp;': ' ',
        '&amp;': '&',
        '&lt;': '<',
        '&gt;': '>',
        '&quot;': '"',
        '&apos;': "'",
        '&#8202;': ' ',
        '&#8203;': '',
        '&#x2014;': '-----',
        '&#x2013;': '---',
        '&#x2212;': '-',
    }
    # Financial keywords for table type detection
    FINANCIAL_KEYWORDS = {
        'revenue', 'income', 'expense', 'asset', 'liability',
        'cash', 'equity', 'profit', 'loss', 'margin',
        'earnings', 'cost', 'sales', 'operating', 'net',
        'gross', 'total', 'balance', 'statement', 'consolidated',
        'provision', 'tax', 'taxes', 'compensation', 'stock',
        'share', 'shares', 'rsu', 'option', 'grant', 'vest'
    }
    # Metrics keywords
    METRICS_KEYWORDS = {
        'ratio', 'percentage', 'percent', '%', 'rate',
        'growth', 'change', 'increase', 'decrease',
        'average', 'median', 'total', 'count', 'number'
    }
    def __init__(self, config: ParserConfig):
        """Initialize table processor."""
        self.config = config
        self.style_parser = StyleParser()
    def process(self, element: HtmlElement) -> TableNode:
        """
        Process table element into TableNode.
        Args:
            element: HTML table element
        Returns:
            Processed TableNode
        """
        # Extract table metadata
        table_id = element.get('id')
        table_class = element.get('class', '').split()
        table_style = self.style_parser.parse(element.get('style', ''))
        # Create table node
        table = TableNode(style=table_style)
        # Set config for rendering decisions
        table._config = self.config
        # Add metadata
        if table_id:
            table.set_metadata('id', table_id)
        if table_class:
            table.set_metadata('classes', table_class)
        # Extract caption
        caption_elem = element.find('.//caption')
        if caption_elem is not None:
            table.caption = self._extract_text(caption_elem)
        # Extract summary
        summary = element.get('summary')
        if summary:
            table.summary = summary
        # Process table structure
        self._process_table_structure(element, table)
        # Detect table type if configured
        if self.config.detect_table_types:
            table.table_type = self._detect_table_type(table)
        # Extract relationships if configured
        if self.config.extract_table_relationships:
            self._extract_relationships(table)
        return table
    def _process_table_structure(self, element: HtmlElement, table: TableNode):
        """Process table structure (thead, tbody, tfoot).

        Rows outside an explicit <thead> are heuristically classified as
        header rows until real data rows start, to support the multi-row
        headers common in SEC financial tables (e.g. "Year Ended" followed
        by a row of years and a "(in millions)" units row).
        """
        # Process thead
        thead = element.find('.//thead')
        if thead is not None:
            for tr in thead.findall('.//tr'):
                cells = self._process_row(tr, is_header=True)
                if cells:
                    table.headers.append(cells)
        # Process tbody (or direct rows)
        tbody = element.find('.//tbody')
        rows_container = tbody if tbody is not None else element
        # Track if we've seen headers and data rows
        headers_found = bool(table.headers)
        data_rows_started = False
        for tr in rows_container.findall('.//tr'):
            # Skip if already processed in thead
            if thead is not None and tr.getparent() == thead:
                continue
            # Check if this might be a header row
            is_header_row = False
            # Continue checking for headers if:
            # 1. We haven't found any headers yet, OR
            # 2. We've found headers but haven't seen data rows yet (multi-row headers)
            if not data_rows_started:
                is_header_row = self._is_header_row(tr)
                # Additional check for multi-row headers in financial tables
                # If the previous row was a header and this row has years or units,
                # it's likely part of the header
                if headers_found and not is_header_row:
                    row_text = tr.text_content().strip()
                    # Check for units like "(in millions)" or "(in thousands)"
                    if '(in millions)' in row_text or '(in thousands)' in row_text or '(in billions)' in row_text:
                        is_header_row = True
                    # Check for year rows that follow "Year Ended" headers
                    elif len(table.headers) > 0:
                        last_header_text = ' '.join(cell.text() for cell in table.headers[-1])
                        if 'year ended' in last_header_text.lower() or 'years ended' in last_header_text.lower():
                            # Check if this row has years
                            year_pattern = r'\b(19\d{2}|20\d{2})\b'
                            years_found = re.findall(year_pattern, row_text)
                            if years_found:
                                is_header_row = True
            cells = self._process_row(tr, is_header=is_header_row)
            if cells:
                if is_header_row:
                    table.headers.append(cells)
                    headers_found = True
                else:
                    # Only mark data_rows_started if this row has actual content
                    # Empty rows at the beginning shouldn't stop header detection
                    row = Row(cells=cells, is_header=False)
                    table.rows.append(row)
                    # Check if row has significant content that indicates data rows have started
                    # But be smart about it - descriptive rows like "(in millions)" or pure spacing
                    # shouldn't stop header detection
                    has_content = any(cell.text().strip() for cell in cells)
                    if has_content:
                        # Get the row text for smarter analysis
                        row_text = ' '.join(cell.text().strip() for cell in cells).strip()
                        row_text_lower = row_text.lower()
                        # Don't consider this as "data started" if it's likely a header-related row
                        is_header_related = (
                            # Unit descriptions
                            '(in millions)' in row_text_lower or
                            '(in thousands)' in row_text_lower or
                            '(in billions)' in row_text_lower or
                            'except per share' in row_text_lower or
                            # Financial period descriptions
                            'year ended' in row_text_lower or
                            'months ended' in row_text_lower or
                            # Mostly just spacing/formatting
                            len(row_text.strip()) < 5 or
                            # Contains years (might be misclassified header)
                            bool(re.search(r'\b(19\d{2}|20\d{2})\b', row_text))
                        )
                        # Only mark data_rows_started if this seems like actual data, not header-related
                        if not is_header_related:
                            data_rows_started = True
        # Process tfoot
        tfoot = element.find('.//tfoot')
        if tfoot is not None:
            for tr in tfoot.findall('.//tr'):
                cells = self._process_row(tr, is_header=False)
                if cells:
                    row = Row(cells=cells, is_header=False)
                    table.footer.append(row)
    def _process_row(self, tr: HtmlElement, is_header: bool) -> List[Cell]:
        """Process table row into cells, preserving document order."""
        cells = []
        # Iterate td and th together in document order. The previous
        # findall('.//td') + findall('.//th') concatenation moved all th
        # cells after the td cells, scrambling column order in rows that
        # mix th row labels with td data cells.
        for cell_elem in tr.iter('td', 'th'):
            cell = self._process_cell(cell_elem, is_header or cell_elem.tag == 'th')
            if cell:
                cells.append(cell)
        return cells
    @staticmethod
    def _parse_span(raw: str) -> int:
        """Parse a colspan/rowspan attribute value, defaulting to 1 when malformed."""
        try:
            return int(raw)
        except (TypeError, ValueError):
            # Malformed values like colspan="" or colspan="two" previously
            # raised ValueError; treat them as a span of 1 instead.
            return 1
    def _process_cell(self, elem: HtmlElement, is_header: bool) -> Optional[Cell]:
        """Process table cell into a Cell (span, alignment, style, content)."""
        # Extract cell properties (tolerate malformed span attributes)
        colspan = self._parse_span(elem.get('colspan', '1'))
        rowspan = self._parse_span(elem.get('rowspan', '1'))
        align = elem.get('align')
        # Extract style
        style = self.style_parser.parse(elem.get('style', ''))
        if style.text_align:
            align = style.text_align
        # Extract content
        content = self._extract_cell_content(elem)
        # Create cell
        cell = Cell(
            content=content,
            colspan=colspan,
            rowspan=rowspan,
            is_header=is_header,
            align=align
        )
        return cell
    def _extract_cell_content(self, elem: HtmlElement) -> str:
        """Extract and clean cell content, preserving intentional line breaks."""
        # Check for nested structure
        divs = elem.findall('.//div')
        if divs and len(divs) > 1:
            # Multiple divs - likely multi-line content
            lines = []
            for div in divs:
                text = self._extract_text(div)
                if text:
                    lines.append(text)
            return '\n'.join(lines)
        # Handle line breaks: turn <br> into newlines before extraction
        for br in elem.findall('.//br'):
            br.tail = '\n' + (br.tail or '')
        # Extract text
        text = self._extract_text(elem)
        return text
    def _extract_text(self, elem: HtmlElement) -> str:
        """Extract and clean text from element.

        Uses itertext() rather than text_content() so that spaces lost at
        inline-element boundaries can be re-inserted, then replaces known
        HTML entities and normalizes whitespace per line.
        """
        # Use itertext() to get all text fragments
        # This preserves spaces better than text_content()
        text_parts = []
        for text in elem.itertext():
            if text:
                text_parts.append(text)
        # Join parts, ensuring we don't lose spaces
        # If a part doesn't end with whitespace and the next doesn't start with whitespace,
        # we need to add a space between them
        if not text_parts:
            return ''
        result = []
        for i, part in enumerate(text_parts):
            if i == 0:
                result.append(part)
            else:
                prev_part = text_parts[i-1]
                # Check if we need to add a space between parts
                # Don't add space if previous ends with space or current starts with space
                if prev_part and part:
                    if not prev_part[-1].isspace() and not part[0].isspace():
                        # Check for punctuation that shouldn't have space before it
                        if part[0] not in ',.;:!?%)]':
                            result.append(' ')
                result.append(part)
        text = ''.join(result)
        # Replace entities
        for entity, replacement in self.ENTITY_REPLACEMENTS.items():
            text = text.replace(entity, replacement)
        # Clean whitespace
        text = text.strip()
        # Normalize internal whitespace but preserve line breaks
        lines = text.split('\n')
        cleaned_lines = []
        for line in lines:
            # Collapse multiple spaces to single space
            line = ' '.join(line.split())
            cleaned_lines.append(line)
        return '\n'.join(cleaned_lines)
    @staticmethod
    @lru_cache(maxsize=1)
    def _get_period_header_pattern():
        """
        Compile comprehensive regex for financial period headers.
        Adapted from old parser's proven patterns.
        Returns:
            Compiled regex pattern matching financial period headers
        """
        # Base components
        periods = r'(?:three|six|nine|twelve|[1-4]|first|second|third|fourth)'
        timeframes = r'(?:month|quarter|year|week)'
        ended_variants = r'(?:ended|ending|end|period)'
        as_of_variants = r'(?:as\s+of|at|as\s+at)'
        # Date pattern
        months = r'(?:january|february|march|april|may|june|july|august|september|october|november|december|jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)'
        day = r'\d{1,2}'
        year = r'(?:19|20)\d{2}'
        date = f'{months}\\s*\\.?\\s*{day}\\s*,?\\s*{year}'
        # Combined patterns
        patterns = [
            # Standard period headers
            f'{periods}\\s+{timeframes}\\s+{ended_variants}(?:\\s+{date})?',
            f'(?:fiscal\\s+)?{timeframes}\\s+{ended_variants}',
            f'{timeframes}\\s+{ended_variants}(?:\\s+{date})?',
            # Balance sheet date headers
            f'{as_of_variants}\\s+{date}',
            # Multiple date sequences
            f'{date}(?:\\s*(?:and|,)\\s*{date})*',
            # Single dates
            f'(?:{ended_variants}\\s+)?{date}'
        ]
        pattern = '|'.join(f'(?:{p})' for p in patterns)
        return re.compile(pattern, re.IGNORECASE)
    def _is_header_row(self, tr: HtmlElement) -> bool:
        """Detect if row is likely a header row in SEC filings.

        Applies a series of heuristics in priority order: th presence,
        date-range/financial-data exclusions, year patterns, period
        phrases, units notation, bold formatting, and text/number ratio.
        """
        # Check if contains th elements (most reliable indicator)
        if tr.find('.//th') is not None:
            return True
        cells = tr.findall('.//td')
        if not cells:
            return False
        # Get row text for analysis
        row_text = tr.text_content()
        row_text_lower = row_text.lower()
        # Check for date ranges with financial data (Oracle Table 6 pattern)
        # Date ranges like "March 1, 2024—March 31, 2024" should be data rows, not headers
        date_range_pattern = r'(?:january|february|march|april|may|june|july|august|september|october|november|december)\s+\d{1,2},\s*\d{4}\s*[—–-]\s*(?:january|february|march|april|may|june|july|august|september|october|november|december)\s+\d{1,2},\s*\d{4}'
        has_date_range = bool(re.search(date_range_pattern, row_text_lower))
        # Check for financial data indicators
        has_currency = bool(re.search(r'\$[\s]*[\d,\.]+', row_text))
        has_decimals = bool(re.search(r'\b\d+\.\d+\b', row_text))
        has_large_numbers = bool(re.search(r'\b\d{1,3}(,\d{3})+\b', row_text))
        # If row has date range + financial data, it's definitely a data row
        if has_date_range and (has_currency or has_decimals or has_large_numbers):
            return False
        # Check for year patterns (very common in financial headers)
        year_pattern = r'\b(19\d{2}|20\d{2})\b'
        years_found = re.findall(year_pattern, row_text)
        if len(years_found) >= 2:  # Multiple years suggest header row
            # IMPORTANT: Check for date ranges and same-year repetition
            # Date ranges like "March 1, 2024—March 31, 2024" contain the same year twice
            # but are data rows, not multi-year comparison headers
            # If all years are the same (date range pattern)
            if len(set(years_found)) == 1:
                # Same year repeated - likely a date range like "Jan 1, 2024 - Mar 31, 2024"
                # Not a multi-year comparison header
                pass  # Don't return True
            # Multiple different years suggest multi-year comparison header
            elif 'total' not in row_text_lower[:20]:  # Check first 20 chars
                return True
        # Enhanced year detection - check individual cells for year patterns
        # This handles cases where years are in separate cells
        year_cells = 0
        date_phrases = 0
        for cell in cells:
            cell_text = cell.text_content().strip()
            if cell_text:
                # Check for individual years
                if re.match(r'^\s*(19\d{2}|20\d{2})\s*$', cell_text):
                    year_cells += 1
                # Check for date phrases like "June 30, 2025"
                elif 'june 30' in cell_text.lower() or 'december 31' in cell_text.lower():
                    date_phrases += 1
        # If we have multiple year cells or year + date phrases, likely a header
        if year_cells >= 2 or (year_cells >= 1 and date_phrases >= 1):
            if 'total' not in row_text_lower[:20]:
                return True
        # Check for comprehensive financial period patterns (from old parser)
        period_pattern = self._get_period_header_pattern()
        if period_pattern.search(row_text_lower):
            # Additional validation: ensure it's not a data row with period text
            # Check for absence of strong data indicators
            data_pattern = r'(?:\$\s*\d|\d+(?:,\d{3})+|\d+\s*[+\-*/]\s*\d+|\(\s*\d+(?:,\d{3})*\s*\))'
            if not re.search(data_pattern, row_text):
                return True
        # Check for units notation (in millions, thousands, billions)
        units_pattern = r'\(in\s+(?:millions|thousands|billions)\)'
        if re.search(units_pattern, row_text_lower):
            return True
        # Check for period indicators (quarters, months)
        # But be careful with "fiscal" - it could be data like "Fiscal 2025"
        period_keywords = ['quarter', 'q1', 'q2', 'q3', 'q4', 'month',
                          'january', 'february', 'march', 'april', 'may', 'june',
                          'july', 'august', 'september', 'october', 'november', 'december',
                          'ended', 'three months', 'six months', 'nine months']
        # Special handling for "fiscal" - only treat as header if it's part of a phrase like "fiscal year ended"
        if 'fiscal' in row_text_lower:
            # Check if row has numeric values (suggests it's data, not header)
            # Look for patterns like "Fiscal 2025 $10,612"
            has_currency_values = bool(re.search(r'\$[\s]*[\d,]+', row_text))
            has_large_numbers = bool(re.search(r'\b\d{1,3}(,\d{3})+\b', row_text))
            # If it has currency or large numbers, it's likely data
            if has_currency_values or has_large_numbers:
                return False
            # Check if it's just "Fiscal YYYY" which is likely data, not a header
            fiscal_year_only = re.match(r'^\s*fiscal\s+\d{4}\s*$', row_text_lower.strip())
            if fiscal_year_only:
                return False  # This is data, not a header
            # Check for header-like phrases with fiscal
            if 'fiscal year' in row_text_lower and ('ended' in row_text_lower or 'ending' in row_text_lower):
                return True
        if any(keyword in row_text_lower for keyword in period_keywords):
            # Validate it's not a data row with period keywords
            # Check for strong data indicators
            data_pattern = r'(?:\$\s*\d|\d+(?:,\d{3})+|\d+\.\d+|[(]\s*\d+(?:,\d{3})*\s*[)])'
            if not re.search(data_pattern, row_text):
                return True
        # Check for column descriptors (but NOT total)
        # These are words commonly found in headers but not data rows
        header_keywords = ['description', 'item', 'category', 'type', 'classification',
                          'change', 'percent', 'increase', 'decrease', 'variance']
        if any(keyword in row_text_lower for keyword in header_keywords):
            # Make sure it's not a total row
            if 'total' not in row_text_lower[:30]:
                # Additional validation: long narrative text is not a header
                # Headers are typically concise (< 150 chars)
                if len(row_text) > 150:
                    return False
                # Check for data indicators (would indicate data row, not header)
                data_pattern = r'(?:\$\s*\d|\d+(?:,\d{3})+|\d+\.\d+|[(]\s*\d+(?:,\d{3})*\s*[)])'
                if re.search(data_pattern, row_text):
                    return False
                return True
        # Check if all cells are bold (common header formatting)
        bold_count = 0
        for cell in cells:
            style = cell.get('style', '')
            if 'font-weight' in style and 'bold' in style:
                bold_count += 1
            elif cell.find('.//b') is not None or cell.find('.//strong') is not None:
                bold_count += 1
        # Only consider it a header if ALL cells are bold (not just some)
        if bold_count == len(cells) and bold_count > 0:
            return True
        # Check content type ratio - headers usually have more text than numbers
        # Count cells with primarily text vs primarily numbers
        text_cells = 0
        number_cells = 0
        for cell in cells:
            cell_text = cell.text_content().strip()
            if cell_text:
                # Remove common symbols for analysis
                clean_text = cell_text.replace('$', '').replace('%', '').replace(',', '').replace('(', '').replace(')', '')
                if clean_text.replace('.', '').replace('-', '').strip().isdigit():
                    number_cells += 1
                else:
                    text_cells += 1
        # Be very careful about treating text-heavy rows as headers
        # Many data rows start with text labels (e.g., "Impact of...", "Effect of...")
        # Only consider it a header if it has mostly text AND doesn't look like a data label
        if text_cells > number_cells * 2 and text_cells >= 3:
            # Check for common data row patterns
            data_row_indicators = [
                'impact of', 'effect of', 'adjustment', 'provision for', 'benefit',
                'expense', 'income from', 'loss on', 'gain on', 'charge', 'credit',
                'earnings', 'computed', 'state taxes', 'research', 'excess tax'
            ]
            # If it starts with any of these, it's likely a data row, not a header
            for indicator in data_row_indicators:
                if row_text_lower.startswith(indicator) or indicator in row_text_lower[:50]:
                    return False
            # Also not a header if it starts with "total"
            if not row_text_lower.startswith('total'):
                return True
        return False
    def _detect_table_type(self, table: TableNode) -> TableType:
        """Detect the type of table based on caption, header and sample-row text."""
        # Collect text from headers and first few rows
        text_parts = []
        # Add caption
        if table.caption:
            text_parts.append(table.caption.lower())
        # Add headers
        for header_row in table.headers:
            for cell in header_row:
                text_parts.append(cell.text().lower())
        # Add first few rows
        for row in table.rows[:3]:
            for cell in row.cells:
                text_parts.append(cell.text().lower())
        combined_text = ' '.join(text_parts)
        # Check for financial table
        financial_count = sum(1 for keyword in self.FINANCIAL_KEYWORDS if keyword in combined_text)
        if financial_count >= 2:  # Lowered threshold for better detection
            return TableType.FINANCIAL
        # Check for metrics table
        metrics_count = sum(1 for keyword in self.METRICS_KEYWORDS if keyword in combined_text)
        numeric_cells = sum(1 for row in table.rows for cell in row.cells if cell.is_numeric)
        total_cells = sum(len(row.cells) for row in table.rows)
        if total_cells > 0:
            numeric_ratio = numeric_cells / total_cells
            # More lenient metrics detection
            if metrics_count >= 1 or numeric_ratio > 0.3:
                return TableType.METRICS
        # Check for table of contents
        if 'content' in combined_text or 'index' in combined_text:
            # Look for page numbers
            has_page_numbers = any(
                re.search(r'\b\d{1,3}\b', cell.text())
                for row in table.rows
                for cell in row.cells
            )
            if has_page_numbers:
                return TableType.TABLE_OF_CONTENTS
        # Check for exhibit index
        if 'exhibit' in combined_text:
            return TableType.EXHIBIT_INDEX
        # Check for reference table (citations, definitions, etc.)
        if any(word in combined_text for word in ['reference', 'definition', 'glossary', 'citation']):
            return TableType.REFERENCE
        return TableType.GENERAL
    def _extract_relationships(self, table: TableNode):
        """Extract relationships within table data.

        Currently records total rows and first-column indentation hierarchy
        into table metadata; richer relationship extraction is future work.
        """
        # This would implement relationship extraction
        # For now, just set a flag that relationships were processed
        table.set_metadata('relationships_extracted', True)
        # Example relationships to extract:
        # - Parent-child relationships (indented rows)
        # - Total rows that sum other rows
        # - Cross-references between cells
        # - Time series relationships
        # Detect total rows
        total_rows = []
        for i, row in enumerate(table.rows):
            if row.is_total_row:
                total_rows.append(i)
        if total_rows:
            table.set_metadata('total_rows', total_rows)
        # Detect indentation patterns (parent-child)
        indentation_levels = []
        for row in table.rows:
            if row.cells:
                first_cell_text = row.cells[0].text()
                # Count leading spaces
                indent = len(first_cell_text) - len(first_cell_text.lstrip())
                indentation_levels.append(indent)
        if any(level > 0 for level in indentation_levels):
            table.set_metadata('has_hierarchy', True)
            table.set_metadata('indentation_levels', indentation_levels)

View File

@@ -0,0 +1,345 @@
"""
XBRL extraction strategy for inline XBRL documents.
"""
from typing import Dict, Any, Optional
from lxml.html import HtmlElement
from edgar.documents.types import XBRLFact
class XBRLExtractor:
    """
    Extracts XBRL facts from inline XBRL (iXBRL) documents.

    Handles:
    - ix:nonFraction, ix:nonNumeric facts
    - Context and unit resolution
    - Continuation handling
    - Transformation rules
    """

    # XBRL namespaces
    NAMESPACES = {
        'ix': 'http://www.xbrl.org/2013/inlineXBRL',
        'xbrli': 'http://www.xbrl.org/2003/instance',
        'xbrldi': 'http://xbrl.org/2006/xbrldi',
        'xsi': 'http://www.w3.org/2001/XMLSchema-instance'
    }

    # Common transformation formats (subset of the ixt transformation registry)
    TRANSFORMATIONS = {
        'ixt:numdotdecimal': lambda x: x.replace(',', ''),
        'ixt:numcommadecimal': lambda x: x.replace('.', '_').replace(',', '.').replace('_', ','),
        'ixt:zerodash': lambda x: '0' if x == '-' else x,
        'ixt:datedoteu': lambda x: x.replace('.', '-'),
        'ixt:datedotus': lambda x: x.replace('.', '/'),
    }

    def __init__(self):
        """Initialize XBRL extractor."""
        self.contexts: Dict[str, Dict[str, Any]] = {}  # context id -> context data
        self.units: Dict[str, str] = {}  # unit id -> normalized unit string
        # continuation id -> continuation metadata (values are dicts, not strings)
        self.continuations: Dict[str, Dict[str, Any]] = {}
        self._initialized = False  # contexts/units are loaded lazily

    @staticmethod
    def _get_attr(element: HtmlElement, name: str) -> Optional[str]:
        """
        Get an attribute by its camelCase name, falling back to lowercase.

        lxml's HTML parser lowercases attribute names (e.g. 'contextRef'
        becomes 'contextref'), so both spellings must be checked.
        """
        value = element.get(name)
        if value is None and name != name.lower():
            value = element.get(name.lower())
        return value

    def extract_context(self, element: HtmlElement) -> Optional[Dict[str, Any]]:
        """
        Extract XBRL context from element.

        Args:
            element: HTML element that might contain XBRL

        Returns:
            XBRL metadata if found, else None
        """
        # Check if element is an ix: tag
        if not self._is_xbrl_element(element):
            return None

        # Lazily load context/unit definitions on the first XBRL element seen
        if not self._initialized:
            self._initialize_context(element)

        # Dispatch on the local (namespace-stripped) tag name
        tag_name = self._get_local_name(element.tag)
        if tag_name == 'nonfraction':
            return self._extract_nonfraction(element)
        elif tag_name == 'nonnumeric':
            return self._extract_nonnumeric(element)
        elif tag_name == 'continuation':
            return self._extract_continuation(element)
        elif tag_name == 'footnote':
            return self._extract_footnote(element)
        elif tag_name == 'fraction':
            return self._extract_fraction(element)
        return None

    def extract_fact(self, element: HtmlElement) -> Optional[XBRLFact]:
        """
        Extract an XBRLFact from element, resolving context/unit references.

        Returns None when the element is not an inline-XBRL element.
        """
        context = self.extract_context(element)
        if not context:
            return None

        # Fact value with transformation/scale/sign applied
        value = self._get_fact_value(element)

        fact = XBRLFact(
            concept=context.get('name', ''),
            value=value,
            context_ref=context.get('contextRef'),
            unit_ref=context.get('unitRef'),
            decimals=context.get('decimals'),
            scale=context.get('scale'),
            format=context.get('format'),
            sign=context.get('sign')
        )

        # Resolve references against the previously parsed definitions
        if fact.context_ref and fact.context_ref in self.contexts:
            fact.context = self.contexts[fact.context_ref]
        if fact.unit_ref and fact.unit_ref in self.units:
            fact.unit = self.units[fact.unit_ref]
        return fact

    def _is_xbrl_element(self, element: HtmlElement) -> bool:
        """Check if element is an inline-XBRL (ix:) element."""
        tag = element.tag
        # Comments/processing instructions have non-string tags
        if not isinstance(tag, str):
            return False
        # Accept fully-qualified ('{namespace}tag') and prefixed ('ix:tag',
        # any case) forms. The case-insensitive prefix test subsumes the
        # previous redundant case-sensitive 'ix:' check.
        return (
            tag.startswith('{' + self.NAMESPACES['ix'] + '}') or
            tag.lower().startswith('ix:')
        )

    def _get_local_name(self, tag: str) -> str:
        """Get lowercase local name from a qualified or prefixed tag."""
        if '}' in tag:
            return tag.split('}')[1].lower()
        elif ':' in tag:
            return tag.split(':')[1].lower()
        return tag.lower()

    def _initialize_context(self, element: HtmlElement):
        """Initialize context and unit information from the whole document."""
        root = element.getroottree().getroot()
        self._extract_contexts(root)
        self._extract_units(root)
        self._initialized = True

    def _extract_contexts(self, root: HtmlElement):
        """Extract all xbrli:context definitions into self.contexts."""
        for context in root.xpath('//xbrli:context', namespaces=self.NAMESPACES):
            context_id = context.get('id')
            if not context_id:
                continue
            context_data = {
                'id': context_id
            }
            # Entity identifier and scheme
            entity = context.find('.//xbrli:entity', namespaces=self.NAMESPACES)
            if entity is not None:
                identifier = entity.find('.//xbrli:identifier', namespaces=self.NAMESPACES)
                if identifier is not None:
                    context_data['entity'] = identifier.text
                    context_data['scheme'] = identifier.get('scheme')
            # Period: either a single instant or a start/end duration
            period = context.find('.//xbrli:period', namespaces=self.NAMESPACES)
            if period is not None:
                instant = period.find('.//xbrli:instant', namespaces=self.NAMESPACES)
                if instant is not None:
                    context_data['instant'] = instant.text
                    context_data['period_type'] = 'instant'
                else:
                    start = period.find('.//xbrli:startDate', namespaces=self.NAMESPACES)
                    end = period.find('.//xbrli:endDate', namespaces=self.NAMESPACES)
                    if start is not None and end is not None:
                        context_data['start_date'] = start.text
                        context_data['end_date'] = end.text
                        context_data['period_type'] = 'duration'
            # Explicit dimensional qualifiers from the segment
            segment = context.find('.//xbrli:segment', namespaces=self.NAMESPACES)
            if segment is not None:
                dimensions = {}
                for member in segment.findall('.//xbrldi:explicitMember', namespaces=self.NAMESPACES):
                    dim = member.get('dimension')
                    if dim:
                        dimensions[dim] = member.text
                if dimensions:
                    context_data['dimensions'] = dimensions
            self.contexts[context_id] = context_data

    def _extract_units(self, root: HtmlElement):
        """Extract all xbrli:unit definitions into self.units."""
        for unit in root.xpath('//xbrli:unit', namespaces=self.NAMESPACES):
            unit_id = unit.get('id')
            if not unit_id:
                continue
            # Simple unit: a single measure
            measure = unit.find('.//xbrli:measure', namespaces=self.NAMESPACES)
            if measure is not None:
                self.units[unit_id] = self._normalize_unit(measure.text)
                continue
            # Complex unit: numerator/denominator (e.g. USD/shares)
            divide = unit.find('.//xbrli:divide', namespaces=self.NAMESPACES)
            if divide is not None:
                numerator = divide.find('.//xbrli:unitNumerator/xbrli:measure', namespaces=self.NAMESPACES)
                denominator = divide.find('.//xbrli:unitDenominator/xbrli:measure', namespaces=self.NAMESPACES)
                if numerator is not None and denominator is not None:
                    num_unit = self._normalize_unit(numerator.text)
                    den_unit = self._normalize_unit(denominator.text)
                    self.units[unit_id] = f"{num_unit}/{den_unit}"

    def _normalize_unit(self, unit_text: str) -> str:
        """Normalize unit text (strip namespace prefix, map common names)."""
        if not unit_text:
            return ''
        # Remove namespace prefix (e.g. 'iso4217:USD' -> 'USD')
        if ':' in unit_text:
            unit_text = unit_text.split(':')[-1]
        # Common normalizations
        unit_map = {
            'usd': 'USD',
            'shares': 'shares',
            'pure': 'pure',
            'percent': '%'
        }
        return unit_map.get(unit_text.lower(), unit_text)

    def _extract_nonfraction(self, element: HtmlElement) -> Dict[str, Any]:
        """Extract metadata from an ix:nonFraction element."""
        metadata = {
            'type': 'nonFraction',
            'name': element.get('name'),
            'contextRef': self._get_attr(element, 'contextRef'),
            'unitRef': self._get_attr(element, 'unitRef'),
            'decimals': element.get('decimals'),
            'scale': element.get('scale'),
            'format': element.get('format'),
            'sign': element.get('sign')
        }
        # Drop attributes that were absent
        return {k: v for k, v in metadata.items() if v is not None}

    def _extract_nonnumeric(self, element: HtmlElement) -> Dict[str, Any]:
        """Extract metadata from an ix:nonNumeric element."""
        metadata = {
            'type': 'nonNumeric',
            'name': element.get('name'),
            'contextRef': self._get_attr(element, 'contextRef'),
            'format': element.get('format')
        }
        # Drop attributes that were absent
        return {k: v for k, v in metadata.items() if v is not None}

    def _extract_continuation(self, element: HtmlElement) -> Dict[str, Any]:
        """Extract an ix:continuation element, chaining it to its original."""
        cont_id = element.get('id')
        # Fixed: 'continuedAt' is lowercased by lxml's HTML parser, so the
        # lowercase fallback is required (matches the nonFraction handling).
        continued_at = self._get_attr(element, 'continuedAt')
        if cont_id and continued_at:
            if continued_at in self.continuations:
                # Later link in an already-seen chain: reuse original metadata
                original = self.continuations[continued_at]
                self.continuations[cont_id] = original
                return original
            else:
                # First link seen: store for later resolution
                metadata = {
                    'type': 'continuation',
                    'id': cont_id,
                    'continuedAt': continued_at
                }
                self.continuations[cont_id] = metadata
                return metadata
        return {}

    def _extract_footnote(self, element: HtmlElement) -> Dict[str, Any]:
        """Extract metadata from an ix:footnote element."""
        # Fixed: camelCase attribute names need the lowercase fallback too.
        return {
            'type': 'footnote',
            'footnoteRole': self._get_attr(element, 'footnoteRole'),
            'footnoteID': self._get_attr(element, 'footnoteID')
        }

    def _extract_fraction(self, element: HtmlElement) -> Dict[str, Any]:
        """Extract metadata from an ix:fraction element."""
        # Fixed: previously only the camelCase attribute spellings were read
        # here, unlike the sibling extractors; use the shared helper.
        metadata = {
            'type': 'fraction',
            'name': element.get('name'),
            'contextRef': self._get_attr(element, 'contextRef'),
            'unitRef': self._get_attr(element, 'unitRef')
        }
        # Extract numerator and denominator child facts
        numerator = element.find('.//ix:numerator', namespaces=self.NAMESPACES)
        denominator = element.find('.//ix:denominator', namespaces=self.NAMESPACES)
        if numerator is not None:
            metadata['numerator'] = numerator.text
        if denominator is not None:
            metadata['denominator'] = denominator.text
        return {k: v for k, v in metadata.items() if v is not None}

    def _get_fact_value(self, element: HtmlElement) -> str:
        """
        Get the fact value from an element, applying the ixt transformation,
        scale, and sign attributes in that order.
        """
        # Raw displayed value (direct text only)
        value = element.text or ''
        # Apply ixt transformation if specified (e.g. strip thousands separators)
        format_attr = element.get('format')
        if format_attr and format_attr in self.TRANSFORMATIONS:
            transform = self.TRANSFORMATIONS[format_attr]
            value = transform(value)
        # scale="3" means the displayed value is in thousands, etc.
        scale = element.get('scale')
        if scale:
            try:
                scale_factor = int(scale)
                numeric_value = float(value.replace(',', ''))
                scaled_value = numeric_value * (10 ** scale_factor)
                value = str(scaled_value)
            except (ValueError, TypeError):
                # Non-numeric value: leave it untouched
                pass
        # sign="-" negates the displayed value
        sign = element.get('sign')
        if sign == '-':
            if value and not value.startswith('-'):
                value = '-' + value
        return value.strip()

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,62 @@
"""
Table processing utilities for document parsing.
This module consolidates the standard table matrix processing pipeline used
across table rendering implementations (TableNode.render(), TableNode.to_dataframe(),
and FastTableRenderer.render_table_node()).
"""
from typing import TYPE_CHECKING
if TYPE_CHECKING:
from edgar.documents.utils.table_matrix import TableMatrix, ColumnAnalyzer
from edgar.documents.utils.currency_merger import CurrencyColumnMerger
def process_table_matrix(matrix: "TableMatrix", headers, rows) -> "TableMatrix":
    """
    Standard table matrix processing pipeline.

    This function applies the standard three-step processing pipeline:
    1. Build matrix from headers and rows (handles colspan/rowspan)
    2. Filter out spacing columns (columns with only whitespace)
    3. Detect and merge currency symbol columns with adjacent value columns

    Args:
        matrix: TableMatrix instance to populate
        headers: List of header rows (each row is a list of Cell objects)
        rows: List of data rows (each row is a list of Cell objects)

    Returns:
        Processed TableMatrix with spacing columns removed and currency
        columns merged

    Example:
        >>> matrix = TableMatrix()
        >>> clean_matrix = process_table_matrix(matrix, headers, rows)
        >>> # colspan/rowspan expanded, spacing removed, currencies merged

    Note:
        This consolidates the identical processing sequence that appeared in
        TableNode.render(), TableNode.to_dataframe() and
        FastTableRenderer.render_table_node(). The unused ColumnAnalyzer
        instantiation those copies carried (dead code, never read) has been
        removed.
    """
    # Import at runtime to avoid circular imports
    from edgar.documents.utils.currency_merger import CurrencyColumnMerger

    # Step 1: Build matrix from rows (expands colspan/rowspan)
    matrix.build_from_rows(headers, rows)

    # Step 2: Remove spacing columns (columns with only whitespace/empty cells)
    clean_matrix = matrix.filter_spacing_columns()

    # Step 3: Detect and merge currency columns ($ with adjacent numbers)
    currency_merger = CurrencyColumnMerger(clean_matrix)
    currency_merger.detect_currency_pairs()
    if currency_merger.merge_pairs:
        clean_matrix = currency_merger.apply_merges()

    return clean_matrix

View File

@@ -0,0 +1,282 @@
"""
Type definitions for the HTML parser.
"""
import re
from dataclasses import dataclass
from enum import Enum, auto
from typing import Protocol, Union, Optional, Dict, Any, List
class NodeType(Enum):
    """Types of nodes in the document tree."""
    DOCUMENT = auto()    # Root document node
    SECTION = auto()     # Logical section of the document
    HEADING = auto()     # Heading/title line
    PARAGRAPH = auto()   # Paragraph of body text
    TABLE = auto()       # Table element
    LIST = auto()        # Ordered or unordered list
    LIST_ITEM = auto()   # Single item within a list
    LINK = auto()        # Hyperlink
    IMAGE = auto()       # Embedded image
    XBRL_FACT = auto()   # Inline XBRL fact
    TEXT = auto()        # Plain text run
    CONTAINER = auto()   # Generic grouping node
class SemanticType(Enum):
    """Semantic types for document understanding."""
    TITLE = auto()                # Document title
    HEADER = auto()               # Generic header text
    BODY_TEXT = auto()            # Regular body text
    FOOTNOTE = auto()             # Footnote content
    TABLE_OF_CONTENTS = auto()    # Table-of-contents block
    FINANCIAL_STATEMENT = auto()  # Financial statement content
    DISCLOSURE = auto()           # Disclosure text
    ITEM_HEADER = auto()          # "Item N"-style filing header
    SECTION_HEADER = auto()       # Section heading
    SIGNATURE = auto()            # Signature block
    EXHIBIT = auto()              # Exhibit content
class TableType(Enum):
    """Types of tables for semantic understanding."""
    FINANCIAL = auto()          # Financial-statement table
    METRICS = auto()            # Metrics / mostly-numeric table
    REFERENCE = auto()          # Definitions, glossary, citations
    GENERAL = auto()            # No specific type detected
    TABLE_OF_CONTENTS = auto()  # Navigation / TOC table
    EXHIBIT_INDEX = auto()      # Exhibit index table
@dataclass
class Style:
    """Unified style representation."""
    font_size: Optional[float] = None
    font_weight: Optional[str] = None
    font_style: Optional[str] = None
    text_align: Optional[str] = None
    text_decoration: Optional[str] = None
    color: Optional[str] = None
    background_color: Optional[str] = None
    margin_top: Optional[float] = None
    margin_bottom: Optional[float] = None
    margin_left: Optional[float] = None
    margin_right: Optional[float] = None
    padding_top: Optional[float] = None
    padding_bottom: Optional[float] = None
    padding_left: Optional[float] = None
    padding_right: Optional[float] = None
    display: Optional[str] = None
    width: Optional[Union[float, str]] = None
    height: Optional[Union[float, str]] = None
    line_height: Optional[float] = None

    def merge(self, other: 'Style') -> 'Style':
        """Merge this style with another, with other taking precedence."""
        combined = Style()
        # A None in `other` means "not specified": fall back to self's value.
        for name in self.__dataclass_fields__:
            override = getattr(other, name)
            setattr(combined, name, getattr(self, name) if override is None else override)
        return combined

    @property
    def is_bold(self) -> bool:
        """True when the font weight denotes bold text."""
        return self.font_weight in ('bold', '700', '800', '900')

    @property
    def is_italic(self) -> bool:
        """True when the font style is italic."""
        return self.font_style == 'italic'

    @property
    def is_centered(self) -> bool:
        """True when the text alignment is centered."""
        return self.text_align == 'center'
class NodeProtocol(Protocol):
    """Protocol for all nodes."""
    # Identity and classification
    id: str
    type: NodeType
    content: Any
    metadata: Dict[str, Any]
    style: Style
    # Tree links
    parent: Optional['NodeProtocol']
    children: List['NodeProtocol']
    def text(self) -> str: ...  # Plain-text rendering of the node
    def html(self) -> str: ...  # HTML rendering of the node
    def find(self, predicate) -> List['NodeProtocol']: ...  # Descendants matching predicate
@dataclass
class HeaderInfo:
    """Information about detected headers."""
    level: int  # 1-6
    confidence: float  # 0.0-1.0
    text: str
    detection_method: str
    is_item: bool = False
    item_number: Optional[str] = None

    @classmethod
    def from_text(cls, text: str, level: int, confidence: float, method: str) -> 'HeaderInfo':
        """Create HeaderInfo from text, flagging 'Item N'-style headers."""
        # Matches e.g. "Item 1A." or "ITEM 7" at the start of the text.
        match = re.match(r'^(Item|ITEM)\s+(\d+[A-Z]?\.?)', text.strip(), re.IGNORECASE)
        if match:
            return cls(
                level=level,
                confidence=confidence,
                text=text,
                detection_method=method,
                is_item=True,
                # Trailing dot is presentation, not part of the item number.
                item_number=match.group(2).rstrip('.')
            )
        return cls(
            level=level,
            confidence=confidence,
            text=text,
            detection_method=method
        )
@dataclass
class XBRLFact:
    """Represents an XBRL fact extracted from inline XBRL."""
    concept: str
    value: str
    context_ref: Optional[str] = None
    unit_ref: Optional[str] = None
    decimals: Optional[str] = None
    scale: Optional[str] = None
    format: Optional[str] = None
    sign: Optional[str] = None
    # Resolved references
    context: Optional[Dict[str, Any]] = None
    unit: Optional[str] = None
    # Additional metadata
    metadata: Optional[Dict[str, Any]] = None

    @property
    def numeric_value(self) -> Optional[float]:
        """Numeric value of the fact, or None when it cannot be parsed."""
        try:
            # Strip thousands separators before conversion.
            return float(self.value.replace(',', ''))
        except (ValueError, AttributeError):
            return None

    @property
    def is_numeric(self) -> bool:
        """True when the fact value parses as a number."""
        return self.numeric_value is not None

    def to_dict(self) -> Dict[str, Any]:
        """Convert XBRLFact to dictionary."""
        result = {name: getattr(self, name) for name in (
            'concept', 'value', 'context_ref', 'unit_ref',
            'decimals', 'scale', 'format', 'sign', 'context', 'unit')}
        result['is_numeric'] = self.is_numeric
        result['numeric_value'] = self.numeric_value
        return result
@dataclass
class SearchResult:
    """
    Result from document search.

    Designed for agent-friendly investigation workflows - provides access to
    full section context rather than fragmented chunks.
    """
    node: 'NodeProtocol'
    score: float
    snippet: str
    section: Optional[str] = None
    context: Optional[str] = None
    _section_obj: Optional[Any] = None  # Hidden Section object for agent navigation

    @property
    def section_object(self) -> Optional[Any]:
        """
        Full Section object for agent navigation, or None.

        Enables multi-step investigation by exposing the complete section,
        not just the matched fragment.
        """
        return self._section_obj

    @property
    def full_context(self) -> str:
        """
        Complete section text when a section is attached, else the snippet.

        Supports the post-RAG "investigation not retrieval" pattern by
        returning whole-section content instead of fragmented chunks.
        """
        section = self._section_obj
        if section and hasattr(section, 'text'):
            return section.text()
        return self.snippet
@dataclass
class ParseContext:
    """Context information during parsing."""
    base_font_size: float = 10.0
    current_section: Optional[str] = None
    in_table: bool = False
    in_list: bool = False
    depth: int = 0
    style_stack: List[Style] = None

    def __post_init__(self):
        # Dataclasses cannot use a mutable default; normalize None to a list.
        if self.style_stack is None:
            self.style_stack = []

    def push_style(self, style: Style):
        """Push style onto stack."""
        self.style_stack.append(style)

    def pop_style(self):
        """Pop style from stack (no-op when empty)."""
        if self.style_stack:
            self.style_stack.pop()

    def get_current_style(self) -> Style:
        """Fold the stack into one effective style (later entries win)."""
        if not self.style_stack:
            return Style()
        effective, *overlays = self.style_stack
        for overlay in overlays:
            effective = effective.merge(overlay)
        return effective
# Type aliases for clarity
NodeId = str       # Unique identifier of a document node
SectionName = str  # Human-readable section name
ConceptName = str  # XBRL concept name (e.g. 'us-gaap:Revenues')
ContextRef = str   # Reference to an XBRL context definition
UnitRef = str      # Reference to an XBRL unit definition

View File

@@ -0,0 +1,51 @@
"""
Utility modules for HTML parsing.
"""
from edgar.documents.utils.cache import (
LRUCache,
WeakCache,
TimeBasedCache,
CacheManager,
get_cache_manager,
cached,
CacheStats
)
from edgar.documents.utils.streaming import (
StreamingParser
)
from edgar.documents.utils.table_matrix import (
TableMatrix,
ColumnAnalyzer,
MatrixCell
)
from edgar.documents.utils.currency_merger import (
CurrencyColumnMerger
)
# Note: CacheableMixin not exported to avoid circular imports
# Import directly: from edgar.documents.cache_mixin import CacheableMixin
from edgar.documents.utils.html_utils import (
remove_xml_declaration,
create_lxml_parser
)
# Note: table_utils not exported to avoid circular imports
# Import directly: from edgar.documents.utils.table_utils import process_table_matrix
# Public API of edgar.documents.utils (names exported on star-import).
__all__ = [
    'LRUCache',
    'WeakCache',
    'TimeBasedCache',
    'CacheManager',
    'get_cache_manager',
    'cached',
    'CacheStats',
    'StreamingParser',
    'TableMatrix',
    'ColumnAnalyzer',
    'MatrixCell',
    'CurrencyColumnMerger',
    # 'CacheableMixin', # Not exported - import directly to avoid circular imports
    'remove_xml_declaration',
    'create_lxml_parser',
    # 'process_table_matrix' # Not exported - import directly to avoid circular imports
]

View File

@@ -0,0 +1,205 @@
"""
Lightweight anchor analysis cache to avoid re-parsing HTML.
This provides a middle-ground approach that caches anchor analysis results
while minimizing memory overhead.
"""
import re
from typing import Dict, Set, Optional
from collections import Counter
import hashlib
import pickle
from pathlib import Path
class AnchorCache:
    """
    Cache for anchor link analysis results.

    Stores navigation patterns by HTML hash to avoid re-analysis. Results are
    kept both in memory (for the current session) and on disk.
    """

    def __init__(self, cache_dir: Optional[Path] = None):
        """
        Initialize the cache.

        Args:
            cache_dir: Directory for the on-disk cache
                       (default: ~/.edgar_cache/anchors; created if missing)
        """
        self.cache_dir = cache_dir or Path.home() / '.edgar_cache' / 'anchors'
        self.cache_dir.mkdir(parents=True, exist_ok=True)
        self._memory_cache = {}  # In-memory cache for current session

    def _get_html_hash(self, html_content: str) -> str:
        """Get hash of HTML content for caching (cache key only, not security)."""
        return hashlib.md5(html_content.encode('utf-8')).hexdigest()

    def get_navigation_patterns(self, html_content: str) -> Optional[Set[str]]:
        """
        Get cached navigation patterns for HTML content.

        Args:
            html_content: HTML to analyze

        Returns:
            Set of navigation patterns or None if not cached
        """
        html_hash = self._get_html_hash(html_content)
        # Check in-memory cache first
        if html_hash in self._memory_cache:
            return self._memory_cache[html_hash]
        # Check disk cache
        cache_file = self.cache_dir / f"{html_hash}.pkl"
        if cache_file.exists():
            try:
                with open(cache_file, 'rb') as f:
                    patterns = pickle.load(f)
                self._memory_cache[html_hash] = patterns
                return patterns
            except Exception:
                # Fixed: was a bare `except:`. A corrupted/unreadable cache
                # file is removed and treated as a miss, but control-flow
                # exceptions (KeyboardInterrupt, SystemExit) now propagate.
                cache_file.unlink(missing_ok=True)
        return None

    def cache_navigation_patterns(self, html_content: str, patterns: Set[str]) -> None:
        """
        Cache navigation patterns for HTML content.

        Args:
            html_content: HTML content
            patterns: Navigation patterns to cache
        """
        html_hash = self._get_html_hash(html_content)
        # Store in memory
        self._memory_cache[html_hash] = patterns
        # Store on disk; persistence is best-effort only
        try:
            cache_file = self.cache_dir / f"{html_hash}.pkl"
            with open(cache_file, 'wb') as f:
                pickle.dump(patterns, f)
        except Exception:
            # Fixed: was a bare `except:`. A failed disk write must not break
            # parsing, but control-flow exceptions now propagate.
            pass

    def clear_cache(self) -> None:
        """Clear all cached data, both in memory and on disk."""
        self._memory_cache.clear()
        for cache_file in self.cache_dir.glob("*.pkl"):
            cache_file.unlink(missing_ok=True)
# Global cache instance
# Module-level singleton shared by the helper functions in this module.
_anchor_cache = AnchorCache()
def get_cached_navigation_patterns(html_content: str,
                                   force_analyze: bool = False) -> Set[str]:
    """
    Get navigation patterns with caching.

    Args:
        html_content: HTML to analyze
        force_analyze: Force re-analysis even if cached

    Returns:
        Set of navigation link texts to filter
    """
    # Serve from cache unless the caller explicitly asks for a re-analysis.
    if not force_analyze:
        hit = _anchor_cache.get_navigation_patterns(html_content)
        if hit is not None:
            return hit

    # Cache miss (or forced): run the lightweight regex analysis and cache it.
    patterns = _analyze_navigation_minimal(html_content)
    _anchor_cache.cache_navigation_patterns(html_content, patterns)
    return patterns
def _analyze_navigation_minimal(html_content: str, min_frequency: int = 5) -> Set[str]:
    """
    Minimal navigation analysis using regex instead of full HTML parsing.

    Avoids BeautifulSoup overhead by locating same-document anchor links
    (href="#...") with a regex and collecting link texts that repeat at
    least min_frequency times.
    """
    # Same-document anchors: capture the fragment id and the link body.
    anchor_re = re.compile(
        r'<a[^>]*href\s*=\s*["\']#([^"\']*)["\'][^>]*>(.*?)</a>',
        re.IGNORECASE | re.DOTALL
    )

    counts = Counter()
    for hit in anchor_re.finditer(html_content):
        # Strip any inner markup and collapse whitespace in the link text.
        label = re.sub(r'<[^>]+>', '', hit.group(2))
        label = ' '.join(label.split())
        # Ignore empty and implausibly long link texts.
        if label and len(label) < 100:
            counts[label] += 1

    # Only frequently repeated link texts count as navigation chrome.
    return {label for label, seen in counts.items() if seen >= min_frequency}
def filter_with_cached_patterns(text: str, html_content: str = None) -> str:
    """
    Filter text using cached navigation patterns.

    Preserves the first occurrences of each pattern (likely genuine document
    structure headers) while dropping later, repetitive navigation links.

    Args:
        text: Text to filter
        html_content: HTML for pattern analysis (optional)

    Returns:
        Filtered text
    """
    if not text:
        return text

    # Derive patterns from the HTML when available, otherwise fall back to
    # labels that commonly appear as navigation chrome in SEC filings.
    if html_content:
        patterns = get_cached_navigation_patterns(html_content)
    else:
        patterns = {
            'Table of Contents',
            'Index to Financial Statements',
            'Index to Exhibits'
        }

    if not patterns:
        return text

    # Allow up to this many occurrences of each pattern; those are likely
    # real structure headers rather than repeated navigation links.
    max_occurrences = 2
    occurrence_counts = {}
    kept = []
    for line in text.split('\n'):
        candidate = line.strip()
        if candidate not in patterns:
            # Not a navigation pattern: always keep.
            kept.append(line)
            continue
        seen = occurrence_counts.get(candidate, 0)
        if seen < max_occurrences:
            kept.append(line)
            occurrence_counts[candidate] = seen + 1
        # else: a repetitive navigation link — drop it.
    return '\n'.join(kept)

View File

@@ -0,0 +1,426 @@
"""
Cache utilities for performance optimization.
"""
import weakref
from collections import OrderedDict
from typing import Any, Dict, Optional, Callable, TypeVar, Generic
from functools import wraps
import time
import threading
from dataclasses import dataclass, field
from datetime import datetime, timedelta
T = TypeVar('T')
@dataclass
class CacheStats:
    """Statistics for cache performance monitoring."""
    hits: int = 0
    misses: int = 0
    evictions: int = 0
    total_time: float = 0.0
    last_reset: datetime = field(default_factory=datetime.now)

    @property
    def hit_rate(self) -> float:
        """Fraction of lookups served from cache (0.0 when untouched)."""
        lookups = self.hits + self.misses
        if lookups == 0:
            return 0.0
        return self.hits / lookups

    @property
    def avg_access_time(self) -> float:
        """Mean time spent per lookup in seconds (0.0 when untouched)."""
        lookups = self.hits + self.misses
        if lookups == 0:
            return 0.0
        return self.total_time / lookups

    def reset(self):
        """Zero all counters and stamp the reset time."""
        self.hits = 0
        self.misses = 0
        self.evictions = 0
        self.total_time = 0.0
        self.last_reset = datetime.now()
class LRUCache(Generic[T]):
    """
    Thread-safe LRU cache implementation.

    Used for caching expensive operations like style parsing
    and header detection results.
    """

    def __init__(self, max_size: int = 1000):
        """
        Initialize LRU cache.

        Args:
            max_size: Maximum number of items to cache
        """
        self.max_size = max_size
        self._cache: OrderedDict[str, T] = OrderedDict()
        self._lock = threading.RLock()  # re-entrant: safe for nested access
        self.stats = CacheStats()

    def get(self, key: str) -> Optional[T]:
        """
        Get item from cache, refreshing its recency on a hit.

        Args:
            key: Cache key

        Returns:
            Cached value or None if not found
        """
        started = time.time()
        with self._lock:
            try:
                value = self._cache[key]
            except KeyError:
                self.stats.misses += 1
                self.stats.total_time += time.time() - started
                return None
            # Move to end (most recently used)
            self._cache.move_to_end(key)
            self.stats.hits += 1
            self.stats.total_time += time.time() - started
            return value

    def put(self, key: str, value: T) -> None:
        """
        Put item in cache, evicting the least-recently-used entry when full.

        Args:
            key: Cache key
            value: Value to cache
        """
        with self._lock:
            if key in self._cache:
                # Refresh recency before overwriting the existing entry.
                self._cache.move_to_end(key)
            self._cache[key] = value
            # Evict oldest if over capacity (only possible on a new key).
            if len(self._cache) > self.max_size:
                self._cache.popitem(last=False)
                self.stats.evictions += 1

    def clear(self) -> None:
        """Clear all cached items."""
        with self._lock:
            self._cache.clear()

    def size(self) -> int:
        """Get current cache size."""
        with self._lock:
            return len(self._cache)
class WeakCache:
    """
    Weak reference cache for parsed nodes.

    Allows garbage collection of unused nodes while
    maintaining references to actively used ones.
    """

    def __init__(self):
        """Initialize weak cache."""
        self._cache: Dict[str, weakref.ref] = {}
        self._lock = threading.RLock()
        self.stats = CacheStats()

    def get(self, key: str) -> Optional[Any]:
        """
        Get item from cache.

        Args:
            key: Cache key

        Returns:
            Cached object, or None if absent or already collected
        """
        started = time.time()
        with self._lock:
            ref = self._cache.get(key)
            if ref is not None:
                target = ref()
                if target is not None:
                    self.stats.hits += 1
                    self.stats.total_time += time.time() - started
                    return target
                # Referent was garbage collected: drop the stale entry.
                del self._cache[key]
            self.stats.misses += 1
            self.stats.total_time += time.time() - started
            return None

    def put(self, key: str, value: Any) -> None:
        """
        Put item in cache with weak reference.

        Args:
            key: Cache key
            value: Object to cache (must support weak references)
        """
        with self._lock:
            self._cache[key] = weakref.ref(value)

    def clear(self) -> None:
        """Clear all cached references."""
        with self._lock:
            self._cache.clear()

    def cleanup(self) -> int:
        """
        Remove dead references.

        Returns:
            Number of references removed
        """
        with self._lock:
            dead = [k for k, ref in self._cache.items() if ref() is None]
            for k in dead:
                del self._cache[k]
            return len(dead)
class TimeBasedCache(Generic[T]):
    """
    Time-based expiring cache.

    Items expire after a specified duration.
    """

    def __init__(self, ttl_seconds: int = 3600):
        """
        Initialize time-based cache.

        Args:
            ttl_seconds: Time to live in seconds
        """
        self.ttl = timedelta(seconds=ttl_seconds)
        self._cache: Dict[str, tuple[T, datetime]] = {}
        self._lock = threading.RLock()
        self.stats = CacheStats()

    def get(self, key: str) -> Optional[T]:
        """
        Get item from cache if not expired.

        Args:
            key: Cache key

        Returns:
            Cached value or None if not found or expired
        """
        started = time.time()
        with self._lock:
            entry = self._cache.get(key)
            if entry is not None:
                value, stored_at = entry
                if datetime.now() - stored_at < self.ttl:
                    self.stats.hits += 1
                    self.stats.total_time += time.time() - started
                    return value
                # Entry outlived its TTL: evict and fall through to a miss.
                del self._cache[key]
                self.stats.evictions += 1
            self.stats.misses += 1
            self.stats.total_time += time.time() - started
            return None

    def put(self, key: str, value: T) -> None:
        """
        Put item in cache with timestamp.

        Args:
            key: Cache key
            value: Value to cache
        """
        with self._lock:
            self._cache[key] = (value, datetime.now())

    def clear(self) -> None:
        """Clear all cached items."""
        with self._lock:
            self._cache.clear()

    def cleanup(self) -> int:
        """
        Remove expired items.

        Returns:
            Number of items removed
        """
        with self._lock:
            cutoff = datetime.now()
            stale = [k for k, (_, stamp) in self._cache.items() if cutoff - stamp >= self.ttl]
            for k in stale:
                del self._cache[k]
                self.stats.evictions += 1
            return len(stale)
def cached(cache: LRUCache, key_func: Optional[Callable] = None):
    """
    Decorator for caching function results.

    Args:
        cache: Cache instance to use
        key_func: Function to generate cache key from arguments

    Returns:
        Decorated function
    """
    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            # Build the cache key: caller-supplied generator, or a default
            # derived from the function name and its arguments.
            if key_func:
                cache_key = key_func(*args, **kwargs)
            else:
                cache_key = f"{func.__name__}:{str(args)}:{str(sorted(kwargs.items()))}"

            # Serve from cache when possible (note: a cached None is
            # indistinguishable from a miss and will be recomputed).
            hit = cache.get(cache_key)
            if hit is not None:
                return hit

            # Compute, remember, return.
            computed = func(*args, **kwargs)
            cache.put(cache_key, computed)
            return computed
        return wrapper
    return decorator
class CacheManager:
    """
    Manages multiple caches for the parser.

    Provides centralized cache management and monitoring.
    """

    def __init__(self):
        """Initialize cache manager."""
        # One cache per expensive operation class.
        self.style_cache = LRUCache[dict](max_size=5000)     # style parsing
        self.header_cache = LRUCache[bool](max_size=2000)    # header detection
        self.pattern_cache = LRUCache[bool](max_size=10000)  # pattern matching
        self.node_cache = WeakCache()                        # node references
        self.regex_cache = LRUCache[Any](max_size=500)       # compiled regexes
        # Registry used by the management/monitoring helpers below.
        self._caches = {
            'style': self.style_cache,
            'header': self.header_cache,
            'pattern': self.pattern_cache,
            'node': self.node_cache,
            'regex': self.regex_cache
        }

    def get_stats(self) -> Dict[str, CacheStats]:
        """Get statistics for all caches."""
        return {
            name: cache.stats
            for name, cache in self._caches.items()
            if hasattr(cache, 'stats')
        }

    def reset_stats(self) -> None:
        """Reset statistics for all caches."""
        for cache in self._caches.values():
            if hasattr(cache, 'stats'):
                cache.stats.reset()

    def clear_all(self) -> None:
        """Clear all caches."""
        for cache in self._caches.values():
            cache.clear()

    def cleanup(self) -> Dict[str, int]:
        """
        Cleanup expired/dead entries in all caches.

        Returns:
            Number of entries cleaned up per cache
        """
        removed = {}
        # Only the weak cache currently supports cleanup.
        if hasattr(self.node_cache, 'cleanup'):
            removed['node'] = self.node_cache.cleanup()
        return removed

    def get_memory_usage(self) -> Dict[str, int]:
        """
        Estimate memory usage of caches.

        Returns:
            Approximate memory usage in bytes per cache
        """
        import sys
        usage = {}
        for name, cache in self._caches.items():
            if not hasattr(cache, '_cache'):
                continue
            # Rough estimation: shallow sizes of keys and values only.
            total = 0
            if isinstance(cache._cache, dict):
                for key, value in cache._cache.items():
                    total += sys.getsizeof(key)
                    # Fall back to a flat guess for exotic values.
                    total += sys.getsizeof(value) if hasattr(value, '__sizeof__') else 1000
            usage[name] = total
        return usage
# Global cache manager instance (created lazily)
_cache_manager = None


def get_cache_manager() -> CacheManager:
    """Return the process-wide CacheManager, creating it on first access."""
    global _cache_manager
    manager = _cache_manager
    if manager is None:
        manager = CacheManager()
        _cache_manager = manager
    return manager

View File

@@ -0,0 +1,277 @@
"""
Currency column merger for handling separated currency symbols in SEC filings.
"""
import re
from typing import List, Tuple
from edgar.documents.table_nodes import Cell
from edgar.documents.utils.table_matrix import TableMatrix, MatrixCell
class CurrencyColumnMerger:
    """
    Detects and merges currency symbol columns with their value columns.

    SEC filings often split currency values into two cells:
    - Cell 1: "$" (left-aligned)
    - Cell 2: "224.11" (right-aligned)

    This class detects this pattern and merges them into "$224.11"
    """

    # Common currency symbols.
    # NOTE(review): this set previously contained two empty strings where
    # non-ASCII symbols were lost in an encoding round-trip; restored here
    # as the euro and rupee signs -- confirm against file history.
    CURRENCY_SYMBOLS = {'$', '€', '£', '¥', '₹', 'Rs', 'USD', 'EUR', 'GBP'}

    # Pattern for numeric values (with commas, decimals)
    NUMERIC_PATTERN = re.compile(r'^[\d,]+\.?\d*$')

    def __init__(self, matrix: TableMatrix):
        """Initialize with a table matrix."""
        self.matrix = matrix
        # (symbol_col, value_col) pairs found by detect_currency_pairs()
        self.merge_pairs: List[Tuple[int, int]] = []

    def detect_currency_pairs(self) -> List[Tuple[int, int]]:
        """
        Detect column pairs that should be merged (currency symbol + value).

        Returns:
            List of (symbol_col, value_col) pairs to merge
        """
        pairs = []

        for col_idx in range(self.matrix.col_count - 1):
            if self._is_currency_column(col_idx):
                next_col = col_idx + 1
                if self._is_numeric_column(next_col):
                    # Check if they're consistently paired
                    if self._verify_pairing(col_idx, next_col):
                        pairs.append((col_idx, next_col))

        self.merge_pairs = pairs
        return pairs

    def _is_currency_column(self, col_idx: int) -> bool:
        """
        Check if a column contains only currency symbols.

        A currency column typically:
        - Contains only currency symbols or empty cells
        - Has very narrow width (1-3 characters)
        - Is left-aligned (though we check content, not style)
        """
        currency_count = 0
        empty_count = 0
        other_count = 0

        for row_idx in range(self.matrix.row_count):
            cell = self.matrix.matrix[row_idx][col_idx]
            if cell.original_cell and not cell.is_spanned:
                text = cell.original_cell.text().strip()

                # Skip header rows (first 2 rows typically)
                if row_idx < 2 and text and text not in self.CURRENCY_SYMBOLS:
                    continue

                if not text:
                    empty_count += 1
                elif text in self.CURRENCY_SYMBOLS:
                    # The previous code re-tested '$' and a short symbol
                    # list here; both were redundant subsets of
                    # CURRENCY_SYMBOLS and have been folded into this check.
                    currency_count += 1
                else:
                    other_count += 1

        # Column should be mostly currency symbols with some empty cells.
        # Header rows were excluded from the counts above.
        total_non_empty = currency_count + other_count
        if total_non_empty == 0:
            return False

        # Accept either: at least one symbol and nothing else non-currency,
        # or at least 60% of non-empty, non-header cells are symbols.
        return (currency_count >= 1 and other_count == 0) or \
               (currency_count >= 2 and currency_count / total_non_empty >= 0.6)

    def _is_numeric_column(self, col_idx: int) -> bool:
        """
        Check if a column contains numeric values.
        """
        numeric_count = 0
        non_empty_count = 0

        for row_idx in range(self.matrix.row_count):
            cell = self.matrix.matrix[row_idx][col_idx]
            if cell.original_cell and not cell.is_spanned:
                text = cell.original_cell.text().strip()

                # Skip header rows
                if row_idx < 2:
                    continue

                if text:
                    non_empty_count += 1
                    # Remove formatting and check if numeric
                    clean_text = text.replace(',', '').replace('%', '').replace('(', '').replace(')', '')
                    if self.NUMERIC_PATTERN.match(clean_text):
                        numeric_count += 1

        if non_empty_count == 0:
            return False

        # At least 60% should be numeric (lowered threshold)
        return numeric_count / non_empty_count >= 0.6

    def _verify_pairing(self, symbol_col: int, value_col: int) -> bool:
        """
        Verify that symbol and value columns are consistently paired.

        They should have content in the same rows (when symbol present, value present).
        """
        paired_rows = 0
        mismatched_rows = 0

        for row_idx in range(self.matrix.row_count):
            symbol_cell = self.matrix.matrix[row_idx][symbol_col]
            value_cell = self.matrix.matrix[row_idx][value_col]

            if symbol_cell.original_cell and value_cell.original_cell:
                symbol_text = symbol_cell.original_cell.text().strip()
                value_text = value_cell.original_cell.text().strip()

                # Check if they're paired (both have content or both empty)
                if symbol_text in self.CURRENCY_SYMBOLS and value_text:
                    paired_rows += 1
                elif not symbol_text and not value_text:
                    # Both empty is fine
                    pass
                elif symbol_text in self.CURRENCY_SYMBOLS and not value_text:
                    # Symbol without value - might be header
                    if row_idx < 2:  # Allow in headers
                        pass
                    else:
                        mismatched_rows += 1
                elif not symbol_text and value_text:
                    # Value without symbol - could be valid (continuation)
                    pass

        # Should have more paired than mismatched
        return paired_rows > mismatched_rows

    def apply_merges(self) -> 'TableMatrix':
        """
        Create a new matrix with currency columns merged.

        Returns:
            New TableMatrix with merged columns
        """
        if not self.merge_pairs:
            self.detect_currency_pairs()

        if not self.merge_pairs:
            # No merges needed
            return self.matrix

        # Calculate new column count (each merge removes one column)
        new_col_count = self.matrix.col_count - len(self.merge_pairs)

        # Create mapping from old to new columns
        old_to_new = {}
        merged_cols = set(pair[0] for pair in self.merge_pairs)  # Symbol columns to remove

        new_col = 0
        for old_col in range(self.matrix.col_count):
            if old_col in merged_cols:
                # This column will be merged with next, skip it
                continue
            old_to_new[old_col] = new_col
            new_col += 1

        # Create new matrix
        new_matrix = TableMatrix()
        new_matrix.row_count = self.matrix.row_count
        new_matrix.col_count = new_col_count
        # Preserve header-row bookkeeping; previously this was dropped and
        # the merged matrix always reported header_row_count == 0.
        new_matrix.header_row_count = self.matrix.header_row_count
        new_matrix.matrix = []

        # Build new matrix with merged cells
        for row_idx in range(self.matrix.row_count):
            new_row = [MatrixCell() for _ in range(new_col_count)]

            for old_col in range(self.matrix.col_count):
                # Check if this is a symbol column to merge
                merge_pair = next((pair for pair in self.merge_pairs if pair[0] == old_col), None)

                if merge_pair:
                    # Merge symbol with value
                    symbol_col, value_col = merge_pair
                    symbol_cell = self.matrix.matrix[row_idx][symbol_col]
                    value_cell = self.matrix.matrix[row_idx][value_col]

                    if value_cell.original_cell:
                        # Create merged cell
                        new_cell_content = self._merge_cell_content(symbol_cell, value_cell)
                        if new_cell_content:
                            # Create new merged cell
                            merged_cell = Cell(
                                content=new_cell_content,
                                colspan=value_cell.original_cell.colspan,
                                rowspan=value_cell.original_cell.rowspan,
                                is_header=value_cell.original_cell.is_header,
                                align=value_cell.original_cell.align
                            )
                            new_col_idx = old_to_new.get(value_col)
                            if new_col_idx is not None:
                                new_row[new_col_idx] = MatrixCell(
                                    original_cell=merged_cell,
                                    is_spanned=False,
                                    row_origin=row_idx,
                                    col_origin=new_col_idx
                                )
                elif old_col not in set(pair[1] for pair in self.merge_pairs):
                    # Regular column, not involved in merging
                    new_col_idx = old_to_new.get(old_col)
                    if new_col_idx is not None:
                        new_row[new_col_idx] = self.matrix.matrix[row_idx][old_col]

            new_matrix.matrix.append(new_row)

        return new_matrix

    def _merge_cell_content(self, symbol_cell: MatrixCell, value_cell: MatrixCell) -> str:
        """
        Merge symbol and value cell contents.

        Returns:
            Merged content like "$224.11" or original value if no symbol
        """
        value_text = value_cell.original_cell.text().strip() if value_cell.original_cell else ""
        symbol_text = symbol_cell.original_cell.text().strip() if symbol_cell.original_cell else ""

        if not value_text:
            return symbol_text  # Just return symbol if no value

        if symbol_text in self.CURRENCY_SYMBOLS:
            # Prepend the symbol directly, with no separating space. (The
            # old code special-cased '$' but both branches produced the
            # same result, so they were collapsed.)
            return f"{symbol_text}{value_text}"

        # No symbol, just return value
        return value_text

    def get_merge_summary(self) -> str:
        """Get a summary of merges to be applied."""
        if not self.merge_pairs:
            return "No currency column merges detected"

        summary = f"Currency merges detected: {len(self.merge_pairs)} pairs\n"
        for symbol_col, value_col in self.merge_pairs:
            summary += f"  • Column {symbol_col} ($) + Column {value_col} (value)\n"
        return summary

View File

@@ -0,0 +1,96 @@
"""
HTML utility functions for document parsing.
This module consolidates common HTML processing utilities used across
the parser, preprocessor, and simple parser implementations.
"""
import lxml.html
from typing import Optional
def remove_xml_declaration(html: str) -> str:
    """
    Remove XML declaration from HTML if present.

    SEC HTML documents sometimes include XML declarations like:
    <?xml version="1.0" encoding="UTF-8"?>

    These can interfere with HTML parsing and are safely removed since
    the encoding is handled separately by the parser.

    Args:
        html: HTML string that may contain XML declaration

    Returns:
        HTML string with XML declaration removed (if present)

    Examples:
        >>> html = '<?xml version="1.0"?><!DOCTYPE html><html>...'
        >>> remove_xml_declaration(html)
        '<!DOCTYPE html><html>...'

        >>> html = '<!DOCTYPE html><html>...'  # No XML declaration
        >>> remove_xml_declaration(html)
        '<!DOCTYPE html><html>...'
    """
    if html.strip().startswith('<?xml'):
        xml_end = html.find('?>')
        if xml_end == -1:
            # Malformed declaration with no '?>' terminator: previously
            # find() returned -1 and the slice at (-1 + 2) chopped the
            # first character. Leave the input untouched instead.
            return html
        return html[xml_end + 2:]
    return html
def create_lxml_parser(
    remove_blank_text: bool = True,
    remove_comments: bool = True,
    recover: bool = True,
    encoding: Optional[str] = 'utf-8'
) -> lxml.html.HTMLParser:
    """
    Create a configured lxml HTMLParser.

    Factory for the lxml HTMLParser settings shared across the document
    parsing system.

    Args:
        remove_blank_text: Remove blank text nodes between tags.
            Default True for cleaner tree structure.
        remove_comments: Remove HTML comments from parsed tree.
            Default True since comments are rarely needed.
        recover: Enable error recovery mode to handle malformed HTML.
            Default True since SEC filings often have HTML issues.
        encoding: Character encoding for the parser.
            Default 'utf-8'. Set to None to disable encoding handling.

    Returns:
        Configured lxml.html.HTMLParser instance

    Examples:
        >>> # Standard parser (removes whitespace and comments, recovers from errors)
        >>> parser = create_lxml_parser()

        >>> # Parser that preserves all content (for XBRL)
        >>> parser = create_lxml_parser(
        ...     remove_blank_text=False,
        ...     remove_comments=False
        ... )

        >>> # Parser without encoding (auto-detect)
        >>> parser = create_lxml_parser(encoding=None)

    Note:
        The recover=True setting is critical for SEC documents which
        often contain non-standard HTML structures.
    """
    if encoding is None:
        # No encoding argument at all: let lxml auto-detect.
        return lxml.html.HTMLParser(
            remove_blank_text=remove_blank_text,
            remove_comments=remove_comments,
            recover=recover,
        )
    return lxml.html.HTMLParser(
        remove_blank_text=remove_blank_text,
        remove_comments=remove_comments,
        recover=recover,
        encoding=encoding,
    )

View File

@@ -0,0 +1,375 @@
"""
Streaming parser for large HTML documents.
"""
import io
from typing import Dict, Any, TYPE_CHECKING
from lxml import etree
from lxml.html import HtmlElement
from edgar.documents.config import ParserConfig
from edgar.documents.exceptions import HTMLParsingError, DocumentTooLargeError
# Use TYPE_CHECKING to avoid circular imports
if TYPE_CHECKING:
from edgar.documents.document import Document, DocumentMetadata
from edgar.documents.nodes import DocumentNode, HeadingNode, ParagraphNode, TextNode, SectionNode, ContainerNode
from edgar.documents.table_nodes import TableNode
from edgar.documents.types import SemanticType
class StreamingParser:
    """
    Streaming parser for large HTML documents.

    Processes documents in chunks to minimize memory usage
    while maintaining parse quality.
    """

    # Chunk size for streaming (1MB)
    CHUNK_SIZE = 1024 * 1024

    # Maximum node buffer before flush
    MAX_NODE_BUFFER = 1000

    def __init__(self, config: ParserConfig, strategies: Dict[str, Any]):
        """
        Initialize streaming parser.

        Args:
            config: Parser configuration
            strategies: Parsing strategies to use
        """
        self.config = config
        self.strategies = strategies
        self._reset_state()

    def _reset_state(self):
        """Reset parser state so parse() can be called repeatedly."""
        # Import here to avoid circular import
        from edgar.documents.document import DocumentMetadata
        from edgar.documents.nodes import DocumentNode

        self.current_section = None
        self.node_buffer = []
        self.metadata = DocumentMetadata()
        self.root = DocumentNode()
        self.current_parent = self.root
        self.tag_stack = []
        self.text_buffer = []
        self.in_table = False
        self.table_buffer = []
        # Last seen <table> element; reset here so state from a previous
        # parse() call cannot leak into the next one (it was previously
        # only ever set in _start_table).
        self.table_elem = None
        self.bytes_processed = 0

    def parse(self, html: str) -> "Document":
        """
        Parse HTML in streaming mode.

        Args:
            html: HTML content to parse

        Returns:
            Parsed Document

        Raises:
            DocumentTooLargeError: If document exceeds size limit
            HTMLParsingError: If parsing fails
        """
        self._reset_state()

        # Store original HTML BEFORE parsing (needed for TOC-based section detection)
        original_html = html

        try:
            # Create streaming parser
            parser = etree.iterparse(
                io.BytesIO(html.encode('utf-8')),
                events=('start', 'end'),
                html=True,
                recover=True,
                encoding='utf-8'
            )

            # Process events
            for event, elem in parser:
                self._process_event(event, elem)

                # Check size limit
                self.bytes_processed += len(etree.tostring(elem, encoding='unicode', method='html'))
                if self.bytes_processed > self.config.max_document_size:
                    raise DocumentTooLargeError(self.bytes_processed, self.config.max_document_size)

                # Flush buffer if needed
                if len(self.node_buffer) >= self.MAX_NODE_BUFFER:
                    self._flush_buffer()

                # Clean up processed elements to save memory
                elem.clear()
                while elem.getprevious() is not None:
                    parent = elem.getparent()
                    if parent is not None:
                        del parent[0]
                    else:
                        break

            # Final flush
            self._flush_buffer()

            # Store original HTML in metadata for section detection (TOC analysis)
            self.metadata.original_html = original_html

            # Create document (import here to avoid circular import)
            from edgar.documents.document import Document
            document = Document(root=self.root, metadata=self.metadata)

            # Store config reference (required for section detection)
            document._config = self.config

            # Apply post-processing
            from edgar.documents.processors.postprocessor import DocumentPostprocessor
            postprocessor = DocumentPostprocessor(self.config)
            document = postprocessor.process(document)

            return document

        except etree.ParseError as e:
            raise HTMLParsingError(f"Streaming parse failed: {str(e)}")
        except Exception as e:
            if isinstance(e, (DocumentTooLargeError, HTMLParsingError)):
                raise
            raise HTMLParsingError(f"Unexpected error during streaming parse: {str(e)}")

    def _process_event(self, event: str, elem: HtmlElement):
        """Process a parse event."""
        if event == 'start':
            self._handle_start_tag(elem)
        elif event == 'end':
            self._handle_end_tag(elem)

    def _handle_start_tag(self, elem: HtmlElement):
        """Handle opening tag."""
        # Import node types at runtime to avoid circular imports
        from edgar.documents.nodes import ContainerNode

        tag = elem.tag.lower()

        # Track tag stack
        self.tag_stack.append(tag)

        # Extract metadata from early elements
        if tag == 'title' and elem.text:
            self._extract_title_metadata(elem.text)
        elif tag == 'meta':
            self._extract_meta_metadata(elem)

        # Handle specific tags
        if tag == 'body':
            # Create a container for body content
            body_container = ContainerNode(tag_name='body')
            self.root.add_child(body_container)
            self.current_parent = body_container
        elif tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
            self._start_heading(elem)
        elif tag == 'p':
            self._start_paragraph(elem)
        elif tag == 'table':
            self._start_table(elem)
        elif tag == 'section':
            self._start_section(elem)

    def _handle_end_tag(self, elem: HtmlElement):
        """Handle closing tag."""
        tag = elem.tag.lower()

        # Remove from tag stack
        if self.tag_stack and self.tag_stack[-1] == tag:
            self.tag_stack.pop()

        # Handle specific tags
        if tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
            self._end_heading(elem)
        elif tag == 'p':
            self._end_paragraph(elem)
        elif tag == 'table':
            self._end_table(elem)
        elif tag == 'section':
            self._end_section(elem)
        elif tag == 'body':
            # When body ends, flush any remaining nodes
            self._flush_buffer()

        # Handle text content
        if elem.text:
            self.text_buffer.append(elem.text.strip())
        if elem.tail:
            self.text_buffer.append(elem.tail.strip())

    def _start_heading(self, elem: HtmlElement):
        """Start processing a heading."""
        # Import node types at runtime to avoid circular imports.
        # SemanticType must also be imported here: the module-level import
        # is under TYPE_CHECKING only, so using it without this runtime
        # import raised NameError whenever a section header was detected.
        from edgar.documents.nodes import HeadingNode
        from edgar.documents.types import SemanticType

        level = int(elem.tag[1])
        text = self._get_text_content(elem)

        # Create heading node
        heading = HeadingNode(
            level=level,
            content=text
        )

        # Check if this is a section header
        if self.strategies.get('header_detection'):
            detector = self.strategies['header_detection']
            if detector.is_section_header(text, elem):
                heading.semantic_type = SemanticType.SECTION_HEADER

        self.node_buffer.append(heading)

    def _end_heading(self, elem: HtmlElement):
        """End processing a heading."""
        # Import node types at runtime to avoid circular imports
        from edgar.documents.nodes import HeadingNode

        # Get text content from element
        text = self._get_text_content(elem)
        if text and self.node_buffer and isinstance(self.node_buffer[-1], HeadingNode):
            self.node_buffer[-1].content = text

        # Clear any accumulated text buffer
        self.text_buffer.clear()

    def _start_paragraph(self, elem: HtmlElement):
        """Start processing a paragraph."""
        # Import node types at runtime to avoid circular imports
        from edgar.documents.nodes import ParagraphNode

        para = ParagraphNode()

        # Get style if present
        style_attr = elem.get('style')
        if style_attr and self.strategies.get('style_parser'):
            style_parser = self.strategies['style_parser']
            para.style = style_parser.parse(style_attr)

        self.node_buffer.append(para)

    def _end_paragraph(self, elem: HtmlElement):
        """End processing a paragraph."""
        # Import node types at runtime to avoid circular imports
        from edgar.documents.nodes import ParagraphNode, TextNode

        # Get text content from element
        text = self._get_text_content(elem)
        if text and self.node_buffer and isinstance(self.node_buffer[-1], ParagraphNode):
            text_node = TextNode(content=text)
            self.node_buffer[-1].add_child(text_node)

        # Clear any accumulated text buffer
        self.text_buffer.clear()

    def _start_table(self, elem: HtmlElement):
        """Start processing a table."""
        self.in_table = True
        self.table_buffer = []
        # Store table element for later processing
        self.table_elem = elem

    def _end_table(self, elem: HtmlElement):
        """End processing a table."""
        # Import node types at runtime to avoid circular imports
        from edgar.documents.table_nodes import TableNode

        self.in_table = False

        # Process table with table processor if available
        if self.strategies.get('table_processing'):
            processor = self.strategies['table_processing']
            table_node = processor.process(elem)
            if table_node:
                self.node_buffer.append(table_node)
        else:
            # Basic table node
            table = TableNode()
            self.node_buffer.append(table)

        self.table_buffer.clear()

    def _start_section(self, elem: HtmlElement):
        """Start processing a section."""
        # Import node types at runtime to avoid circular imports
        from edgar.documents.nodes import SectionNode

        section = SectionNode()

        # Get section attributes
        section_id = elem.get('id')
        if section_id:
            section.metadata['id'] = section_id

        section_class = elem.get('class')
        if section_class:
            section.metadata['class'] = section_class

        self.current_section = section
        self.node_buffer.append(section)

    def _end_section(self, elem: HtmlElement):
        """End processing a section."""
        self.current_section = None

    def _flush_buffer(self):
        """Flush node buffer to document tree."""
        for node in self.node_buffer:
            # Add to current parent (the open section, if any)
            if self.current_section:
                self.current_section.add_child(node)
            else:
                self.current_parent.add_child(node)

        self.node_buffer.clear()

    def _get_text_content(self, elem: HtmlElement) -> str:
        """Extract text content from element, recursing into children."""
        text_parts = []

        if elem.text:
            text_parts.append(elem.text.strip())

        for child in elem:
            child_text = self._get_text_content(child)
            if child_text:
                text_parts.append(child_text)
            if child.tail:
                text_parts.append(child.tail.strip())

        return ' '.join(text_parts)

    def _extract_title_metadata(self, title: str):
        """Extract metadata from title."""
        # Example: "APPLE INC - 10-K - 2023-09-30"
        parts = title.split(' - ')
        if len(parts) >= 2:
            self.metadata.company = parts[0].strip()
            self.metadata.form = parts[1].strip()
        if len(parts) >= 3:
            self.metadata.filing_date = parts[2].strip()

    def _extract_meta_metadata(self, elem: HtmlElement):
        """Extract metadata from meta tags."""
        name = elem.get('name', '').lower()
        content = elem.get('content', '')

        if name and content:
            if name == 'company':
                self.metadata.company = content
            elif name == 'filing-type':
                self.metadata.form = content
            elif name == 'cik':
                self.metadata.cik = content
            elif name == 'filing-date':
                self.metadata.filing_date = content
            elif name == 'accession-number':
                self.metadata.accession_number = content

View File

@@ -0,0 +1,858 @@
"""
Table matrix builder for handling complex colspan/rowspan structures.
"""
from dataclasses import dataclass
from typing import List, Optional
from edgar.documents.table_nodes import Cell, Row
@dataclass
class MatrixCell:
    """Cell in the matrix with reference to original cell"""
    # The source table Cell occupying this grid position; None for an
    # empty slot in the grid.
    original_cell: Optional[Cell] = None
    is_spanned: bool = False  # True if this is part of a colspan/rowspan
    row_origin: int = -1  # Original row index
    col_origin: int = -1  # Original column index
class TableMatrix:
"""
Build a 2D matrix representation of table with proper handling of merged cells.
This class converts a table with colspan/rowspan into a regular 2D grid
where each merged cell occupies multiple positions in the matrix.
"""
def __init__(self):
    """Initialize empty matrix"""
    # 2D grid of MatrixCell; a merged source cell occupies every grid
    # position it spans. Populated by build_from_rows().
    self.matrix: List[List[MatrixCell]] = []
    self.row_count = 0
    self.col_count = 0
    self.header_row_count = 0  # Track number of header rows
def build_from_rows(self, header_rows: List[List[Cell]], data_rows: List[Row]) -> 'TableMatrix':
    """
    Build matrix from header rows and data rows.

    Args:
        header_rows: List of header rows (each row is a list of Cells)
        data_rows: List of Row objects

    Returns:
        Self for chaining
    """
    # Remember how many of the leading rows are headers.
    self.header_row_count = len(header_rows)

    # Flatten headers and data into one ordered list of cell lists.
    all_rows = list(header_rows) + [row.cells for row in data_rows]
    if not all_rows:
        return self

    self.row_count = len(all_rows)

    # First pass: determine the actual column count (colspan-aware).
    self._calculate_dimensions(all_rows)

    # Initialize the grid, then place cells in a second pass.
    self.matrix = [
        [MatrixCell() for _ in range(self.col_count)]
        for _ in range(self.row_count)
    ]
    self._place_cells(all_rows)

    return self
def _calculate_dimensions(self, rows: List[List[Cell]]):
    """Calculate the actual dimensions considering colspan"""
    widest = 0
    for row_idx, row in enumerate(rows):
        col_pos = 0
        for cell in row:
            # Skip positions that may be claimed by a rowspan from above.
            while col_pos < widest and self._is_occupied(row_idx, col_pos):
                col_pos += 1
            # The cell occupies [col_pos, col_pos + colspan).
            col_pos += cell.colspan
            widest = max(widest, col_pos)
    self.col_count = widest
def _is_occupied(self, row: int, col: int) -> bool:
"""Check if a position is occupied by a cell from a previous row (rowspan)"""
if row == 0:
return False
# Check if any cell above has rowspan that reaches this position
for prev_row in range(row):
if prev_row < len(self.matrix) and col < len(self.matrix[prev_row]):
cell = self.matrix[prev_row][col]
if cell.original_cell and cell.row_origin == prev_row:
# Check if this cell's rowspan reaches current row
if prev_row + cell.original_cell.rowspan > row:
return True
return False
def _place_cells(self, rows: List[List[Cell]]):
    """Place cells in the matrix handling colspan and rowspan.

    Each source cell is written into every grid position it spans; the
    origin position has is_spanned=False, the rest is_spanned=True.
    Includes a narrow heuristic that right-shifts colspan=2 numeric
    values so they align with their '$'-prefixed neighbors.
    """
    for row_idx, row in enumerate(rows):
        col_pos = 0
        for cell_idx, cell in enumerate(row):
            # Find next available column position
            while col_pos < self.col_count and self.matrix[row_idx][col_pos].original_cell is not None:
                col_pos += 1
            if col_pos >= self.col_count:
                # Need to expand matrix
                self._expand_columns(col_pos + cell.colspan)
            # Special handling for cells with colspan > 1 containing numeric values
            # Only apply this logic for Table 15-style alignment issues
            # Check if this looks like a financial value that should be right-aligned
            cell_text = cell.text().strip()
            # Check for numeric values that need special alignment
            # This is specifically for cases like "167,045" that should align with "$167,045"
            has_comma_separator = ',' in cell_text
            # Guarded division: empty text yields ratio 0.
            digit_ratio = sum(c.isdigit() for c in cell_text) / len(cell_text) if cell_text else 0
            # Only apply special placement for colspan=2 numeric values in data rows
            # This handles Table 15's specific case without breaking Table 13
            is_special_numeric = (cell.colspan == 2 and  # Specifically colspan=2
                                  has_comma_separator and
                                  digit_ratio > 0.5 and  # More than 50% digits
                                  not cell_text.startswith('$') and
                                  not any(month in cell_text.lower() for month in
                                          ['jan', 'feb', 'mar', 'apr', 'may', 'jun',
                                           'jul', 'aug', 'sep', 'oct', 'nov', 'dec']) and
                                  row_idx > 1)  # Not a header row (allow for multi-row headers)
            if is_special_numeric:
                # Place empty cell at first position, content at second position
                # This is specifically for Table 15 alignment
                for r in range(cell.rowspan):
                    # First column of span: empty
                    if row_idx + r < self.row_count and col_pos < self.col_count:
                        self.matrix[row_idx + r][col_pos] = MatrixCell()
                    # Second column of span: the actual content
                    if row_idx + r < self.row_count and col_pos + 1 < self.col_count:
                        matrix_cell = MatrixCell(
                            original_cell=cell,
                            is_spanned=False,
                            row_origin=row_idx,
                            col_origin=col_pos + 1
                        )
                        self.matrix[row_idx + r][col_pos + 1] = matrix_cell
                    # Remaining columns of span: mark as spanned (though colspan=2 has no remaining)
                    for c in range(2, cell.colspan):
                        if row_idx + r < self.row_count and col_pos + c < self.col_count:
                            matrix_cell = MatrixCell(
                                original_cell=cell,
                                is_spanned=True,
                                row_origin=row_idx,
                                col_origin=col_pos + 1
                            )
                            self.matrix[row_idx + r][col_pos + c] = matrix_cell
            else:
                # Normal placement for other cells
                for r in range(cell.rowspan):
                    for c in range(cell.colspan):
                        if row_idx + r < self.row_count and col_pos + c < self.col_count:
                            matrix_cell = MatrixCell(
                                original_cell=cell,
                                is_spanned=(r > 0 or c > 0),
                                row_origin=row_idx,
                                col_origin=col_pos
                            )
                            self.matrix[row_idx + r][col_pos + c] = matrix_cell
            col_pos += cell.colspan
def _expand_columns(self, new_col_count: int):
    """Expand matrix to accommodate more columns"""
    extra = new_col_count - self.col_count
    if extra <= 0:
        # Already wide enough; nothing to do.
        return
    for row in self.matrix:
        row.extend(MatrixCell() for _ in range(extra))
    self.col_count = new_col_count
def get_actual_columns(self) -> int:
    """Get the actual number of data columns (excluding empty/spacing columns)"""
    def column_has_content(col_idx: int) -> bool:
        # A column counts if any origin (non-spanned) cell in it has
        # visible, non-whitespace text.
        for row_idx in range(self.row_count):
            slot = self.matrix[row_idx][col_idx]
            if slot.original_cell and not slot.is_spanned:
                text = slot.original_cell.text().strip()
                if text and text not in ('', ' ', '\xa0'):
                    return True
        return False

    return sum(1 for col_idx in range(self.col_count) if column_has_content(col_idx))
def get_column_widths(self) -> List[float]:
    """Estimate column widths based on content"""
    widths: List[float] = []
    for col_idx in range(self.col_count):
        texts = []
        for row_idx in range(self.row_count):
            slot = self.matrix[row_idx][col_idx]
            if slot.original_cell and not slot.is_spanned:
                content = slot.original_cell.text().strip()
                if content:
                    texts.append(content)
        # A column with no content at all is likely a spacing column
        # and gets width 0; otherwise use the longest cell text.
        widths.append(max(len(t) for t in texts) if texts else 0)
    return widths
def get_cell(self, row_idx: int, col_idx: int) -> Optional[Cell]:
    """
    Get a cell at specific position in the matrix.

    Args:
        row_idx: Row index
        col_idx: Column index

    Returns:
        Cell at position or None if out of bounds
    """
    in_bounds = 0 <= row_idx < self.row_count and 0 <= col_idx < self.col_count
    if not in_bounds:
        return None

    slot = self.matrix[row_idx][col_idx]
    # Empty grid positions are reported as an empty Cell, not None.
    return slot.original_cell if slot.original_cell else Cell("")
def get_expanded_row(self, row_idx: int) -> List[Optional[Cell]]:
    """
    Get a row with cells expanded to match column count.

    For cells with colspan > 1, the cell appears in the first position
    and None in subsequent positions.
    """
    if row_idx >= self.row_count:
        return []

    # Origin cells are kept; spanned copies and empty slots become None.
    return [
        slot.original_cell if (slot.original_cell and not slot.is_spanned) else None
        for slot in self.matrix[row_idx]
    ]
def get_data_columns(self) -> List[int]:
    """
    Get indices of columns that contain actual data (not spacing).
    Uses strategy similar to old parser - keeps single empty columns for spacing.
    Returns:
        List of column indices that contain data
    """
    # First, identify which columns are empty (no origin cell with
    # non-whitespace text anywhere in the column).
    empty_cols = []
    for col_idx in range(self.col_count):
        has_content = False
        for row_idx in range(self.row_count):
            cell = self.matrix[row_idx][col_idx]
            if cell.original_cell and not cell.is_spanned:
                text = cell.original_cell.text().strip()
                if text:
                    has_content = True
                    break
        if not has_content:
            empty_cols.append(col_idx)
    # Apply old parser's strategy
    cols_to_remove = set()
    # Remove leading empty columns
    for col in range(self.col_count):
        if col in empty_cols:
            cols_to_remove.add(col)
        else:
            break
    # Remove trailing empty columns
    for col in reversed(range(self.col_count)):
        if col in empty_cols:
            cols_to_remove.add(col)
        else:
            break
    # Remove consecutive empty columns in the middle (keep single empty cols for spacing)
    i = 0
    while i < self.col_count - 1:
        if i in empty_cols and (i + 1) in empty_cols:
            # Found consecutive empty columns; measure the run length.
            consecutive_count = 0
            j = i
            while j < self.col_count and j in empty_cols:
                consecutive_count += 1
                j += 1
            # Keep first empty column as spacer, remove the rest
            cols_to_remove.update(range(i + 1, i + consecutive_count))
            i = j
        else:
            i += 1
    # Return columns that are NOT in the removal set
    data_cols = [col for col in range(self.col_count) if col not in cols_to_remove]
    return data_cols
def filter_spacing_columns(self) -> 'TableMatrix':
    """
    Create a new matrix with spacing columns removed.

    Also handles colspan-generated duplicate columns and misalignment:
    fragments that SEC filings often render in their own cells ('$'
    before a value, a trailing ')' after '(123', a trailing '%') are
    merged back into the adjacent value column.

    Returns:
        New TableMatrix with only data columns, or ``self`` unchanged
        when no column would survive filtering.
    """
    # First pass: identify primary header columns (those with colspan > 1 headers)
    # and data columns
    primary_header_cols = set()
    all_header_cols = set()
    data_cols = set()
    # Find primary header columns (those that start a colspan).
    # Only the first 3 rows are inspected as candidate header rows.
    for row_idx in range(min(3, self.row_count)):
        for col_idx in range(self.col_count):
            cell = self.matrix[row_idx][col_idx]
            if cell.original_cell and not cell.is_spanned:
                if cell.original_cell.text().strip():
                    all_header_cols.add(col_idx)
                    # Check if this is a primary header (colspan > 1)
                    if cell.original_cell.colspan > 1:
                        primary_header_cols.add(col_idx)
    # If no primary headers found, use all headers as primary
    if not primary_header_cols:
        primary_header_cols = all_header_cols
    # Phase 1.5: Identify columns with header content
    # Any column with non-empty text in ANY header row must be preserved
    # This prevents legitimate header columns from being removed as "spacing"
    # Also preserve columns that are spanned by headers (colspan > 1)
    header_content_columns = set()
    for col_idx in range(self.col_count):
        for row_idx in range(self.header_row_count):
            cell = self.matrix[row_idx][col_idx]
            if cell.original_cell:
                # Check for original header cell with content
                if not cell.is_spanned:
                    text = cell.original_cell.text().strip()
                    if text:
                        header_content_columns.add(col_idx)
                        # Also add all columns spanned by this header
                        if cell.original_cell.colspan > 1:
                            for span_offset in range(1, cell.original_cell.colspan):
                                span_col = col_idx + span_offset
                                if span_col < self.col_count:
                                    header_content_columns.add(span_col)
                        break  # Found content, no need to check other header rows
                # Also preserve columns that are spanned (part of a colspan)
                elif cell.is_spanned:
                    # This column is part of a header's colspan
                    text = cell.original_cell.text().strip()
                    if text:
                        header_content_columns.add(col_idx)
    # Find columns with data (skip header rows)
    # Count actual header rows by checking for non-data content
    actual_header_rows = 0
    for row_idx in range(min(3, self.row_count)):
        has_numeric_data = False
        for col_idx in range(self.col_count):
            cell = self.matrix[row_idx][col_idx]
            if cell.original_cell and not cell.is_spanned:
                text = cell.original_cell.text().strip()
                # Check if it looks like numeric data (has commas or starts with $)
                # NOTE(review): due to operator precedence this evaluates as
                # ``(text and <comma+digit test>) or text == '$'`` — the net
                # effect matches the intended check, but confirm before editing.
                if text and (',' in text and any(c.isdigit() for c in text)) or text == '$':
                    has_numeric_data = True
                    break
        if has_numeric_data:
            break
        actual_header_rows += 1
    # Data rows start after the detected headers; at least row 1.
    data_start_row = max(1, actual_header_rows)
    # Track columns with significant data (not just isolated cells)
    col_data_count = {}
    for row_idx in range(data_start_row, self.row_count):
        for col_idx in range(self.col_count):
            cell = self.matrix[row_idx][col_idx]
            if cell.original_cell and not cell.is_spanned:
                if cell.original_cell.text().strip():
                    data_cols.add(col_idx)
                    col_data_count[col_idx] = col_data_count.get(col_idx, 0) + 1
    # Build initial list of columns to keep
    # Always include column 0 if it contains row labels
    cols_to_keep = set(primary_header_cols)
    # Add columns with header content (prevents removing legitimate headers)
    cols_to_keep.update(header_content_columns)
    # Identify misaligned data columns that need to be consolidated
    # These are data columns that are not primary header columns
    misaligned_data_cols = data_cols - primary_header_cols
    # Map misaligned data columns to their nearest column for consolidation
    # Only consolidate directly adjacent columns with specific patterns
    consolidation_map = {}
    # First pass: identify all potential consolidations
    potential_consolidations = {}
    for data_col in sorted(misaligned_data_cols):
        # Check if this column should be consolidated with an adjacent column
        # Check the column immediately before this one
        prev_col = data_col - 1
        # Sample some cells to see if consolidation makes sense
        # (at most 10 data rows are sampled).
        consolidation_type = None
        for row_idx in range(data_start_row, min(data_start_row + 10, self.row_count)):
            prev_cell = self.matrix[row_idx][prev_col] if prev_col >= 0 else None
            curr_cell = self.matrix[row_idx][data_col]
            if prev_cell and prev_cell.original_cell and curr_cell.original_cell:
                prev_text = prev_cell.original_cell.text().strip()
                curr_text = curr_cell.original_cell.text().strip()
                # Skip empty cells
                if not prev_text or not curr_text:
                    continue
                # Check for patterns that indicate consolidation
                if prev_text == '$' and curr_text and curr_text[0].isdigit():
                    consolidation_type = 'currency'
                    break
                elif prev_text.startswith('(') and curr_text == ')':
                    consolidation_type = 'parentheses'
                    break
                elif curr_text == '%' and prev_text and prev_text[-1].isdigit():
                    consolidation_type = 'percentage'
                    break
        if consolidation_type:
            potential_consolidations[data_col] = (prev_col, consolidation_type)
    # Second pass: resolve conflicts
    # If column Y is a target for consolidation from Y+1 (e.g., parentheses),
    # then don't consolidate Y into another column
    columns_needed_as_targets = set()
    for data_col, (target_col, cons_type) in potential_consolidations.items():
        if cons_type == 'parentheses':
            # This target column is needed for parentheses consolidation
            columns_needed_as_targets.add(target_col)
    # Build final consolidation map, skipping consolidations that would remove needed targets
    for data_col, (target_col, cons_type) in potential_consolidations.items():
        # Don't consolidate this column if it's needed as a target for parentheses
        if data_col in columns_needed_as_targets and cons_type != 'parentheses':
            continue
        # CRITICAL: Don't consolidate columns that have header content
        # This prevents legitimate header columns from being merged together
        if data_col in header_content_columns or target_col in header_content_columns:
            continue
        consolidation_map[data_col] = target_col
        # Debug: uncomment to see consolidation mapping
        # import os
        # if os.environ.get('DEBUG_TABLE_CONSOLIDATION'):
        #     print(f"Consolidating column {data_col} into {target_col}")
    # Special case: Keep data columns that are associated with header columns
    # This handles cases where headers span multiple columns but data is in specific columns
    for header_col in primary_header_cols:
        # Check if there's a data column immediately after the header column
        # This is common when headers span multiple columns
        for offset in range(1, 3):  # Check next 1-2 columns
            data_col = header_col + offset
            if data_col in data_cols and data_col not in cols_to_keep:
                # Check if this column has meaningful data
                has_data = False
                for row_idx in range(data_start_row, min(data_start_row + 5, self.row_count)):
                    cell = self.matrix[row_idx][data_col]
                    if cell.original_cell and not cell.is_spanned:
                        text = cell.original_cell.text().strip()
                        if text and text not in ['', '-', '', '']:
                            has_data = True
                            break
                if has_data:
                    cols_to_keep.add(data_col)
    # Keep data columns that have significant content but aren't near header columns
    # This includes columns with dates, text descriptions, etc.
    for col_idx in data_cols:
        if col_idx not in cols_to_keep:
            # Check if this column has important data
            has_important_data = False
            non_empty_count = 0
            text_samples = []
            for row_idx in range(data_start_row, min(data_start_row + 10, self.row_count)):
                cell = self.matrix[row_idx][col_idx]
                if cell.original_cell and not cell.is_spanned:
                    text = cell.original_cell.text().strip()
                    if text and text not in ['', '-', '', '']:
                        non_empty_count += 1
                        if len(text_samples) < 3:
                            text_samples.append(text)
                        # Check for important patterns
                        # Dates, years, text descriptions, etc.
                        if any([
                            len(text) > 3 and not text.replace(',', '').replace('.', '').isdigit(),  # Non-trivial text
                            any(month in text for month in ['January', 'February', 'March', 'April', 'May', 'June',
                                                            'July', 'August', 'September', 'October', 'November', 'December']),
                            any(month in text for month in ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                                                            'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']),
                            '20' in text and any(c.isdigit() for c in text),  # Likely contains year
                        ]):
                            has_important_data = True
            # Keep columns with consistent important data
            if has_important_data and non_empty_count >= 3:
                cols_to_keep.add(col_idx)
    # Special case: If we have very few primary headers but lots of data columns,
    # we might have a table where headers are in data rows (like years)
    # Keep columns that have significant financial data
    if len(primary_header_cols) <= 2 and len(data_cols) > 4:
        # Check for financial data patterns in columns
        for col_idx in data_cols:
            has_financial_data = False
            sample_count = 0
            # Sample a few cells from this column
            for row_idx in range(data_start_row, min(data_start_row + 5, self.row_count)):
                cell = self.matrix[row_idx][col_idx]
                if cell.original_cell and not cell.is_spanned:
                    text = cell.original_cell.text().strip()
                    if text:
                        sample_count += 1
                        # Check for financial patterns
                        if any([
                            text.startswith('(') and any(c.isdigit() for c in text),  # Negative numbers
                            text == ')' and col_idx > 0,  # Closing parenthesis
                            '$' in text,  # Currency
                            '%' in text,  # Percentages
                            text.replace(',', '').replace('.', '').isdigit(),  # Plain numbers
                            text in ['', '', '-', '*']  # Common placeholders
                        ]):
                            has_financial_data = True
                            break
            # Keep columns with financial data
            if has_financial_data and sample_count > 0:
                cols_to_keep.add(col_idx)
    # Check if column 0 contains row labels (non-empty cells in data rows)
    col_0_has_labels = False
    data_start_row = max(1, actual_header_rows)
    for row_idx in range(data_start_row, self.row_count):
        cell = self.matrix[row_idx][0]
        if cell.original_cell and not cell.is_spanned:
            text = cell.original_cell.text().strip()
            # A label is any multi-character text that isn't a number or a $ amount.
            if text and not text.isdigit() and not text.startswith('$') and len(text) > 1:
                col_0_has_labels = True
                break
    # Include column 0 if it has labels
    if col_0_has_labels:
        cols_to_keep.add(0)
    # Remove columns that will be consolidated into other columns
    # These columns' data will be merged into their target columns
    cols_to_remove = set(consolidation_map.keys())
    cols_to_keep = cols_to_keep - cols_to_remove
    cols_to_keep = sorted(cols_to_keep)
    # Create new matrix with consolidated columns
    if not cols_to_keep:
        return self
    new_matrix = TableMatrix()
    new_matrix.row_count = self.row_count
    new_matrix.col_count = len(cols_to_keep)
    new_matrix.header_row_count = self.header_row_count  # Preserve header row count
    new_matrix.matrix = []
    # Create mapping from old to new column indices
    old_to_new = {old_col: new_idx for new_idx, old_col in enumerate(cols_to_keep)}
    # Build new matrix with consolidation
    for row_idx in range(self.row_count):
        new_row = [MatrixCell() for _ in range(new_matrix.col_count)]
        # Track which cells we've already placed to handle colspan properly
        placed_origins = {}  # Maps (row_origin, col_origin) to new column index
        # First, copy cells from kept columns
        for old_col in sorted(cols_to_keep):
            if old_col not in old_to_new:
                continue
            new_col = old_to_new[old_col]
            cell = self.matrix[row_idx][old_col]
            if cell.original_cell:
                origin_key = (cell.row_origin, cell.col_origin)
                # Check if we've already placed this cell (due to colspan)
                if origin_key in placed_origins:
                    # This is a continuation of a colspan - mark as spanned
                    new_row[new_col] = MatrixCell(
                        original_cell=cell.original_cell,
                        is_spanned=True,  # Mark as spanned since it's part of a colspan
                        row_origin=cell.row_origin,
                        col_origin=placed_origins[origin_key]  # Point to the original placement
                    )
                else:
                    # First occurrence of this cell - place normally
                    new_row[new_col] = MatrixCell(
                        original_cell=cell.original_cell,
                        is_spanned=False,  # This is the primary cell
                        row_origin=cell.row_origin,
                        col_origin=new_col
                    )
                    placed_origins[origin_key] = new_col
        # Then, consolidate misaligned data into header columns
        for data_col, header_col in consolidation_map.items():
            if header_col in old_to_new:
                new_col = old_to_new[header_col]
                data_cell = self.matrix[row_idx][data_col] if data_col < len(self.matrix[row_idx]) else None
                # If data cell has content, merge it with header column
                if data_cell and data_cell.original_cell and not data_cell.is_spanned:
                    # Skip empty data cells
                    if not data_cell.original_cell.text().strip():
                        continue
                    # Check the original header column cell to see if it has content to merge
                    header_cell = self.matrix[row_idx][header_col]
                    # NOTE(review): existing_cell is never read below — candidate for removal.
                    existing_cell = new_row[new_col]
                    # Check if we need to merge (e.g., $ with value)
                    if header_cell.original_cell and header_cell.original_cell.text().strip():
                        existing_text = header_cell.original_cell.text().strip()
                        new_text = data_cell.original_cell.text().strip()
                        # Merge currency symbol with value OR value with percentage OR parentheses
                        if existing_text == '$' and new_text:
                            # Currency merge: $ + number
                            merged_text = f"${new_text}"
                            # Create new cell with merged content
                            merged_cell = Cell(
                                content=merged_text,
                                colspan=header_cell.original_cell.colspan,
                                rowspan=header_cell.original_cell.rowspan,
                                is_header=header_cell.original_cell.is_header,
                                align=data_cell.original_cell.align if hasattr(data_cell.original_cell, 'align') else None
                            )
                            new_row[new_col] = MatrixCell(
                                original_cell=merged_cell,
                                is_spanned=False,
                                row_origin=row_idx,
                                col_origin=new_col
                            )
                        elif new_text == ')' and existing_text.startswith('('):
                            # Parentheses merge: (number + )
                            merged_text = f"{existing_text})"
                            # Create new cell with merged content
                            merged_cell = Cell(
                                content=merged_text,
                                colspan=header_cell.original_cell.colspan,
                                rowspan=header_cell.original_cell.rowspan,
                                is_header=header_cell.original_cell.is_header,
                                align=data_cell.original_cell.align if hasattr(data_cell.original_cell, 'align') else None
                            )
                            new_row[new_col] = MatrixCell(
                                original_cell=merged_cell,
                                is_spanned=False,
                                row_origin=row_idx,
                                col_origin=new_col
                            )
                        elif new_text == '%' and existing_text:
                            # Percentage merge: number + %
                            merged_text = f"{existing_text}%"
                            # Create new cell with merged content
                            merged_cell = Cell(
                                content=merged_text,
                                colspan=header_cell.original_cell.colspan,
                                rowspan=header_cell.original_cell.rowspan,
                                is_header=header_cell.original_cell.is_header,
                                align=header_cell.original_cell.align if hasattr(header_cell.original_cell, 'align') else None
                            )
                            new_row[new_col] = MatrixCell(
                                original_cell=merged_cell,
                                is_spanned=False,
                                row_origin=row_idx,
                                col_origin=new_col
                            )
                        else:
                            # Just keep the data cell if can't merge
                            new_row[new_col] = MatrixCell(
                                original_cell=data_cell.original_cell,
                                is_spanned=False,
                                row_origin=row_idx,
                                col_origin=new_col
                            )
                    else:
                        # No existing content, just move the data
                        new_row[new_col] = MatrixCell(
                            original_cell=data_cell.original_cell,
                            is_spanned=False,
                            row_origin=row_idx,
                            col_origin=new_col
                        )
        new_matrix.matrix.append(new_row)
    return new_matrix
def to_cell_grid(self) -> List[List[Optional[Cell]]]:
    """
    Convert matrix to a simple 2D grid of cells.

    Spanned continuation positions and empty positions are reported
    as None; only origin cells appear in the grid.

    Returns:
        2D list where each position contains either a Cell or None
    """
    grid = []
    for r in range(self.row_count):
        row = [
            entry.original_cell if (entry.original_cell and not entry.is_spanned) else None
            for entry in (self.matrix[r][c] for c in range(self.col_count))
        ]
        grid.append(row)
    return grid
def debug_print(self):
    """Print matrix structure for debugging"""
    print(f"Matrix: {self.row_count}×{self.col_count}")
    for row_idx in range(self.row_count):
        # Render each position: "[text...]" for spanned continuations,
        # "text..." for origin cells, "___" for empty positions.
        parts = []
        for col_idx in range(self.col_count):
            entry = self.matrix[row_idx][col_idx]
            if entry.original_cell:
                snippet = entry.original_cell.text()[:10]
                parts.append(f"[{snippet}...]" if entry.is_spanned else f"{snippet}...")
            else:
                parts.append("___")
        print(f"Row {row_idx}: {' | '.join(parts)}")
class ColumnAnalyzer:
    """Analyze column structure to identify data vs spacing columns"""

    def __init__(self, matrix: 'TableMatrix'):
        """Initialize with a table matrix.

        Args:
            matrix: TableMatrix whose columns should be classified.
        """
        self.matrix = matrix

    def identify_spacing_columns(self) -> List[int]:
        """
        Identify columns used only for spacing.

        Returns:
            List of column indices that are spacing columns
        """
        # NOTE: a previous revision computed get_column_widths() and a
        # total width here and threaded them into _is_spacing_column,
        # which never used them; that dead computation was removed.
        return [
            col_idx
            for col_idx in range(self.matrix.col_count)
            if self._is_spacing_column(col_idx)
        ]

    def _is_spacing_column(self, col_idx: int) -> bool:
        """
        Check if a column is used for spacing.
        Only mark as spacing if column is completely empty.

        Criteria:
        - Column has absolutely no content across all rows
        """
        for row_idx in range(self.matrix.row_count):
            cell = self.matrix.matrix[row_idx][col_idx]
            if cell.original_cell and not cell.is_spanned:
                # Any non-whitespace text disqualifies the column.
                if cell.original_cell.text().strip():
                    return False
        # Column is completely empty
        return True

    def get_clean_column_indices(self) -> List[int]:
        """
        Get indices of non-spacing columns.

        Returns:
            List of column indices that contain actual data
        """
        spacing = set(self.identify_spacing_columns())
        return [i for i in range(self.matrix.col_count) if i not in spacing]

View File

@@ -0,0 +1,440 @@
"""
Table of Contents analyzer for SEC filings.
This module analyzes the TOC structure to map section names to anchor IDs,
enabling section extraction for API filings with generated anchor IDs.
"""
import re
from typing import Dict, List, Optional, Set, Tuple
from dataclasses import dataclass
from lxml import html as lxml_html
@dataclass
class TOCSection:
    """Represents a section found in the Table of Contents."""
    name: str  # Raw link text exactly as it appears in the TOC
    anchor_id: str  # Target anchor id (the href with the leading '#' stripped)
    normalized_name: str  # Canonical name, e.g. "Item 1A" or "Part II"
    section_type: str  # 'item', 'part', 'other'
    order: int  # Sort key (Item 1 -> 1000, Item 1A -> 1001, Part I -> 100, ...)
    part: Optional[str] = None  # NEW: "Part I", "Part II", or None for 10-K
class TOCAnalyzer:
    """
    Analyzes Table of Contents structure to map section names to anchor IDs.

    This enables section extraction for filings where anchor IDs are generated
    rather than semantic (like API filings vs local HTML files).
    """

    def __init__(self):
        # SEC section patterns for normalization.
        # Each entry is (regex, section_type); used by _is_section_link to
        # recognize link text that refers to a filing section.
        self.section_patterns = [
            (r'(?:item|part)\s+\d+[a-z]?', 'item'),
            (r'business', 'item'),
            (r'risk\s+factors?', 'item'),
            (r'properties', 'item'),
            (r'legal\s+proceedings', 'item'),
            (r'management.*discussion', 'item'),
            (r'md&a', 'item'),
            (r'financial\s+statements?', 'item'),
            (r'exhibits?', 'item'),
            (r'signatures?', 'item'),
            (r'part\s+[ivx]+', 'part'),
        ]

    def analyze_toc_structure(self, html_content: str) -> Dict[str, str]:
        """
        Analyze HTML content to extract section mappings from TOC.

        Args:
            html_content: Raw HTML content

        Returns:
            Dict mapping normalized section names to anchor IDs.
            Empty on any parsing failure (callers fall back to other methods).
        """
        section_mapping = {}
        try:
            # Handle XML declaration issues
            # (lxml rejects a declaration when parsing from a str).
            if html_content.startswith('<?xml'):
                html_content = re.sub(r'<\?xml[^>]*\?>', '', html_content, count=1)
            tree = lxml_html.fromstring(html_content)
            # Find all anchor links that could be TOC links
            anchor_links = tree.xpath('//a[@href]')
            toc_sections = []
            current_part = None  # Track current part context for 10-Q filings
            part_pattern = re.compile(r'^\s*Part\s+([IVX]+)\b', re.IGNORECASE)
            for link in anchor_links:
                href = link.get('href', '').strip()
                text = (link.text_content() or '').strip()
                # Check if this link or its row represents a part header
                # Part headers in 10-Q TOCs typically appear as separate rows: "Part I", "Part II"
                part_match = part_pattern.match(text)
                if part_match:
                    # Update current part context
                    current_part = f"Part {part_match.group(1).upper()}"
                    # Don't create a section for the part header itself
                    continue
                # Look for internal anchor links
                if href.startswith('#') and text:
                    anchor_id = href[1:]  # Remove #
                    # Try to find item number in preceding context (for table-based TOCs)
                    preceding_item = self._extract_preceding_item_label(link)
                    # Check if this looks like a section reference (check text, anchor ID, and context)
                    if self._is_section_link(text, anchor_id, preceding_item):
                        # Verify target exists
                        target_elements = tree.xpath(f'//*[@id="{anchor_id}"]')
                        if target_elements:
                            # Try to extract item number from: anchor ID > preceding context > text
                            normalized_name = self._normalize_section_name(text, anchor_id, preceding_item)
                            section_type, order = self._get_section_type_and_order(normalized_name)
                            toc_section = TOCSection(
                                name=text,
                                anchor_id=anchor_id,
                                normalized_name=normalized_name,
                                section_type=section_type,
                                order=order,
                                part=current_part  # Assign current part context
                            )
                            toc_sections.append(toc_section)
            # Build mapping prioritizing the most standard section names
            section_mapping = self._build_section_mapping(toc_sections)
        except Exception as e:
            # Return empty mapping on error - fallback to other methods
            # (deliberate best-effort: TOC analysis is one of several strategies)
            pass
        return section_mapping

    def _extract_preceding_item_label(self, link_element) -> str:
        """
        Extract item/part label from preceding context.

        Handles table-based TOCs where item number is in a separate cell:
        <td>Item 1.</td><td><a href="...">Business</a></td>

        Also handles nested structures like:
        <td>Item 1.</td><td><div><span><a href="...">Business</a></span></div></td>

        Args:
            link_element: The <a> element

        Returns:
            Item label like "Item 1", "Item 1A", "Part I" or empty string
        """
        try:
            # Traverse up to find the containing <td> or <th> (up to 5 levels)
            current = link_element
            td_element = None
            for _ in range(5):
                parent = current.getparent()
                if parent is None:
                    break
                if parent.tag in ['td', 'th']:
                    td_element = parent
                    break
                current = parent
            # If we found a <td>, check ALL preceding siblings in the row
            # This handles TOCs where item number is not in the immediately adjacent cell
            # Example: ['Business', 'I', '1', '5'] where '1' is the item number
            if td_element is not None:
                # Check all preceding siblings (rightmost to leftmost)
                prev_sibling = td_element.getprevious()
                while prev_sibling is not None:
                    if prev_sibling.tag in ['td', 'th']:
                        prev_text = (prev_sibling.text_content() or '').strip()
                        # Look for "Item X" or just "X" (bare number) pattern
                        # Match full format: "Item 1A"
                        item_match = re.match(r'(Item\s+\d+[A-Z]?)\.?\s*$', prev_text, re.IGNORECASE)
                        if item_match:
                            return item_match.group(1)
                        # Match bare item number: "1A" or "1" (only valid 10-K item numbers: 1-15)
                        # This prevents page numbers (50, 108, etc.) from being treated as items
                        bare_item_match = re.match(r'^([1-9]|1[0-5])([A-Z]?)\.?\s*$', prev_text, re.IGNORECASE)
                        if bare_item_match:
                            item_num = bare_item_match.group(1)
                            item_letter = bare_item_match.group(2)
                            return f"Item {item_num}{item_letter}"
                        # Match part: "Part I" or just "I"
                        part_match = re.match(r'(Part\s+[IVX]+)\.?\s*$', prev_text, re.IGNORECASE)
                        if part_match:
                            return part_match.group(1)
                        # Match bare part: "I", "II", etc.
                        bare_part_match = re.match(r'^([IVX]+)\.?\s*$', prev_text)
                        if bare_part_match:
                            return f"Part {bare_part_match.group(1)}"
                    prev_sibling = prev_sibling.getprevious()
            # Also check immediate parent's text for inline patterns (div/span structures)
            parent = link_element.getparent()
            if parent is not None and parent.tag in ['div', 'span', 'p']:
                if parent.text:
                    text_before = parent.text.strip()
                    item_match = re.search(r'(Item\s+\d+[A-Z]?)\.?\s*$', text_before, re.IGNORECASE)
                    if item_match:
                        return item_match.group(1)
                    part_match = re.search(r'(Part\s+[IVX]+)\.?\s*$', text_before, re.IGNORECASE)
                    if part_match:
                        return part_match.group(1)
        except Exception:
            # Best-effort extraction; any structural surprise yields "no label".
            pass
        return ''

    def _is_section_link(self, text: str, anchor_id: str = '', preceding_item: str = '') -> bool:
        """
        Check if link represents a section reference.

        Checks link text, anchor ID, and preceding context to handle cases where:
        - Text is descriptive (e.g., "Executive Compensation")
        - Anchor ID contains item number (e.g., "item_11_executive_compensation")
        - Item number is in preceding table cell (e.g., <td>Item 1.</td><td><a>Business</a></td>)

        Args:
            text: Link text
            anchor_id: Anchor ID from href (without #)
            preceding_item: Item/part label from preceding context (e.g., "Item 1A")

        Returns:
            True if this appears to be a section link
        """
        if not text:
            return False
        # First check if there's a preceding item label (table-based TOC)
        if preceding_item:
            return True
        # Then check anchor ID for item/part patterns (most reliable)
        if anchor_id:
            anchor_lower = anchor_id.lower()
            # Match patterns like: item_1, item_1a, item1, item1a, part_i, part_ii, etc.
            if re.search(r'item_?\d+[a-z]?', anchor_lower):
                return True
            if re.search(r'part_?[ivx]+', anchor_lower):
                return True
        # Then check text (with relaxed length limit for descriptive section names)
        if len(text) > 150:  # Increased from 100 to accommodate longer section titles
            return False
        # Check against known patterns
        for pattern, _ in self.section_patterns:
            if re.search(pattern, text, re.IGNORECASE):
                return True
        # Also consider links with section keywords
        # (stricter 100-char limit for this weaker keyword-only heuristic)
        if len(text) < 100 and any(keyword in text.lower() for keyword in
                                   ['item', 'part', 'business', 'risk', 'properties', 'legal',
                                    'compensation', 'ownership', 'governance', 'directors']):
            return True
        return False

    def _normalize_section_name(self, text: str, anchor_id: str = '', preceding_item: str = '') -> str:
        """
        Normalize section name for consistent lookup.

        Prioritizes:
        1. Preceding item label (table-based TOC)
        2. Anchor ID pattern
        3. Text-based normalization

        Args:
            text: Link text
            anchor_id: Anchor ID from href (without #)
            preceding_item: Item/part label from preceding context

        Returns:
            Normalized section name (e.g., "Item 1A", "Part II"),
            or the original text when no normalization rule applies.
        """
        text = text.strip()
        # HIGHEST PRIORITY: Use preceding item label if available (table-based TOC)
        if preceding_item:
            # Clean up and normalize the preceding item
            item_match = re.match(r'item\s+(\d+[a-z]?)', preceding_item, re.IGNORECASE)
            if item_match:
                return f"Item {item_match.group(1).upper()}"
            part_match = re.match(r'part\s+([ivx]+)', preceding_item, re.IGNORECASE)
            if part_match:
                return f"Part {part_match.group(1).upper()}"
        # SECOND PRIORITY: Try to extract from anchor ID
        if anchor_id:
            anchor_lower = anchor_id.lower()
            # Match item patterns: item_1a, item1a, item_1_business, etc.
            item_match = re.search(r'item_?(\d+[a-z]?)', anchor_lower)
            if item_match:
                item_num = item_match.group(1).upper()
                return f"Item {item_num}"
            # Match part patterns: part_i, part_ii, parti, partii, etc.
            part_match = re.search(r'part_?([ivx]+)', anchor_lower)
            if part_match:
                part_num = part_match.group(1).upper()
                return f"Part {part_num}"
        # THIRD PRIORITY: Text-based normalization
        # Handle common Item patterns in text
        item_match = re.match(r'item\s+(\d+[a-z]?)', text, re.IGNORECASE)
        if item_match:
            return f"Item {item_match.group(1).upper()}"
        # Handle Part patterns
        part_match = re.match(r'part\s+([ivx]+)', text, re.IGNORECASE)
        if part_match:
            return f"Part {part_match.group(1).upper()}"
        # Handle specific known sections by text
        text_lower = text.lower()
        if 'business' in text_lower and 'item' not in text_lower:
            return "Item 1"
        elif 'risk factors' in text_lower and 'item' not in text_lower:
            return "Item 1A"
        elif 'properties' in text_lower and 'item' not in text_lower:
            return "Item 2"
        elif 'legal proceedings' in text_lower and 'item' not in text_lower:
            return "Item 3"
        elif 'management' in text_lower and 'discussion' in text_lower:
            return "Item 7"
        elif 'financial statements' in text_lower:
            return "Item 8"
        elif 'exhibits' in text_lower:
            return "Item 15"
        return text  # Return as-is if no normalization applies

    def _get_section_type_and_order(self, text: str) -> Tuple[str, int]:
        """Get section type and order for sorting.

        Returns:
            (section_type, order) where order encodes items as
            item_number*1000 plus a letter offset (Item 1=1000, Item 1A=1001)
            and parts as roman_value*100 (Part I=100, Part II=200).
            Unrecognized text sorts last as ('other', 99999).
        """
        text_lower = text.lower()
        # Items
        item_match = re.search(r'item\s*(\d+)([a-z]?)', text_lower)
        if item_match:
            item_num = int(item_match.group(1))
            item_letter = item_match.group(2) or ''
            # Order: Item 1=1000, Item 1A=1001, Item 2=2000, etc.
            order = item_num * 1000 + (ord(item_letter.upper()) - ord('A') + 1 if item_letter else 0)
            return 'item', order
        # Parts
        part_match = re.search(r'part\s*([ivx]+)', text_lower)
        if part_match:
            part_roman = part_match.group(1)
            part_num = self._roman_to_int(part_roman)
            return 'part', part_num * 100  # Part I=100, Part II=200, etc.
        # Known sections without explicit item numbers
        if 'business' in text_lower:
            return 'item', 1000  # Item 1
        elif 'risk factors' in text_lower:
            return 'item', 1001  # Item 1A
        elif 'properties' in text_lower:
            return 'item', 2000  # Item 2
        elif 'legal proceedings' in text_lower:
            return 'item', 3000  # Item 3
        elif 'management' in text_lower and 'discussion' in text_lower:
            return 'item', 7000  # Item 7
        elif 'financial statements' in text_lower:
            return 'item', 8000  # Item 8
        elif 'exhibits' in text_lower:
            return 'item', 15000  # Item 15
        return 'other', 99999

    def _roman_to_int(self, roman: str) -> int:
        """Convert roman numerals to integers.

        Standard right-to-left algorithm: subtract a symbol when a larger
        one has already been seen to its right (e.g. IV -> 4).
        Unknown characters contribute 0.
        """
        roman_map = {'i': 1, 'v': 5, 'x': 10, 'l': 50, 'c': 100, 'd': 500, 'm': 1000}
        roman = roman.lower()
        result = 0
        prev = 0
        for char in reversed(roman):
            value = roman_map.get(char, 0)
            if value < prev:
                result -= value
            else:
                result += value
            prev = value
        return result

    def _build_section_mapping(self, toc_sections: List[TOCSection]) -> Dict[str, str]:
        """Build final section mapping, handling duplicates intelligently.

        For 10-Q filings with part context, generates part-aware section names
        like "part_i_item_1" and "part_ii_item_1" to distinguish sections
        with the same item number across different parts.
        """
        # Sort sections by order
        # (in-place; callers pass a list built locally, so this is safe)
        toc_sections.sort(key=lambda x: x.order)
        mapping = {}
        seen_names = set()
        for section in toc_sections:
            # Generate part-aware section name for 10-Q filings
            if section.part:
                # Convert "Part I" -> "part_i", "Part II" -> "part_ii"
                part_key = section.part.lower().replace(' ', '_')
                # Convert "Item 1" -> "item_1", "Item 1A" -> "item_1a"
                item_key = section.normalized_name.lower().replace(' ', '_')
                section_name = f"{part_key}_{item_key}"
            else:
                # 10-K filings: use normalized name as-is
                section_name = section.normalized_name
            # Skip if we already have this section (prefer first occurrence)
            if section_name in seen_names:
                continue
            mapping[section_name] = section.anchor_id
            seen_names.add(section_name)
        return mapping

    def get_section_suggestions(self, html_content: str) -> List[str]:
        """Get list of available sections that can be extracted."""
        mapping = self.analyze_toc_structure(html_content)
        return sorted(mapping.keys(), key=lambda x: self._get_section_type_and_order(x)[1])
def analyze_toc_for_sections(html_content: str) -> Dict[str, str]:
    """
    Convenience function to analyze TOC and return section mapping.

    Args:
        html_content: Raw HTML content

    Returns:
        Dict mapping section names to anchor IDs
    """
    # Delegate to a throwaway analyzer instance; it holds no useful state
    # beyond its pattern table, so there is nothing to keep around.
    return TOCAnalyzer().analyze_toc_structure(html_content)

Some files were not shown because too many files have changed in this diff Show More