Initial commit
This commit is contained in:
@@ -0,0 +1,49 @@
|
||||
"""
|
||||
EdgarTools HTML Parser v2.0
|
||||
|
||||
A high-performance, semantically-aware HTML parser for SEC filings.
|
||||
"""
|
||||
|
||||
from typing import Optional

from edgar.documents.parser import HTMLParser
from edgar.documents.document import Document
from edgar.documents.config import ParserConfig
from edgar.documents.exceptions import ParsingError
from edgar.documents.types import NodeType, SemanticType, TableType
from edgar.documents.search import DocumentSearch, SearchResult, SearchMode
from edgar.documents.renderers import MarkdownRenderer, TextRenderer
|
||||
|
||||
# Package version (exposed for introspection, e.g. edgar.documents.__version__).
__version__ = "2.0.0"

# Public API of the edgar.documents package; everything else is internal.
__all__ = [
    'HTMLParser',
    'Document',
    'ParserConfig',
    'ParsingError',
    'NodeType',
    'SemanticType',
    'TableType',
    'DocumentSearch',
    'SearchResult',
    'SearchMode',
    'MarkdownRenderer',
    'TextRenderer',
    'parse_html'
]
|
||||
|
||||
|
||||
def parse_html(html: str, config: Optional[ParserConfig] = None) -> Document:
    """
    Convenience function for parsing HTML.

    Builds an HTMLParser with the given (or a default) configuration and
    parses the supplied HTML in a single call.

    Args:
        html: HTML content to parse
        config: Optional parser configuration; when None, a default
            ParserConfig is used

    Returns:
        Parsed Document object

    Example:
        >>> document = parse_html(html_content)
        >>> print(document.text()[:100])
    """
    # Fall back to a default configuration when none is supplied.
    parser = HTMLParser(config or ParserConfig())
    return parser.parse(html)
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,83 @@
|
||||
"""
|
||||
Mixin class providing text caching functionality for document nodes.
|
||||
|
||||
This module consolidates the text caching pattern used across multiple node types
|
||||
(DocumentNode, ParagraphNode, ContainerNode, TableNode, and Document).
|
||||
"""
|
||||
|
||||
from typing import Callable, Any
|
||||
|
||||
|
||||
class CacheableMixin:
    """
    Mixin providing text caching functionality for nodes.

    This mixin implements a lazy-evaluated text caching pattern that:
    1. Checks for existing cached text
    2. Generates text on first access via a generator function
    3. Caches the result for subsequent accesses
    4. Provides recursive cache clearing for tree structures

    Usage:
        class MyNode(CacheableMixin):
            def text(self, **kwargs):
                def generator():
                    # Generate text logic here
                    return "generated text"
                return self._get_cached_text(generator)
    """

    def _get_cached_text(self, generator_func: Callable[[], Any], *args, **kwargs) -> Any:
        """
        Get cached text or generate and cache it.

        This method implements the caching pattern:
        - If cache exists and is not None, return cached value
        - Otherwise, call generator function to create text
        - Store result in cache
        - Return the result

        Args:
            generator_func: Function that generates the text when a cache
                miss occurs
            *args: Positional arguments forwarded to generator_func
            **kwargs: Keyword arguments forwarded to generator_func

        Returns:
            The cached or newly generated text

        Note:
            The cache is stored in the instance attribute '_text_cache'.
            A cached value of None is treated as a miss, so a generator
            that returns None is re-invoked on every call.
        """
        # Single attribute lookup covers both "attribute missing" and
        # "attribute is None" (equivalent to the hasattr + is-not-None check).
        if getattr(self, '_text_cache', None) is not None:
            return self._text_cache

        # Cache miss: generate text, store it, and return it.
        self._text_cache = generator_func(*args, **kwargs)
        return self._text_cache

    def clear_text_cache(self) -> None:
        """
        Clear cached text recursively.

        This method:
        1. Clears the text cache for this node (sets to None)
        2. Recursively clears cache for all children (if node has children)

        The recursive clearing ensures that when a parent node's content
        changes, all descendant nodes also have their caches invalidated.

        Safe to call even if:
        - Node doesn't have a cache (_text_cache attribute)
        - Node doesn't have children
        - Children don't have clear_text_cache method
        """
        # Clear own cache if it exists
        if hasattr(self, '_text_cache'):
            self._text_cache = None

        # Recursively clear children's caches
        if hasattr(self, 'children'):
            for child in self.children:
                if hasattr(child, 'clear_text_cache'):
                    child.clear_text_cache()
|
||||
211
venv/lib/python3.10/site-packages/edgar/documents/config.py
Normal file
211
venv/lib/python3.10/site-packages/edgar/documents/config.py
Normal file
@@ -0,0 +1,211 @@
|
||||
"""
|
||||
Configuration for the HTML parser.
|
||||
"""
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from typing import List, Dict, Any, Optional
|
||||
|
||||
|
||||
@dataclass
class DetectionThresholds:
    """
    Tunable scoring thresholds used by the section-detection strategies.

    Attributes:
        min_confidence: Sections scoring below this confidence (0.0-1.0)
            are dropped from the results
        cross_validation_boost: Score multiplier applied when several
            detection methods agree (>1.0)
        disagreement_penalty: Score multiplier applied when detection
            methods disagree (<1.0)
        boundary_overlap_penalty: Score multiplier for sections whose
            boundaries overlap another section (<1.0)
        enable_cross_validation: Run cross-validation between methods
            (more accurate, but slower)
        thresholds_by_form: Per-filing-type overrides of these thresholds
    """
    min_confidence: float = 0.6
    cross_validation_boost: float = 1.2
    disagreement_penalty: float = 0.8
    boundary_overlap_penalty: float = 0.9
    # Off by default: cross-validation costs extra passes over the document.
    enable_cross_validation: bool = False
    thresholds_by_form: Dict[str, Dict[str, float]] = field(default_factory=dict)
|
||||
|
||||
|
||||
@dataclass
class ParserConfig:
    """
    Configuration for HTML parser.

    Groups the parser's performance, parsing, AI-optimization, table,
    section-detection, header-detection and text-extraction settings into
    a single dataclass, with factory classmethods for common profiles.

    Attributes:
        max_document_size: Maximum document size in bytes
        streaming_threshold: Document size threshold for streaming mode
        cache_size: Maximum number of cached items
        enable_parallel: Enable parallel processing for tables
        strict_mode: Fail on parsing errors vs. best effort
        extract_xbrl: Extract inline XBRL facts
        extract_styles: Extract and process CSS styles
        preserve_whitespace: Preserve original whitespace
        optimize_for_ai: Enable AI-specific optimizations
        max_token_estimation: Maximum estimated tokens for AI optimization
        features: Feature flags for optional functionality
    """

    # --- Performance settings ---
    max_document_size: int = 100 * 1024 * 1024  # 100MB (handles large filings like JPM)
    streaming_threshold: int = 10 * 1024 * 1024  # 10MB
    cache_size: int = 1000
    enable_parallel: bool = True
    max_workers: Optional[int] = None  # None = use CPU count

    # --- Parsing settings ---
    strict_mode: bool = False
    extract_xbrl: bool = True
    extract_styles: bool = True
    preserve_whitespace: bool = False
    normalize_text: bool = True
    extract_links: bool = True
    extract_images: bool = False

    # --- AI optimization ---
    optimize_for_ai: bool = True
    max_token_estimation: int = 100_000
    chunk_size: int = 512
    chunk_overlap: int = 128

    # --- Table processing ---
    table_extraction: bool = True
    detect_table_types: bool = True
    extract_table_relationships: bool = True
    fast_table_rendering: bool = True  # Fast renderer is now production-ready (7-10x faster than Rich)

    # --- Section detection ---
    detect_sections: bool = True
    eager_section_extraction: bool = False  # Lazy by default: extract sections on first access
    form: Optional[str] = None  # Required for section detection (e.g. '10-K', '10-Q', '8-K')
    detection_thresholds: DetectionThresholds = field(default_factory=DetectionThresholds)
    # Regex patterns used to recognize standard filing sections by name.
    section_patterns: Dict[str, List[str]] = field(default_factory=lambda: {
        'business': [
            r'item\s+1\.?\s*business',
            r'business\s+overview',
            r'our\s+business'
        ],
        'risk_factors': [
            r'item\s+1a\.?\s*risk\s+factors',
            r'risk\s+factors',
            r'factors\s+that\s+may\s+affect'
        ],
        'properties': [
            r'item\s+2\.?\s*properties',
            r'properties'
        ],
        'legal_proceedings': [
            r'item\s+3\.?\s*legal\s+proceedings',
            r'legal\s+proceedings',
            r'litigation'
        ],
        'mda': [
            r'item\s+7\.?\s*management\'?s?\s+discussion',
            r'md&a',
            r'management\'?s?\s+discussion\s+and\s+analysis'
        ],
        'financial_statements': [
            r'item\s+8\.?\s*financial\s+statements',
            r'consolidated\s+financial\s+statements',
            r'financial\s+statements'
        ]
    })

    # --- Feature flags for optional functionality ---
    features: Dict[str, bool] = field(default_factory=lambda: {
        'ml_header_detection': True,
        'semantic_analysis': True,
        'table_understanding': True,
        'xbrl_validation': True,
        'auto_section_detection': True,
        'smart_text_extraction': True,
        'footnote_linking': True,
        'cross_reference_resolution': True
    })

    # --- Header detection settings ---
    header_detection_threshold: float = 0.6  # Minimum confidence
    header_detection_methods: List[str] = field(default_factory=lambda: [
        'style',
        'pattern',
        'structural',
        'contextual'
    ])

    # --- Text extraction settings ---
    min_text_length: int = 10  # Minimum text length to keep
    merge_adjacent_nodes: bool = True
    merge_distance: int = 2  # Max distance between nodes to merge

    # --- Performance monitoring ---
    enable_profiling: bool = False
    log_performance: bool = False

    def to_dict(self) -> Dict[str, Any]:
        """Convert configuration to dictionary (core settings only)."""
        # 'features' is copied so the returned dict cannot be used to
        # mutate this config in place.
        scalar_keys = (
            'max_document_size',
            'streaming_threshold',
            'cache_size',
            'enable_parallel',
            'strict_mode',
            'extract_xbrl',
            'extract_styles',
            'preserve_whitespace',
            'optimize_for_ai',
        )
        result: Dict[str, Any] = {key: getattr(self, key) for key in scalar_keys}
        result['features'] = self.features.copy()
        return result

    @classmethod
    def for_performance(cls) -> 'ParserConfig':
        """Create config optimized for performance."""
        # Trades analysis depth for speed: skips styles/XBRL extraction,
        # disables ML-heavy feature flags, and enlarges the cache.
        return cls(
            extract_styles=False,
            extract_xbrl=False,
            enable_parallel=True,
            cache_size=5000,
            eager_section_extraction=False,  # Skip expensive section extraction
            fast_table_rendering=True,  # Fast renderer (enabled by default now)
            features={
                'ml_header_detection': False,
                'semantic_analysis': False,
                'table_understanding': False,
                'xbrl_validation': False
            }
        )

    @classmethod
    def for_accuracy(cls) -> 'ParserConfig':
        """Create config optimized for accuracy."""
        # Strict parsing with every analysis feature switched on.
        return cls(
            strict_mode=True,
            extract_styles=True,
            extract_xbrl=True,
            enable_parallel=True,
            features={
                'ml_header_detection': True,
                'semantic_analysis': True,
                'table_understanding': True,
                'xbrl_validation': True,
                'auto_section_detection': True,
                'smart_text_extraction': True,
                'footnote_linking': True,
                'cross_reference_resolution': True
            }
        )

    @classmethod
    def for_ai(cls) -> 'ParserConfig':
        """Create config optimized for AI/LLM processing."""
        # Clean, normalized text output; style extraction is skipped since
        # visual styling does not help LLM consumption.
        return cls(
            optimize_for_ai=True,
            extract_styles=False,
            extract_xbrl=True,
            normalize_text=True,
            merge_adjacent_nodes=True,
            features={
                'ml_header_detection': True,
                'semantic_analysis': True,
                'smart_text_extraction': True
            }
        )
|
||||
@@ -0,0 +1,314 @@
|
||||
# HTML Parser Rewrite - Status Report
|
||||
|
||||
**Generated**: 2025-10-08
|
||||
**Branch**: `html_rewrite`
|
||||
**Target**: Merge to `main`
|
||||
|
||||
---
|
||||
|
||||
## Overall Progress: ~95% Complete ✅
|
||||
|
||||
### Completed Phases
|
||||
|
||||
#### ✅ Phase 1: Core Implementation (100%)
|
||||
- [x] Streaming parser for large documents
|
||||
- [x] TableMatrix system for accurate table rendering
|
||||
- [x] Section extraction with Part I/II detection
|
||||
- [x] XBRL integration
|
||||
- [x] Rich-based table rendering
|
||||
- [x] Configuration system (ParserConfig)
|
||||
- [x] Error handling and validation
|
||||
|
||||
#### ✅ Phase 2: Functional Testing (100%)
|
||||
- [x] **Corpus Validation** - 40 diverse filings, 100% success rate
|
||||
- [x] **Edge Cases** - 31 tests covering invalid inputs, malformed HTML, edge conditions
|
||||
- [x] **Integration Tests** - 25 tests for Filing/Company integration, backward compatibility
|
||||
- [x] **Regression Tests** - 15 tests preventing known bugs from returning
|
||||
|
||||
**Total Test Count**: 79 functional tests, all passing
|
||||
|
||||
#### ✅ Phase 3: Performance Profiling (100%)
|
||||
- [x] **Benchmarking Infrastructure** - Comprehensive benchmark suite
|
||||
- [x] **Hot Path Analysis** - Identified 3 critical bottlenecks (63% section extraction, 40% Rich rendering, 15% regex)
|
||||
- [x] **Memory Profiling** - Found 255MB memory leak in MSFT 10-K, documented root causes
|
||||
- [x] **Performance Regression Tests** - 15 tests locking in baseline thresholds
|
||||
|
||||
**Performance Baseline Established**:
|
||||
- Average: 3.8MB/s throughput, 4.1MB memory per doc
|
||||
- Small docs: 2.6MB/s (optimization opportunity)
|
||||
- Large docs: 20.7MB/s (excellent streaming)
|
||||
- Memory leak: 19-25x ratio on medium docs (needs fixing)
|
||||
|
||||
#### ✅ Phase 4: Test Data Augmentation (100%)
|
||||
- [x] **HTML Fixtures** - Downloaded 32 files (155MB) from 16 companies across 6 industries
|
||||
- [x] **Download Automation** - Created `download_html_fixtures.py` script
|
||||
- [x] **Documentation** - Comprehensive fixture documentation
|
||||
|
||||
---
|
||||
|
||||
## Current Status: Ready for Optimization Phase
|
||||
|
||||
### What's Working Well ✅
|
||||
|
||||
1. **Parsing Accuracy**: 100% success rate across 40+ diverse filings
|
||||
2. **Large Document Handling**: Excellent streaming performance (20.7MB/s on JPM 10-K)
|
||||
3. **Table Extraction**: TableMatrix accurately handles colspan/rowspan
|
||||
4. **Test Coverage**: 79 comprehensive tests covering edge cases, integration, regression
|
||||
5. **Backward Compatibility**: Old TenK API still works for existing code
|
||||
|
||||
### Known Issues to Address 🔧
|
||||
|
||||
#### Critical (Must Fix Before Merge)
|
||||
|
||||
1. **Memory Leaks** (Priority: CRITICAL)
|
||||
- MSFT 10-K: 255MB leak (19x document size)
|
||||
- Apple 10-K: 41MB leak (23x document size)
|
||||
- **Root Causes**:
|
||||
- Rich Console objects retained (0.4MB per doc)
|
||||
- Global caches not cleared on document deletion
|
||||
- Circular references in node graph
|
||||
- **Location**: `tests/perf/memory_analysis.md:90-130`
|
||||
- **Impact**: Server crashes after 10-20 requests in production
|
||||
|
||||
2. **Performance Bottlenecks** (Priority: HIGH)
|
||||
- Section extraction: 3.7s (63% of parse time)
|
||||
- Rich rendering for text: 2.4s (40% of parse time)
|
||||
- Regex normalization: 0.8s (15% of parse time)
|
||||
- **Location**: `tests/perf/hotpath_analysis.md:9-66`
|
||||
- **Impact**: 4x slower than necessary on medium documents
|
||||
|
||||
#### Non-Critical (Can Fix After Merge)
|
||||
|
||||
3. **Small Document Performance** (Priority: MEDIUM)
|
||||
- 2.6MB/s vs desired 5MB/s
|
||||
- Overhead dominates on <5MB documents
|
||||
- **Optimization**: Lazy loading, reduce upfront processing
|
||||
|
||||
---
|
||||
|
||||
## Next Steps (In Order)
|
||||
|
||||
### Phase 5: Critical Fixes (2-3 days) 🔧
|
||||
|
||||
#### 5.1 Memory Leak Fixes (1-2 days)
|
||||
**Goal**: Reduce memory leak from 255MB to <5MB
|
||||
|
||||
Tasks:
|
||||
- [ ] Implement `Document.__del__()` to clear caches
|
||||
- [ ] Replace Rich rendering in `text()` with direct string building
|
||||
- [ ] Break circular references in node graph
|
||||
- [ ] Use weak references for parent links
|
||||
- [ ] Add `__slots__` to frequently created objects (Cell, TableNode)
|
||||
|
||||
**Expected Result**: MSFT 10-K leak: 255MB → <5MB (≈98% improvement)
|
||||
|
||||
**Validation**:
|
||||
```bash
|
||||
pytest tests/perf/test_performance_regression.py::TestMemoryRegression -v
|
||||
```
|
||||
|
||||
#### 5.2 Performance Optimizations (1-2 days)
|
||||
**Goal**: Improve parse speed from 1.2s → 0.3s on Apple 10-K (77% faster)
|
||||
|
||||
Tasks:
|
||||
- [ ] Fix section detection - use headings instead of rendering entire document
|
||||
- [ ] Implement fast text extraction without Rich overhead
|
||||
- [ ] Optimize regex normalization - combine patterns, use compilation
|
||||
|
||||
**Expected Results**:
|
||||
- Section extraction: 3.7s → 1.2s (≈68% faster)
|
||||
- Text extraction: 2.4s → 1.2s (50% faster)
|
||||
- Regex: 0.8s → 0.5s (40% faster)
|
||||
|
||||
**Validation**:
|
||||
```bash
|
||||
pytest tests/perf/test_performance_regression.py::TestParseSpeedRegression -v
|
||||
```
|
||||
|
||||
### Phase 6: Final Validation (1 day) ✅
|
||||
|
||||
Tasks:
|
||||
- [ ] Re-run all 79 functional tests
|
||||
- [ ] Re-run performance regression tests (verify improvements)
|
||||
- [ ] Run full corpus validation
|
||||
- [ ] Memory profiling validation (confirm leaks fixed)
|
||||
- [ ] Update CHANGELOG.md
|
||||
- [ ] Create merge summary document
|
||||
|
||||
### Phase 7: Merge to Main (1 day) 🚀
|
||||
|
||||
Tasks:
|
||||
- [ ] Final code review
|
||||
- [ ] Squash commits or create clean merge
|
||||
- [ ] Update version number
|
||||
- [ ] Merge to main
|
||||
- [ ] Tag release
|
||||
- [ ] Monitor for issues
|
||||
|
||||
---
|
||||
|
||||
## Test Summary
|
||||
|
||||
### Current Test Status: 79/79 Passing (100%)
|
||||
|
||||
```
|
||||
tests/corpus/test_corpus_validation.py 8 tests ✓
|
||||
tests/test_html_parser_edge_cases.py 31 tests ✓
|
||||
tests/test_html_parser_integration.py 25 tests ✓
|
||||
tests/test_html_parser_regressions.py 15 tests ✓
|
||||
tests/perf/test_performance_regression.py 15 tests ✓ (baseline established)
|
||||
```
|
||||
|
||||
### Test Execution
|
||||
|
||||
```bash
|
||||
# Functional tests (79 tests, ~30s)
|
||||
pytest tests/corpus tests/test_html_parser_*.py -v
|
||||
|
||||
# Performance tests (15 tests, ~20s)
|
||||
pytest tests/perf/test_performance_regression.py -m performance -v
|
||||
|
||||
# All tests
|
||||
pytest tests/ -v
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Performance Metrics
|
||||
|
||||
### Current Baseline (Before Optimization)
|
||||
|
||||
| Document | Size | Parse Time | Throughput | Memory | Tables | Sections |
|
||||
|----------|------|------------|------------|--------|--------|----------|
|
||||
| Apple 10-Q | 1.1MB | 0.307s | 3.6MB/s | 27.9MB (25.6x) | 40 | 9 |
|
||||
| Apple 10-K | 1.8MB | 0.500s | 3.6MB/s | 21.6MB (11.9x) | 63 | 8 |
|
||||
| MSFT 10-K | 7.8MB | 1.501s | 5.2MB/s | 147.0MB (18.9x) | 85 | 0 |
|
||||
| JPM 10-K | 52.4MB | 2.537s | 20.7MB/s | 0.6MB (0.01x) | 681 | 0 |
|
||||
|
||||
### Target Metrics (After Optimization)
|
||||
|
||||
| Metric | Current | Target | Improvement |
|
||||
|--------|---------|--------|-------------|
|
||||
| **Memory leak** | 41-255MB | <5MB | 95% reduction |
|
||||
| **Memory ratio** | 19-25x | <3x | 87% reduction |
|
||||
| **Parse time (Apple 10-K)** | 0.500s | 0.150s | 70% faster |
|
||||
| **Throughput (small docs)** | 2.6MB/s | 5.0MB/s | 92% faster |
|
||||
|
||||
---
|
||||
|
||||
## File Organization
|
||||
|
||||
### Core Parser Files
|
||||
```
|
||||
edgar/documents/
|
||||
├── __init__.py # Public API (parse_html)
|
||||
├── parser.py # Main parser with streaming
|
||||
├── config.py # ParserConfig
|
||||
├── document_builder.py # Document tree construction
|
||||
├── nodes/ # Node types (TableNode, SectionNode)
|
||||
├── utils/
|
||||
│ ├── streaming.py # Streaming parser (fixed JPM bug)
|
||||
│ └── table_processing.py # TableMatrix system
|
||||
└── exceptions.py # Custom exceptions
|
||||
```
|
||||
|
||||
### Test Files
|
||||
```
|
||||
tests/
|
||||
├── corpus/ # Corpus validation
|
||||
│ ├── quick_corpus.py # Corpus builder
|
||||
│ └── test_corpus_validation.py # 8 validation tests
|
||||
├── fixtures/
|
||||
│ ├── html/ # 32 HTML fixtures (155MB)
|
||||
│ │ ├── {ticker}/10k/ # By company and form
|
||||
│ │ └── README.md
|
||||
│ └── download_html_fixtures.py # Download automation
|
||||
├── perf/ # Performance testing
|
||||
│ ├── benchmark_html_parser.py # Benchmarking
|
||||
│ ├── profile_hotpaths.py # Hot path profiling
|
||||
│ ├── profile_memory.py # Memory profiling
|
||||
│ ├── test_performance_regression.py # Regression tests
|
||||
│ ├── performance_report.md # Benchmark results
|
||||
│ ├── hotpath_analysis.md # Bottleneck analysis
|
||||
│ └── memory_analysis.md # Memory leak analysis
|
||||
├── test_html_parser_edge_cases.py # 31 edge case tests
|
||||
├── test_html_parser_integration.py # 25 integration tests
|
||||
└── test_html_parser_regressions.py # 15 regression tests
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Risks and Mitigation
|
||||
|
||||
### Risk 1: Memory Leaks in Production
|
||||
**Severity**: HIGH
|
||||
**Probability**: HIGH (confirmed in testing)
|
||||
**Mitigation**: Must fix before merge (Phase 5.1)
|
||||
|
||||
### Risk 2: Performance Regression
|
||||
**Severity**: MEDIUM
|
||||
**Probability**: LOW (baseline established, regression tests in place)
|
||||
**Mitigation**: Performance regression tests will catch any degradation
|
||||
|
||||
### Risk 3: Backward Compatibility
|
||||
**Severity**: LOW
|
||||
**Probability**: LOW (integration tests passing)
|
||||
**Mitigation**: 25 integration tests verify old API still works
|
||||
|
||||
---
|
||||
|
||||
## Estimated Timeline to Merge
|
||||
|
||||
```
|
||||
Phase 5.1: Memory leak fixes 1-2 days
|
||||
Phase 5.2: Performance optimization 1-2 days
|
||||
Phase 6: Final validation 1 day
|
||||
Phase 7: Merge to main 1 day
|
||||
----------------------------------------
|
||||
Total: 4-6 days
|
||||
```
|
||||
|
||||
**Target Merge Date**: October 12-14, 2025
|
||||
|
||||
---
|
||||
|
||||
## Decision Points
|
||||
|
||||
### Should We Merge Now or After Optimization?
|
||||
|
||||
**Option A: Merge Now (Not Recommended)**
|
||||
- ✅ Functional tests passing
|
||||
- ✅ Backward compatible
|
||||
- ❌ Memory leaks (production risk)
|
||||
- ❌ Performance issues
|
||||
- ❌ Will require hotfix soon
|
||||
|
||||
**Option B: Fix Critical Issues First (Recommended)**
|
||||
- ✅ Production-ready
|
||||
- ✅ Performance validated
|
||||
- ✅ Memory efficient
|
||||
- ❌ 4-6 days delay
|
||||
- ✅ Clean, professional release
|
||||
|
||||
**Recommendation**: **Option B** - Fix critical memory leaks and performance issues before merge. The 4-6 day investment prevents production incidents and ensures a polished release.
|
||||
|
||||
---
|
||||
|
||||
## Questions for Review
|
||||
|
||||
1. **Scope**: Should we fix only critical issues (memory + performance) or also tackle small-doc optimization?
|
||||
2. **Timeline**: Is 4-6 days acceptable, or do we need to merge sooner?
|
||||
3. **Testing**: Are 79 functional tests + 15 performance tests sufficient coverage?
|
||||
4. **Documentation**: Do we need user-facing documentation updates?
|
||||
|
||||
---
|
||||
|
||||
## Conclusion
|
||||
|
||||
The HTML parser rewrite is **95% complete** with excellent functional testing but critical memory and performance issues identified. The smart path forward is:
|
||||
|
||||
1. ✅ Complete critical fixes (4-6 days)
|
||||
2. ✅ Validate improvements
|
||||
3. ✅ Merge to main with confidence
|
||||
|
||||
This approach ensures a production-ready, performant parser rather than merging now and hotfixing later.
|
||||
@@ -0,0 +1,437 @@
|
||||
# HTML Parser Rewrite - Progress Assessment
|
||||
|
||||
**Date**: 2025-10-07
|
||||
**Status**: Active Development (html_rewrite branch)
|
||||
|
||||
---
|
||||
|
||||
## Executive Summary
|
||||
|
||||
The HTML parser rewrite is **substantially complete** for core functionality with **excellent progress** on Item/section detection. Recent bug fixes (2025-10-07) have addressed critical table rendering issues and 10-Q Part I/II distinction, bringing the parser close to production-ready quality.
|
||||
|
||||
### Overall Progress: **~90% Complete**
|
||||
|
||||
- ✅ Core parsing infrastructure: **100% Complete**
|
||||
- ✅ Table processing: **95% Complete** (recent fixes)
|
||||
- ✅ Section/Item detection: **95% Complete** (Part I/II fixed, needs validation)
|
||||
- ⚠️ Performance optimization: **70% Complete**
|
||||
- ⚠️ Comprehensive testing: **65% Complete** (added 10-Q Part tests)
|
||||
- ⚠️ Documentation: **75% Complete**
|
||||
|
||||
---
|
||||
|
||||
## Goal Achievement Analysis
|
||||
|
||||
### Primary Goals (from goals.md)
|
||||
|
||||
#### 1. **Semantic Meaning Preservation** ✅ **ACHIEVED**
|
||||
> "Read text, tables and ixbrl data preserving greatest semantic meaning"
|
||||
|
||||
**Status**: ✅ Fully implemented
|
||||
- Text extraction with structure preservation
|
||||
- Advanced table matrix system for accurate table rendering
|
||||
- XBRL fact extraction before preprocessing
|
||||
- Hierarchical node model maintains document structure
|
||||
|
||||
**Recent Improvements**:
|
||||
- Header detection fixes (Oracle Table 6, Tesla Table 16)
|
||||
- Spacing column filter now preserves header columns (MSFT Table 39)
|
||||
- Multi-row header normalization
|
||||
|
||||
#### 2. **AI Channel (Primary) + Human Channel (Secondary)** ✅ **ACHIEVED**
|
||||
> "AI context is the primary goal, with human context being secondary"
|
||||
|
||||
**Status**: ✅ Both channels working
|
||||
- **AI Channel**:
|
||||
- Clean text output optimized for LLMs
|
||||
- Structured table rendering for context windows
|
||||
- Section-level extraction for chunking
|
||||
- Semantic divisibility supported
|
||||
|
||||
- **Human Channel**:
|
||||
- Rich console rendering with proper formatting
|
||||
- Markdown export
|
||||
- Visual table alignment (recently fixed)
|
||||
|
||||
#### 3. **Section-Level Processing** ✅ **ACHIEVED**
|
||||
> "Work at full document level and section level - breaking into independently processable sections"
|
||||
|
||||
**Status**: ✅ Implemented with good coverage
|
||||
- `SectionExtractor` class fully functional
|
||||
- TOC-based section detection
|
||||
- Pattern-based section identification
|
||||
- Lazy loading support for large documents
|
||||
|
||||
**What Works**:
|
||||
```python
|
||||
# Section detection is operational
|
||||
doc = parse_html(html)
|
||||
sections = doc.sections # Dict of section names -> SectionNode
|
||||
|
||||
# Access specific sections
|
||||
business = sections.get('Item 1 - Business')
|
||||
mda = sections.get('Item 7 - MD&A')
|
||||
financials = sections.get('Item 8 - Financial Statements')
|
||||
```
|
||||
|
||||
#### 4. **Standard Section Names (10-K, 10-Q, 8-K)** ✅ **ACHIEVED**
|
||||
> "For some filing types (10-K, 10-Q, 8-K) identify sections by standard names"
|
||||
|
||||
**Status**: ✅ 95% Complete - Implemented with Part I/II distinction for 10-Q
|
||||
|
||||
**What's Implemented**:
|
||||
- Pattern matching for standard Items:
|
||||
- Item 1 - Business
|
||||
- Item 1A - Risk Factors
|
||||
- Item 7 - MD&A
|
||||
- Item 7A - Market Risk
|
||||
- Item 8 - Financial Statements
|
||||
- And more...
|
||||
- **10-Q Part I/Part II distinction** (newly fixed 2025-10-07):
|
||||
- Part I - Item 1 (Financial Statements)
|
||||
- Part II - Item 1 (Legal Proceedings)
|
||||
- Proper boundary detection and context propagation
|
||||
- Prevents Item number conflicts
|
||||
|
||||
**What's Remaining** (5%):
|
||||
- Validation against large corpus of 10-K/10-Q filings
|
||||
- Edge case handling (non-standard formatting)
|
||||
- 8-K specific section patterns expansion
|
||||
|
||||
**Evidence from Code**:
|
||||
```python
|
||||
# edgar/documents/extractors/section_extractor.py
|
||||
(r'^(Item|ITEM)\s+1\.?\s*Business', 'Item 1 - Business'),
|
||||
(r'^(Item|ITEM)\s+1A\.?\s*Risk\s+Factors', 'Item 1A - Risk Factors'),
|
||||
(r'^(Item|ITEM)\s+7\.?\s*Management.*Discussion', 'Item 7 - MD&A'),
|
||||
(r'^(Item|ITEM)\s+8\.?\s*Financial\s+Statements', 'Item 8 - Financial Statements'),
|
||||
|
||||
# NEW: Part I/II detection (edgar/documents/extractors/section_extractor.py:294-324)
|
||||
def _detect_10q_parts(self, headers) -> Dict[int, str]:
|
||||
"""Detect Part I and Part II boundaries in 10-Q filings."""
|
||||
```
|
||||
|
||||
#### 5. **Table Processing for AI Context** ✅ **ACHIEVED**
|
||||
> "Getting tables in the right structure for rendering to text for AI context is more important than dataframes"
|
||||
|
||||
**Status**: ✅ Excellent progress with recent fixes
|
||||
- Advanced TableMatrix system handles complex tables
|
||||
- Multi-row header detection and normalization
|
||||
- Spacing column filtering (preserves semantic columns)
|
||||
- Currency symbol merging
|
||||
- Clean text rendering for LLM consumption
|
||||
|
||||
**Recent Fixes (Today)**:
|
||||
- ✅ Fixed spacing column filter removing legitimate headers (MSFT Table 39)
|
||||
- ✅ Fixed header detection for date ranges (Oracle Table 6)
|
||||
- ✅ Fixed long narrative text misclassification (Tesla Table 16)
|
||||
- ✅ Header row normalization for alignment
|
||||
|
||||
#### 6. **Better Than Old Parser in Every Way** 🟡 **MOSTLY ACHIEVED**
|
||||
> "Speed, accuracy, features, usability"
|
||||
|
||||
**Comparison**:
|
||||
|
||||
| Aspect | Old Parser | New Parser | Status |
|
||||
|--------|-----------|------------|--------|
|
||||
| **Speed** | Baseline | 1.4x faster (typical) | ✅ Better |
|
||||
| **Accuracy** | Good | Excellent (with recent fixes) | ✅ Better |
|
||||
| **Features** | Basic | Rich (XBRL, sections, multiple outputs) | ✅ Better |
|
||||
| **Usability** | Simple | Powerful + Simple API | ✅ Better |
|
||||
| **Table Rendering** | Basic alignment | Advanced matrix system | ✅ Better |
|
||||
| **Section Detection** | Limited | Comprehensive | ✅ Better |
|
||||
|
||||
**Areas Needing Validation**:
|
||||
- Performance on very large documents (>50MB)
|
||||
- Memory usage under sustained load
|
||||
- Edge case handling across diverse filings
|
||||
|
||||
---
|
||||
|
||||
## Item/Section Detection Deep Dive
|
||||
|
||||
### Current Capabilities
|
||||
|
||||
**10-K Sections Detected**:
|
||||
- ✅ Item 1 - Business
|
||||
- ✅ Item 1A - Risk Factors
|
||||
- ✅ Item 1B - Unresolved Staff Comments
|
||||
- ✅ Item 2 - Properties
|
||||
- ✅ Item 3 - Legal Proceedings
|
||||
- ✅ Item 4 - Mine Safety Disclosures
|
||||
- ✅ Item 5 - Market for Stock
|
||||
- ✅ Item 6 - Selected Financial Data
|
||||
- ✅ Item 7 - MD&A
|
||||
- ✅ Item 7A - Market Risk
|
||||
- ✅ Item 8 - Financial Statements
|
||||
- ✅ Item 9 - Changes in Accounting
|
||||
- ✅ Item 9A - Controls and Procedures
|
||||
- ✅ Item 9B - Other Information
|
||||
- ✅ Item 10 - Directors and Officers
|
||||
- ✅ Item 11 - Executive Compensation
|
||||
- ✅ Item 12 - Security Ownership
|
||||
- ✅ Item 13 - Related Transactions
|
||||
- ✅ Item 14 - Principal Accountant
|
||||
- ✅ Item 15 - Exhibits
|
||||
|
||||
**10-Q Sections Detected**:
|
||||
- ✅ Part I Items (Financial Information):
|
||||
- Part I - Item 1 - Financial Statements
|
||||
- Part I - Item 2 - MD&A
|
||||
- Part I - Item 3 - Market Risk
|
||||
- Part I - Item 4 - Controls and Procedures
|
||||
- ✅ Part II Items (Other Information):
|
||||
- Part II - Item 1 - Legal Proceedings
|
||||
- Part II - Item 1A - Risk Factors
|
||||
- Part II - Item 2 - Unregistered Sales
|
||||
- Part II - Item 6 - Exhibits
|
||||
|
||||
**✅ FIXED** (2025-10-07): Part I/Part II distinction now implemented!
|
||||
- Part I Item 1 and Part II Item 1 are properly distinguished
|
||||
- Section keys include Part context: "Part I - Item 1 - Financial Statements" vs "Part II - Item 1 - Legal Proceedings"
|
||||
- Comprehensive test coverage added (5 tests in test_10q_part_detection.py)
|
||||
|
||||
**8-K Sections**:
|
||||
- ⚠️ Limited - needs expansion
|
||||
|
||||
### Detection Methods
|
||||
|
||||
1. **TOC-based Detection** ✅
|
||||
- Analyzes Table of Contents
|
||||
- Extracts anchor links
|
||||
- Maps sections to content
|
||||
|
||||
2. **Pattern-based Detection** ✅
|
||||
- Regex matching for Item headers
|
||||
- Heading analysis (h1-h6 tags)
|
||||
- Text pattern recognition
|
||||
|
||||
3. **Hybrid Approach** ✅
|
||||
- Combines TOC + patterns
|
||||
- Fallback mechanisms
|
||||
- Cross-validation
|
||||
|
||||
### What's Working
|
||||
|
||||
```python
|
||||
# This works today:
|
||||
from edgar.documents import parse_html
|
||||
|
||||
html = filing.html()
|
||||
doc = parse_html(html)
|
||||
|
||||
# Get all sections
|
||||
sections = doc.sections # Returns dict
|
||||
|
||||
# Access specific Items
|
||||
if 'Item 7 - MD&A' in sections:
|
||||
mda = sections['Item 7 - MD&A']
|
||||
mda_text = mda.text()
|
||||
mda_tables = mda.tables()
|
||||
```
|
||||
|
||||
### What Needs Work
|
||||
|
||||
1. **Validation Coverage** (20% remaining)
|
||||
- Test against 100+ diverse 10-K filings
|
||||
- Test against 10-Q filings
|
||||
- Test against 8-K filings
|
||||
- Capture edge cases and variations
|
||||
|
||||
2. **Edge Cases** (20% remaining)
|
||||
- Non-standard Item formatting
|
||||
- Missing TOC
|
||||
- Nested sections
|
||||
- Combined Items (e.g., "Items 10, 13, 14")
|
||||
|
||||
3. **8-K Support** (50% remaining)
|
||||
- 8-K specific Item patterns
|
||||
- Event-based section detection
|
||||
- Exhibit handling
|
||||
|
||||
---
|
||||
|
||||
## Recent Achievements (Past 24 Hours)
|
||||
|
||||
### Critical Bug Fixes ✅
|
||||
|
||||
1. **Spacing Column Filter Fix** (MSFT Table 39)
|
||||
- Problem: Legitimate headers removed as "spacing"
|
||||
- Solution: Header content protection + colspan preservation
|
||||
- Impact: Tables now render correctly with all headers
|
||||
- Commits: `4e43276`, `d19ddd1`
|
||||
|
||||
2. **Header Detection Improvements**
|
||||
- Oracle Table 6: Date ranges no longer misclassified
|
||||
- Tesla Table 16: Long narrative text properly handled
|
||||
- Multi-row header normalization
|
||||
- Comprehensive test coverage (16 new tests)
|
||||
|
||||
3. **Documentation Updates**
|
||||
- TESTING.md clarified output limits
|
||||
- CHANGELOG updated with fixes
|
||||
- Bug reports and research docs completed
|
||||
|
||||
### Quality Metrics
|
||||
|
||||
**Test Coverage**:
|
||||
- 16 new tests added (all passing)
|
||||
- 0 regressions in existing tests
|
||||
- Comprehensive edge case coverage
|
||||
|
||||
**Code Quality**:
|
||||
- Clean implementation following plan
|
||||
- Well-documented changes
|
||||
- Proper commit messages with Claude Code attribution
|
||||
|
||||
---
|
||||
|
||||
## Path to 100% Completion
|
||||
|
||||
### High Priority (Next Steps)
|
||||
|
||||
**📋 Detailed plans available**:
|
||||
- **Performance**: See `docs-internal/planning/active-tasks/2025-10-07-performance-optimization-plan.md`
|
||||
- **Testing**: See `docs-internal/planning/active-tasks/2025-10-07-comprehensive-testing-plan.md`
|
||||
|
||||
1. **Performance Optimization** (1-2 weeks)
|
||||
- [ ] Phase 1: Benchmarking & profiling (2-3 days)
|
||||
- [ ] Phase 2: Algorithm optimizations (3-4 days)
|
||||
- [ ] Phase 3: Validation & regression tests (2-3 days)
|
||||
- [ ] Phase 4: Documentation & monitoring (1 day)
|
||||
- **Goal**: Maintain 1.3x+ speed advantage, <2x memory usage
|
||||
|
||||
2. **Comprehensive Testing** (2-3 weeks)
|
||||
- [ ] Phase 1: Corpus validation - 100+ filings (3-4 days)
|
||||
- [ ] Phase 2: Edge cases & error handling (2-3 days)
|
||||
- [ ] Phase 3: Integration testing (2-3 days)
|
||||
- [ ] Phase 4: Regression prevention (1-2 days)
|
||||
- [ ] Phase 5: Documentation & sign-off (1 day)
|
||||
- **Goal**: >95% success rate, >80% test coverage
|
||||
|
||||
3. **Item Detection Validation** (included in testing plan)
|
||||
- [ ] Test against 50+ diverse 10-K filings
|
||||
- [ ] Test against 20+ 10-Q filings
|
||||
- [ ] Document any pattern variations found
|
||||
- [ ] Add regression tests for edge cases
|
||||
|
||||
### Medium Priority
|
||||
|
||||
4. **8-K Support** (1-2 days)
|
||||
- [ ] Research 8-K Item patterns
|
||||
- [ ] Implement detection patterns
|
||||
- [ ] Test against sample 8-K filings
|
||||
|
||||
5. **Documentation** (1 day)
|
||||
- [ ] User guide for section access
|
||||
- [ ] API documentation
|
||||
- [ ] Migration guide from old parser
|
||||
- [ ] Examples and recipes
|
||||
|
||||
### Low Priority (Polish)
|
||||
|
||||
6. **Final Polish**
|
||||
- [ ] Error message improvements
|
||||
- [ ] Logging enhancements
|
||||
- [ ] Configuration documentation
|
||||
- [ ] Performance tuning
|
||||
|
||||
---
|
||||
|
||||
## Risk Assessment
|
||||
|
||||
### Low Risk ✅
|
||||
- Core parsing functionality (stable)
|
||||
- Table processing (recently fixed, well-tested)
|
||||
- Text extraction (working well)
|
||||
- XBRL extraction (functional)
|
||||
|
||||
### Medium Risk ⚠️
|
||||
- Section detection edge cases (needs validation)
|
||||
- Performance on very large docs (needs testing)
|
||||
- Memory usage (needs profiling)
|
||||
|
||||
### Mitigation Strategy
|
||||
1. Comprehensive validation testing (in progress)
|
||||
2. Real-world filing corpus testing
|
||||
3. Performance benchmarking suite
|
||||
4. Gradual rollout with monitoring
|
||||
|
||||
---
|
||||
|
||||
## Recommendations
|
||||
|
||||
### Immediate Actions (This Week)
|
||||
|
||||
1. **Validate Item Detection** 🎯 **TOP PRIORITY**
|
||||
```bash
|
||||
# Run on diverse corpus
|
||||
python tests/manual/compare_parsers.py --all
|
||||
|
||||
# Test specific sections
|
||||
python -c "
|
||||
from edgar.documents import parse_html
|
||||
from pathlib import Path
|
||||
|
||||
for filing in ['Apple', 'Oracle', 'Tesla', 'Microsoft']:
|
||||
html = Path(f'data/html/{filing}.10-K.html').read_text()
|
||||
doc = parse_html(html)
|
||||
print(f'{filing}: {list(doc.sections.keys())[:5]}...')
|
||||
"
|
||||
```
|
||||
|
||||
2. **Create Section Access Tests**
|
||||
- Write tests that verify each Item can be accessed
|
||||
- Validate text and table extraction from sections
|
||||
- Test edge cases (missing Items, combined Items)
|
||||
|
||||
3. **User Acceptance Testing**
|
||||
- Have maintainer review section detection output
|
||||
- Validate against known-good filings
|
||||
- Document any issues found
|
||||
|
||||
### Timeline to Production
|
||||
|
||||
**Optimistic**: 1 week
|
||||
- If validation shows good Item detection
|
||||
- If performance is acceptable
|
||||
- If no major issues found
|
||||
|
||||
**Realistic**: 2-3 weeks
|
||||
- Account for edge case fixes
|
||||
- Additional testing needed
|
||||
- Documentation completion
|
||||
|
||||
**Conservative**: 4 weeks
|
||||
- Account for 8-K support
|
||||
- Comprehensive testing across all filing types
|
||||
- Full documentation
|
||||
|
||||
---
|
||||
|
||||
## Conclusion
|
||||
|
||||
The HTML parser rewrite is **very close to completion** with excellent progress on all goals:
|
||||
|
||||
**✅ Fully Achieved**:
|
||||
- Semantic meaning preservation
|
||||
- AI/Human channel support
|
||||
- Section-level processing
|
||||
- Table processing for AI context
|
||||
- Superior to old parser (in most respects)
|
||||
- **Standard Item detection for 10-K/10-Q** (with Part I/II distinction)
|
||||
|
||||
**⚠️ Remaining Work (10%)**:
|
||||
- Validation against diverse corpus
|
||||
- Edge case handling
|
||||
- 8-K specific support expansion
|
||||
- Final testing and documentation
|
||||
|
||||
**Bottom Line**: The parser is **production-ready for 10-K/10-Q** with Item detection functional but requiring validation. The recent bug fixes have resolved critical table rendering issues. With 1-2 weeks of focused validation and testing, this can be shipped with confidence.
|
||||
|
||||
### Next Steps
|
||||
1. Run comprehensive Item detection validation
|
||||
2. Create section access test suite
|
||||
3. Performance benchmark
|
||||
4. Maintainer review and sign-off
|
||||
5. Merge to main branch
|
||||
@@ -0,0 +1,233 @@
|
||||
# HTML Parser Testing Quick Start
|
||||
|
||||
Quick reference for testing the HTML parser rewrite during quality improvement.
|
||||
|
||||
## Quick Start
|
||||
|
||||
```bash
|
||||
# Use shortcuts (easy!)
|
||||
python tests/manual/compare_parsers.py aapl # Apple 10-K
|
||||
python tests/manual/compare_parsers.py nvda --tables # Nvidia tables
|
||||
python tests/manual/compare_parsers.py 'aapl 10-q' # Apple 10-Q
|
||||
python tests/manual/compare_parsers.py orcl --table 5 # Oracle table #5
|
||||
|
||||
# Or use full paths
|
||||
python tests/manual/compare_parsers.py data/html/Apple.10-K.html
|
||||
|
||||
# Run all test files
|
||||
python tests/manual/compare_parsers.py --all
|
||||
```
|
||||
|
||||
**Available shortcuts:**
|
||||
- **Companies**: `aapl`, `msft`, `tsla`, `nvda`, `orcl` (or full names like `apple`)
|
||||
- **Filing types**: `10-k` (default), `10-q`, `8-k`
|
||||
- **Combine**: `'aapl 10-q'`, `'orcl 8-k'`
|
||||
|
||||
## Common Use Cases
|
||||
|
||||
### 1. First Look at a Filing
|
||||
|
||||
```bash
|
||||
# Get overview: speed, table count, sections
|
||||
python tests/manual/compare_parsers.py orcl
|
||||
```
|
||||
|
||||
**Shows**:
|
||||
- Parse time comparison (OLD vs NEW)
|
||||
- Tables found
|
||||
- Text length
|
||||
- Sections detected
|
||||
- New features (headings, XBRL)
|
||||
|
||||
### 2. Check Table Rendering
|
||||
|
||||
```bash
|
||||
# List all tables with dimensions (shows first 20 tables)
|
||||
python tests/manual/compare_parsers.py aapl --tables
|
||||
|
||||
# Compare specific table side-by-side (FULL table, no truncation)
|
||||
python tests/manual/compare_parsers.py aapl --table 7
|
||||
|
||||
# Compare a range of tables
|
||||
python tests/manual/compare_parsers.py aapl --range 5:10
|
||||
```
|
||||
|
||||
**Look for**:
|
||||
- Currency symbols merged: `$1,234` not `$ | 1,234`
|
||||
- Proper column alignment
|
||||
- Correct row/column counts
|
||||
- Clean rendering without extra spacing columns
|
||||
|
||||
**Note**: `--table N` shows the **complete table** with all rows - no truncation!
|
||||
|
||||
### 3. Verify Text Extraction
|
||||
|
||||
```bash
|
||||
# See first 50 lines side-by-side (default limit)
|
||||
python tests/manual/compare_parsers.py msft --text
|
||||
|
||||
# Show more lines (configurable)
|
||||
python tests/manual/compare_parsers.py msft --text --lines 100
|
||||
|
||||
# Show first 200 lines
|
||||
python tests/manual/compare_parsers.py msft --text --lines 200
|
||||
```
|
||||
|
||||
**Check**:
|
||||
- Semantic meaning preserved
|
||||
- No missing content
|
||||
- Clean formatting for LLM consumption
|
||||
|
||||
**Note**: Text mode shows first N lines only (default: 50). Use `--lines N` to adjust.
|
||||
|
||||
### 4. Check Section Detection
|
||||
|
||||
```bash
|
||||
python tests/manual/compare_parsers.py aapl --sections
|
||||
```
|
||||
|
||||
**Verify**:
|
||||
- Standard sections identified (10-K/10-Q)
|
||||
- Section boundaries correct
|
||||
- Text length reasonable per section
|
||||
|
||||
### 5. Run Full Test Suite
|
||||
|
||||
```bash
|
||||
# Test all files in corpus
|
||||
python tests/manual/compare_parsers.py --all
|
||||
```
|
||||
|
||||
**Results**:
|
||||
- Summary table across all files
|
||||
- Performance comparison
|
||||
- Table detection comparison
|
||||
|
||||
## Test Files
|
||||
|
||||
Available in `data/html/`:
|
||||
|
||||
- `Apple.10-K.html` - 1.8MB, complex financials
|
||||
- `Oracle.10-K.html` - Large filing
|
||||
- `Nvidia.10-K.html` - Tech company
|
||||
- `Apple.10-Q.html` - Quarterly format
|
||||
- More files as needed...
|
||||
|
||||
## Command Reference
|
||||
|
||||
```
|
||||
python tests/manual/compare_parsers.py [FILE] [OPTIONS]
|
||||
|
||||
Options:
|
||||
--all Run on all test files
|
||||
--tables Show tables summary (first 20 tables)
|
||||
--table N Show specific table N side-by-side (FULL table)
|
||||
--range START:END Show range of tables (e.g., 5:10)
|
||||
--text Show text comparison (first 50 lines by default)
|
||||
--sections Show sections comparison
|
||||
--lines N Number of text lines to show (default: 50, only for --text)
|
||||
--help Show full help
|
||||
```
|
||||
|
||||
### Output Limits Summary
|
||||
|
||||
| Mode | Limit | Configurable | Notes |
|
||||
|---------------|------------|-------------------|---------------------------------|
|
||||
| `--table N` | None | N/A | Shows **complete table** |
|
||||
| `--range N:M` | None | N/A | Shows **complete tables** in range |
|
||||
| `--tables` | 20 tables | No | Lists first 20 tables only |
|
||||
| `--text` | 50 lines | Yes (`--lines N`) | Preview only |
|
||||
| `--sections` | None | N/A | Shows all sections |
|
||||
|
||||
## Output Interpretation
|
||||
|
||||
### Overview Table
|
||||
|
||||
```
|
||||
┏━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━┓
|
||||
┃ Metric ┃ Old Parser ┃ New Parser ┃ Notes ┃
|
||||
┡━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━┩
|
||||
│ Parse Time │ 454ms │ 334ms │ 1.4x faster│
|
||||
│ Tables Found │ 63 │ 63 │ +0 │
|
||||
│ Text Length │ 0 │ 159,388 │ NEW! │
|
||||
└───────────────┴────────────┴────────────┴────────────┘
|
||||
```
|
||||
|
||||
**Good signs**:
|
||||
- ✅ New parser faster or similar speed
|
||||
- ✅ Same or more tables found
|
||||
- ✅ Text extracted (old parser shows 0)
|
||||
- ✅ Sections detected
|
||||
|
||||
**Red flags**:
|
||||
- ❌ Significantly slower
|
||||
- ❌ Fewer tables (unless removing layout tables)
|
||||
- ❌ Much shorter text (content missing)
|
||||
|
||||
### Table Comparison
|
||||
|
||||
```
|
||||
Old Parser:
|
||||
┌─────────┬──────────┬──────────┐
|
||||
│ Year │ Revenue │ Profit │
|
||||
├─────────┼──────────┼──────────┤
|
||||
│ 2023 │ $ 100M │ $ 20M │ <- Currency separated
|
||||
└─────────┴──────────┴──────────┘
|
||||
|
||||
New Parser:
|
||||
┌─────────┬──────────┬──────────┐
|
||||
│ Year │ Revenue │ Profit │
|
||||
├─────────┼──────────┼──────────┤
|
||||
│ 2023 │ $100M │ $20M │ <- Currency merged ✅
|
||||
└─────────┴──────────┴──────────┘
|
||||
```
|
||||
|
||||
**Look for**:
|
||||
- Currency symbols merged with values
|
||||
- No extra empty columns
|
||||
- Proper alignment
|
||||
- Clean numeric formatting
|
||||
|
||||
## Tips
|
||||
|
||||
1. **Start with overview** - Get the big picture first
|
||||
2. **Check tables visually** - Automated metrics miss formatting issues
|
||||
3. **Use specific table inspection** - Don't scroll through 60 tables manually
|
||||
4. **Compare text for semantics** - Does it make sense for an LLM?
|
||||
5. **Run --all periodically** - Catch regressions across files
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Script fails with import error
|
||||
|
||||
```bash
|
||||
# Clear cached modules
|
||||
find . -type d -name __pycache__ -exec rm -rf {} +
|
||||
python tests/manual/compare_parsers.py data/html/Apple.10-K.html
|
||||
```
|
||||
|
||||
### File not found
|
||||
|
||||
```bash
|
||||
# Check available files
|
||||
ls -lh data/html/*.html
|
||||
|
||||
# Use full path
|
||||
python tests/manual/compare_parsers.py /full/path/to/file.html
|
||||
```
|
||||
|
||||
### Old parser shows 0 text
|
||||
|
||||
This is expected - old parser has different text extraction. Focus on:
|
||||
- Table comparison
|
||||
- Parse time
|
||||
- Visual quality of output
|
||||
|
||||
## Next Steps
|
||||
|
||||
1. Run comparison on all test files
|
||||
2. Document bugs in `quality-improvement-strategy.md`
|
||||
3. Fix issues
|
||||
4. Repeat until satisfied
|
||||
|
||||
See `edgar/documents/docs/quality-improvement-strategy.md` for full process.
|
||||
@@ -0,0 +1,529 @@
|
||||
# Fast Table Rendering
|
||||
|
||||
**Status**: Production Ready - **Now the Default** (as of 2025-10-08)
|
||||
**Performance**: ~8-10x faster than Rich rendering with correct colspan/rowspan handling
|
||||
|
||||
---
|
||||
|
||||
## Overview
|
||||
|
||||
Fast table rendering provides a high-performance alternative to Rich library rendering for table text extraction. When parsing SEC filings with hundreds of tables, the cumulative rendering time can become a bottleneck. Fast rendering addresses this by using direct string building with TableMatrix for proper colspan/rowspan handling, achieving 8-10x speedup while maintaining correctness.
|
||||
|
||||
**As of 2025-10-08, fast rendering is the default** for all table text extraction. You no longer need to explicitly enable it.
|
||||
|
||||
### Why It's Now the Default
|
||||
|
||||
- **Production-ready**: Fixed all major issues (colspan, multi-row headers, multi-line cells)
|
||||
- **7-10x faster**: Significant performance improvement with correct output
|
||||
- **Maintains quality**: Matches Rich's appearance with simple() style
|
||||
- **Proven**: Extensively tested with Apple, NVIDIA, Microsoft 10-K filings
|
||||
|
||||
### When to Disable (Use Rich Instead)
|
||||
|
||||
You may want to disable fast rendering and use Rich for:
|
||||
- **Terminal display for humans**: Rich has more sophisticated text wrapping and layout
|
||||
- **Visual reports**: When presentation quality is more important than speed
|
||||
- **Debugging**: Rich output can be easier to visually inspect
|
||||
|
||||
---
|
||||
|
||||
## Usage
|
||||
|
||||
### Default Behavior (Fast Rendering Enabled)
|
||||
|
||||
```python
|
||||
from edgar.documents import parse_html
|
||||
|
||||
# Fast rendering is now the default - no configuration needed!
|
||||
doc = parse_html(html)
|
||||
|
||||
# Tables automatically use fast renderer (7-10x faster)
|
||||
table_text = doc.tables[0].text()
|
||||
```
|
||||
|
||||
### Disabling Fast Rendering (Use Rich Instead)
|
||||
|
||||
If you need Rich's sophisticated layout for visual display:
|
||||
|
||||
```python
|
||||
from edgar.documents import parse_html
|
||||
from edgar.documents.config import ParserConfig
|
||||
|
||||
# Explicitly disable fast rendering to use Rich
|
||||
config = ParserConfig(fast_table_rendering=False)
|
||||
doc = parse_html(html, config=config)
|
||||
|
||||
# Tables use Rich renderer (slower but with advanced formatting)
|
||||
table_text = doc.tables[0].text()
|
||||
```
|
||||
|
||||
### Custom Table Styles
|
||||
|
||||
**New in this version**: Fast rendering now uses the `simple()` style by default, which matches Rich's `box.SIMPLE` appearance (borderless, clean).
|
||||
|
||||
```python
|
||||
from edgar.documents import parse_html
|
||||
from edgar.documents.config import ParserConfig
|
||||
from edgar.documents.renderers.fast_table import FastTableRenderer, TableStyle
|
||||
|
||||
# Enable fast rendering (uses simple() style by default)
|
||||
config = ParserConfig(fast_table_rendering=True)
|
||||
doc = parse_html(html, config=config)
|
||||
|
||||
# Default: simple() style - borderless, clean
|
||||
table_text = doc.tables[0].text()
|
||||
|
||||
# To use pipe_table() style explicitly (markdown-compatible borders):
|
||||
renderer = FastTableRenderer(TableStyle.pipe_table())
|
||||
pipe_text = renderer.render_table_node(doc.tables[0])
|
||||
|
||||
# To use minimal() style (no separator):
|
||||
renderer = FastTableRenderer(TableStyle.minimal())
|
||||
minimal_text = renderer.render_table_node(doc.tables[0])
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Performance Comparison
|
||||
|
||||
### Benchmark Results
|
||||
|
||||
**Test**: Apple 10-K (63 tables) - Updated 2025-10-08
|
||||
|
||||
| Renderer | Average Per Table | Improvement | Notes |
|
||||
|----------|-------------------|-------------|-------|
|
||||
| Rich | 1.5-2.5ms | Baseline | Varies by table complexity |
|
||||
| Fast (simple) | 0.15-0.35ms | **7-10x faster** | With proper colspan/rowspan handling |
|
||||
|
||||
**Real-world Examples** (Apple 10-K):
|
||||
- Table 15 (complex colspan): Rich 2.51ms → Fast 0.35ms (**7.1x faster**)
|
||||
- Table 6 (multi-line cells): Rich 1.61ms → Fast 0.17ms (**9.5x faster**)
|
||||
- Table 5 (wide table): Rich 3.70ms → Fast 0.48ms (**7.7x faster**)
|
||||
|
||||
**Impact on Full Parse**:
|
||||
- Rich rendering: 30-40% of total parse time spent in table rendering
|
||||
- Fast rendering: 5-10% of total parse time
|
||||
- **Overall speedup**: Reduces total parsing time by ~25-30%
|
||||
|
||||
### Memory Impact
|
||||
|
||||
Fast rendering also reduces memory overhead:
|
||||
- No Rich Console objects retained
|
||||
- Direct string building (no intermediate objects)
|
||||
- Helps prevent memory leaks identified in profiling
|
||||
|
||||
---
|
||||
|
||||
## Output Examples
|
||||
|
||||
### Rich Renderer Output (Optional — no longer the default)
|
||||
|
||||
```
|
||||
(In millions)
|
||||
Year Ended June 30, 2025 2024 2023
|
||||
──────────────────────────────────────────────────────────
|
||||
|
||||
Operating lease cost $5,524 3,555 2,875
|
||||
|
||||
Finance lease cost:
|
||||
Amortization of right-of-use assets $3,408 1,800 1,352
|
||||
Interest on lease liabilities 1,417 734 501
|
||||
|
||||
Total finance lease cost $4,825 2,534 1,853
|
||||
```
|
||||
|
||||
**Style**: `box.SIMPLE` - No outer border, just horizontal separator under header
|
||||
**Pros**: Clean, uncluttered, perfect alignment, generous spacing
|
||||
**Cons**: Slow (1.5–2.5ms per table; see benchmark above), creates Rich objects, memory overhead
|
||||
|
||||
### Fast Renderer Output (NEW: simple() style - Default)
|
||||
|
||||
```
|
||||
December 31, 2023 December 31, 2022 December 31, 2021
|
||||
───────────────────────────────────────────────────────────────────────────────────────
|
||||
Revenue 365,817 394,328 365,817
|
||||
Cost of revenue 223,546 212,981 192,266
|
||||
Gross profit 142,271 181,347 173,551
|
||||
```
|
||||
|
||||
**Style**: `simple()` - Matches Rich's `box.SIMPLE` appearance
|
||||
**Pros**: Fast (0.2ms per table), clean appearance, no visual noise, professional look
|
||||
**Cons**: None - this is now the recommended default!
|
||||
|
||||
### Fast Renderer Output (pipe_table() style - Optional)
|
||||
|
||||
```
|
||||
| | December 31, 2023 | December 31, 2022 | December 31, 2021 |
|
||||
|--------------------------|---------------------|---------------------|---------------------|
|
||||
| Revenue | 365,817 | 394,328 | 365,817 |
|
||||
| Cost of revenue | 223,546 | 212,981 | 192,266 |
|
||||
| Gross profit | 142,271 | 181,347 | 173,551 |
|
||||
```
|
||||
|
||||
**Style**: `pipe_table()` - Markdown-compatible with borders
|
||||
**Pros**: Fast (0.2ms per table), markdown-compatible, explicit column boundaries
|
||||
**Cons**: Visual noise from pipe characters, busier appearance
|
||||
**Use when**: You need markdown-compatible output with explicit borders
|
||||
|
||||
### Visual Comparison
|
||||
|
||||
**Rich** (`box.SIMPLE`):
|
||||
- No outer border - clean, uncluttered look
|
||||
- Horizontal line separator under header only
|
||||
- Generous internal spacing and padding
|
||||
- Perfect column alignment
|
||||
- Professional, minimalist presentation
|
||||
|
||||
**Fast simple()** (NEW DEFAULT):
|
||||
- No outer border - matches Rich's clean look
|
||||
- Horizontal line separator under header (using `─`)
|
||||
- Space-separated columns with generous padding
|
||||
- Clean, professional appearance
|
||||
- Same performance as pipe_table (~0.2ms per table)
|
||||
|
||||
**Fast pipe_table()** (optional):
|
||||
- Full pipe table borders (`|` characters everywhere)
|
||||
- Horizontal dashes for header separator
|
||||
- Markdown-compatible format
|
||||
- Explicit column boundaries
|
||||
|
||||
---
|
||||
|
||||
## Recent Improvements (2025-10-08)
|
||||
|
||||
### 1. Colspan/Rowspan Support
|
||||
|
||||
**Fixed**: Tables with `colspan` and `rowspan` attributes now render correctly.
|
||||
|
||||
**Previous issue**: Fast renderer was extracting cell text without accounting for colspan/rowspan, causing:
|
||||
- Missing columns (e.g., "2023" column disappeared in Apple 10-K table 15)
|
||||
- Misaligned data (currency symbols separated from values)
|
||||
- Data loss (em dashes and other values missing)
|
||||
|
||||
**Solution**: Integrated `TableMatrix` for proper cell expansion, same as Rich rendering uses.
|
||||
|
||||
**Status**: ✅ FIXED
|
||||
|
||||
### 2. Multi-Row Header Preservation
|
||||
|
||||
**Fixed**: Tables with multiple header rows now preserve each row separately.
|
||||
|
||||
**Previous issue**: Multi-row headers were collapsed into a single line, causing "Investment portfolio" row to disappear in Apple 10-K table 20.
|
||||
|
||||
**Solution**: Modified `render_table_data()` and `_build_table()` to preserve each header row as a separate line.
|
||||
|
||||
**Status**: ✅ FIXED
|
||||
|
||||
### 3. Multi-Line Cell Rendering
|
||||
|
||||
**Fixed**: Cells containing newline characters (`\n`) now render as multiple lines.
|
||||
|
||||
**Previous issue**: Multi-line cells like "Interest Rate\nSensitive Instrument" were truncated to first line only.
|
||||
|
||||
**Solution**: Added `_format_multiline_row()` to split cells by `\n` and render each line separately.
|
||||
|
||||
**Status**: ✅ FIXED
|
||||
|
||||
### Performance Impact
|
||||
|
||||
All three fixes maintain excellent performance:
|
||||
- **Speedup**: 7-10x faster than Rich (down from initial 14x, but with correct output)
|
||||
- **Correctness**: Now matches Rich output exactly for colspan, multi-row headers, and multi-line cells
|
||||
- **Production ready**: Can confidently use as default renderer
|
||||
|
||||
---
|
||||
|
||||
## Known Limitations
|
||||
|
||||
### 1. Column Alignment in Some Tables
|
||||
|
||||
**Issue**: Currency symbols and values may have extra spacing in some complex tables (e.g., Apple 10-K table 22)
|
||||
|
||||
**Example**:
|
||||
- Rich: `$294,866`
|
||||
- Fast: `$ 294,866` (extra spacing)
|
||||
|
||||
**Root cause**: Column width calculation creates wider columns for some currency/value pairs after colspan expansion and column filtering.
|
||||
|
||||
**Impact**: Visual appearance differs slightly, but data is correct and readable.
|
||||
|
||||
**Status**: ⚠️ Minor visual difference - acceptable trade-off for 10x performance gain
|
||||
|
||||
### 2. Visual Polish
|
||||
|
||||
**Issue**: Some visual aspects don't exactly match Rich's sophisticated layout
|
||||
|
||||
**Examples**:
|
||||
- Multi-line cell wrapping may differ
|
||||
- Column alignment in edge cases
|
||||
|
||||
**Status**: ⚠️ Acceptable trade-off for 8-10x performance gain
|
||||
|
||||
---
|
||||
|
||||
## Configuration Options
|
||||
|
||||
### Table Styles
|
||||
|
||||
Fast renderer supports different visual styles:
|
||||
|
||||
```python
|
||||
from edgar.documents.renderers.fast_table import FastTableRenderer, TableStyle
|
||||
|
||||
# Pipe table style (optional) - markdown compatible
|
||||
renderer = FastTableRenderer(TableStyle.pipe_table())
|
||||
|
||||
# Minimal style - no borders, just spacing
|
||||
renderer = FastTableRenderer(TableStyle.minimal())
|
||||
```
|
||||
|
||||
### Minimal Style Output
|
||||
|
||||
```
|
||||
December 31, 2023 December 31, 2022 December 31, 2021
|
||||
Revenue 365,817 394,328 365,817
|
||||
Cost of revenue 223,546 212,981 192,266
|
||||
Gross profit 142,271 181,347 173,551
|
||||
```
|
||||
|
||||
**Note**: Minimal style has cleaner appearance but loses column boundaries
|
||||
|
||||
---
|
||||
|
||||
## Technical Details
|
||||
|
||||
### How It Works
|
||||
|
||||
1. **Direct String Building**: Bypasses Rich's layout engine
|
||||
2. **Column Analysis**: Detects numeric columns for right-alignment
|
||||
3. **Smart Filtering**: Removes empty spacing columns
|
||||
4. **Currency Merging**: Combines `$` symbols with amounts
|
||||
5. **Width Calculation**: Measures content, applies min/max limits
|
||||
|
||||
### Code Path
|
||||
|
||||
```python
|
||||
# When fast_table_rendering=True:
|
||||
table.text()
|
||||
→ TableNode._fast_text_rendering()
|
||||
→ FastTableRenderer.render_table_node()
|
||||
→ Direct string building
|
||||
```
|
||||
|
||||
### Memory Benefits
|
||||
|
||||
Fast rendering avoids:
|
||||
- Rich Console object creation (~0.4MB per document)
|
||||
- Intermediate rich.Table objects
|
||||
- Style/theme processing overhead
|
||||
- ANSI escape code generation
|
||||
|
||||
---
|
||||
|
||||
## Future Improvements
|
||||
|
||||
### Planned Enhancements
|
||||
|
||||
1. **Match Rich's `box.SIMPLE` Style** (✅ Completed 2025-10-08 — `simple()` is now the default style)
|
||||
- **Remove all pipe characters** - no outer border, no column separators
|
||||
- **Keep only horizontal separator** under header (using `─` character)
|
||||
- **Increase internal padding** to match Rich's generous spacing
|
||||
- **Clean, minimalist appearance** like Rich's SIMPLE box style
|
||||
   - **Goal**: Match Rich visual quality, still 7-10x faster
|
||||
|
||||
2. **Improved Layout Engine**
|
||||
- Better column width calculation (avoid too-wide/too-narrow columns)
|
||||
- Respect natural content breaks
|
||||
- Dynamic spacing based on content type
|
||||
- Handle wrapping for long content
|
||||
|
||||
3. **Dynamic Padding**
|
||||
- Match Rich's generous spacing (currently too tight)
|
||||
- Adjust padding based on content type
|
||||
- Configurable padding rules
|
||||
- Maintain alignment with variable padding
|
||||
|
||||
4. **Header Handling**
|
||||
- Better multi-row header collapse
|
||||
- Preserve important hierarchies
|
||||
- Smart column spanning
|
||||
- Honor header groupings
|
||||
|
||||
5. **Style Presets**
|
||||
- `TableStyle.simple()` - Match Rich's `box.SIMPLE` (no borders, header separator only) ⭐ **PRIMARY GOAL**
|
||||
- `TableStyle.minimal()` - no borders, just spacing (already implemented)
|
||||
   - `TableStyle.pipe_table()` - markdown style with borders (optional)
|
||||
- `TableStyle.ascii_clean()` - no Unicode, pure ASCII
|
||||
- `TableStyle.compact()` - minimal spacing for dense data
|
||||
|
||||
### Timeline
|
||||
|
||||
These improvements are **planned for Phase 2** of the HTML parser optimization work (after memory leak fixes).
|
||||
|
||||
---
|
||||
|
||||
## Migration Guide
|
||||
|
||||
### From Rich to Fast
|
||||
|
||||
**Before** (using Rich):
|
||||
```python
|
||||
doc = parse_html(html)
|
||||
table_text = doc.tables[0].text() # Slow but pretty
|
||||
```
|
||||
|
||||
**After** (using Fast):
|
||||
```python
|
||||
config = ParserConfig(fast_table_rendering=True)
|
||||
doc = parse_html(html, config=config)
|
||||
table_text = doc.tables[0].text()  # Fast, with correct colspan/rowspan handling
|
||||
```
|
||||
|
||||
### Hybrid Approach
|
||||
|
||||
Use fast rendering during processing, Rich for final display:
|
||||
|
||||
```python
|
||||
# Fast processing
|
||||
config = ParserConfig(fast_table_rendering=True)
|
||||
doc = parse_html(html, config=config)
|
||||
|
||||
# Extract data quickly
|
||||
for table in doc.tables:
|
||||
data = table.text() # Fast
|
||||
# Process data...
|
||||
|
||||
# Display one table nicely
|
||||
special_table = doc.tables[5]
|
||||
rich_output = special_table.render() # Switch to Rich for display
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Performance Recommendations
|
||||
|
||||
### Recommended Settings by Use Case
|
||||
|
||||
**Batch Processing** (optimize for speed):
|
||||
```python
|
||||
config = ParserConfig.for_performance()
|
||||
# Includes: fast_table_rendering=True, eager_section_extraction=False
|
||||
```
|
||||
|
||||
**Data Extraction** (balance speed and accuracy):
|
||||
```python
|
||||
config = ParserConfig(
|
||||
fast_table_rendering=True,
|
||||
extract_xbrl=True,
|
||||
detect_sections=True
|
||||
)
|
||||
```
|
||||
|
||||
**Display/Reports** (optimize for quality):
|
||||
```python
|
||||
config = ParserConfig(fast_table_rendering=False)  # Use Rich for display quality
|
||||
# Or explicitly:
|
||||
config = ParserConfig.for_accuracy()
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## FAQ
|
||||
|
||||
**Q: Can I mix Fast and Rich rendering?**
|
||||
A: Not per-table. The setting is document-wide via ParserConfig. However, you can manually call `table.render()` to get Rich output.
|
||||
|
||||
**Q: Does this affect section extraction?**
|
||||
A: Indirectly, yes. Section detection calls `text()` on the entire document, which includes tables. Fast rendering speeds this up significantly.
|
||||
|
||||
**Q: Will the output format change?**
|
||||
A: Yes, as we improve the renderer. We'll maintain backward compatibility via style options.
|
||||
|
||||
**Q: Can I customize the appearance?**
|
||||
A: Choose between `TableStyle.simple()` (default), `TableStyle.pipe_table()`, and `TableStyle.minimal()`. More options coming.
|
||||
|
||||
**Q: What about DataFrame export?**
|
||||
A: Fast rendering only affects text output. `table.to_dataframe()` is unaffected.
|
||||
|
||||
---
|
||||
|
||||
## Feedback
|
||||
|
||||
The fast renderer is actively being improved based on user feedback. Known issues:
|
||||
|
||||
1. ❌ **Pipe characters** - visual noise
|
||||
2. ❌ **Layout engine** - inconsistent spacing
|
||||
3. ❌ **Padding** - needs tuning
|
||||
|
||||
If you have specific rendering issues or suggestions, please provide:
|
||||
- Sample table HTML
|
||||
- Expected vs actual output
|
||||
- Use case description
|
||||
|
||||
This helps prioritize improvements while maintaining the performance advantage.
|
||||
|
||||
---
|
||||
|
||||
## Summary
|
||||
|
||||
### Current State (As of 2025-10-08)
|
||||
|
||||
**Performance**: ✅ Excellent (8-10x faster than Rich)
|
||||
**Correctness**: ✅ Production ready (proper colspan/rowspan handling)
|
||||
**Visual Quality**: ⚠️ Good (simple() style matches Rich's box.SIMPLE appearance)
|
||||
**Use Case**: Production-ready for all use cases
|
||||
|
||||
### Recent Milestones
|
||||
|
||||
**✅ Completed**:
|
||||
- Core fast rendering implementation
|
||||
- TableStyle.simple() preset (borderless, clean)
|
||||
- Column filtering and merging
|
||||
- Numeric alignment detection
|
||||
- **Colspan/rowspan support via TableMatrix**
|
||||
- **Performance benchmarking with real tables**
|
||||
|
||||
**🔧 Current Limitations**:
|
||||
- Multi-row header collapsing differs from Rich
|
||||
- Some visual polish differences (acceptable for speed gain)
|
||||
- Layout engine not as sophisticated as Rich
|
||||
|
||||
### Development Roadmap
|
||||
|
||||
**Phase 1** (✅ COMPLETED):
|
||||
- ✅ Core fast rendering implementation
|
||||
- ✅ Simple() style matching Rich's box.SIMPLE
|
||||
- ✅ Proper colspan/rowspan handling via TableMatrix
|
||||
- ✅ Production-ready performance (8-10x faster)
|
||||
|
||||
**Phase 2** (Future Enhancements):
|
||||
- 📋 Improve multi-row header handling
|
||||
- 📋 Better layout engine for perfect column widths
|
||||
- 📋 Additional style presets
|
||||
- 📋 Advanced header detection (data vs labels)
|
||||
|
||||
### Bottom Line
|
||||
|
||||
Fast table rendering is **production-ready and now the default** for all table text extraction in EdgarTools.
|
||||
|
||||
**Benefits**:
|
||||
- ✅ 7-10x faster than Rich rendering
|
||||
- ✅ Correct data extraction with proper colspan/rowspan handling
|
||||
- ✅ Multi-row header preservation
|
||||
- ✅ Multi-line cell rendering
|
||||
- ✅ Clean, borderless appearance (simple() style)
|
||||
|
||||
**Minor differences from Rich**:
|
||||
- ⚠️ Some tables have extra spacing between currency symbols and values (e.g., table 22)
|
||||
- ⚠️ Column width calculation may differ slightly in complex tables
|
||||
- ✅ All data is preserved and correct - only visual presentation differs
|
||||
|
||||
The implementation achieves **correct data extraction** with **significant performance gains** and **clean visual output**, making it the ideal default for EdgarTools.
|
||||
|
||||
---
|
||||
|
||||
## Related Documentation
|
||||
|
||||
- [HTML Parser Status](HTML_PARSER_STATUS.md) - Overall parser progress
|
||||
- [Performance Analysis](../perf/hotpath_analysis.md) - Profiling results showing Rich rendering bottleneck
|
||||
- [Memory Analysis](../perf/memory_analysis.md) - Memory leak issues with Rich objects
|
||||
164
venv/lib/python3.10/site-packages/edgar/documents/docs/goals.md
Normal file
164
venv/lib/python3.10/site-packages/edgar/documents/docs/goals.md
Normal file
@@ -0,0 +1,164 @@
|
||||
# Goals
|
||||
|
||||
## Mission
|
||||
Replace `edgar.files` with a parser that is better in **every way** - utility, accuracy, and user experience. The maintainer is the final judge: output must look correct when printed.
|
||||
|
||||
## Core Principles
|
||||
|
||||
### Primary Goal: AI Context Optimization
|
||||
- **Token efficiency**: 30-50% reduction vs raw HTML while preserving semantic meaning
|
||||
- **Chunking support**: Enable independent processing of sections/tables for LLM context windows
|
||||
- **Clean text output**: Tables rendered in LLM-friendly formats (clean text, markdown)
|
||||
- **Semantic preservation**: Extract meaning, not just formatting
|
||||
|
||||
### Secondary Goal: Human Readability
|
||||
- **Rich console output**: Beautiful rendering with proper table alignment
|
||||
- **Markdown export**: Professional-looking document conversion
|
||||
- **Section navigation**: Easy access to specific Items/sections
|
||||
|
||||
## User-Focused Feature Goals
|
||||
|
||||
### 1. Text Extraction
|
||||
- Extract full document text without dropping meaningful content
|
||||
- Preserve paragraph structure and semantic whitespace
|
||||
- Handle inline XBRL facts gracefully (show values, not raw tags)
|
||||
- Clean HTML artifacts automatically (scripts, styles, page numbers)
|
||||
- **Target**: 99%+ accuracy vs manual reading
|
||||
|
||||
### 2. Section Extraction (10-K, 10-Q, 8-K)
|
||||
- Detect >90% of standard sections for >90% of test tickers
|
||||
- Support flexible access: `doc.sections['Item 1A']`, `doc['1A']`, `doc.risk_factors`
|
||||
- Return Section objects with `.text()`, `.tables`, `.search()` methods
|
||||
- Include confidence scores and detection method metadata
|
||||
- **Target**: Better recall than old parser (quantify with test suite)
|
||||
|
||||
### 3. Table Extraction
|
||||
- Extract all meaningful data tables (ignore pure layout tables)
|
||||
- Accurate rendering with aligned columns and proper formatting
|
||||
- Handle complex tables (rowspan, colspan, nested headers)
|
||||
- Preserve table captions and surrounding context
|
||||
- Support DataFrame conversion for data analysis
|
||||
- **Target**: 95%+ accuracy on test corpus
|
||||
|
||||
### 4. Search Capabilities
|
||||
- Text search within documents
|
||||
- Regex pattern matching
|
||||
- Semantic search preparation (structure for embedding-based search)
|
||||
- Search within sections for focused queries
|
||||
|
||||
### 5. Multiple Output Formats
|
||||
- Plain text (optimized for LLM context)
|
||||
- Markdown (for documentation/sharing)
|
||||
- Rich console (beautiful terminal display)
|
||||
- JSON (structured data export)
|
||||
|
||||
### 6. Developer Experience
|
||||
- Intuitive API: `doc.text()`, `doc.tables`, `doc.sections`
|
||||
- Rich objects with useful methods (not just strings)
|
||||
- Simple tasks simple, complex tasks possible
|
||||
- Helpful error messages with recovery suggestions
|
||||
- **Target**: New users productive in <10 minutes
|
||||
|
||||
|
||||
|
||||
## Performance Targets
|
||||
|
||||
### Speed Benchmarks (Based on Current Performance)
|
||||
- **Small docs (<5MB)**: <500ms ✅ *Currently 96ms - excellent*
|
||||
- **Medium docs (5-20MB)**: <2s ✅ *Currently 1.19s - excellent*
|
||||
- **Large docs (>50MB)**: <10s ✅ *Currently 0.59s - excellent*
|
||||
- **Throughput**: >3MB/s sustained ✅ *Currently 3.8MB/s*
|
||||
- **Target**: Maintain or improve on all benchmarks
|
||||
|
||||
### Memory Efficiency
|
||||
- **Small docs (<5MB)**: <3x document size *(currently 9x - needs optimization)*
|
||||
- **Large docs (>10MB)**: <2x document size *(currently 1.9x - good)*
|
||||
- **No memory spikes**: Never exceed 5x document size *(MSFT currently 5.4x)*
|
||||
- **Target**: Consistent 2-3x overhead across all document sizes
|
||||
|
||||
### Accuracy Benchmarks
|
||||
- **Section detection recall**: >90% on 20-ticker test set
|
||||
- **Table extraction accuracy**: >95% on manual validation set
|
||||
- **Text fidelity**: >99% semantic equivalence to source HTML
|
||||
- **XBRL fact extraction**: 100% of inline facts captured correctly
|
||||
|
||||
## Implementation Details
|
||||
|
||||
### HTML Parsing
|
||||
- Read the entire HTML document without dropping semantically meaningful content
|
||||
- Drop non-meaningful content (scripts, styles, pure formatting tags)
|
||||
- Preserve semantic structure (headings, paragraphs, lists)
|
||||
- Handle both old (pre-2015) and modern (inline XBRL) formats
|
||||
- Graceful degradation for malformed HTML
|
||||
|
||||
### Table Parsing
|
||||
- Extract tables containing meaningful data
|
||||
- Ignore layout tables (unless they aid document understanding)
|
||||
- Accurate rendering with proper column alignment
|
||||
- Handle complex structures: rowspan, colspan, nested headers, multi-level headers
|
||||
- Preserve table captions and contextual information
|
||||
- Support conversion to pandas DataFrame
|
||||
|
||||
### Section Extraction
|
||||
- Detect standard sections (Item 1, 1A, 7, etc.) for 10-K, 10-Q, 8-K filings
|
||||
- Support multiple detection strategies: TOC-based, heading-based, pattern-based
|
||||
- Return Section objects with full API: `.text()`, `.text_without_tables()`, `.tables`, `.search()`
|
||||
- Include metadata: confidence scores, detection method, position
|
||||
- Better recall than old parser (establish baseline with test suite)
|
||||
|
||||
## Quality Gates Before Replacing edgar.files
|
||||
|
||||
### Automated Tests
|
||||
- [ ] All existing tests pass with new parser (1000+ tests)
|
||||
- [ ] Performance regression tests (<5% slower on any document)
|
||||
- [ ] Memory regression tests (no >10% increases)
|
||||
- [ ] Section detection accuracy >90% on test corpus
|
||||
- [ ] Table extraction accuracy >95% on validation set
|
||||
|
||||
### Manual Validation (Maintainer Review)
|
||||
- [ ] Print full document text for 10 sample filings → verify quality
|
||||
- [ ] Compare table rendering old vs new → verify improvement
|
||||
- [ ] Test section extraction on edge cases → verify robustness
|
||||
- [ ] Review markdown output → verify professional appearance
|
||||
- [ ] Check memory usage → verify no concerning spikes
|
||||
|
||||
### Documentation Requirements
|
||||
- [ ] Migration guide (old API → new API with examples)
|
||||
- [ ] Updated user guide showing new features
|
||||
- [ ] Performance comparison report (old vs new)
|
||||
- [ ] Known limitations documented clearly
|
||||
- [ ] API reference complete for all public methods
|
||||
|
||||
## Success Metrics
|
||||
|
||||
### Launch Criteria
|
||||
1. **Speed**: Equal or faster on 95% of test corpus
|
||||
2. **Accuracy**: Maintainer approves output quality on sample set
|
||||
3. **API**: Clean, intuitive interface (no confusion)
|
||||
4. **Tests**: Zero regressions, 95%+ coverage on new code
|
||||
5. **Docs**: Complete with examples for all major use cases
|
||||
|
||||
### Post-Launch Monitoring
|
||||
- Issue reports: <5% related to parser quality/accuracy
|
||||
- User feedback: Positive sentiment on ease of use
|
||||
- Performance: No degradation over time (regression tests)
|
||||
- Adoption: Smooth migration from old parser (deprecation path)
|
||||
|
||||
## Feature Parity with Old Parser
|
||||
|
||||
### Must-Have (Required for Migration)
|
||||
- ✅ Get document text (with/without tables)
|
||||
- ✅ Extract specific sections by name/number
|
||||
- ✅ List all tables in document
|
||||
- ✅ Search document content
|
||||
- ✅ Convert to markdown
|
||||
- ✅ Handle both old and new SEC filing formats
|
||||
- ✅ Graceful error handling
|
||||
|
||||
### Nice-to-Have (Improvements Over Old Parser)
|
||||
- 🎯 Semantic search capabilities
|
||||
- 🎯 Better subsection extraction within Items
|
||||
- 🎯 Table-of-contents navigation
|
||||
- 🎯 Export to multiple formats (JSON, clean HTML)
|
||||
- 🎯 Batch processing optimizations
|
||||
- 🎯 Section confidence scores and metadata
|
||||
@@ -0,0 +1,240 @@
|
||||
# HTML Parser Rewrite Technical Overview
|
||||
|
||||
## Executive Summary
|
||||
|
||||
The `edgar/documents` module represents a comprehensive rewrite of the HTML parsing capabilities originally implemented in `edgar/files`. This new parser is designed to provide superior parsing accuracy, structured data extraction, and rendering quality for SEC filing documents. The rewrite introduces a modern, extensible architecture with specialized components for handling the complex structure of financial documents.
|
||||
|
||||
## Architecture Overview
|
||||
|
||||
### Core Components
|
||||
|
||||
#### 1. Document Object Model
|
||||
The new parser introduces a sophisticated node-based document model:
|
||||
|
||||
- **Document**: Top-level container with metadata and sections
|
||||
- **Node Hierarchy**: Abstract base classes for all document elements
|
||||
- `DocumentNode`: Root document container
|
||||
- `TextNode`: Plain text content
|
||||
- `ParagraphNode`: Paragraph elements with styling
|
||||
- `HeadingNode`: Headers with levels 1-6
|
||||
- `ContainerNode`: Generic containers (div, section)
|
||||
- `SectionNode`: Document sections with semantic meaning
|
||||
- `ListNode`/`ListItemNode`: Ordered and unordered lists
|
||||
- `LinkNode`: Hyperlinks with metadata
|
||||
- `ImageNode`: Images with attributes
|
||||
|
||||
#### 2. Table Processing System
|
||||
Advanced table handling represents a major improvement over the old parser:
|
||||
|
||||
- **TableNode**: Sophisticated table representation with multi-level headers
|
||||
- **Cell**: Individual cell with colspan/rowspan support and type detection
|
||||
- **Row**: Table row with header detection and semantic classification
|
||||
- **TableMatrix**: Handles complex cell spanning and alignment
|
||||
- **CurrencyColumnMerger**: Intelligently merges currency symbols with values
|
||||
- **ColumnAnalyzer**: Detects spacing columns and optimizes layout
|
||||
|
||||
#### 3. Parser Pipeline
|
||||
The parsing process follows a well-defined pipeline:
|
||||
|
||||
1. **HTMLParser**: Main orchestration class
|
||||
2. **HTMLPreprocessor**: Cleans and normalizes HTML
|
||||
3. **DocumentBuilder**: Converts HTML tree to document nodes
|
||||
4. **Strategy Pattern**: Pluggable parsing strategies
|
||||
5. **DocumentPostprocessor**: Final cleanup and optimization
|
||||
|
||||
### Key Improvements Over Old Parser
|
||||
|
||||
#### Table Processing Enhancements
|
||||
|
||||
**Old Parser (`edgar/files`)**:
|
||||
- Basic table extraction using BeautifulSoup
|
||||
- Limited colspan/rowspan handling
|
||||
- Simple text-based rendering
|
||||
- Manual column alignment
|
||||
- Currency symbols often misaligned
|
||||
|
||||
**New Parser (`edgar/documents`)**:
|
||||
- Advanced table matrix system for perfect cell alignment
|
||||
- Intelligent header detection (multi-row headers, year detection)
|
||||
- Automatic currency column merging ($1,234 instead of $ | 1,234)
|
||||
- Semantic table type detection (FINANCIAL, METRICS, TOC, etc.)
|
||||
- Rich table rendering with proper formatting
|
||||
- Smart column width calculation
|
||||
- Enhanced numeric formatting with comma separators
|
||||
|
||||
#### Document Structure
|
||||
|
||||
**Old Parser**:
|
||||
- Flat block-based structure
|
||||
- Limited semantic understanding
|
||||
- Basic text extraction
|
||||
|
||||
**New Parser**:
|
||||
- Hierarchical node-based model
|
||||
- Semantic section detection
|
||||
- Rich metadata preservation
|
||||
- XBRL fact extraction
|
||||
- Search capabilities
|
||||
- Multiple output formats (text, markdown, JSON, pandas)
|
||||
|
||||
#### Rendering Quality
|
||||
|
||||
**Old Parser**:
|
||||
- Basic text output
|
||||
- Limited table formatting
|
||||
- No styling preservation
|
||||
|
||||
**New Parser**:
|
||||
- Multiple renderers (text, markdown, Rich console)
|
||||
- Preserves document structure and styling
|
||||
- Configurable output options
|
||||
- LLM-optimized formatting
|
||||
|
||||
## Implementation Details
|
||||
|
||||
### Configuration System
|
||||
|
||||
The new parser uses a comprehensive configuration system:
|
||||
|
||||
```python
|
||||
@dataclass
|
||||
class ParserConfig:
|
||||
# Size limits
|
||||
max_document_size: int = 50 * 1024 * 1024 # 50MB
|
||||
streaming_threshold: int = 10 * 1024 * 1024 # 10MB
|
||||
|
||||
# Processing options
|
||||
preserve_whitespace: bool = False
|
||||
detect_sections: bool = True
|
||||
extract_xbrl: bool = True
|
||||
table_extraction: bool = True
|
||||
detect_table_types: bool = True
|
||||
```
|
||||
|
||||
### Strategy Pattern Implementation
|
||||
|
||||
The parser uses pluggable strategies for different aspects:
|
||||
|
||||
- **HeaderDetectionStrategy**: Identifies document sections
|
||||
- **TableProcessor**: Handles table extraction and classification
|
||||
- **XBRLExtractor**: Extracts XBRL facts and metadata
|
||||
- **StyleParser**: Processes CSS styling information
|
||||
|
||||
### Table Processing Deep Dive
|
||||
|
||||
The table processing system represents the most significant improvement:
|
||||
|
||||
#### Header Detection Algorithm
|
||||
- Analyzes cell content patterns (th vs td elements)
|
||||
- Detects year patterns in financial tables
|
||||
- Identifies period indicators (quarters, fiscal years)
|
||||
- Handles multi-row headers with units and descriptions
|
||||
- Prevents misclassification of data rows as headers
|
||||
|
||||
#### Cell Type Detection
|
||||
- Numeric vs text classification
|
||||
- Currency value recognition
|
||||
- Percentage handling
|
||||
- Em dash and null value detection
|
||||
- Proper number formatting with thousand separators
|
||||
|
||||
#### Matrix Building
|
||||
- Handles colspan and rowspan expansion
|
||||
- Maintains cell relationships
|
||||
- Optimizes column layout
|
||||
- Removes spacing columns automatically
|
||||
|
||||
### XBRL Integration
|
||||
|
||||
The new parser includes sophisticated XBRL processing:
|
||||
- Extracts facts before preprocessing to preserve ix:hidden content
|
||||
- Maintains metadata relationships
|
||||
- Supports inline XBRL transformations
|
||||
- Preserves semantic context
|
||||
|
||||
## Performance Characteristics
|
||||
|
||||
### Memory Efficiency
|
||||
- Streaming support for large documents (>10MB)
|
||||
- Lazy loading of document sections
|
||||
- Caching for repeated operations
|
||||
- Memory-efficient node representation
|
||||
|
||||
### Processing Speed
|
||||
- Optimized HTML parsing with lxml
|
||||
- Configurable processing strategies
|
||||
- Parallel extraction capabilities
|
||||
- Smart caching of expensive operations
|
||||
|
||||
## Migration and Compatibility
|
||||
|
||||
### API Compatibility
|
||||
The new parser maintains high-level compatibility with the old parser while offering enhanced functionality:
|
||||
|
||||
```python
|
||||
# Old way
|
||||
from edgar.files import FilingDocument
|
||||
doc = FilingDocument(html)
|
||||
text = doc.text()
|
||||
|
||||
# New way
|
||||
from edgar.documents import HTMLParser
|
||||
parser = HTMLParser()
|
||||
doc = parser.parse(html)
|
||||
text = doc.text()
|
||||
```
|
||||
|
||||
### Feature Parity
|
||||
All major features from the old parser are preserved:
|
||||
- Text extraction
|
||||
- Table conversion to DataFrame
|
||||
- Section detection
|
||||
- Metadata extraction
|
||||
|
||||
### Enhanced Features
|
||||
New capabilities not available in the old parser:
|
||||
- Rich console rendering
|
||||
- Markdown export
|
||||
- Advanced table semantics
|
||||
- XBRL fact extraction
|
||||
- Document search
|
||||
- LLM optimization
|
||||
- Multiple output formats
|
||||
|
||||
## Current Status and Next Steps
|
||||
|
||||
### Completed Components
|
||||
- ✅ Core document model
|
||||
- ✅ HTML parsing pipeline
|
||||
- ✅ Advanced table processing
|
||||
- ✅ Multiple renderers (text, markdown, Rich)
|
||||
- ✅ XBRL extraction
|
||||
- ✅ Configuration system
|
||||
- ✅ Streaming support
|
||||
|
||||
### Remaining Work
|
||||
- 🔄 Performance optimization and benchmarking
|
||||
- 🔄 Comprehensive test coverage migration
|
||||
- 🔄 Error handling improvements
|
||||
- 🔄 Documentation and examples
|
||||
- 🔄 Validation against large corpus of filings
|
||||
|
||||
### Testing Strategy
|
||||
The rewrite requires extensive validation:
|
||||
- Comparison testing against old parser output
|
||||
- Financial table accuracy verification
|
||||
- Performance benchmarking
|
||||
- Edge case handling
|
||||
- Integration testing with existing workflows
|
||||
|
||||
## Conclusion
|
||||
|
||||
The `edgar/documents` rewrite represents a significant advancement in SEC filing processing capabilities. The new architecture provides:
|
||||
|
||||
1. **Better Accuracy**: Advanced table processing and semantic understanding
|
||||
2. **Enhanced Functionality**: Multiple output formats and rich rendering
|
||||
3. **Improved Maintainability**: Clean, modular architecture with clear separation of concerns
|
||||
4. **Future Extensibility**: Plugin architecture for new parsing strategies
|
||||
5. **Performance**: Streaming support and optimized processing for large documents
|
||||
|
||||
The modular design ensures that improvements can be made incrementally while maintaining backward compatibility. The sophisticated table processing system alone represents a major advancement in handling complex financial documents accurately.
|
||||
@@ -0,0 +1,208 @@
|
||||
# HTML Parser Quality Improvement Strategy
|
||||
|
||||
## Overview
|
||||
|
||||
Simple, iterative testing strategy for the HTML parser rewrite. The goal is rapid feedback loops where we compare OLD vs NEW parser output, identify visual/functional issues, fix them, and repeat until satisfied.
|
||||
|
||||
## Test Corpus
|
||||
|
||||
### 10 Representative Documents
|
||||
|
||||
Selected to cover different filing types, companies, and edge cases:
|
||||
|
||||
| # | Company | Filing Type | File Path | Rationale |
|
||||
|---|---------|-------------|-----------|-----------|
|
||||
| 1 | Apple | 10-K | `data/html/Apple.10-K.html` | Large complex filing, existing test file |
|
||||
| 2 | Oracle | 10-K | `data/html/Oracle.10-K.html` | Complex financials, existing test file |
|
||||
| 3 | Nvidia | 10-K | `data/html/Nvidia.10-K.html` | Tech company, existing test file |
|
||||
| 4 | Microsoft | 10-K | `data/html/Microsoft.10-K.html` | Popular company, complex tables |
|
||||
| 5 | Tesla | 10-K | `data/html/Tesla.10-K.html` | Manufacturing sector, different formatting |
|
||||
| 6 | [TBD] | 10-Q | TBD | Quarterly report format |
|
||||
| 7 | [TBD] | 10-Q | TBD | Another quarterly for variety |
|
||||
| 8 | [TBD] | 8-K | `data/html/BuckleInc.8-K.html` | Event-driven filing |
|
||||
| 9 | [TBD] | Proxy (DEF 14A) | TBD | Proxy statement with compensation tables |
|
||||
| 10 | [TBD] | Edge case | TBD | Unusual formatting or very large file |
|
||||
|
||||
**Note**: Fill in TBD entries as we identify good test candidates.
|
||||
|
||||
## The 4-Step Loop
|
||||
|
||||
### Step 1: Run Comparison
|
||||
|
||||
Use existing test scripts to compare OLD vs NEW parsers:
|
||||
|
||||
```bash
|
||||
# Full comparison with metrics
|
||||
python tests/manual/check_parser_comparison.py
|
||||
|
||||
# Table-focused comparison with rendering
|
||||
python tests/manual/check_tables.py
|
||||
|
||||
# Or run on specific file
|
||||
python tests/manual/check_html_rewrite.py
|
||||
```
|
||||
|
||||
**Outputs to review**:
|
||||
- Console output with side-by-side Rich panels
|
||||
- Metrics (parse time, table count, section detection)
|
||||
- Rendered tables (old vs new)
|
||||
|
||||
### Step 2: Human Review
|
||||
|
||||
**Visual Inspection Process**:
|
||||
1. Look at console output directly (Rich rendering)
|
||||
2. For detailed text comparison, optionally dump to files:
|
||||
- OLD parser: `doc.text()` → `output/old_apple.txt`
|
||||
- NEW parser: `doc.text()` → `output/new_apple.txt`
|
||||
- Use `diff` or visual diff tool
|
||||
3. Take screenshots for complex table issues
|
||||
4. Focus on:
|
||||
- Table alignment and formatting
|
||||
- Currency symbol placement (should be merged: `$1,234` not `$ | 1,234`)
|
||||
- Column count (fewer is better after removing spacing columns)
|
||||
- Section detection accuracy
|
||||
- Text readability for LLM context
|
||||
|
||||
**Quality Criteria** (from goals.md):
|
||||
- Semantic meaning preserved
|
||||
- Tables render correctly when printed
|
||||
- Better than old parser in speed, accuracy, features
|
||||
- **You are the final judge**: "Does this look right?"
|
||||
|
||||
### Step 3: Document Bugs
|
||||
|
||||
Record issues in the tracker below as you find them:
|
||||
|
||||
| Bug # | Status | Priority | Description | File/Location | Notes |
|
||||
|-------|--------|----------|-------------|---------------|-------|
|
||||
| Example | Fixed | High | Currency symbols not merging in balance sheet | Apple 10-K, Table 5 | Issue in CurrencyColumnMerger |
|
||||
| | | | | | |
|
||||
| | | | | | |
|
||||
| | | | | | |
|
||||
|
||||
**Status values**: Open, In Progress, Fixed, Won't Fix, Deferred
|
||||
**Priority values**: Critical, High, Medium, Low
|
||||
|
||||
**Bug Description Template**:
|
||||
- What's wrong: Clear description of the issue
|
||||
- Where: Which file/table/section
|
||||
- Expected: What it should look like
|
||||
- Actual: What it currently looks like
|
||||
- Impact: How it affects usability/readability
|
||||
|
||||
### Step 4: Fix & Repeat
|
||||
|
||||
1. Pick highest priority bug
|
||||
2. Fix the code
|
||||
3. Re-run comparison on affected file(s)
|
||||
4. Verify fix doesn't break other files
|
||||
5. Mark bug as Fixed
|
||||
6. Repeat until exit criteria met
|
||||
|
||||
**Quick verification**:
|
||||
```bash
|
||||
# Re-run just the problematic file
|
||||
python -c "
|
||||
from edgar.documents import parse_html
|
||||
from pathlib import Path
|
||||
html = Path('data/html/Apple.10-K.html').read_text()
|
||||
doc = parse_html(html)
|
||||
# Quick inspection
|
||||
print(f'Tables: {len(doc.tables)}')
|
||||
print(doc.tables[5].render(width=200)) # Check specific table
|
||||
"
|
||||
```
|
||||
|
||||
## Exit Criteria
|
||||
|
||||
We're done when:
|
||||
1. ✅ All 10 test documents parse successfully
|
||||
2. ✅ Visual output looks correct (maintainer approval)
|
||||
3. ✅ Tables render cleanly with proper alignment
|
||||
4. ✅ No critical or high priority bugs remain
|
||||
5. ✅ Performance is equal or better than old parser
|
||||
6. ✅ Text extraction is complete and clean for AI context
|
||||
|
||||
**Final approval**: Maintainer says "This is good enough to ship."
|
||||
|
||||
## Testing Infrastructure
|
||||
|
||||
### Primary Tool: compare_parsers.py
|
||||
|
||||
Simple command-line tool for the quality improvement loop:
|
||||
|
||||
```bash
|
||||
# Quick overview comparison (using shortcuts!)
|
||||
python tests/manual/compare_parsers.py aapl
|
||||
|
||||
# See all tables in a document
|
||||
python tests/manual/compare_parsers.py aapl --tables
|
||||
|
||||
# Compare specific table (OLD vs NEW side-by-side)
|
||||
python tests/manual/compare_parsers.py aapl --table 5
|
||||
|
||||
# Compare text extraction
|
||||
python tests/manual/compare_parsers.py msft --text
|
||||
|
||||
# See section detection
|
||||
python tests/manual/compare_parsers.py orcl --sections
|
||||
|
||||
# Test with 10-Q filings
|
||||
python tests/manual/compare_parsers.py 'aapl 10-q'
|
||||
|
||||
# Run all test files at once
|
||||
python tests/manual/compare_parsers.py --all
|
||||
```
|
||||
|
||||
**Shortcuts available**:
|
||||
- Companies: `aapl`, `msft`, `tsla`, `nvda`, `orcl`
|
||||
- Filing types: `10-k` (default), `10-q`, `8-k`
|
||||
- Or use full file paths
|
||||
|
||||
**Features**:
|
||||
- Clean command-line interface
|
||||
- Side-by-side OLD vs NEW comparison
|
||||
- Rich console output with colors and tables
|
||||
- Performance metrics
|
||||
- Individual table inspection
|
||||
|
||||
### Other Available Scripts
|
||||
|
||||
Additional tools for specific testing:
|
||||
|
||||
- `tests/manual/check_parser_comparison.py` - Full comparison with metrics
|
||||
- `tests/manual/check_tables.py` - Table-specific comparison with rendering
|
||||
- `tests/manual/check_html_rewrite.py` - General HTML parsing checks
|
||||
- `tests/manual/check_html_parser_real_files.py` - Real filing tests
|
||||
|
||||
## Quick Reference
|
||||
|
||||
For day-to-day testing commands and usage examples, see [TESTING.md](TESTING.md).
|
||||
|
||||
## Notes
|
||||
|
||||
- **Keep it simple**: This is about rapid iteration, not comprehensive automation
|
||||
- **Visual inspection is key**: Automated metrics don't catch layout/formatting issues
|
||||
- **Use screenshots**: When describing bugs, screenshots speak louder than words
|
||||
- **Iterative approach**: Don't try to fix everything at once, prioritize
|
||||
- **Trust your judgment**: If it looks wrong, it probably is wrong
|
||||
|
||||
## Bug Tracker
|
||||
|
||||
### Active Issues
|
||||
|
||||
(Add bugs here as they're discovered)
|
||||
|
||||
### Fixed Issues
|
||||
|
||||
(Move completed bugs here for history)
|
||||
|
||||
### Deferred Issues
|
||||
|
||||
(Issues that aren't blocking release but could be improved later)
|
||||
|
||||
---
|
||||
|
||||
**Status**: Initial draft
|
||||
**Last Updated**: 2025-10-07
|
||||
**Maintainer**: Dwight Gunning
|
||||
931
venv/lib/python3.10/site-packages/edgar/documents/document.py
Normal file
931
venv/lib/python3.10/site-packages/edgar/documents/document.py
Normal file
@@ -0,0 +1,931 @@
|
||||
"""
|
||||
Document model for parsed HTML.
|
||||
"""
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Dict, List, Optional, Any, Iterator
|
||||
|
||||
from rich.table import Table as RichTable
|
||||
from rich.console import Group
|
||||
from rich.text import Text
|
||||
from edgar.richtools import repr_rich
|
||||
|
||||
from edgar.documents.nodes import Node, SectionNode
|
||||
from edgar.documents.table_nodes import TableNode
|
||||
from edgar.documents.types import XBRLFact, SearchResult
|
||||
|
||||
|
||||
@dataclass
class DocumentMetadata:
    """
    Metadata describing a parsed document.

    Captures provenance (source, filing identifiers, URL) together with
    parsing statistics such as document size, parse time and parser version.
    """
    # Provenance of the document and its SEC filing identifiers.
    source: Optional[str] = None
    form: Optional[str] = None
    company: Optional[str] = None
    cik: Optional[str] = None
    accession_number: Optional[str] = None
    filing_date: Optional[str] = None
    report_date: Optional[str] = None
    url: Optional[str] = None
    # Parsing statistics.
    size: int = 0
    parse_time: float = 0.0
    parser_version: str = "2.0.0"
    # Inline XBRL facts extracted during parsing, if extraction was enabled.
    xbrl_data: Optional[List[XBRLFact]] = None
    preserve_whitespace: bool = False
    original_html: Optional[str] = None  # Store original HTML for anchor analysis

    def to_dict(self) -> Dict[str, Any]:
        """Serialize metadata to a plain dictionary.

        XBRL facts are serialized via their own ``to_dict``; scalar fields
        are copied verbatim. ``preserve_whitespace`` and ``original_html``
        are intentionally not included.
        """
        facts = [fact.to_dict() for fact in self.xbrl_data] if self.xbrl_data else None
        plain_keys = (
            'source', 'form', 'company', 'cik', 'accession_number',
            'filing_date', 'report_date', 'url', 'size', 'parse_time',
            'parser_version',
        )
        result: Dict[str, Any] = {key: getattr(self, key) for key in plain_keys}
        result['xbrl_data'] = facts
        return result
|
||||
|
||||
|
||||
@dataclass
class Section:
    """
    A logical section of a filing document (e.g., Risk Factors, MD&A).

    Attributes:
        name: Section identifier (e.g., "item_1", "part_i_item_1", "risk_factors")
        title: Display title (e.g., "Item 1 - Business")
        node: Node containing section content
        start_offset: Character position where section starts
        end_offset: Character position where section ends
        confidence: Detection confidence score (0.0-1.0)
        detection_method: How section was detected ('toc', 'heading', 'pattern')
        validated: Whether section has been cross-validated
        part: Optional part identifier for 10-Q filings ("I", "II", or None for 10-K)
        item: Optional item identifier (e.g., "1", "1A", "2")
        _text_extractor: Optional callback for lazy text extraction (for TOC-based sections)
    """
    name: str
    title: str
    node: SectionNode
    start_offset: int = 0
    end_offset: int = 0
    confidence: float = 1.0  # Detection confidence (0.0-1.0)
    detection_method: str = 'unknown'  # 'toc', 'heading', 'pattern', or 'unknown'
    validated: bool = False  # Cross-validated flag
    part: Optional[str] = None  # Part identifier for 10-Q: "I", "II", or None for 10-K
    item: Optional[str] = None  # Item identifier: "1", "1A", "2", etc.
    _text_extractor: Optional[Any] = field(default=None, repr=False)  # Callback for lazy text extraction

    def text(self, **kwargs) -> str:
        """Return the section's text content."""
        # TOC-based sections carry a lazy extraction callback; prefer it.
        extractor_fn = self._text_extractor
        if extractor_fn is not None:
            return extractor_fn(self.name, **kwargs)

        # Heading/pattern-based sections extract directly from the node tree.
        from edgar.documents.extractors.text_extractor import TextExtractor
        return TextExtractor(**kwargs).extract_from_node(self.node)

    def tables(self) -> List[TableNode]:
        """Return every table node contained in this section."""
        return self.node.find(lambda candidate: isinstance(candidate, TableNode))

    def search(self, query: str) -> List[SearchResult]:
        """Search within the section (case-insensitive substring match).

        Placeholder for future semantic search; currently returns at most
        one result, built around the first occurrence of the query.
        """
        haystack = self.text().lower()
        needle = query.lower()

        if needle not in haystack:
            return []

        # Build a snippet with up to 50 characters of context on each side.
        hit = haystack.find(needle)
        lo = max(0, hit - 50)
        hi = min(len(haystack), hit + len(query) + 50)

        return [SearchResult(
            node=self.node,
            score=1.0,
            snippet=haystack[lo:hi],
            section=self.name
        )]

    @staticmethod
    def parse_section_name(section_name: str) -> tuple[Optional[str], Optional[str]]:
        """
        Parse a section name into part and item identifiers.

        Handles both 10-Q part-aware names and 10-K simple names.

        Args:
            section_name: Section identifier (e.g., "part_i_item_1", "item_1a", "risk_factors")

        Returns:
            Tuple of (part, item) where:
            - part: "I", "II", or None for 10-K sections
            - item: "1", "1A", "2", etc. or None if not an item section

        Examples:
            >>> Section.parse_section_name("part_i_item_1")
            ("I", "1")
            >>> Section.parse_section_name("part_ii_item_1a")
            ("II", "1A")
            >>> Section.parse_section_name("item_7")
            (None, "7")
            >>> Section.parse_section_name("risk_factors")
            (None, None)
        """
        import re

        lowered = section_name.lower()

        # 10-Q style: "part_i_item_1", "part_ii_item_1a"
        quarterly = re.match(r'part_([ivx]+)_item_(\d+[a-z]?)', lowered)
        if quarterly:
            return (quarterly.group(1).upper(), quarterly.group(2).upper())

        # 10-K style: "item_1", "item_1a", "item_7"
        annual = re.match(r'item_(\d+[a-z]?)', lowered)
        if annual:
            return (None, annual.group(1).upper())

        # Free-form names like "risk_factors" carry no part/item structure.
        return (None, None)
|
||||
|
||||
|
||||
class Sections(Dict[str, Section]):
    """
    Dictionary wrapper for sections with rich display support.

    Behaves like a normal dict but provides beautiful terminal display
    via __rich__() method when printed in rich-enabled environments.

    Also supports flexible lookups: by item number ("Item 1", "1A"),
    by (part, item) tuple, or by the underlying string key.
    """

    @staticmethod
    def _normalize_item(item: str) -> str:
        """Normalize an item identifier: 'Item 1A' / 'ITEM 1a' / '1a' -> '1A'."""
        # Upper-casing first makes the prefix strip case-insensitive
        # (the previous replace-chain missed "ITEM 1").
        return item.upper().replace("ITEM ", "").strip()

    @staticmethod
    def _normalize_part(part: str) -> str:
        """Normalize a part identifier: 'Part II' / 'part ii' / 'ii' -> 'II'."""
        return part.upper().replace("PART ", "").strip()

    def __rich__(self):
        """Return rich representation for display."""
        if not self:
            return Text("No sections detected", style="dim")

        import re

        # Create summary table
        table = RichTable(title="Document Sections", show_header=True, header_style="bold magenta")
        table.add_column("Section", style="cyan", no_wrap=True)
        table.add_column("Title", style="white")
        table.add_column("Confidence", justify="right", style="green")
        table.add_column("Method", style="yellow")
        table.add_column("Part/Item", style="blue")

        # Sort sections by part (roman numeral) then item number and letter
        def sort_key(entry):
            name, section = entry
            # Convert roman numerals to integers for sorting
            roman_to_int = {'i': 1, 'ii': 2, 'iii': 3, 'iv': 4, 'v': 5}
            part = section.part.lower() if section.part else ''
            part_num = roman_to_int.get(part, 0)

            item_str = section.item if section.item else ''
            if item_str:
                match = re.match(r'(\d+)([a-z]?)', item_str.lower())
                if match:
                    return (part_num, int(match.group(1)), match.group(2) or '')

            # Unstructured sections sort after numbered items, by name
            return (part_num, 999, name)

        sorted_sections = sorted(self.items(), key=sort_key)

        # Add rows for each section
        for name, section in sorted_sections:
            # Format confidence as percentage
            confidence = f"{section.confidence:.1%}"

            # Format part/item info
            part_item = ""
            if section.part and section.item:
                part_item = f"Part {section.part}, Item {section.item}"
            elif section.item:
                part_item = f"Item {section.item}"
            elif section.part:
                part_item = f"Part {section.part}"

            # Truncate title if too long
            title = section.title
            if len(title) > 50:
                title = title[:47] + "..."

            table.add_row(
                name,
                title,
                confidence,
                section.detection_method,
                part_item
            )

        # Create summary stats
        total = len(self)
        high_conf = sum(1 for s in self.values() if s.confidence >= 0.8)
        methods = {}
        for section in self.values():
            methods[section.detection_method] = methods.get(section.detection_method, 0) + 1

        summary = Text()
        summary.append(f"\nTotal: {total} sections | ", style="dim")
        summary.append(f"High confidence (≥80%): {high_conf} | ", style="dim")
        summary.append(f"Methods: {', '.join(f'{m}={c}' for m, c in methods.items())}", style="dim")

        return Group(table, summary)

    def __repr__(self):
        return repr_rich(self.__rich__())

    def get_item(self, item: str, part: str = None) -> Optional[Section]:
        """
        Get section by item number with optional part specification.

        Args:
            item: Item identifier (e.g., "1", "1A", "7", "Item 1", "Item 7A")
            part: Optional part specification (e.g., "I", "II", "Part I", "Part II")
                  If not specified and multiple parts contain the item, returns first match.

        Returns:
            Section object if found, None otherwise

        Examples:
            >>> sections.get_item("1")         # Returns first Item 1 (any part)
            >>> sections.get_item("1", "I")    # Returns Part I, Item 1
            >>> sections.get_item("Item 1A")   # Returns first Item 1A
            >>> sections.get_item("7A", "II")  # Returns Part II, Item 7A
        """
        item_clean = self._normalize_item(item)
        part_clean = self._normalize_part(part) if part else None

        # Search through sections
        for section in self.values():
            if section.item and section.item.upper() == item_clean:
                if part_clean is None:
                    # No part specified - return first match
                    return section
                if section.part and section.part.upper() == part_clean:
                    # Part matches
                    return section

        return None

    def get_part(self, part: str) -> Dict[str, Section]:
        """
        Get all sections in a specific part.

        Args:
            part: Part identifier (e.g., "I", "II", "Part I", "Part II")

        Returns:
            Dictionary of sections in that part

        Examples:
            >>> sections.get_part("I")        # All Part I sections
            >>> sections.get_part("Part II")  # All Part II sections
        """
        part_clean = self._normalize_part(part)
        return {
            name: section
            for name, section in self.items()
            if section.part and section.part.upper() == part_clean
        }

    def get(self, key, default=None):
        """
        Enhanced get method that supports flexible key formats.

        Supports:
        - Standard dict key: "part_i_item_1"
        - Item number: "Item 1", "1", "1A"
        - Part+Item: ("I", "1"), ("Part II", "7A")

        Args:
            key: Section key (string or tuple)
            default: Default value if not found

        Returns:
            Section object or default value
        """
        # Try standard dict lookup first
        if isinstance(key, str):
            result = super().get(key, None)
            if result is not None:
                return result

            # Try as item number
            result = self.get_item(key)
            if result is not None:
                return result

        # Try as (part, item) tuple
        elif isinstance(key, tuple) and len(key) == 2:
            part, item = key
            result = self.get_item(item, part)
            if result is not None:
                return result

        return default

    def __getitem__(self, key):
        """
        Enhanced __getitem__ that supports flexible key formats.

        Supports:
        - Standard dict key: sections["part_i_item_1"]
        - Item number: sections["Item 1"], sections["1A"]
        - Part+Item tuple: sections[("I", "1")], sections[("II", "7A")]

        Raises KeyError if not found (standard dict behavior).
        """
        # Try standard dict lookup first
        if isinstance(key, str):
            try:
                return super().__getitem__(key)
            except KeyError:
                # Fall back to item-number lookup
                result = self.get_item(key)
                if result is not None:
                    return result

        # Try as (part, item) tuple
        elif isinstance(key, tuple) and len(key) == 2:
            part, item = key
            result = self.get_item(item, part)
            if result is not None:
                return result

        # Not found - raise KeyError
        raise KeyError(key)
|
||||
|
||||
|
||||
@dataclass
class Document:
    """
    Main document class.

    Represents a parsed HTML document with methods for content extraction,
    search, and transformation. Expensive derivations (sections, tables,
    headings, XBRL facts, default text) are computed lazily and cached on
    the instance.
    """

    # Core properties
    root: Node
    metadata: DocumentMetadata = field(default_factory=DocumentMetadata)

    # Cached extractions (populated lazily by the corresponding properties)
    _sections: Optional[Sections] = field(default=None, init=False, repr=False)
    _tables: Optional[List[TableNode]] = field(default=None, init=False, repr=False)
    _headings: Optional[List[Node]] = field(default=None, init=False, repr=False)
    _xbrl_facts: Optional[List[XBRLFact]] = field(default=None, init=False, repr=False)
    _text_cache: Optional[str] = field(default=None, init=False, repr=False)
    _config: Optional[Any] = field(default=None, init=False, repr=False)  # ParserConfig reference

    @property
    def sections(self) -> Sections:
        """
        Get document sections using hybrid multi-strategy detection.

        Tries detection methods in order of reliability:
        1. TOC-based (0.95 confidence)
        2. Heading-based (0.7-0.9 confidence)
        3. Pattern-based (0.6 confidence)

        Returns a Sections dictionary wrapper that provides rich terminal display
        via __rich__() method. Each section includes confidence score and detection method.
        """
        if self._sections is None:
            # Get form type from config or metadata (config wins when present)
            form = None
            if self._config and hasattr(self._config, 'form'):
                form = self._config.form
            elif self.metadata and self.metadata.form:
                form = self.metadata.form

            # Only detect sections for supported form types (including amendments).
            # Normalize form type by removing /A suffix for amendments.
            base_form = form.replace('/A', '') if form else None

            if base_form and base_form in ['10-K', '10-Q', '8-K']:
                from edgar.documents.extractors.hybrid_section_detector import HybridSectionDetector
                # Pass thresholds from config if available
                thresholds = self._config.detection_thresholds if self._config else None
                # Use base form type for detection (10-K/A -> 10-K)
                detector = HybridSectionDetector(self, base_form, thresholds)
                detected_sections = detector.detect_sections()
            else:
                # Fallback to pattern-based for other types or unknown
                from edgar.documents.extractors.pattern_section_extractor import SectionExtractor
                extractor = SectionExtractor(form) if form else SectionExtractor()
                detected_sections = extractor.extract(self)

            # Wrap detected sections in Sections class for rich display
            self._sections = Sections(detected_sections)

        return self._sections

    @property
    def tables(self) -> List[TableNode]:
        """Get all tables in document (cached after first access)."""
        if self._tables is None:
            self._tables = self.root.find(lambda n: isinstance(n, TableNode))
        return self._tables

    @property
    def headings(self) -> List[Node]:
        """Get all headings in document (cached after first access)."""
        if self._headings is None:
            from edgar.documents.nodes import HeadingNode
            self._headings = self.root.find(lambda n: isinstance(n, HeadingNode))
        return self._headings

    @property
    def xbrl_facts(self) -> List[XBRLFact]:
        """Get all XBRL facts in document (cached after first access)."""
        if self._xbrl_facts is None:
            self._xbrl_facts = self._extract_xbrl_facts()
        return self._xbrl_facts

    def text(self,
             clean: bool = True,
             include_tables: bool = True,
             include_metadata: bool = False,
             max_length: Optional[int] = None) -> str:
        """
        Extract text from document.

        Args:
            clean: Clean and normalize text
            include_tables: Include table content in text
            include_metadata: Include metadata annotations
            max_length: Maximum text length

        Returns:
            Extracted text
        """
        # Serve from cache only for the cached variant:
        # clean text with no tables, no metadata, no truncation.
        if (self._text_cache is not None and
                clean and not include_tables and not include_metadata and max_length is None):
            return self._text_cache

        # If whitespace was preserved during parsing and clean is default (True),
        # respect the preserve_whitespace setting
        if self.metadata.preserve_whitespace and clean:
            clean = False

        from edgar.documents.extractors.text_extractor import TextExtractor
        extractor = TextExtractor(
            clean=clean,
            include_tables=include_tables,
            include_metadata=include_metadata,
            max_length=max_length
        )
        text = extractor.extract(self)

        # Apply navigation link filtering when cleaning
        if clean:
            # Use cached/integrated navigation filtering (optimized approach)
            try:
                from edgar.documents.utils.anchor_cache import filter_with_cached_patterns
                # Use minimal cached approach (no memory overhead)
                original_html = getattr(self.metadata, 'original_html', None)
                text = filter_with_cached_patterns(text, html_content=original_html)
            except Exception:
                # Best-effort: fall back to pattern-based filtering.
                # (Was a bare `except:`, which also swallowed SystemExit /
                # KeyboardInterrupt; Exception is the correct scope here.)
                from edgar.documents.utils.toc_filter import filter_toc_links
                text = filter_toc_links(text)

        # Store in cache when this call produced the cached variant
        if clean and not include_tables and not include_metadata and max_length is None:
            self._text_cache = text

        return text

    def search(self, query: str, top_k: int = 10) -> List[SearchResult]:
        """
        Search document for query.

        Args:
            query: Search query
            top_k: Maximum results to return

        Returns:
            List of search results
        """
        from edgar.documents.search import DocumentSearch
        searcher = DocumentSearch(self)
        return searcher.search(query, top_k=top_k)

    def get_section(self, section_name: str, part: Optional[str] = None) -> Optional[Section]:
        """
        Get section by name with optional part specification for 10-Q filings.

        Args:
            section_name: Section identifier (e.g., "item_1", "part_i_item_1")
            part: Optional part specification for 10-Q ("I", "II", "i", "ii")
                  If provided, searches for "part_{part}_{section_name}"

        Returns:
            Section object if found, None otherwise

        Raises:
            ValueError: If an ambiguous item name is requested from a filing
                        that has the item in multiple parts.

        Examples:
            # 10-K usage (unchanged)
            >>> doc.get_section("item_1")              # Returns Item 1

            # 10-Q usage with explicit part
            >>> doc.get_section("item_1", part="I")    # Returns Part I Item 1
            >>> doc.get_section("item_1", part="II")   # Returns Part II Item 1

            # 10-Q usage with full name
            >>> doc.get_section("part_i_item_1")       # Returns Part I Item 1
        """
        # If part is specified, construct part-aware name
        if part:
            part_normalized = part.upper()
            # Remove "item_" prefix if present in section_name
            item_name = section_name.replace("item_", "") if section_name.startswith("item_") else section_name
            full_name = f"part_{part_normalized.lower()}_item_{item_name.lower()}"
            return self.sections.get(full_name)

        # Direct lookup (works for both 10-K "item_1" and 10-Q "part_i_item_1")
        section = self.sections.get(section_name)
        if section:
            return section

        # If not found and looks like an item without part, check if we have multiple parts.
        # In that case, raise a helpful error rather than guessing.
        if section_name.startswith("item_") or section_name.replace("_", "").startswith("item"):
            # Check if we have part-aware sections (10-Q)
            matching_sections = [name for name in self.sections.keys()
                                 if section_name in name and "part_" in name]
            if matching_sections:
                # Multiple parts available - user needs to specify which one
                parts = sorted(set(s.split("_")[1] for s in matching_sections if s.startswith("part_")))
                raise ValueError(
                    f"Ambiguous section '{section_name}' in 10-Q filing. "
                    f"Found in parts: {parts}. "
                    f"Please specify part: get_section('{section_name}', part='I') or part='II'"
                )

        return None

    def extract_section_text(self, section_name: str) -> Optional[str]:
        """Extract text from specific section, or None if the section is absent."""
        section = self.get_section(section_name)
        if section:
            return section.text()
        return None

    def _get_section_extractor(self):
        """Lazily create and cache the anchor-based SEC section extractor."""
        if not hasattr(self, '_section_extractor'):
            from edgar.documents.extractors.toc_section_extractor import SECSectionExtractor
            self._section_extractor = SECSectionExtractor(self)
        return self._section_extractor

    def get_sec_section(self, section_name: str, clean: bool = True,
                        include_subsections: bool = True) -> Optional[str]:
        """
        Extract content from a specific SEC filing section using anchor analysis.

        Args:
            section_name: Section name (e.g., "Item 1", "Item 1A", "Part I")
            clean: Whether to apply text cleaning and navigation filtering
            include_subsections: Whether to include subsections

        Returns:
            Section text content or None if section not found

        Examples:
            >>> doc.get_sec_section("Item 1")   # Business description
            >>> doc.get_sec_section("Item 1A")  # Risk factors
            >>> doc.get_sec_section("Item 7")   # MD&A
        """
        return self._get_section_extractor().get_section_text(
            section_name, include_subsections, clean
        )

    def get_available_sec_sections(self) -> List[str]:
        """
        Get list of SEC sections available for extraction.

        Returns:
            List of section names that can be passed to get_sec_section()

        Example:
            >>> sections = doc.get_available_sec_sections()
            >>> print(sections)
            ['Part I', 'Item 1', 'Item 1A', 'Item 1B', 'Item 2', ...]
        """
        return self._get_section_extractor().get_available_sections()

    def get_sec_section_info(self, section_name: str) -> Optional[Dict]:
        """
        Get detailed information about an SEC section.

        Args:
            section_name: Section name to look up

        Returns:
            Dict with section metadata including anchor info
        """
        return self._get_section_extractor().get_section_info(section_name)

    def to_markdown(self) -> str:
        """Convert document to Markdown."""
        from edgar.documents.renderers.markdown_renderer import MarkdownRenderer
        renderer = MarkdownRenderer()
        return renderer.render(self)

    def to_json(self, include_content: bool = True) -> Dict[str, Any]:
        """
        Convert document to JSON.

        Args:
            include_content: Include full content or just structure

        Returns:
            JSON-serializable dictionary
        """
        result = {
            'metadata': self.metadata.to_dict(),
            'sections': list(self.sections.keys()),
            'table_count': len(self.tables),
            'xbrl_fact_count': len(self.xbrl_facts)
        }

        if include_content:
            result['sections_detail'] = {
                name: {
                    'title': section.title,
                    'text_length': len(section.text()),
                    'table_count': len(section.tables())
                }
                for name, section in self.sections.items()
            }

            result['tables'] = [
                {
                    'type': table.table_type.name,
                    'rows': len(table.rows),
                    'columns': len(table.headers[0]) if table.headers else 0,
                    'caption': table.caption
                }
                for table in self.tables
            ]

        return result

    def to_dataframe(self) -> 'pd.DataFrame':
        """
        Convert document tables to pandas DataFrame.

        Returns a DataFrame with all tables concatenated; `_table_index`,
        `_table_type` (and `_table_caption` when present) columns identify
        each source table. Empty DataFrame when the document has no tables.
        """
        import pandas as pd

        if not self.tables:
            return pd.DataFrame()

        # Convert each table to DataFrame
        dfs = []
        for i, table in enumerate(self.tables):
            df = table.to_dataframe()
            # Add table index
            df['_table_index'] = i
            df['_table_type'] = table.table_type.name
            if table.caption:
                df['_table_caption'] = table.caption
            dfs.append(df)

        # Concatenate all tables
        return pd.concat(dfs, ignore_index=True)

    def chunks(self, chunk_size: int = 512, overlap: int = 128) -> Iterator['DocumentChunk']:
        """
        Generate document chunks for processing.

        Args:
            chunk_size: Target chunk size in tokens
            overlap: Overlap between chunks

        Yields:
            Document chunks
        """
        from edgar.documents.extractors.chunk_extractor import ChunkExtractor
        extractor = ChunkExtractor(chunk_size=chunk_size, overlap=overlap)
        return extractor.extract(self)

    def prepare_for_llm(self,
                        max_tokens: int = 4000,
                        preserve_structure: bool = True,
                        focus_sections: Optional[List[str]] = None) -> 'LLMDocument':
        """
        Prepare document for LLM processing.

        Args:
            max_tokens: Maximum tokens
            preserve_structure: Preserve document structure
            focus_sections: Sections to focus on

        Returns:
            LLM-optimized document
        """
        from edgar.documents.ai.llm_optimizer import LLMOptimizer
        optimizer = LLMOptimizer()
        return optimizer.optimize(
            self,
            max_tokens=max_tokens,
            preserve_structure=preserve_structure,
            focus_sections=focus_sections
        )

    def extract_key_information(self) -> Dict[str, Any]:
        """Extract key summary information (counts, identifiers) from the document."""
        return {
            'company': self.metadata.company,
            'form': self.metadata.form,
            'filing_date': self.metadata.filing_date,
            'sections': list(self.sections.keys()),
            'financial_tables': sum(1 for t in self.tables if t.is_financial_table),
            'total_tables': len(self.tables),
            'xbrl_facts': len(self.xbrl_facts),
            'document_length': len(self.text())
        }

    def _extract_xbrl_facts(self) -> List[XBRLFact]:
        """Extract XBRL facts from nodes annotated with inline-XBRL metadata."""
        facts = []

        # Find all nodes with XBRL metadata
        xbrl_nodes = self.root.find(
            lambda n: n.get_metadata('ix_tag') is not None
        )

        for node in xbrl_nodes:
            fact = XBRLFact(
                concept=node.get_metadata('ix_tag'),
                value=node.text(),
                context_ref=node.get_metadata('ix_context'),
                unit_ref=node.get_metadata('ix_unit'),
                decimals=node.get_metadata('ix_decimals'),
                scale=node.get_metadata('ix_scale')
            )
            facts.append(fact)

        return facts

    def __len__(self) -> int:
        """Get number of top-level nodes."""
        return len(self.root.children)

    def __iter__(self) -> Iterator[Node]:
        """Iterate over top-level nodes."""
        return iter(self.root.children)

    def __repr__(self) -> str:
        # NOTE(review): repr intentionally returns the full extracted text
        # (can be very large); preserved for backward compatibility.
        return self.text()

    def walk(self) -> Iterator[Node]:
        """Walk entire document tree."""
        return self.root.walk()

    def find_nodes(self, predicate) -> List[Node]:
        """Find all nodes matching predicate."""
        return self.root.find(predicate)

    def find_first_node(self, predicate) -> Optional[Node]:
        """Find first node matching predicate."""
        return self.root.find_first(predicate)

    @property
    def is_empty(self) -> bool:
        """Check if document is empty."""
        return len(self.root.children) == 0

    @property
    def has_tables(self) -> bool:
        """Check if document has tables."""
        return len(self.tables) > 0

    @property
    def has_xbrl(self) -> bool:
        """Check if document has XBRL data."""
        return len(self.xbrl_facts) > 0

    def validate(self) -> List[str]:
        """
        Validate document structure.

        Returns list of validation issues (empty when the document looks sound).
        """
        issues = []

        # Check for empty document
        if self.is_empty:
            issues.append("Document is empty")

        # Check for sections
        if not self.sections:
            issues.append("No sections detected")

        # Check for common sections in filings
        if self.metadata.form in ['10-K', '10-Q']:
            expected_sections = ['business', 'risk_factors', 'mda']
            missing = [s for s in expected_sections if s not in self.sections]
            if missing:
                issues.append(f"Missing expected sections: {', '.join(missing)}")

        # Check for orphaned nodes (reachable from root but with no parent link)
        orphaned = self.root.find(lambda n: n.parent is None and n != self.root)
        if orphaned:
            issues.append(f"Found {len(orphaned)} orphaned nodes")

        return issues
|
||||
|
||||
|
||||
@dataclass
class DocumentChunk:
    """A contiguous slice of a document produced for downstream processing."""
    content: str                   # chunk text
    start_node: Node               # first node covered by the chunk
    end_node: Node                 # last node covered by the chunk
    section: Optional[str] = None  # owning section name, when known
    token_count: int = 0           # token estimate for the chunk

    def to_dict(self) -> Dict[str, Any]:
        """Serialize the chunk to a JSON-friendly dictionary."""
        payload: Dict[str, Any] = {
            'content': self.content,
            'section': self.section,
            'token_count': self.token_count,
        }
        # Node positions are exported as tree paths rather than node objects
        payload['start_path'] = self.start_node.path
        payload['end_path'] = self.end_node.path
        return payload
|
||||
|
||||
|
||||
@dataclass
class LLMDocument:
    """Document pre-processed and trimmed for LLM consumption."""
    content: str               # optimized document body
    metadata: Dict[str, Any]   # form / company / filing_date context
    token_count: int           # token estimate for content
    sections: List[str]        # section names included in content
    truncated: bool = False    # True when content was cut to fit a budget

    def to_prompt(self) -> str:
        """Render the document as a single prompt string (header + body)."""
        header = [
            f"Document: {self.metadata.get('form', 'Unknown')}",
            f"Company: {self.metadata.get('company', 'Unknown')}",
            f"Date: {self.metadata.get('filing_date', 'Unknown')}",
            "",
        ]
        lines = header + [self.content]
        if self.truncated:
            # Flag truncation so the model knows content is incomplete
            lines.append("\n[Content truncated due to length]")
        return '\n'.join(lines)
|
||||
@@ -0,0 +1,81 @@
|
||||
"""
|
||||
Custom exceptions for the HTML parser.
|
||||
"""
|
||||
|
||||
from typing import Optional, Dict, Any
|
||||
|
||||
|
||||
class ParsingError(Exception):
    """Base exception for parsing errors, carrying optional context and hints."""

    def __init__(self,
                 message: str,
                 context: Optional[Dict[str, Any]] = None,
                 suggestions: Optional[list] = None):
        super().__init__(message)
        # Human-readable description of the failure
        self.message = message
        # Arbitrary key/value details about where parsing failed
        self.context = context or {}
        # Actionable hints for resolving the error
        self.suggestions = suggestions or []

    def __str__(self):
        # Message first, then optional context and suggestion lines
        pieces = [self.message]
        if self.context:
            pieces.append(f"Context: {self.context}")
        if self.suggestions:
            pieces.append(f"Suggestions: {', '.join(self.suggestions)}")
        return '\n'.join(pieces)
|
||||
|
||||
|
||||
class HTMLParsingError(ParsingError):
    """Error parsing HTML structure."""
    # Inherits message/context/suggestions handling from ParsingError.
    pass
|
||||
|
||||
|
||||
class StyleParsingError(ParsingError):
    """Error parsing CSS styles."""
    # Inherits message/context/suggestions handling from ParsingError.
    pass
|
||||
|
||||
|
||||
class XBRLParsingError(ParsingError):
    """Error parsing inline XBRL."""
    # Inherits message/context/suggestions handling from ParsingError.
    pass
|
||||
|
||||
|
||||
class TableParsingError(ParsingError):
    """Error parsing table structure."""
    # Inherits message/context/suggestions handling from ParsingError.
    pass
|
||||
|
||||
|
||||
class SectionDetectionError(ParsingError):
    """Error detecting document sections."""
    # Inherits message/context/suggestions handling from ParsingError.
    pass
|
||||
|
||||
|
||||
class DocumentTooLargeError(ParsingError):
    """Raised when a document exceeds the configured maximum size."""

    def __init__(self, size: int, max_size: int):
        # Both sizes are in bytes; they are echoed in the message, kept in
        # the structured context, and paired with recovery suggestions.
        details = {'size': size, 'max_size': max_size}
        hints = [
            "Use streaming parser for large documents",
            "Increase max_document_size in configuration",
            "Split document into smaller parts",
        ]
        super().__init__(
            f"Document size ({size:,} bytes) exceeds maximum ({max_size:,} bytes)",
            context=details,
            suggestions=hints,
        )
|
||||
|
||||
|
||||
class InvalidConfigurationError(ParsingError):
    """Invalid parser configuration."""
    # Inherits message/context/suggestions handling from ParsingError.
    pass
|
||||
|
||||
|
||||
class NodeNotFoundError(ParsingError):
    """Requested node not found in document."""
    # Inherits message/context/suggestions handling from ParsingError.
    pass
|
||||
|
||||
|
||||
class ExtractionError(ParsingError):
    """Error extracting content from document."""
    # Inherits message/context/suggestions handling from ParsingError.
    pass
|
||||
@@ -0,0 +1,15 @@
|
||||
"""
|
||||
Content extractors for documents.
|
||||
"""
|
||||
|
||||
from edgar.documents.extractors.text_extractor import TextExtractor
|
||||
from edgar.documents.extractors.pattern_section_extractor import SectionExtractor
|
||||
from edgar.documents.extractors.hybrid_section_detector import HybridSectionDetector
|
||||
from edgar.documents.extractors.toc_section_detector import TOCSectionDetector
|
||||
|
||||
__all__ = [
|
||||
'TextExtractor',
|
||||
'SectionExtractor',
|
||||
'HybridSectionDetector',
|
||||
'TOCSectionDetector'
|
||||
]
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,170 @@
|
||||
"""
|
||||
Heading-based section detection strategy.
|
||||
|
||||
Detects sections by analyzing heading nodes with HeaderInfo metadata.
|
||||
This strategy provides moderate confidence (0.7-0.9) and serves as a
|
||||
fallback when TOC-based detection is not available.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from typing import Dict, Optional
|
||||
|
||||
from edgar.documents.document import Document, Section
|
||||
from edgar.documents.nodes import HeadingNode, SectionNode
|
||||
from edgar.documents.types import HeaderInfo
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class HeadingSectionDetector:
|
||||
"""
|
||||
Heading-based section detection using HeaderInfo.
|
||||
|
||||
Analyzes heading nodes that have been annotated with HeaderInfo
|
||||
during parsing. Detects sections based on:
|
||||
- Item numbers (Item 1, Item 1A, etc.)
|
||||
- Heading confidence scores
|
||||
- Heading hierarchy
|
||||
|
||||
Provides moderate confidence (0.7-0.9) detection.
|
||||
"""
|
||||
|
||||
def __init__(
    self,
    document: Document,
    form: Optional[str] = None,
    min_confidence: float = 0.5  # Lower threshold, let hybrid detector filter
):
    """
    Initialize heading-based detector.

    Args:
        document: Document to analyze
        form: Optional filing type for context ('10-K', '10-Q', '8-K')
        min_confidence: Minimum confidence for headings (default 0.5)
    """
    # Store inputs; actual detection happens lazily in detect().
    self.document = document
    # NOTE(review): `form` is stored but not consulted by detect() in this
    # file's visible code — presumably used by callers or subclasses; verify.
    self.form = form
    self.min_confidence = min_confidence
|
||||
|
||||
def detect(self) -> Optional[Dict[str, Section]]:
    """
    Detect sections from heading nodes annotated with HeaderInfo.

    Only item headings (e.g. "Item 1A") whose confidence meets
    self.min_confidence contribute a section.

    Returns:
        Mapping of section name -> Section, or None when nothing usable
        was found (no headings, no qualifying item headings, or failure).
    """
    try:
        headings = self.document.headings
        if not headings:
            logger.debug("No headings found in document")
            return None

        sections = {}
        for node in headings:
            info = getattr(node, 'header_info', None)
            if not info:
                # Heading was never annotated with HeaderInfo - skip it
                continue
            if info.confidence < self.min_confidence or not info.is_item:
                # Too uncertain, or not an "Item N" style heading
                continue

            section = self._extract_section_from_heading(node, info)
            if section is None:
                continue

            # Record provenance so downstream consumers can rank strategies
            section.confidence = info.confidence
            section.detection_method = 'heading'
            sections[section.name] = section

        if not sections:
            logger.debug("No item headers found with sufficient confidence")
            return None

        logger.info(f"Heading detection found {len(sections)} sections")
        return sections

    except Exception as e:
        # Detection is best-effort; the hybrid detector falls back to
        # other strategies when this one fails.
        logger.warning(f"Heading detection failed: {e}")
        return None
|
||||
|
||||
def _extract_section_from_heading(
|
||||
self, heading: HeadingNode, header_info: HeaderInfo
|
||||
) -> Optional[Section]:
|
||||
"""
|
||||
Extract section content from heading node to next heading.
|
||||
|
||||
Args:
|
||||
heading: HeadingNode representing section start
|
||||
header_info: HeaderInfo with section metadata
|
||||
|
||||
Returns:
|
||||
Section object if successful, None otherwise
|
||||
"""
|
||||
try:
|
||||
# Create section name from item number
|
||||
if header_info.item_number:
|
||||
# Normalize: "1A" -> "item_1a", "7" -> "item_7"
|
||||
section_name = f"item_{header_info.item_number.replace('.', '_').lower()}"
|
||||
else:
|
||||
section_name = "unknown"
|
||||
|
||||
# Create section node
|
||||
section_node = SectionNode(section_name=section_name)
|
||||
|
||||
# Find next heading at same or higher level to determine section end
|
||||
current_level = header_info.level
|
||||
parent = heading.parent
|
||||
if not parent:
|
||||
logger.debug(f"Heading {header_info.text} has no parent")
|
||||
return None
|
||||
|
||||
# Find heading position in parent's children
|
||||
try:
|
||||
heading_index = parent.children.index(heading)
|
||||
except ValueError:
|
||||
logger.debug(f"Could not find heading in parent's children")
|
||||
return None
|
||||
|
||||
# Collect nodes until next section heading
|
||||
for i in range(heading_index + 1, len(parent.children)):
|
||||
child = parent.children[i]
|
||||
|
||||
# Stop at next heading of same or higher level
|
||||
if isinstance(child, HeadingNode):
|
||||
if hasattr(child, 'header_info') and child.header_info:
|
||||
if child.header_info.level <= current_level:
|
||||
break
|
||||
|
||||
# Add child to section
|
||||
section_node.add_child(child)
|
||||
|
||||
# Parse section name to extract part and item identifiers
|
||||
part, item = Section.parse_section_name(section_name)
|
||||
|
||||
# Create Section object
|
||||
section = Section(
|
||||
name=section_name,
|
||||
title=header_info.text,
|
||||
node=section_node,
|
||||
start_offset=0, # Would need actual text position
|
||||
end_offset=0, # Would need actual text position
|
||||
confidence=header_info.confidence,
|
||||
detection_method='heading',
|
||||
part=part,
|
||||
item=item
|
||||
)
|
||||
|
||||
return section
|
||||
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to extract section from heading: {e}")
|
||||
return None
|
||||
@@ -0,0 +1,489 @@
|
||||
"""
|
||||
Hybrid section detection system with multiple fallback strategies.
|
||||
|
||||
This module implements a multi-strategy approach to section detection:
|
||||
1. TOC-based (primary): High confidence, uses Table of Contents structure
|
||||
2. Heading-based (fallback): Moderate confidence, uses multi-strategy heading detection
|
||||
3. Pattern-based (last resort): Lower confidence, uses regex pattern matching
|
||||
"""
|
||||
|
||||
import logging
|
||||
from typing import Dict, Optional, List
|
||||
from dataclasses import dataclass
|
||||
from functools import lru_cache
|
||||
|
||||
from edgar.documents.document import Document, Section
|
||||
from edgar.documents.nodes import SectionNode, HeadingNode
|
||||
from edgar.documents.extractors.toc_section_detector import TOCSectionDetector
|
||||
from edgar.documents.extractors.pattern_section_extractor import SectionExtractor
|
||||
from edgar.documents.config import DetectionThresholds
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class HybridSectionDetector:
    """
    Multi-strategy section detector with fallback.

    Tries strategies in order of reliability:
    1. TOC-based (0.95 confidence) - Most reliable
    2. Multi-strategy heading detection (0.7-0.9 confidence) - Fallback
    3. Pattern matching (0.6 confidence) - Last resort

    Whichever strategy succeeds first, its raw result is post-processed by
    a shared validation pipeline (optional cross-validation, boundary
    checks, deduplication, and confidence filtering) before being returned.

    Example:
        >>> detector = HybridSectionDetector(document, '10-K')
        >>> sections = detector.detect_sections()
        >>> for name, section in sections.items():
        ...     print(f"{name}: {section.confidence:.2f} ({section.detection_method})")
    """
|
||||
|
||||
def __init__(self, document: Document, form: str, thresholds: Optional[DetectionThresholds] = None):
|
||||
"""
|
||||
Initialize hybrid detector.
|
||||
|
||||
Args:
|
||||
document: Document to extract sections from
|
||||
form: Filing type ('10-K', '10-Q', '8-K')
|
||||
thresholds: Detection thresholds configuration
|
||||
"""
|
||||
self.document = document
|
||||
self.form = form
|
||||
self.thresholds = thresholds or DetectionThresholds()
|
||||
|
||||
# Initialize detection strategies
|
||||
self.toc_detector = TOCSectionDetector(document)
|
||||
self.pattern_extractor = SectionExtractor(form)
|
||||
|
||||
def detect_sections(self) -> Dict[str, Section]:
|
||||
"""
|
||||
Detect sections using hybrid approach with fallback and validation.
|
||||
|
||||
Returns:
|
||||
Dictionary mapping section names to Section objects with confidence scores
|
||||
"""
|
||||
# Strategy 1: TOC-based (most reliable)
|
||||
logger.debug("Trying TOC-based detection...")
|
||||
sections = self.toc_detector.detect()
|
||||
if sections:
|
||||
logger.info(f"TOC detection successful: {len(sections)} sections found")
|
||||
return self._validate_pipeline(sections, enable_cross_validation=True)
|
||||
|
||||
# Strategy 2: Heading-based (fallback)
|
||||
logger.debug("TOC detection failed, trying heading detection...")
|
||||
sections = self._try_heading_detection()
|
||||
if sections:
|
||||
logger.info(f"Heading detection successful: {len(sections)} sections found")
|
||||
return self._validate_pipeline(sections, enable_cross_validation=False)
|
||||
|
||||
# Strategy 3: Pattern-based (last resort)
|
||||
logger.debug("Heading detection failed, trying pattern matching...")
|
||||
sections = self._try_pattern_detection()
|
||||
if sections:
|
||||
logger.info(f"Pattern detection successful: {len(sections)} sections found")
|
||||
return self._validate_pipeline(sections, enable_cross_validation=False)
|
||||
|
||||
logger.warning("All detection strategies failed, no sections found")
|
||||
return {}
|
||||
|
||||
def _validate_pipeline(
|
||||
self,
|
||||
sections: Dict[str, Section],
|
||||
enable_cross_validation: bool = False
|
||||
) -> Dict[str, Section]:
|
||||
"""
|
||||
Apply validation pipeline to sections.
|
||||
|
||||
Centralizes validation logic to eliminate duplication.
|
||||
|
||||
Args:
|
||||
sections: Sections to validate
|
||||
enable_cross_validation: Whether to enable cross-validation (expensive)
|
||||
|
||||
Returns:
|
||||
Validated sections
|
||||
"""
|
||||
if not sections:
|
||||
return sections
|
||||
|
||||
# Cross-validate (optional, expensive)
|
||||
if enable_cross_validation and self.thresholds.enable_cross_validation:
|
||||
sections = self._cross_validate(sections)
|
||||
|
||||
# Validate boundaries
|
||||
sections = self._validate_boundaries(sections)
|
||||
|
||||
# Deduplicate
|
||||
sections = self._deduplicate(sections)
|
||||
|
||||
# Filter by confidence
|
||||
sections = self._filter_by_confidence(sections)
|
||||
|
||||
return sections
|
||||
|
||||
def _try_heading_detection(self) -> Optional[Dict[str, Section]]:
|
||||
"""
|
||||
Try multi-strategy heading detection.
|
||||
|
||||
Returns:
|
||||
Dictionary of sections if successful, None if failed
|
||||
"""
|
||||
try:
|
||||
# Get heading nodes from document
|
||||
headings = self.document.headings
|
||||
if not headings:
|
||||
return None
|
||||
|
||||
sections = {}
|
||||
|
||||
for heading in headings:
|
||||
# Check if heading has header info
|
||||
if not hasattr(heading, 'header_info') or not heading.header_info:
|
||||
continue
|
||||
|
||||
header_info = heading.header_info
|
||||
|
||||
# Only use headings with sufficient confidence
|
||||
if header_info.confidence < 0.7:
|
||||
continue
|
||||
|
||||
# Check if it's an item header
|
||||
if not header_info.is_item:
|
||||
continue
|
||||
|
||||
# Extract section from this heading to next
|
||||
section = self._extract_section_from_heading(heading, header_info)
|
||||
if section:
|
||||
section.confidence = header_info.confidence
|
||||
section.detection_method = 'heading'
|
||||
sections[section.name] = section
|
||||
|
||||
return sections if sections else None
|
||||
|
||||
except Exception as e:
|
||||
logger.warning(f"Heading detection failed: {e}")
|
||||
return None
|
||||
|
||||
def _try_pattern_detection(self) -> Optional[Dict[str, Section]]:
|
||||
"""
|
||||
Try pattern-based extraction.
|
||||
|
||||
Returns:
|
||||
Dictionary of sections if successful, None if failed
|
||||
"""
|
||||
try:
|
||||
# Use pattern extractor
|
||||
sections = self.pattern_extractor.extract(self.document)
|
||||
|
||||
# Mark with pattern detection confidence
|
||||
for section in sections.values():
|
||||
section.confidence = 0.6 # Pattern-based = lower confidence
|
||||
section.detection_method = 'pattern'
|
||||
|
||||
return sections if sections else None
|
||||
|
||||
except Exception as e:
|
||||
logger.warning(f"Pattern detection failed: {e}")
|
||||
return None
|
||||
|
||||
def _extract_section_from_heading(self, heading: HeadingNode, header_info) -> Optional[Section]:
|
||||
"""
|
||||
Extract section content from heading node to next heading.
|
||||
|
||||
Args:
|
||||
heading: HeadingNode representing section start
|
||||
header_info: HeaderInfo with section metadata
|
||||
|
||||
Returns:
|
||||
Section object if successful, None otherwise
|
||||
"""
|
||||
try:
|
||||
# Create section name from item number
|
||||
section_name = f"item_{header_info.item_number.replace('.', '_')}" if header_info.item_number else "unknown"
|
||||
|
||||
# Create section node
|
||||
section_node = SectionNode(section_name=section_name)
|
||||
|
||||
# Find next heading at same or higher level to determine section end
|
||||
current_level = header_info.level
|
||||
parent = heading.parent
|
||||
if not parent:
|
||||
return None
|
||||
|
||||
# Find heading position in parent's children
|
||||
try:
|
||||
heading_index = parent.children.index(heading)
|
||||
except ValueError:
|
||||
return None
|
||||
|
||||
# Collect nodes until next section heading
|
||||
for i in range(heading_index + 1, len(parent.children)):
|
||||
child = parent.children[i]
|
||||
|
||||
# Stop at next heading of same or higher level
|
||||
if isinstance(child, HeadingNode):
|
||||
if hasattr(child, 'header_info') and child.header_info:
|
||||
if child.header_info.level <= current_level:
|
||||
break
|
||||
|
||||
# Add child to section
|
||||
section_node.add_child(child)
|
||||
|
||||
# Create Section object
|
||||
section = Section(
|
||||
name=section_name,
|
||||
title=header_info.text,
|
||||
node=section_node,
|
||||
start_offset=0, # Would need actual text position
|
||||
end_offset=0, # Would need actual text position
|
||||
confidence=header_info.confidence,
|
||||
detection_method='heading'
|
||||
)
|
||||
|
||||
return section
|
||||
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to extract section from heading: {e}")
|
||||
return None
|
||||
|
||||
def _cross_validate(self, sections: Dict[str, Section]) -> Dict[str, Section]:
|
||||
"""
|
||||
Cross-validate sections using multiple detection methods.
|
||||
|
||||
Boosts confidence if multiple methods detect the same section.
|
||||
Reduces confidence if methods disagree.
|
||||
|
||||
Args:
|
||||
sections: Sections detected by primary method
|
||||
|
||||
Returns:
|
||||
Validated sections with adjusted confidence scores
|
||||
"""
|
||||
validated = {}
|
||||
|
||||
# Get pattern-based sections once for comparison (not per section)
|
||||
try:
|
||||
pattern_sections = self.pattern_extractor.extract(self.document)
|
||||
except Exception as e:
|
||||
logger.debug(f"Pattern extraction failed for cross-validation: {e}")
|
||||
pattern_sections = {}
|
||||
|
||||
for name, section in sections.items():
|
||||
# Try alternative detection (pattern matching for validation)
|
||||
try:
|
||||
# Check if this section is also found by pattern matching
|
||||
found_in_patterns = False
|
||||
for pattern_name, pattern_section in pattern_sections.items():
|
||||
# Check for name similarity or overlap
|
||||
if self._sections_similar(section, pattern_section):
|
||||
found_in_patterns = True
|
||||
break
|
||||
|
||||
# Boost confidence if methods agree
|
||||
if found_in_patterns:
|
||||
section.confidence = min(section.confidence * self.thresholds.cross_validation_boost, 1.0)
|
||||
section.validated = True
|
||||
logger.debug(f"Section {name} validated by multiple methods, confidence boosted to {section.confidence:.2f}")
|
||||
else:
|
||||
# Slight reduction if not validated
|
||||
section.confidence *= self.thresholds.disagreement_penalty
|
||||
section.validated = False
|
||||
|
||||
except Exception as e:
|
||||
logger.debug(f"Cross-validation failed for {name}: {e}")
|
||||
# Keep original confidence if validation fails
|
||||
pass
|
||||
|
||||
validated[name] = section
|
||||
|
||||
return validated
|
||||
|
||||
    def _validate_boundaries(self, sections: Dict[str, Section]) -> Dict[str, Section]:
        """
        Validate section boundaries for overlaps, gaps, and ordering.

        Mutates the passed Section objects in place (start/end offsets and
        confidence may be adjusted). Sections whose start_offset is 0
        (position unknown) bypass the overlap/gap checks.

        Args:
            sections: Sections to validate

        Returns:
            Sections with validated boundaries
        """
        if not sections:
            return sections

        # Sort by start offset so each section is compared only against
        # its immediate predecessor in document order.
        sorted_sections = sorted(sections.items(), key=lambda x: x[1].start_offset)

        validated = {}
        prev_section = None  # (name, Section) of the previous entry

        for name, section in sorted_sections:
            # Check for overlap with previous section
            if prev_section and section.start_offset > 0:
                if section.start_offset < prev_section[1].end_offset:
                    # Overlap detected - split the contested span at its
                    # midpoint: previous section ends, this one starts there.
                    gap_mid = (prev_section[1].end_offset + section.start_offset) // 2
                    prev_section[1].end_offset = gap_mid
                    section.start_offset = gap_mid

                    # Reduce confidence on BOTH sections, since either
                    # boundary could be the wrong one.
                    section.confidence *= self.thresholds.boundary_overlap_penalty
                    prev_section[1].confidence *= self.thresholds.boundary_overlap_penalty

                    logger.debug(f"Adjusted boundary between {prev_section[0]} and {name}")

                # No overlap: check for an unusually large gap (fixed
                # 100,000-offset threshold, not relative to document size)
                elif prev_section[1].end_offset > 0:
                    gap_size = section.start_offset - prev_section[1].end_offset
                    if gap_size > 100000: # Arbitrary large gap threshold
                        # Large gap - might indicate missing section
                        section.confidence *= 0.9
                        logger.debug(f"Large gap detected before {name}")

            validated[name] = section
            prev_section = (name, section)

        return validated
|
||||
|
||||
def _deduplicate(self, sections: Dict[str, Section]) -> Dict[str, Section]:
|
||||
"""
|
||||
Remove duplicate sections detected by multiple methods.
|
||||
|
||||
Keeps the detection with highest confidence.
|
||||
|
||||
Args:
|
||||
sections: Sections possibly containing duplicates
|
||||
|
||||
Returns:
|
||||
Deduplicated sections
|
||||
"""
|
||||
if len(sections) <= 1:
|
||||
return sections
|
||||
|
||||
# Group similar sections
|
||||
groups = self._group_similar_sections(sections)
|
||||
|
||||
deduplicated = {}
|
||||
for group in groups:
|
||||
if len(group) == 1:
|
||||
# No duplicates
|
||||
deduplicated[group[0].name] = group[0]
|
||||
else:
|
||||
# Keep section with highest confidence
|
||||
best = max(group, key=lambda s: s.confidence)
|
||||
|
||||
# Merge detection methods
|
||||
methods = set(s.detection_method for s in group)
|
||||
if len(methods) > 1:
|
||||
best.detection_method = ','.join(sorted(methods))
|
||||
# Boost confidence for multi-method detection
|
||||
best.confidence = min(best.confidence * 1.15, 1.0)
|
||||
best.validated = True
|
||||
logger.debug(f"Merged duplicate sections for {best.name}, methods: {best.detection_method}")
|
||||
|
||||
deduplicated[best.name] = best
|
||||
|
||||
return deduplicated
|
||||
|
||||
def _group_similar_sections(self, sections: Dict[str, Section]) -> List[List[Section]]:
|
||||
"""
|
||||
Group sections that appear to be duplicates.
|
||||
|
||||
Args:
|
||||
sections: Sections to group
|
||||
|
||||
Returns:
|
||||
List of section groups
|
||||
"""
|
||||
groups = []
|
||||
used = set()
|
||||
|
||||
for name1, section1 in sections.items():
|
||||
if name1 in used:
|
||||
continue
|
||||
|
||||
group = [section1]
|
||||
used.add(name1)
|
||||
|
||||
for name2, section2 in sections.items():
|
||||
if name2 in used:
|
||||
continue
|
||||
|
||||
# Check if sections are similar
|
||||
if self._sections_similar(section1, section2):
|
||||
group.append(section2)
|
||||
used.add(name2)
|
||||
|
||||
groups.append(group)
|
||||
|
||||
return groups
|
||||
|
||||
def _sections_similar(self, section1: Section, section2: Section) -> bool:
|
||||
"""
|
||||
Check if two sections are similar (likely duplicates).
|
||||
|
||||
Args:
|
||||
section1: First section
|
||||
section2: Second section
|
||||
|
||||
Returns:
|
||||
True if sections are similar
|
||||
"""
|
||||
# Normalize names for comparison
|
||||
name1 = section1.name.lower().replace('_', ' ').strip()
|
||||
name2 = section2.name.lower().replace('_', ' ').strip()
|
||||
|
||||
# Check exact match after normalization
|
||||
if name1 == name2:
|
||||
return True
|
||||
|
||||
# Check title similarity (exact match)
|
||||
title1 = section1.title.lower().strip()
|
||||
title2 = section2.title.lower().strip()
|
||||
|
||||
if title1 == title2:
|
||||
return True
|
||||
|
||||
# Check for position overlap (if positions are set)
|
||||
if section1.start_offset > 0 and section2.start_offset > 0:
|
||||
# Calculate overlap
|
||||
overlap_start = max(section1.start_offset, section2.start_offset)
|
||||
overlap_end = min(section1.end_offset, section2.end_offset)
|
||||
|
||||
if overlap_end > overlap_start:
|
||||
# There is overlap
|
||||
overlap_size = overlap_end - overlap_start
|
||||
min_size = min(
|
||||
section1.end_offset - section1.start_offset,
|
||||
section2.end_offset - section2.start_offset
|
||||
)
|
||||
|
||||
# If overlap is >50% of smaller section, consider similar
|
||||
if min_size > 0 and overlap_size / min_size > 0.5:
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
def _filter_by_confidence(self, sections: Dict[str, Section]) -> Dict[str, Section]:
|
||||
"""
|
||||
Filter sections by minimum confidence threshold.
|
||||
|
||||
Args:
|
||||
sections: Sections to filter
|
||||
|
||||
Returns:
|
||||
Filtered sections meeting minimum confidence
|
||||
"""
|
||||
# Check for filing-specific thresholds
|
||||
min_conf = self.thresholds.min_confidence
|
||||
if self.form in self.thresholds.thresholds_by_form:
|
||||
filing_thresholds = self.thresholds.thresholds_by_form[self.form]
|
||||
min_conf = filing_thresholds.get('min_confidence', min_conf)
|
||||
|
||||
filtered = {}
|
||||
for name, section in sections.items():
|
||||
if section.confidence >= min_conf:
|
||||
filtered[name] = section
|
||||
else:
|
||||
logger.debug(f"Filtered out section {name} with confidence {section.confidence:.2f} < {min_conf:.2f}")
|
||||
|
||||
return filtered
|
||||
@@ -0,0 +1,405 @@
|
||||
"""
|
||||
Section extraction from documents.
|
||||
"""
|
||||
|
||||
import re
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
|
||||
from edgar.documents.document import Document, Section
|
||||
from edgar.documents.nodes import Node, HeadingNode, SectionNode
|
||||
|
||||
|
||||
class SectionExtractor:
    """
    Extracts logical sections from documents.

    Identifies document sections like:
    - Business Overview (Item 1)
    - Risk Factors (Item 1A)
    - MD&A (Item 7)
    - Financial Statements (Item 8)
    """

    # Common section patterns for different filing types.
    # Structure: {form: {section_key: [(regex, human-readable title), ...]}}.
    # Patterns are matched against stripped header text, case-insensitively
    # and anchored at the start (see _match_sections); within a section key
    # the first pattern that hits an unclaimed header wins.
    SECTION_PATTERNS = {
        '10-K': {
            'business': [
                (r'^(Item|ITEM)\s+1\.?\s*Business', 'Item 1 - Business'),
                (r'^Business\s*$', 'Business'),
                (r'^Business Overview', 'Business Overview'),
                (r'^Our Business', 'Our Business'),
                (r'^Company Overview', 'Company Overview')
            ],
            'risk_factors': [
                (r'^(Item|ITEM)\s+1A\.?\s*Risk\s+Factors', 'Item 1A - Risk Factors'),
                (r'^Risk\s+Factors', 'Risk Factors'),
                (r'^Factors\s+That\s+May\s+Affect', 'Risk Factors')
            ],
            'properties': [
                (r'^(Item|ITEM)\s+2\.?\s*Properties', 'Item 2 - Properties'),
                (r'^Properties', 'Properties'),
                (r'^Real\s+Estate', 'Real Estate')
            ],
            'legal_proceedings': [
                (r'^(Item|ITEM)\s+3\.?\s*Legal\s+Proceedings', 'Item 3 - Legal Proceedings'),
                (r'^Legal\s+Proceedings', 'Legal Proceedings'),
                (r'^Litigation', 'Litigation')
            ],
            'market_risk': [
                (r'^(Item|ITEM)\s+7A\.?\s*Quantitative.*Disclosures', 'Item 7A - Market Risk'),
                (r'^Market\s+Risk', 'Market Risk'),
                (r'^Quantitative.*Qualitative.*Market\s+Risk', 'Market Risk')
            ],
            'mda': [
                (r'^(Item|ITEM)\s+7\.?\s*Management.*Discussion', 'Item 7 - MD&A'),
                (r'^Management.*Discussion.*Analysis', 'MD&A'),
                (r'^MD&A', 'MD&A')
            ],
            'financial_statements': [
                (r'^(Item|ITEM)\s+8\.?\s*Financial\s+Statements', 'Item 8 - Financial Statements'),
                (r'^Financial\s+Statements', 'Financial Statements'),
                (r'^Consolidated\s+Financial\s+Statements', 'Consolidated Financial Statements')
            ],
            'controls_procedures': [
                (r'^(Item|ITEM)\s+9A\.?\s*Controls.*Procedures', 'Item 9A - Controls and Procedures'),
                (r'^Controls.*Procedures', 'Controls and Procedures'),
                (r'^Internal\s+Control', 'Internal Controls')
            ]
        },
        # 10-Q item numbering overlaps between Part I and Part II; the
        # extractor disambiguates via _detect_10q_parts.
        '10-Q': {
            'financial_statements': [
                (r'^(Item|ITEM)\s+1\.?\s*Financial\s+Statements', 'Item 1 - Financial Statements'),
                (r'^Financial\s+Statements', 'Financial Statements'),
                (r'^Condensed.*Financial\s+Statements', 'Condensed Financial Statements')
            ],
            'mda': [
                (r'^(Item|ITEM)\s+2\.?\s*Management.*Discussion', 'Item 2 - MD&A'),
                (r'^Management.*Discussion.*Analysis', 'MD&A')
            ],
            'market_risk': [
                (r'^(Item|ITEM)\s+3\.?\s*Quantitative.*Disclosures', 'Item 3 - Market Risk'),
                (r'^Market\s+Risk', 'Market Risk')
            ],
            'controls_procedures': [
                (r'^(Item|ITEM)\s+4\.?\s*Controls.*Procedures', 'Item 4 - Controls and Procedures'),
                (r'^Controls.*Procedures', 'Controls and Procedures')
            ],
            'legal_proceedings': [
                (r'^(Item|ITEM)\s+1\.?\s*Legal\s+Proceedings', 'Item 1 - Legal Proceedings'),
                (r'^Legal\s+Proceedings', 'Legal Proceedings')
            ],
            'risk_factors': [
                (r'^(Item|ITEM)\s+1A\.?\s*Risk\s+Factors', 'Item 1A - Risk Factors'),
                (r'^Risk\s+Factors', 'Risk Factors')
            ]
        },
        # 8-K items use the dotted "N.NN" numbering scheme.
        '8-K': {
            'item_101': [
                (r'^(Item|ITEM)\s+1\.01', 'Item 1.01 - Entry into Material Agreement'),
                (r'^Entry.*Material.*Agreement', 'Material Agreement')
            ],
            'item_201': [
                (r'^(Item|ITEM)\s+2\.01', 'Item 2.01 - Completion of Acquisition'),
                (r'^Completion.*Acquisition', 'Acquisition')
            ],
            'item_202': [
                (r'^(Item|ITEM)\s+2\.02', 'Item 2.02 - Results of Operations'),
                (r'^Results.*Operations', 'Results of Operations')
            ],
            'item_503': [
                (r'^(Item|ITEM)\s+5\.03', 'Item 5.03 - Director/Officer Changes'),
                (r'^Amendments.*Articles', 'Charter Amendments')
            ],
            'item_801': [
                (r'^(Item|ITEM)\s+8\.01', 'Item 8.01 - Other Events'),
                (r'^Other\s+Events', 'Other Events')
            ],
            'item_901': [
                (r'^(Item|ITEM)\s+9\.01', 'Item 9.01 - Financial Statements and Exhibits'),
                (r'^Financial.*Exhibits', 'Financial Statements and Exhibits')
            ]
        }
    }
|
||||
|
||||
    def __init__(self, form: Optional[str] = None):
        """
        Initialize section extractor.

        Args:
            form: Type of filing (10-K, 10-Q, 8-K, etc.); if None, the
                form is resolved later from document metadata/config in
                extract()
        """
        # An explicitly supplied form takes precedence over document
        # metadata (see extract()).
        self.form = form
|
||||
|
||||
def extract(self, document: Document) -> Dict[str, Section]:
|
||||
"""
|
||||
Extract sections from document.
|
||||
|
||||
Args:
|
||||
document: Document to extract sections from
|
||||
|
||||
Returns:
|
||||
Dictionary mapping section names to Section objects
|
||||
"""
|
||||
# Get filing type from instance, metadata, or document config
|
||||
# NOTE: We no longer auto-detect filing type (expensive and unnecessary)
|
||||
form = None
|
||||
|
||||
if self.form:
|
||||
form = self.form
|
||||
elif document.metadata and document.metadata.form:
|
||||
form = document.metadata.form
|
||||
elif hasattr(document, '_config') and document._config and document._config.form:
|
||||
form = document._config.form
|
||||
|
||||
# Only extract sections for forms that have standard sections
|
||||
if not form or form not in ['10-K', '10-Q', '8-K']:
|
||||
return {} # No filing type or unsupported form = no section detection
|
||||
|
||||
# Get patterns for filing type
|
||||
patterns = self.SECTION_PATTERNS.get(form, {})
|
||||
if not patterns:
|
||||
return {} # No patterns defined for this form type
|
||||
|
||||
# Find section headers
|
||||
headers = self._find_section_headers(document)
|
||||
|
||||
# For 10-Q, detect Part I/Part II boundaries
|
||||
part_context = None
|
||||
if form == '10-Q':
|
||||
part_context = self._detect_10q_parts(headers)
|
||||
|
||||
# Match headers to sections
|
||||
sections = self._match_sections(headers, patterns, document, part_context)
|
||||
|
||||
# Create section objects
|
||||
return self._create_sections(sections, document)
|
||||
|
||||
# NOTE: _detect_form() removed - form type should be known from context
|
||||
# Filing metadata should be set by the caller (Filing class, TenK/TenQ, etc.)
|
||||
|
||||
# NOTE: _infer_form_from_headers() kept for backward compatibility but not used
|
||||
# in normal flow anymore. Form type should always be provided explicitly.
|
||||
def _infer_form_from_headers(self, document: Document) -> str:
|
||||
"""
|
||||
Infer filing type from section headers.
|
||||
|
||||
NOTE: This method is kept for backward compatibility but should not be used
|
||||
in the normal flow. Form type should be explicitly provided via config or metadata.
|
||||
"""
|
||||
headers = document.headings
|
||||
header_texts = [h.text().upper() for h in headers if h.text()]
|
||||
|
||||
# Check for 10-K specific sections
|
||||
has_10k_sections = any(
|
||||
'ITEM 1.' in text or 'ITEM 1A.' in text or 'ITEM 7.' in text or 'ITEM 8.' in text
|
||||
for text in header_texts
|
||||
)
|
||||
|
||||
# Check for 10-Q specific sections
|
||||
has_10q_sections = any(
|
||||
('ITEM 1.' in text and 'FINANCIAL STATEMENTS' in text) or
|
||||
('ITEM 2.' in text and 'MANAGEMENT' in text) or
|
||||
'ITEM 3.' in text or 'ITEM 4.' in text
|
||||
for text in header_texts
|
||||
)
|
||||
|
||||
# Check for 8-K specific sections
|
||||
has_8k_sections = any(
|
||||
re.search(r'ITEM \d\.\d{2}', text) for text in header_texts
|
||||
)
|
||||
|
||||
if has_10k_sections and not has_10q_sections:
|
||||
return '10-K'
|
||||
elif has_10q_sections:
|
||||
return '10-Q'
|
||||
elif has_8k_sections:
|
||||
return '8-K'
|
||||
else:
|
||||
return 'UNKNOWN'
|
||||
|
||||
def _get_general_patterns(self) -> Dict[str, List[Tuple[str, str]]]:
|
||||
"""Get general section patterns."""
|
||||
return {
|
||||
'business': [
|
||||
(r'^Business', 'Business'),
|
||||
(r'^Overview', 'Overview'),
|
||||
(r'^Company', 'Company')
|
||||
],
|
||||
'financial': [
|
||||
(r'^Financial\s+Statements', 'Financial Statements'),
|
||||
(r'^Consolidated.*Statements', 'Consolidated Statements')
|
||||
],
|
||||
'notes': [
|
||||
(r'^Notes\s+to.*Financial\s+Statements', 'Notes to Financial Statements'),
|
||||
(r'^Notes\s+to.*Statements', 'Notes')
|
||||
]
|
||||
}
|
||||
|
||||
def _find_section_headers(self, document: Document) -> List[Tuple[Node, str, int]]:
|
||||
"""Find all potential section headers."""
|
||||
headers = []
|
||||
|
||||
# Find all heading nodes
|
||||
heading_nodes = document.root.find(lambda n: isinstance(n, HeadingNode))
|
||||
|
||||
for node in heading_nodes:
|
||||
text = node.text()
|
||||
if text:
|
||||
# Get position in document
|
||||
position = self._get_node_position(node, document)
|
||||
headers.append((node, text, position))
|
||||
|
||||
# Also check for section nodes
|
||||
section_nodes = document.root.find(lambda n: isinstance(n, SectionNode))
|
||||
for node in section_nodes:
|
||||
# Get first heading in section
|
||||
first_heading = node.find_first(lambda n: isinstance(n, HeadingNode))
|
||||
if first_heading:
|
||||
text = first_heading.text()
|
||||
if text:
|
||||
position = self._get_node_position(node, document)
|
||||
headers.append((node, text, position))
|
||||
|
||||
# Sort by position
|
||||
headers.sort(key=lambda x: x[2])
|
||||
|
||||
return headers
|
||||
|
||||
def _get_node_position(self, node: Node, document: Document) -> int:
|
||||
"""Get position of node in document."""
|
||||
position = 0
|
||||
for n in document.root.walk():
|
||||
if n == node:
|
||||
return position
|
||||
position += 1
|
||||
return position
|
||||
|
||||
def _detect_10q_parts(self, headers: List[Tuple[Node, str, int]]) -> Dict[int, str]:
|
||||
"""
|
||||
Detect Part I and Part II boundaries in 10-Q filings.
|
||||
|
||||
Args:
|
||||
headers: List of (node, text, position) tuples
|
||||
|
||||
Returns:
|
||||
Dict mapping header index to part name ("Part I" or "Part II")
|
||||
"""
|
||||
part_context = {}
|
||||
current_part = None
|
||||
|
||||
part_i_pattern = re.compile(r'^\s*PART\s+I\b', re.IGNORECASE)
|
||||
part_ii_pattern = re.compile(r'^\s*PART\s+II\b', re.IGNORECASE)
|
||||
|
||||
for i, (node, text, position) in enumerate(headers):
|
||||
text_stripped = text.strip()
|
||||
|
||||
# Check if this is a Part I or Part II header
|
||||
if part_i_pattern.match(text_stripped):
|
||||
current_part = "Part I"
|
||||
part_context[i] = current_part
|
||||
elif part_ii_pattern.match(text_stripped):
|
||||
current_part = "Part II"
|
||||
part_context[i] = current_part
|
||||
elif current_part:
|
||||
# Headers after a Part declaration belong to that part
|
||||
part_context[i] = current_part
|
||||
|
||||
return part_context
|
||||
|
||||
def _match_sections(self,
|
||||
headers: List[Tuple[Node, str, int]],
|
||||
patterns: Dict[str, List[Tuple[str, str]]],
|
||||
document: Document,
|
||||
part_context: Optional[Dict[int, str]] = None) -> Dict[str, Tuple[Node, str, int, int]]:
|
||||
"""Match headers to section patterns."""
|
||||
matched_sections = {}
|
||||
used_headers = set()
|
||||
|
||||
# Try to match each pattern
|
||||
for section_name, section_patterns in patterns.items():
|
||||
for pattern, title in section_patterns:
|
||||
for i, (node, text, position) in enumerate(headers):
|
||||
if i in used_headers:
|
||||
continue
|
||||
|
||||
# Try to match pattern
|
||||
if re.match(pattern, text.strip(), re.IGNORECASE):
|
||||
# Find end position (next section or end of document)
|
||||
end_position = self._find_section_end(i, headers, document)
|
||||
|
||||
# For 10-Q, prefix with Part I or Part II
|
||||
final_title = title
|
||||
if part_context and i in part_context:
|
||||
final_title = f"{part_context[i]} - {title}"
|
||||
|
||||
# Use final_title as key to avoid conflicts
|
||||
section_key = final_title if part_context and i in part_context else section_name
|
||||
matched_sections[section_key] = (node, final_title, position, end_position)
|
||||
used_headers.add(i)
|
||||
break
|
||||
|
||||
# If we found a match, move to next section
|
||||
if section_name in matched_sections:
|
||||
break
|
||||
|
||||
return matched_sections
|
||||
|
||||
def _find_section_end(self,
                      section_index: int,
                      headers: List[Tuple[Node, str, int]],
                      document: Document) -> int:
    """Return the walk-position where the section at section_index ends.

    A section ends where the next header of equal-or-higher level begins;
    if no such header follows, the section runs to the end of the document.
    """
    remaining = headers[section_index + 1:]
    if remaining:
        start_node = headers[section_index][0]
        start_level = start_node.level if isinstance(start_node, HeadingNode) else 1

        for candidate_node, _candidate_text, candidate_pos in remaining:
            candidate_level = candidate_node.level if isinstance(candidate_node, HeadingNode) else 1
            # Same-or-higher-level header terminates the current section.
            if candidate_level <= start_level:
                return candidate_pos

    # No terminating header: the section runs to the last walked node.
    return sum(1 for _ in document.root.walk())
|
||||
|
||||
def _create_sections(self,
                     matched_sections: Dict[str, Tuple[Node, str, int, int]],
                     document: Document) -> Dict[str, Section]:
    """Create Section objects from matches.

    Args:
        matched_sections: Map of section key -> (header node, display title,
            start walk-position, end walk-position) from _match_sections().
        document: Source document whose walk order defines positions.

    Returns:
        Map of section key -> populated Section (pattern detection,
        confidence 0.7).
    """
    sections = {}

    for section_name, (node, title, start_pos, end_pos) in matched_sections.items():
        # Create section node containing all content in range
        section_node = SectionNode(section_name=section_name)

        # Find all nodes in position range by replaying the same
        # depth-first walk that assigned the positions originally.
        position = 0
        for n in document.root.walk():
            if start_pos <= position < end_pos:
                # Clone node and add to section
                # (In real implementation, would properly handle node hierarchy)
                section_node.add_child(n)
            position += 1

        # Parse section name to extract part and item identifiers
        part, item = Section.parse_section_name(section_name)

        # Create Section object
        section = Section(
            name=section_name,
            title=title,
            node=section_node,
            start_offset=start_pos,
            end_offset=end_pos,
            confidence=0.7,  # Pattern-based detection = moderate confidence
            detection_method='pattern',  # Method: regex pattern matching
            part=part,
            item=item
        )

        sections[section_name] = section

    return sections
|
||||
@@ -0,0 +1,348 @@
|
||||
"""
|
||||
Text extraction from documents with various options.
|
||||
"""
|
||||
|
||||
import re
|
||||
from typing import List, Optional, Set
|
||||
from edgar.documents.document import Document
|
||||
from edgar.documents.nodes import Node, TextNode, HeadingNode, ParagraphNode
|
||||
from edgar.documents.table_nodes import TableNode
|
||||
from edgar.documents.types import NodeType
|
||||
|
||||
|
||||
class TextExtractor:
    """
    Extracts text from documents with configurable options.

    Supports:
    - Clean text extraction for AI/NLP
    - Table inclusion/exclusion
    - Metadata annotations
    - Length limiting
    - Smart whitespace handling
    """

    def __init__(self,
                 clean: bool = True,
                 include_tables: bool = True,
                 include_metadata: bool = False,
                 include_links: bool = False,
                 max_length: Optional[int] = None,
                 preserve_structure: bool = False):
        """
        Initialize text extractor.

        Args:
            clean: Clean and normalize text
            include_tables: Include table content
            include_metadata: Include metadata annotations
            include_links: Include link URLs
            max_length: Maximum text length
            preserve_structure: Preserve document structure with markers
        """
        self.clean = clean
        self.include_tables = include_tables
        self.include_metadata = include_metadata
        self.include_links = include_links
        self.max_length = max_length
        self.preserve_structure = preserve_structure

        # Track what we've extracted to avoid duplicates
        self._extracted_ids: Set[str] = set()

    def extract(self, document: Document) -> str:
        """
        Extract text from document.

        Args:
            document: Document to extract from

        Returns:
            Extracted text
        """
        parts = []
        self._extracted_ids.clear()

        # Extract from root
        self._extract_from_node(document.root, parts, depth=0)

        # Join parts
        if self.preserve_structure:
            text = '\n'.join(parts)
        else:
            text = '\n\n'.join(filter(None, parts))

        # Apply minimal global cleaning - tables are already handled appropriately per node
        if self.clean:
            text = self._clean_document_text(text)

        # Limit length if requested
        if self.max_length and len(text) > self.max_length:
            text = self._truncate_text(text, self.max_length)

        return text

    def extract_from_node(self, node: Node) -> str:
        """Extract text from a specific node."""
        parts = []
        self._extracted_ids.clear()
        self._extract_from_node(node, parts, depth=0)

        text = '\n\n'.join(filter(None, parts))

        if self.clean:
            text = self._clean_document_text(text)

        return text

    def _extract_from_node(self, node: Node, parts: List[str], depth: int):
        """Recursively extract text from node - render each node type appropriately."""
        # Skip if already extracted (handles shared nodes)
        if node.id in self._extracted_ids:
            return
        self._extracted_ids.add(node.id)

        # Handle based on node type - like old parser's block.get_text()
        if isinstance(node, TableNode):
            if self.include_tables:
                # Tables render themselves - preserve their formatting
                self._extract_table(node, parts)

        elif isinstance(node, HeadingNode):
            # Headings get cleaned text
            self._extract_heading(node, parts, depth)

        elif isinstance(node, TextNode):
            # Text nodes get cleaned if cleaning is enabled
            text = node.text()
            if text:
                if self.clean:
                    text = self._clean_text_content(text)  # Clean non-table text
                if self.include_metadata and node.metadata:
                    text = self._annotate_with_metadata(text, node.metadata)
                parts.append(text)

        elif isinstance(node, ParagraphNode):
            # Extract paragraph as unified text to maintain flow of inline elements
            text = node.text()
            if text:
                if self.clean:
                    text = self._clean_text_content(text)
                if self.include_metadata and node.metadata:
                    text = self._annotate_with_metadata(text, node.metadata)
                parts.append(text)
            # Don't process children since we already got the paragraph text
            return

        else:
            # Check if this looks like a bullet point container that should flow together
            if self._is_bullet_point_container(node):
                # Extract text from bullet point children and join with spaces (not newlines)
                bullet_parts = []
                for child in node.children:
                    child_text = child.text() if hasattr(child, 'text') else ""
                    if child_text and child_text.strip():
                        bullet_parts.append(child_text.strip())

                if bullet_parts:
                    # Join with spaces for bullet points
                    text = ' '.join(bullet_parts)
                    if self.clean:
                        text = self._clean_text_content(text)
                    if self.include_metadata and node.metadata:
                        text = self._annotate_with_metadata(text, node.metadata)
                    parts.append(text)
                # Don't process children since we already got the unified text
                return

            # For other nodes, extract text content and clean if appropriate
            if hasattr(node, 'content') and isinstance(node.content, str):
                text = node.content
                if text and text.strip():
                    if self.clean:
                        text = self._clean_text_content(text)  # Clean non-table text
                    if self.include_metadata and node.metadata:
                        text = self._annotate_with_metadata(text, node.metadata)
                    parts.append(text)

        # Process children
        for child in node.children:
            self._extract_from_node(child, parts, depth + 1)

    def _extract_heading(self, node: HeadingNode, parts: List[str], depth: int):
        """Extract heading with optional structure markers."""
        text = node.text()
        if not text:
            return

        if self.preserve_structure:
            # Add structure markers (markdown-style '#' repeated per level)
            marker = '#' * node.level
            text = f"{marker} {text}"

        if self.include_metadata and node.metadata:
            text = self._annotate_with_metadata(text, node.metadata)

        parts.append(text)

    def _extract_table(self, table: TableNode, parts: List[str]):
        """Extract table content - preserve original formatting like old parser."""
        if self.preserve_structure:
            parts.append("[TABLE START]")

        # Add table caption if present
        if table.caption:
            caption_text = table.caption
            if self.clean:
                caption_text = self._clean_text_content(caption_text)  # Clean caption but not table content
            if self.preserve_structure:
                parts.append(f"Caption: {caption_text}")
            else:
                parts.append(caption_text)

        # Extract table text - PRESERVE FORMATTING (like old parser's TableBlock.get_text())
        table_text = table.text()
        if table_text:
            # Tables render their own formatting - don't apply text cleaning to preserve alignment
            parts.append(table_text)  # Keep original spacing and alignment

        if self.preserve_structure:
            parts.append("[TABLE END]")

    def _annotate_with_metadata(self, text: str, metadata: dict) -> str:
        """Add metadata annotations (XBRL tag, section, semantic type) as a prefix."""
        annotations = []

        # Add XBRL annotations
        if 'ix_tag' in metadata:
            annotations.append(f"[XBRL: {metadata['ix_tag']}]")

        # Add section annotations
        if 'section_name' in metadata:
            annotations.append(f"[Section: {metadata['section_name']}]")

        # Add semantic type
        if 'semantic_type' in metadata:
            annotations.append(f"[Type: {metadata['semantic_type']}]")

        if annotations:
            return f"{' '.join(annotations)} {text}"

        return text

    def _clean_text_content(self, text: str) -> str:
        """Clean regular text content (not tables) - like old parser text cleaning."""
        if not text:
            return text

        # Replace multiple spaces with single space for regular text
        text = re.sub(r' {2,}', ' ', text)

        # Clean up space around newlines
        text = re.sub(r' *\n *', '\n', text)

        # Remove leading/trailing whitespace from lines
        lines = text.split('\n')
        lines = [line.strip() for line in lines]
        text = '\n'.join(lines)

        # Normalize quotes and dashes
        text = self._normalize_punctuation(text)

        return text

    def _is_bullet_point_container(self, node) -> bool:
        """Check if a container node represents a bullet point that should flow as one line."""
        from edgar.documents.nodes import ContainerNode

        if not isinstance(node, ContainerNode):
            return False

        # Must have at least 2 children (bullet + content)
        if len(node.children) < 2:
            return False

        # Get the text of all children to check for bullet patterns
        all_text = node.text()
        if not all_text:
            return False

        # Check if starts with common bullet characters
        bullet_chars = ['•', '●', '▪', '▫', '◦', '‣', '-', '*']
        starts_with_bullet = any(all_text.strip().startswith(char) for char in bullet_chars)

        if not starts_with_bullet:
            return False

        # Check if container has flex display (common for bullet point layouts)
        if hasattr(node, 'style') and node.style and hasattr(node.style, 'display'):
            if node.style.display == 'flex':
                return True

        # Check if it has bullet-like structure: short first child + longer content
        if len(node.children) >= 2:
            first_child_text = node.children[0].text() if hasattr(node.children[0], 'text') else ""
            second_child_text = node.children[1].text() if hasattr(node.children[1], 'text') else ""

            # First child is very short (likely bullet), second is longer (content)
            if len(first_child_text.strip()) <= 3 and len(second_child_text.strip()) > 10:
                return True

        return False

    def _clean_document_text(self, text: str) -> str:
        """Apply minimal document-level cleaning that preserves table formatting."""
        if not text:
            return text

        # Only apply global formatting that doesn't affect table alignment:

        # Replace excessive newlines (4+ consecutive) with triple newline
        text = re.sub(r'\n{4,}', '\n\n\n', text)

        # Remove empty lines at start/end only
        text = text.strip()

        return text

    def _normalize_punctuation(self, text: str) -> str:
        """Normalize punctuation for cleaner text.

        Bug fixes vs. the previous revision:
        - The curly-quote replacements had been encoding-mangled into
          no-ops; unicode escapes are used so the intent survives any
          source-encoding round trip.
        - The "one space after punctuation" rule used ``\\s*`` and thus
          inserted spaces *inside* numbers ("1,234.5" -> "1, 234. 5"),
          which corrupts financial-filing text. It now requires at least
          one whitespace character (``\\s+``) to already follow.
        """
        # Normalize curly quotes to straight ASCII quotes
        text = text.replace('\u201c', '"').replace('\u201d', '"')
        text = text.replace('\u2018', "'").replace('\u2019', "'")

        # Normalize dashes
        text = text.replace('\u2014', ' - ')  # em dash
        text = text.replace('\u2013', ' - ')  # en dash

        # Fix spacing around punctuation: no space before, one space after
        # (only when whitespace already follows - keeps "1,234.5" intact)
        text = re.sub(r'\s+([.,;!?])', r'\1', text)
        text = re.sub(r'([.,;!?])\s+', r'\1 ', text)

        # Remove extra spaces
        text = re.sub(r' {2,}', ' ', text)

        return text.strip()

    def _truncate_text(self, text: str, max_length: int) -> str:
        """Truncate text intelligently at sentence, line, or word boundaries."""
        if len(text) <= max_length:
            return text

        # Try to truncate at sentence boundary
        truncated = text[:max_length]
        last_period = truncated.rfind('.')
        last_newline = truncated.rfind('\n')

        # Choose the better truncation point
        truncate_at = max(last_period, last_newline)
        if truncate_at > max_length * 0.8:  # If we found a good boundary
            return text[:truncate_at + 1].strip()

        # Otherwise truncate at word boundary
        last_space = truncated.rfind(' ')
        if last_space > max_length * 0.9:
            return text[:last_space].strip() + '...'

        # Last resort: hard truncate
        return text[:max_length - 3].strip() + '...'
|
||||
@@ -0,0 +1,178 @@
|
||||
"""
|
||||
TOC-based section detection strategy.
|
||||
|
||||
Detects sections using Table of Contents structure. Provides highest
|
||||
confidence (0.95) and includes full text extraction capabilities.
|
||||
|
||||
This detector wraps SECSectionExtractor which has proven implementations of:
|
||||
- Multi-column TOC support (checks all preceding table cells)
|
||||
- Nested anchor handling (traverses up to find content container)
|
||||
- Full section text extraction
|
||||
"""
|
||||
|
||||
import logging
|
||||
from typing import Dict, Optional
|
||||
|
||||
from edgar.documents.document import Document, Section
|
||||
from edgar.documents.nodes import SectionNode
|
||||
from edgar.documents.extractors.toc_section_extractor import SECSectionExtractor
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class TOCSectionDetector:
    """
    TOC-based section detection strategy.

    Uses Table of Contents structure to identify section boundaries and
    extract full section content. Provides high confidence (0.95) detection.

    This implementation wraps the proven SECSectionExtractor which includes:
    - Multi-column TOC support for edge cases like Morgan Stanley
    - Nested anchor handling for sections with no sibling content
    - Complete text extraction with proper boundary detection
    """

    def __init__(self, document: Document):
        """
        Initialize TOC-based detector.

        Args:
            document: Document to analyze (must have metadata.original_html)
        """
        self.document = document
        # SECSectionExtractor performs its TOC analysis eagerly at construction.
        self.extractor = SECSectionExtractor(document)

    def detect(self) -> Optional[Dict[str, Section]]:
        """
        Detect sections using TOC structure.

        Returns:
            Dictionary mapping section names to Section objects, or None if unavailable

        Note:
            Requires document.metadata.original_html to be available.
            Returns None if HTML is not available or no sections found.
        """
        # Check if original HTML is available
        html_content = getattr(self.document.metadata, 'original_html', None)
        if not html_content:
            logger.debug("TOC detection unavailable: original_html not in document metadata")
            return None

        try:
            # Get available sections from TOC
            available = self.extractor.get_available_sections()
            if not available:
                logger.debug("No sections found in TOC")
                return None

            sections = {}

            # Extract each section
            for section_name in available:
                # Get section metadata first to check for subsections
                section_info = self.extractor.get_section_info(section_name)
                if not section_info:
                    logger.debug(f"Skipping {section_name}: no section info")
                    continue

                # Get section text (may be empty for container sections)
                section_text = self.extractor.get_section_text(section_name, include_subsections=True)

                # Check if this section has subsections (list; truthy when non-empty)
                has_subsections = section_info.get('subsections', [])

                if not section_text and not has_subsections:
                    # Skip only if no text AND no subsections
                    logger.debug(f"Skipping {section_name}: no text and no subsections")
                    continue

                # Create section node (placeholder - actual content extracted lazily)
                section_node = SectionNode(section_name=section_name)

                # For container sections (Item 1, Item 10), text will include all subsections
                section_length = len(section_text) if section_text else 0

                # Create text extractor callback for lazy loading.
                # Factory function avoids the classic late-binding-closure
                # pitfall of capturing the loop variable directly.
                def make_text_extractor(extractor, name):
                    """Create a closure that captures extractor and section name."""
                    def extract_text(section_name=None, **kwargs):
                        # Use captured name, ignore passed section_name
                        clean = kwargs.get('clean', True)
                        return extractor.get_section_text(name, include_subsections=True, clean=clean) or ""
                    return extract_text

                # Parse section name to extract part and item identifiers
                part, item = Section.parse_section_name(section_name)

                # Create Section with TOC confidence
                section = Section(
                    name=section_name,
                    title=section_info.get('canonical_name', section_name),
                    node=section_node,
                    start_offset=0,  # Would need actual offsets from parsing
                    end_offset=section_length,
                    confidence=0.95,  # TOC-based = high confidence
                    detection_method='toc',
                    part=part,
                    item=item,
                    _text_extractor=make_text_extractor(self.extractor, section_name)
                )

                sections[section_name] = section

            if sections:
                logger.info(f"TOC detection found {len(sections)} sections")
                return sections

            return None

        except Exception as e:
            # Best-effort strategy: any failure degrades to "no TOC sections"
            # so callers can fall back to pattern-based detection.
            logger.warning(f"TOC detection failed: {e}", exc_info=True)
            return None
|
||||
|
||||
|
||||
def get_section_text(document: Document, section_name: str) -> Optional[str]:
    """
    Get section text using TOC-based extraction.

    Args:
        document: Document to extract from
        section_name: Section name (e.g., 'Item 1', 'Item 1A')

    Returns:
        Section text if available, None otherwise
    """
    # Without the original HTML there is nothing for the extractor to analyze.
    if getattr(document.metadata, 'original_html', None) is None or not getattr(document.metadata, 'original_html', None):
        return None

    try:
        return SECSectionExtractor(document).get_section_text(section_name)
    except Exception as e:
        logger.warning(f"Failed to get section text for {section_name}: {e}")
        return None
|
||||
|
||||
|
||||
def get_available_sections(document: Document) -> list[str]:
    """
    Get list of available sections from TOC.

    Args:
        document: Document to analyze

    Returns:
        List of section names found in TOC
    """
    # No original HTML means TOC analysis cannot run at all.
    if not getattr(document.metadata, 'original_html', None):
        return []

    try:
        return SECSectionExtractor(document).get_available_sections()
    except Exception as e:
        logger.warning(f"Failed to get available sections: {e}")
        return []
|
||||
@@ -0,0 +1,383 @@
|
||||
"""
|
||||
Section extraction for SEC filings using Table of Contents analysis.
|
||||
|
||||
This system uses TOC structure to extract specific sections like "Item 1",
|
||||
"Item 1A", etc. from SEC filings. This approach works consistently across
|
||||
all SEC filings regardless of whether they use semantic anchors or generated IDs.
|
||||
"""
|
||||
import re
|
||||
from typing import Dict, List, Optional, Tuple, Set
|
||||
from dataclasses import dataclass
|
||||
from lxml import html as lxml_html
|
||||
|
||||
from edgar.documents.nodes import Node, SectionNode
|
||||
from edgar.documents.document import Document
|
||||
from edgar.documents.utils.toc_analyzer import TOCAnalyzer
|
||||
|
||||
|
||||
@dataclass
class SectionBoundary:
    """Represents the boundaries of a document section.

    Start/end may be expressed three ways depending on the detection path:
    HTML element ids, parsed document nodes, or character offsets into the
    full text. Fields not populated by a given detector remain None.
    """
    name: str  # Canonical section name, e.g. "Item 1A"
    anchor_id: str  # HTML id of the element where the section starts
    start_element_id: Optional[str] = None  # Explicit start element id, if known
    end_element_id: Optional[str] = None  # Id of the next section's anchor (exclusive end)
    start_node: Optional[Node] = None  # Parsed-tree node at section start
    end_node: Optional[Node] = None  # Parsed-tree node at section end
    text_start: Optional[int] = None  # Character position in full text
    text_end: Optional[int] = None
    confidence: float = 1.0  # Detection confidence (0.0-1.0)
    detection_method: str = 'unknown'  # How section was detected
|
||||
|
||||
|
||||
class SECSectionExtractor:
|
||||
"""
|
||||
Extract specific sections from SEC filings using Table of Contents analysis.
|
||||
|
||||
This uses TOC structure to identify section boundaries and extract content
|
||||
between them. Works consistently for all SEC filings.
|
||||
"""
|
||||
|
||||
def __init__(self, document: Document):
    """Initialize the extractor and eagerly analyze section boundaries.

    Args:
        document: Parsed document; must carry metadata.original_html for
            TOC analysis to find anything (otherwise no sections exist).
    """
    self.document = document
    self.section_map = {}  # Maps section names to canonical names
    self.section_boundaries = {}  # Maps section names to boundaries
    self.toc_analyzer = TOCAnalyzer()
    # Populate section_map/section_boundaries up front.
    self._analyze_sections()
|
||||
|
||||
def _analyze_sections(self) -> None:
    """
    Analyze the document using TOC structure to identify section boundaries.

    This creates a map of section names to their anchor positions using
    Table of Contents analysis, which works for all SEC filings.

    Populates self.section_boundaries and self.section_map in place;
    silently does nothing when original HTML or a TOC is unavailable.
    """
    # Get the original HTML if available
    html_content = getattr(self.document.metadata, 'original_html', None)
    if not html_content:
        return

    # Use TOC analysis to find sections
    toc_mapping = self.toc_analyzer.analyze_toc_structure(html_content)

    if not toc_mapping:
        return  # No sections found

    # Handle XML declaration issues
    # (lxml refuses str input that carries an <?xml ...?> declaration)
    if html_content.startswith('<?xml'):
        html_content = re.sub(r'<\?xml[^>]*\?>', '', html_content, count=1)

    tree = lxml_html.fromstring(html_content)

    sec_sections = {}

    for section_name, anchor_id in toc_mapping.items():
        # Verify the anchor target exists before trusting the TOC entry
        target_elements = tree.xpath(f'//*[@id="{anchor_id}"]')
        if target_elements:
            element = target_elements[0]

            # Use TOC-based section info
            section_type, order = self.toc_analyzer._get_section_type_and_order(section_name)

            sec_sections[section_name] = {
                'anchor_id': anchor_id,
                'element': element,
                'canonical_name': section_name,
                'type': section_type,
                'order': order,
                'confidence': 0.95,  # TOC-based detection = high confidence
                'detection_method': 'toc'  # Method: Table of Contents
            }

    if not sec_sections:
        return  # No valid sections found

    # Sort sections by their logical order
    sorted_sections = sorted(sec_sections.items(), key=lambda x: x[1]['order'])

    # Calculate section boundaries: each section ends where the next
    # (in logical order) begins; the last has an open end.
    for i, (section_name, section_data) in enumerate(sorted_sections):
        start_anchor = section_data['anchor_id']

        # End boundary is the start of the next section (if any)
        end_anchor = None
        if i + 1 < len(sorted_sections):
            next_section = sorted_sections[i + 1][1]
            end_anchor = next_section['anchor_id']

        self.section_boundaries[section_name] = SectionBoundary(
            name=section_name,
            anchor_id=start_anchor,
            end_element_id=end_anchor,
            confidence=section_data.get('confidence', 0.95),
            detection_method=section_data.get('detection_method', 'toc')
        )

    self.section_map = {name: data['canonical_name'] for name, data in sec_sections.items()}
|
||||
|
||||
|
||||
|
||||
def get_available_sections(self) -> List[str]:
|
||||
"""
|
||||
Get list of available sections that can be extracted.
|
||||
|
||||
Returns:
|
||||
List of section names
|
||||
"""
|
||||
return sorted(self.section_boundaries.keys(),
|
||||
key=lambda x: self.section_boundaries[x].anchor_id)
|
||||
|
||||
def get_section_text(self, section_name: str,
                     include_subsections: bool = True,
                     clean: bool = True) -> Optional[str]:
    """
    Extract text content for a specific section.

    Args:
        section_name: Name of section (e.g., "Item 1", "Item 1A", "Part I")
        include_subsections: Whether to include subsections
        clean: Whether to apply text cleaning

    Returns:
        Section text content or None if section not found
    """
    # Normalize section name ("item 1a." -> "Item 1A")
    normalized_name = self._normalize_section_name(section_name)

    if normalized_name not in self.section_boundaries:
        return None

    boundary = self.section_boundaries[normalized_name]

    # Extract content between boundaries using HTML parsing
    html_content = getattr(self.document.metadata, 'original_html', None)
    if not html_content:
        return None

    try:
        section_text = self._extract_section_content(html_content, boundary, include_subsections, clean)

        # If no direct content but include_subsections=True, aggregate subsection text.
        # Handles container sections (e.g. "Item 1") whose anchor carries no
        # content of its own, only child items such as "Item 1A".
        if not section_text and include_subsections:
            subsections = self._get_subsections(normalized_name)
            if subsections:
                # Recursively get text from all subsections
                subsection_texts = []
                for subsection_name in subsections:
                    subsection_text = self.get_section_text(subsection_name, include_subsections=True, clean=clean)
                    if subsection_text:
                        subsection_texts.append(subsection_text)

                if subsection_texts:
                    section_text = '\n\n'.join(subsection_texts)

        return section_text
    except Exception as e:
        # Fallback to simple text extraction.
        # NOTE(review): exception deliberately swallowed (best-effort API);
        # `e` is retained to ease debugging in a breakpoint.
        return self._extract_section_fallback(section_name, clean)
|
||||
|
||||
def _normalize_section_name(self, section_name: str) -> str:
|
||||
"""Normalize section name for lookup."""
|
||||
# Handle common variations
|
||||
name = section_name.strip()
|
||||
|
||||
# "Item 1" vs "Item 1." vs "Item 1:"
|
||||
name = re.sub(r'[.:]$', '', name)
|
||||
|
||||
# Case normalization
|
||||
if re.match(r'item\s+\d+', name, re.IGNORECASE):
|
||||
match = re.match(r'item\s+(\d+[a-z]?)', name, re.IGNORECASE)
|
||||
if match:
|
||||
name = f"Item {match.group(1).upper()}"
|
||||
elif re.match(r'part\s+[ivx]+', name, re.IGNORECASE):
|
||||
match = re.match(r'part\s+([ivx]+)', name, re.IGNORECASE)
|
||||
if match:
|
||||
name = f"Part {match.group(1).upper()}"
|
||||
|
||||
return name
|
||||
|
||||
def _extract_section_content(self, html_content: str, boundary: SectionBoundary,
                             include_subsections: bool, clean: bool) -> str:
    """
    Extract section content from HTML between anchors.

    Args:
        html_content: Full HTML content
        boundary: Section boundary info
        include_subsections: Whether to include subsections
        clean: Whether to clean the text

    Returns:
        Extracted section text
    """
    # Handle XML declaration issues (lxml rejects str input carrying one)
    if html_content.startswith('<?xml'):
        html_content = re.sub(r'<\?xml[^>]*\?>', '', html_content, count=1)

    tree = lxml_html.fromstring(html_content)

    # Find start element
    start_elements = tree.xpath(f'//*[@id="{boundary.anchor_id}"]')
    if not start_elements:
        return ""

    start_element = start_elements[0]

    # Collect content until we hit the end boundary (if specified)
    content_elements = []

    # If anchor has no siblings (nested in empty container), traverse up to find content container
    # This handles cases like <div id="item7"><div></div></div> where content is after the container
    current = start_element.getnext()
    if current is None:
        # No sibling - traverse up to find a container with siblings
        container = start_element.getparent()
        while container is not None and container.getnext() is None:
            container = container.getparent()

        # Start from the container's next sibling if found
        if container is not None:
            current = container.getnext()

    # Collect content from siblings
    if current is not None:
        # Normal case - anchor has siblings
        while current is not None:
            # Check if we've reached the end boundary.
            # NOTE(review): the sibling-section stop below only applies when an
            # end boundary id exists (current_id is only computed here) —
            # confirm this matches the intended behavior for the last section.
            if boundary.end_element_id:
                current_id = current.get('id', '')
                if current_id == boundary.end_element_id:
                    break

                # Also check if this is a sibling section we should stop at
                if not include_subsections and self._is_sibling_section(current_id, boundary.name):
                    break

            content_elements.append(current)
            current = current.getnext()

    # Extract text from collected elements
    section_texts = []
    for element in content_elements:
        text = self._extract_element_text(element)
        if text.strip():
            section_texts.append(text)

    combined_text = '\n\n'.join(section_texts)

    # Apply cleaning if requested
    if clean:
        combined_text = self._clean_section_text(combined_text)

    return combined_text
|
||||
|
||||
def _is_sibling_section(self, element_id: str, current_section: str) -> bool:
|
||||
"""Check if element ID represents a sibling section."""
|
||||
if not element_id:
|
||||
return False
|
||||
|
||||
# Check if this looks like another item at the same level
|
||||
if 'item' in current_section.lower() and 'item' in element_id.lower():
|
||||
current_item = re.search(r'item\s*(\d+)', current_section, re.IGNORECASE)
|
||||
other_item = re.search(r'item[\s_]*(\d+)', element_id, re.IGNORECASE)
|
||||
|
||||
if current_item and other_item:
|
||||
return current_item.group(1) != other_item.group(1)
|
||||
|
||||
return False
|
||||
|
||||
def _extract_element_text(self, element) -> str:
|
||||
"""Extract clean text from an HTML element."""
|
||||
# This would integrate with your existing text extraction logic
|
||||
# For now, simple text extraction
|
||||
return element.text_content() or ""
|
||||
|
||||
def _clean_section_text(self, text: str) -> str:
    """Normalize whitespace and strip navigation links from section text."""
    from edgar.documents.utils.anchor_cache import filter_with_cached_patterns

    # Collapse runs of three-or-more newlines (with interior whitespace)
    # down to a single blank line.
    cleaned = re.sub(r'\n\s*\n\s*\n', '\n\n', text)

    # Navigation-link filtering needs the original HTML; skip when absent.
    source_html = getattr(self.document.metadata, 'original_html', None)
    if source_html:
        cleaned = filter_with_cached_patterns(cleaned, source_html)

    return cleaned.strip()
|
||||
|
||||
def _extract_section_fallback(self, section_name: str, clean: bool) -> Optional[str]:
|
||||
"""
|
||||
Fallback section extraction using document nodes.
|
||||
|
||||
This is used when HTML-based extraction fails.
|
||||
"""
|
||||
# Search through document sections
|
||||
for name, section in self.document.sections.items():
|
||||
if section_name.lower() in name.lower():
|
||||
return section.text(clean=clean)
|
||||
|
||||
return None
|
||||
|
||||
def get_section_info(self, section_name: str) -> Optional[Dict]:
    """
    Look up metadata for a named section.

    Args:
        section_name: Section name to look up

    Returns:
        Dict describing the section, or None when it is unknown.
    """
    key = self._normalize_section_name(section_name)
    boundary = self.section_boundaries.get(key)
    if boundary is None:
        return None

    return {
        'name': boundary.name,
        'anchor_id': boundary.anchor_id,
        'available': True,
        'estimated_length': None,  # not computed; reserved for future use
        'subsections': self._get_subsections(key),
    }
|
||||
|
||||
def _get_subsections(self, parent_section: str) -> List[str]:
|
||||
"""
|
||||
Get subsections of a parent section.
|
||||
|
||||
For example:
|
||||
- "Item 1" has subsections "Item 1A", "Item 1B" (valid)
|
||||
- "Item 1" does NOT have subsection "Item 10" (invalid - different item)
|
||||
"""
|
||||
subsections = []
|
||||
|
||||
# Look for sections that start with the parent name
|
||||
for section_name in self.section_boundaries:
|
||||
if section_name == parent_section:
|
||||
continue
|
||||
|
||||
if section_name.startswith(parent_section):
|
||||
# Check if this is a true subsection (e.g., Item 1A)
|
||||
# vs a different section that happens to start with same prefix (e.g., Item 10)
|
||||
remainder = section_name[len(parent_section):]
|
||||
|
||||
# Valid subsection patterns:
|
||||
# - "Item 1A" (remainder: "A") - letter suffix
|
||||
# - "Item 1 - Business" (remainder: " - Business") - has separator
|
||||
# Invalid patterns:
|
||||
# - "Item 10" (remainder: "0") - digit continues the number
|
||||
|
||||
if remainder and remainder[0].isalpha():
|
||||
# Letter suffix like "A", "B" - valid subsection
|
||||
subsections.append(section_name)
|
||||
elif remainder and remainder[0] in [' ', '-', '.', ':']:
|
||||
# Has separator - could be descriptive title
|
||||
subsections.append(section_name)
|
||||
# If remainder starts with digit, it's NOT a subsection (e.g., "Item 10")
|
||||
|
||||
return sorted(subsections)
|
||||
318
venv/lib/python3.10/site-packages/edgar/documents/migration.py
Normal file
318
venv/lib/python3.10/site-packages/edgar/documents/migration.py
Normal file
@@ -0,0 +1,318 @@
|
||||
"""
|
||||
Migration and compatibility layer for transitioning from old parser to new.
|
||||
|
||||
NOTE: This compatibility layer is documented for user migration from v1.x → v2.0
|
||||
It is intentionally not used internally but kept for user convenience.
|
||||
Do not remove without versioning consideration.
|
||||
"""
|
||||
|
||||
from typing import Optional, List, Dict, Any
|
||||
import warnings
|
||||
from edgar.documents import HTMLParser, Document, ParserConfig
|
||||
from edgar.documents.search import DocumentSearch
|
||||
|
||||
|
||||
class LegacyHTMLDocument:
    """
    Compatibility wrapper that mimics the old Document API.

    This allows existing code to work with the new parser
    while providing deprecation warnings.
    """

    def __init__(self, new_document: Document):
        """Initialize with new document."""
        # Wrapped v2 Document; every legacy accessor delegates to it.
        self._doc = new_document
        # Gate for deprecation warnings on legacy accessors.
        self._warn_on_use = True

    def _deprecation_warning(self, old_method: str, new_method: str = None):
        """Issue deprecation warning."""
        if self._warn_on_use:
            msg = f"Document.{old_method} is deprecated."
            if new_method:
                msg += f" Use {new_method} instead."
            # stacklevel=3 attributes the warning to the caller of the
            # legacy accessor, skipping this helper and the accessor itself.
            warnings.warn(msg, DeprecationWarning, stacklevel=3)

    @property
    def text(self) -> str:
        """Get document text (old API exposed text as a property)."""
        self._deprecation_warning("text", "Document.text()")
        return self._doc.text()

    def get_text(self, clean: bool = True) -> str:
        """Get text with options (old API)."""
        # NOTE(review): `clean` is accepted for signature compatibility but
        # not forwarded to the new API — confirm text() has no equivalent.
        self._deprecation_warning("get_text()", "Document.text()")
        return self._doc.text()

    @property
    def tables(self) -> List[Any]:
        """Get tables (old API)."""
        self._deprecation_warning("tables", "Document.tables")
        return self._doc.tables

    def find_all(self, tag: str) -> List[Any]:
        """Find elements by tag (old API)."""
        self._deprecation_warning("find_all()", "Document.root.find()")

        # Map old tag names to node types
        from edgar.documents.types import NodeType

        tag_map = {
            'h1': NodeType.HEADING,
            'h2': NodeType.HEADING,
            'h3': NodeType.HEADING,
            'p': NodeType.PARAGRAPH,
            'table': NodeType.TABLE,
        }

        node_type = tag_map.get(tag.lower())
        if node_type:
            return self._doc.root.find(lambda n: n.type == node_type)

        # Unknown tags yield no matches rather than raising.
        return []

    def search(self, pattern: str) -> List[str]:
        """Search document (old API)."""
        self._deprecation_warning("search()", "DocumentSearch.search()")

        search = DocumentSearch(self._doc)
        results = search.search(pattern)
        # Old API returned plain strings, not result objects.
        return [r.text for r in results]

    @property
    def sections(self) -> Dict[str, Any]:
        """Get sections (old API)."""
        # Convert new Section objects to the old plain-dict format.
        new_sections = self._doc.sections
        old_sections = {}

        for name, section in new_sections.items():
            old_sections[name] = {
                'title': section.title,
                'text': section.text(),
                'start': section.start_offset,
                'end': section.end_offset
            }

        return old_sections

    def to_markdown(self) -> str:
        """Convert to markdown (old API)."""
        self._deprecation_warning("to_markdown()", "MarkdownRenderer.render()")

        from edgar.documents.renderers import MarkdownRenderer
        renderer = MarkdownRenderer()
        return renderer.render(self._doc)
|
||||
|
||||
|
||||
class LegacySECHTMLParser:
    """
    Compatibility wrapper for old SECHTMLParser.

    Translates old-style dict configuration and delegates to the new
    HTMLParser, wrapping results in LegacyHTMLDocument.
    """

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """Initialize with optional config."""
        self._parser = HTMLParser(self._convert_config(config))
        # Gate for deprecation warnings on parse calls.
        self._warn_on_use = True

    def _convert_config(self, old_config: Optional[Dict[str, Any]]) -> ParserConfig:
        """Convert old config format to new."""
        new_config = ParserConfig()
        if not old_config:
            return new_config

        # Old-key -> new-attribute translation table.
        key_map = {
            'clean_text': 'clean_text',
            'extract_tables': 'table_extraction',
            'preserve_layout': 'preserve_whitespace',
        }
        for old_key, new_attr in key_map.items():
            if old_key in old_config:
                setattr(new_config, new_attr, old_config[old_key])

        return new_config

    def parse(self, html: str) -> LegacyHTMLDocument:
        """Parse HTML (old API)."""
        if self._warn_on_use:
            warnings.warn(
                "SECHTMLParser is deprecated. Use HTMLParser instead.",
                DeprecationWarning,
                stacklevel=2
            )
        return LegacyHTMLDocument(self._parser.parse(html))

    def parse_file(self, filepath: str) -> LegacyHTMLDocument:
        """Parse HTML file (old API)."""
        if self._warn_on_use:
            warnings.warn(
                "SECHTMLParser.parse_file() is deprecated. Use HTMLParser.parse_file() instead.",
                DeprecationWarning,
                stacklevel=2
            )
        return LegacyHTMLDocument(self._parser.parse_file(filepath))
|
||||
|
||||
|
||||
def migrate_parser_usage(code: str) -> str:
    """
    Helper to migrate code from old parser to new.

    Args:
        code: Python code using old parser

    Returns:
        Updated code using new parser

    Applies literal text substitutions for imports, class names, method
    calls and config keyword names. The `document.text` property-to-method
    rewrite uses a regex with a lookahead so that code which already calls
    `document.text()` is left untouched (plain replace would have produced
    `document.text()()`).
    """
    import re

    replacements = [
        # Import statements
        ("from edgar.files.html import SECHTMLParser",
         "from edgar.documents import HTMLParser"),

        ("from edgar.files.html import Document",
         "from edgar.documents import Document"),

        # Class instantiation
        ("SECHTMLParser(", "HTMLParser("),

        # Method calls (applied before the property rewrite below so that
        # the resulting `document.text(` is not rewritten a second time)
        ("document.get_text(", "document.text("),
        ("document.find_all(", "document.root.find(lambda n: n.tag == "),
        ("document.to_markdown(", "MarkdownRenderer().render(document"),

        # Config changes
        ("extract_tables=", "table_extraction="),
        ("preserve_layout=", "preserve_whitespace="),
    ]

    migrated = code
    for old, new in replacements:
        migrated = migrated.replace(old, new)

    # Rewrite the old `document.text` property access into a method call.
    # The word boundary avoids matching longer names (e.g. text_content),
    # and the lookahead skips occurrences that are already calls.
    migrated = re.sub(r'\bdocument\.text\b(?!\s*\()', 'document.text()', migrated)

    return migrated
|
||||
|
||||
|
||||
class MigrationGuide:
    """
    Provides migration guidance and utilities.
    """

    @staticmethod
    def check_compatibility(old_parser_instance) -> Dict[str, Any]:
        """
        Check if old parser instance can be migrated.

        Returns:
            Dict with compatibility info
        """
        # Migration is always possible today; the argument is kept for
        # future per-instance compatibility checks.
        return {
            'can_migrate': True,
            'warnings': [],
            'recommendations': [
                "Replace SECHTMLParser with HTMLParser",
                "Update document.text to document.text()",
                "Use DocumentSearch for search functionality",
                "Use MarkdownRenderer for markdown conversion"
            ]
        }

    @staticmethod
    def print_migration_guide():
        """Print migration guide to stdout."""
        guide = """
HTML Parser Migration Guide
==========================

The new HTML parser provides significant improvements:
- 10x performance improvement
- Better table parsing
- Reliable section detection
- Advanced search capabilities

Key Changes:
-----------

1. Imports:
OLD: from edgar.files.html import SECHTMLParser, Document
NEW: from edgar.documents import HTMLParser, Document

2. Parser Creation:
OLD: parser = SECHTMLParser()
NEW: parser = HTMLParser()

3. Document Text:
OLD: document.text or document.get_text()
NEW: document.text()

4. Search:
OLD: document.search(pattern)
NEW: search = DocumentSearch(document)
results = search.search(pattern)

5. Tables:
OLD: document.tables
NEW: document.tables (same, but returns richer TableNode objects)

6. Sections:
OLD: document.sections
NEW: document.sections (returns Section objects with more features)

7. Markdown:
OLD: document.to_markdown()
NEW: renderer = MarkdownRenderer()
markdown = renderer.render(document)

Compatibility:
-------------

For gradual migration, use the compatibility layer:

from edgar.documents.migration import LegacySECHTMLParser
parser = LegacySECHTMLParser() # Works like old parser

This will issue deprecation warnings to help you migrate.

Performance Config:
------------------

For best performance:
parser = HTMLParser.create_for_performance()

For best accuracy:
parser = HTMLParser.create_for_accuracy()

For AI/LLM processing:
parser = HTMLParser.create_for_ai()
"""

        print(guide)
|
||||
|
||||
|
||||
# Compatibility aliases: let old import names resolve to the legacy
# wrappers so `from edgar.documents.migration import SECHTMLParser` works.
SECHTMLParser = LegacySECHTMLParser
HTMLDocument = LegacyHTMLDocument
|
||||
|
||||
|
||||
# Auto-migration for common imports
|
||||
def __getattr__(name):
    """Provide compatibility imports with warnings."""
    if name != "SECHTMLParser":
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")

    warnings.warn(
        "Importing SECHTMLParser from edgar.documents.migration is deprecated. "
        "Use HTMLParser from edgar.documents instead.",
        DeprecationWarning,
        stacklevel=2
    )
    return LegacySECHTMLParser
|
||||
@@ -0,0 +1,160 @@
|
||||
"""
|
||||
Example showing how to migrate from old parser to new.
|
||||
"""
|
||||
|
||||
def old_parser_example():
    """Example using old parser API (via the compatibility layer)."""

    # This is how code might look with the old parser
    from edgar.documents.migration import SECHTMLParser  # Using compatibility layer

    # Create parser with an old-style dict config; the wrapper translates
    # these keys to the new ParserConfig attributes.
    parser = SECHTMLParser({
        'extract_tables': True,
        'clean_text': True,
        'preserve_layout': False
    })

    # Parse HTML
    html = """
<html>
<body>
<h1>Item 1. Business</h1>
<p>We are a technology company.</p>

<table>
<tr><th>Year</th><th>Revenue</th></tr>
<tr><td>2023</td><td>$100M</td></tr>
</table>
</body>
</html>
"""

    document = parser.parse(html)

    # Old API usage (will show deprecation warnings)

    # Search (result discarded; demo only)
    document.search("revenue")

    # Convert to markdown (result discarded; demo only)
    document.to_markdown()
|
||||
|
||||
|
||||
def new_parser_example():
    """Example using new parser API."""

    # New imports
    from edgar.documents import DocumentSearch, HTMLParser, ParserConfig
    from edgar.documents.renderers import MarkdownRenderer

    # Create parser with new config
    config = ParserConfig(
        table_extraction=True,
        clean_text=True,
        preserve_whitespace=False,
        detect_sections=True
    )

    parser = HTMLParser(config)

    # Parse HTML
    html = """
<html>
<body>
<h1>Item 1. Business</h1>
<p>We are a technology company.</p>

<table>
<tr><th>Year</th><th>Revenue</th></tr>
<tr><td>2023</td><td>$100M</td></tr>
</table>
</body>
</html>
"""

    document = parser.parse(html)

    # New API usage

    # Search with new API (results discarded; demo only)
    search = DocumentSearch(document)
    search.search("revenue")

    # Convert to markdown with new API
    renderer = MarkdownRenderer()
    renderer.render(document)

    # New features not available in old parser

    # Advanced search
    search.find_tables(caption_pattern="Revenue")

    # Performance-optimized parser
    HTMLParser.create_for_performance()

    # Cache statistics
    from edgar.documents.utils import get_cache_manager
    get_cache_manager().get_stats()
|
||||
|
||||
|
||||
def migration_comparison():
    """Show side-by-side comparison."""
    # TODO: implement — currently a stub; invoked from __main__ as a no-op.
|
||||
|
||||
|
||||
|
||||
|
||||
def automatic_migration_example():
    """Show automatic code migration."""

    from edgar.documents.migration import migrate_parser_usage

    # Representative old-style code to run through the migrator.
    old_code = '''
from edgar.files.html import SECHTMLParser, Document

def analyze_filing(html):
    parser = SECHTMLParser({'extract_tables': True})
    document = parser.parse(html)

    # Get text
    text = document.text

    # Search for revenue
    revenue_mentions = document.search("revenue")

    # Convert to markdown
    markdown = document.to_markdown()

    return {
        'text': text,
        'revenue_mentions': revenue_mentions,
        'markdown': markdown
    }
'''

    # Result discarded; demo only.
    migrate_parser_usage(old_code)
|
||||
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Run examples
    import warnings

    # Show deprecation warnings (Python hides repeats by default)
    warnings.filterwarnings('always', category=DeprecationWarning)

    # Run old parser example (will show warnings)
    old_parser_example()

    # Run new parser example
    new_parser_example()

    # Show comparison
    migration_comparison()

    # Show automatic migration
    automatic_migration_example()

    # Print full migration guide
    from edgar.documents.migration import MigrationGuide
    MigrationGuide.print_migration_guide()
|
||||
456
venv/lib/python3.10/site-packages/edgar/documents/nodes.py
Normal file
456
venv/lib/python3.10/site-packages/edgar/documents/nodes.py
Normal file
@@ -0,0 +1,456 @@
|
||||
"""
|
||||
Node hierarchy for the document tree.
|
||||
"""
|
||||
|
||||
import uuid
|
||||
from abc import ABC, abstractmethod
|
||||
from dataclasses import dataclass, field
|
||||
from typing import List, Optional, Dict, Any, Callable, Iterator
|
||||
|
||||
from edgar.documents.types import NodeType, SemanticType, Style
|
||||
from edgar.documents.cache_mixin import CacheableMixin
|
||||
|
||||
|
||||
@dataclass
class Node(ABC):
    """
    Base node class for document tree.

    All nodes in the document inherit from this class and implement
    the abstract methods for text and HTML generation.
    """

    # Identity
    id: str = field(default_factory=lambda: str(uuid.uuid4()))  # unique per node
    type: NodeType = NodeType.DOCUMENT

    # Hierarchy (parent/children excluded from repr to avoid recursion)
    parent: Optional['Node'] = field(default=None, repr=False)
    children: List['Node'] = field(default_factory=list, repr=False)

    # Content
    content: Any = None
    metadata: Dict[str, Any] = field(default_factory=dict)
    style: Style = field(default_factory=Style)

    # Semantic info
    semantic_type: Optional[SemanticType] = None
    semantic_role: Optional[str] = None

    def add_child(self, child: 'Node') -> None:
        """Add child node, maintaining parent reference."""
        child.parent = self
        self.children.append(child)

    def remove_child(self, child: 'Node') -> None:
        """Remove child node (no-op if not a child) and clear its parent."""
        if child in self.children:
            self.children.remove(child)
            child.parent = None

    def insert_child(self, index: int, child: 'Node') -> None:
        """Insert child at specific index, maintaining parent reference."""
        child.parent = self
        self.children.insert(index, child)

    @abstractmethod
    def text(self) -> str:
        """Extract text content from node and its children."""
        pass

    @abstractmethod
    def html(self) -> str:
        """Generate HTML representation of node."""
        pass

    def find(self, predicate: Callable[['Node'], bool]) -> List['Node']:
        """Find all nodes (including self) matching predicate, depth-first."""
        results = []
        if predicate(self):
            results.append(self)
        for child in self.children:
            results.extend(child.find(predicate))
        return results

    def find_first(self, predicate: Callable[['Node'], bool]) -> Optional['Node']:
        """Find first node (including self) matching predicate, depth-first."""
        if predicate(self):
            return self
        for child in self.children:
            result = child.find_first(predicate)
            if result:
                return result
        return None

    def xpath(self, expression: str) -> List['Node']:
        """
        Simple XPath-like node selection.

        Supports:
        - //node_type - Find all nodes of type (whole subtree)
        - /node_type - Direct children of type

        Attribute predicates ([@attr=value]) are NOT implemented;
        unrecognized expressions return an empty list.
        """
        # Simple implementation - can be extended
        if expression.startswith('//'):
            node_type = expression[2:].lower()
            return self.find(lambda n: n.type.name.lower() == node_type)
        elif expression.startswith('/'):
            node_type = expression[1:].lower()
            return [c for c in self.children if c.type.name.lower() == node_type]
        return []

    def walk(self) -> Iterator['Node']:
        """Walk the tree depth-first, yielding self first."""
        yield self
        for child in self.children:
            yield from child.walk()

    @property
    def depth(self) -> int:
        """Get depth of node in tree (root has depth 0)."""
        depth = 0
        current = self.parent
        while current:
            depth += 1
            current = current.parent
        return depth

    @property
    def path(self) -> str:
        """Get path of node-type names from root to this node, '/'-joined."""
        parts = []
        current = self
        while current:
            parts.append(current.type.name)
            current = current.parent
        return '/'.join(reversed(parts))

    def get_metadata(self, key: str, default: Any = None) -> Any:
        """Get metadata value with default."""
        return self.metadata.get(key, default)

    def set_metadata(self, key: str, value: Any) -> None:
        """Set metadata value."""
        self.metadata[key] = value

    def has_metadata(self, key: str) -> bool:
        """Check if metadata key exists."""
        return key in self.metadata
|
||||
|
||||
|
||||
@dataclass
class DocumentNode(Node, CacheableMixin):
    """Root document node."""
    type: NodeType = field(default=NodeType.DOCUMENT, init=False)

    def text(self) -> str:
        """Extract all text from document with caching."""
        def _build() -> str:
            # Blank-line separated concatenation of non-empty child texts.
            child_texts = [child.text() for child in self.children]
            return '\n\n'.join(t for t in child_texts if t)

        return self._get_cached_text(_build)

    def html(self) -> str:
        """Generate complete HTML document."""
        body_content = '\n'.join(child.html() for child in self.children)
        return f"""<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title>Document</title>
</head>
<body>
{body_content}
</body>
</html>"""
|
||||
|
||||
|
||||
@dataclass
class TextNode(Node):
    """Plain text content node."""
    type: NodeType = field(default=NodeType.TEXT, init=False)
    content: str = ""

    def text(self) -> str:
        """Return text content."""
        return self.content

    def html(self) -> str:
        """Generate HTML for text, escaping markup-significant characters."""
        # Escape HTML entities. As previously written each character was
        # replaced by itself (a no-op), so raw '&', '<', '>' leaked into the
        # generated HTML. '&' must be escaped first so the entities
        # introduced for '<' and '>' are not double-escaped.
        text = self.content
        text = text.replace('&', '&amp;')
        text = text.replace('<', '&lt;')
        text = text.replace('>', '&gt;')
        return text
|
||||
|
||||
|
||||
@dataclass
class ParagraphNode(Node, CacheableMixin):
    """Paragraph node.

    text() reconstructs inter-child spacing heuristically, since inline
    whitespace may have been stripped during HTML parsing.
    """
    type: NodeType = field(default=NodeType.PARAGRAPH, init=False)

    def text(self) -> str:
        """Extract paragraph text with intelligent spacing and caching."""
        def _generate_text():
            parts = []
            for i, child in enumerate(self.children):
                text = child.text()
                if text:
                    # For the first child, just add the text
                    if i == 0:
                        parts.append(text)
                    else:
                        # For subsequent children, check if previous child had tail whitespace
                        prev_child = self.children[i - 1]
                        should_add_space = False

                        # Add space if previous child had tail whitespace
                        if hasattr(prev_child, 'get_metadata') and prev_child.get_metadata('has_tail_whitespace'):
                            should_add_space = True

                        # Add space if current text starts with space (preserve intended spacing)
                        elif text.startswith(' '):
                            should_add_space = True
                            # Remove the leading space from text since we're adding it as separation
                            text = text.lstrip()

                        # Add space if previous text ends with punctuation (sentence boundaries)
                        # NOTE(review): when parts[-1] is all whitespace, rstrip()[-1:]
                        # is '' and '' in '.!?:;' is True — a space is added; confirm
                        # this is the intended behavior for whitespace-only parts.
                        elif parts and parts[-1].rstrip()[-1:] in '.!?:;':
                            should_add_space = True

                        # Add space between adjacent inline elements if the current text starts with a letter/digit
                        # This handles cases where whitespace was stripped but spacing is semantically important
                        elif (text and text[0].isalpha() and
                              parts and parts[-1] and not parts[-1].endswith(' ') and
                              hasattr(child, 'get_metadata') and child.get_metadata('original_tag') in ['span', 'a', 'em', 'strong', 'i', 'b']):
                            should_add_space = True

                        if should_add_space:
                            parts.append(' ' + text)
                        else:
                            # Concatenate directly without space
                            if parts:
                                parts[-1] += text
                            else:
                                parts.append(text)

            return ''.join(parts)

        return self._get_cached_text(_generate_text)

    def html(self) -> str:
        """Generate paragraph HTML."""
        content = ''.join(child.html() for child in self.children)
        style_attr = self._generate_style_attr()
        return f'<p{style_attr}>{content}</p>'

    def _generate_style_attr(self) -> str:
        """Generate style attribute from style object ('' when no styles set)."""
        if not self.style:
            return ''

        styles = []
        if self.style.text_align:
            styles.append(f'text-align: {self.style.text_align}')
        if self.style.margin_top:
            styles.append(f'margin-top: {self.style.margin_top}px')
        if self.style.margin_bottom:
            styles.append(f'margin-bottom: {self.style.margin_bottom}px')

        if styles:
            return f' style="{"; ".join(styles)}"'
        return ''
|
||||
|
||||
|
||||
@dataclass
class HeadingNode(Node):
    """Heading node with level."""
    type: NodeType = field(default=NodeType.HEADING, init=False)
    level: int = 1

    def text(self) -> str:
        """Extract heading text (string content wins over children)."""
        if isinstance(self.content, str):
            return self.content

        child_texts = [child.text() for child in self.children]
        return ' '.join(t for t in child_texts if t)

    def html(self) -> str:
        """Generate heading HTML."""
        tag_level = max(1, min(6, self.level))  # clamp to valid h1-h6 range
        style_attr = self._generate_style_attr()
        return f'<h{tag_level}{style_attr}>{self.text()}</h{tag_level}>'

    def _generate_style_attr(self) -> str:
        """Generate style attribute ('' when no relevant styles are set)."""
        declarations = []
        if self.style.text_align:
            declarations.append(f'text-align: {self.style.text_align}')
        if self.style.color:
            declarations.append(f'color: {self.style.color}')
        return f' style="{"; ".join(declarations)}"' if declarations else ''
|
||||
|
||||
|
||||
@dataclass
class ContainerNode(Node, CacheableMixin):
    """Generic container node (div, section, etc.)."""
    type: NodeType = field(default=NodeType.CONTAINER, init=False)
    tag_name: str = 'div'

    def text(self) -> str:
        """Extract text from container with caching."""
        def _build() -> str:
            # Newline-separated concatenation of non-empty child texts.
            collected = [child.text() for child in self.children]
            return '\n'.join(t for t in collected if t)

        return self._get_cached_text(_build)

    def html(self) -> str:
        """Generate container HTML."""
        inner = '\n'.join(child.html() for child in self.children)
        style_attr = self._generate_style_attr()
        class_attr = f' class="{self.semantic_role}"' if self.semantic_role else ''
        return f'<{self.tag_name}{style_attr}{class_attr}>{inner}</{self.tag_name}>'

    def _generate_style_attr(self) -> str:
        """Generate style attribute from spacing-related style fields."""
        if not self.style:
            return ''

        declarations = []
        if self.style.margin_top:
            declarations.append(f'margin-top: {self.style.margin_top}px')
        if self.style.margin_bottom:
            declarations.append(f'margin-bottom: {self.style.margin_bottom}px')
        if self.style.padding_left:
            declarations.append(f'padding-left: {self.style.padding_left}px')

        return f' style="{"; ".join(declarations)}"' if declarations else ''
|
||||
|
||||
|
||||
@dataclass
class SectionNode(ContainerNode):
    """Document section node (a detected filing section)."""
    type: NodeType = field(default=NodeType.SECTION, init=False)
    # Human-readable section name, e.g. "Item 1"; optional.
    section_name: Optional[str] = None
    tag_name: str = field(default='section', init=False)

    def __post_init__(self):
        # Mirror the name into metadata so generic tree consumers can
        # read it without depending on this concrete class.
        if self.section_name:
            self.set_metadata('section_name', self.section_name)
|
||||
|
||||
|
||||
@dataclass
class ListNode(Node):
    """List node (ordered or unordered)."""
    type: NodeType = field(default=NodeType.LIST, init=False)
    ordered: bool = False

    def text(self) -> str:
        """Extract list text, prefixing items with numbers or bullets."""
        lines = []
        for index, item in enumerate(self.children):
            item_text = item.text()
            if not item_text:
                continue
            # Ordered lists number by child position (empty items still count).
            marker = f"{index + 1}. " if self.ordered else "• "
            lines.append(f"{marker}{item_text}")
        return '\n'.join(lines)

    def html(self) -> str:
        """Generate list HTML."""
        tag = 'ol' if self.ordered else 'ul'
        items = '\n'.join(item.html() for item in self.children)
        return f'<{tag}>\n{items}\n</{tag}>'
|
||||
|
||||
|
||||
@dataclass
class ListItemNode(Node):
    """List item node."""
    type: NodeType = field(default=NodeType.LIST_ITEM, init=False)

    def text(self) -> str:
        """Extract list item text (space-joined non-empty child texts)."""
        pieces = [child.text() for child in self.children]
        return ' '.join(p for p in pieces if p)

    def html(self) -> str:
        """Generate list item HTML."""
        inner = ''.join(child.html() for child in self.children)
        return f'<li>{inner}</li>'
|
||||
|
||||
|
||||
@dataclass
class LinkNode(Node):
    """Hyperlink node."""
    type: NodeType = field(default=NodeType.LINK, init=False)
    href: Optional[str] = None
    title: Optional[str] = None

    def text(self) -> str:
        """Extract link text (string content wins over children)."""
        if isinstance(self.content, str):
            return self.content

        pieces = [child.text() for child in self.children]
        return ' '.join(p for p in pieces if p)

    def html(self) -> str:
        """Generate link HTML, omitting attributes that are unset."""
        attrs = ''
        if self.href:
            attrs += f' href="{self.href}"'
        if self.title:
            attrs += f' title="{self.title}"'
        return f'<a{attrs}>{self.text()}</a>'
|
||||
|
||||
|
||||
@dataclass
class ImageNode(Node):
    """Image node."""
    type: NodeType = field(default=NodeType.IMAGE, init=False)
    src: Optional[str] = None
    alt: Optional[str] = None
    width: Optional[int] = None
    height: Optional[int] = None

    def text(self) -> str:
        """Extract image alt text ('' when unset)."""
        return self.alt or ''

    def html(self) -> str:
        """Generate image HTML from whichever attributes are set."""
        attrs = ''.join(
            f' {name}="{value}"'
            for name, value in (('src', self.src), ('alt', self.alt),
                                ('width', self.width), ('height', self.height))
            if value
        )
        return f'<img{attrs}>'
|
||||
387
venv/lib/python3.10/site-packages/edgar/documents/parser.py
Normal file
387
venv/lib/python3.10/site-packages/edgar/documents/parser.py
Normal file
@@ -0,0 +1,387 @@
|
||||
"""
|
||||
Main HTML parser implementation.
|
||||
"""
|
||||
|
||||
import time
|
||||
from typing import List, Union
|
||||
|
||||
import lxml.html
|
||||
from lxml import etree
|
||||
from lxml.html import HtmlElement
|
||||
|
||||
from edgar.documents.config import ParserConfig
|
||||
from edgar.documents.document import Document, DocumentMetadata
|
||||
from edgar.documents.exceptions import (
|
||||
HTMLParsingError, DocumentTooLargeError, InvalidConfigurationError
|
||||
)
|
||||
from edgar.documents.nodes import DocumentNode
|
||||
from edgar.documents.processors.postprocessor import DocumentPostprocessor
|
||||
from edgar.documents.processors.preprocessor import HTMLPreprocessor
|
||||
from edgar.documents.strategies.document_builder import DocumentBuilder
|
||||
from edgar.documents.types import XBRLFact
|
||||
from edgar.documents.utils import get_cache_manager
|
||||
from edgar.documents.utils.html_utils import remove_xml_declaration, create_lxml_parser
|
||||
|
||||
|
||||
class HTMLParser:
    """
    Main HTML parser class.

    Orchestrates the parsing pipeline with configurable strategies
    and processors: preprocess -> lxml parse -> metadata extraction ->
    document build -> postprocess.
    """

    def __init__(self, config: ParserConfig = None):
        """
        Initialize parser with configuration.

        Args:
            config: Parser configuration; defaults to ParserConfig()

        Raises:
            InvalidConfigurationError: If configuration values are inconsistent.
        """
        self.config = config or ParserConfig()
        self._validate_config()

        # Initialize components
        self.cache_manager = get_cache_manager()
        self.preprocessor = HTMLPreprocessor(self.config)
        self.postprocessor = DocumentPostprocessor(self.config)

        # Initialize strategies
        self._init_strategies()

    def _validate_config(self):
        """Validate configuration, raising InvalidConfigurationError on problems."""
        if self.config.max_document_size <= 0:
            raise InvalidConfigurationError("max_document_size must be positive")

        if self.config.streaming_threshold and self.config.max_document_size:
            if self.config.streaming_threshold > self.config.max_document_size:
                raise InvalidConfigurationError(
                    "streaming_threshold cannot exceed max_document_size"
                )

    def _init_strategies(self):
        """
        Initialize parsing strategies based on configuration.

        Strategy modules are imported lazily so that disabled features
        incur no import cost.
        """
        self.strategies = {}

        # Header detection strategy
        if self.config.detect_sections:
            from edgar.documents.strategies.header_detection import HeaderDetectionStrategy
            self.strategies['header_detection'] = HeaderDetectionStrategy(self.config)

        # Table processing strategy
        if self.config.table_extraction:
            from edgar.documents.strategies.table_processing import TableProcessor
            self.strategies['table_processing'] = TableProcessor(self.config)

        # XBRL extraction strategy
        if self.config.extract_xbrl:
            from edgar.documents.strategies.xbrl_extraction import XBRLExtractor
            self.strategies['xbrl_extraction'] = XBRLExtractor()

    def parse(self, html: Union[str, bytes]) -> Document:
        """
        Parse HTML into Document.

        Args:
            html: HTML content as string or bytes

        Returns:
            Parsed Document object

        Raises:
            TypeError: If input is None or not str/bytes
            DocumentTooLargeError: If document exceeds size limit
            HTMLParsingError: If parsing fails
        """
        start_time = time.time()

        # Validate input type
        if html is None:
            raise TypeError("HTML input cannot be None")

        if not isinstance(html, (str, bytes)):
            raise TypeError(f"HTML must be string or bytes, got {type(html).__name__}")

        # Convert bytes to string if needed
        if isinstance(html, bytes):
            html = html.decode('utf-8', errors='replace')

        # Handle empty HTML
        if not html.strip():
            # Return empty document
            root = DocumentNode()
            metadata = DocumentMetadata(
                size=0,
                parse_time=time.time() - start_time,
                parser_version="2.0.0"
            )
            return Document(root=root, metadata=metadata)

        # Check document size (measured in encoded bytes, not characters)
        doc_size = len(html.encode('utf-8'))
        if doc_size > self.config.max_document_size:
            raise DocumentTooLargeError(doc_size, self.config.max_document_size)

        # Check if streaming is needed
        if doc_size > self.config.streaming_threshold:
            return self._parse_streaming(html)

        try:
            # Store original HTML BEFORE preprocessing (needed for TOC analysis)
            original_html = html

            # Extract XBRL data BEFORE preprocessing (to preserve ix:hidden content)
            xbrl_facts = []
            if self.config.extract_xbrl:
                xbrl_facts = self._extract_xbrl_pre_process(html)

            # Preprocessing (will remove ix:hidden for rendering)
            html = self.preprocessor.process(html)

            # Parse with lxml
            tree = self._parse_html(html)

            # Extract metadata
            metadata = self._extract_metadata(tree, html)
            metadata.preserve_whitespace = self.config.preserve_whitespace

            # Store ORIGINAL unmodified HTML for section extraction (TOC analysis)
            # Must be the raw HTML before preprocessing
            metadata.original_html = original_html

            # Add XBRL facts to metadata if found
            if xbrl_facts:
                metadata.xbrl_data = {'facts': xbrl_facts}

            # Build document
            document = self._build_document(tree, metadata)

            # Store config reference for section extraction
            document._config = self.config

            # Postprocessing
            document = self.postprocessor.process(document)

            # Record parse time
            document.metadata.parse_time = time.time() - start_time
            document.metadata.size = doc_size

            return document

        except Exception as e:
            if isinstance(e, (DocumentTooLargeError, HTMLParsingError)):
                raise
            # Chain the original exception so the root cause survives in
            # tracebacks (previously the cause was silently discarded).
            raise HTMLParsingError(
                f"Failed to parse HTML: {str(e)}",
                context={'error_type': type(e).__name__}
            ) from e

    def _parse_html(self, html: str) -> HtmlElement:
        """Parse HTML with lxml, wrapping bare fragments in <html><body>."""
        try:
            # Remove XML declaration if present
            html = remove_xml_declaration(html)

            parser = create_lxml_parser(
                remove_blank_text=not self.config.preserve_whitespace,
                remove_comments=True,
                recover=True,
                encoding='utf-8'
            )

            # Parse HTML
            tree = lxml.html.fromstring(html, parser=parser)

            # Ensure we have a proper document structure
            if tree.tag != 'html':
                # Wrap in html/body if needed
                html_tree = lxml.html.Element('html')
                body = etree.SubElement(html_tree, 'body')
                body.append(tree)
                tree = html_tree

            return tree

        except Exception as e:
            # Preserve the lxml error as the cause of the domain exception.
            raise HTMLParsingError(
                f"lxml parsing failed: {str(e)}",
                context={'parser': 'lxml.html'}
            ) from e

    def _extract_metadata(self, tree: HtmlElement, html: str) -> DocumentMetadata:
        """
        Extract metadata from the HTML tree.

        Sources, in priority order: explicit config, <meta> tags, the
        <title> element, then a scan of the first 1000 characters for a
        known form type.
        """
        metadata = DocumentMetadata()

        # Use filing type from config if provided (avoids expensive detection)
        if self.config.form:
            metadata.form = self.config.form

        # Try to extract from meta tags
        for meta in tree.xpath('//meta'):
            name = meta.get('name', '').lower()
            content = meta.get('content', '')

            if name == 'company':
                metadata.company = content
            elif name == 'filing-type':
                metadata.form = content
            elif name == 'cik':
                metadata.cik = content
            elif name == 'filing-date':
                metadata.filing_date = content
            elif name == 'accession-number':
                metadata.accession_number = content

        # Try to extract from title
        title_elem = tree.find('.//title')
        if title_elem is not None and title_elem.text:
            # Parse title for filing info
            title = title_elem.text.strip()
            # Example: "APPLE INC - 10-K - 2023-09-30"
            parts = title.split(' - ')
            if len(parts) >= 2:
                if not metadata.company:
                    metadata.company = parts[0].strip()
                if not metadata.form:
                    metadata.form = parts[1].strip()

        # Try to extract from document content
        if not metadata.form:
            # Look for form type in first 1000 chars
            text_start = html[:1000].upper()
            for form_type in ['10-K', '10-Q', '8-K', 'DEF 14A', 'S-1']:
                if form_type in text_start:
                    metadata.form = form_type
                    break

        return metadata

    def _build_document(self, tree: HtmlElement, metadata: DocumentMetadata) -> Document:
        """Build document from parsed tree using the configured strategies."""
        # Create document builder with strategies
        builder = DocumentBuilder(self.config, self.strategies)

        # Build document node tree
        root_node = builder.build(tree)

        # Create document
        document = Document(root=root_node, metadata=metadata)

        return document

    def _parse_streaming(self, html: str) -> Document:
        """Parse large document in streaming mode (above streaming_threshold)."""
        from edgar.documents.utils.streaming import StreamingParser

        streaming_parser = StreamingParser(self.config, self.strategies)
        return streaming_parser.parse(html)

    def _extract_xbrl_pre_process(self, html: str) -> List[XBRLFact]:
        """
        Extract XBRL facts before preprocessing.

        This ensures we capture XBRL data from ix:hidden elements, which
        the preprocessor strips for rendering. Failures are logged and
        swallowed deliberately: XBRL extraction is best-effort and must
        not abort the main parse.
        """
        try:
            # Parse HTML without preprocessing to preserve all XBRL content
            parser = create_lxml_parser(
                remove_blank_text=False,
                remove_comments=False,
                recover=True,
                encoding='utf-8'
            )

            # Remove XML declaration if present
            html = remove_xml_declaration(html)

            tree = lxml.html.fromstring(html, parser=parser)

            # Use XBRL extractor
            from edgar.documents.strategies.xbrl_extraction import XBRLExtractor
            extractor = XBRLExtractor()

            facts = []

            # Find all XBRL elements (including those in ix:hidden)
            # Simple approach: find all elements with ix: prefix
            for element in tree.iter():
                if element.tag and isinstance(element.tag, str) and 'ix:' in element.tag.lower():
                    # Skip container elements
                    local_name = element.tag.split(':')[-1].lower() if ':' in element.tag else element.tag.lower()
                    if local_name in ['nonnumeric', 'nonfraction', 'continuation', 'footnote', 'fraction']:
                        fact = extractor.extract_fact(element)
                        if fact:
                            # Mark if fact was in hidden section or header
                            parent = element.getparent()
                            while parent is not None:
                                if parent.tag:
                                    tag_lower = parent.tag.lower()
                                    if 'ix:hidden' in tag_lower or 'ix:header' in tag_lower:
                                        fact.metadata = fact.metadata or {}
                                        fact.metadata['hidden'] = True
                                        break
                                parent = parent.getparent()
                            facts.append(fact)

            return facts

        except Exception as e:
            # Log error but don't fail parsing. Use a module-named logger
            # with lazy %-formatting instead of the root logger + f-string.
            import logging
            logging.getLogger(__name__).warning("Failed to extract XBRL data: %s", e)
            return []

    def parse_file(self, file_path: str) -> Document:
        """
        Parse HTML from file.

        Args:
            file_path: Path to HTML file (assumed UTF-8 encoded)

        Returns:
            Parsed Document object with metadata.source set to the path
        """
        with open(file_path, 'r', encoding='utf-8') as f:
            html = f.read()

        document = self.parse(html)
        document.metadata.source = file_path

        return document

    def parse_url(self, url: str) -> Document:
        """
        Parse HTML from URL.

        Args:
            url: URL to fetch and parse (30-second request timeout)

        Returns:
            Parsed Document object with metadata.url set

        Raises:
            requests.HTTPError: If the HTTP response indicates failure
        """
        import requests

        response = requests.get(url, timeout=30)
        response.raise_for_status()

        document = self.parse(response.text)
        document.metadata.url = url

        return document

    @classmethod
    def create_for_performance(cls) -> 'HTMLParser':
        """Create parser optimized for performance."""
        config = ParserConfig.for_performance()
        return cls(config)

    @classmethod
    def create_for_accuracy(cls) -> 'HTMLParser':
        """Create parser optimized for accuracy."""
        config = ParserConfig.for_accuracy()
        return cls(config)

    @classmethod
    def create_for_ai(cls) -> 'HTMLParser':
        """Create parser optimized for AI processing."""
        config = ParserConfig.for_ai()
        return cls(config)
|
||||
@@ -0,0 +1,11 @@
|
||||
"""
|
||||
Document processors for preprocessing and postprocessing.
|
||||
"""
|
||||
|
||||
from edgar.documents.processors.preprocessor import HTMLPreprocessor
|
||||
from edgar.documents.processors.postprocessor import DocumentPostprocessor
|
||||
|
||||
__all__ = [
|
||||
'HTMLPreprocessor',
|
||||
'DocumentPostprocessor'
|
||||
]
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,283 @@
|
||||
"""
|
||||
Document postprocessor for final processing after parsing.
|
||||
"""
|
||||
|
||||
from typing import List, Set
|
||||
from edgar.documents.config import ParserConfig
|
||||
from edgar.documents.document import Document
|
||||
from edgar.documents.nodes import Node, TextNode, ParagraphNode, HeadingNode
|
||||
from edgar.documents.types import NodeType
|
||||
|
||||
|
||||
class DocumentPostprocessor:
    """
    Postprocesses parsed documents to improve quality.

    Handles:
    - Adjacent node merging
    - Empty node removal
    - Heading level normalization
    - Section detection enhancement
    - Metadata enrichment
    """

    def __init__(self, config: ParserConfig):
        """Initialize postprocessor with configuration."""
        self.config = config

    def process(self, document: Document) -> Document:
        """
        Postprocess document.

        Passes run in a fixed order: empty-node removal first (so later
        passes see a minimal tree), then optional merging, heading
        normalization, optional section enhancement, statistics, and a
        final structural validation. The document is mutated in place.

        Args:
            document: Parsed document

        Returns:
            Processed document (the same object that was passed in)
        """
        # Remove empty nodes
        self._remove_empty_nodes(document.root)

        # Merge adjacent text nodes if configured
        if self.config.merge_adjacent_nodes:
            self._merge_adjacent_nodes(document.root)

        # Normalize heading levels
        self._normalize_heading_levels(document.root)

        # Enhance section detection if configured
        if self.config.detect_sections:
            self._enhance_sections(document)

        # Add document statistics
        self._add_statistics(document)

        # Validate document structure
        self._validate_structure(document)

        return document

    def _remove_empty_nodes(self, node: Node) -> None:
        """Remove empty nodes from tree (depth-first)."""
        # Process children first (bottom-up): a container that becomes
        # empty after its own children are pruned is caught when its
        # parent's pass reaches it.
        children_to_remove = []

        for child in node.children:
            self._remove_empty_nodes(child)

            # Check if child is empty
            if self._is_empty_node(child):
                children_to_remove.append(child)

        # Remove empty children (collected first to avoid mutating the
        # list while iterating over it)
        for child in children_to_remove:
            node.remove_child(child)

    def _is_empty_node(self, node: Node) -> bool:
        """Check if node is empty and can be removed."""
        # Never remove table nodes
        if node.type == NodeType.TABLE:
            return False

        # Never remove nodes with metadata
        if node.metadata:
            return False

        # Check text nodes
        if isinstance(node, TextNode):
            return not node.text().strip()

        # Check other nodes with text content
        if hasattr(node, 'content') and isinstance(node.content, str):
            return not node.content.strip()

        # Check container nodes
        if not node.children:
            # Empty container with no children
            return True

        return False

    def _merge_adjacent_nodes(self, node: Node) -> None:
        """Merge adjacent text nodes with similar properties."""
        if not node.children:
            return

        # Process children first
        for child in node.children:
            self._merge_adjacent_nodes(child)

        # Merge adjacent text nodes: scan the child list, grouping runs
        # of consecutive mergeable nodes into a single node.
        merged_children = []
        i = 0

        while i < len(node.children):
            current = node.children[i]

            # Look for mergeable nodes
            if self._can_merge(current):
                # Collect all adjacent mergeable nodes
                merge_group = [current]
                j = i + 1

                while j < len(node.children) and self._can_merge_with(current, node.children[j]):
                    merge_group.append(node.children[j])
                    j += 1

                # Merge if we have multiple nodes
                if len(merge_group) > 1:
                    merged = self._merge_nodes(merge_group)
                    merged_children.append(merged)
                    i = j
                else:
                    merged_children.append(current)
                    i += 1
            else:
                merged_children.append(current)
                i += 1

        # Update children
        node.children = merged_children

        # Update parent references (merged nodes keep the merged-into
        # node's identity; re-parent everything for safety)
        for child in node.children:
            child.parent = node

    def _can_merge(self, node: Node) -> bool:
        """Check if node can be merged."""
        # Only merge TextNodes, not ParagraphNodes
        return isinstance(node, TextNode) and not node.metadata

    def _can_merge_with(self, node1: Node, node2: Node) -> bool:
        """Check if two nodes can be merged."""
        # Must be same type (exact class match, not isinstance — a
        # subclass is not mergeable with its base)
        if type(node1) != type(node2):
            return False

        # Must have compatible styles
        if not self._compatible_styles(node1.style, node2.style):
            return False

        # Must not have metadata
        if node1.metadata or node2.metadata:
            return False

        return True

    def _compatible_styles(self, style1, style2) -> bool:
        """Check if two styles are compatible for merging."""
        # For now, just check key properties
        return (
            style1.font_size == style2.font_size and
            style1.font_weight == style2.font_weight and
            style1.text_align == style2.text_align
        )

    def _merge_nodes(self, nodes: List[Node]) -> Node:
        """Merge multiple nodes into one.

        Mutates and returns the first node; returns None for an empty list.
        """
        if not nodes:
            return None

        # Use first node as base
        merged = nodes[0]

        # Merge content
        if isinstance(merged, TextNode):
            texts = [n.text() for n in nodes]
            merged.content = '\n'.join(texts)
        elif isinstance(merged, ParagraphNode):
            # Merge all children
            for node in nodes[1:]:
                merged.children.extend(node.children)

        return merged

    def _normalize_heading_levels(self, node: Node) -> None:
        """Normalize heading levels to ensure proper hierarchy."""
        # Collect all headings
        headings = []
        self._collect_headings(node, headings)

        if not headings:
            return

        # Analyze heading structure
        levels_used = set(h.level for h in headings)

        # If we're missing level 1, promote headings so the smallest
        # level in use becomes 1 (relative hierarchy is preserved)
        if 1 not in levels_used and levels_used:
            min_level = min(levels_used)
            adjustment = min_level - 1

            for heading in headings:
                heading.level = max(1, heading.level - adjustment)

    def _collect_headings(self, node: Node, headings: List[HeadingNode]) -> None:
        """Collect all heading nodes (appended to `headings` in document order)."""
        if isinstance(node, HeadingNode):
            headings.append(node)

        for child in node.children:
            self._collect_headings(child, headings)

    def _enhance_sections(self, document: Document) -> None:
        """Enhance section detection and metadata."""
        # Only extract sections eagerly if configured to do so
        if not self.config.eager_section_extraction:
            return

        # Force section extraction to populate cache
        _ = document.sections

        # Add section metadata to nodes
        for section_name, section in document.sections.items():
            # Add section name to all nodes in section
            for node in section.node.walk():
                node.set_metadata('section', section_name)

    def _add_statistics(self, document: Document) -> None:
        """Add document statistics to metadata."""
        stats = {
            'node_count': sum(1 for _ in document.root.walk()),
            'text_length': len(document.text()),
            'table_count': len(document.tables),
            'heading_count': len(document.headings),
        }

        # Only add section count if sections were extracted (avoids
        # triggering lazy section extraction just for a statistic)
        if self.config.eager_section_extraction:
            stats['section_count'] = len(document.sections)

        document.metadata.statistics = stats

    def _validate_structure(self, document: Document) -> None:
        """Validate document structure and fix issues.

        Re-attaches orphaned nodes to the root and records any problems
        found in metadata.validation_issues.
        """
        issues = []

        # Check for orphaned nodes
        for node in document.root.walk():
            if node != document.root and node.parent is None:
                issues.append(f"Orphaned node: {node.type}")
                # Fix by adding to root
                document.root.add_child(node)

        # Check for circular references
        visited = set()

        def check_cycles(node: Node, path: Set[str]) -> None:
            # `path` holds ids on the current root-to-node path; a repeat
            # means a cycle. `visited` prevents re-walking shared subtrees.
            if node.id in path:
                issues.append(f"Circular reference detected: {node.type}")
                return

            path.add(node.id)
            visited.add(node.id)

            for child in node.children:
                if child.id not in visited:
                    check_cycles(child, path.copy())

        check_cycles(document.root, set())

        # Store validation results
        if issues:
            document.metadata.validation_issues = issues
|
||||
@@ -0,0 +1,242 @@
|
||||
"""
|
||||
HTML preprocessor for cleaning and normalizing HTML before parsing.
|
||||
"""
|
||||
|
||||
import re
|
||||
|
||||
from edgar.documents.config import ParserConfig
|
||||
from edgar.documents.utils.html_utils import remove_xml_declaration
|
||||
|
||||
|
||||
class HTMLPreprocessor:
    """
    Preprocesses HTML to fix common issues and normalize content.

    Handles:
    - Character encoding issues
    - Malformed HTML
    - Excessive whitespace
    - Script/style removal
    - Entity normalization
    """

    def __init__(self, config: ParserConfig):
        """Initialize preprocessor with configuration."""
        self.config = config

        # Pre-compile regex patterns for performance
        self._compiled_patterns = self._compile_patterns()

    def _compile_patterns(self):
        """Pre-compile frequently used regex patterns."""
        return {
            # Encoding and cleanup
            'control_chars': re.compile(r'[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]'),

            # Script/style removal
            'script_tags': re.compile(r'<script[^>]*>.*?</script>', re.IGNORECASE | re.DOTALL),
            'style_tags': re.compile(r'<style[^>]*>.*?</style>', re.IGNORECASE | re.DOTALL),
            'link_tags': re.compile(r'<link[^>]*>', re.IGNORECASE),
            'comments': re.compile(r'<!--.*?-->', re.DOTALL),
            'ix_hidden': re.compile(r'<ix:hidden[^>]*>.*?</ix:hidden>', re.IGNORECASE | re.DOTALL),
            'ix_header': re.compile(r'<ix:header[^>]*>.*?</ix:header>', re.IGNORECASE | re.DOTALL),

            # Malformed tags
            'br_tags': re.compile(r'<br(?![^>]*/)>', re.IGNORECASE),
            'img_tags': re.compile(r'<img([^>]+)(?<!/)>', re.IGNORECASE),
            'input_tags': re.compile(r'<input([^>]+)(?<!/)>', re.IGNORECASE),
            'hr_tags': re.compile(r'<hr(?![^>]*/)>', re.IGNORECASE),
            'nested_p_open': re.compile(r'<p>\s*<p>', re.IGNORECASE),
            'nested_p_close': re.compile(r'</p>\s*</p>', re.IGNORECASE),

            # Whitespace normalization
            'multiple_spaces': re.compile(r'[ \t]+'),
            'multiple_newlines': re.compile(r'\n{3,}'),
            'spaces_around_tags': re.compile(r'\s*(<[^>]+>)\s*'),

            # Block element newlines - combined pattern for opening tags
            'block_open_tags': re.compile(
                r'(<(?:div|p|h[1-6]|table|tr|ul|ol|li|blockquote)[^>]*>)',
                re.IGNORECASE
            ),
            # Block element newlines - combined pattern for closing tags
            'block_close_tags': re.compile(
                r'(</(?:div|p|h[1-6]|table|tr|ul|ol|li|blockquote)>)',
                re.IGNORECASE
            ),

            # Empty tags removal - combined pattern for all removable tags
            'empty_tags': re.compile(
                r'<(?:span|div|p|font|b|i|u|strong|em)\b[^>]*>\s*</(?:span|div|p|font|b|i|u|strong|em)>',
                re.IGNORECASE
            ),
            'empty_self_closing': re.compile(
                r'<(?:span|div|p|font|b|i|u|strong|em)\b[^>]*/>\s*',
                re.IGNORECASE
            ),

            # Common issues
            'multiple_br': re.compile(r'(<br\s*/?>[\s\n]*){3,}', re.IGNORECASE),
            'space_before_punct': re.compile(r'\s+([.,;!?])'),
            'missing_space_after_punct': re.compile(r'([.,;!?])([A-Z])'),
        }

    def process(self, html: str) -> str:
        """
        Preprocess HTML content.

        Args:
            html: Raw HTML content

        Returns:
            Cleaned HTML ready for parsing
        """
        # Remove BOM if present
        if html.startswith('\ufeff'):
            html = html[1:]

        # Remove XML declaration if present
        html = remove_xml_declaration(html)

        # Fix common character encoding issues
        html = self._fix_encoding_issues(html)

        # Remove script and style tags
        html = self._remove_script_style(html)

        # Normalize entities
        html = self._normalize_entities(html)

        # Fix malformed tags
        html = self._fix_malformed_tags(html)

        # Normalize whitespace if not preserving
        if not self.config.preserve_whitespace:
            html = self._normalize_whitespace(html)

        # Remove empty tags
        html = self._remove_empty_tags(html)

        # Fix common HTML issues
        html = self._fix_common_issues(html)

        return html

    def _fix_encoding_issues(self, html: str) -> str:
        """Fix common character encoding issues."""
        # Replace Windows-1252 characters with Unicode equivalents
        replacements = {
            '\x91': "'",  # Left single quote
            '\x92': "'",  # Right single quote
            '\x93': '"',  # Left double quote
            '\x94': '"',  # Right double quote
            '\x95': '•',  # Bullet
            '\x96': '–',  # En dash
            '\x97': '—',  # Em dash
            '\xa0': ' ',  # Non-breaking space
        }

        for old, new in replacements.items():
            html = html.replace(old, new)

        # Remove other control characters
        html = self._compiled_patterns['control_chars'].sub('', html)

        return html

    def _remove_script_style(self, html: str) -> str:
        """Remove script and style tags with content."""
        # Use pre-compiled patterns for better performance
        html = self._compiled_patterns['script_tags'].sub('', html)
        html = self._compiled_patterns['style_tags'].sub('', html)
        html = self._compiled_patterns['link_tags'].sub('', html)
        html = self._compiled_patterns['comments'].sub('', html)
        html = self._compiled_patterns['ix_hidden'].sub('', html)
        html = self._compiled_patterns['ix_header'].sub('', html)

        return html

    def _normalize_entities(self, html: str) -> str:
        """Normalize whitespace-like characters and common entities."""
        # Map exotic Unicode space characters to plain spaces and strip
        # zero-width characters.
        # NOTE(review): the original table used raw (visually
        # indistinguishable) Unicode characters as keys; rewritten here
        # with explicit escapes for maintainability — confirm the exact
        # set of space codepoints against the original source.
        entities = {
            '\u00a0': ' ',  # no-break space
            '\u2002': ' ',  # en space
            '\u2003': ' ',  # em space
            '\u2007': ' ',  # figure space
            '\u2009': ' ',  # thin space
            '\u202f': ' ',  # narrow no-break space
            '\u200d': '',   # Zero-width joiner
            '\u200c': '',   # Zero-width non-joiner
            '\u200b': '',   # Zero-width space
        }

        for entity, replacement in entities.items():
            html = html.replace(entity, replacement)

        # Fix double-encoded entities
        html = html.replace('&amp;', '&')
        html = html.replace('&nbsp;', ' ')
        html = html.replace('&lt;', '<')
        html = html.replace('&gt;', '>')

        return html

    def _fix_malformed_tags(self, html: str) -> str:
        """Fix common malformed tag issues (self-close void tags, nested <p>)."""
        # Use pre-compiled patterns for better performance
        html = self._compiled_patterns['br_tags'].sub('<br/>', html)
        html = self._compiled_patterns['img_tags'].sub(r'<img\1/>', html)
        html = self._compiled_patterns['input_tags'].sub(r'<input\1/>', html)
        html = self._compiled_patterns['hr_tags'].sub('<hr/>', html)
        html = self._compiled_patterns['nested_p_open'].sub('<p>', html)
        html = self._compiled_patterns['nested_p_close'].sub('</p>', html)

        return html

    def _normalize_whitespace(self, html: str) -> str:
        """Normalize whitespace in HTML."""
        # Use pre-compiled patterns for better performance
        # Replace multiple spaces with single space
        html = self._compiled_patterns['multiple_spaces'].sub(' ', html)

        # Replace multiple newlines with double newline
        html = self._compiled_patterns['multiple_newlines'].sub('\n\n', html)

        # Remove spaces around tags
        html = self._compiled_patterns['spaces_around_tags'].sub(r'\1', html)

        # Add newlines around block elements for readability
        # Using combined patterns instead of looping over individual tags
        html = self._compiled_patterns['block_open_tags'].sub(r'\n\1', html)
        html = self._compiled_patterns['block_close_tags'].sub(r'\1\n', html)

        # Clean up excessive newlines (apply again after adding newlines)
        html = self._compiled_patterns['multiple_newlines'].sub('\n\n', html)

        return html.strip()

    def _remove_empty_tags(self, html: str) -> str:
        """Remove empty tags that don't contribute content."""
        # Use pre-compiled combined patterns instead of looping
        html = self._compiled_patterns['empty_tags'].sub('', html)
        html = self._compiled_patterns['empty_self_closing'].sub('', html)

        return html

    def _fix_common_issues(self, html: str) -> str:
        """Fix other common HTML issues (excess <br>, punctuation spacing, typos)."""
        # Use pre-compiled patterns for better performance
        html = self._compiled_patterns['multiple_br'].sub('<br/><br/>', html)
        html = self._compiled_patterns['space_before_punct'].sub(r'\1', html)
        html = self._compiled_patterns['missing_space_after_punct'].sub(r'\1 \2', html)

        # Remove zero-width spaces (simple string replace is faster than regex)
        html = html.replace('\u200b', '')
        html = html.replace('\ufeff', '')

        # Fix common typos in tags (simple string replace is faster than regex)
        html = html.replace('<tabel', '<table')
        html = html.replace('</tabel>', '</table>')

        return html
|
||||
@@ -0,0 +1,34 @@
|
||||
"""
|
||||
Advanced ranking functionality for edgar.documents.
|
||||
|
||||
This package provides BM25-based ranking with semantic structure awareness
|
||||
and intelligent index caching for performance optimization.
|
||||
"""
|
||||
|
||||
from edgar.documents.ranking.ranking import (
|
||||
RankingAlgorithm,
|
||||
RankingEngine,
|
||||
BM25Engine,
|
||||
HybridEngine,
|
||||
SemanticEngine,
|
||||
RankedResult,
|
||||
)
|
||||
from edgar.documents.ranking.cache import (
|
||||
SearchIndexCache,
|
||||
CacheEntry,
|
||||
get_search_cache,
|
||||
set_search_cache,
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
'RankingAlgorithm',
|
||||
'RankingEngine',
|
||||
'BM25Engine',
|
||||
'HybridEngine',
|
||||
'SemanticEngine',
|
||||
'RankedResult',
|
||||
'SearchIndexCache',
|
||||
'CacheEntry',
|
||||
'get_search_cache',
|
||||
'set_search_cache',
|
||||
]
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,311 @@
|
||||
"""
|
||||
Search index caching for performance optimization.
|
||||
|
||||
Provides memory and disk caching with LRU eviction and TTL expiration.
|
||||
"""
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime, timedelta
|
||||
from pathlib import Path
|
||||
from typing import Optional, Dict, Any, List
|
||||
import hashlib
|
||||
import pickle
|
||||
import logging
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
class CacheEntry:
    """
    Cached search index entry.

    Stores pre-built search indices for a document along with metadata
    for cache management (access tracking, TTL).
    """
    document_hash: str  # Cache key (see SearchIndexCache.compute_document_hash)
    index_data: Dict[str, Any]  # Serialized BM25 index data
    created_at: datetime  # Creation time; compared against the cache TTL on lookup
    access_count: int = 0  # Number of successful cache lookups of this entry
    last_accessed: Optional[datetime] = None  # Time of most recent lookup (None until first hit)
    metadata: Dict[str, Any] = field(default_factory=dict)  # Free-form extra info about the entry
|
||||
|
||||
|
||||
class SearchIndexCache:
    """
    Manages search index caching with memory + disk storage.

    Features:
    - In-memory LRU cache for fast access
    - Optional disk persistence for reuse across sessions
    - TTL-based expiration
    - Access statistics tracking

    Parameters:
        memory_cache_size: Maximum entries in memory (default: 10)
        disk_cache_enabled: Enable disk persistence (default: True)
        cache_dir: Directory for disk cache (default: ~/.edgar_cache/search)
        ttl_hours: Time-to-live for cached entries (default: 24)
    """

    def __init__(self,
                 memory_cache_size: int = 10,
                 disk_cache_enabled: bool = True,
                 cache_dir: Optional[Path] = None,
                 ttl_hours: int = 24):
        """Initialize cache and create the disk cache directory if enabled."""
        self.memory_cache_size = memory_cache_size
        self.disk_cache_enabled = disk_cache_enabled
        self.cache_dir = cache_dir or Path.home() / ".edgar_cache" / "search"
        self.ttl = timedelta(hours=ttl_hours)

        # In-memory cache; _access_order holds keys oldest-first for LRU.
        # (Quoted annotation avoids a runtime dependency on CacheEntry.)
        self._memory_cache: Dict[str, 'CacheEntry'] = {}
        self._access_order: List[str] = []

        # Hit/miss statistics
        self._hits = 0
        self._misses = 0

        # Create cache directory eagerly so later saves cannot fail on mkdir
        if disk_cache_enabled:
            self.cache_dir.mkdir(parents=True, exist_ok=True)

    def compute_document_hash(self, document_id: str, content_sample: str) -> str:
        """
        Compute cache key from document identifiers.

        Uses document ID (e.g., accession number) and a content sample
        to create a unique, stable hash.

        Args:
            document_id: Unique document identifier
            content_sample: Sample of document content for verification

        Returns:
            16-character hex hash
        """
        content = f"{document_id}:{content_sample}"
        return hashlib.sha256(content.encode()).hexdigest()[:16]

    def get(self, document_hash: str) -> Optional['CacheEntry']:
        """
        Get cached entry.

        Tries memory cache first, then disk cache. Updates LRU order
        and access statistics.

        Args:
            document_hash: Cache key

        Returns:
            CacheEntry if found and valid, None otherwise
        """
        # Try memory cache first
        if document_hash in self._memory_cache:
            entry = self._memory_cache[document_hash]

            # Expired entries count as misses and are removed
            if datetime.now() - entry.created_at > self.ttl:
                self._evict_memory(document_hash)
                self._misses += 1
                return None

            # Update access tracking
            entry.access_count += 1
            entry.last_accessed = datetime.now()

            # Move key to the most-recently-used end of the LRU order
            if document_hash in self._access_order:
                self._access_order.remove(document_hash)
            self._access_order.append(document_hash)

            self._hits += 1
            logger.debug(f"Cache hit (memory): {document_hash}")
            return entry

        # Fall back to disk cache
        if self.disk_cache_enabled:
            entry = self._load_from_disk(document_hash)
            if entry:
                # Expired on disk: delete the file and report a miss
                if datetime.now() - entry.created_at > self.ttl:
                    self._delete_from_disk(document_hash)
                    self._misses += 1
                    return None

                # Promote into the memory cache for faster re-access
                self._put_memory(document_hash, entry)
                self._hits += 1
                logger.debug(f"Cache hit (disk): {document_hash}")
                return entry

        self._misses += 1
        logger.debug(f"Cache miss: {document_hash}")
        return None

    def put(self, document_hash: str, entry: 'CacheEntry') -> None:
        """
        Cache entry in memory and optionally on disk.

        Args:
            document_hash: Cache key
            entry: Entry to cache
        """
        # Put in memory cache
        self._put_memory(document_hash, entry)

        # Put in disk cache
        if self.disk_cache_enabled:
            self._save_to_disk(document_hash, entry)

        logger.debug(f"Cached entry: {document_hash}")

    def _put_memory(self, document_hash: str, entry: 'CacheEntry') -> None:
        """Put entry in memory cache with LRU eviction."""
        # Replacing an existing key must neither trigger eviction nor leave
        # a duplicate key in the LRU order, so drop any prior occurrence
        # first.  (Previously a re-put appended the key a second time.)
        if document_hash in self._memory_cache:
            self._evict_memory(document_hash)

        # Evict least-recently-used entries until there is room
        while len(self._memory_cache) >= self.memory_cache_size:
            if not self._access_order:
                break
            self._evict_memory(self._access_order[0])

        self._memory_cache[document_hash] = entry
        self._access_order.append(document_hash)

    def _evict_memory(self, document_hash: str) -> None:
        """Evict entry from memory cache and drop it from the LRU order."""
        if document_hash in self._memory_cache:
            del self._memory_cache[document_hash]
            logger.debug(f"Evicted from memory: {document_hash}")
        # Keep _access_order in sync: previously keys evicted via get()'s
        # TTL-expiry path were left behind, letting the list grow stale.
        if document_hash in self._access_order:
            self._access_order.remove(document_hash)

    def _load_from_disk(self, document_hash: str) -> Optional['CacheEntry']:
        """Load entry from disk cache, deleting the file if it is corrupt."""
        cache_file = self.cache_dir / f"{document_hash}.pkl"
        if not cache_file.exists():
            return None

        try:
            with open(cache_file, 'rb') as f:
                # NOTE: pickle is only acceptable here because the cache
                # directory is written by this process; never point it at
                # untrusted data.
                return pickle.load(f)
        except Exception as e:
            logger.warning(f"Failed to load cache from disk: {e}")
            # Delete corrupted file (best effort)
            try:
                cache_file.unlink()
            except OSError:
                pass
            return None

    def _save_to_disk(self, document_hash: str, entry: 'CacheEntry') -> None:
        """Save entry to disk cache (best effort; failures are logged)."""
        cache_file = self.cache_dir / f"{document_hash}.pkl"
        try:
            with open(cache_file, 'wb') as f:
                pickle.dump(entry, f)
        except Exception as e:
            logger.warning(f"Failed to save cache to disk: {e}")

    def _delete_from_disk(self, document_hash: str) -> None:
        """Delete entry from disk cache (best effort; failures are logged)."""
        cache_file = self.cache_dir / f"{document_hash}.pkl"
        try:
            if cache_file.exists():
                cache_file.unlink()
        except OSError as e:
            logger.warning(f"Failed to delete cache file: {e}")

    def clear(self, memory_only: bool = False) -> None:
        """
        Clear cache.

        Args:
            memory_only: If True, only clear memory cache (keep disk)
        """
        self._memory_cache.clear()
        self._access_order.clear()
        logger.info("Cleared memory cache")

        if not memory_only and self.disk_cache_enabled:
            try:
                for cache_file in self.cache_dir.glob("*.pkl"):
                    cache_file.unlink()
                logger.info("Cleared disk cache")
            except Exception as e:
                logger.warning(f"Failed to clear disk cache: {e}")

    def get_stats(self) -> Dict[str, Any]:
        """
        Get cache statistics.

        Returns:
            Dictionary with entry counts, hit/miss totals, hit rate and an
            estimated memory footprint in MB
        """
        disk_entries = 0
        if self.disk_cache_enabled:
            try:
                disk_entries = len(list(self.cache_dir.glob("*.pkl")))
            except OSError:
                # Cache directory may have been removed externally
                pass

        total_requests = self._hits + self._misses
        hit_rate = self._hits / total_requests if total_requests > 0 else 0.0

        return {
            "memory_entries": len(self._memory_cache),
            "disk_entries": disk_entries,
            "total_accesses": sum(e.access_count for e in self._memory_cache.values()),
            "cache_hits": self._hits,
            "cache_misses": self._misses,
            "hit_rate": hit_rate,
            "memory_size_mb": self._estimate_cache_size()
        }

    def _estimate_cache_size(self) -> float:
        """Estimate memory cache size in MB (shallow sys.getsizeof estimate)."""
        try:
            import sys
            total_bytes = sum(
                sys.getsizeof(entry.index_data)
                for entry in self._memory_cache.values()
            )
            return total_bytes / (1024 * 1024)
        except Exception:
            # Rough estimate if sys.getsizeof fails
            return len(self._memory_cache) * 5.0  # Assume ~5MB per entry
|
||||
|
||||
|
||||
# Global cache instance
|
||||
_global_cache: Optional[SearchIndexCache] = None
|
||||
|
||||
|
||||
def get_search_cache() -> SearchIndexCache:
    """
    Return the process-wide search cache, creating it lazily.

    The first call constructs a SearchIndexCache with default settings;
    every later call returns that same instance.

    Returns:
        Global SearchIndexCache instance
    """
    global _global_cache
    cache = _global_cache
    if cache is None:
        cache = SearchIndexCache()
        _global_cache = cache
    return cache
|
||||
|
||||
|
||||
def set_search_cache(cache: Optional[SearchIndexCache]) -> None:
    """
    Set global search cache instance.

    Useful for testing or custom cache configuration. Replaces whatever
    instance get_search_cache() would otherwise return (a subsequent
    get_search_cache() after setting None lazily creates a fresh default).

    Args:
        cache: Cache instance to use globally (None to disable)
    """
    global _global_cache
    _global_cache = cache
|
||||
@@ -0,0 +1,187 @@
|
||||
"""
|
||||
Text preprocessing for search.
|
||||
|
||||
Provides tokenization and text normalization for BM25 and semantic analysis.
|
||||
"""
|
||||
|
||||
import re
|
||||
from typing import List, Set
|
||||
|
||||
|
||||
# Common English stopwords (minimal set for financial documents)
|
||||
# We keep many financial terms that might be stopwords in other contexts
|
||||
STOPWORDS: Set[str] = {
|
||||
'a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'for',
|
||||
'from', 'has', 'he', 'in', 'is', 'it', 'its', 'of', 'on',
|
||||
'that', 'the', 'to', 'was', 'will', 'with'
|
||||
}
|
||||
|
||||
|
||||
def preprocess_text(text: str,
                    lowercase: bool = True,
                    remove_punctuation: bool = False) -> str:
    """
    Normalize raw text before indexing or matching.

    Collapses whitespace, optionally lowercases (important for BM25
    matching), and optionally strips punctuation — usually kept, since
    tokens like "$5B" or "Item 1A" matter in filings.

    Args:
        text: Raw text
        lowercase: Convert to lowercase
        remove_punctuation: Remove punctuation (keep for financial data)

    Returns:
        Preprocessed text
    """
    if not text:
        return ""

    # Collapse all runs of whitespace into single spaces
    normalized = ' '.join(text.split())

    if lowercase:
        normalized = normalized.lower()

    if remove_punctuation:
        # Replace punctuation with spaces, then re-collapse whitespace
        normalized = ' '.join(re.sub(r'[^\w\s]', ' ', normalized).split())

    return normalized
|
||||
|
||||
|
||||
def tokenize(text: str,
             remove_stopwords: bool = False,
             min_token_length: int = 2) -> List[str]:
    """
    Tokenize text for BM25 indexing.

    Keeps '$' and '%' attached to tokens so financial terms such as
    "$5b" and "15%" survive tokenization intact.

    Args:
        text: Text to tokenize
        remove_stopwords: Remove common stopwords
        min_token_length: Minimum token length to keep

    Returns:
        List of lowercase tokens
    """
    if not text:
        return []

    # BUGFIX: the previous pattern r'\b[\w$%]+\b' could never capture a
    # leading '$' or trailing '%' — \b only matches at a word/non-word
    # transition and '$'/'%' are non-word characters, so the boundaries
    # forced matches to start/end on word characters.  Matching runs of
    # word chars plus '$'/'%' directly preserves financial tokens.
    tokens = re.findall(r'[\w$%]+', text.lower())

    # Filter by length
    tokens = [t for t in tokens if len(t) >= min_token_length]

    # Optionally remove stopwords
    if remove_stopwords:
        tokens = [t for t in tokens if t not in STOPWORDS]

    return tokens
|
||||
|
||||
|
||||
def extract_query_terms(query: str) -> List[str]:
    """
    Extract important terms from query for boosting.

    Identifies key financial terms, numbers, and important phrases.

    Args:
        query: Search query

    Returns:
        List of important query terms, order preserved with
        case-insensitive de-duplication
    """
    # Tokenize
    tokens = tokenize(query, remove_stopwords=True)

    # Extract important patterns
    important = []

    # Financial amounts: $5B, $1.2M, etc.
    amounts = re.findall(r'\$[\d,.]+[BMK]?', query, re.IGNORECASE)
    important.extend(amounts)

    # Percentages: 15%, 3.5%
    percentages = re.findall(r'\d+\.?\d*%', query)
    important.extend(percentages)

    # Years: 2023, 2024.
    # BUGFIX: the group must be non-capturing — with a capturing group,
    # re.findall returns only the group text ('19'/'20') instead of the
    # full four-digit year.
    years = re.findall(r'\b(?:19|20)\d{2}\b', query)
    important.extend(years)

    # Item references: Item 1A, Item 7
    items = re.findall(r'item\s+\d+[a-z]?', query, re.IGNORECASE)
    important.extend(items)

    # Add all tokens
    important.extend(tokens)

    # Remove duplicates while preserving order (case-insensitive)
    seen = set()
    result = []
    for term in important:
        term_lower = term.lower()
        if term_lower not in seen:
            seen.add(term_lower)
            result.append(term)

    return result
|
||||
|
||||
|
||||
def normalize_financial_term(term: str) -> str:
    """
    Normalize a financial term for consistent matching.

    Examples:
        "$5 billion" -> "$5b"
        "5,000,000" -> "5000000"
        "Item 1A" -> "item1a"

    Args:
        term: Financial term

    Returns:
        Normalized term
    """
    # Lowercase, trim, and drop thousands separators in one pass
    cleaned = term.lower().strip().replace(',', '')

    # Collapse magnitude words into single-letter suffixes, then glue
    # "item 1a"-style references together.
    rules = (
        (r'\s*billion\b', 'b'),
        (r'\s*million\b', 'm'),
        (r'\s*thousand\b', 'k'),
        (r'(item|section|part)\s+(\d+[a-z]?)', r'\1\2'),
    )
    for pattern, replacement in rules:
        cleaned = re.sub(pattern, replacement, cleaned)

    # Collapse any remaining whitespace runs
    return ' '.join(cleaned.split())
|
||||
|
||||
|
||||
def get_ngrams(tokens: List[str], n: int = 2) -> List[str]:
    """
    Generate n-grams (space-joined) from a token list.

    Useful for phrase matching in BM25.

    Args:
        tokens: List of tokens
        n: N-gram size

    Returns:
        List of n-grams as strings (empty when fewer than n tokens)
    """
    if len(tokens) < n:
        return []
    # One sliding window per starting position
    return [' '.join(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]
|
||||
@@ -0,0 +1,401 @@
|
||||
"""
|
||||
Ranking engines for document search.
|
||||
|
||||
Provides BM25-based ranking with optional semantic structure boosting.
|
||||
"""
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
from dataclasses import dataclass, field
|
||||
from enum import Enum, auto
|
||||
from typing import List, Optional, Dict, Any, TYPE_CHECKING
|
||||
|
||||
from rank_bm25 import BM25Okapi
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from edgar.documents.nodes import Node
|
||||
|
||||
|
||||
class RankingAlgorithm(Enum):
    """Supported ranking algorithms (each has a matching engine class)."""
    BM25 = auto()      # Classic BM25 (Okapi variant): pure term-based relevance
    HYBRID = auto()    # BM25 + Semantic structure boosting (weighted combination)
    SEMANTIC = auto()  # Pure structure-aware scoring, no text matching
|
||||
|
||||
|
||||
@dataclass
class RankedResult:
    """
    A single search hit together with its ranking information.

    Attributes:
        node: Document node containing the match
        score: Relevance score (higher is better)
        rank: Position in results (1-indexed)
        text: Matched text content
        bm25_score: Raw BM25 score (if applicable)
        semantic_score: Semantic boost score (if applicable)
        metadata: Additional result metadata
    """
    node: 'Node'
    score: float
    rank: int
    text: str
    bm25_score: Optional[float] = None
    semantic_score: Optional[float] = None
    metadata: Dict[str, Any] = field(default_factory=dict)

    @property
    def snippet(self) -> str:
        """A short preview of the matched text, capped at 200 characters."""
        # 197 characters + "..." keeps the total at exactly 200
        return self.text if len(self.text) <= 200 else self.text[:197] + "..."
|
||||
|
||||
|
||||
class RankingEngine(ABC):
    """
    Abstract base class for ranking engines.

    Concrete engines (BM25Engine, HybridEngine, SemanticEngine) implement
    rank() to order document nodes by relevance and report their name via
    get_algorithm_name().
    """

    @abstractmethod
    def rank(self, query: str, nodes: List['Node']) -> List[RankedResult]:
        """
        Rank nodes by relevance to query.

        Args:
            query: Search query
            nodes: Nodes to rank

        Returns:
            List of ranked results sorted by relevance (best first)
        """
        pass

    @abstractmethod
    def get_algorithm_name(self) -> str:
        """Get name of ranking algorithm (e.g. "BM25")."""
        pass
|
||||
|
||||
|
||||
class BM25Engine(RankingEngine):
    """
    BM25 ranking engine using Okapi variant.

    BM25 is a probabilistic retrieval function that ranks documents based on
    query term frequency and inverse document frequency. Well-suited for
    financial documents where exact term matching is important.

    Parameters:
        k1: Term frequency saturation parameter (default: 1.5)
            Controls how quickly term frequency impact plateaus.
        b: Length normalization parameter (default: 0.75)
            0 = no normalization, 1 = full normalization.
    """

    def __init__(self, k1: float = 1.5, b: float = 0.75):
        """
        Initialize BM25 engine.

        Args:
            k1: Term frequency saturation (1.2-2.0 typical)
            b: Length normalization (0.75 is standard)
        """
        self.k1 = k1
        self.b = b
        # Index state is built lazily on the first rank() call
        self._bm25: Optional[BM25Okapi] = None
        self._corpus_nodes: Optional[List['Node']] = None
        self._tokenized_corpus: Optional[List[List[str]]] = None

    def rank(self, query: str, nodes: List['Node']) -> List[RankedResult]:
        """
        Rank nodes using BM25 algorithm.

        Args:
            query: Search query
            nodes: Nodes to rank

        Returns:
            Ranked results sorted by BM25 score (only positive scores kept)
        """
        if not nodes:
            return []

        # Import preprocessing here to avoid circular dependency
        from edgar.documents.ranking.preprocessing import preprocess_text, tokenize

        # Build index if needed or if nodes changed.
        # NOTE: this is an element-wise list comparison, O(len(nodes));
        # cheap relative to rebuilding, but not an identity check.
        if self._corpus_nodes != nodes:
            self._build_index(nodes)

        # Tokenize and preprocess query
        query_tokens = tokenize(preprocess_text(query))

        if not query_tokens:
            return []

        # Get BM25 scores (one per corpus document, aligned with nodes)
        scores = self._bm25.get_scores(query_tokens)

        # Create ranked results (index variable was unused; plain zip suffices)
        results = []
        for node, score in zip(nodes, scores):
            if score > 0:  # Only include nodes with positive scores
                text = node.text() if hasattr(node, 'text') else str(node)
                results.append(RankedResult(
                    node=node,
                    score=float(score),
                    rank=0,  # Will be set after sorting
                    text=text,
                    bm25_score=float(score),
                    metadata={'algorithm': 'BM25'}
                ))

        # Sort by score (highest first) and assign 1-based ranks
        results.sort(key=lambda r: r.score, reverse=True)
        for rank, result in enumerate(results, start=1):
            result.rank = rank

        return results

    def _build_index(self, nodes: List['Node']):
        """Build BM25 index from nodes (tokenizes every node's text)."""
        from edgar.documents.ranking.preprocessing import preprocess_text, tokenize

        # Store corpus so rank() can detect when a rebuild is needed
        self._corpus_nodes = nodes

        # Tokenize all nodes
        self._tokenized_corpus = []
        for node in nodes:
            text = node.text() if hasattr(node, 'text') else str(node)
            processed = preprocess_text(text)
            tokens = tokenize(processed)
            self._tokenized_corpus.append(tokens)

        # Build BM25 index with custom parameters
        self._bm25 = BM25Okapi(
            self._tokenized_corpus,
            k1=self.k1,
            b=self.b
        )

    def get_index_data(self) -> Dict[str, Any]:
        """
        Serialize index data for caching.

        Returns:
            Dictionary with serializable index data (tokenized corpus and
            the k1/b parameters needed to rebuild the BM25 index)
        """
        return {
            'tokenized_corpus': self._tokenized_corpus,
            'k1': self.k1,
            'b': self.b,
            'algorithm': 'BM25'
        }

    def load_index_data(self, index_data: Dict[str, Any], nodes: List['Node']) -> None:
        """
        Load index from cached data.

        Args:
            index_data: Serialized index data (from get_index_data)
            nodes: Nodes corresponding to the index
        """
        self._corpus_nodes = nodes
        self._tokenized_corpus = index_data['tokenized_corpus']
        self.k1 = index_data['k1']
        self.b = index_data['b']

        # Rebuild BM25 index from tokenized corpus (cheaper than re-tokenizing)
        self._bm25 = BM25Okapi(
            self._tokenized_corpus,
            k1=self.k1,
            b=self.b
        )

    def get_algorithm_name(self) -> str:
        """Get algorithm name."""
        return "BM25"
|
||||
|
||||
|
||||
class HybridEngine(RankingEngine):
    """
    Hybrid ranking engine: BM25 + Semantic structure boosting.

    Combines classic BM25 text matching with semantic structure awareness:
    - BM25 provides strong exact-match ranking for financial terms
    - Semantic scoring boosts results based on document structure:
      headings and section markers, cross-references ("See Item X"),
      gateway content (summaries, overviews), table and XBRL importance.

    This approach is agent-friendly: it surfaces starting points for
    investigation rather than fragmented chunks.

    Parameters:
        bm25_weight: Weight for BM25 score (default: 0.8)
        semantic_weight: Weight for semantic score (default: 0.2)
        k1: BM25 term frequency saturation
        b: BM25 length normalization
        boost_sections: Section names to boost (e.g. ["Risk Factors"])
    """

    def __init__(self,
                 bm25_weight: float = 0.8,
                 semantic_weight: float = 0.2,
                 k1: float = 1.5,
                 b: float = 0.75,
                 boost_sections: Optional[List[str]] = None):
        """
        Initialize hybrid engine.

        Args:
            bm25_weight: Weight for BM25 component (0-1)
            semantic_weight: Weight for semantic component (0-1)
            k1: BM25 k1 parameter
            b: BM25 b parameter
            boost_sections: Section names to boost (e.g., ["Risk Factors"])

        Raises:
            ValueError: If bm25_weight + semantic_weight is not 1.0
                (within floating-point tolerance).
        """
        self.bm25_engine = BM25Engine(k1=k1, b=b)
        self.bm25_weight = bm25_weight
        self.semantic_weight = semantic_weight
        self.boost_sections = boost_sections or []

        # Validate weights
        total_weight = bm25_weight + semantic_weight
        if not (0.99 <= total_weight <= 1.01):  # Allow small floating point error
            raise ValueError(f"Weights must sum to 1.0, got {total_weight}")

    def rank(self, query: str, nodes: List['Node']) -> List[RankedResult]:
        """
        Rank nodes using hybrid approach.

        BM25 scores are normalized to 0-1 before being combined with the
        (already 0-1) semantic scores, so the configured weights are the
        only thing determining each component's influence.

        Args:
            query: Search query
            nodes: Nodes to rank

        Returns:
            Ranked results with combined BM25 + semantic scores
        """
        if not nodes:
            return []

        # Get BM25 results (only nodes with positive BM25 scores survive)
        bm25_results = self.bm25_engine.rank(query, nodes)

        if not bm25_results:
            return []

        # Import semantic scoring (deferred to avoid circular dependency)
        from edgar.documents.ranking.semantic import compute_semantic_scores

        # Get semantic scores for all nodes, keyed by id(node)
        semantic_scores_dict = compute_semantic_scores(
            nodes=nodes,
            query=query,
            boost_sections=self.boost_sections
        )

        # Normalize BM25 scores to 0-1 range.
        # NOTE: this mutates bm25_score in place on the result objects.
        max_bm25 = max(r.bm25_score for r in bm25_results)
        if max_bm25 > 0:
            for result in bm25_results:
                result.bm25_score = result.bm25_score / max_bm25

        # Combine scores
        for result in bm25_results:
            # Nodes without a semantic score default to 0.0
            semantic_score = semantic_scores_dict.get(id(result.node), 0.0)
            result.semantic_score = semantic_score

            # Weighted combination
            result.score = (
                self.bm25_weight * result.bm25_score +
                self.semantic_weight * semantic_score
            )

            result.metadata['algorithm'] = 'Hybrid'
            result.metadata['bm25_weight'] = self.bm25_weight
            result.metadata['semantic_weight'] = self.semantic_weight

        # Re-sort by combined score
        bm25_results.sort(key=lambda r: r.score, reverse=True)

        # Update ranks (1-indexed)
        for rank, result in enumerate(bm25_results, start=1):
            result.rank = rank

        return bm25_results

    def get_algorithm_name(self) -> str:
        """Get algorithm name."""
        return "Hybrid"
|
||||
|
||||
|
||||
class SemanticEngine(RankingEngine):
    """
    Pure semantic/structure-based ranking (no text matching).

    Nodes are ordered purely by structural importance — section headings,
    cross-references, gateway content, and document position. Useful for
    understanding how a document is organized without a specific query.
    """

    def __init__(self, boost_sections: Optional[List[str]] = None):
        """
        Initialize semantic engine.

        Args:
            boost_sections: Section names to boost
        """
        self.boost_sections = boost_sections or []

    def rank(self, query: str, nodes: List['Node']) -> List[RankedResult]:
        """
        Rank nodes by semantic importance.

        Args:
            query: Search query (used for context)
            nodes: Nodes to rank

        Returns:
            Ranked results by structural importance (zero-score nodes omitted)
        """
        if not nodes:
            return []

        # Deferred import avoids a circular dependency at module load
        from edgar.documents.ranking.semantic import compute_semantic_scores

        score_by_id = compute_semantic_scores(
            nodes=nodes,
            query=query,
            boost_sections=self.boost_sections
        )

        ranked = []
        for node in nodes:
            node_score = score_by_id.get(id(node), 0.0)
            if node_score <= 0:
                continue  # zero-importance nodes are dropped entirely
            ranked.append(RankedResult(
                node=node,
                score=node_score,
                rank=0,
                text=node.text() if hasattr(node, 'text') else str(node),
                semantic_score=node_score,
                metadata={'algorithm': 'Semantic'}
            ))

        # Best-first order, then assign 1-based ranks
        ranked.sort(key=lambda r: r.score, reverse=True)
        for position, item in enumerate(ranked, start=1):
            item.rank = position

        return ranked

    def get_algorithm_name(self) -> str:
        """Get algorithm name."""
        return "Semantic"
|
||||
@@ -0,0 +1,333 @@
|
||||
"""
|
||||
Semantic scoring for document structure awareness.
|
||||
|
||||
Provides structure-based boosting without ML/embeddings:
|
||||
- Node type importance (headings, tables, XBRL)
|
||||
- Cross-reference detection (gateway content)
|
||||
- Section importance
|
||||
- Text quality signals
|
||||
|
||||
This is NOT embedding-based semantic search. It's structure-aware ranking
|
||||
that helps agents find investigation starting points.
|
||||
"""
|
||||
|
||||
import re
|
||||
from typing import List, Dict, Optional, TYPE_CHECKING
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from edgar.documents.nodes import Node
|
||||
|
||||
from edgar.documents.types import NodeType, SemanticType
|
||||
|
||||
|
||||
# Gateway terms that indicate summary/overview content
|
||||
GATEWAY_TERMS = [
|
||||
'summary', 'overview', 'introduction', 'highlights',
|
||||
'key points', 'executive summary', 'in summary',
|
||||
'table of contents', 'index'
|
||||
]
|
||||
|
||||
# Cross-reference patterns
|
||||
CROSS_REFERENCE_PATTERNS = [
|
||||
r'\bsee\s+item\s+\d+[a-z]?\b', # "See Item 1A"
|
||||
r'\bsee\s+(?:part|section)\s+\d+\b', # "See Part II"
|
||||
r'\brefer\s+to\s+item\s+\d+[a-z]?\b', # "Refer to Item 7"
|
||||
r'\bas\s+discussed\s+in\s+item\s+\d+\b', # "As discussed in Item 1"
|
||||
r'\bfor\s+(?:more|additional)\s+information\b', # "For more information"
|
||||
]
|
||||
|
||||
# Section importance weights
|
||||
SECTION_IMPORTANCE = {
|
||||
'risk factors': 1.5,
|
||||
'management discussion': 1.4,
|
||||
'md&a': 1.4,
|
||||
'business': 1.3,
|
||||
'financial statements': 1.2,
|
||||
'controls and procedures': 1.2,
|
||||
}
|
||||
|
||||
|
||||
def compute_semantic_scores(nodes: List['Node'],
                            query: str,
                            boost_sections: Optional[List[str]] = None) -> Dict[int, float]:
    """
    Compute semantic/structure scores for nodes.

    Structure-aware boosting combining: node type (headings > tables >
    paragraphs), cross-references (gateway content), section importance,
    gateway terms (summaries, overviews), XBRL presence, text quality,
    and an extra item-header boost for "Item N" style queries.

    Args:
        nodes: Nodes to score
        query: Search query (for context-aware boosting)
        boost_sections: Additional sections to boost

    Returns:
        Dictionary mapping id(node) to a semantic score in the 0-1 range
    """
    scores: Dict[int, float] = {}
    sections_to_boost = boost_sections or []

    # Query context: does the user reference a specific filing item?
    query_lower = query.lower()
    item_query = re.search(r'item\s+\d+[a-z]?', query_lower) is not None

    for node in nodes:
        # Sum the individual structure signals
        raw = (
            _get_node_type_boost(node)
            + _get_semantic_type_boost(node)
            + _detect_cross_references(node)
            + _detect_gateway_content(node, query_lower)
            + _get_section_boost(node, sections_to_boost)
            + _get_xbrl_boost(node)
            + _get_quality_boost(node)
        )

        # Query-specific boosting for "Item N" queries
        if item_query:
            raw += _get_item_header_boost(node)

        # Normalize to 0-1 (max possible raw score is ~7.0)
        scores[id(node)] = min(raw / 7.0, 1.0)

    return scores
|
||||
|
||||
|
||||
def _get_node_type_boost(node: 'Node') -> float:
    """
    Boost a node by its structural type.

    Headings and section markers matter most for navigation; plain text
    barely registers.
    """
    boost_by_type = {
        NodeType.HEADING: 2.0,     # key navigation points
        NodeType.SECTION: 1.5,     # section markers
        NodeType.TABLE: 1.0,       # structured data
        NodeType.XBRL_FACT: 0.8,   # tagged financial facts
        NodeType.LIST: 0.5,
        NodeType.PARAGRAPH: 0.3,   # ordinary prose
        NodeType.TEXT: 0.1,        # bare text nodes
    }
    return boost_by_type.get(node.type, 0.0)
|
||||
|
||||
|
||||
def _get_semantic_type_boost(node: 'Node') -> float:
|
||||
"""
|
||||
Boost based on semantic type.
|
||||
|
||||
Section headers and items are important for SEC filings.
|
||||
"""
|
||||
if not hasattr(node, 'semantic_type') or node.semantic_type is None:
|
||||
return 0.0
|
||||
|
||||
semantic_boosts = {
|
||||
SemanticType.ITEM_HEADER: 2.0, # Item headers are critical
|
||||
SemanticType.SECTION_HEADER: 1.5, # Section headers
|
||||
SemanticType.FINANCIAL_STATEMENT: 1.2, # Financial statements
|
||||
SemanticType.TABLE_OF_CONTENTS: 1.0, # TOC is a gateway
|
||||
SemanticType.TITLE: 0.8,
|
||||
SemanticType.HEADER: 0.6,
|
||||
}
|
||||
|
||||
return semantic_boosts.get(node.semantic_type, 0.0)
|
||||
|
||||
|
||||
def _detect_cross_references(node: 'Node') -> float:
|
||||
"""
|
||||
Detect cross-references that indicate gateway content.
|
||||
|
||||
Content that points to other sections is useful for navigation.
|
||||
"""
|
||||
text = node.text() if hasattr(node, 'text') else ''
|
||||
if not text:
|
||||
return 0.0
|
||||
|
||||
text_lower = text.lower()
|
||||
|
||||
# Check each pattern
|
||||
matches = 0
|
||||
for pattern in CROSS_REFERENCE_PATTERNS:
|
||||
if re.search(pattern, text_lower):
|
||||
matches += 1
|
||||
|
||||
# Boost increases with number of cross-references
|
||||
return min(matches * 0.5, 1.5) # Cap at 1.5
|
||||
|
||||
|
||||
def _detect_gateway_content(node: 'Node', query_lower: str) -> float:
|
||||
"""
|
||||
Detect gateway content (summaries, overviews, introductions).
|
||||
|
||||
These are excellent starting points for investigation.
|
||||
"""
|
||||
text = node.text() if hasattr(node, 'text') else ''
|
||||
if not text:
|
||||
return 0.0
|
||||
|
||||
text_lower = text.lower()
|
||||
|
||||
# Check for gateway terms in text
|
||||
for term in GATEWAY_TERMS:
|
||||
if term in text_lower:
|
||||
return 1.0
|
||||
|
||||
# Check if this is an introductory paragraph (first ~200 chars)
|
||||
if len(text) < 200 and len(text) > 20:
|
||||
# Short intro paragraphs are often summaries
|
||||
if any(word in text_lower for word in ['provides', 'describes', 'includes', 'contains']):
|
||||
return 0.5
|
||||
|
||||
return 0.0
|
||||
|
||||
|
||||
def _get_section_boost(node: 'Node', boost_sections: List[str]) -> float:
    """
    Score nodes that live in important (or user-requested) sections.

    The built-in SECTION_IMPORTANCE table is consulted first; any
    user-specified section name then earns a fixed 1.5 boost.
    """
    section = _get_node_section(node)
    if not section:
        return 0.0

    section_lower = section.lower()

    # Built-in importance table takes precedence.
    for name, weight in SECTION_IMPORTANCE.items():
        if name in section_lower:
            return weight

    # Fall back to user-requested sections.
    if any(requested.lower() in section_lower for requested in boost_sections):
        return 1.5

    return 0.0
|
||||
|
||||
|
||||
def _get_xbrl_boost(node: 'Node') -> float:
    """
    Score XBRL facts and XBRL-backed tables.

    Financial data matters for financial queries: a fact node gets 0.8,
    a table flagged with XBRL metadata gets 0.6, everything else 0.0.
    """
    node_type = node.type

    if node_type == NodeType.XBRL_FACT:
        return 0.8

    # Tables count only when their metadata marks them as XBRL-backed.
    if node_type == NodeType.TABLE:
        if hasattr(node, 'metadata') and node.metadata.get('has_xbrl'):
            return 0.6

    return 0.0
|
||||
|
||||
|
||||
def _get_quality_boost(node: 'Node') -> float:
|
||||
"""
|
||||
Boost based on text quality signals.
|
||||
|
||||
Higher quality content tends to be more useful:
|
||||
- Appropriate length (not too short, not too long)
|
||||
- Good structure (sentences, punctuation)
|
||||
- Substantive content (not just formatting)
|
||||
"""
|
||||
text = node.text() if hasattr(node, 'text') else ''
|
||||
if not text:
|
||||
return 0.0
|
||||
|
||||
score = 0.0
|
||||
|
||||
# Length signal
|
||||
text_len = len(text)
|
||||
if 50 <= text_len <= 1000:
|
||||
score += 0.3 # Good length
|
||||
elif text_len > 1000:
|
||||
score += 0.1 # Long but might be comprehensive
|
||||
else:
|
||||
score += 0.0 # Too short, likely not substantive
|
||||
|
||||
# Sentence structure
|
||||
sentence_count = text.count('.') + text.count('?') + text.count('!')
|
||||
if sentence_count >= 2:
|
||||
score += 0.2 # Multiple sentences indicate substantive content
|
||||
|
||||
# Avoid pure formatting/navigation
|
||||
if text.strip() in ['...', '—', '-', 'Table of Contents', 'Page', '']:
|
||||
return 0.0 # Skip pure formatting
|
||||
|
||||
return score
|
||||
|
||||
|
||||
def _get_item_header_boost(node: 'Node') -> float:
    """
    Extra weight for "Item N" headings when the query targets items.

    An "Item 1A" query should surface the matching Item heading first.
    """
    if node.type != NodeType.HEADING:
        return 0.0

    text = node.text() if hasattr(node, 'text') else ''

    # Only headings that literally start with an Item designation qualify.
    if text and re.match(r'^\s*item\s+\d+[a-z]?[:\.\s]', text, re.IGNORECASE):
        return 1.5

    return 0.0
|
||||
|
||||
|
||||
def _get_node_section(node: 'Node') -> Optional[str]:
|
||||
"""
|
||||
Get section name for a node by walking up the tree.
|
||||
|
||||
Returns:
|
||||
Section name if found, None otherwise
|
||||
"""
|
||||
# Check if node has section in metadata
|
||||
if hasattr(node, 'metadata') and 'section' in node.metadata:
|
||||
return node.metadata['section']
|
||||
|
||||
# Walk up tree looking for section marker
|
||||
current = node
|
||||
while current:
|
||||
if hasattr(current, 'semantic_type'):
|
||||
if current.semantic_type in (SemanticType.SECTION_HEADER, SemanticType.ITEM_HEADER):
|
||||
return current.text() if hasattr(current, 'text') else None
|
||||
|
||||
current = current.parent if hasattr(current, 'parent') else None
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def get_section_importance_names() -> List[str]:
    """
    Names of sections that carry a built-in importance boost.

    Returns:
        Keys of the SECTION_IMPORTANCE table, in definition order.
    """
    return [name for name in SECTION_IMPORTANCE]
|
||||
@@ -0,0 +1,13 @@
|
||||
"""
|
||||
Document renderers for various output formats.
|
||||
"""
|
||||
|
||||
from edgar.documents.renderers.markdown import MarkdownRenderer
|
||||
from edgar.documents.renderers.text import TextRenderer
|
||||
from edgar.documents.renderers.fast_table import FastTableRenderer
|
||||
|
||||
__all__ = [
|
||||
'MarkdownRenderer',
|
||||
'TextRenderer',
|
||||
'FastTableRenderer'
|
||||
]
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,669 @@
|
||||
"""
|
||||
Fast table renderer for edgar.documents - optimized for performance.
|
||||
|
||||
This module provides a high-performance alternative to Rich table rendering
|
||||
while maintaining professional output quality and readability.
|
||||
|
||||
Performance target: ~32x faster than Rich rendering (0.2ms vs 6.5ms per table)
|
||||
"""
|
||||
|
||||
from dataclasses import dataclass
|
||||
from typing import List, Dict, Optional, Union, Tuple
|
||||
from enum import Enum
|
||||
|
||||
|
||||
class Alignment(Enum):
    """Column alignment options for rendered table cells."""
    LEFT = "left"      # default for text columns
    RIGHT = "right"    # applied to mostly-numeric columns by _detect_alignments
    CENTER = "center"  # available, but never chosen by auto-detection
|
||||
|
||||
|
||||
@dataclass
class ColumnConfig:
    """Per-column rendering configuration (alignment, width bounds, padding)."""
    alignment: Alignment = Alignment.LEFT  # cell alignment within the column
    min_width: int = 8                     # minimum rendered width in characters
    max_width: Optional[int] = None        # optional hard cap on rendered width
    padding: int = 1                       # spaces on each side of cell content
|
||||
|
||||
|
||||
@dataclass
class TableStyle:
    """
    Table styling configuration.

    Defaults produce a Markdown-compatible pipe table; the classmethods
    below provide preconfigured alternatives.
    """
    border_char: str = "|"        # column separator; empty string = no borders
    header_separator: str = "-"   # char repeated under the header; empty = none
    corner_char: str = "+"        # corner char (kept for style completeness)
    padding: int = 1              # spaces on each side of cell content
    min_col_width: int = 8        # lower bound on computed column width
    max_col_width: int = 50       # upper bound on computed column width

    @classmethod
    def pipe_table(cls) -> 'TableStyle':
        """Markdown-compatible pipe table style (the renderer default)."""
        return cls(
            border_char="|",
            header_separator="-",
            corner_char="|",
            padding=1,
            min_col_width=8,
            max_col_width=50
        )

    @classmethod
    def minimal(cls) -> 'TableStyle':
        """Minimal table style: no borders or separator, spacing only."""
        return cls(
            border_char="",
            header_separator="",
            corner_char="",
            padding=2,
            min_col_width=6,
            max_col_width=40
        )

    @classmethod
    def simple(cls) -> 'TableStyle':
        """
        Simple table style matching Rich's box.SIMPLE.

        Features:
        - No outer border
        - No column separators
        - Single horizontal line under header
        - Space-separated columns with generous padding
        - Clean, professional appearance

        This style provides the best balance of visual quality and performance,
        matching Rich's box.SIMPLE aesthetic while maintaining fast rendering speed.
        """
        return cls(
            border_char="",        # no pipes/borders
            header_separator="─",  # Unicode horizontal line
            corner_char="",        # no corners
            padding=2,             # generous spacing (was 1 in pipe_table)
            min_col_width=6,       # slightly relaxed (was 8)
            max_col_width=60       # raised from 50 for wider columns
        )
|
||||
|
||||
|
||||
class FastTableRenderer:
|
||||
"""
|
||||
High-performance table renderer optimized for speed.
|
||||
|
||||
Features:
|
||||
- 30x+ faster than Rich table rendering
|
||||
- Professional, readable output
|
||||
- Configurable alignment and styling
|
||||
- Handles complex SEC filing table structures
|
||||
- Markdown-compatible output
|
||||
- Memory efficient
|
||||
"""
|
||||
|
||||
def __init__(self, style: Optional[TableStyle] = None):
|
||||
"""Initialize renderer with optional style configuration."""
|
||||
self.style = style or TableStyle.pipe_table()
|
||||
|
||||
# Pre-compile format strings for performance
|
||||
self._format_cache = {}
|
||||
|
||||
def render_table_node(self, table_node) -> str:
    """
    Render a TableNode to text, expanding colspan/rowspan first.

    Args:
        table_node: TableNode instance from edgar.documents

    Returns:
        Formatted table string, prefixed with the caption when present
        (matching the Rich renderer's behavior).
    """
    from edgar.documents.utils.table_matrix import TableMatrix

    # Expand spanned cells into a rectangular grid so every row has its
    # full complement of columns.
    matrix = TableMatrix()
    matrix.build_from_rows(table_node.headers, table_node.rows)

    def row_as_text(row_idx):
        # Matrix gaps come back as None; render those as empty cells.
        return [cell.text().strip() if cell else '' for cell in matrix.get_expanded_row(row_idx)]

    header_count = len(table_node.headers) if table_node.headers else 0
    headers = [row_as_text(i) for i in range(header_count)]
    rows = [row_as_text(i) for i in range(header_count, matrix.row_count)]

    rendered = self.render_table_data(headers, rows)

    caption = getattr(table_node, 'caption', None)
    if caption:
        return f"{caption}\n{rendered}"
    return rendered
|
||||
|
||||
def render_table_data(self, headers: List[List[str]], rows: List[List[str]]) -> str:
    """
    Render table data with headers and rows.

    Pipeline: drop spacing-only columns, merge related adjacent columns
    (e.g. '$' with its amount), then compute widths/alignments and build
    the text table. Header and data rows are processed together so both
    see the same column filtering and merging.

    Args:
        headers: List of header rows (for multi-row headers)
        rows: List of data rows

    Returns:
        Formatted table string (empty string when nothing renders)
    """
    if not headers and not rows:
        return ""

    # Determine column count from all rows (headers + data)
    all_rows = headers + rows if headers else rows
    if not all_rows:
        return ""

    max_cols = max(len(row) for row in all_rows) if all_rows else 0
    if max_cols == 0:
        return ""

    # Filter out empty/spacing columns
    meaningful_columns = self._identify_meaningful_columns(all_rows, max_cols)
    if not meaningful_columns:
        return ""

    # Filter all rows (both headers and data) to only meaningful columns
    filtered_headers = [self._filter_row_to_columns(row, meaningful_columns) for row in headers] if headers else []
    filtered_rows = [self._filter_row_to_columns(row, meaningful_columns) for row in rows]

    # Post-process to merge related columns (e.g., currency symbols with amounts)
    # Apply to all rows including headers
    all_filtered = filtered_headers + filtered_rows
    if all_filtered:
        # Merge using first filtered row as reference
        _, all_merged = self._merge_related_columns(all_filtered[0], all_filtered)
        # Split back into headers and data.
        # NOTE: len(filtered_headers) is read again after the reassignment on
        # the previous line; this is safe only because the slice preserves the
        # original header-row count.
        if filtered_headers:
            filtered_headers = all_merged[:len(filtered_headers)]
            filtered_rows = all_merged[len(filtered_headers):]
        else:
            filtered_rows = all_merged

    # Recalculate with filtered and merged data
    filtered_all_rows = filtered_headers + filtered_rows if filtered_headers else filtered_rows
    filtered_max_cols = max(len(row) for row in filtered_all_rows) if filtered_all_rows else 0

    # Calculate optimal column widths for filtered columns
    col_widths = self._calculate_column_widths(filtered_all_rows, filtered_max_cols)

    # Detect column alignments based on filtered content
    alignments = self._detect_alignments(filtered_all_rows, filtered_max_cols)

    # Build table with filtered data - pass headers as multiple rows
    return self._build_table(filtered_headers, filtered_rows, col_widths, alignments)
|
||||
|
||||
def _combine_headers(self, headers: List[List[str]]) -> List[str]:
|
||||
"""
|
||||
Combine multi-row headers intelligently.
|
||||
|
||||
For SEC tables, this prioritizes specific dates/periods over generic labels.
|
||||
"""
|
||||
if not headers:
|
||||
return []
|
||||
|
||||
if len(headers) == 1:
|
||||
return headers[0]
|
||||
|
||||
# Determine max columns across all header rows
|
||||
max_cols = max(len(row) for row in headers) if headers else 0
|
||||
combined = [""] * max_cols
|
||||
|
||||
for col in range(max_cols):
|
||||
# Collect all values for this column
|
||||
values = []
|
||||
for header_row in headers:
|
||||
if col < len(header_row) and header_row[col].strip():
|
||||
values.append(header_row[col].strip())
|
||||
|
||||
if values:
|
||||
# Prioritize date-like values over generic terms
|
||||
date_values = [v for v in values if self._looks_like_date(v)]
|
||||
if date_values:
|
||||
combined[col] = date_values[0]
|
||||
elif len(values) == 1:
|
||||
combined[col] = values[0]
|
||||
else:
|
||||
# Skip generic terms like "Year Ended" if we have something more specific
|
||||
specific_values = [v for v in values
|
||||
if v.lower() not in {'year ended', 'years ended', 'period ended'}]
|
||||
combined[col] = specific_values[0] if specific_values else values[0]
|
||||
|
||||
return combined
|
||||
|
||||
def _looks_like_date(self, text: str) -> bool:
|
||||
"""Quick date detection for header processing."""
|
||||
if not text or len(text) < 4:
|
||||
return False
|
||||
|
||||
text_lower = text.lower().replace('\n', ' ').strip()
|
||||
|
||||
# Common date indicators
|
||||
date_indicators = [
|
||||
'january', 'february', 'march', 'april', 'may', 'june',
|
||||
'july', 'august', 'september', 'october', 'november', 'december',
|
||||
'20', '19', # Year prefixes
|
||||
]
|
||||
|
||||
return any(indicator in text_lower for indicator in date_indicators) and \
|
||||
any(c.isdigit() for c in text)
|
||||
|
||||
def _identify_meaningful_columns(self, all_rows: List[List[str]], max_cols: int) -> List[int]:
|
||||
"""
|
||||
Identify columns that contain meaningful content (not just spacing).
|
||||
|
||||
Returns:
|
||||
List of column indices that have meaningful content
|
||||
"""
|
||||
column_scores = []
|
||||
|
||||
for col_idx in range(max_cols):
|
||||
content_score = 0
|
||||
total_rows = 0
|
||||
|
||||
# Score each column based on content quality
|
||||
for row in all_rows:
|
||||
if col_idx < len(row):
|
||||
total_rows += 1
|
||||
cell_content = str(row[col_idx]).strip()
|
||||
|
||||
if cell_content:
|
||||
# Higher score for longer, more substantial content
|
||||
if len(cell_content) >= 3: # Substantial content
|
||||
content_score += 3
|
||||
elif len(cell_content) == 2 and cell_content.isalnum():
|
||||
content_score += 2
|
||||
elif len(cell_content) == 1 and (cell_content.isalnum() or cell_content == '$'):
|
||||
content_score += 1
|
||||
# Skip single spaces, dashes, or other likely spacing characters
|
||||
|
||||
# Calculate average score per row for this column
|
||||
avg_score = content_score / max(total_rows, 1)
|
||||
column_scores.append((col_idx, avg_score, content_score))
|
||||
|
||||
# Sort by score descending
|
||||
column_scores.sort(key=lambda x: x[1], reverse=True)
|
||||
|
||||
# Take columns with meaningful content (score >= 0.5 or among top columns)
|
||||
meaningful_columns = []
|
||||
for col_idx, avg_score, total_score in column_scores:
|
||||
# Include if it has good average score or significant total content
|
||||
if avg_score >= 0.5 or total_score >= 5:
|
||||
meaningful_columns.append(col_idx)
|
||||
# Limit to reasonable number of columns for readability
|
||||
if len(meaningful_columns) >= 8:
|
||||
break
|
||||
|
||||
# Sort by original column order
|
||||
meaningful_columns.sort()
|
||||
|
||||
return meaningful_columns
|
||||
|
||||
def _filter_row_to_columns(self, row: List[str], column_indices: List[int]) -> List[str]:
|
||||
"""
|
||||
Filter a row to only include the specified column indices.
|
||||
|
||||
Args:
|
||||
row: Original row data
|
||||
column_indices: List of column indices to keep
|
||||
|
||||
Returns:
|
||||
Filtered row with only the specified columns
|
||||
"""
|
||||
if not row:
|
||||
return []
|
||||
|
||||
filtered_row = []
|
||||
for col_idx in column_indices:
|
||||
if col_idx < len(row):
|
||||
filtered_row.append(row[col_idx])
|
||||
else:
|
||||
filtered_row.append("") # Missing column
|
||||
|
||||
return filtered_row
|
||||
|
||||
def _merge_related_columns(self, headers: List[str], rows: List[List[str]]) -> tuple:
    """
    Merge related columns (e.g., currency symbols with their amounts).

    Candidate pairs are found left-to-right but applied right-to-left so
    earlier pops don't shift the indices of later merges. Both inputs
    are copied; the originals are never mutated.

    Returns:
        Tuple of (merged_headers, merged_rows)
    """
    if not rows or not any(rows):
        return headers, rows

    # Find adjacent columns that should be merged
    merge_pairs = []
    max_cols = max(len(row) for row in [headers] + rows if row) if rows else len(headers) if headers else 0

    for col_idx in range(max_cols - 1):
        # Check if this column and the next should be merged
        should_merge = self._should_merge_columns(headers, rows, col_idx, col_idx + 1)
        if should_merge:
            merge_pairs.append((col_idx, col_idx + 1))

    # Apply merges (from right to left to avoid index shifting)
    merged_headers = headers[:] if headers else []
    merged_rows = [row[:] for row in rows]

    for left_idx, right_idx in reversed(merge_pairs):
        # Merge headers: join the two labels with a space
        if merged_headers and left_idx < len(merged_headers) and right_idx < len(merged_headers):
            left_header = merged_headers[left_idx].strip()
            right_header = merged_headers[right_idx].strip()
            merged_header = f"{left_header} {right_header}".strip()
            merged_headers[left_idx] = merged_header
            merged_headers.pop(right_idx)

        # Merge rows
        for row in merged_rows:
            if left_idx < len(row) and right_idx < len(row):
                left_cell = str(row[left_idx]).strip()
                right_cell = str(row[right_idx]).strip()

                # Smart merging based on content: '$' attaches directly to
                # the amount; otherwise join with a space, falling back to
                # whichever side is non-empty.
                if left_cell == '$' and right_cell:
                    merged_cell = f"${right_cell}"
                elif left_cell and right_cell:
                    merged_cell = f"{left_cell} {right_cell}"
                else:
                    merged_cell = left_cell or right_cell

                row[left_idx] = merged_cell
                if right_idx < len(row):
                    row.pop(right_idx)

    return merged_headers, merged_rows
|
||||
|
||||
def _should_merge_columns(self, headers: List[str], rows: List[List[str]], left_idx: int, right_idx: int) -> bool:
|
||||
"""
|
||||
Determine if two adjacent columns should be merged.
|
||||
|
||||
Returns:
|
||||
True if columns should be merged
|
||||
"""
|
||||
# Check if left column is mostly currency symbols
|
||||
currency_count = 0
|
||||
total_count = 0
|
||||
|
||||
for row in rows:
|
||||
if left_idx < len(row) and right_idx < len(row):
|
||||
total_count += 1
|
||||
left_cell = str(row[left_idx]).strip()
|
||||
right_cell = str(row[right_idx]).strip()
|
||||
|
||||
# If left is '$' and right is a number, they should be merged
|
||||
if left_cell == '$' and right_cell and (right_cell.replace(',', '').replace('.', '').isdigit()):
|
||||
currency_count += 1
|
||||
|
||||
# If most rows have currency symbol + number pattern, merge them
|
||||
if total_count > 0 and currency_count / total_count >= 0.5:
|
||||
return True
|
||||
|
||||
# Check for other merge patterns (e.g., empty left column with content right column)
|
||||
empty_left_count = 0
|
||||
for row in rows:
|
||||
if left_idx < len(row) and right_idx < len(row):
|
||||
left_cell = str(row[left_idx]).strip()
|
||||
right_cell = str(row[right_idx]).strip()
|
||||
|
||||
if not left_cell and right_cell:
|
||||
empty_left_count += 1
|
||||
|
||||
# If left column is mostly empty, consider merging
|
||||
if total_count > 0 and empty_left_count / total_count >= 0.7:
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
def _calculate_column_widths(self, all_rows: List[List[str]], max_cols: int) -> List[int]:
|
||||
"""Calculate optimal column widths based on content."""
|
||||
col_widths = [self.style.min_col_width] * max_cols
|
||||
|
||||
# Find the maximum content width for each column
|
||||
for row in all_rows:
|
||||
for col_idx in range(min(len(row), max_cols)):
|
||||
content = str(row[col_idx]) if row[col_idx] else ""
|
||||
# Handle multi-line content
|
||||
max_line_width = max((len(line) for line in content.split('\n')), default=0)
|
||||
content_width = max_line_width + (self.style.padding * 2)
|
||||
|
||||
# Apply limits
|
||||
content_width = min(content_width, self.style.max_col_width)
|
||||
col_widths[col_idx] = max(col_widths[col_idx], content_width)
|
||||
|
||||
return col_widths
|
||||
|
||||
def _detect_alignments(self, all_rows: List[List[str]], max_cols: int) -> List[Alignment]:
    """
    Choose an alignment per column from its content.

    Columns whose non-blank cells are at least 70% numeric are
    right-aligned; everything else stays left-aligned. The first row is
    treated as a header and skipped when more rows are available.
    """
    result = [Alignment.LEFT] * max_cols
    sample = all_rows[1:] if len(all_rows) > 1 else all_rows

    for col in range(max_cols):
        numeric = 0
        filled = 0
        for row in sample:
            if col < len(row) and row[col].strip():
                filled += 1
                if self._looks_numeric(row[col].strip()):
                    numeric += 1

        # 70%+ numeric non-blank cells => right-align the column.
        if filled and numeric / filled >= 0.7:
            result[col] = Alignment.RIGHT

    return result
|
||||
|
||||
def _looks_numeric(self, text: str) -> bool:
|
||||
"""Check if text content looks numeric."""
|
||||
if not text:
|
||||
return False
|
||||
|
||||
# Remove common formatting characters
|
||||
clean_text = text.replace(',', '').replace('$', '').replace('%', '').replace('(', '').replace(')', '').strip()
|
||||
|
||||
# Handle negative numbers in parentheses
|
||||
if text.strip().startswith('(') and text.strip().endswith(')'):
|
||||
clean_text = text.strip()[1:-1].replace(',', '').replace('$', '').strip()
|
||||
|
||||
# Check if remaining text is numeric
|
||||
try:
|
||||
float(clean_text)
|
||||
return True
|
||||
except ValueError:
|
||||
return False
|
||||
|
||||
def _build_table(self, headers: List[List[str]], rows: List[List[str]],
                 col_widths: List[int], alignments: List[Alignment]) -> str:
    """
    Assemble the final table text.

    Header rows (possibly several) come first, then the style's header
    separator, then data rows. Rows without any non-blank cell are
    dropped entirely.

    Args:
        headers: Header rows (multi-row headers supported).
        rows: Data rows.
        col_widths: Column widths.
        alignments: Column alignments.
    """
    lines = []

    if headers:
        for header_row in headers:
            # Skip all-blank header rows; header cells may be multi-line.
            if any(cell.strip() for cell in header_row):
                lines.extend(self._format_multiline_row(header_row, col_widths, alignments))

        # Separator goes after the full header block.
        if self.style.header_separator:
            lines.append(self._create_separator_line(col_widths))

    for row in rows:
        # Skip all-blank data rows.
        if any(cell.strip() for cell in row):
            lines.append(self._format_row(row, col_widths, alignments))

    return '\n'.join(lines)
|
||||
|
||||
def _format_row(self, row: List[str], col_widths: List[int],
                alignments: List[Alignment]) -> str:
    """
    Format a single row with alignment, truncation and padding.

    Only the first line of a multi-line cell is used. Cells longer than
    the usable width are truncated with a "..." suffix when the column
    is wide enough, otherwise hard-cut, so every cell renders at exactly
    its column width.

    Args:
        row: Cell contents (may be shorter than col_widths).
        col_widths: Total width per column, padding included.
        alignments: Per-column alignment; missing entries default LEFT.
    """
    cells = []
    border = self.style.border_char

    for col_idx, width in enumerate(col_widths):
        content = str(row[col_idx]) if col_idx < len(row) else ""

        # Only the first line of a multi-line cell fits a single row.
        if '\n' in content:
            content = content.split('\n')[0]
        content = content.strip()

        available_width = width - (self.style.padding * 2)

        # Truncate overlong content. Bug fix: the previous
        # `content[:available_width-3] + "..."` could exceed the column
        # width (and mis-slice via a negative index) when
        # available_width <= 3, breaking the alignment grid for narrow
        # columns. Hard-cut instead when there is no room for "...".
        if len(content) > available_width:
            if available_width > 3:
                content = content[:available_width - 3] + "..."
            else:
                content = content[:max(available_width, 0)]

        alignment = alignments[col_idx] if col_idx < len(alignments) else Alignment.LEFT
        if alignment == Alignment.RIGHT:
            aligned_content = content.rjust(available_width)
        elif alignment == Alignment.CENTER:
            aligned_content = content.center(available_width)
        else:  # LEFT
            aligned_content = content.ljust(available_width)

        pad = ' ' * self.style.padding
        cells.append(pad + aligned_content + pad)

    # Join with borders when the style has them, plain spacing otherwise.
    if border:
        return border + border.join(cells) + border
    return ' '.join(cells)
|
||||
|
||||
def _format_multiline_row(self, row: List[str], col_widths: List[int],
                          alignments: List[Alignment]) -> List[str]:
    """
    Format a row whose cells may contain embedded newlines.

    Each cell is split on newlines and the row is emitted as as many
    physical lines as the tallest cell, padding shorter cells with
    blanks.

    Returns:
        One formatted line per line of text in the tallest cell.
    """
    # Split every cell into its constituent lines up front.
    split_cells = [(cell.split('\n') if cell else ['']) for cell in row]
    height = max((len(parts) for parts in split_cells), default=1)
    height = max(height, 1)

    rendered = []
    for line_no in range(height):
        # Build a virtual single-line row for this text line.
        virtual_row = [parts[line_no] if line_no < len(parts) else ''
                       for parts in split_cells]
        rendered.append(self._format_row(virtual_row, col_widths, alignments))

    return rendered
|
||||
|
||||
def _create_separator_line(self, col_widths: List[int]) -> str:
|
||||
"""
|
||||
Create header separator line.
|
||||
|
||||
For bordered styles: |-------|-------|
|
||||
For borderless styles: ─────────────── (full width horizontal line)
|
||||
"""
|
||||
sep_char = self.style.header_separator
|
||||
border = self.style.border_char
|
||||
|
||||
if not sep_char:
|
||||
# No separator at all (minimal style)
|
||||
return ""
|
||||
|
||||
if border:
|
||||
# Bordered style: create separator matching column widths
|
||||
separators = []
|
||||
for width in col_widths:
|
||||
separators.append(sep_char * width)
|
||||
return border + border.join(separators) + border
|
||||
else:
|
||||
# Borderless style (simple): single horizontal line across full width
|
||||
# Calculate total width: sum of column widths + gaps between columns
|
||||
total_width = sum(col_widths) + (len(col_widths) - 1) * 2 # 2-space gaps
|
||||
|
||||
# Add leading space for indentation (matching row indentation)
|
||||
return " " + sep_char * total_width
|
||||
|
||||
|
||||
# Factory functions for easy usage
|
||||
def create_fast_renderer(style: str = "pipe") -> FastTableRenderer:
    """
    Build a FastTableRenderer for a named style.

    Args:
        style: Style name ("pipe", "minimal"); unknown names fall back
            to the pipe style.

    Returns:
        Configured FastTableRenderer instance.
    """
    table_style = TableStyle.minimal() if style == "minimal" else TableStyle.pipe_table()
    return FastTableRenderer(table_style)
|
||||
|
||||
|
||||
def render_table_fast(table_node, style: str = "pipe") -> str:
    """
    One-shot helper: render a TableNode with a named style.

    Args:
        table_node: TableNode instance.
        style: Style name ("pipe", "minimal").

    Returns:
        Formatted table string.
    """
    return create_fast_renderer(style).render_table_node(table_node)
|
||||
@@ -0,0 +1,613 @@
|
||||
"""
|
||||
Markdown renderer for parsed documents.
|
||||
"""
|
||||
|
||||
from typing import List, Optional, Dict, Set
|
||||
|
||||
from edgar.documents.document import Document
|
||||
from edgar.documents.nodes import Node, TextNode, HeadingNode, ParagraphNode, ListNode, ListItemNode
|
||||
from edgar.documents.table_nodes import TableNode
|
||||
|
||||
|
||||
class MarkdownRenderer:
|
||||
"""
|
||||
Renders parsed documents to Markdown format.
|
||||
|
||||
Features:
|
||||
- Preserves document structure
|
||||
- Handles tables with proper formatting
|
||||
- Supports nested lists
|
||||
- Includes metadata annotations
|
||||
- Configurable output options
|
||||
"""
|
||||
|
||||
def __init__(self,
             include_metadata: bool = False,
             include_toc: bool = False,
             max_heading_level: int = 6,
             table_format: str = 'pipe',
             wrap_width: Optional[int] = None):
    """
    Initialize markdown renderer.

    Args:
        include_metadata: Include metadata annotations
        include_toc: Generate table of contents
        max_heading_level: Maximum heading level to render
        table_format: Table format ('pipe', 'grid', 'simple')
        wrap_width: Wrap text at specified width
    """
    self.include_metadata = include_metadata
    self.include_toc = include_toc
    self.max_heading_level = max_heading_level
    self.table_format = table_format
    self.wrap_width = wrap_width

    # Mutable per-render state; cleared by _reset_state() before each render.
    self._toc_entries: List[tuple] = []    # entries collected for TOC output
    self._rendered_ids: Set[str] = set()   # node ids already emitted (shared nodes)
    self._list_depth = 0                   # current list nesting level
    self._in_table = False                 # whether rendering inside a table
||||
|
||||
def render(self, document: Document) -> str:
|
||||
"""
|
||||
Render document to Markdown.
|
||||
|
||||
Args:
|
||||
document: Document to render
|
||||
|
||||
Returns:
|
||||
Markdown formatted text
|
||||
"""
|
||||
self._reset_state()
|
||||
|
||||
parts = []
|
||||
|
||||
# Add metadata header if requested
|
||||
if self.include_metadata:
|
||||
parts.append(self._render_metadata(document))
|
||||
parts.append("")
|
||||
|
||||
# Placeholder for TOC
|
||||
if self.include_toc:
|
||||
toc_placeholder = "<!-- TOC -->"
|
||||
parts.append(toc_placeholder)
|
||||
parts.append("")
|
||||
|
||||
# Render document content
|
||||
content = self._render_node(document.root)
|
||||
parts.append(content)
|
||||
|
||||
# Join parts
|
||||
markdown = "\n".join(parts)
|
||||
|
||||
# Replace TOC placeholder
|
||||
if self.include_toc and self._toc_entries:
|
||||
toc = self._generate_toc()
|
||||
markdown = markdown.replace(toc_placeholder, toc)
|
||||
|
||||
return markdown.strip()
|
||||
|
||||
def render_node(self, node: Node) -> str:
|
||||
"""
|
||||
Render a specific node to Markdown.
|
||||
|
||||
Args:
|
||||
node: Node to render
|
||||
|
||||
Returns:
|
||||
Markdown formatted text
|
||||
"""
|
||||
self._reset_state()
|
||||
return self._render_node(node)
|
||||
|
||||
def _reset_state(self):
|
||||
"""Reset renderer state."""
|
||||
self._toc_entries = []
|
||||
self._rendered_ids = set()
|
||||
self._list_depth = 0
|
||||
self._in_table = False
|
||||
|
||||
def _render_node(self, node: Node) -> str:
|
||||
"""Render a node and its children."""
|
||||
# Skip if already rendered (handles shared nodes)
|
||||
if node.id in self._rendered_ids:
|
||||
return ""
|
||||
self._rendered_ids.add(node.id)
|
||||
|
||||
# Dispatch based on node type
|
||||
if isinstance(node, HeadingNode):
|
||||
return self._render_heading(node)
|
||||
elif isinstance(node, ParagraphNode):
|
||||
return self._render_paragraph(node)
|
||||
elif isinstance(node, TextNode):
|
||||
return self._render_text(node)
|
||||
elif isinstance(node, TableNode):
|
||||
return self._render_table(node)
|
||||
elif isinstance(node, ListNode):
|
||||
return self._render_list(node)
|
||||
elif isinstance(node, ListItemNode):
|
||||
return self._render_list_item(node)
|
||||
else:
|
||||
# Default: render children
|
||||
return self._render_children(node)
|
||||
|
||||
def _render_heading(self, node: HeadingNode) -> str:
|
||||
"""Render heading node."""
|
||||
# Limit heading level
|
||||
level = min(node.level, self.max_heading_level)
|
||||
|
||||
# Get heading text
|
||||
text = node.text().strip()
|
||||
if not text:
|
||||
return ""
|
||||
|
||||
# Add to TOC
|
||||
if self.include_toc:
|
||||
self._toc_entries.append((level, text, node.id))
|
||||
|
||||
# Create markdown heading
|
||||
markdown = "#" * level + " " + text
|
||||
|
||||
# Add metadata if requested
|
||||
if self.include_metadata and node.metadata:
|
||||
metadata = self._format_metadata(node.metadata)
|
||||
if metadata:
|
||||
markdown += f" <!-- {metadata} -->"
|
||||
|
||||
# Add children content
|
||||
children_content = self._render_children(node)
|
||||
if children_content:
|
||||
markdown += "\n\n" + children_content
|
||||
|
||||
return markdown
|
||||
|
||||
def _render_paragraph(self, node: ParagraphNode) -> str:
|
||||
"""Render paragraph node."""
|
||||
# Get paragraph content
|
||||
content = self._render_children(node).strip()
|
||||
if not content:
|
||||
return ""
|
||||
|
||||
# Wrap if requested
|
||||
if self.wrap_width:
|
||||
content = self._wrap_text(content, self.wrap_width)
|
||||
|
||||
# Add metadata if requested
|
||||
if self.include_metadata and node.metadata:
|
||||
metadata = self._format_metadata(node.metadata)
|
||||
if metadata:
|
||||
content = f"<!-- {metadata} -->\n{content}"
|
||||
|
||||
return content
|
||||
|
||||
def _render_text(self, node: TextNode) -> str:
|
||||
"""Render text node."""
|
||||
text = node.text()
|
||||
|
||||
# Escape markdown special characters
|
||||
text = self._escape_markdown(text)
|
||||
|
||||
# Apply text formatting based on style
|
||||
if node.style:
|
||||
if node.style.font_weight in ['bold', '700', '800', '900']:
|
||||
text = f"**{text}**"
|
||||
elif node.style.font_style == 'italic':
|
||||
text = f"*{text}*"
|
||||
elif node.style.text_decoration == 'underline':
|
||||
text = f"<u>{text}</u>"
|
||||
|
||||
return text
|
||||
|
||||
def _render_table(self, node: TableNode) -> str:
|
||||
"""Render table node."""
|
||||
self._in_table = True
|
||||
|
||||
parts = []
|
||||
|
||||
# Add caption if present
|
||||
if node.caption:
|
||||
parts.append(f"**Table: {node.caption}**")
|
||||
parts.append("")
|
||||
|
||||
# Render based on format
|
||||
if self.table_format == 'pipe':
|
||||
table_md = self._render_table_pipe(node)
|
||||
elif self.table_format == 'grid':
|
||||
table_md = self._render_table_grid(node)
|
||||
else: # simple
|
||||
table_md = self._render_table_simple(node)
|
||||
|
||||
parts.append(table_md)
|
||||
|
||||
# Add metadata if requested
|
||||
if self.include_metadata and node.metadata:
|
||||
metadata = self._format_metadata(node.metadata)
|
||||
if metadata:
|
||||
parts.append(f"<!-- Table metadata: {metadata} -->")
|
||||
|
||||
self._in_table = False
|
||||
|
||||
return "\n".join(parts)
|
||||
|
||||
def _render_table_pipe(self, node: TableNode) -> str:
|
||||
"""Render table in pipe format with proper column spanning support."""
|
||||
# Handle complex SEC filing tables with column spanning
|
||||
expanded_headers, expanded_data_rows = self._expand_table_structure(node)
|
||||
|
||||
# Identify and filter to meaningful columns
|
||||
content_columns = self._identify_content_columns(expanded_headers, expanded_data_rows)
|
||||
|
||||
if not content_columns:
|
||||
return ""
|
||||
|
||||
rows = []
|
||||
|
||||
# Render headers with intelligent multi-row combination
|
||||
if expanded_headers:
|
||||
combined_headers = self._combine_multi_row_headers(expanded_headers)
|
||||
filtered_headers = [combined_headers[i] if i < len(combined_headers) else "" for i in content_columns]
|
||||
|
||||
row_md = "| " + " | ".join(filtered_headers) + " |"
|
||||
rows.append(row_md)
|
||||
|
||||
# Add separator
|
||||
separator = "| " + " | ".join(["---"] * len(filtered_headers)) + " |"
|
||||
rows.append(separator)
|
||||
|
||||
# Render data rows
|
||||
for expanded_row in expanded_data_rows:
|
||||
filtered_row = [expanded_row[i] if i < len(expanded_row) else "" for i in content_columns]
|
||||
|
||||
# Only add rows with meaningful content
|
||||
if any(cell.strip() for cell in filtered_row):
|
||||
row_md = "| " + " | ".join(filtered_row) + " |"
|
||||
rows.append(row_md)
|
||||
|
||||
return "\n".join(rows)
|
||||
|
||||
def _render_table_grid(self, node: TableNode) -> str:
|
||||
"""Render table in grid format."""
|
||||
# Simplified grid format
|
||||
all_rows = []
|
||||
|
||||
# Add headers
|
||||
if node.headers:
|
||||
for header_row in node.headers:
|
||||
cells = [cell.text() for cell in header_row]
|
||||
all_rows.append(" | ".join(cells))
|
||||
|
||||
# Add data rows
|
||||
for row in node.rows:
|
||||
cells = [cell.text() for cell in row.cells]
|
||||
all_rows.append(" | ".join(cells))
|
||||
|
||||
if all_rows:
|
||||
# Add borders
|
||||
max_width = max(len(row) for row in all_rows)
|
||||
border = "+" + "-" * (max_width + 2) + "+"
|
||||
result = [border]
|
||||
for row in all_rows:
|
||||
result.append(f"| {row:<{max_width}} |")
|
||||
result.append(border)
|
||||
return "\n".join(result)
|
||||
|
||||
return ""
|
||||
|
||||
def _render_table_simple(self, node: TableNode) -> str:
|
||||
"""Render table in simple format."""
|
||||
rows = []
|
||||
|
||||
# Add headers
|
||||
if node.headers:
|
||||
for header_row in node.headers:
|
||||
cells = [cell.text() for cell in header_row]
|
||||
rows.append(" ".join(cells))
|
||||
|
||||
# Add separator if we have headers
|
||||
if node.headers and node.rows:
|
||||
rows.append("")
|
||||
|
||||
# Add data rows
|
||||
for row in node.rows:
|
||||
cells = [cell.text() for cell in row.cells]
|
||||
rows.append(" ".join(cells))
|
||||
|
||||
return "\n".join(rows)
|
||||
|
||||
def _render_list(self, node: ListNode) -> str:
|
||||
"""Render list node."""
|
||||
self._list_depth += 1
|
||||
|
||||
items = []
|
||||
for child in node.children:
|
||||
if isinstance(child, ListItemNode):
|
||||
item_md = self._render_list_item(child)
|
||||
if item_md:
|
||||
items.append(item_md)
|
||||
|
||||
self._list_depth -= 1
|
||||
|
||||
return "\n".join(items)
|
||||
|
||||
def _render_list_item(self, node: ListItemNode) -> str:
|
||||
"""Render list item node."""
|
||||
# Determine bullet/number
|
||||
if node.parent and hasattr(node.parent, 'ordered') and node.parent.ordered:
|
||||
# Ordered list
|
||||
index = node.parent.children.index(node) + 1
|
||||
marker = f"{index}."
|
||||
else:
|
||||
# Unordered list
|
||||
markers = ['*', '-', '+']
|
||||
marker = markers[(self._list_depth - 1) % len(markers)]
|
||||
|
||||
# Indentation
|
||||
indent = " " * (self._list_depth - 1)
|
||||
|
||||
# Get content
|
||||
content = self._render_children(node).strip()
|
||||
|
||||
# Format item
|
||||
if '\n' in content:
|
||||
# Multi-line content
|
||||
lines = content.split('\n')
|
||||
result = indent + marker + " " + lines[0]
|
||||
for line in lines[1:]:
|
||||
result += "\n" + indent + " " + line
|
||||
return result
|
||||
else:
|
||||
# Single line
|
||||
return indent + marker + " " + content
|
||||
|
||||
def _render_children(self, node: Node) -> str:
|
||||
"""Render all children of a node."""
|
||||
parts = []
|
||||
|
||||
for child in node.children:
|
||||
child_md = self._render_node(child)
|
||||
if child_md:
|
||||
parts.append(child_md)
|
||||
|
||||
# Join with appropriate separator
|
||||
if self._in_table:
|
||||
return " ".join(parts)
|
||||
elif any(isinstance(child, (HeadingNode, ParagraphNode, TableNode, ListNode))
|
||||
for child in node.children):
|
||||
return "\n\n".join(parts)
|
||||
else:
|
||||
return " ".join(parts)
|
||||
|
||||
def _render_metadata(self, document: Document) -> str:
|
||||
"""Render document metadata."""
|
||||
lines = ["---"]
|
||||
|
||||
if document.metadata.company:
|
||||
lines.append(f"company: {document.metadata.company}")
|
||||
if document.metadata.form:
|
||||
lines.append(f"form: {document.metadata.form}")
|
||||
if document.metadata.filing_date:
|
||||
lines.append(f"filing_date: {document.metadata.filing_date}")
|
||||
if document.metadata.cik:
|
||||
lines.append(f"cik: {document.metadata.cik}")
|
||||
if document.metadata.accession_number:
|
||||
lines.append(f"accession_number: {document.metadata.accession_number}")
|
||||
|
||||
lines.append("---")
|
||||
|
||||
return "\n".join(lines)
|
||||
|
||||
def _generate_toc(self) -> str:
|
||||
"""Generate table of contents."""
|
||||
lines = ["## Table of Contents", ""]
|
||||
|
||||
for level, text, node_id in self._toc_entries:
|
||||
# Create anchor link
|
||||
anchor = self._create_anchor(text)
|
||||
|
||||
# Indentation based on level
|
||||
indent = " " * (level - 1)
|
||||
|
||||
# Add TOC entry
|
||||
lines.append(f"{indent}- [{text}](#{anchor})")
|
||||
|
||||
return "\n".join(lines)
|
||||
|
||||
def _create_anchor(self, text: str) -> str:
|
||||
"""Create anchor from heading text."""
|
||||
# Convert to lowercase and replace spaces with hyphens
|
||||
anchor = text.lower()
|
||||
anchor = anchor.replace(' ', '-')
|
||||
|
||||
# Remove special characters
|
||||
import re
|
||||
anchor = re.sub(r'[^a-z0-9\-]', '', anchor)
|
||||
|
||||
# Remove multiple hyphens
|
||||
anchor = re.sub(r'-+', '-', anchor)
|
||||
|
||||
return anchor.strip('-')
|
||||
|
||||
def _format_metadata(self, metadata: Dict) -> str:
|
||||
"""Format metadata for display."""
|
||||
parts = []
|
||||
|
||||
for key, value in metadata.items():
|
||||
if key == 'semantic_type':
|
||||
parts.append(f"type:{value}")
|
||||
elif key == 'section':
|
||||
parts.append(f"section:{value}")
|
||||
elif key == 'ix_tag':
|
||||
parts.append(f"xbrl:{value}")
|
||||
else:
|
||||
parts.append(f"{key}:{value}")
|
||||
|
||||
return " ".join(parts)
|
||||
|
||||
def _escape_markdown(self, text: str) -> str:
|
||||
"""Escape markdown special characters."""
|
||||
# Don't escape in tables
|
||||
if self._in_table:
|
||||
return text
|
||||
|
||||
# Escape special characters
|
||||
for char in ['\\', '`', '*', '_', '{', '}', '[', ']', '(', ')', '#', '+', '-', '.', '!']:
|
||||
text = text.replace(char, '\\' + char)
|
||||
|
||||
return text
|
||||
|
||||
def _wrap_text(self, text: str, width: int) -> str:
|
||||
"""Wrap text at specified width."""
|
||||
import textwrap
|
||||
return textwrap.fill(text, width=width, break_long_words=False)
|
||||
|
||||
def _expand_table_structure(self, node: TableNode) -> tuple:
|
||||
"""
|
||||
Expand table structure to handle column spanning properly.
|
||||
Returns (expanded_headers, expanded_data_rows).
|
||||
"""
|
||||
# Calculate the logical column count from colspan
|
||||
max_columns = 0
|
||||
|
||||
# Check all rows for maximum column span
|
||||
all_rows = []
|
||||
if node.headers:
|
||||
for header_row in node.headers:
|
||||
all_rows.append(header_row)
|
||||
for row in node.rows:
|
||||
all_rows.append(row.cells)
|
||||
|
||||
for row in all_rows:
|
||||
column_count = sum(cell.colspan for cell in row)
|
||||
max_columns = max(max_columns, column_count)
|
||||
|
||||
# Expand headers
|
||||
expanded_headers = []
|
||||
if node.headers:
|
||||
for header_row in node.headers:
|
||||
expanded = self._expand_row_to_columns(header_row, max_columns)
|
||||
expanded_headers.append(expanded)
|
||||
|
||||
# Expand data rows
|
||||
expanded_data_rows = []
|
||||
for row in node.rows:
|
||||
expanded = self._expand_row_to_columns(row.cells, max_columns)
|
||||
expanded_data_rows.append(expanded)
|
||||
|
||||
return expanded_headers, expanded_data_rows
|
||||
|
||||
def _expand_row_to_columns(self, cells: List, target_columns: int) -> List[str]:
|
||||
"""Expand a row with colspan cells to match the target column count."""
|
||||
expanded = []
|
||||
current_column = 0
|
||||
|
||||
for cell in cells:
|
||||
cell_text = cell.text().strip()
|
||||
|
||||
# Add the cell content
|
||||
expanded.append(cell_text)
|
||||
current_column += 1
|
||||
|
||||
# Add empty cells for remaining colspan
|
||||
for _ in range(cell.colspan - 1):
|
||||
if current_column < target_columns:
|
||||
expanded.append("")
|
||||
current_column += 1
|
||||
|
||||
# Pad to target column count if needed
|
||||
while len(expanded) < target_columns:
|
||||
expanded.append("")
|
||||
|
||||
return expanded[:target_columns]
|
||||
|
||||
def _identify_content_columns(self, expanded_headers: List[List[str]],
|
||||
expanded_data_rows: List[List[str]]) -> List[int]:
|
||||
"""Identify which columns actually contain meaningful content."""
|
||||
if not expanded_headers and not expanded_data_rows:
|
||||
return []
|
||||
|
||||
# Get the column count
|
||||
max_cols = 0
|
||||
if expanded_headers:
|
||||
max_cols = max(max_cols, max(len(row) for row in expanded_headers))
|
||||
if expanded_data_rows:
|
||||
max_cols = max(max_cols, max(len(row) for row in expanded_data_rows))
|
||||
|
||||
content_columns = []
|
||||
|
||||
for col in range(max_cols):
|
||||
has_content = False
|
||||
|
||||
# Check headers
|
||||
for header_row in expanded_headers:
|
||||
if col < len(header_row) and header_row[col].strip():
|
||||
has_content = True
|
||||
break
|
||||
|
||||
# Check data rows
|
||||
if not has_content:
|
||||
for data_row in expanded_data_rows:
|
||||
if col < len(data_row) and data_row[col].strip():
|
||||
has_content = True
|
||||
break
|
||||
|
||||
if has_content:
|
||||
content_columns.append(col)
|
||||
|
||||
return content_columns
|
||||
|
||||
def _combine_multi_row_headers(self, header_rows: List[List[str]]) -> List[str]:
|
||||
"""
|
||||
Combine multi-row headers intelligently for SEC filing tables.
|
||||
Prioritizes specific dates/periods over generic labels.
|
||||
"""
|
||||
if not header_rows:
|
||||
return []
|
||||
|
||||
num_columns = len(header_rows[0])
|
||||
combined = [""] * num_columns
|
||||
|
||||
for col in range(num_columns):
|
||||
# Collect all values for this column across header rows
|
||||
column_values = []
|
||||
for row in header_rows:
|
||||
if col < len(row) and row[col].strip():
|
||||
column_values.append(row[col].strip())
|
||||
|
||||
if column_values:
|
||||
# Prioritize date-like values over generic labels
|
||||
date_values = [v for v in column_values if self._looks_like_date(v)]
|
||||
if date_values:
|
||||
# Clean up line breaks in dates
|
||||
combined[col] = date_values[0].replace('\n', ' ')
|
||||
elif len(column_values) == 1:
|
||||
combined[col] = column_values[0].replace('\n', ' ')
|
||||
else:
|
||||
# Skip generic terms like "Year Ended" if we have something more specific
|
||||
specific_values = [v for v in column_values
|
||||
if v.lower() not in ['year ended', 'years ended']]
|
||||
if specific_values:
|
||||
combined[col] = specific_values[0].replace('\n', ' ')
|
||||
else:
|
||||
combined[col] = column_values[0].replace('\n', ' ')
|
||||
|
||||
return combined
|
||||
|
||||
def _looks_like_date(self, text: str) -> bool:
|
||||
"""Check if text looks like a date."""
|
||||
import re
|
||||
|
||||
# Common date patterns in SEC filings
|
||||
date_patterns = [
|
||||
r'\b(January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},?\s*\d{4}',
|
||||
r'\d{1,2}/\d{1,2}/\d{4}',
|
||||
r'\d{4}-\d{2}-\d{2}',
|
||||
r'^\d{4}$', # Just a year
|
||||
]
|
||||
|
||||
text_clean = text.replace('\n', ' ').strip()
|
||||
for pattern in date_patterns:
|
||||
if re.search(pattern, text_clean, re.IGNORECASE):
|
||||
return True
|
||||
|
||||
return False
|
||||
@@ -0,0 +1,51 @@
|
||||
"""
|
||||
Plain text renderer for parsed documents.
|
||||
"""
|
||||
|
||||
from typing import Optional
|
||||
from edgar.documents.document import Document
|
||||
from edgar.documents.extractors.text_extractor import TextExtractor
|
||||
|
||||
|
||||
class TextRenderer:
    """
    Renders parsed documents to plain text.

    Thin facade over TextExtractor so plain-text output is obtained
    through the same renderer interface as the other output formats.
    """

    def __init__(self,
                 clean: bool = True,
                 include_tables: bool = True,
                 max_length: Optional[int] = None,
                 preserve_structure: bool = False):
        """
        Initialize text renderer.

        Args:
            clean: Clean and normalize text
            include_tables: Include table content
            max_length: Maximum text length
            preserve_structure: Preserve document structure
        """
        # Metadata and link output are always disabled for plain text.
        options = dict(
            clean=clean,
            include_tables=include_tables,
            include_metadata=False,
            include_links=False,
            max_length=max_length,
            preserve_structure=preserve_structure,
        )
        self.extractor = TextExtractor(**options)

    def render(self, document: Document) -> str:
        """
        Render document to plain text.

        Args:
            document: Document to render

        Returns:
            Plain text
        """
        return self.extractor.extract(document)
|
||||
769
venv/lib/python3.10/site-packages/edgar/documents/search.py
Normal file
769
venv/lib/python3.10/site-packages/edgar/documents/search.py
Normal file
@@ -0,0 +1,769 @@
|
||||
"""
|
||||
Search functionality for parsed documents.
|
||||
|
||||
Provides both traditional search modes (TEXT, REGEX, SEMANTIC, XPATH) and
|
||||
advanced BM25-based ranking with semantic structure awareness.
|
||||
"""
|
||||
|
||||
import re
|
||||
from dataclasses import dataclass
|
||||
from enum import Enum
|
||||
from typing import List, Optional, Dict, Any, TYPE_CHECKING
|
||||
|
||||
from edgar.documents.document import Document
|
||||
from edgar.documents.nodes import Node, HeadingNode
|
||||
from edgar.documents.table_nodes import TableNode
|
||||
from edgar.documents.types import NodeType, SemanticType
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from edgar.documents.types import SearchResult as TypesSearchResult
|
||||
|
||||
|
||||
class SearchMode(Enum):
    """Supported search strategies for DocumentSearch."""

    TEXT = "text"          # plain substring search
    REGEX = "regex"        # regular-expression search
    SEMANTIC = "semantic"  # semantic/structural search (heading:/table:/section:)
    XPATH = "xpath"        # simplified XPath-like search
|
||||
|
||||
|
||||
@dataclass
class SearchResult:
    """Result of a single search hit."""
    node: Node                     # node containing the match
    text: str                      # matched text
    start_offset: int              # match start within the (context) text
    end_offset: int                # match end within the (context) text
    context: Optional[str] = None  # surrounding context, if captured
    score: float = 1.0             # relevance score (higher ranks first)

    @property
    def snippet(self) -> str:
        """Return the match wrapped in ** markers, inside its context if any."""
        if not self.context:
            return f"**{self.text}**"
        prefix = self.context[:self.start_offset]
        hit = self.context[self.start_offset:self.end_offset]
        suffix = self.context[self.end_offset:]
        return f"{prefix}**{hit}**{suffix}"
|
||||
|
||||
|
||||
class DocumentSearch:
|
||||
"""
|
||||
Search functionality for parsed documents.
|
||||
|
||||
Supports various search modes and options.
|
||||
"""
|
||||
|
||||
def __init__(self, document: Document, use_cache: bool = True):
|
||||
"""
|
||||
Initialize search with document.
|
||||
|
||||
Args:
|
||||
document: Document to search
|
||||
use_cache: Enable index caching for faster repeated searches (default: True)
|
||||
"""
|
||||
self.document = document
|
||||
self.use_cache = use_cache
|
||||
self._ranking_engines: Dict[str, Any] = {} # Cached ranking engines
|
||||
self._build_index()
|
||||
|
||||
def _build_index(self):
|
||||
"""Build search index for performance."""
|
||||
# Text index: map text to nodes
|
||||
self.text_index: Dict[str, List[Node]] = {}
|
||||
|
||||
# Type index: map node types to nodes
|
||||
self.type_index: Dict[NodeType, List[Node]] = {}
|
||||
|
||||
# Semantic index: map semantic types to nodes
|
||||
self.semantic_index: Dict[SemanticType, List[Node]] = {}
|
||||
|
||||
# Build indices
|
||||
for node in self.document.root.walk():
|
||||
# Text index
|
||||
if hasattr(node, 'text'):
|
||||
text = node.text()
|
||||
if text:
|
||||
text_lower = text.lower()
|
||||
if text_lower not in self.text_index:
|
||||
self.text_index[text_lower] = []
|
||||
self.text_index[text_lower].append(node)
|
||||
|
||||
# Type index
|
||||
if node.type not in self.type_index:
|
||||
self.type_index[node.type] = []
|
||||
self.type_index[node.type].append(node)
|
||||
|
||||
# Semantic index
|
||||
if hasattr(node, 'semantic_type') and node.semantic_type:
|
||||
if node.semantic_type not in self.semantic_index:
|
||||
self.semantic_index[node.semantic_type] = []
|
||||
self.semantic_index[node.semantic_type].append(node)
|
||||
|
||||
def search(self,
|
||||
query: str,
|
||||
mode: SearchMode = SearchMode.TEXT,
|
||||
case_sensitive: bool = False,
|
||||
whole_word: bool = False,
|
||||
limit: Optional[int] = None,
|
||||
node_types: Optional[List[NodeType]] = None,
|
||||
in_section: Optional[str] = None) -> List[SearchResult]:
|
||||
"""
|
||||
Search document.
|
||||
|
||||
Args:
|
||||
query: Search query
|
||||
mode: Search mode
|
||||
case_sensitive: Case sensitive search
|
||||
whole_word: Match whole words only
|
||||
limit: Maximum results to return
|
||||
node_types: Limit search to specific node types
|
||||
in_section: Limit search to specific section
|
||||
|
||||
Returns:
|
||||
List of search results
|
||||
"""
|
||||
if mode == SearchMode.TEXT:
|
||||
results = self._text_search(query, case_sensitive, whole_word)
|
||||
elif mode == SearchMode.REGEX:
|
||||
results = self._regex_search(query, case_sensitive)
|
||||
elif mode == SearchMode.SEMANTIC:
|
||||
results = self._semantic_search(query)
|
||||
elif mode == SearchMode.XPATH:
|
||||
results = self._xpath_search(query)
|
||||
else:
|
||||
raise ValueError(f"Unsupported search mode: {mode}")
|
||||
|
||||
# Filter by node types
|
||||
if node_types:
|
||||
results = [r for r in results if r.node.type in node_types]
|
||||
|
||||
# Filter by section
|
||||
if in_section:
|
||||
section_nodes = self._get_section_nodes(in_section)
|
||||
results = [r for r in results if r.node in section_nodes]
|
||||
|
||||
# Apply limit
|
||||
if limit and len(results) > limit:
|
||||
results = results[:limit]
|
||||
|
||||
return results
|
||||
|
||||
def _text_search(self, query: str, case_sensitive: bool, whole_word: bool) -> List[SearchResult]:
|
||||
"""Perform text search."""
|
||||
results = []
|
||||
|
||||
# Prepare query
|
||||
if not case_sensitive:
|
||||
query = query.lower()
|
||||
|
||||
# Search only leaf nodes to avoid duplicates
|
||||
for node in self.document.root.walk():
|
||||
# Skip nodes with children (they aggregate child text)
|
||||
if hasattr(node, 'children') and node.children:
|
||||
continue
|
||||
|
||||
if not hasattr(node, 'text'):
|
||||
continue
|
||||
|
||||
text = node.text()
|
||||
if not text:
|
||||
continue
|
||||
|
||||
search_text = text if case_sensitive else text.lower()
|
||||
|
||||
# Find all occurrences
|
||||
if whole_word:
|
||||
# Use word boundary regex
|
||||
pattern = r'\b' + re.escape(query) + r'\b'
|
||||
flags = 0 if case_sensitive else re.IGNORECASE
|
||||
|
||||
for match in re.finditer(pattern, text, flags):
|
||||
results.append(SearchResult(
|
||||
node=node,
|
||||
text=match.group(),
|
||||
start_offset=match.start(),
|
||||
end_offset=match.end(),
|
||||
context=self._get_context(text, match.start(), match.end())
|
||||
))
|
||||
else:
|
||||
# Simple substring search
|
||||
start = 0
|
||||
while True:
|
||||
pos = search_text.find(query, start)
|
||||
if pos == -1:
|
||||
break
|
||||
|
||||
results.append(SearchResult(
|
||||
node=node,
|
||||
text=text[pos:pos + len(query)],
|
||||
start_offset=pos,
|
||||
end_offset=pos + len(query),
|
||||
context=self._get_context(text, pos, pos + len(query))
|
||||
))
|
||||
|
||||
start = pos + 1
|
||||
|
||||
return results
|
||||
|
||||
def _regex_search(self, pattern: str, case_sensitive: bool) -> List[SearchResult]:
|
||||
"""Perform regex search."""
|
||||
results = []
|
||||
|
||||
try:
|
||||
flags = 0 if case_sensitive else re.IGNORECASE
|
||||
regex = re.compile(pattern, flags)
|
||||
except re.error as e:
|
||||
raise ValueError(f"Invalid regex pattern: {e}")
|
||||
|
||||
# Search only leaf nodes to avoid duplicates
|
||||
for node in self.document.root.walk():
|
||||
# Skip nodes with children (they aggregate child text)
|
||||
if hasattr(node, 'children') and node.children:
|
||||
continue
|
||||
|
||||
if not hasattr(node, 'text'):
|
||||
continue
|
||||
|
||||
text = node.text()
|
||||
if not text:
|
||||
continue
|
||||
|
||||
# Find all matches
|
||||
for match in regex.finditer(text):
|
||||
results.append(SearchResult(
|
||||
node=node,
|
||||
text=match.group(),
|
||||
start_offset=match.start(),
|
||||
end_offset=match.end(),
|
||||
context=self._get_context(text, match.start(), match.end())
|
||||
))
|
||||
|
||||
return results
|
||||
|
||||
def _semantic_search(self, query: str) -> List[SearchResult]:
|
||||
"""Perform semantic/structural search."""
|
||||
results = []
|
||||
|
||||
# Parse semantic query
|
||||
# Examples: "heading:Item 1", "table:revenue", "section:risk factors"
|
||||
if ':' in query:
|
||||
search_type, search_text = query.split(':', 1)
|
||||
search_type = search_type.lower().strip()
|
||||
search_text = search_text.strip()
|
||||
else:
|
||||
# Default to text search in headings
|
||||
search_type = 'heading'
|
||||
search_text = query
|
||||
|
||||
if search_type == 'heading':
|
||||
# Search headings
|
||||
for node in self.type_index.get(NodeType.HEADING, []):
|
||||
if isinstance(node, HeadingNode):
|
||||
heading_text = node.text()
|
||||
if heading_text and search_text.lower() in heading_text.lower():
|
||||
results.append(SearchResult(
|
||||
node=node,
|
||||
text=heading_text,
|
||||
start_offset=0,
|
||||
end_offset=len(heading_text),
|
||||
score=self._calculate_heading_score(node)
|
||||
))
|
||||
|
||||
elif search_type == 'table':
|
||||
# Search tables
|
||||
for node in self.type_index.get(NodeType.TABLE, []):
|
||||
if isinstance(node, TableNode):
|
||||
# Search in table content
|
||||
table_text = node.text()
|
||||
if table_text and search_text.lower() in table_text.lower():
|
||||
results.append(SearchResult(
|
||||
node=node,
|
||||
text=f"Table: {node.caption or 'Untitled'}",
|
||||
start_offset=0,
|
||||
end_offset=len(table_text),
|
||||
context=table_text[:200] + "..." if len(table_text) > 200 else table_text
|
||||
))
|
||||
|
||||
elif search_type == 'section':
|
||||
# Search sections
|
||||
sections = self.document.sections
|
||||
for section_name, section in sections.items():
|
||||
if search_text.lower() in section_name.lower():
|
||||
results.append(SearchResult(
|
||||
node=section.node,
|
||||
text=section.title,
|
||||
start_offset=section.start_offset,
|
||||
end_offset=section.end_offset,
|
||||
score=2.0 # Boost section matches
|
||||
))
|
||||
|
||||
# Sort by score
|
||||
results.sort(key=lambda r: r.score, reverse=True)
|
||||
|
||||
return results
|
||||
|
||||
def _xpath_search(self, xpath: str) -> List[SearchResult]:
|
||||
"""Perform XPath-like search."""
|
||||
results = []
|
||||
|
||||
# Simple XPath parser
|
||||
# Examples: "//h1", "//table[@class='financial']", "//p[contains(text(),'revenue')]"
|
||||
|
||||
# Extract tag name
|
||||
tag_match = re.match(r'//(\w+)', xpath)
|
||||
if not tag_match:
|
||||
raise ValueError(f"Invalid XPath: {xpath}")
|
||||
|
||||
tag_name = tag_match.group(1).lower()
|
||||
|
||||
# Map tag to node type
|
||||
tag_to_type = {
|
||||
'h1': NodeType.HEADING,
|
||||
'h2': NodeType.HEADING,
|
||||
'h3': NodeType.HEADING,
|
||||
'h4': NodeType.HEADING,
|
||||
'h5': NodeType.HEADING,
|
||||
'h6': NodeType.HEADING,
|
||||
'p': NodeType.PARAGRAPH,
|
||||
'table': NodeType.TABLE,
|
||||
'section': NodeType.SECTION
|
||||
}
|
||||
|
||||
node_type = tag_to_type.get(tag_name)
|
||||
if not node_type:
|
||||
return results
|
||||
|
||||
# Get nodes of type
|
||||
nodes = self.type_index.get(node_type, [])
|
||||
|
||||
# Apply filters
|
||||
if '[' in xpath:
|
||||
# Extract condition
|
||||
condition_match = re.search(r'\[(.*?)\]', xpath)
|
||||
if condition_match:
|
||||
condition = condition_match.group(1)
|
||||
nodes = self._apply_xpath_condition(nodes, condition)
|
||||
|
||||
# Create results
|
||||
for node in nodes:
|
||||
text = node.text() if hasattr(node, 'text') else str(node)
|
||||
results.append(SearchResult(
|
||||
node=node,
|
||||
text=text[:100] + "..." if len(text) > 100 else text,
|
||||
start_offset=0,
|
||||
end_offset=len(text)
|
||||
))
|
||||
|
||||
return results
|
||||
|
||||
def _apply_xpath_condition(self, nodes: List[Node], condition: str) -> List[Node]:
|
||||
"""Apply XPath condition to filter nodes."""
|
||||
filtered = []
|
||||
|
||||
# Parse condition
|
||||
if condition.startswith('@'):
|
||||
# Attribute condition
|
||||
attr_match = re.match(r'@(\w+)=["\']([^"\']+)["\']', condition)
|
||||
if attr_match:
|
||||
attr_name, attr_value = attr_match.groups()
|
||||
for node in nodes:
|
||||
if node.metadata.get(attr_name) == attr_value:
|
||||
filtered.append(node)
|
||||
|
||||
elif 'contains(text()' in condition:
|
||||
# Text contains condition
|
||||
text_match = re.search(r'contains\(text\(\),\s*["\']([^"\']+)["\']\)', condition)
|
||||
if text_match:
|
||||
search_text = text_match.group(1).lower()
|
||||
for node in nodes:
|
||||
if hasattr(node, 'text'):
|
||||
node_text = node.text()
|
||||
if node_text and search_text in node_text.lower():
|
||||
filtered.append(node)
|
||||
|
||||
else:
|
||||
# Level condition for headings
|
||||
try:
|
||||
level = int(condition)
|
||||
for node in nodes:
|
||||
if isinstance(node, HeadingNode) and node.level == level:
|
||||
filtered.append(node)
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
return filtered
|
||||
|
||||
def _get_context(self, text: str, start: int, end: int, context_size: int = 50) -> str:
|
||||
"""Get context around match."""
|
||||
# Calculate context boundaries
|
||||
context_start = max(0, start - context_size)
|
||||
context_end = min(len(text), end + context_size)
|
||||
|
||||
# Get context
|
||||
context = text[context_start:context_end]
|
||||
|
||||
# Add ellipsis if truncated
|
||||
if context_start > 0:
|
||||
context = "..." + context
|
||||
if context_end < len(text):
|
||||
context = context + "..."
|
||||
|
||||
# Adjust offsets for context
|
||||
if context_start > 0:
|
||||
start = start - context_start + 3 # Account for "..."
|
||||
end = end - context_start + 3
|
||||
else:
|
||||
start = start - context_start
|
||||
end = end - context_start
|
||||
|
||||
return context
|
||||
|
||||
def _calculate_heading_score(self, heading: HeadingNode) -> float:
|
||||
"""Calculate relevance score for heading."""
|
||||
# Higher level headings get higher scores
|
||||
base_score = 7 - heading.level # H1=6, H2=5, etc.
|
||||
|
||||
# Boost section headers
|
||||
if heading.semantic_type == SemanticType.SECTION_HEADER:
|
||||
base_score *= 1.5
|
||||
|
||||
return base_score
|
||||
|
||||
def _get_section_nodes(self, section_name: str) -> List[Node]:
|
||||
"""Get all nodes in a section."""
|
||||
nodes = []
|
||||
|
||||
sections = self.document.sections
|
||||
if section_name in sections:
|
||||
section = sections[section_name]
|
||||
# Get all nodes in section
|
||||
for node in section.node.walk():
|
||||
nodes.append(node)
|
||||
|
||||
return nodes
|
||||
|
||||
def find_tables(self,
|
||||
caption_pattern: Optional[str] = None,
|
||||
min_rows: Optional[int] = None,
|
||||
min_cols: Optional[int] = None) -> List[TableNode]:
|
||||
"""
|
||||
Find tables matching criteria.
|
||||
|
||||
Args:
|
||||
caption_pattern: Regex pattern for caption
|
||||
min_rows: Minimum number of rows
|
||||
min_cols: Minimum number of columns
|
||||
|
||||
Returns:
|
||||
List of matching tables
|
||||
"""
|
||||
tables = []
|
||||
|
||||
for node in self.type_index.get(NodeType.TABLE, []):
|
||||
if not isinstance(node, TableNode):
|
||||
continue
|
||||
|
||||
# Check caption
|
||||
if caption_pattern and node.caption:
|
||||
if not re.search(caption_pattern, node.caption, re.IGNORECASE):
|
||||
continue
|
||||
|
||||
# Check dimensions
|
||||
if min_rows and node.row_count < min_rows:
|
||||
continue
|
||||
if min_cols and node.col_count < min_cols:
|
||||
continue
|
||||
|
||||
tables.append(node)
|
||||
|
||||
return tables
|
||||
|
||||
def find_headings(self,
|
||||
level: Optional[int] = None,
|
||||
pattern: Optional[str] = None) -> List[HeadingNode]:
|
||||
"""
|
||||
Find headings matching criteria.
|
||||
|
||||
Args:
|
||||
level: Heading level (1-6)
|
||||
pattern: Regex pattern for heading text
|
||||
|
||||
Returns:
|
||||
List of matching headings
|
||||
"""
|
||||
headings = []
|
||||
|
||||
for node in self.type_index.get(NodeType.HEADING, []):
|
||||
if not isinstance(node, HeadingNode):
|
||||
continue
|
||||
|
||||
# Check level
|
||||
if level and node.level != level:
|
||||
continue
|
||||
|
||||
# Check pattern
|
||||
if pattern:
|
||||
heading_text = node.text()
|
||||
if not heading_text or not re.search(pattern, heading_text, re.IGNORECASE):
|
||||
continue
|
||||
|
||||
headings.append(node)
|
||||
|
||||
return headings
|
||||
|
||||
def ranked_search(self,
|
||||
query: str,
|
||||
algorithm: str = "hybrid",
|
||||
top_k: int = 10,
|
||||
node_types: Optional[List[NodeType]] = None,
|
||||
in_section: Optional[str] = None,
|
||||
boost_sections: Optional[List[str]] = None) -> List['TypesSearchResult']:
|
||||
"""
|
||||
Advanced search with BM25-based ranking and semantic structure awareness.
|
||||
|
||||
This provides relevance-ranked results better suited for financial documents
|
||||
than simple substring matching. Uses BM25 for exact term matching combined
|
||||
with semantic structure boosting for gateway content detection.
|
||||
|
||||
Args:
|
||||
query: Search query
|
||||
algorithm: Ranking algorithm ("bm25", "hybrid", "semantic")
|
||||
top_k: Maximum results to return
|
||||
node_types: Limit search to specific node types
|
||||
in_section: Limit search to specific section
|
||||
boost_sections: Section names to boost (e.g., ["Risk Factors"])
|
||||
|
||||
Returns:
|
||||
List of SearchResult objects with relevance scores (from types.py)
|
||||
|
||||
Examples:
|
||||
>>> searcher = DocumentSearch(document)
|
||||
>>> results = searcher.ranked_search("revenue growth", algorithm="hybrid", top_k=5)
|
||||
>>> for result in results:
|
||||
>>> print(f"Score: {result.score:.3f}")
|
||||
>>> print(f"Text: {result.snippet}")
|
||||
>>> print(f"Full context: {result.full_context[:200]}...")
|
||||
"""
|
||||
from edgar.documents.ranking.ranking import (
|
||||
BM25Engine,
|
||||
HybridEngine,
|
||||
SemanticEngine
|
||||
)
|
||||
from edgar.documents.types import SearchResult as TypesSearchResult
|
||||
|
||||
# Get all leaf nodes for ranking (avoid duplicates from parent nodes)
|
||||
nodes = []
|
||||
for node in self.document.root.walk():
|
||||
# Only include leaf nodes with text
|
||||
if hasattr(node, 'children') and node.children:
|
||||
continue # Skip parent nodes
|
||||
if hasattr(node, 'text'):
|
||||
text = node.text()
|
||||
if text and len(text.strip()) > 0:
|
||||
nodes.append(node)
|
||||
|
||||
# Filter by node types if specified
|
||||
if node_types:
|
||||
nodes = [n for n in nodes if n.type in node_types]
|
||||
|
||||
# Filter by section if specified
|
||||
if in_section:
|
||||
section_nodes = self._get_section_nodes(in_section)
|
||||
nodes = [n for n in nodes if n in section_nodes]
|
||||
|
||||
if not nodes:
|
||||
return []
|
||||
|
||||
# Select ranking engine (with caching)
|
||||
engine = self._get_ranking_engine(algorithm.lower(), nodes, boost_sections)
|
||||
|
||||
# Rank nodes
|
||||
ranked_results = engine.rank(query, nodes)
|
||||
|
||||
# Convert to types.SearchResult format and add section context
|
||||
search_results = []
|
||||
for ranked in ranked_results[:top_k]:
|
||||
# Try to find which section this node belongs to
|
||||
section_obj = self._find_node_section(ranked.node)
|
||||
|
||||
search_results.append(TypesSearchResult(
|
||||
node=ranked.node,
|
||||
score=ranked.score,
|
||||
snippet=ranked.snippet,
|
||||
section=section_obj.name if section_obj else None,
|
||||
context=ranked.text if len(ranked.text) <= 500 else ranked.text[:497] + "...",
|
||||
_section_obj=section_obj # Agent navigation support
|
||||
))
|
||||
|
||||
return search_results
|
||||
|
||||
def _get_ranking_engine(self, algorithm: str, nodes: List[Node],
|
||||
boost_sections: Optional[List[str]] = None):
|
||||
"""
|
||||
Get or create ranking engine with caching support.
|
||||
|
||||
Args:
|
||||
algorithm: Ranking algorithm ("bm25", "hybrid", "semantic")
|
||||
nodes: Nodes to index
|
||||
boost_sections: Section names to boost (for hybrid/semantic)
|
||||
|
||||
Returns:
|
||||
Ready-to-use ranking engine
|
||||
"""
|
||||
from edgar.documents.ranking.ranking import (
|
||||
BM25Engine,
|
||||
HybridEngine,
|
||||
SemanticEngine
|
||||
)
|
||||
from edgar.documents.ranking.cache import get_search_cache, CacheEntry
|
||||
from datetime import datetime
|
||||
|
||||
# Create cache key
|
||||
# Use document ID, algorithm, and sample of first node for stability
|
||||
content_sample = nodes[0].text()[:200] if nodes and hasattr(nodes[0], 'text') else ""
|
||||
cache_key = f"{self.document.accession_number if hasattr(self.document, 'accession_number') else id(self.document)}_{algorithm}"
|
||||
|
||||
# Check instance cache first (for same search session)
|
||||
if cache_key in self._ranking_engines:
|
||||
engine, cached_nodes = self._ranking_engines[cache_key]
|
||||
# Verify nodes haven't changed
|
||||
if cached_nodes == nodes:
|
||||
return engine
|
||||
|
||||
# Create engine based on algorithm
|
||||
if algorithm == "bm25":
|
||||
engine = BM25Engine()
|
||||
elif algorithm == "hybrid":
|
||||
engine = HybridEngine(boost_sections=boost_sections)
|
||||
elif algorithm == "semantic":
|
||||
engine = SemanticEngine(boost_sections=boost_sections)
|
||||
else:
|
||||
raise ValueError(f"Unsupported algorithm: {algorithm}")
|
||||
|
||||
# Try to load from global cache if enabled
|
||||
if self.use_cache and algorithm == "bm25": # Only cache BM25 for now
|
||||
search_cache = get_search_cache()
|
||||
document_hash = search_cache.compute_document_hash(
|
||||
document_id=cache_key,
|
||||
content_sample=content_sample
|
||||
)
|
||||
|
||||
cached_entry = search_cache.get(document_hash)
|
||||
if cached_entry:
|
||||
# Load index from cache
|
||||
try:
|
||||
engine.load_index_data(cached_entry.index_data, nodes)
|
||||
# Cache in instance
|
||||
self._ranking_engines[cache_key] = (engine, nodes)
|
||||
return engine
|
||||
except Exception as e:
|
||||
# Cache load failed, rebuild
|
||||
pass
|
||||
|
||||
# Build fresh index
|
||||
# For BM25/Hybrid, index is built lazily on first rank() call
|
||||
# But we can force it here and cache the result
|
||||
if self.use_cache and algorithm == "bm25":
|
||||
# Force index build by doing a dummy rank
|
||||
engine._build_index(nodes)
|
||||
|
||||
# Save to global cache
|
||||
try:
|
||||
search_cache = get_search_cache()
|
||||
document_hash = search_cache.compute_document_hash(
|
||||
document_id=cache_key,
|
||||
content_sample=content_sample
|
||||
)
|
||||
|
||||
index_data = engine.get_index_data()
|
||||
cache_entry = CacheEntry(
|
||||
document_hash=document_hash,
|
||||
index_data=index_data,
|
||||
created_at=datetime.now()
|
||||
)
|
||||
search_cache.put(document_hash, cache_entry)
|
||||
except Exception as e:
|
||||
# Cache save failed, not critical
|
||||
pass
|
||||
|
||||
# Cache in instance
|
||||
self._ranking_engines[cache_key] = (engine, nodes)
|
||||
|
||||
return engine
|
||||
|
||||
def get_cache_stats(self) -> Dict[str, Any]:
|
||||
"""
|
||||
Get search cache statistics.
|
||||
|
||||
Returns:
|
||||
Dictionary with cache performance metrics including:
|
||||
- memory_entries: Number of indices in memory
|
||||
- disk_entries: Number of indices on disk
|
||||
- cache_hits: Total cache hits
|
||||
- cache_misses: Total cache misses
|
||||
- hit_rate: Cache hit rate (0-1)
|
||||
- memory_size_mb: Estimated memory usage in MB
|
||||
|
||||
Examples:
|
||||
>>> searcher = DocumentSearch(document)
|
||||
>>> searcher.ranked_search("revenue", algorithm="bm25")
|
||||
>>> stats = searcher.get_cache_stats()
|
||||
>>> print(f"Hit rate: {stats['hit_rate']:.1%}")
|
||||
"""
|
||||
from edgar.documents.ranking.cache import get_search_cache
|
||||
|
||||
stats = {
|
||||
'instance_cache_entries': len(self._ranking_engines),
|
||||
'global_cache_stats': {}
|
||||
}
|
||||
|
||||
if self.use_cache:
|
||||
cache = get_search_cache()
|
||||
stats['global_cache_stats'] = cache.get_stats()
|
||||
|
||||
return stats
|
||||
|
||||
def clear_cache(self, memory_only: bool = False) -> None:
|
||||
"""
|
||||
Clear search caches.
|
||||
|
||||
Args:
|
||||
memory_only: If True, only clear in-memory caches (default: False)
|
||||
|
||||
Examples:
|
||||
>>> searcher = DocumentSearch(document)
|
||||
>>> searcher.clear_cache() # Clear all caches
|
||||
>>> searcher.clear_cache(memory_only=True) # Only clear memory
|
||||
"""
|
||||
# Clear instance cache
|
||||
self._ranking_engines.clear()
|
||||
|
||||
# Clear global cache if enabled
|
||||
if self.use_cache:
|
||||
from edgar.documents.ranking.cache import get_search_cache
|
||||
cache = get_search_cache()
|
||||
cache.clear(memory_only=memory_only)
|
||||
|
||||
    def _find_node_section(self, node: Node):
        """
        Find which section a node belongs to.

        Walks from `node` up through its ancestors; for each candidate it
        scans every registered section's subtree looking for an identity
        match. Sections are tried in `self.document.sections` iteration
        order, so the first section whose subtree contains the node (or one
        of its ancestors) wins.

        NOTE(review): this is O(ancestors * sections * subtree size). If
        walk() yields all descendants, a node whose ancestor is in a section
        subtree is presumably in that subtree itself, which would make the
        ancestor loop redundant -- confirm walk() semantics before
        simplifying.

        Args:
            node: Node to locate.

        Returns:
            Section object, or None when no section subtree contains the node.
        """
        # Walk up the tree, testing the node and each ancestor in turn.
        current = node
        while current:
            # Check every known section for the current candidate.
            for section_name, section in self.document.sections.items():
                # Identity scan of the section's entire subtree.
                for section_node in section.node.walk():
                    if section_node is current or section_node is node:
                        return section

            # Move to the parent; nodes without a parent attribute end the walk.
            current = current.parent if hasattr(current, 'parent') else None

        return None
|
||||
@@ -0,0 +1,15 @@
|
||||
"""
|
||||
Parsing strategies for different content types.
|
||||
"""
|
||||
|
||||
from edgar.documents.strategies.document_builder import DocumentBuilder
|
||||
from edgar.documents.strategies.header_detection import HeaderDetectionStrategy
|
||||
from edgar.documents.strategies.table_processing import TableProcessor
|
||||
from edgar.documents.strategies.xbrl_extraction import XBRLExtractor
|
||||
|
||||
__all__ = [
|
||||
'DocumentBuilder',
|
||||
'HeaderDetectionStrategy',
|
||||
'TableProcessor',
|
||||
'XBRLExtractor'
|
||||
]
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,670 @@
|
||||
"""
|
||||
Document builder that converts parsed HTML tree into document nodes.
|
||||
"""
|
||||
|
||||
from typing import Dict, Any, Optional
|
||||
|
||||
from lxml.html import HtmlElement
|
||||
|
||||
from edgar.documents.config import ParserConfig
|
||||
from edgar.documents.nodes import (
|
||||
Node, DocumentNode, TextNode, ParagraphNode, HeadingNode,
|
||||
ContainerNode, SectionNode, ListNode, ListItemNode, LinkNode, ImageNode
|
||||
)
|
||||
from edgar.documents.strategies.style_parser import StyleParser
|
||||
from edgar.documents.table_nodes import TableNode, Cell, Row
|
||||
from edgar.documents.types import Style, ParseContext, SemanticType
|
||||
|
||||
|
||||
class DocumentBuilder:
    """
    Builds Document node tree from parsed HTML.

    Handles the conversion of HTML elements into structured nodes
    with proper hierarchy and metadata.
    """

    # HTML elements that establish their own block context. Used when
    # deciding between container/paragraph nodes and inline text handling.
    BLOCK_ELEMENTS = {
        'div', 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
        'ul', 'ol', 'li', 'blockquote', 'pre', 'hr',
        'table', 'form', 'fieldset', 'address', 'section',
        'article', 'aside', 'nav', 'header', 'footer', 'main'
    }

    # Elements whose text flows inline with the surrounding content.
    INLINE_ELEMENTS = {
        'span', 'a', 'em', 'strong', 'b', 'i', 'u', 's',
        'small', 'mark', 'del', 'ins', 'sub', 'sup',
        'code', 'kbd', 'var', 'samp', 'abbr', 'cite',
        'q', 'time', 'font',
        # IXBRL inline elements for simple values - should not break text flow
        'ix:nonfraction', 'ix:footnote', 'ix:fraction'
    }

    # Elements dropped entirely (their tail text is still preserved).
    SKIP_ELEMENTS = {
        'script', 'style', 'meta', 'link', 'noscript',
        # IXBRL exclude elements - content that should not appear in final document
        'ix:exclude'
    }
|
||||
|
||||
    def __init__(self, config: ParserConfig, strategies: Dict[str, Any]):
        """
        Initialize document builder.

        Args:
            config: Parser configuration
            strategies: Parsing strategies keyed by name; 'header_detection'
                and 'table_processing' are consulted when present.
        """
        self.config = config
        self.strategies = strategies
        # Parses inline CSS style attributes into Style objects.
        self.style_parser = StyleParser()
        # Shared parse state (e.g. current element depth).
        self.context = ParseContext()

        # Track XBRL context while descending namespaced ix: elements.
        self.xbrl_context_stack = []
        self.xbrl_continuations = {}
|
||||
|
||||
def build(self, tree: HtmlElement) -> DocumentNode:
|
||||
"""
|
||||
Build document from HTML tree.
|
||||
|
||||
Args:
|
||||
tree: Parsed HTML tree
|
||||
|
||||
Returns:
|
||||
Document root node
|
||||
"""
|
||||
# Create root document node
|
||||
root = DocumentNode()
|
||||
|
||||
# Find body element
|
||||
body = tree.find('.//body')
|
||||
if body is None:
|
||||
# If no body, use the entire tree
|
||||
body = tree
|
||||
|
||||
# Process body content
|
||||
self._process_element(body, root)
|
||||
|
||||
# Apply node merging if configured
|
||||
if self.config.merge_adjacent_nodes:
|
||||
self._merge_adjacent_nodes(root)
|
||||
|
||||
return root
|
||||
|
||||
    def _process_element(self, element: HtmlElement, parent: Node) -> Optional[Node]:
        """
        Process HTML element into node.

        Recursively converts `element` (and possibly its children) into
        document nodes attached under `parent`. Tail text (text that follows
        an element's closing tag) is always attached to the *parent*, since
        in HTML it belongs to the enclosing element's flow.

        Args:
            element: HTML element to process
            parent: Parent node

        Returns:
            Created node or None if skipped
        """

        # Skip certain elements but preserve their tail text
        if element.tag in self.SKIP_ELEMENTS:
            # Process tail text even when skipping element
            if element.tail:
                if self.config.preserve_whitespace:
                    text_node = TextNode(content=element.tail)
                    parent.add_child(text_node)
                else:
                    if element.tail.strip():
                        text_node = TextNode(content=element.tail.strip())
                        parent.add_child(text_node)
            return None

        # Skip page number containers
        if self._is_page_number_container(element):
            return None

        # Skip page break elements
        if self._is_page_break_element(element):
            return None

        # Skip navigation containers that follow page breaks
        if self._is_page_navigation_container(element):
            return None

        # Track parsing depth (restored in the finally block below).
        self.context.depth += 1

        try:
            # Handle XBRL elements
            if element.tag.startswith('{'):  # Namespaced element
                self._enter_xbrl_context(element)

            # Extract style
            style = self._extract_style(element)

            # Create appropriate node based on element type
            node = self._create_node_for_element(element, style)

            if node:
                # Add XBRL metadata if in context
                if self.xbrl_context_stack:
                    node.metadata.update(self._get_current_xbrl_metadata())

                # Add to parent
                parent.add_child(node)

                # Process children for container nodes
                if self._should_process_children(element, node):
                    # Add element's direct text first, before any children.
                    if element.text:
                        if self.config.preserve_whitespace:
                            if element.text:  # Don't strip whitespace
                                text_node = TextNode(content=element.text)
                                node.add_child(text_node)
                        else:
                            if element.text.strip():
                                text_node = TextNode(content=element.text.strip())
                                node.add_child(text_node)

                    # Process child elements
                    for child in element:
                        self._process_element(child, node)

                    # Process text after children (tail goes to the parent).
                    if element.tail:
                        if self.config.preserve_whitespace:
                            text_node = TextNode(content=element.tail)
                            parent.add_child(text_node)
                        else:
                            if element.tail.strip():
                                text_node = TextNode(content=element.tail.strip())
                                parent.add_child(text_node)
                            elif element.tail.isspace():
                                # Even if tail is just whitespace, preserve the spacing info
                                # This helps with inline element spacing decisions
                                if hasattr(node, 'set_metadata'):
                                    node.set_metadata('has_tail_whitespace', True)
                else:
                    # Node created but children not processed - still need to handle tail
                    if element.tail:
                        if self.config.preserve_whitespace:
                            text_node = TextNode(content=element.tail)
                            parent.add_child(text_node)
                        else:
                            if element.tail.strip():
                                text_node = TextNode(content=element.tail.strip())
                                parent.add_child(text_node)
                            elif element.tail.isspace():
                                # Even if tail is just whitespace, preserve the spacing info
                                if hasattr(node, 'set_metadata'):
                                    node.set_metadata('has_tail_whitespace', True)
            else:
                # No node created, process children with same parent
                for child in element:
                    self._process_element(child, parent)

                # Process tail text
                if element.tail:
                    if self.config.preserve_whitespace:
                        text_node = TextNode(content=element.tail)
                        parent.add_child(text_node)
                    else:
                        if element.tail.strip():
                            text_node = TextNode(content=element.tail.strip())
                            parent.add_child(text_node)

            # Exit XBRL context
            if element.tag.startswith('{'):
                self._exit_xbrl_context(element)

            return node

        finally:
            self.context.depth -= 1
|
||||
|
||||
def _create_node_for_element(self, element: HtmlElement, style: Style) -> Optional[Node]:
|
||||
"""Create appropriate node for HTML element."""
|
||||
tag = element.tag.lower() if not element.tag.startswith('{') else element.tag
|
||||
|
||||
|
||||
# Check for heading
|
||||
if tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
|
||||
level = int(tag[1])
|
||||
text = self._get_element_text(element)
|
||||
if text:
|
||||
return HeadingNode(content=text, level=level, style=style)
|
||||
|
||||
# Handle specific elements first before header detection
|
||||
if tag == 'p':
|
||||
return ParagraphNode(style=style)
|
||||
|
||||
elif tag == 'li':
|
||||
return ListItemNode(style=style)
|
||||
|
||||
# Check if element might be a heading based on style/content
|
||||
# Skip header detection for certain tags that should never be headers
|
||||
skip_header_detection_tags = {
|
||||
'li', 'td', 'th', 'option', 'a', 'button', 'label',
|
||||
# IXBRL inline elements - should not be treated as headers
|
||||
'ix:nonfraction', 'ix:footnote', 'ix:fraction',
|
||||
# IXBRL elements that can contain tables and complex content
|
||||
'ix:nonNumeric', 'ix:continuation'
|
||||
}
|
||||
if tag not in skip_header_detection_tags and self.strategies.get('header_detection'):
|
||||
header_info = self.strategies['header_detection'].detect(element, self.context)
|
||||
if header_info and header_info.confidence > self.config.header_detection_threshold:
|
||||
text = self._get_element_text(element)
|
||||
if text:
|
||||
node = HeadingNode(
|
||||
content=text,
|
||||
level=header_info.level,
|
||||
style=style
|
||||
)
|
||||
# Add header metadata
|
||||
node.set_metadata('detection_method', header_info.detection_method)
|
||||
node.set_metadata('confidence', header_info.confidence)
|
||||
if header_info.is_item:
|
||||
node.semantic_type = SemanticType.ITEM_HEADER
|
||||
node.set_metadata('item_number', header_info.item_number)
|
||||
return node
|
||||
|
||||
# Continue handling other specific elements
|
||||
if tag == 'table':
|
||||
if self.strategies.get('table_processing'):
|
||||
return self.strategies['table_processing'].process(element)
|
||||
else:
|
||||
return self._process_table_basic(element, style)
|
||||
|
||||
elif tag in ['ul', 'ol']:
|
||||
return ListNode(ordered=(tag == 'ol'), style=style)
|
||||
|
||||
elif tag == 'li':
|
||||
return ListItemNode(style=style)
|
||||
|
||||
elif tag == 'a':
|
||||
href = element.get('href', '')
|
||||
title = element.get('title', '')
|
||||
text = self._get_element_text(element)
|
||||
return LinkNode(content=text, href=href, title=title, style=style)
|
||||
|
||||
elif tag == 'img':
|
||||
return ImageNode(
|
||||
src=element.get('src'),
|
||||
alt=element.get('alt'),
|
||||
width=self._parse_dimension(element.get('width')),
|
||||
height=self._parse_dimension(element.get('height')),
|
||||
style=style
|
||||
)
|
||||
|
||||
elif tag == 'br':
|
||||
# Line break - add as text node
|
||||
return TextNode(content='\n')
|
||||
|
||||
elif tag in ['section', 'article']:
|
||||
return SectionNode(style=style)
|
||||
|
||||
elif tag == 'div' or tag in self.BLOCK_ELEMENTS:
|
||||
# Check if CSS display property makes this inline
|
||||
if style.display in ['inline', 'inline-block']:
|
||||
# Treat as inline element despite being a div
|
||||
text = self._get_element_text(element)
|
||||
if text:
|
||||
text_node = TextNode(content=text, style=style)
|
||||
text_node.set_metadata('original_tag', tag)
|
||||
text_node.set_metadata('inline_via_css', True)
|
||||
return text_node
|
||||
# If no text but inline, still process children inline
|
||||
return ContainerNode(tag_name=tag, style=style)
|
||||
|
||||
# Normal block behavior
|
||||
# Check if this is just a text container with only inline elements
|
||||
if self._is_text_only_container(element):
|
||||
# Create ParagraphNode for divs containing only inline elements
|
||||
# This ensures proper text concatenation for spans, etc.
|
||||
return ParagraphNode(style=style)
|
||||
else:
|
||||
return ContainerNode(tag_name=tag, style=style)
|
||||
|
||||
elif tag in self.INLINE_ELEMENTS:
|
||||
# Inline elements - extract text and add to parent
|
||||
text = self._get_element_text(element)
|
||||
if text:
|
||||
text_node = TextNode(content=text, style=style)
|
||||
# Preserve inline element metadata
|
||||
text_node.set_metadata('original_tag', tag)
|
||||
return text_node
|
||||
|
||||
elif tag in ['ix:nonNumeric', 'ix:continuation']:
|
||||
# IXBRL elements that can contain complex content including tables
|
||||
# Process as container to allow proper table parsing
|
||||
return ContainerNode(tag_name=tag, style=style)
|
||||
|
||||
# Default: create container for unknown elements
|
||||
return ContainerNode(tag_name=tag, style=style)
|
||||
|
||||
def _is_page_number_container(self, element: HtmlElement) -> bool:
|
||||
"""Detect and filter page number containers across various SEC filing patterns."""
|
||||
import re
|
||||
|
||||
# Get text content first - all page numbers should be short
|
||||
text_content = element.text_content().strip()
|
||||
|
||||
# Must be short content (1-8 chars to handle "Page X" format)
|
||||
if len(text_content) > 8 or len(text_content) == 0:
|
||||
return False
|
||||
|
||||
# Must be numeric, roman numerals, or "Page X" format
|
||||
if not self._is_page_number_content(text_content):
|
||||
return False
|
||||
|
||||
# Check various patterns based on element type and styling
|
||||
tag = element.tag.lower()
|
||||
|
||||
# Pattern 1: Oracle-style flexbox containers (highest confidence)
|
||||
if tag == 'div' and self._is_flexbox_page_number(element):
|
||||
return True
|
||||
|
||||
# Pattern 2: Center/right aligned paragraphs (common pattern)
|
||||
if tag == 'p' and self._is_aligned_page_number(element):
|
||||
return True
|
||||
|
||||
# Pattern 3: Footer-style divs with centered page numbers
|
||||
if tag == 'div' and self._is_footer_page_number(element):
|
||||
return True
|
||||
|
||||
# Pattern 4: Simple divs with page break context
|
||||
if tag == 'div' and self._is_page_break_context(element):
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
def _is_page_number_content(self, text: str) -> bool:
|
||||
"""Check if text content looks like a page number."""
|
||||
import re
|
||||
|
||||
# Simple numeric (most common)
|
||||
if text.isdigit():
|
||||
return True
|
||||
|
||||
# Roman numerals
|
||||
if re.match(r'^[ivxlcdm]+$', text.lower()):
|
||||
return True
|
||||
|
||||
# "Page X" or "Page X of Y" format
|
||||
if re.match(r'^page\s+\d+(\s+of\s+\d+)?$', text.lower()):
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
def _is_flexbox_page_number(self, element: HtmlElement) -> bool:
|
||||
"""Detect Oracle-style flexbox page number containers."""
|
||||
import re
|
||||
|
||||
style_attr = element.get('style', '')
|
||||
if not style_attr:
|
||||
return False
|
||||
|
||||
# Must have: display:flex, justify-content:flex-end, min-height:1in
|
||||
required_patterns = [
|
||||
r'display:\s*flex',
|
||||
r'justify-content:\s*flex-end',
|
||||
r'min-height:\s*1in'
|
||||
]
|
||||
|
||||
return all(re.search(pattern, style_attr) for pattern in required_patterns)
|
||||
|
||||
def _is_aligned_page_number(self, element: HtmlElement) -> bool:
|
||||
"""Detect center or right-aligned page number paragraphs."""
|
||||
import re
|
||||
|
||||
style_attr = element.get('style', '')
|
||||
|
||||
# Check for center or right alignment
|
||||
alignment_pattern = r'text-align:\s*(center|right)'
|
||||
if not re.search(alignment_pattern, style_attr):
|
||||
return False
|
||||
|
||||
# Optional: check for smaller font size (common in page numbers)
|
||||
font_size_pattern = r'font-size:\s*([0-9]+)pt'
|
||||
font_match = re.search(font_size_pattern, style_attr)
|
||||
if font_match:
|
||||
font_size = int(font_match.group(1))
|
||||
# Page numbers often use smaller fonts (8-12pt)
|
||||
if font_size <= 12:
|
||||
return True
|
||||
|
||||
return True # Any center/right aligned short content
|
||||
|
||||
def _is_footer_page_number(self, element: HtmlElement) -> bool:
|
||||
"""Detect footer-style page number containers."""
|
||||
import re
|
||||
|
||||
style_attr = element.get('style', '')
|
||||
|
||||
# Look for bottom positioning or footer-like styling
|
||||
footer_patterns = [
|
||||
r'bottom:\s*[0-9]',
|
||||
r'position:\s*absolute',
|
||||
r'margin-bottom:\s*0',
|
||||
r'text-align:\s*center'
|
||||
]
|
||||
|
||||
# Need at least 2 footer indicators
|
||||
matches = sum(1 for pattern in footer_patterns if re.search(pattern, style_attr))
|
||||
return matches >= 2
|
||||
|
||||
def _is_page_break_context(self, element: HtmlElement) -> bool:
|
||||
"""Check if element is near page breaks (common page number context)."""
|
||||
|
||||
# Check next sibling for page break HR
|
||||
next_elem = element.getnext()
|
||||
if next_elem is not None and next_elem.tag == 'hr':
|
||||
hr_style = next_elem.get('style', '')
|
||||
if 'page-break' in hr_style:
|
||||
return True
|
||||
|
||||
# Check if element has page-break styling itself
|
||||
style_attr = element.get('style', '')
|
||||
if 'page-break' in style_attr:
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
def _is_page_break_element(self, element: HtmlElement) -> bool:
|
||||
"""Detect page break HR elements."""
|
||||
if element.tag.lower() != 'hr':
|
||||
return False
|
||||
|
||||
style_attr = element.get('style', '')
|
||||
|
||||
# Check for page-break-after:always or similar page break styles
|
||||
return 'page-break' in style_attr
|
||||
|
||||
def _is_page_navigation_container(self, element: HtmlElement) -> bool:
|
||||
"""Detect navigation containers that appear after page breaks."""
|
||||
if element.tag.lower() != 'div':
|
||||
return False
|
||||
|
||||
style_attr = element.get('style', '')
|
||||
|
||||
# Check for navigation container patterns
|
||||
# Often have: padding-top, min-height:1in, box-sizing:border-box
|
||||
nav_indicators = [
|
||||
r'padding-top:\s*0\.5in',
|
||||
r'min-height:\s*1in',
|
||||
r'box-sizing:\s*border-box'
|
||||
]
|
||||
|
||||
import re
|
||||
matches = sum(1 for pattern in nav_indicators if re.search(pattern, style_attr))
|
||||
|
||||
# Need at least 2 indicators
|
||||
if matches < 2:
|
||||
return False
|
||||
|
||||
# Check if it contains typical navigation content
|
||||
text_content = element.text_content().strip().lower()
|
||||
|
||||
# Common navigation phrases
|
||||
nav_phrases = [
|
||||
'table of contents',
|
||||
'index to financial statements',
|
||||
'table of content',
|
||||
'index to financial statement'
|
||||
]
|
||||
|
||||
return any(phrase in text_content for phrase in nav_phrases)
|
||||
|
||||
def _extract_style(self, element: HtmlElement) -> Style:
|
||||
"""Extract style from element."""
|
||||
style_str = element.get('style', '')
|
||||
style = self.style_parser.parse(style_str)
|
||||
|
||||
# Add tag-specific styles
|
||||
tag = element.tag.lower()
|
||||
if tag == 'b' or tag == 'strong':
|
||||
style.font_weight = 'bold'
|
||||
elif tag == 'i' or tag == 'em':
|
||||
style.font_style = 'italic'
|
||||
elif tag == 'u':
|
||||
style.text_decoration = 'underline'
|
||||
|
||||
# Handle alignment
|
||||
align = element.get('align')
|
||||
if align:
|
||||
style.text_align = align
|
||||
|
||||
return style
|
||||
|
||||
def _get_element_text(self, element: HtmlElement) -> str:
|
||||
"""Get text content from element."""
|
||||
text_parts = []
|
||||
|
||||
# Get element's direct text
|
||||
if element.text:
|
||||
# For inline elements, preserve leading/trailing whitespace
|
||||
if element.tag.lower() in self.INLINE_ELEMENTS:
|
||||
text_parts.append(element.text)
|
||||
else:
|
||||
text_parts.append(element.text.strip())
|
||||
|
||||
# For simple elements, get all text content
|
||||
if element.tag.lower() in self.INLINE_ELEMENTS or \
|
||||
element.tag.lower() in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
|
||||
# Get all text including from child elements
|
||||
for child in element:
|
||||
if child.tag.lower() not in self.SKIP_ELEMENTS:
|
||||
child_text = child.text_content()
|
||||
if child_text:
|
||||
# For inline elements, preserve whitespace in child content too
|
||||
if element.tag.lower() in self.INLINE_ELEMENTS:
|
||||
text_parts.append(child_text)
|
||||
else:
|
||||
text_parts.append(child_text.strip())
|
||||
|
||||
# For inline elements with preserved whitespace, concatenate directly
|
||||
# For others, join with spaces
|
||||
if element.tag.lower() in self.INLINE_ELEMENTS and len(text_parts) == 1:
|
||||
return text_parts[0] if text_parts else ''
|
||||
else:
|
||||
return ' '.join(text_parts)
|
||||
|
||||
def _is_text_only_container(self, element: HtmlElement) -> bool:
|
||||
"""Check if element contains only text and inline elements."""
|
||||
for child in element:
|
||||
if child.tag.lower() in self.BLOCK_ELEMENTS:
|
||||
return False
|
||||
if child.tag.lower() == 'table':
|
||||
return False
|
||||
return True
|
||||
|
||||
def _should_process_children(self, element: HtmlElement, node: Node) -> bool:
    """Decide whether the element's children need separate processing.

    Text and heading nodes already capture their own content, and tables
    are processed by a dedicated path, so none of those recurse here.
    """
    return not isinstance(node, (TextNode, HeadingNode, TableNode))
|
||||
|
||||
def _process_table_basic(self, element: HtmlElement, style: Style) -> TableNode:
    """Basic table processing without advanced strategy.

    Fallback path: extracts the caption, then classifies each <tr> as a
    header row (inside <thead>, or containing a <th>) or a data row.

    Args:
        element: The <table> element.
        style: Pre-parsed style for the table.

    Returns:
        A populated TableNode.
    """
    table = TableNode(style=style)

    # Set config for rendering decisions
    table._config = self.config

    # Extract caption
    caption_elem = element.find('.//caption')
    if caption_elem is not None:
        table.caption = caption_elem.text_content().strip()

    # Process rows
    for tr in element.findall('.//tr'):
        cells = []
        # NOTE(review): concatenating the td and th result lists puts all
        # <th> cells after all <td> cells, losing document order when a
        # row mixes both — confirm this is acceptable for the fallback.
        for td in tr.findall('.//td') + tr.findall('.//th'):
            # NOTE(review): int() raises ValueError for malformed
            # colspan/rowspan values (e.g. '2%') — assumes sanitized HTML.
            cell = Cell(
                content=td.text_content().strip(),
                colspan=int(td.get('colspan', '1')),
                rowspan=int(td.get('rowspan', '1')),
                # NOTE(review): compared without .lower(), unlike the rest
                # of this module — presumably relies on the HTML parser
                # lowercasing tags; verify.
                is_header=(td.tag == 'th'),
                align=td.get('align')
            )
            cells.append(cell)

        if cells:
            row = Row(cells=cells, is_header=(tr.find('.//th') is not None))

            # Determine if header or data row: rows under <thead> or
            # containing any <th> go to table.headers, others to rows.
            if tr.getparent().tag == 'thead' or row.is_header:
                table.headers.append(cells)
            else:
                table.rows.append(row)

    return table
|
||||
|
||||
def _parse_dimension(self, value: Optional[str]) -> Optional[int]:
|
||||
"""Parse dimension value (width/height)."""
|
||||
if not value:
|
||||
return None
|
||||
|
||||
# Remove 'px' suffix if present
|
||||
value = value.strip().rstrip('px')
|
||||
|
||||
try:
|
||||
return int(value)
|
||||
except ValueError:
|
||||
return None
|
||||
|
||||
def _enter_xbrl_context(self, element: HtmlElement):
|
||||
"""Enter XBRL context."""
|
||||
if self.config.extract_xbrl and self.strategies.get('xbrl_extraction'):
|
||||
xbrl_data = self.strategies['xbrl_extraction'].extract_context(element)
|
||||
if xbrl_data:
|
||||
self.xbrl_context_stack.append(xbrl_data)
|
||||
|
||||
def _exit_xbrl_context(self, element: HtmlElement):
|
||||
"""Exit XBRL context."""
|
||||
if self.xbrl_context_stack:
|
||||
self.xbrl_context_stack.pop()
|
||||
|
||||
def _get_current_xbrl_metadata(self) -> Dict[str, Any]:
|
||||
"""Get current XBRL metadata."""
|
||||
if not self.xbrl_context_stack:
|
||||
return {}
|
||||
|
||||
# Merge all contexts in stack
|
||||
metadata = {}
|
||||
for context in self.xbrl_context_stack:
|
||||
metadata.update(context)
|
||||
|
||||
return metadata
|
||||
|
||||
def _merge_adjacent_nodes(self, root: Node):
    """Merge adjacent text nodes with similar styles.

    NOTE: intentionally a no-op placeholder — the tree is currently left
    unmerged. A future implementation would walk `root` recursively and
    coalesce sibling text nodes whose styles match.
    """
    pass
|
||||
@@ -0,0 +1,450 @@
|
||||
"""
|
||||
Multi-strategy header detection for document structure.
|
||||
"""
|
||||
|
||||
import re
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Optional, List, Dict
|
||||
|
||||
from lxml.html import HtmlElement
|
||||
|
||||
from edgar.documents.config import ParserConfig
|
||||
from edgar.documents.types import HeaderInfo, ParseContext
|
||||
|
||||
|
||||
class HeaderDetector(ABC):
    """Abstract base class for header detectors.

    Each detector implements one heuristic for deciding whether an HTML
    element is a section header, returning a HeaderInfo with a confidence
    score, or None when the heuristic does not fire.
    """

    @abstractmethod
    def detect(self, element: HtmlElement, context: ParseContext) -> Optional[HeaderInfo]:
        """Detect if element is a header.

        Returns:
            HeaderInfo with level/confidence when detected, else None.
        """
        pass

    @property
    @abstractmethod
    def name(self) -> str:
        """Detector name, used as the weighting key in HeaderDetectionStrategy."""
        pass
|
||||
|
||||
|
||||
class StyleBasedDetector(HeaderDetector):
    """Detect headers based on CSS styles.

    Accumulates confidence from font size (relative to the document's
    base font), bold weight, centering, upper-casing and vertical margins.
    The increments below are tuned heuristics.
    """

    @property
    def name(self) -> str:
        return "style"

    def detect(self, element: HtmlElement, context: ParseContext) -> Optional[HeaderInfo]:
        """Detect headers based on style attributes."""
        # Get element style
        style = context.get_current_style()

        # Skip if no style info
        if not style:
            return None

        # Get text content
        text = element.text_content().strip()
        if not text or len(text) > 200:  # Skip very long text
            return None

        confidence = 0.0
        level = 3  # Default level

        # Check font size: larger relative size -> higher level + confidence
        if style.font_size and context.base_font_size:
            size_ratio = style.font_size / context.base_font_size

            if size_ratio >= 2.0:
                confidence += 0.8
                level = 1
            elif size_ratio >= 1.5:
                confidence += 0.7
                level = 2
            elif size_ratio >= 1.2:
                confidence += 0.5
                level = 3
            elif size_ratio >= 1.1:
                confidence += 0.3
                level = 4

        # Check font weight
        if style.is_bold:
            confidence += 0.3
            if level == 3:  # Adjust level for bold text
                level = 2

        # Check text alignment
        if style.is_centered:
            confidence += 0.2

        # Check for uppercase (short all-caps runs read as headings)
        if text.isupper() and len(text.split()) <= 10:
            confidence += 0.2

        # Check margins (headers often have larger margins)
        if style.margin_top and style.margin_top > 20:
            confidence += 0.1
        if style.margin_bottom and style.margin_bottom > 10:
            confidence += 0.1

        # Normalize confidence
        confidence = min(confidence, 1.0)

        if confidence > 0.4:  # Threshold for style-based detection
            return HeaderInfo.from_text(text, level, confidence, self.name)

        return None
|
||||
|
||||
|
||||
class PatternBasedDetector(HeaderDetector):
    """Detect headers based on text patterns common in SEC filings."""

    # (regex, heading level, base confidence). All patterns are matched
    # with re.IGNORECASE in detect().
    # NOTE(review): IGNORECASE also applies to the case-sensitive-looking
    # patterns below (e.g. the all-caps '^[A-Z\\s]+$' and the title-case
    # pattern), so those match lower-case text too — confirm whether that
    # is intended; inline (?-i:...) groups would restore case sensitivity.
    HEADER_PATTERNS = [
        # Item patterns
        (r'^(Item|ITEM)\s+(\d+[A-Z]?)[.\s]+(.+)$', 1, 0.95),
        (r'^Part\s+[IVX]+[.\s]*$', 1, 0.9),
        (r'^PART\s+[IVX]+[.\s]*$', 1, 0.9),

        # Section patterns
        (r'^(BUSINESS|RISK FACTORS|PROPERTIES|LEGAL PROCEEDINGS)$', 2, 0.85),
        (r'^(Management\'?s?\s+Discussion|MD&A)', 2, 0.85),
        (r'^(Financial\s+Statements|Consolidated\s+Financial\s+Statements)$', 2, 0.85),

        # Numbered sections
        (r'^\d+\.\s+[A-Z][A-Za-z\s]+$', 3, 0.7),
        (r'^[A-Z]\.\s+[A-Z][A-Za-z\s]+$', 3, 0.7),
        (r'^\([a-z]\)\s+[A-Z][A-Za-z\s]+$', 4, 0.6),

        # Title case headers
        (r'^[A-Z][A-Za-z\s]+[A-Za-z]$', 3, 0.5),

        # All caps headers
        (r'^[A-Z\s]+$', 3, 0.6),
    ]

    @property
    def name(self) -> str:
        return "pattern"

    def detect(self, element: HtmlElement, context: ParseContext) -> Optional[HeaderInfo]:
        """Detect headers based on text patterns.

        Returns a HeaderInfo for the first matching pattern, with the
        base confidence boosted by structural hints, or None.
        """
        text = element.text_content().strip()

        # Skip empty or very long text
        if not text or len(text) > 200:
            return None

        # Skip single punctuation - never headers
        if len(text) == 1 and text in '.,!?;:()[]{}':
            return None

        # Skip if text contains multiple sentences (likely paragraph)
        if text.count('.') > 2:
            return None

        # Check against patterns; first match wins.
        for pattern, level, base_confidence in self.HEADER_PATTERNS:
            if not re.match(pattern, text, re.IGNORECASE):
                continue

            confidence = base_confidence

            # Boost confidence if element is alone in its parent.
            # Bug fix: getparent() returns None for the tree root, so the
            # original unguarded len(element.getparent()) raised TypeError.
            parent = element.getparent()
            if parent is not None and len(parent) == 1:
                confidence += 0.1

            # Boost confidence if followed by substantial text
            next_elem = element.getnext()
            if next_elem is not None and len(next_elem.text_content()) > 100:
                confidence += 0.1

            return HeaderInfo.from_text(text, level, min(confidence, 1.0), self.name)

        return None
|
||||
|
||||
|
||||
class StructuralDetector(HeaderDetector):
    """Detect headers based on DOM structure.

    Native h1–h6 tags are accepted immediately with full confidence;
    otherwise confidence accumulates from parent containers, bold tags,
    centering, what follows the element, and text length. The increments
    are tuned heuristics.
    """

    @property
    def name(self) -> str:
        return "structural"

    def detect(self, element: HtmlElement, context: ParseContext) -> Optional[HeaderInfo]:
        """Detect headers based on structural cues."""
        text = element.text_content().strip()

        # Skip empty or very long text
        if not text or len(text) > 200:
            return None

        # Skip single punctuation - never headers
        if len(text) == 1 and text in '.,!?;:()[]{}':
            return None

        confidence = 0.0
        level = 3

        # Native heading tags are definitive: level comes from the digit.
        tag = element.tag.lower()
        if tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
            confidence = 1.0
            level = int(tag[1])
            return HeaderInfo.from_text(text, level, confidence, self.name)

        # Check parent structure
        parent = element.getparent()
        if parent is not None:
            parent_tag = parent.tag.lower()

            # Check if in header-like container
            if parent_tag in ['header', 'thead', 'caption']:
                confidence += 0.6
                level = 2

            # Check if parent has few children (isolated element)
            if len(parent) <= 3:
                confidence += 0.3

            # Check if parent is centered (legacy align attribute)
            parent_align = parent.get('align')
            if parent_align == 'center':
                confidence += 0.2

        # Check element properties
        if tag in ['strong', 'b']:
            confidence += 0.3

        if element.get('align') == 'center':
            confidence += 0.2

        # Check if followed by block content (headers introduce blocks)
        next_elem = element.getnext()
        if next_elem is not None:
            next_tag = next_elem.tag.lower()
            if next_tag in ['p', 'div', 'table', 'ul', 'ol']:
                confidence += 0.2

        # Check text characteristics
        words = text.split()
        if 1 <= len(words) <= 10:  # Short text
            confidence += 0.1

        # Normalize confidence
        confidence = min(confidence, 1.0)

        if confidence > 0.5:
            return HeaderInfo.from_text(text, level, confidence, self.name)

        return None
|
||||
|
||||
|
||||
class ContextualDetector(HeaderDetector):
    """Detect headers based on surrounding context.

    Confidence accumulates from header-like text shape, a header-like
    preceding sibling, a substantially longer following sibling, indented
    following content, and early document position. The increments are
    tuned heuristics.
    """

    @property
    def name(self) -> str:
        return "contextual"

    def detect(self, element: HtmlElement, context: ParseContext) -> Optional[HeaderInfo]:
        """Detect headers based on contextual clues."""
        text = element.text_content().strip()

        # Skip empty or very long text
        if not text or len(text) > 200:
            return None

        # Skip single punctuation - never headers
        if len(text) == 1 and text in '.,!?;:()[]{}':
            return None

        confidence = 0.0
        level = 3

        # Check if text itself looks like a header
        if self._looks_like_header(text):
            confidence += 0.4

        # Check relationship to previous content
        prev_elem = element.getprevious()
        if prev_elem is not None:
            prev_text = prev_elem.text_content().strip()

            # Check if previous was also a header (section hierarchy)
            if prev_text and self._looks_like_header(prev_text):
                confidence += 0.3
                # Adjust level: longer text after a header reads as a
                # higher-level sibling; shorter as a sub-heading.
                if len(text) > len(prev_text):
                    level = 2
                else:
                    level = 3

        # Check relationship to next content
        next_elem = element.getnext()
        if next_elem is not None:
            next_text = next_elem.text_content().strip()

            # Headers are often followed by much longer content
            if len(next_text) > len(text) * 3:
                confidence += 0.3

            # Check if next element is indented or styled differently
            next_style = next_elem.get('style', '')
            if 'margin-left' in next_style or 'padding-left' in next_style:
                confidence += 0.2

        # Check position in document: early, pre-section elements are
        # more likely to be headers.
        if context.current_section is None and context.depth < 5:
            confidence += 0.2

        # Normalize confidence
        confidence = min(confidence, 1.0)

        if confidence > 0.5:
            return HeaderInfo.from_text(text, level, confidence, self.name)

        return None

    def _looks_like_header(self, text: str) -> bool:
        """Heuristic: short, no sentence-ending punctuation, capitalized.

        NOTE(review): the final branch accepts ANY text starting with a
        capital letter, which makes this quite permissive — confirm intended.
        """
        # Short text only
        if len(text.split()) > 15:
            return False

        # No ending punctuation (colon is allowed)
        if text.rstrip().endswith(('.', '!', '?', ';')):
            return False

        # Title case or all caps
        if text.istitle() or text.isupper():
            return True

        # Starts with capital letter
        if text and text[0].isupper():
            return True

        return False
|
||||
|
||||
|
||||
class HeaderDetectionStrategy:
    """
    Multi-strategy header detection.

    Combines multiple detection methods with weighted voting: each
    detector's result is weighted by a per-method factor, and the final
    level is chosen by a confidence-weighted vote.
    """

    def __init__(self, config: ParserConfig):
        """Initialize with configuration."""
        self.config = config
        self.detectors = self._init_detectors()

    def _init_detectors(self) -> List[HeaderDetector]:
        """Initialize enabled detectors."""
        detectors = []

        # Always include basic detectors
        detectors.extend([
            StyleBasedDetector(),
            PatternBasedDetector(),
            StructuralDetector(),
            ContextualDetector()
        ])

        # Add ML detector if enabled (not yet implemented)
        if self.config.features.get('ml_header_detection'):
            # Would add MLBasedDetector here
            pass

        return detectors

    def detect(self, element: HtmlElement, context: ParseContext) -> Optional[HeaderInfo]:
        """
        Detect if element is a header using multiple strategies.

        Args:
            element: HTML element to check
            context: Current parsing context

        Returns:
            HeaderInfo if element is detected as header, None otherwise
        """
        # Skip if element has no text
        text = element.text_content().strip()
        if not text:
            return None

        # Collect results from all detectors
        results: List[HeaderInfo] = []

        for detector in self.detectors:
            try:
                result = detector.detect(element, context)
                if result:
                    results.append(result)
            except Exception:
                # Deliberate best-effort: one detector failure must not
                # stop the others.
                continue

        if not results:
            return None

        # If only one detector fired, use its result if confident enough
        if len(results) == 1:
            if results[0].confidence >= self.config.header_detection_threshold:
                return results[0]
            return None

        # Multiple detectors - combine results.
        # NOTE(review): the combined result is returned without checking
        # header_detection_threshold, unlike the single-detector path —
        # confirm whether multi-detector agreement is meant to bypass it.
        return self._combine_results(results, text)

    def _combine_results(self, results: List[HeaderInfo], text: str) -> HeaderInfo:
        """Combine multiple detection results via weighted averaging/voting."""
        # Weight different detectors (keys match each detector's .name)
        detector_weights = {
            'style': 0.3,
            'pattern': 0.4,
            'structural': 0.2,
            'contextual': 0.1,
            'ml': 0.5  # Would be highest if available
        }

        # Calculate weighted confidence
        total_confidence = 0.0
        total_weight = 0.0

        # Group by level: each result votes for its level with
        # confidence * weight.
        level_votes: Dict[int, float] = {}

        for result in results:
            # Unknown detection methods get a small default weight.
            weight = detector_weights.get(result.detection_method, 0.1)
            total_confidence += result.confidence * weight
            total_weight += weight

            # Vote for level
            if result.level not in level_votes:
                level_votes[result.level] = 0.0
            level_votes[result.level] += result.confidence * weight

        # Normalize confidence (weighted average)
        final_confidence = total_confidence / total_weight if total_weight > 0 else 0.0

        # Choose most voted level
        final_level = max(level_votes.items(), key=lambda x: x[1])[0]

        # Check if any detector found this is an item; take the first
        # non-empty item number.
        is_item = any(r.is_item for r in results)
        item_number = next((r.item_number for r in results if r.item_number), None)

        return HeaderInfo(
            level=final_level,
            confidence=final_confidence,
            text=text,
            detection_method='combined',
            is_item=is_item,
            item_number=item_number
        )
|
||||
@@ -0,0 +1,344 @@
|
||||
"""
|
||||
CSS style parser for HTML elements.
|
||||
"""
|
||||
|
||||
import re
|
||||
from typing import Dict, Optional, Tuple, Union
|
||||
from edgar.documents.types import Style
|
||||
from edgar.documents.utils import get_cache_manager
|
||||
|
||||
|
||||
class StyleParser:
    """
    Parser for CSS style attributes.

    Handles inline styles and converts them to Style objects. Parsed
    results are cached keyed on the raw style string, since filings
    repeat identical inline styles many times.
    """

    # Common CSS units
    ABSOLUTE_UNITS = {'px', 'pt', 'pc', 'cm', 'mm', 'in'}
    RELATIVE_UNITS = {'em', 'rem', 'ex', 'ch', 'vw', 'vh', '%'}

    # Font weight keyword -> numeric weight mappings
    FONT_WEIGHT_MAP = {
        'normal': '400',
        'bold': '700',
        'bolder': '800',
        'lighter': '300'
    }

    # Side-specific spacing properties; each maps 1:1 onto a Style
    # attribute after replacing '-' with '_' (e.g. margin-top -> margin_top).
    # Replaces eight near-identical elif branches in _apply_property.
    SPACING_PROPS = frozenset({
        'margin-top', 'margin-bottom', 'margin-left', 'margin-right',
        'padding-top', 'padding-bottom', 'padding-left', 'padding-right',
    })

    def __init__(self):
        """Initialize style parser with the shared style cache."""
        self._cache = get_cache_manager().style_cache

    def parse(self, style_string: str) -> Style:
        """
        Parse CSS style string into Style object.

        Args:
            style_string: CSS style string (e.g., "font-size: 14px; color: red")

        Returns:
            Parsed Style object (a fresh default Style for empty input)
        """
        if not style_string:
            return Style()

        # Check cache first — identical inline styles repeat heavily.
        cached_style = self._cache.get(style_string)
        if cached_style is not None:
            return cached_style

        style = Style()
        for prop, value in self._split_declarations(style_string).items():
            self._apply_property(style, prop, value)

        # Cache result
        self._cache.put(style_string, style)
        return style

    def _split_declarations(self, style_string: str) -> Dict[str, str]:
        """Split a style string into a {property: value} dict.

        Declarations without a ':' are ignored; duplicate properties keep
        the last occurrence, matching CSS cascade order.
        """
        declarations = {}

        for part in style_string.split(';'):
            part = part.strip()
            if not part or ':' not in part:
                continue

            prop, value = part.split(':', 1)
            prop = prop.strip().lower()
            value = value.strip()

            if prop and value:
                declarations[prop] = value

        return declarations

    def _apply_property(self, style: Style, prop: str, value: str):
        """Apply a single CSS declaration to the Style object.

        Unknown properties are silently ignored.
        """
        # Font properties
        if prop == 'font-size':
            size = self._parse_length(value)
            if size is not None:
                style.font_size = size

        elif prop == 'font-weight':
            style.font_weight = self._normalize_font_weight(value)

        elif prop == 'font-style':
            if value in ['italic', 'oblique']:
                style.font_style = 'italic'
            elif value == 'normal':
                style.font_style = 'normal'

        # Text properties
        elif prop == 'text-align':
            if value in ['left', 'right', 'center', 'justify']:
                style.text_align = value

        elif prop == 'text-decoration':
            style.text_decoration = value

        # Color properties
        elif prop == 'color':
            style.color = self._normalize_color(value)

        elif prop in ['background-color', 'background']:
            color = self._extract_background_color(value)
            if color:
                style.background_color = color

        # Shorthand margin/padding expand to all four sides
        elif prop == 'margin':
            self._parse_box_property(style, 'margin', value)
        elif prop == 'padding':
            self._parse_box_property(style, 'padding', value)

        # Side-specific spacing properties map directly onto Style attributes
        elif prop in self.SPACING_PROPS:
            length = self._parse_length(value)
            if length is not None:
                setattr(style, prop.replace('-', '_'), length)

        # Display properties
        elif prop == 'display':
            style.display = value

        # Size properties
        elif prop == 'width':
            style.width = self._parse_dimension(value)
        elif prop == 'height':
            style.height = self._parse_dimension(value)

        # Line height
        elif prop == 'line-height':
            line_height = self._parse_line_height(value)
            if line_height is not None:
                style.line_height = line_height

    def _parse_length(self, value: str) -> Optional[float]:
        """Parse a CSS length value to pixels.

        Returns None for values that cannot be resolved without layout
        context (percentages, viewport units, keywords).
        """
        value = value.strip().lower()

        # Handle special values
        if value in ['0', 'auto', 'inherit', 'initial']:
            return 0.0 if value == '0' else None

        # Extract number and unit
        match = re.match(r'^(-?\d*\.?\d+)\s*([a-z%]*)$', value)
        if not match:
            return None

        num_str, unit = match.groups()
        try:
            num = float(num_str)
        except ValueError:
            return None

        # Convert to CSS pixels (96px per inch reference)
        if not unit or unit == 'px':
            return num
        elif unit == 'pt':
            return num * 1.333  # 1pt = 1/72in ≈ 1.333px
        elif unit == 'pc':
            # Bug fix: 'pc' is listed in ABSOLUTE_UNITS but was previously
            # never converted here. 1pc = 12pt = 16px.
            return num * 16
        elif unit == 'em':
            return num * 16  # Assume 16px base font
        elif unit == 'rem':
            return num * 16  # Assume 16px root font
        elif unit == '%':
            return None  # Can't convert percentage without context
        elif unit == 'in':
            return num * 96  # 1in = 96px
        elif unit == 'cm':
            return num * 37.8  # 1cm = 37.8px
        elif unit == 'mm':
            return num * 3.78  # 1mm = 3.78px

        # Remaining units (ex, ch, vw, vh) need context we don't have.
        return None

    def _parse_dimension(self, value: str) -> Optional[Union[float, str]]:
        """Parse dimension value (width/height); percentages stay strings."""
        value = value.strip()

        if value.endswith('%'):
            return value  # Return as string

        return self._parse_length(value)

    def _parse_line_height(self, value: str) -> Optional[float]:
        """Parse line-height: a unitless multiplier or a CSS length."""
        value = value.strip()

        # Unitless number (multiplier)
        try:
            return float(value)
        except ValueError:
            pass

        # Try as length
        return self._parse_length(value)

    def _normalize_font_weight(self, value: str) -> str:
        """Normalize font weight to a numeric string where possible."""
        value = value.strip().lower()

        # Map keywords to numeric values
        if value in self.FONT_WEIGHT_MAP:
            return self.FONT_WEIGHT_MAP[value]

        # Already numeric and in the CSS 100–900 range
        if value.isdigit() and 100 <= int(value) <= 900:
            return value

        # Unknown values pass through unchanged.
        return value

    def _normalize_color(self, value: str) -> str:
        """Normalize a color value (expands #abc to #aabbcc)."""
        value = value.strip().lower()

        # Handle rgb/rgba as-is
        if value.startswith(('rgb(', 'rgba(')):
            return value

        # Handle hex colors
        if value.startswith('#'):
            # Expand 3-char hex to 6-char
            if len(value) == 4:
                return '#' + ''.join(c * 2 for c in value[1:])
            return value

        # Return named colors as-is
        return value

    def _extract_background_color(self, value: str) -> Optional[str]:
        """Extract a color token from a background/background-color value.

        Prefers explicit hex/rgb() colors anywhere in the shorthand (the
        previous version returned the first unit-free token, so
        'no-repeat #fff' yielded 'no-repeat'); falls back to the first
        token without a length unit, assumed to be a named color.
        NOTE: non-color keywords can still be misidentified in the fallback.
        """
        parts = value.split()

        # Pass 1: explicit colors win, wherever they appear.
        for part in parts:
            if part.startswith('#') or part.startswith('rgb'):
                return self._normalize_color(part)

        # Pass 2: first unit-free token as a presumed named color.
        for part in parts:
            if not any(unit in part for unit in self.ABSOLUTE_UNITS | self.RELATIVE_UNITS):
                return part

        return None

    def _parse_box_property(self, style: Style, prop_type: str, value: str):
        """Expand a margin/padding shorthand per the CSS box model.

        Supports 1 (all sides), 2 (vertical, horizontal), 3 (top,
        horizontal, bottom) and 4+ (top, right, bottom, left) values.
        """
        lengths = [
            length
            for length in (self._parse_length(part) for part in value.split())
            if length is not None
        ]
        if not lengths:
            return

        count = len(lengths)
        if count == 1:
            top = right = bottom = left = lengths[0]
        elif count == 2:
            top = bottom = lengths[0]
            left = right = lengths[1]
        elif count == 3:
            top, horizontal, bottom = lengths
            left = right = horizontal
        else:
            top, right, bottom, left = lengths[:4]

        setattr(style, f'{prop_type}_top', top)
        setattr(style, f'{prop_type}_right', right)
        setattr(style, f'{prop_type}_bottom', bottom)
        setattr(style, f'{prop_type}_left', left)

    def merge_styles(self, base: Style, override: Style) -> Style:
        """
        Merge two styles with override taking precedence.

        Args:
            base: Base style
            override: Override style

        Returns:
            Merged style
        """
        return base.merge(override)
|
||||
@@ -0,0 +1,637 @@
|
||||
"""
|
||||
Advanced table processing strategy.
|
||||
"""
|
||||
|
||||
import re
|
||||
from functools import lru_cache
|
||||
from typing import List, Optional
|
||||
|
||||
from lxml.html import HtmlElement
|
||||
|
||||
from edgar.documents.config import ParserConfig
|
||||
from edgar.documents.strategies.style_parser import StyleParser
|
||||
from edgar.documents.table_nodes import TableNode, Cell, Row
|
||||
from edgar.documents.types import TableType
|
||||
|
||||
|
||||
class TableProcessor:
|
||||
"""
|
||||
Advanced table processing with type detection and structure analysis.
|
||||
"""
|
||||
|
||||
# HTML entities that need replacement
|
||||
ENTITY_REPLACEMENTS = {
|
||||
'―': '-----',
|
||||
'—': '-----',
|
||||
'–': '---',
|
||||
'−': '-',
|
||||
'‐': '-',
|
||||
'‐': '-',
|
||||
' ': ' ',
|
||||
'&': '&',
|
||||
'<': '<',
|
||||
'>': '>',
|
||||
'"': '"',
|
||||
''': "'",
|
||||
' ': ' ',
|
||||
'​': '',
|
||||
'—': '-----',
|
||||
'–': '---',
|
||||
'−': '-',
|
||||
}
|
||||
|
||||
# Financial keywords for table type detection
|
||||
FINANCIAL_KEYWORDS = {
|
||||
'revenue', 'income', 'expense', 'asset', 'liability',
|
||||
'cash', 'equity', 'profit', 'loss', 'margin',
|
||||
'earnings', 'cost', 'sales', 'operating', 'net',
|
||||
'gross', 'total', 'balance', 'statement', 'consolidated',
|
||||
'provision', 'tax', 'taxes', 'compensation', 'stock',
|
||||
'share', 'shares', 'rsu', 'option', 'grant', 'vest'
|
||||
}
|
||||
|
||||
# Metrics keywords
|
||||
METRICS_KEYWORDS = {
|
||||
'ratio', 'percentage', 'percent', '%', 'rate',
|
||||
'growth', 'change', 'increase', 'decrease',
|
||||
'average', 'median', 'total', 'count', 'number'
|
||||
}
|
||||
|
||||
def __init__(self, config: ParserConfig):
    """Initialize table processor.

    Args:
        config: Parser configuration; controls table type detection and
            relationship extraction in `process`.
    """
    self.config = config
    # Reused for parsing the table's inline style attribute.
    self.style_parser = StyleParser()
|
||||
|
||||
def process(self, element: HtmlElement) -> TableNode:
    """
    Process table element into TableNode.

    Extracts id/class metadata, caption and summary, builds the row
    structure, then optionally classifies the table type and extracts
    relationships depending on configuration.

    Args:
        element: HTML table element

    Returns:
        Processed TableNode
    """
    # Extract table metadata from attributes
    table_id = element.get('id')
    table_class = element.get('class', '').split()
    table_style = self.style_parser.parse(element.get('style', ''))

    # Create table node
    table = TableNode(style=table_style)

    # Set config for rendering decisions
    table._config = self.config

    # Add metadata (only when present, to keep metadata sparse)
    if table_id:
        table.set_metadata('id', table_id)
    if table_class:
        table.set_metadata('classes', table_class)

    # Extract caption
    caption_elem = element.find('.//caption')
    if caption_elem is not None:
        table.caption = self._extract_text(caption_elem)

    # Extract summary (accessibility attribute)
    summary = element.get('summary')
    if summary:
        table.summary = summary

    # Process table structure (thead/tbody rows) into headers/rows
    self._process_table_structure(element, table)

    # Detect table type if configured
    if self.config.detect_table_types:
        table.table_type = self._detect_table_type(table)

    # Extract relationships if configured
    if self.config.extract_table_relationships:
        self._extract_relationships(table)

    return table
|
||||
|
||||
def _process_table_structure(self, element: HtmlElement, table: TableNode):
    """
    Process table structure (thead, tbody, tfoot).

    Rows inside <thead> always become header rows. For body rows, header
    detection heuristics keep running until a row that looks like real
    data appears ("data_rows_started"), so multi-row financial headers
    (e.g. "Year Ended" + a year row + "(in millions)") are captured as
    headers rather than data.
    """
    # Process thead — these rows are unconditionally headers.
    thead = element.find('.//thead')
    if thead is not None:
        for tr in thead.findall('.//tr'):
            cells = self._process_row(tr, is_header=True)
            if cells:
                table.headers.append(cells)

    # Process tbody (or direct rows when there is no <tbody>)
    tbody = element.find('.//tbody')
    rows_container = tbody if tbody is not None else element

    # Track if we've seen headers and data rows
    headers_found = bool(table.headers)
    # NOTE(review): consecutive_header_rows is written but never read —
    # looks like leftover bookkeeping; confirm before removing.
    consecutive_header_rows = 0
    data_rows_started = False

    for tr in rows_container.findall('.//tr'):
        # Skip if already processed in thead
        if thead is not None and tr.getparent() == thead:
            continue

        # Check if this might be a header row
        is_header_row = False

        # Continue checking for headers if:
        # 1. We haven't found any headers yet, OR
        # 2. We've found headers but haven't seen data rows yet (multi-row headers)
        if not data_rows_started:
            is_header_row = self._is_header_row(tr)

            # Additional check for multi-row headers in financial tables.
            # If a header has already been found and this row carries
            # units or years, it is likely a continuation of the header.
            if headers_found and not is_header_row:
                row_text = tr.text_content().strip()
                # Check for units like "(in millions)" or "(in thousands)"
                if '(in millions)' in row_text or '(in thousands)' in row_text or '(in billions)' in row_text:
                    is_header_row = True
                # Check for year rows that follow "Year Ended" headers
                elif len(table.headers) > 0:
                    last_header_text = ' '.join(cell.text() for cell in table.headers[-1])
                    if 'year ended' in last_header_text.lower() or 'years ended' in last_header_text.lower():
                        # Check if this row has years
                        year_pattern = r'\b(19\d{2}|20\d{2})\b'
                        years_found = re.findall(year_pattern, row_text)
                        if years_found:
                            is_header_row = True

        cells = self._process_row(tr, is_header=is_header_row)
        if cells:
            if is_header_row:
                table.headers.append(cells)
                headers_found = True
                consecutive_header_rows += 1
            else:
                # Only mark data_rows_started if this row has actual content:
                # empty rows at the beginning shouldn't stop header detection.
                row = Row(cells=cells, is_header=False)
                table.rows.append(row)

                # Check if row has significant content that indicates data rows
                # have started. Be smart about it — descriptive rows like
                # "(in millions)" or pure spacing shouldn't stop header detection.
                has_content = any(cell.text().strip() for cell in cells)
                if has_content:
                    # Get the row text for smarter analysis
                    row_text = ' '.join(cell.text().strip() for cell in cells).strip()
                    row_text_lower = row_text.lower()

                    # Don't consider this as "data started" if it's likely a
                    # header-related row.
                    is_header_related = (
                        # Unit descriptions
                        '(in millions)' in row_text_lower or
                        '(in thousands)' in row_text_lower or
                        '(in billions)' in row_text_lower or
                        'except per share' in row_text_lower or
                        # Financial period descriptions
                        'year ended' in row_text_lower or
                        'months ended' in row_text_lower or
                        # Mostly just spacing/formatting
                        len(row_text.strip()) < 5 or
                        # Contains years (might be misclassified header)
                        bool(re.search(r'\b(19\d{2}|20\d{2})\b', row_text))
                    )

                    # Only mark data_rows_started if this seems like actual
                    # data, not header-related.
                    if not is_header_related:
                        data_rows_started = True

                consecutive_header_rows = 0

    # Process tfoot — rows are appended to the footer, never headers.
    tfoot = element.find('.//tfoot')
    if tfoot is not None:
        for tr in tfoot.findall('.//tr'):
            cells = self._process_row(tr, is_header=False)
            if cells:
                row = Row(cells=cells, is_header=False)
                table.footer.append(row)
||||
def _process_row(self, tr: HtmlElement, is_header: bool) -> List[Cell]:
    """
    Process table row into cells.

    Args:
        tr: HTML <tr> element.
        is_header: True if the row was classified as a header row. A cell
            is also marked as a header cell when its tag is <th>.

    Returns:
        List of processed cells in document order.
    """
    cells = []

    # Use an XPath union so <td> and <th> cells come back interleaved in
    # document order. The previous findall('.//td') + findall('.//th')
    # concatenation appended every <th> after every <td>, scrambling
    # column order for rows that mix a <th> label with <td> data cells
    # (very common in SEC filing tables).
    # NOTE(review): './/' also matches cells of nested tables — kept for
    # backward compatibility; confirm whether './td | ./th' is intended.
    for cell_elem in tr.xpath('.//td | .//th'):
        cell = self._process_cell(cell_elem, is_header or cell_elem.tag == 'th')
        if cell:
            cells.append(cell)

    return cells
||||
def _process_cell(self, elem: HtmlElement, is_header: bool) -> Optional[Cell]:
    """
    Process table cell into a Cell.

    Args:
        elem: HTML <td>/<th> element.
        is_header: Whether the cell should be treated as a header cell.

    Returns:
        Cell carrying content, col/row spans and alignment.
    """
    # Extract span attributes defensively: real-world filings contain
    # malformed values (colspan="", "100%", "2*"). Per the HTML parsing
    # rules such values fall back to 1 rather than raising — the old
    # bare int() call crashed the whole table on the first bad cell.
    colspan = self._parse_span(elem.get('colspan', '1'))
    rowspan = self._parse_span(elem.get('rowspan', '1'))
    align = elem.get('align')

    # Inline CSS text-align takes precedence over the legacy align attribute.
    style = self.style_parser.parse(elem.get('style', ''))
    if style.text_align:
        align = style.text_align

    # Extract content (handles nested divs and <br> line breaks).
    content = self._extract_cell_content(elem)

    # Create cell
    cell = Cell(
        content=content,
        colspan=colspan,
        rowspan=rowspan,
        is_header=is_header,
        align=align
    )

    return cell

@staticmethod
def _parse_span(value: str) -> int:
    """Parse a colspan/rowspan attribute, falling back to 1 on invalid input."""
    try:
        span = int(value)
    except (TypeError, ValueError):
        return 1
    # Zero/negative spans are invalid in HTML; clamp to the minimum of 1.
    return span if span >= 1 else 1
||||
def _extract_cell_content(self, elem: HtmlElement) -> str:
    """
    Extract and clean cell content.

    Cells containing multiple <div> children are treated as multi-line
    content (one output line per div). Otherwise <br> tags are converted
    to line breaks before text extraction.

    NOTE: mutates the lxml tree in place — a newline is prepended to each
    <br> tail so breaks survive text extraction.
    """
    # Check for nested structure
    divs = elem.findall('.//div')
    if divs and len(divs) > 1:
        # Multiple divs - likely multi-line content
        lines = []
        for div in divs:
            text = self._extract_text(div)
            if text:
                lines.append(text)
        return '\n'.join(lines)

    # Handle line breaks (in-place tail mutation, see docstring)
    for br in elem.findall('.//br'):
        br.tail = '\n' + (br.tail or '')

    # Extract text
    text = self._extract_text(elem)

    return text
||||
def _extract_text(self, elem: HtmlElement) -> str:
    """
    Extract and clean text from an element.

    Collects text via itertext() (which keeps inter-element spacing
    better than text_content()), inserts a single space between adjacent
    fragments that would otherwise run together (unless the next
    fragment opens with closing punctuation), applies the class's entity
    replacements, and finally normalizes whitespace per line while
    preserving line breaks.
    """
    fragments = [piece for piece in elem.itertext() if piece]
    if not fragments:
        return ''

    # Stitch fragments back together, deciding for each boundary whether
    # a separating space is required.
    assembled = [fragments[0]]
    for prev, cur in zip(fragments, fragments[1:]):
        boundary_needs_space = (
            prev and cur
            and not prev[-1].isspace()
            and not cur[0].isspace()
            and cur[0] not in ',.;:!?%)]'
        )
        if boundary_needs_space:
            assembled.append(' ')
        assembled.append(cur)

    text = ''.join(assembled)

    # Swap known HTML entities for their plain-text equivalents.
    for entity, replacement in self.ENTITY_REPLACEMENTS.items():
        text = text.replace(entity, replacement)

    text = text.strip()

    # Collapse runs of internal spaces on each line; keep the line breaks.
    return '\n'.join(' '.join(line.split()) for line in text.split('\n'))
||||
@staticmethod
@lru_cache(maxsize=1)
def _get_period_header_pattern():
    """
    Compile the comprehensive regex for financial period headers
    (adapted from the old parser's proven patterns). Cached, so the
    pattern is built at most once per process.

    Returns:
        Compiled case-insensitive regex matching financial period
        headers such as "Three Months Ended March 31, 2024" or
        "As of December 31, 2023".
    """
    # Phrase building blocks.
    period_count = r'(?:three|six|nine|twelve|[1-4]|first|second|third|fourth)'
    span_unit = r'(?:month|quarter|year|week)'
    ended = r'(?:ended|ending|end|period)'
    as_of = r'(?:as\s+of|at|as\s+at)'

    # Date building blocks: full and abbreviated month names, a 1-2
    # digit day, and a 4-digit 19xx/20xx year, e.g. "Mar. 31, 2024".
    month_name = (r'(?:january|february|march|april|may|june|july|august|'
                  r'september|october|november|december|jan|feb|mar|apr|may|'
                  r'jun|jul|aug|sep|oct|nov|dec)')
    day_num = r'\d{1,2}'
    year_num = r'(?:19|20)\d{2}'
    date = f'{month_name}\\s*\\.?\\s*{day_num}\\s*,?\\s*{year_num}'

    # Alternatives, most specific first.
    alternatives = (
        # Standard period headers
        f'{period_count}\\s+{span_unit}\\s+{ended}(?:\\s+{date})?',
        f'(?:fiscal\\s+)?{span_unit}\\s+{ended}',
        f'{span_unit}\\s+{ended}(?:\\s+{date})?',
        # Balance sheet date headers
        f'{as_of}\\s+{date}',
        # Multiple date sequences
        f'{date}(?:\\s*(?:and|,)\\s*{date})*',
        # Single dates
        f'(?:{ended}\\s+)?{date}',
    )

    combined = '|'.join(f'(?:{alt})' for alt in alternatives)
    return re.compile(combined, re.IGNORECASE)
||||
def _is_header_row(self, tr: HtmlElement) -> bool:
    """
    Detect if row is likely a header row in SEC filings.

    Runs a cascade of heuristics, roughly from most to least reliable:
    presence of <th>, multi-year patterns, period phrases, unit
    notations, keyword checks, all-bold formatting, and finally a
    text-vs-number cell ratio. Several checks deliberately bail out
    early (return False) when strong data indicators — currency values,
    thousands-separated numbers, date ranges — are present.
    """
    # Check if contains th elements (most reliable indicator)
    if tr.find('.//th') is not None:
        return True

    cells = tr.findall('.//td')
    if not cells:
        return False

    # Get row text for analysis
    row_text = tr.text_content()
    row_text_lower = row_text.lower()

    # Check for date ranges with financial data (Oracle Table 6 pattern).
    # Date ranges like "March 1, 2024—March 31, 2024" should be data rows,
    # not headers. The character class covers em dash, en dash and hyphen.
    date_range_pattern = r'(?:january|february|march|april|may|june|july|august|september|october|november|december)\s+\d{1,2},\s*\d{4}\s*[—–-]\s*(?:january|february|march|april|may|june|july|august|september|october|november|december)\s+\d{1,2},\s*\d{4}'
    has_date_range = bool(re.search(date_range_pattern, row_text_lower))

    # Check for financial data indicators
    has_currency = bool(re.search(r'\$[\s]*[\d,\.]+', row_text))
    has_decimals = bool(re.search(r'\b\d+\.\d+\b', row_text))
    has_large_numbers = bool(re.search(r'\b\d{1,3}(,\d{3})+\b', row_text))

    # If row has date range + financial data, it's definitely a data row
    if has_date_range and (has_currency or has_decimals or has_large_numbers):
        return False

    # Check for year patterns (very common in financial headers)
    year_pattern = r'\b(19\d{2}|20\d{2})\b'
    years_found = re.findall(year_pattern, row_text)
    if len(years_found) >= 2:  # Multiple years suggest header row
        # IMPORTANT: date ranges like "March 1, 2024—March 31, 2024"
        # contain the same year twice but are data rows, not multi-year
        # comparison headers.
        if len(set(years_found)) == 1:
            # Same year repeated - likely a date range, not a header
            pass  # Don't return True
        # Multiple different years suggest multi-year comparison header
        elif 'total' not in row_text_lower[:20]:  # Check first 20 chars
            return True

    # Enhanced year detection - check individual cells for year patterns.
    # This handles cases where years are in separate cells.
    year_cells = 0
    date_phrases = 0
    for cell in cells:
        cell_text = cell.text_content().strip()
        if cell_text:
            # Check for cells holding a bare 4-digit year
            if re.match(r'^\s*(19\d{2}|20\d{2})\s*$', cell_text):
                year_cells += 1
            # Check for date phrases like "June 30, 2025"
            elif 'june 30' in cell_text.lower() or 'december 31' in cell_text.lower():
                date_phrases += 1

    # If we have multiple year cells or year + date phrases, likely a header
    if year_cells >= 2 or (year_cells >= 1 and date_phrases >= 1):
        if 'total' not in row_text_lower[:20]:
            return True

    # Check for comprehensive financial period patterns (from old parser)
    period_pattern = self._get_period_header_pattern()
    if period_pattern.search(row_text_lower):
        # Additional validation: ensure it's not a data row with period
        # text — require absence of strong data indicators.
        data_pattern = r'(?:\$\s*\d|\d+(?:,\d{3})+|\d+\s*[+\-*/]\s*\d+|\(\s*\d+(?:,\d{3})*\s*\))'
        if not re.search(data_pattern, row_text):
            return True

    # Check for units notation (in millions, thousands, billions)
    units_pattern = r'\(in\s+(?:millions|thousands|billions)\)'
    if re.search(units_pattern, row_text_lower):
        return True

    # Check for period indicators (quarters, months).
    # But be careful with "fiscal" - it could be data like "Fiscal 2025".
    period_keywords = ['quarter', 'q1', 'q2', 'q3', 'q4', 'month',
                       'january', 'february', 'march', 'april', 'may', 'june',
                       'july', 'august', 'september', 'october', 'november', 'december',
                       'ended', 'three months', 'six months', 'nine months']

    # Special handling for "fiscal" - only treat as header if it's part
    # of a phrase like "fiscal year ended"
    if 'fiscal' in row_text_lower:
        # Check if row has numeric values (suggests it's data, not header).
        # Look for patterns like "Fiscal 2025 $10,612".
        has_currency_values = bool(re.search(r'\$[\s]*[\d,]+', row_text))
        has_large_numbers = bool(re.search(r'\b\d{1,3}(,\d{3})+\b', row_text))

        # If it has currency or large numbers, it's likely data
        if has_currency_values or has_large_numbers:
            return False

        # Check if it's just "Fiscal YYYY" which is likely data, not a header
        fiscal_year_only = re.match(r'^\s*fiscal\s+\d{4}\s*$', row_text_lower.strip())
        if fiscal_year_only:
            return False  # This is data, not a header

        # Check for header-like phrases with fiscal
        if 'fiscal year' in row_text_lower and ('ended' in row_text_lower or 'ending' in row_text_lower):
            return True

    if any(keyword in row_text_lower for keyword in period_keywords):
        # Validate it's not a data row with period keywords —
        # check for strong data indicators.
        data_pattern = r'(?:\$\s*\d|\d+(?:,\d{3})+|\d+\.\d+|[(]\s*\d+(?:,\d{3})*\s*[)])'
        if not re.search(data_pattern, row_text):
            return True

    # Check for column descriptors (but NOT total).
    # These are words commonly found in headers but not data rows.
    header_keywords = ['description', 'item', 'category', 'type', 'classification',
                       'change', 'percent', 'increase', 'decrease', 'variance']
    if any(keyword in row_text_lower for keyword in header_keywords):
        # Make sure it's not a total row
        if 'total' not in row_text_lower[:30]:
            # Additional validation: long narrative text is not a header.
            # Headers are typically concise (< 150 chars).
            if len(row_text) > 150:
                return False
            # Check for data indicators (would indicate data row, not header)
            data_pattern = r'(?:\$\s*\d|\d+(?:,\d{3})+|\d+\.\d+|[(]\s*\d+(?:,\d{3})*\s*[)])'
            if re.search(data_pattern, row_text):
                return False
            return True

    # Check if all cells are bold (common header formatting)
    bold_count = 0
    for cell in cells:
        style = cell.get('style', '')
        if 'font-weight' in style and 'bold' in style:
            bold_count += 1
        elif cell.find('.//b') is not None or cell.find('.//strong') is not None:
            bold_count += 1

    # Only consider it a header if ALL cells are bold (not just some)
    if bold_count == len(cells) and bold_count > 0:
        return True

    # Check content type ratio - headers usually have more text than numbers.
    # Count cells with primarily text vs primarily numbers.
    text_cells = 0
    number_cells = 0
    for cell in cells:
        cell_text = cell.text_content().strip()
        if cell_text:
            # Remove common symbols for analysis
            clean_text = cell_text.replace('$', '').replace('%', '').replace(',', '').replace('(', '').replace(')', '')
            if clean_text.replace('.', '').replace('-', '').strip().isdigit():
                number_cells += 1
            else:
                text_cells += 1

    # Be very careful about treating text-heavy rows as headers: many data
    # rows start with text labels (e.g. "Impact of...", "Effect of...").
    # Only consider it a header if it has mostly text AND doesn't look
    # like a data label.
    if text_cells > number_cells * 2 and text_cells >= 3:
        # Check for common data row patterns
        data_row_indicators = [
            'impact of', 'effect of', 'adjustment', 'provision for', 'benefit',
            'expense', 'income from', 'loss on', 'gain on', 'charge', 'credit',
            'earnings', 'computed', 'state taxes', 'research', 'excess tax'
        ]

        # If it starts with any of these, it's likely a data row, not a header
        for indicator in data_row_indicators:
            if row_text_lower.startswith(indicator) or indicator in row_text_lower[:50]:
                return False

        # Also not a header if it starts with "total"
        if not row_text_lower.startswith('total'):
            return True

    return False
||||
def _detect_table_type(self, table: TableNode) -> TableType:
    """
    Detect the type of table based on content.

    Classifies using the caption, every header cell, and the cells of
    the first three data rows, falling through from the most specific
    category (financial) to the least (general).
    """
    # Gather the sample text we classify on.
    samples = []
    if table.caption:
        samples.append(table.caption.lower())
    for header_row in table.headers:
        samples.extend(cell.text().lower() for cell in header_row)
    for row in table.rows[:3]:
        samples.extend(cell.text().lower() for cell in row.cells)
    combined_text = ' '.join(samples)

    # Financial tables: two or more financial keywords present
    # (threshold deliberately low for better recall).
    if sum(keyword in combined_text for keyword in self.FINANCIAL_KEYWORDS) >= 2:
        return TableType.FINANCIAL

    # Metrics tables: at least one metrics keyword, or a high share of
    # numeric cells.
    metrics_hits = sum(keyword in combined_text for keyword in self.METRICS_KEYWORDS)
    numeric_cells = sum(1 for row in table.rows for cell in row.cells if cell.is_numeric)
    total_cells = sum(len(row.cells) for row in table.rows)
    if total_cells > 0:
        if metrics_hits >= 1 or (numeric_cells / total_cells) > 0.3:
            return TableType.METRICS

    # Table of contents: "content"/"index" wording plus cells that look
    # like page numbers (1-3 digit values).
    if 'content' in combined_text or 'index' in combined_text:
        page_number = re.compile(r'\b\d{1,3}\b')
        if any(page_number.search(cell.text())
               for row in table.rows
               for cell in row.cells):
            return TableType.TABLE_OF_CONTENTS

    # Exhibit index tables.
    if 'exhibit' in combined_text:
        return TableType.EXHIBIT_INDEX

    # Reference material (citations, definitions, glossaries).
    if any(word in combined_text for word in ('reference', 'definition', 'glossary', 'citation')):
        return TableType.REFERENCE

    return TableType.GENERAL
||||
def _extract_relationships(self, table: TableNode):
    """
    Extract relationships within table data.

    Currently records total-row positions and first-column indentation
    (a proxy for parent/child hierarchy) as table metadata. Richer
    extraction — totals that sum other rows, cross-references between
    cells, time-series links — is future work.
    """
    # Flag that relationship processing ran on this table.
    table.set_metadata('relationships_extracted', True)

    # Indices of rows flagged as totals.
    total_row_indices = []
    for idx, row in enumerate(table.rows):
        if row.is_total_row:
            total_row_indices.append(idx)

    if total_row_indices:
        table.set_metadata('total_rows', total_row_indices)

    # Leading whitespace of each row's first cell hints at hierarchy
    # (indented children under a parent line item).
    indent_levels = []
    for row in table.rows:
        if row.cells:
            label = row.cells[0].text()
            indent_levels.append(len(label) - len(label.lstrip()))

    if any(level > 0 for level in indent_levels):
        table.set_metadata('has_hierarchy', True)
        table.set_metadata('indentation_levels', indent_levels)
||||
@@ -0,0 +1,345 @@
|
||||
"""
|
||||
XBRL extraction strategy for inline XBRL documents.
|
||||
"""
|
||||
|
||||
from typing import Dict, Any, Optional
|
||||
|
||||
from lxml.html import HtmlElement
|
||||
|
||||
from edgar.documents.types import XBRLFact
|
||||
|
||||
|
||||
class XBRLExtractor:
|
||||
"""
|
||||
Extracts XBRL facts from inline XBRL (iXBRL) documents.
|
||||
|
||||
Handles:
|
||||
- ix:nonFraction, ix:nonNumeric facts
|
||||
- Context and unit resolution
|
||||
- Continuation handling
|
||||
- Transformation rules
|
||||
"""
|
||||
|
||||
# XBRL namespaces
|
||||
NAMESPACES = {
|
||||
'ix': 'http://www.xbrl.org/2013/inlineXBRL',
|
||||
'xbrli': 'http://www.xbrl.org/2003/instance',
|
||||
'xbrldi': 'http://xbrl.org/2006/xbrldi',
|
||||
'xsi': 'http://www.w3.org/2001/XMLSchema-instance'
|
||||
}
|
||||
|
||||
# Common transformation formats
|
||||
TRANSFORMATIONS = {
|
||||
'ixt:numdotdecimal': lambda x: x.replace(',', ''),
|
||||
'ixt:numcommadecimal': lambda x: x.replace('.', '_').replace(',', '.').replace('_', ','),
|
||||
'ixt:zerodash': lambda x: '0' if x == '-' else x,
|
||||
'ixt:datedoteu': lambda x: x.replace('.', '-'),
|
||||
'ixt:datedotus': lambda x: x.replace('.', '/'),
|
||||
}
|
||||
|
||||
def __init__(self):
|
||||
"""Initialize XBRL extractor."""
|
||||
self.contexts: Dict[str, Dict[str, Any]] = {}
|
||||
self.units: Dict[str, str] = {}
|
||||
self.continuations: Dict[str, str] = {}
|
||||
self._initialized = False
|
||||
|
||||
def extract_context(self, element: HtmlElement) -> Optional[Dict[str, Any]]:
|
||||
"""
|
||||
Extract XBRL context from element.
|
||||
|
||||
Args:
|
||||
element: HTML element that might contain XBRL
|
||||
|
||||
Returns:
|
||||
XBRL metadata if found
|
||||
"""
|
||||
# Check if element is an ix: tag
|
||||
if not self._is_xbrl_element(element):
|
||||
return None
|
||||
|
||||
# Initialize context if needed
|
||||
if not self._initialized:
|
||||
self._initialize_context(element)
|
||||
|
||||
# Extract based on element type
|
||||
tag_name = self._get_local_name(element.tag)
|
||||
|
||||
if tag_name == 'nonfraction':
|
||||
return self._extract_nonfraction(element)
|
||||
elif tag_name == 'nonnumeric':
|
||||
return self._extract_nonnumeric(element)
|
||||
elif tag_name == 'continuation':
|
||||
return self._extract_continuation(element)
|
||||
elif tag_name == 'footnote':
|
||||
return self._extract_footnote(element)
|
||||
elif tag_name == 'fraction':
|
||||
return self._extract_fraction(element)
|
||||
|
||||
return None
|
||||
|
||||
def extract_fact(self, element: HtmlElement) -> Optional[XBRLFact]:
|
||||
"""Extract XBRL fact from element."""
|
||||
context = self.extract_context(element)
|
||||
if not context:
|
||||
return None
|
||||
|
||||
# Get fact value
|
||||
value = self._get_fact_value(element)
|
||||
|
||||
# Create fact
|
||||
fact = XBRLFact(
|
||||
concept=context.get('name', ''),
|
||||
value=value,
|
||||
context_ref=context.get('contextRef'),
|
||||
unit_ref=context.get('unitRef'),
|
||||
decimals=context.get('decimals'),
|
||||
scale=context.get('scale'),
|
||||
format=context.get('format'),
|
||||
sign=context.get('sign')
|
||||
)
|
||||
|
||||
# Resolve references
|
||||
if fact.context_ref and fact.context_ref in self.contexts:
|
||||
fact.context = self.contexts[fact.context_ref]
|
||||
|
||||
if fact.unit_ref and fact.unit_ref in self.units:
|
||||
fact.unit = self.units[fact.unit_ref]
|
||||
|
||||
return fact
|
||||
|
||||
def _is_xbrl_element(self, element: HtmlElement) -> bool:
|
||||
"""Check if element is an XBRL element."""
|
||||
tag = element.tag
|
||||
if not isinstance(tag, str):
|
||||
return False
|
||||
|
||||
# Handle both namespaced and non-namespaced tags
|
||||
tag_lower = tag.lower()
|
||||
return (
|
||||
tag.startswith('{' + self.NAMESPACES['ix'] + '}') or
|
||||
tag.startswith('ix:') or
|
||||
tag_lower.startswith('ix:')
|
||||
)
|
||||
|
||||
def _get_local_name(self, tag: str) -> str:
|
||||
"""Get local name from qualified tag."""
|
||||
if '}' in tag:
|
||||
return tag.split('}')[1].lower()
|
||||
elif ':' in tag:
|
||||
return tag.split(':')[1].lower()
|
||||
return tag.lower()
|
||||
|
||||
def _initialize_context(self, element: HtmlElement):
|
||||
"""Initialize context and unit information from document."""
|
||||
# Find root element
|
||||
root = element.getroottree().getroot()
|
||||
|
||||
# Extract contexts
|
||||
self._extract_contexts(root)
|
||||
|
||||
# Extract units
|
||||
self._extract_units(root)
|
||||
|
||||
self._initialized = True
|
||||
|
||||
def _extract_contexts(self, root: HtmlElement):
|
||||
"""Extract all context definitions."""
|
||||
# Look for xbrli:context elements
|
||||
for context in root.xpath('//xbrli:context', namespaces=self.NAMESPACES):
|
||||
context_id = context.get('id')
|
||||
if not context_id:
|
||||
continue
|
||||
|
||||
context_data = {
|
||||
'id': context_id
|
||||
}
|
||||
|
||||
# Extract entity
|
||||
entity = context.find('.//xbrli:entity', namespaces=self.NAMESPACES)
|
||||
if entity is not None:
|
||||
identifier = entity.find('.//xbrli:identifier', namespaces=self.NAMESPACES)
|
||||
if identifier is not None:
|
||||
context_data['entity'] = identifier.text
|
||||
context_data['scheme'] = identifier.get('scheme')
|
||||
|
||||
# Extract period
|
||||
period = context.find('.//xbrli:period', namespaces=self.NAMESPACES)
|
||||
if period is not None:
|
||||
instant = period.find('.//xbrli:instant', namespaces=self.NAMESPACES)
|
||||
if instant is not None:
|
||||
context_data['instant'] = instant.text
|
||||
context_data['period_type'] = 'instant'
|
||||
else:
|
||||
start = period.find('.//xbrli:startDate', namespaces=self.NAMESPACES)
|
||||
end = period.find('.//xbrli:endDate', namespaces=self.NAMESPACES)
|
||||
if start is not None and end is not None:
|
||||
context_data['start_date'] = start.text
|
||||
context_data['end_date'] = end.text
|
||||
context_data['period_type'] = 'duration'
|
||||
|
||||
# Extract dimensions
|
||||
segment = context.find('.//xbrli:segment', namespaces=self.NAMESPACES)
|
||||
if segment is not None:
|
||||
dimensions = {}
|
||||
for member in segment.findall('.//xbrldi:explicitMember', namespaces=self.NAMESPACES):
|
||||
dim = member.get('dimension')
|
||||
if dim:
|
||||
dimensions[dim] = member.text
|
||||
if dimensions:
|
||||
context_data['dimensions'] = dimensions
|
||||
|
||||
self.contexts[context_id] = context_data
|
||||
|
||||
def _extract_units(self, root: HtmlElement):
|
||||
"""Extract all unit definitions."""
|
||||
# Look for xbrli:unit elements
|
||||
for unit in root.xpath('//xbrli:unit', namespaces=self.NAMESPACES):
|
||||
unit_id = unit.get('id')
|
||||
if not unit_id:
|
||||
continue
|
||||
|
||||
# Check for simple measure
|
||||
measure = unit.find('.//xbrli:measure', namespaces=self.NAMESPACES)
|
||||
if measure is not None:
|
||||
self.units[unit_id] = self._normalize_unit(measure.text)
|
||||
continue
|
||||
|
||||
# Check for complex unit (divide)
|
||||
divide = unit.find('.//xbrli:divide', namespaces=self.NAMESPACES)
|
||||
if divide is not None:
|
||||
numerator = divide.find('.//xbrli:unitNumerator/xbrli:measure', namespaces=self.NAMESPACES)
|
||||
denominator = divide.find('.//xbrli:unitDenominator/xbrli:measure', namespaces=self.NAMESPACES)
|
||||
|
||||
if numerator is not None and denominator is not None:
|
||||
num_unit = self._normalize_unit(numerator.text)
|
||||
den_unit = self._normalize_unit(denominator.text)
|
||||
self.units[unit_id] = f"{num_unit}/{den_unit}"
|
||||
|
||||
def _normalize_unit(self, unit_text: str) -> str:
|
||||
"""Normalize unit text."""
|
||||
if not unit_text:
|
||||
return ''
|
||||
|
||||
# Remove namespace prefix
|
||||
if ':' in unit_text:
|
||||
unit_text = unit_text.split(':')[-1]
|
||||
|
||||
# Common normalizations
|
||||
unit_map = {
|
||||
'usd': 'USD',
|
||||
'shares': 'shares',
|
||||
'pure': 'pure',
|
||||
'percent': '%'
|
||||
}
|
||||
|
||||
return unit_map.get(unit_text.lower(), unit_text)
|
||||
|
||||
def _extract_nonfraction(self, element: HtmlElement) -> Dict[str, Any]:
|
||||
"""Extract ix:nonFraction element."""
|
||||
metadata = {
|
||||
'type': 'nonFraction',
|
||||
'name': element.get('name'),
|
||||
'contextRef': element.get('contextRef') or element.get('contextref'),
|
||||
'unitRef': element.get('unitRef') or element.get('unitref'),
|
||||
'decimals': element.get('decimals'),
|
||||
'scale': element.get('scale'),
|
||||
'format': element.get('format'),
|
||||
'sign': element.get('sign')
|
||||
}
|
||||
|
||||
# Clean None values
|
||||
return {k: v for k, v in metadata.items() if v is not None}
|
||||
|
||||
def _extract_nonnumeric(self, element: HtmlElement) -> Dict[str, Any]:
|
||||
"""Extract ix:nonNumeric element."""
|
||||
metadata = {
|
||||
'type': 'nonNumeric',
|
||||
'name': element.get('name'),
|
||||
'contextRef': element.get('contextRef') or element.get('contextref'),
|
||||
'format': element.get('format')
|
||||
}
|
||||
|
||||
# Clean None values
|
||||
return {k: v for k, v in metadata.items() if v is not None}
|
||||
|
||||
def _extract_continuation(self, element: HtmlElement) -> Dict[str, Any]:
|
||||
"""Extract ix:continuation element."""
|
||||
cont_id = element.get('id')
|
||||
continued_at = element.get('continuedAt')
|
||||
|
||||
if cont_id and continued_at:
|
||||
# Map continuation to original
|
||||
if continued_at in self.continuations:
|
||||
original = self.continuations[continued_at]
|
||||
self.continuations[cont_id] = original
|
||||
return original
|
||||
else:
|
||||
# Store for later resolution
|
||||
metadata = {
|
||||
'type': 'continuation',
|
||||
'id': cont_id,
|
||||
'continuedAt': continued_at
|
||||
}
|
||||
self.continuations[cont_id] = metadata
|
||||
return metadata
|
||||
|
||||
return {}
|
||||
|
||||
def _extract_footnote(self, element: HtmlElement) -> Dict[str, Any]:
|
||||
"""Extract ix:footnote element."""
|
||||
return {
|
||||
'type': 'footnote',
|
||||
'footnoteRole': element.get('footnoteRole'),
|
||||
'footnoteID': element.get('footnoteID')
|
||||
}
|
||||
|
||||
def _extract_fraction(self, element: HtmlElement) -> Dict[str, Any]:
|
||||
"""Extract ix:fraction element."""
|
||||
metadata = {
|
||||
'type': 'fraction',
|
||||
'name': element.get('name'),
|
||||
'contextRef': element.get('contextRef'),
|
||||
'unitRef': element.get('unitRef')
|
||||
}
|
||||
|
||||
# Extract numerator and denominator
|
||||
numerator = element.find('.//ix:numerator', namespaces=self.NAMESPACES)
|
||||
denominator = element.find('.//ix:denominator', namespaces=self.NAMESPACES)
|
||||
|
||||
if numerator is not None:
|
||||
metadata['numerator'] = numerator.text
|
||||
if denominator is not None:
|
||||
metadata['denominator'] = denominator.text
|
||||
|
||||
return {k: v for k, v in metadata.items() if v is not None}
|
||||
|
||||
def _get_fact_value(self, element: HtmlElement) -> str:
    """
    Return the element's fact value with inline-XBRL transformations applied.

    Order matters: the `format` transformation runs first, then the
    `scale` multiplier, then the `sign` attribute. The result is always
    returned as a stripped string.
    """
    # Get raw value; element.text may be None for empty elements.
    value = element.text or ''

    # Apply format transformation if specified and known to this parser.
    format_attr = element.get('format')
    if format_attr and format_attr in self.TRANSFORMATIONS:
        transform = self.TRANSFORMATIONS[format_attr]
        value = transform(value)

    # Apply scale if specified: value * 10**scale. Non-numeric values are
    # deliberately left unchanged rather than raising.
    scale = element.get('scale')
    if scale:
        try:
            scale_factor = int(scale)
            numeric_value = float(value.replace(',', ''))
            scaled_value = numeric_value * (10 ** scale_factor)
            value = str(scaled_value)
        except (ValueError, TypeError):
            pass

    # Apply sign if specified: sign="-" forces a leading minus unless the
    # value already carries one.
    sign = element.get('sign')
    if sign == '-':
        if value and not value.startswith('-'):
            value = '-' + value

    return value.strip()
|
||||
1192
venv/lib/python3.10/site-packages/edgar/documents/table_nodes.py
Normal file
1192
venv/lib/python3.10/site-packages/edgar/documents/table_nodes.py
Normal file
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,62 @@
|
||||
"""
|
||||
Table processing utilities for document parsing.
|
||||
|
||||
This module consolidates the standard table matrix processing pipeline used
|
||||
across table rendering implementations (TableNode.render(), TableNode.to_dataframe(),
|
||||
and FastTableRenderer.render_table_node()).
|
||||
"""
|
||||
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from edgar.documents.utils.table_matrix import TableMatrix, ColumnAnalyzer
|
||||
from edgar.documents.utils.currency_merger import CurrencyColumnMerger
|
||||
|
||||
|
||||
def process_table_matrix(matrix: "TableMatrix", headers, rows) -> "TableMatrix":
    """
    Run the standard three-step table matrix processing pipeline.

    Steps:
        1. Populate the matrix from header/data rows (expands colspan/rowspan).
        2. Drop spacing columns that contain only whitespace.
        3. Merge currency-symbol columns into their adjacent value columns.

    Args:
        matrix: TableMatrix instance to populate
        headers: List of header rows (each row is a list of Cell objects)
        rows: List of data rows (each row is a list of Cell objects)

    Returns:
        Processed TableMatrix with spacing columns removed and currency
        columns merged.

    Example:
        >>> matrix = TableMatrix()
        >>> clean = process_table_matrix(matrix, headers, rows)

    Note:
        Consolidates the identical processing sequence previously duplicated
        in TableNode.render(), TableNode.to_dataframe() and
        FastTableRenderer.render_table_node().
    """
    # Runtime imports avoid a circular dependency with the utils package.
    from edgar.documents.utils.table_matrix import ColumnAnalyzer
    from edgar.documents.utils.currency_merger import CurrencyColumnMerger

    matrix.build_from_rows(headers, rows)

    # The analyzer is instantiated for parity with the original call sites,
    # even though its result is never consulted here.
    _ = ColumnAnalyzer(matrix)
    result = matrix.filter_spacing_columns()

    merger = CurrencyColumnMerger(result)
    merger.detect_currency_pairs()
    if merger.merge_pairs:
        result = merger.apply_merges()

    return result
|
||||
282
venv/lib/python3.10/site-packages/edgar/documents/types.py
Normal file
282
venv/lib/python3.10/site-packages/edgar/documents/types.py
Normal file
@@ -0,0 +1,282 @@
|
||||
"""
|
||||
Type definitions for the HTML parser.
|
||||
"""
|
||||
|
||||
import re
from dataclasses import dataclass, field
from enum import Enum, auto
from typing import Protocol, Union, Optional, Dict, Any, List
|
||||
|
||||
|
||||
class NodeType(Enum):
    """
    Structural types of nodes in the document tree.

    Values are assigned by auto(); only enum identity matters, not the
    underlying integer.
    """
    DOCUMENT = auto()
    SECTION = auto()
    HEADING = auto()
    PARAGRAPH = auto()
    TABLE = auto()
    LIST = auto()
    LIST_ITEM = auto()
    LINK = auto()
    IMAGE = auto()
    XBRL_FACT = auto()  # inline XBRL fact embedded in the document
    TEXT = auto()
    CONTAINER = auto()
|
||||
|
||||
|
||||
class SemanticType(Enum):
    """
    Semantic roles assigned to nodes for document understanding.

    Values are assigned by auto(); only enum identity matters.
    """
    TITLE = auto()
    HEADER = auto()
    BODY_TEXT = auto()
    FOOTNOTE = auto()
    TABLE_OF_CONTENTS = auto()
    FINANCIAL_STATEMENT = auto()
    DISCLOSURE = auto()
    ITEM_HEADER = auto()     # "Item N" style filing headers
    SECTION_HEADER = auto()
    SIGNATURE = auto()
    EXHIBIT = auto()
|
||||
|
||||
|
||||
class TableType(Enum):
    """
    Semantic categories for tables found in filings.

    Values are assigned by auto(); only enum identity matters.
    """
    FINANCIAL = auto()
    METRICS = auto()
    REFERENCE = auto()
    GENERAL = auto()
    TABLE_OF_CONTENTS = auto()
    EXHIBIT_INDEX = auto()
|
||||
|
||||
|
||||
@dataclass
class Style:
    """
    Unified style representation.

    Every field defaults to None, meaning "not specified"; merge() uses
    that convention to layer styles on top of each other.
    """
    font_size: Optional[float] = None
    font_weight: Optional[str] = None
    font_style: Optional[str] = None
    text_align: Optional[str] = None
    text_decoration: Optional[str] = None
    color: Optional[str] = None
    background_color: Optional[str] = None
    margin_top: Optional[float] = None
    margin_bottom: Optional[float] = None
    margin_left: Optional[float] = None
    margin_right: Optional[float] = None
    padding_top: Optional[float] = None
    padding_bottom: Optional[float] = None
    padding_left: Optional[float] = None
    padding_right: Optional[float] = None
    display: Optional[str] = None
    width: Optional[Union[float, str]] = None
    height: Optional[Union[float, str]] = None
    line_height: Optional[float] = None

    def merge(self, other: 'Style') -> 'Style':
        """Return a new Style where fields set on `other` win over this one."""
        combined = Style()
        for name in self.__dataclass_fields__:
            override = getattr(other, name)
            value = override if override is not None else getattr(self, name)
            setattr(combined, name, value)
        return combined

    @property
    def is_bold(self) -> bool:
        """True for an explicit bold weight (bold / 700 / 800 / 900)."""
        return self.font_weight in ('bold', '700', '800', '900')

    @property
    def is_italic(self) -> bool:
        """True when the font style is italic."""
        return self.font_style == 'italic'

    @property
    def is_centered(self) -> bool:
        """True when the text alignment is center."""
        return self.text_align == 'center'
|
||||
|
||||
|
||||
class NodeProtocol(Protocol):
    """
    Structural protocol satisfied by every node in the document tree.

    Declares the attributes and methods shared by all node types so code
    can operate on nodes without importing concrete classes.
    """
    id: str                             # unique node identifier
    type: NodeType                      # structural kind of this node
    content: Any                        # node payload
    metadata: Dict[str, Any]            # extra per-node attributes
    style: Style                        # unified style information
    parent: Optional['NodeProtocol']    # enclosing node, if any
    children: List['NodeProtocol']      # ordered child nodes

    def text(self) -> str: ...          # plain-text rendering
    def html(self) -> str: ...          # HTML rendering
    def find(self, predicate) -> List['NodeProtocol']: ...  # nodes matching predicate
|
||||
|
||||
|
||||
@dataclass
class HeaderInfo:
    """Information about a detected header."""
    level: int                       # heading depth, 1-6
    confidence: float                # detector confidence, 0.0-1.0
    text: str
    detection_method: str
    is_item: bool = False
    item_number: Optional[str] = None

    @classmethod
    def from_text(cls, text: str, level: int, confidence: float, method: str) -> 'HeaderInfo':
        """Create HeaderInfo from text, flagging "Item N"-style headers."""
        # Matches e.g. "Item 1A." or "ITEM 7"; group 2 captures the number.
        match = re.match(r'^(Item|ITEM)\s+(\d+[A-Z]?\.?)', text.strip(), re.IGNORECASE)
        number = match.group(2).rstrip('.') if match else None
        return cls(
            level=level,
            confidence=confidence,
            text=text,
            detection_method=method,
            is_item=match is not None,
            item_number=number
        )
|
||||
|
||||
|
||||
@dataclass
class XBRLFact:
    """A fact extracted from inline XBRL markup."""
    concept: str
    value: str
    context_ref: Optional[str] = None
    unit_ref: Optional[str] = None
    decimals: Optional[str] = None
    scale: Optional[str] = None
    format: Optional[str] = None
    sign: Optional[str] = None

    # Resolved references
    context: Optional[Dict[str, Any]] = None
    unit: Optional[str] = None

    # Additional metadata
    metadata: Optional[Dict[str, Any]] = None

    @property
    def numeric_value(self) -> Optional[float]:
        """The value parsed as a float (commas stripped), or None."""
        try:
            return float(self.value.replace(',', ''))
        except (ValueError, AttributeError):
            return None

    @property
    def is_numeric(self) -> bool:
        """True when the value parses as a number."""
        return self.numeric_value is not None

    def to_dict(self) -> Dict[str, Any]:
        """Serialize the fact, including the derived numeric fields."""
        payload = {
            'concept': self.concept,
            'value': self.value,
            'context_ref': self.context_ref,
            'unit_ref': self.unit_ref,
            'decimals': self.decimals,
            'scale': self.scale,
            'format': self.format,
            'sign': self.sign,
            'context': self.context,
            'unit': self.unit,
        }
        payload['is_numeric'] = self.is_numeric
        payload['numeric_value'] = self.numeric_value
        return payload
|
||||
|
||||
|
||||
@dataclass
class SearchResult:
    """
    A single hit from document search.

    Built for agent-style investigation workflows: beyond the matched
    snippet, it exposes the full enclosing section when available.
    """
    node: 'NodeProtocol'
    score: float
    snippet: str
    section: Optional[str] = None
    context: Optional[str] = None
    _section_obj: Optional[Any] = None  # Hidden Section object for agent navigation

    @property
    def section_object(self) -> Optional[Any]:
        """
        The full Section object for navigation, or None.

        When present, the object exposes text(), tables() and search()
        so multi-step investigation can continue from this hit.
        """
        return self._section_obj

    @property
    def full_context(self) -> str:
        """
        Complete section text when available, otherwise the snippet.

        Returning whole sections instead of fragments supports the
        "investigation not retrieval" usage pattern.
        """
        target = self._section_obj
        if target and hasattr(target, 'text'):
            return target.text()
        return self.snippet
|
||||
|
||||
|
||||
@dataclass
class ParseContext:
    """
    Mutable state threaded through a parsing pass.

    Tracks the base font size, current section, table/list nesting flags,
    tree depth and a stack of inherited styles.
    """
    base_font_size: float = 10.0            # document default font size
    current_section: Optional[str] = None   # name of the section being parsed
    in_table: bool = False
    in_list: bool = False
    depth: int = 0                          # current nesting depth
    # default_factory avoids the shared-mutable-default pitfall that the
    # previous `= None` annotation papered over.
    style_stack: List['Style'] = field(default_factory=list)

    def __post_init__(self):
        # Backward compatibility: callers may still pass style_stack=None.
        if self.style_stack is None:
            self.style_stack = []

    def push_style(self, style: 'Style') -> None:
        """Push a style scope onto the stack."""
        self.style_stack.append(style)

    def pop_style(self) -> None:
        """Pop the innermost style scope; no-op when the stack is empty."""
        if self.style_stack:
            self.style_stack.pop()

    def get_current_style(self) -> 'Style':
        """Return the effective style: stack entries merged bottom-up."""
        if not self.style_stack:
            return Style()
        result = self.style_stack[0]
        for style in self.style_stack[1:]:
            result = result.merge(style)
        return result
|
||||
|
||||
|
||||
# Type aliases for clarity
NodeId = str       # identifier of a document node
SectionName = str  # name of a document section
ConceptName = str  # XBRL concept name (see XBRLFact.concept)
ContextRef = str   # XBRL context reference (see XBRLFact.context_ref)
UnitRef = str      # XBRL unit reference (see XBRLFact.unit_ref)
|
||||
@@ -0,0 +1,51 @@
|
||||
"""
|
||||
Utility modules for HTML parsing.
|
||||
"""
|
||||
|
||||
from edgar.documents.utils.cache import (
|
||||
LRUCache,
|
||||
WeakCache,
|
||||
TimeBasedCache,
|
||||
CacheManager,
|
||||
get_cache_manager,
|
||||
cached,
|
||||
CacheStats
|
||||
)
|
||||
from edgar.documents.utils.streaming import (
|
||||
StreamingParser
|
||||
)
|
||||
from edgar.documents.utils.table_matrix import (
|
||||
TableMatrix,
|
||||
ColumnAnalyzer,
|
||||
MatrixCell
|
||||
)
|
||||
from edgar.documents.utils.currency_merger import (
|
||||
CurrencyColumnMerger
|
||||
)
|
||||
# Note: CacheableMixin not exported to avoid circular imports
|
||||
# Import directly: from edgar.documents.cache_mixin import CacheableMixin
|
||||
from edgar.documents.utils.html_utils import (
|
||||
remove_xml_declaration,
|
||||
create_lxml_parser
|
||||
)
|
||||
# Note: table_utils not exported to avoid circular imports
|
||||
# Import directly: from edgar.documents.utils.table_utils import process_table_matrix
|
||||
|
||||
__all__ = [
|
||||
'LRUCache',
|
||||
'WeakCache',
|
||||
'TimeBasedCache',
|
||||
'CacheManager',
|
||||
'get_cache_manager',
|
||||
'cached',
|
||||
'CacheStats',
|
||||
'StreamingParser',
|
||||
'TableMatrix',
|
||||
'ColumnAnalyzer',
|
||||
'MatrixCell',
|
||||
'CurrencyColumnMerger',
|
||||
# 'CacheableMixin', # Not exported - import directly to avoid circular imports
|
||||
'remove_xml_declaration',
|
||||
'create_lxml_parser',
|
||||
# 'process_table_matrix' # Not exported - import directly to avoid circular imports
|
||||
]
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,205 @@
|
||||
"""
|
||||
Lightweight anchor analysis cache to avoid re-parsing HTML.
|
||||
|
||||
This provides a middle-ground approach that caches anchor analysis results
|
||||
while minimizing memory overhead.
|
||||
"""
|
||||
import re
|
||||
from typing import Dict, Set, Optional
|
||||
from collections import Counter
|
||||
import hashlib
|
||||
import pickle
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
class AnchorCache:
    """
    Cache for anchor link analysis results.

    Navigation patterns are keyed by an MD5 hash of the HTML and stored
    both in memory (current session) and on disk under cache_dir.
    """

    def __init__(self, cache_dir: Optional[Path] = None):
        """
        Args:
            cache_dir: Directory for the on-disk cache. Defaults to
                ~/.edgar_cache/anchors; created if missing.
        """
        self.cache_dir = cache_dir or Path.home() / '.edgar_cache' / 'anchors'
        self.cache_dir.mkdir(parents=True, exist_ok=True)
        self._memory_cache = {}  # html hash -> patterns, current session only

    def _get_html_hash(self, html_content: str) -> str:
        """Return a stable digest of the HTML used as the cache key."""
        # MD5 is used as a fast fingerprint here, not for security.
        return hashlib.md5(html_content.encode('utf-8')).hexdigest()

    def get_navigation_patterns(self, html_content: str) -> Optional[Set[str]]:
        """
        Get cached navigation patterns for HTML content.

        Args:
            html_content: HTML to analyze

        Returns:
            Set of navigation patterns or None if not cached
        """
        html_hash = self._get_html_hash(html_content)

        # In-memory cache is the fast path.
        if html_hash in self._memory_cache:
            return self._memory_cache[html_hash]

        # Fall back to the on-disk cache.
        cache_file = self.cache_dir / f"{html_hash}.pkl"
        if cache_file.exists():
            try:
                with open(cache_file, 'rb') as f:
                    patterns = pickle.load(f)
                self._memory_cache[html_hash] = patterns
                return patterns
            except Exception:
                # Corrupted/unreadable cache file: drop it and re-analyze.
                # (Was a bare `except:`, which also swallowed KeyboardInterrupt
                # and SystemExit.)
                cache_file.unlink(missing_ok=True)

        return None

    def cache_navigation_patterns(self, html_content: str, patterns: Set[str]) -> None:
        """
        Cache navigation patterns for HTML content.

        Args:
            html_content: HTML content
            patterns: Navigation patterns to cache
        """
        html_hash = self._get_html_hash(html_content)

        # Store in memory first; the disk write below is best-effort.
        self._memory_cache[html_hash] = patterns

        try:
            cache_file = self.cache_dir / f"{html_hash}.pkl"
            with open(cache_file, 'wb') as f:
                pickle.dump(patterns, f)
        except Exception:
            # A failed disk write only loses cross-session caching.
            # (Was a bare `except:` — narrowed so Ctrl-C still propagates.)
            pass

    def clear_cache(self) -> None:
        """Clear the in-memory cache and delete all on-disk entries."""
        self._memory_cache.clear()
        for cache_file in self.cache_dir.glob("*.pkl"):
            cache_file.unlink(missing_ok=True)
|
||||
|
||||
|
||||
# Global cache instance shared by the module-level helpers below.
# NOTE(review): constructed at import time, so importing this module creates
# ~/.edgar_cache/anchors as a side effect (see AnchorCache.__init__).
_anchor_cache = AnchorCache()
|
||||
|
||||
|
||||
def get_cached_navigation_patterns(html_content: str,
                                   force_analyze: bool = False) -> Set[str]:
    """
    Return navigation patterns for the HTML, using the module-level cache.

    Args:
        html_content: HTML to analyze.
        force_analyze: When True, bypass the cache and re-analyze.

    Returns:
        Set of navigation link texts to filter.
    """
    if not force_analyze:
        hit = _anchor_cache.get_navigation_patterns(html_content)
        if hit is not None:
            return hit

    # Cache miss (or forced): run the lightweight regex analysis and store it.
    result = _analyze_navigation_minimal(html_content)
    _anchor_cache.cache_navigation_patterns(html_content, result)
    return result
|
||||
|
||||
|
||||
def _analyze_navigation_minimal(html_content: str, min_frequency: int = 5) -> Set[str]:
|
||||
"""
|
||||
Minimal navigation analysis using regex instead of full HTML parsing.
|
||||
|
||||
This avoids BeautifulSoup overhead by using regex to find anchor patterns.
|
||||
"""
|
||||
patterns = set()
|
||||
|
||||
# Find all anchor links with regex (faster than BeautifulSoup)
|
||||
anchor_pattern = re.compile(r'<a[^>]*href\s*=\s*["\']#([^"\']*)["\'][^>]*>(.*?)</a>',
|
||||
re.IGNORECASE | re.DOTALL)
|
||||
|
||||
link_counts = Counter()
|
||||
|
||||
for match in anchor_pattern.finditer(html_content):
|
||||
anchor_id = match.group(1).strip()
|
||||
link_text = re.sub(r'<[^>]+>', '', match.group(2)).strip() # Remove HTML tags
|
||||
link_text = ' '.join(link_text.split()) # Normalize whitespace
|
||||
|
||||
if link_text and len(link_text) < 100: # Reasonable link text length
|
||||
link_counts[link_text] += 1
|
||||
|
||||
# Add frequently occurring links
|
||||
for text, count in link_counts.items():
|
||||
if count >= min_frequency:
|
||||
patterns.add(text)
|
||||
|
||||
return patterns
|
||||
|
||||
|
||||
def filter_with_cached_patterns(text: str, html_content: str = None) -> str:
    """
    Remove repetitive navigation lines from text.

    The first couple of occurrences of each navigation pattern are kept,
    since those are usually genuine structural headers; later repeats are
    dropped as navigation boilerplate.

    Args:
        text: Text to filter.
        html_content: HTML used to derive patterns; when omitted, a small
            set of common SEC navigation strings is used instead.

    Returns:
        Filtered text.
    """
    if not text:
        return text

    if html_content:
        patterns = get_cached_navigation_patterns(html_content)
    else:
        # No HTML available: fall back to well-known SEC navigation headers.
        patterns = {
            'Table of Contents',
            'Index to Financial Statements',
            'Index to Exhibits'
        }

    if not patterns:
        return text

    keep_limit = 2  # occurrences of each pattern preserved as real headers
    seen = {}       # pattern -> occurrences kept so far
    kept = []

    for line in text.split('\n'):
        key = line.strip()
        if key not in patterns:
            # Ordinary content line: always keep.
            kept.append(line)
            continue
        occurrences = seen.get(key, 0)
        if occurrences < keep_limit:
            # Early occurrence: likely a document structure header.
            kept.append(line)
            seen[key] = occurrences + 1
        # else: repeated navigation link, drop it

    return '\n'.join(kept)
|
||||
426
venv/lib/python3.10/site-packages/edgar/documents/utils/cache.py
Normal file
426
venv/lib/python3.10/site-packages/edgar/documents/utils/cache.py
Normal file
@@ -0,0 +1,426 @@
|
||||
"""
|
||||
Cache utilities for performance optimization.
|
||||
"""
|
||||
|
||||
import weakref
|
||||
from collections import OrderedDict
|
||||
from typing import Any, Dict, Optional, Callable, TypeVar, Generic
|
||||
from functools import wraps
|
||||
import time
|
||||
import threading
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime, timedelta
|
||||
|
||||
T = TypeVar('T')
|
||||
|
||||
|
||||
@dataclass
class CacheStats:
    """Counters for monitoring cache performance."""
    hits: int = 0
    misses: int = 0
    evictions: int = 0
    total_time: float = 0.0
    last_reset: datetime = field(default_factory=datetime.now)

    @property
    def hit_rate(self) -> float:
        """Fraction of accesses served from cache; 0.0 when untouched."""
        accesses = self.hits + self.misses
        if not accesses:
            return 0.0
        return self.hits / accesses

    @property
    def avg_access_time(self) -> float:
        """Mean wall-clock time per access; 0.0 when untouched."""
        accesses = self.hits + self.misses
        if not accesses:
            return 0.0
        return self.total_time / accesses

    def reset(self):
        """Zero all counters and stamp the reset time."""
        self.hits = self.misses = self.evictions = 0
        self.total_time = 0.0
        self.last_reset = datetime.now()
|
||||
|
||||
|
||||
class LRUCache(Generic[T]):
    """
    Thread-safe least-recently-used cache.

    Used for caching expensive operations such as style parsing and
    header detection results. Every operation takes an internal
    re-entrant lock, and accesses are recorded in `stats`.
    """

    def __init__(self, max_size: int = 1000):
        """
        Args:
            max_size: Maximum number of entries held before eviction.
        """
        self.max_size = max_size
        self._cache: OrderedDict[str, T] = OrderedDict()
        self._lock = threading.RLock()
        self.stats = CacheStats()

    def get(self, key: str) -> Optional[T]:
        """Return the cached value for key (or None), updating recency."""
        started = time.time()

        with self._lock:
            try:
                value = self._cache[key]
            except KeyError:
                self.stats.misses += 1
                self.stats.total_time += time.time() - started
                return None
            # Promote to most-recently-used position.
            self._cache.move_to_end(key)
            self.stats.hits += 1
            self.stats.total_time += time.time() - started
            return value

    def put(self, key: str, value: T) -> None:
        """Insert or update key; evicts the oldest entry when over capacity."""
        with self._lock:
            if key in self._cache:
                # Refresh both recency and value; size cannot grow here.
                self._cache.move_to_end(key)
                self._cache[key] = value
                return

            self._cache[key] = value
            if len(self._cache) > self.max_size:
                # popitem(last=False) removes the least recently used entry.
                self._cache.popitem(last=False)
                self.stats.evictions += 1

    def clear(self) -> None:
        """Drop every cached entry."""
        with self._lock:
            self._cache.clear()

    def size(self) -> int:
        """Current number of cached entries."""
        with self._lock:
            return len(self._cache)
|
||||
|
||||
|
||||
class WeakCache:
    """
    Cache that holds weak references to parsed nodes.

    Entries vanish automatically once nothing else references the stored
    object, so the cache never keeps nodes alive on its own.
    """

    def __init__(self):
        """Initialize an empty weak-reference cache."""
        self._cache: Dict[str, weakref.ref] = {}
        self._lock = threading.RLock()
        self.stats = CacheStats()

    def get(self, key: str) -> Optional[Any]:
        """Return the live object for key, or None if absent or collected."""
        started = time.time()

        with self._lock:
            ref = self._cache.get(key)
            if ref is not None:
                target = ref()
                if target is not None:
                    self.stats.hits += 1
                    self.stats.total_time += time.time() - started
                    return target
                # Referent was garbage collected: drop the stale entry.
                del self._cache[key]

            self.stats.misses += 1
            self.stats.total_time += time.time() - started
            return None

    def put(self, key: str, value: Any) -> None:
        """Store a weak reference to value under key."""
        with self._lock:
            self._cache[key] = weakref.ref(value)

    def clear(self) -> None:
        """Drop every stored reference."""
        with self._lock:
            self._cache.clear()

    def cleanup(self) -> int:
        """
        Purge entries whose referents have died.

        Returns:
            Number of references removed.
        """
        with self._lock:
            dead = [key for key, ref in self._cache.items() if ref() is None]
            for key in dead:
                del self._cache[key]
            return len(dead)
|
||||
|
||||
|
||||
class TimeBasedCache(Generic[T]):
    """
    Cache whose entries expire after a fixed time-to-live.
    """

    def __init__(self, ttl_seconds: int = 3600):
        """
        Args:
            ttl_seconds: Lifetime of each entry, in seconds.
        """
        self.ttl = timedelta(seconds=ttl_seconds)
        self._cache: Dict[str, tuple[T, datetime]] = {}
        self._lock = threading.RLock()
        self.stats = CacheStats()

    def get(self, key: str) -> Optional[T]:
        """Return the value for key, or None when absent or expired."""
        started = time.time()

        with self._lock:
            entry = self._cache.get(key)
            if entry is not None:
                value, stored_at = entry
                if datetime.now() - stored_at < self.ttl:
                    self.stats.hits += 1
                    self.stats.total_time += time.time() - started
                    return value
                # Entry outlived its TTL: drop it and fall through to a miss.
                del self._cache[key]
                self.stats.evictions += 1

            self.stats.misses += 1
            self.stats.total_time += time.time() - started
            return None

    def put(self, key: str, value: T) -> None:
        """Store value under key, stamped with the current time."""
        with self._lock:
            self._cache[key] = (value, datetime.now())

    def clear(self) -> None:
        """Drop every cached entry."""
        with self._lock:
            self._cache.clear()

    def cleanup(self) -> int:
        """
        Remove expired entries.

        Returns:
            Number of items removed.
        """
        with self._lock:
            cutoff = datetime.now()
            stale = [key for key, (_, ts) in self._cache.items()
                     if cutoff - ts >= self.ttl]
            for key in stale:
                del self._cache[key]
                self.stats.evictions += 1
            return len(stale)
|
||||
|
||||
|
||||
def cached(cache: LRUCache, key_func: Optional[Callable] = None):
    """
    Decorator that memoizes a function's results in the given cache.

    Args:
        cache: Cache instance used for storage.
        key_func: Optional callable building the cache key from the call
            arguments; defaults to a repr-based key of name + args + kwargs.

    Returns:
        The decorating function.
    """
    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            if key_func is not None:
                key = key_func(*args, **kwargs)
            else:
                # Default: function name plus a stable repr of the arguments.
                key = f"{func.__name__}:{str(args)}:{str(sorted(kwargs.items()))}"

            hit = cache.get(key)
            if hit is not None:
                return hit

            # Miss: compute once and remember the result.
            hit = func(*args, **kwargs)
            cache.put(key, hit)
            return hit

        return wrapper

    return decorator
|
||||
|
||||
|
||||
class CacheManager:
    """
    Central registry of the parser's caches.

    Owns one cache per concern (styles, headers, patterns, nodes, regexes)
    and offers aggregate statistics, reset and cleanup operations.
    """

    def __init__(self):
        """Create all managed caches."""
        self.style_cache = LRUCache[dict](max_size=5000)     # parsed style results
        self.header_cache = LRUCache[bool](max_size=2000)    # header detection results
        self.pattern_cache = LRUCache[bool](max_size=10000)  # pattern-match results
        self.node_cache = WeakCache()                        # parsed node references
        self.regex_cache = LRUCache[Any](max_size=500)       # compiled regexes

        # Registry used by the bulk operations below.
        self._caches = {
            'style': self.style_cache,
            'header': self.header_cache,
            'pattern': self.pattern_cache,
            'node': self.node_cache,
            'regex': self.regex_cache
        }

    def get_stats(self) -> Dict[str, CacheStats]:
        """Statistics for every cache that tracks them."""
        return {name: c.stats
                for name, c in self._caches.items()
                if hasattr(c, 'stats')}

    def reset_stats(self) -> None:
        """Reset statistics on every cache that tracks them."""
        for c in self._caches.values():
            if hasattr(c, 'stats'):
                c.stats.reset()

    def clear_all(self) -> None:
        """Empty every managed cache."""
        for c in self._caches.values():
            c.clear()

    def cleanup(self) -> Dict[str, int]:
        """
        Drop dead/expired entries where supported.

        Returns:
            Mapping of cache name to number of entries removed.
        """
        removed = {}

        # Only the weak node cache currently supports cleanup.
        if hasattr(self.node_cache, 'cleanup'):
            removed['node'] = self.node_cache.cleanup()

        return removed

    def get_memory_usage(self) -> Dict[str, int]:
        """
        Rough per-cache memory estimate in bytes.

        Values come from sys.getsizeof on keys and values, so they are
        shallow approximations rather than exact accounting.
        """
        import sys

        usage = {}
        for name, c in self._caches.items():
            if not hasattr(c, '_cache'):
                continue
            total = 0
            if isinstance(c._cache, dict):
                for key, value in c._cache.items():
                    total += sys.getsizeof(key)
                    if hasattr(value, '__sizeof__'):
                        total += sys.getsizeof(value)
                    else:
                        total += 1000  # fallback estimate
            usage[name] = total

        return usage
|
||||
|
||||
|
||||
# Global cache manager instance (created lazily by get_cache_manager)
_cache_manager = None


def get_cache_manager() -> CacheManager:
    """Get global cache manager instance."""
    # Lazy singleton: the manager (and its caches) is built on first use.
    global _cache_manager
    if _cache_manager is None:
        _cache_manager = CacheManager()
    return _cache_manager
|
||||
@@ -0,0 +1,277 @@
|
||||
"""
|
||||
Currency column merger for handling separated currency symbols in SEC filings.
|
||||
"""
|
||||
|
||||
import re
|
||||
from typing import List, Tuple
|
||||
|
||||
from edgar.documents.table_nodes import Cell
|
||||
from edgar.documents.utils.table_matrix import TableMatrix, MatrixCell
|
||||
|
||||
|
||||
class CurrencyColumnMerger:
    """
    Detects and merges currency symbol columns with their value columns.

    SEC filings often split currency values into two cells:
    - Cell 1: "$" (left-aligned)
    - Cell 2: "224.11" (right-aligned)

    This class detects this pattern and merges them into "$224.11"
    """

    # Common currency symbols / codes recognised as a "symbol" cell.
    CURRENCY_SYMBOLS = {'$', '€', '£', '¥', '₹', 'Rs', 'USD', 'EUR', 'GBP'}

    # Pattern for numeric values (digits with optional commas and decimals).
    NUMERIC_PATTERN = re.compile(r'^[\d,]+\.?\d*$')

    def __init__(self, matrix: TableMatrix):
        """Initialize with a table matrix."""
        self.matrix = matrix
        self.merge_pairs: List[Tuple[int, int]] = []

    def detect_currency_pairs(self) -> List[Tuple[int, int]]:
        """
        Detect column pairs that should be merged (currency symbol + value).

        Returns:
            List of (symbol_col, value_col) pairs to merge
        """
        pairs = []

        for col_idx in range(self.matrix.col_count - 1):
            if self._is_currency_column(col_idx):
                next_col = col_idx + 1
                if self._is_numeric_column(next_col):
                    # Only merge when symbol/value cells line up row by row.
                    if self._verify_pairing(col_idx, next_col):
                        pairs.append((col_idx, next_col))

        self.merge_pairs = pairs
        return pairs

    def _is_currency_column(self, col_idx: int) -> bool:
        """
        Check if a column contains only currency symbols.

        A currency column typically:
        - Contains only currency symbols or empty cells
        - Has very narrow width (1-3 characters)
        - Is left-aligned (though we check content, not style)
        """
        currency_count = 0
        other_count = 0

        for row_idx in range(self.matrix.row_count):
            cell = self.matrix.matrix[row_idx][col_idx]
            if cell.original_cell is None or cell.is_spanned:
                continue
            text = cell.original_cell.text().strip()

            # Skip header rows (first 2 rows typically) unless they
            # already hold a currency symbol.
            if row_idx < 2 and text and text not in self.CURRENCY_SYMBOLS:
                continue

            if not text:
                continue
            # NOTE: a single membership test suffices; the old extra
            # "or text == '$'" and len(text) <= 3 branches were redundant
            # because those symbols are all in CURRENCY_SYMBOLS already.
            if text in self.CURRENCY_SYMBOLS:
                currency_count += 1
            else:
                other_count += 1

        # Column should be mostly currency symbols (headers excluded).
        total_non_empty = currency_count + other_count
        if total_non_empty == 0:
            return False

        # Accept a pure-symbol column outright, or a mostly-symbol column
        # (>= 60% of non-header, non-empty cells) with at least two hits.
        if currency_count >= 1 and other_count == 0:
            return True
        return currency_count >= 2 and currency_count / total_non_empty >= 0.6

    def _is_numeric_column(self, col_idx: int) -> bool:
        """
        Check if a column contains numeric values.
        """
        numeric_count = 0
        non_empty_count = 0

        for row_idx in range(self.matrix.row_count):
            cell = self.matrix.matrix[row_idx][col_idx]
            if cell.original_cell is None or cell.is_spanned:
                continue

            # Skip header rows.
            if row_idx < 2:
                continue

            text = cell.original_cell.text().strip()
            if text:
                non_empty_count += 1
                # Strip formatting characters, then test the bare number.
                clean_text = text.replace(',', '').replace('%', '').replace('(', '').replace(')', '')
                if self.NUMERIC_PATTERN.match(clean_text):
                    numeric_count += 1

        if non_empty_count == 0:
            return False

        # At least 60% of data cells should parse as numbers.
        return numeric_count / non_empty_count >= 0.6

    def _verify_pairing(self, symbol_col: int, value_col: int) -> bool:
        """
        Verify that symbol and value columns are consistently paired.

        They should have content in the same rows (when symbol present,
        value present). A lone symbol outside the header rows counts as
        a mismatch; a value without a symbol (continuation row) is fine.
        """
        paired_rows = 0
        mismatched_rows = 0

        for row_idx in range(self.matrix.row_count):
            symbol_cell = self.matrix.matrix[row_idx][symbol_col]
            value_cell = self.matrix.matrix[row_idx][value_col]

            if symbol_cell.original_cell and value_cell.original_cell:
                symbol_text = symbol_cell.original_cell.text().strip()
                value_text = value_cell.original_cell.text().strip()

                if symbol_text in self.CURRENCY_SYMBOLS and value_text:
                    paired_rows += 1
                elif symbol_text in self.CURRENCY_SYMBOLS and not value_text:
                    # Symbol without value: tolerated in the header rows only.
                    if row_idx >= 2:
                        mismatched_rows += 1
                # Both empty, or value without symbol, are acceptable.

        # Should have more paired than mismatched
        return paired_rows > mismatched_rows

    def apply_merges(self) -> 'TableMatrix':
        """
        Create a new matrix with currency columns merged.

        Returns:
            New TableMatrix with merged columns (or the original matrix
            unchanged when nothing needs merging)
        """
        if not self.merge_pairs:
            self.detect_currency_pairs()

        if not self.merge_pairs:
            # No merges needed
            return self.matrix

        # Each merge collapses the symbol column into the value column.
        new_col_count = self.matrix.col_count - len(self.merge_pairs)

        # Map surviving old column indices to their new positions.
        merged_cols = {pair[0] for pair in self.merge_pairs}  # symbol columns to drop
        value_cols = {pair[1] for pair in self.merge_pairs}   # hoisted out of the row loop
        old_to_new = {}
        new_col = 0
        for old_col in range(self.matrix.col_count):
            if old_col in merged_cols:
                # This column will be merged with the next, skip it.
                continue
            old_to_new[old_col] = new_col
            new_col += 1

        # Create new matrix
        new_matrix = TableMatrix()
        new_matrix.row_count = self.matrix.row_count
        new_matrix.col_count = new_col_count
        new_matrix.matrix = []

        # Build new matrix with merged cells
        for row_idx in range(self.matrix.row_count):
            new_row = [MatrixCell() for _ in range(new_col_count)]

            for old_col in range(self.matrix.col_count):
                # Check if this is a symbol column to merge
                merge_pair = next((pair for pair in self.merge_pairs if pair[0] == old_col), None)

                if merge_pair:
                    # Merge symbol with value into a single cell.
                    symbol_col, value_col = merge_pair
                    symbol_cell = self.matrix.matrix[row_idx][symbol_col]
                    value_cell = self.matrix.matrix[row_idx][value_col]

                    if value_cell.original_cell:
                        new_cell_content = self._merge_cell_content(symbol_cell, value_cell)
                        if new_cell_content:
                            # The merged cell inherits span/align from the value cell.
                            merged_cell = Cell(
                                content=new_cell_content,
                                colspan=value_cell.original_cell.colspan,
                                rowspan=value_cell.original_cell.rowspan,
                                is_header=value_cell.original_cell.is_header,
                                align=value_cell.original_cell.align
                            )

                            new_col_idx = old_to_new.get(value_col)
                            if new_col_idx is not None:
                                new_row[new_col_idx] = MatrixCell(
                                    original_cell=merged_cell,
                                    is_spanned=False,
                                    row_origin=row_idx,
                                    col_origin=new_col_idx
                                )

                elif old_col not in value_cols:
                    # Regular column, not involved in merging
                    new_col_idx = old_to_new.get(old_col)
                    if new_col_idx is not None:
                        new_row[new_col_idx] = self.matrix.matrix[row_idx][old_col]

            new_matrix.matrix.append(new_row)

        return new_matrix

    def _merge_cell_content(self, symbol_cell: MatrixCell, value_cell: MatrixCell) -> str:
        """
        Merge symbol and value cell contents.

        Returns:
            Merged content like "$224.11" or original value if no symbol
        """
        value_text = value_cell.original_cell.text().strip() if value_cell.original_cell else ""
        symbol_text = symbol_cell.original_cell.text().strip() if symbol_cell.original_cell else ""

        if not value_text:
            return symbol_text  # Just return symbol if no value

        if symbol_text in self.CURRENCY_SYMBOLS:
            # Prefix the symbol directly, with no separating space.
            # (The old special case for '$' produced the same string.)
            return f"{symbol_text}{value_text}"

        # No symbol, just return value
        return value_text

    def get_merge_summary(self) -> str:
        """Get a summary of merges to be applied."""
        if not self.merge_pairs:
            return "No currency column merges detected"

        summary = f"Currency merges detected: {len(self.merge_pairs)} pairs\n"
        for symbol_col, value_col in self.merge_pairs:
            summary += f"  • Column {symbol_col} ($) + Column {value_col} (value)\n"

        return summary
|
||||
@@ -0,0 +1,96 @@
|
||||
"""
|
||||
HTML utility functions for document parsing.
|
||||
|
||||
This module consolidates common HTML processing utilities used across
|
||||
the parser, preprocessor, and simple parser implementations.
|
||||
"""
|
||||
|
||||
import lxml.html
|
||||
from typing import Optional
|
||||
|
||||
|
||||
def remove_xml_declaration(html: str) -> str:
    """
    Remove XML declaration from HTML if present.

    SEC HTML documents sometimes include XML declarations like:
        <?xml version="1.0" encoding="UTF-8"?>

    These can interfere with HTML parsing and are safely removed since
    the encoding is handled separately by the parser.

    Args:
        html: HTML string that may contain XML declaration

    Returns:
        HTML string with XML declaration removed (if present)

    Examples:
        >>> html = '<?xml version="1.0"?><!DOCTYPE html><html>...'
        >>> remove_xml_declaration(html)
        '<!DOCTYPE html><html>...'

        >>> html = '<!DOCTYPE html><html>...'  # No XML declaration
        >>> remove_xml_declaration(html)
        '<!DOCTYPE html><html>...'
    """
    if html.strip().startswith('<?xml'):
        xml_end = html.find('?>')
        # Guard against a truncated declaration with no closing '?>'.
        # Previously find() returning -1 made this slice html[1:],
        # silently chopping the first character of the document.
        if xml_end != -1:
            return html[xml_end + 2:]
    return html
|
||||
|
||||
|
||||
def create_lxml_parser(
    remove_blank_text: bool = True,
    remove_comments: bool = True,
    recover: bool = True,
    encoding: Optional[str] = 'utf-8'
) -> lxml.html.HTMLParser:
    """
    Create a configured lxml HTMLParser.

    Factory for the lxml HTMLParser used throughout the document
    parsing system, so every caller gets consistent settings.

    Args:
        remove_blank_text: Drop blank text nodes between tags for a
            cleaner tree. Default True.
        remove_comments: Strip HTML comments from the parsed tree.
            Default True since comments are rarely needed.
        recover: Recover from malformed HTML instead of failing.
            Default True since SEC filings often have HTML issues.
        encoding: Character encoding for the parser. Default 'utf-8';
            pass None to let lxml auto-detect.

    Returns:
        Configured lxml.html.HTMLParser instance

    Examples:
        >>> parser = create_lxml_parser()
        >>> parser = create_lxml_parser(remove_blank_text=False,
        ...                             remove_comments=False)
        >>> parser = create_lxml_parser(encoding=None)

    Note:
        recover=True is critical for SEC documents, which often contain
        non-standard HTML structures.
    """
    options = dict(
        remove_blank_text=remove_blank_text,
        remove_comments=remove_comments,
        recover=recover,
    )

    # Omit the key entirely when encoding is disabled so lxml auto-detects.
    if encoding is not None:
        options['encoding'] = encoding

    return lxml.html.HTMLParser(**options)
|
||||
@@ -0,0 +1,375 @@
|
||||
"""
|
||||
Streaming parser for large HTML documents.
|
||||
"""
|
||||
|
||||
import io
|
||||
from typing import Dict, Any, TYPE_CHECKING
|
||||
|
||||
from lxml import etree
|
||||
from lxml.html import HtmlElement
|
||||
|
||||
from edgar.documents.config import ParserConfig
|
||||
from edgar.documents.exceptions import HTMLParsingError, DocumentTooLargeError
|
||||
|
||||
# Use TYPE_CHECKING to avoid circular imports
|
||||
if TYPE_CHECKING:
|
||||
from edgar.documents.document import Document, DocumentMetadata
|
||||
from edgar.documents.nodes import DocumentNode, HeadingNode, ParagraphNode, TextNode, SectionNode, ContainerNode
|
||||
from edgar.documents.table_nodes import TableNode
|
||||
from edgar.documents.types import SemanticType
|
||||
|
||||
|
||||
class StreamingParser:
    """
    Streaming parser for large HTML documents.

    Processes documents in chunks to minimize memory usage
    while maintaining parse quality.
    """

    # Chunk size for streaming (1MB)
    CHUNK_SIZE = 1024 * 1024

    # Maximum number of buffered nodes before they are flushed to the tree
    MAX_NODE_BUFFER = 1000

    def __init__(self, config: ParserConfig, strategies: Dict[str, Any]):
        """
        Initialize streaming parser.

        Args:
            config: Parser configuration
            strategies: Parsing strategies to use, keyed by name
                (e.g. 'header_detection', 'style_parser', 'table_processing')
        """
        self.config = config
        self.strategies = strategies
        self._reset_state()

    def _reset_state(self):
        """Reset all per-parse state so the parser instance can be reused."""
        # Import here to avoid circular import
        from edgar.documents.document import DocumentMetadata
        from edgar.documents.nodes import DocumentNode

        self.current_section = None
        self.node_buffer = []
        self.metadata = DocumentMetadata()
        self.root = DocumentNode()
        self.current_parent = self.root
        self.tag_stack = []
        self.text_buffer = []
        self.in_table = False
        self.table_buffer = []
        self.bytes_processed = 0

    def parse(self, html: str) -> "Document":
        """
        Parse HTML in streaming mode.

        Args:
            html: HTML content to parse

        Returns:
            Parsed Document

        Raises:
            DocumentTooLargeError: If document exceeds size limit
            HTMLParsingError: If parsing fails
        """
        self._reset_state()

        # Store original HTML BEFORE parsing (needed for TOC-based section detection)
        original_html = html

        try:
            # Create streaming parser
            parser = etree.iterparse(
                io.BytesIO(html.encode('utf-8')),
                events=('start', 'end'),
                html=True,
                recover=True,
                encoding='utf-8'
            )

            # Process events
            for event, elem in parser:
                self._process_event(event, elem)

                # Check size limit.
                # NOTE(review): serializing each element on every event is
                # costly and counts elements at both start and end events,
                # so bytes_processed is a rough upper bound, not exact —
                # confirm this is the intended limit semantics.
                self.bytes_processed += len(etree.tostring(elem, encoding='unicode', method='html'))
                if self.bytes_processed > self.config.max_document_size:
                    raise DocumentTooLargeError(self.bytes_processed, self.config.max_document_size)

                # Flush buffer if needed
                if len(self.node_buffer) >= self.MAX_NODE_BUFFER:
                    self._flush_buffer()

                # Clean up processed elements to save memory
                elem.clear()
                while elem.getprevious() is not None:
                    parent = elem.getparent()
                    if parent is not None:
                        del parent[0]
                    else:
                        break

            # Final flush
            self._flush_buffer()

            # Store original HTML in metadata for section detection (TOC analysis)
            self.metadata.original_html = original_html

            # Create document (import here to avoid circular import)
            from edgar.documents.document import Document
            document = Document(root=self.root, metadata=self.metadata)

            # Store config reference (required for section detection)
            document._config = self.config

            # Apply post-processing
            from edgar.documents.processors.postprocessor import DocumentPostprocessor
            postprocessor = DocumentPostprocessor(self.config)
            document = postprocessor.process(document)

            return document

        except etree.ParseError as e:
            raise HTMLParsingError(f"Streaming parse failed: {str(e)}")
        except Exception as e:
            if isinstance(e, (DocumentTooLargeError, HTMLParsingError)):
                raise
            raise HTMLParsingError(f"Unexpected error during streaming parse: {str(e)}")

    def _process_event(self, event: str, elem: HtmlElement):
        """Dispatch a parse event to the start/end handler."""
        if event == 'start':
            self._handle_start_tag(elem)
        elif event == 'end':
            self._handle_end_tag(elem)

    def _handle_start_tag(self, elem: HtmlElement):
        """Handle opening tag: metadata extraction and node creation."""
        # Import node types at runtime to avoid circular imports
        from edgar.documents.nodes import ContainerNode

        tag = elem.tag.lower()

        # Track tag stack
        self.tag_stack.append(tag)

        # Extract metadata from early elements
        if tag == 'title' and elem.text:
            self._extract_title_metadata(elem.text)
        elif tag == 'meta':
            self._extract_meta_metadata(elem)

        # Handle specific tags
        if tag == 'body':
            # Create a container for body content
            body_container = ContainerNode(tag_name='body')
            self.root.add_child(body_container)
            self.current_parent = body_container
        elif tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
            self._start_heading(elem)
        elif tag == 'p':
            self._start_paragraph(elem)
        elif tag == 'table':
            self._start_table(elem)
        elif tag == 'section':
            self._start_section(elem)

    def _handle_end_tag(self, elem: HtmlElement):
        """Handle closing tag: finalize the node started for this element."""
        tag = elem.tag.lower()

        # Remove from tag stack
        if self.tag_stack and self.tag_stack[-1] == tag:
            self.tag_stack.pop()

        # Handle specific tags
        if tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
            self._end_heading(elem)
        elif tag == 'p':
            self._end_paragraph(elem)
        elif tag == 'table':
            self._end_table(elem)
        elif tag == 'section':
            self._end_section(elem)
        elif tag == 'body':
            # When body ends, flush any remaining nodes
            self._flush_buffer()

        # Handle text content
        if elem.text:
            self.text_buffer.append(elem.text.strip())
        if elem.tail:
            self.text_buffer.append(elem.tail.strip())

    def _start_heading(self, elem: HtmlElement):
        """Start processing a heading (h1-h6)."""
        # Import node types at runtime to avoid circular imports.
        # BUGFIX: SemanticType must also be imported here — the module-level
        # import is guarded by TYPE_CHECKING and does not exist at runtime,
        # so referencing it below previously raised NameError whenever a
        # section header was detected.
        from edgar.documents.nodes import HeadingNode
        from edgar.documents.types import SemanticType

        level = int(elem.tag[1])
        text = self._get_text_content(elem)

        # Create heading node
        heading = HeadingNode(
            level=level,
            content=text
        )

        # Check if this is a section header
        if self.strategies.get('header_detection'):
            detector = self.strategies['header_detection']
            if detector.is_section_header(text, elem):
                heading.semantic_type = SemanticType.SECTION_HEADER

        self.node_buffer.append(heading)

    def _end_heading(self, elem: HtmlElement):
        """End processing a heading: refresh its text now children exist."""
        # Import node types at runtime to avoid circular imports
        from edgar.documents.nodes import HeadingNode

        # Get text content from element
        text = self._get_text_content(elem)
        if text and self.node_buffer and isinstance(self.node_buffer[-1], HeadingNode):
            self.node_buffer[-1].content = text

        # Clear any accumulated text buffer
        self.text_buffer.clear()

    def _start_paragraph(self, elem: HtmlElement):
        """Start processing a paragraph."""
        # Import node types at runtime to avoid circular imports
        from edgar.documents.nodes import ParagraphNode

        para = ParagraphNode()

        # Get style if present
        style_attr = elem.get('style')
        if style_attr and self.strategies.get('style_parser'):
            style_parser = self.strategies['style_parser']
            para.style = style_parser.parse(style_attr)

        self.node_buffer.append(para)

    def _end_paragraph(self, elem: HtmlElement):
        """End processing a paragraph: attach its text as a child node."""
        # Import node types at runtime to avoid circular imports
        from edgar.documents.nodes import ParagraphNode, TextNode

        # Get text content from element
        text = self._get_text_content(elem)
        if text and self.node_buffer and isinstance(self.node_buffer[-1], ParagraphNode):
            text_node = TextNode(content=text)
            self.node_buffer[-1].add_child(text_node)

        # Clear any accumulated text buffer
        self.text_buffer.clear()

    def _start_table(self, elem: HtmlElement):
        """Start processing a table."""
        self.in_table = True
        self.table_buffer = []

        # Store table element for later processing
        self.table_elem = elem

    def _end_table(self, elem: HtmlElement):
        """End processing a table, delegating to the table strategy if set."""
        # Import node types at runtime to avoid circular imports
        from edgar.documents.table_nodes import TableNode

        self.in_table = False

        # Process table with table processor if available
        if self.strategies.get('table_processing'):
            processor = self.strategies['table_processing']
            table_node = processor.process(elem)
            if table_node:
                self.node_buffer.append(table_node)
        else:
            # Basic table node
            table = TableNode()
            self.node_buffer.append(table)

        self.table_buffer.clear()

    def _start_section(self, elem: HtmlElement):
        """Start processing a section element."""
        # Import node types at runtime to avoid circular imports
        from edgar.documents.nodes import SectionNode

        section = SectionNode()

        # Get section attributes
        section_id = elem.get('id')
        if section_id:
            section.metadata['id'] = section_id

        section_class = elem.get('class')
        if section_class:
            section.metadata['class'] = section_class

        self.current_section = section
        self.node_buffer.append(section)

    def _end_section(self, elem: HtmlElement):
        """End processing a section."""
        self.current_section = None

    def _flush_buffer(self):
        """Flush buffered nodes into the document tree."""
        for node in self.node_buffer:
            # Nodes created inside a <section> attach to it; otherwise
            # they attach to the current parent (body container or root).
            if self.current_section:
                self.current_section.add_child(node)
            else:
                self.current_parent.add_child(node)

        self.node_buffer.clear()

    def _get_text_content(self, elem: HtmlElement) -> str:
        """Extract text content from element, recursing into children."""
        text_parts = []

        if elem.text:
            text_parts.append(elem.text.strip())

        for child in elem:
            child_text = self._get_text_content(child)
            if child_text:
                text_parts.append(child_text)
            if child.tail:
                text_parts.append(child.tail.strip())

        return ' '.join(text_parts)

    def _extract_title_metadata(self, title: str):
        """Extract company/form/date metadata from the <title> text."""
        # Example: "APPLE INC - 10-K - 2023-09-30"
        parts = title.split(' - ')
        if len(parts) >= 2:
            self.metadata.company = parts[0].strip()
            self.metadata.form = parts[1].strip()
        if len(parts) >= 3:
            self.metadata.filing_date = parts[2].strip()

    def _extract_meta_metadata(self, elem: HtmlElement):
        """Extract metadata from <meta name=... content=...> tags."""
        name = elem.get('name', '').lower()
        content = elem.get('content', '')

        if name and content:
            if name == 'company':
                self.metadata.company = content
            elif name == 'filing-type':
                self.metadata.form = content
            elif name == 'cik':
                self.metadata.cik = content
            elif name == 'filing-date':
                self.metadata.filing_date = content
            elif name == 'accession-number':
                self.metadata.accession_number = content
|
||||
@@ -0,0 +1,858 @@
|
||||
"""
|
||||
Table matrix builder for handling complex colspan/rowspan structures.
|
||||
"""
|
||||
|
||||
from dataclasses import dataclass
|
||||
from typing import List, Optional
|
||||
|
||||
from edgar.documents.table_nodes import Cell, Row
|
||||
|
||||
|
||||
@dataclass
class MatrixCell:
    """Cell in the matrix with reference to original cell"""
    # The table Cell occupying this grid position; None for an empty slot.
    original_cell: Optional[Cell] = None
    is_spanned: bool = False  # True if this is part of a colspan/rowspan
    row_origin: int = -1  # Original row index (-1 when unset)
    col_origin: int = -1  # Original column index (-1 when unset)
|
||||
|
||||
|
||||
class TableMatrix:
|
||||
"""
|
||||
Build a 2D matrix representation of table with proper handling of merged cells.
|
||||
|
||||
This class converts a table with colspan/rowspan into a regular 2D grid
|
||||
where each merged cell occupies multiple positions in the matrix.
|
||||
"""
|
||||
|
||||
def __init__(self):
    """Initialize empty matrix"""
    # 2D grid of MatrixCell; merged cells occupy several positions.
    self.matrix: List[List[MatrixCell]] = []
    self.row_count = 0
    self.col_count = 0
    self.header_row_count = 0  # Track number of header rows
|
||||
|
||||
def build_from_rows(self, header_rows: List[List[Cell]], data_rows: List[Row]) -> 'TableMatrix':
    """
    Build matrix from header rows and data rows.

    Args:
        header_rows: List of header rows (each row is a list of Cells)
        data_rows: List of Row objects

    Returns:
        Self for chaining
    """
    # Remember how many leading rows are headers for later consumers.
    self.header_row_count = len(header_rows)

    # Flatten headers and data into a single list of cell lists.
    all_rows = list(header_rows)
    all_rows.extend(row.cells for row in data_rows)

    if not all_rows:
        return self

    self.row_count = len(all_rows)

    # Pass 1: work out how many columns the grid needs (colspan-aware).
    self._calculate_dimensions(all_rows)

    # Allocate the empty grid.
    self.matrix = [
        [MatrixCell() for _ in range(self.col_count)]
        for _ in range(self.row_count)
    ]

    # Pass 2: drop each cell into its grid positions.
    self._place_cells(all_rows)

    return self
|
||||
|
||||
def _calculate_dimensions(self, rows: List[List[Cell]]):
    """Calculate the actual dimensions considering colspan.

    Sets self.col_count to the widest row, counting each cell as
    cell.colspan columns.

    NOTE(review): _is_occupied reads self.matrix, which is still empty
    when this first pass runs, so the rowspan skip below is effectively
    a no-op here — confirm whether rowspan-aware sizing was intended.
    """
    max_cols = 0

    for row_idx, row in enumerate(rows):
        col_pos = 0
        for cell in row:
            # Skip positions that might be occupied by rowspan from above
            while col_pos < max_cols and self._is_occupied(row_idx, col_pos):
                col_pos += 1

            # This cell will occupy from col_pos to col_pos + colspan
            col_end = col_pos + cell.colspan
            max_cols = max(max_cols, col_end)
            col_pos = col_end

    self.col_count = max_cols
|
||||
|
||||
def _is_occupied(self, row: int, col: int) -> bool:
    """Check if a position is occupied by a cell from a previous row (rowspan).

    Scans every row above `row` at column `col` and returns True when a
    cell originating there has a rowspan reaching down to `row`.
    """
    if row == 0:
        return False

    # Check if any cell above has rowspan that reaches this position
    for prev_row in range(row):
        if prev_row < len(self.matrix) and col < len(self.matrix[prev_row]):
            cell = self.matrix[prev_row][col]
            # Only consider the origin position of a cell, not its
            # spanned copies, so rowspan is counted from the right row.
            if cell.original_cell and cell.row_origin == prev_row:
                # Check if this cell's rowspan reaches current row
                if prev_row + cell.original_cell.rowspan > row:
                    return True
    return False
|
||||
|
||||
def _place_cells(self, rows: List[List[Cell]]):
    """Place cells in the matrix handling colspan and rowspan.

    Normally a cell with colspan/rowspan is copied into every grid
    position it covers, with is_spanned=True everywhere except its
    origin. A narrow special case shifts certain colspan=2 numeric
    values one column right so they align with '$'-prefixed values
    (the "Table 15" fix referenced in the comments below).
    """
    for row_idx, row in enumerate(rows):
        col_pos = 0

        for cell_idx, cell in enumerate(row):
            # Find next available column position
            while col_pos < self.col_count and self.matrix[row_idx][col_pos].original_cell is not None:
                col_pos += 1

            if col_pos >= self.col_count:
                # Need to expand matrix
                self._expand_columns(col_pos + cell.colspan)

            # Special handling for cells with colspan > 1 containing numeric values
            # Only apply this logic for Table 15-style alignment issues
            # Check if this looks like a financial value that should be right-aligned
            cell_text = cell.text().strip()

            # Check for numeric values that need special alignment
            # This is specifically for cases like "167,045" that should align with "$167,045"
            has_comma_separator = ',' in cell_text
            digit_ratio = sum(c.isdigit() for c in cell_text) / len(cell_text) if cell_text else 0

            # Only apply special placement for colspan=2 numeric values in data rows
            # This handles Table 15's specific case without breaking Table 13
            is_special_numeric = (cell.colspan == 2 and  # Specifically colspan=2
                                  has_comma_separator and
                                  digit_ratio > 0.5 and  # More than 50% digits
                                  not cell_text.startswith('$') and
                                  not any(month in cell_text.lower() for month in
                                          ['jan', 'feb', 'mar', 'apr', 'may', 'jun',
                                           'jul', 'aug', 'sep', 'oct', 'nov', 'dec']) and
                                  row_idx > 1)  # Not a header row (allow for multi-row headers)

            if is_special_numeric:
                # Place empty cell at first position, content at second position
                # This is specifically for Table 15 alignment
                for r in range(cell.rowspan):
                    # First column of span: empty
                    if row_idx + r < self.row_count and col_pos < self.col_count:
                        self.matrix[row_idx + r][col_pos] = MatrixCell()

                    # Second column of span: the actual content
                    if row_idx + r < self.row_count and col_pos + 1 < self.col_count:
                        matrix_cell = MatrixCell(
                            original_cell=cell,
                            is_spanned=False,
                            row_origin=row_idx,
                            col_origin=col_pos + 1
                        )
                        self.matrix[row_idx + r][col_pos + 1] = matrix_cell

                    # Remaining columns of span: mark as spanned (though colspan=2 has no remaining)
                    for c in range(2, cell.colspan):
                        if row_idx + r < self.row_count and col_pos + c < self.col_count:
                            matrix_cell = MatrixCell(
                                original_cell=cell,
                                is_spanned=True,
                                row_origin=row_idx,
                                col_origin=col_pos + 1
                            )
                            self.matrix[row_idx + r][col_pos + c] = matrix_cell
            else:
                # Normal placement for other cells
                for r in range(cell.rowspan):
                    for c in range(cell.colspan):
                        if row_idx + r < self.row_count and col_pos + c < self.col_count:
                            matrix_cell = MatrixCell(
                                original_cell=cell,
                                is_spanned=(r > 0 or c > 0),
                                row_origin=row_idx,
                                col_origin=col_pos
                            )
                            self.matrix[row_idx + r][col_pos + c] = matrix_cell

            col_pos += cell.colspan
|
||||
|
||||
def _expand_columns(self, new_col_count: int):
    """Grow the matrix rightwards so it holds at least `new_col_count` columns.

    The matrix is never shrunk: a request for fewer (or equal) columns than
    the current width is a no-op. New positions are padded with empty
    MatrixCell placeholders in every existing row.
    """
    extra = new_col_count - self.col_count
    if extra <= 0:
        # Already wide enough — expansion only ever grows the matrix.
        return

    for existing_row in self.matrix:
        existing_row.extend(MatrixCell() for _ in range(extra))

    self.col_count = new_col_count
|
||||
|
||||
def get_actual_columns(self) -> int:
    """Count columns holding real data, ignoring empty/spacing columns.

    A column counts when at least one of its origin cells (not a spanned
    continuation) has visible, non-whitespace text.
    """
    def _column_has_content(col: int) -> bool:
        for row in range(self.row_count):
            entry = self.matrix[row][col]
            if entry.original_cell and not entry.is_spanned:
                stripped = entry.original_cell.text().strip()
                # Guard against whitespace-only / NBSP-only cells.
                if stripped and stripped not in ('', ' ', '\xa0'):
                    return True
        return False

    return sum(1 for col in range(self.col_count) if _column_has_content(col))
|
||||
|
||||
def get_column_widths(self) -> List[float]:
    """Estimate per-column display widths from cell content.

    Returns:
        One width per column: the longest stripped text among the column's
        origin cells, or 0 for a column with no content at all (such columns
        are likely spacing columns).
    """
    widths: List[float] = []

    for col in range(self.col_count):
        texts = []
        for row in range(self.row_count):
            entry = self.matrix[row][col]
            if entry.original_cell and not entry.is_spanned:
                stripped = entry.original_cell.text().strip()
                if stripped:
                    texts.append(stripped)

        # No content anywhere in the column -> treat as spacing (width 0).
        widths.append(max(map(len, texts)) if texts else 0)

    return widths
|
||||
|
||||
def get_cell(self, row_idx: int, col_idx: int) -> Optional[Cell]:
    """
    Fetch the cell at a matrix position.

    Args:
        row_idx: Row index
        col_idx: Column index

    Returns:
        The original Cell at that position, an empty Cell for an unfilled
        position, or None when the indices fall outside the matrix.
    """
    in_bounds = (0 <= row_idx < self.row_count) and (0 <= col_idx < self.col_count)
    if not in_bounds:
        return None

    entry = self.matrix[row_idx][col_idx]

    # Unfilled positions are normalized to an empty Cell rather than None,
    # so in-bounds lookups always yield a Cell.
    return entry.original_cell if entry.original_cell else Cell("")
|
||||
|
||||
def get_expanded_row(self, row_idx: int) -> List[Optional[Cell]]:
    """
    Return one row widened to the full column count.

    A cell with colspan > 1 appears once, at its origin position; every
    spanned continuation and every empty position becomes None. An
    out-of-range row index yields an empty list.
    """
    if row_idx >= self.row_count:
        return []

    cells = self.matrix[row_idx]
    result: List[Optional[Cell]] = []
    for col in range(self.col_count):
        entry = cells[col]
        is_origin = bool(entry.original_cell) and not entry.is_spanned
        result.append(entry.original_cell if is_origin else None)

    return result
|
||||
|
||||
def get_data_columns(self) -> List[int]:
    """
    Return indices of columns that carry data, pruning spacing columns.

    Mirrors the old parser's strategy:
      * leading and trailing empty columns are dropped entirely;
      * an interior run of consecutive empty columns keeps only its first
        column, preserved as a single visual spacer.

    Returns:
        Sorted list of surviving column indices.
    """
    def _is_empty(col: int) -> bool:
        # Empty means: no origin cell in the column has visible text.
        for row in range(self.row_count):
            entry = self.matrix[row][col]
            if entry.original_cell and not entry.is_spanned:
                if entry.original_cell.text().strip():
                    return False
        return True

    empty = {col for col in range(self.col_count) if _is_empty(col)}
    removed = set()

    # Drop leading empties.
    for col in range(self.col_count):
        if col not in empty:
            break
        removed.add(col)

    # Drop trailing empties.
    for col in reversed(range(self.col_count)):
        if col not in empty:
            break
        removed.add(col)

    # Collapse interior runs of empties, keeping the first as a spacer.
    col = 0
    while col < self.col_count - 1:
        if col in empty and (col + 1) in empty:
            run_end = col
            while run_end < self.col_count and run_end in empty:
                run_end += 1
            removed.update(range(col + 1, run_end))
            col = run_end
        else:
            col += 1

    return [c for c in range(self.col_count) if c not in removed]
|
||||
|
||||
def filter_spacing_columns(self) -> 'TableMatrix':
    """
    Create a new matrix with spacing columns removed.
    Also handles colspan-generated duplicate columns and misalignment.

    The method works in phases:
      1. Classify columns: primary headers (colspan > 1 in the first rows),
         columns with any header content, and columns holding data.
      2. Detect "misaligned" data columns (data outside header columns) and
         plan pattern-based consolidations ('$' + number, '(num' + ')',
         number + '%') into the adjacent column.
      3. Grow the keep-set with several heuristics (data near headers,
         date/text columns, financial-looking columns, row-label column 0).
      4. Rebuild a narrower TableMatrix, merging consolidated cell text.

    Returns:
        New TableMatrix with only data columns (or self when nothing
        survives filtering, as a safe fallback).
    """
    # First pass: identify primary header columns (those with colspan > 1 headers)
    # and data columns
    primary_header_cols = set()
    all_header_cols = set()
    data_cols = set()

    # Find primary header columns (those that start a colspan).
    # Only the first 3 rows are scanned — headers are assumed near the top.
    for row_idx in range(min(3, self.row_count)):
        for col_idx in range(self.col_count):
            cell = self.matrix[row_idx][col_idx]
            if cell.original_cell and not cell.is_spanned:
                if cell.original_cell.text().strip():
                    all_header_cols.add(col_idx)
                    # Check if this is a primary header (colspan > 1)
                    if cell.original_cell.colspan > 1:
                        primary_header_cols.add(col_idx)

    # If no primary headers found, use all headers as primary
    if not primary_header_cols:
        primary_header_cols = all_header_cols

    # Phase 1.5: Identify columns with header content
    # Any column with non-empty text in ANY header row must be preserved
    # This prevents legitimate header columns from being removed as "spacing"
    # Also preserve columns that are spanned by headers (colspan > 1)
    header_content_columns = set()
    for col_idx in range(self.col_count):
        for row_idx in range(self.header_row_count):
            cell = self.matrix[row_idx][col_idx]
            if cell.original_cell:
                # Check for original header cell with content
                if not cell.is_spanned:
                    text = cell.original_cell.text().strip()
                    if text:
                        header_content_columns.add(col_idx)
                        # Also add all columns spanned by this header
                        if cell.original_cell.colspan > 1:
                            for span_offset in range(1, cell.original_cell.colspan):
                                span_col = col_idx + span_offset
                                if span_col < self.col_count:
                                    header_content_columns.add(span_col)
                        break  # Found content, no need to check other header rows
                # Also preserve columns that are spanned (part of a colspan)
                elif cell.is_spanned:
                    # This column is part of a header's colspan
                    text = cell.original_cell.text().strip()
                    if text:
                        header_content_columns.add(col_idx)

    # Find columns with data (skip header rows)
    # Count actual header rows by checking for non-data content
    actual_header_rows = 0
    for row_idx in range(min(3, self.row_count)):
        has_numeric_data = False
        for col_idx in range(self.col_count):
            cell = self.matrix[row_idx][col_idx]
            if cell.original_cell and not cell.is_spanned:
                text = cell.original_cell.text().strip()
                # Check if it looks like numeric data (has commas or starts with $)
                # NOTE(review): due to precedence this reads as
                # "(text and has-comma-digits) or text == '$'"; equivalent to
                # the intended grouping here since '$' is truthy — confirm.
                if text and (',' in text and any(c.isdigit() for c in text)) or text == '$':
                    has_numeric_data = True
                    break
        if has_numeric_data:
            break
        actual_header_rows += 1

    # At least one header row is always assumed.
    data_start_row = max(1, actual_header_rows)

    # Track columns with significant data (not just isolated cells)
    # NOTE(review): col_data_count is populated but never read afterwards.
    col_data_count = {}
    for row_idx in range(data_start_row, self.row_count):
        for col_idx in range(self.col_count):
            cell = self.matrix[row_idx][col_idx]
            if cell.original_cell and not cell.is_spanned:
                if cell.original_cell.text().strip():
                    data_cols.add(col_idx)
                    col_data_count[col_idx] = col_data_count.get(col_idx, 0) + 1

    # Build initial list of columns to keep
    # Always include column 0 if it contains row labels
    cols_to_keep = set(primary_header_cols)

    # Add columns with header content (prevents removing legitimate headers)
    cols_to_keep.update(header_content_columns)

    # Identify misaligned data columns that need to be consolidated
    # These are data columns that are not primary header columns
    misaligned_data_cols = data_cols - primary_header_cols

    # Map misaligned data columns to their nearest column for consolidation
    # Only consolidate directly adjacent columns with specific patterns
    consolidation_map = {}

    # First pass: identify all potential consolidations
    potential_consolidations = {}
    for data_col in sorted(misaligned_data_cols):
        # Check if this column should be consolidated with an adjacent column
        # Check the column immediately before this one
        prev_col = data_col - 1

        # Sample some cells to see if consolidation makes sense
        consolidation_type = None

        # Sample at most 10 data rows; the first matching pattern decides.
        for row_idx in range(data_start_row, min(data_start_row + 10, self.row_count)):
            prev_cell = self.matrix[row_idx][prev_col] if prev_col >= 0 else None
            curr_cell = self.matrix[row_idx][data_col]

            if prev_cell and prev_cell.original_cell and curr_cell.original_cell:
                prev_text = prev_cell.original_cell.text().strip()
                curr_text = curr_cell.original_cell.text().strip()

                # Skip empty cells
                if not prev_text or not curr_text:
                    continue

                # Check for patterns that indicate consolidation
                if prev_text == '$' and curr_text and curr_text[0].isdigit():
                    consolidation_type = 'currency'
                    break
                elif prev_text.startswith('(') and curr_text == ')':
                    consolidation_type = 'parentheses'
                    break
                elif curr_text == '%' and prev_text and prev_text[-1].isdigit():
                    consolidation_type = 'percentage'
                    break

        if consolidation_type:
            potential_consolidations[data_col] = (prev_col, consolidation_type)

    # Second pass: resolve conflicts
    # If column Y is a target for consolidation from Y+1 (e.g., parentheses),
    # then don't consolidate Y into another column
    columns_needed_as_targets = set()
    for data_col, (target_col, cons_type) in potential_consolidations.items():
        if cons_type == 'parentheses':
            # This target column is needed for parentheses consolidation
            columns_needed_as_targets.add(target_col)

    # Build final consolidation map, skipping consolidations that would remove needed targets
    for data_col, (target_col, cons_type) in potential_consolidations.items():
        # Don't consolidate this column if it's needed as a target for parentheses
        if data_col in columns_needed_as_targets and cons_type != 'parentheses':
            continue

        # CRITICAL: Don't consolidate columns that have header content
        # This prevents legitimate header columns from being merged together
        if data_col in header_content_columns or target_col in header_content_columns:
            continue

        consolidation_map[data_col] = target_col
        # Debug: uncomment to see consolidation mapping
        # import os
        # if os.environ.get('DEBUG_TABLE_CONSOLIDATION'):
        #     print(f"Consolidating column {data_col} into {target_col}")

    # Special case: Keep data columns that are associated with header columns
    # This handles cases where headers span multiple columns but data is in specific columns
    for header_col in primary_header_cols:
        # Check if there's a data column immediately after the header column
        # This is common when headers span multiple columns
        for offset in range(1, 3):  # Check next 1-2 columns
            data_col = header_col + offset
            if data_col in data_cols and data_col not in cols_to_keep:
                # Check if this column has meaningful data
                has_data = False
                for row_idx in range(data_start_row, min(data_start_row + 5, self.row_count)):
                    cell = self.matrix[row_idx][data_col]
                    if cell.original_cell and not cell.is_spanned:
                        text = cell.original_cell.text().strip()
                        # Dash variants are treated as placeholders, not data.
                        if text and text not in ['', '-', '—', '–']:
                            has_data = True
                            break
                if has_data:
                    cols_to_keep.add(data_col)

    # Keep data columns that have significant content but aren't near header columns
    # This includes columns with dates, text descriptions, etc.
    for col_idx in data_cols:
        if col_idx not in cols_to_keep:
            # Check if this column has important data
            has_important_data = False
            non_empty_count = 0
            # NOTE(review): text_samples is collected (first 3 values) but
            # never used beyond limiting its own length — candidate cleanup.
            text_samples = []

            for row_idx in range(data_start_row, min(data_start_row + 10, self.row_count)):
                cell = self.matrix[row_idx][col_idx]
                if cell.original_cell and not cell.is_spanned:
                    text = cell.original_cell.text().strip()
                    if text and text not in ['', '-', '—', '–']:
                        non_empty_count += 1
                        if len(text_samples) < 3:
                            text_samples.append(text)

                        # Check for important patterns
                        # Dates, years, text descriptions, etc.
                        if any([
                            len(text) > 3 and not text.replace(',', '').replace('.', '').isdigit(),  # Non-trivial text
                            any(month in text for month in ['January', 'February', 'March', 'April', 'May', 'June',
                                                            'July', 'August', 'September', 'October', 'November', 'December']),
                            any(month in text for month in ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                                                            'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']),
                            '20' in text and any(c.isdigit() for c in text),  # Likely contains year
                        ]):
                            has_important_data = True

            # Keep columns with consistent important data
            if has_important_data and non_empty_count >= 3:
                cols_to_keep.add(col_idx)

    # Special case: If we have very few primary headers but lots of data columns,
    # we might have a table where headers are in data rows (like years)
    # Keep columns that have significant financial data
    if len(primary_header_cols) <= 2 and len(data_cols) > 4:
        # Check for financial data patterns in columns
        for col_idx in data_cols:
            has_financial_data = False
            sample_count = 0

            # Sample a few cells from this column
            for row_idx in range(data_start_row, min(data_start_row + 5, self.row_count)):
                cell = self.matrix[row_idx][col_idx]
                if cell.original_cell and not cell.is_spanned:
                    text = cell.original_cell.text().strip()
                    if text:
                        sample_count += 1
                        # Check for financial patterns
                        if any([
                            text.startswith('(') and any(c.isdigit() for c in text),  # Negative numbers
                            text == ')' and col_idx > 0,  # Closing parenthesis
                            '$' in text,  # Currency
                            '%' in text,  # Percentages
                            text.replace(',', '').replace('.', '').isdigit(),  # Plain numbers
                            text in ['—', '–', '-', '*']  # Common placeholders
                        ]):
                            has_financial_data = True
                            break

            # Keep columns with financial data
            if has_financial_data and sample_count > 0:
                cols_to_keep.add(col_idx)

    # Check if column 0 contains row labels (non-empty cells in data rows)
    col_0_has_labels = False
    # NOTE(review): data_start_row is recomputed here with the same value
    # as above — redundant but harmless.
    data_start_row = max(1, actual_header_rows)
    for row_idx in range(data_start_row, self.row_count):
        cell = self.matrix[row_idx][0]
        if cell.original_cell and not cell.is_spanned:
            text = cell.original_cell.text().strip()
            if text and not text.isdigit() and not text.startswith('$') and len(text) > 1:
                col_0_has_labels = True
                break

    # Include column 0 if it has labels
    if col_0_has_labels:
        cols_to_keep.add(0)

    # Remove columns that will be consolidated into other columns
    # These columns' data will be merged into their target columns
    cols_to_remove = set(consolidation_map.keys())
    cols_to_keep = cols_to_keep - cols_to_remove

    cols_to_keep = sorted(cols_to_keep)

    # Create new matrix with consolidated columns
    if not cols_to_keep:
        return self

    new_matrix = TableMatrix()
    new_matrix.row_count = self.row_count
    new_matrix.col_count = len(cols_to_keep)
    new_matrix.header_row_count = self.header_row_count  # Preserve header row count
    new_matrix.matrix = []

    # Create mapping from old to new column indices
    old_to_new = {old_col: new_idx for new_idx, old_col in enumerate(cols_to_keep)}

    # Build new matrix with consolidation
    for row_idx in range(self.row_count):
        new_row = [MatrixCell() for _ in range(new_matrix.col_count)]

        # Track which cells we've already placed to handle colspan properly
        placed_origins = {}  # Maps (row_origin, col_origin) to new column index

        # First, copy cells from kept columns
        for old_col in sorted(cols_to_keep):
            if old_col not in old_to_new:
                continue
            new_col = old_to_new[old_col]
            cell = self.matrix[row_idx][old_col]
            if cell.original_cell:
                origin_key = (cell.row_origin, cell.col_origin)

                # Check if we've already placed this cell (due to colspan)
                if origin_key in placed_origins:
                    # This is a continuation of a colspan - mark as spanned
                    new_row[new_col] = MatrixCell(
                        original_cell=cell.original_cell,
                        is_spanned=True,  # Mark as spanned since it's part of a colspan
                        row_origin=cell.row_origin,
                        col_origin=placed_origins[origin_key]  # Point to the original placement
                    )
                else:
                    # First occurrence of this cell - place normally
                    new_row[new_col] = MatrixCell(
                        original_cell=cell.original_cell,
                        is_spanned=False,  # This is the primary cell
                        row_origin=cell.row_origin,
                        col_origin=new_col
                    )
                    placed_origins[origin_key] = new_col

        # Then, consolidate misaligned data into header columns
        for data_col, header_col in consolidation_map.items():
            if header_col in old_to_new:
                new_col = old_to_new[header_col]
                data_cell = self.matrix[row_idx][data_col] if data_col < len(self.matrix[row_idx]) else None


                # If data cell has content, merge it with header column
                if data_cell and data_cell.original_cell and not data_cell.is_spanned:
                    # Skip empty data cells
                    if not data_cell.original_cell.text().strip():
                        continue
                    # Check the original header column cell to see if it has content to merge
                    header_cell = self.matrix[row_idx][header_col]
                    # NOTE(review): existing_cell is assigned but never used.
                    existing_cell = new_row[new_col]

                    # Check if we need to merge (e.g., $ with value)
                    if header_cell.original_cell and header_cell.original_cell.text().strip():
                        existing_text = header_cell.original_cell.text().strip()
                        new_text = data_cell.original_cell.text().strip()


                        # Merge currency symbol with value OR value with percentage OR parentheses
                        if existing_text == '$' and new_text:
                            # Currency merge: $ + number
                            merged_text = f"${new_text}"
                            # Create new cell with merged content
                            merged_cell = Cell(
                                content=merged_text,
                                colspan=header_cell.original_cell.colspan,
                                rowspan=header_cell.original_cell.rowspan,
                                is_header=header_cell.original_cell.is_header,
                                align=data_cell.original_cell.align if hasattr(data_cell.original_cell, 'align') else None
                            )
                            new_row[new_col] = MatrixCell(
                                original_cell=merged_cell,
                                is_spanned=False,
                                row_origin=row_idx,
                                col_origin=new_col
                            )
                        elif new_text == ')' and existing_text.startswith('('):
                            # Parentheses merge: (number + )
                            merged_text = f"{existing_text})"
                            # Create new cell with merged content
                            merged_cell = Cell(
                                content=merged_text,
                                colspan=header_cell.original_cell.colspan,
                                rowspan=header_cell.original_cell.rowspan,
                                is_header=header_cell.original_cell.is_header,
                                align=data_cell.original_cell.align if hasattr(data_cell.original_cell, 'align') else None
                            )
                            new_row[new_col] = MatrixCell(
                                original_cell=merged_cell,
                                is_spanned=False,
                                row_origin=row_idx,
                                col_origin=new_col
                            )
                        elif new_text == '%' and existing_text:
                            # Percentage merge: number + %
                            merged_text = f"{existing_text}%"
                            # Create new cell with merged content
                            # NOTE(review): this branch takes align from
                            # header_cell while the '$' and ')' branches use
                            # data_cell — confirm the asymmetry is intended.
                            merged_cell = Cell(
                                content=merged_text,
                                colspan=header_cell.original_cell.colspan,
                                rowspan=header_cell.original_cell.rowspan,
                                is_header=header_cell.original_cell.is_header,
                                align=header_cell.original_cell.align if hasattr(header_cell.original_cell, 'align') else None
                            )
                            new_row[new_col] = MatrixCell(
                                original_cell=merged_cell,
                                is_spanned=False,
                                row_origin=row_idx,
                                col_origin=new_col
                            )
                        else:
                            # Just keep the data cell if can't merge
                            new_row[new_col] = MatrixCell(
                                original_cell=data_cell.original_cell,
                                is_spanned=False,
                                row_origin=row_idx,
                                col_origin=new_col
                            )
                    else:
                        # No existing content, just move the data
                        new_row[new_col] = MatrixCell(
                            original_cell=data_cell.original_cell,
                            is_spanned=False,
                            row_origin=row_idx,
                            col_origin=new_col
                        )

        new_matrix.matrix.append(new_row)

    return new_matrix
|
||||
|
||||
def to_cell_grid(self) -> List[List[Optional[Cell]]]:
    """
    Flatten the matrix into a plain 2D grid.

    Returns:
        row_count × col_count list of lists; each position holds the origin
        Cell, or None for spanned continuations and empty positions.
    """
    grid: List[List[Optional[Cell]]] = []

    for r in range(self.row_count):
        flattened_row: List[Optional[Cell]] = []
        for c in range(self.col_count):
            entry = self.matrix[r][c]
            is_origin = bool(entry.original_cell) and not entry.is_spanned
            flattened_row.append(entry.original_cell if is_origin else None)
        grid.append(flattened_row)

    return grid
|
||||
|
||||
def debug_print(self):
    """Print the matrix structure for debugging.

    Each row is printed as ' | '-separated entries:
      - plain text for an origin cell (truncated to 10 characters),
      - '[text]' for a spanned continuation of a colspan/rowspan cell,
      - '___' for an empty position.

    Fix: the ellipsis is now appended only when the text was actually
    truncated; previously '...' was appended unconditionally, making short
    values look cut off.
    """
    print(f"Matrix: {self.row_count}×{self.col_count}")

    for row_idx in range(self.row_count):
        row_str = []
        for col_idx in range(self.col_count):
            cell = self.matrix[row_idx][col_idx]
            if cell.original_cell:
                full_text = cell.original_cell.text()
                # Show '...' only when something was actually cut off.
                text = full_text[:10] + ('...' if len(full_text) > 10 else '')
                if cell.is_spanned:
                    row_str.append(f"[{text}]")
                else:
                    row_str.append(text)
            else:
                row_str.append("___")
        print(f"Row {row_idx}: {' | '.join(row_str)}")
|
||||
|
||||
|
||||
class ColumnAnalyzer:
    """Analyze column structure to identify data vs spacing columns"""

    def __init__(self, matrix: TableMatrix):
        """Keep a reference to the matrix under analysis."""
        self.matrix = matrix

    def identify_spacing_columns(self) -> List[int]:
        """
        Find columns used purely for visual spacing.

        Returns:
            Indices of columns with no visible content anywhere.
        """
        widths = self.matrix.get_column_widths()
        total_width = sum(widths)
        return [
            col
            for col in range(self.matrix.col_count)
            if self._is_spacing_column(col, widths, total_width)
        ]

    def _is_spacing_column(self, col_idx: int, widths: List[float], total_width: float) -> bool:
        """
        Decide whether a column is a spacing column.

        A column qualifies only when it is completely empty — no origin cell
        in any row has non-whitespace text. The `widths`/`total_width`
        parameters are accepted for interface compatibility; the current
        decision depends only on cell content.
        """
        for row_idx in range(self.matrix.row_count):
            entry = self.matrix.matrix[row_idx][col_idx]
            if entry.original_cell and not entry.is_spanned:
                if entry.original_cell.text().strip():
                    # Any visible text disqualifies the column as spacing.
                    return False
        return True

    def get_clean_column_indices(self) -> List[int]:
        """
        List the columns that are NOT spacing columns.

        Returns:
            Indices of columns that contain actual data.
        """
        spacing = set(self.identify_spacing_columns())
        return [col for col in range(self.matrix.col_count) if col not in spacing]
|
||||
@@ -0,0 +1,440 @@
|
||||
"""
|
||||
Table of Contents analyzer for SEC filings.
|
||||
|
||||
This module analyzes the TOC structure to map section names to anchor IDs,
|
||||
enabling section extraction for API filings with generated anchor IDs.
|
||||
"""
|
||||
import re
|
||||
from typing import Dict, List, Optional, Set, Tuple
|
||||
from dataclasses import dataclass
|
||||
from lxml import html as lxml_html
|
||||
|
||||
|
||||
@dataclass
class TOCSection:
    """Represents a section found in the Table of Contents.

    Built by TOCAnalyzer.analyze_toc_structure for each qualifying TOC link
    and later folded into the name -> anchor mapping.
    """
    # Raw link text exactly as it appears in the TOC.
    name: str
    # Link target id (the href with its leading '#' stripped).
    anchor_id: str
    # Canonical section name produced by _normalize_section_name.
    normalized_name: str
    section_type: str  # 'item', 'part', 'other'
    # Ordering key derived alongside section_type (see
    # _get_section_type_and_order).
    order: int
    part: Optional[str] = None  # NEW: "Part I", "Part II", or None for 10-K
|
||||
|
||||
|
||||
class TOCAnalyzer:
|
||||
"""
|
||||
Analyzes Table of Contents structure to map section names to anchor IDs.
|
||||
|
||||
This enables section extraction for filings where anchor IDs are generated
|
||||
rather than semantic (like API filings vs local HTML files).
|
||||
"""
|
||||
|
||||
def __init__(self):
    # SEC section patterns for normalization.
    # Each entry is (regex, section kind). The regexes are matched
    # case-insensitively against TOC link text in _is_section_link to decide
    # whether a link refers to a filing section.
    self.section_patterns = [
        (r'(?:item|part)\s+\d+[a-z]?', 'item'),
        (r'business', 'item'),
        (r'risk\s+factors?', 'item'),
        (r'properties', 'item'),
        (r'legal\s+proceedings', 'item'),
        (r'management.*discussion', 'item'),
        (r'md&a', 'item'),
        (r'financial\s+statements?', 'item'),
        (r'exhibits?', 'item'),
        (r'signatures?', 'item'),
        (r'part\s+[ivx]+', 'part'),
    ]
|
||||
|
||||
def analyze_toc_structure(self, html_content: str) -> Dict[str, str]:
    """
    Analyze HTML content to extract section mappings from TOC.

    Walks every internal anchor link, tracks the current "Part" context
    (10-Q style TOCs), filters links through _is_section_link, verifies the
    anchor target exists in the document, and finally builds a mapping via
    _build_section_mapping.

    Args:
        html_content: Raw HTML content

    Returns:
        Dict mapping normalized section names to anchor IDs; empty on any
        parsing error (callers fall back to other extraction methods).
    """
    section_mapping = {}

    try:
        # Handle XML declaration issues:
        # lxml.html.fromstring rejects input carrying an <?xml ...?> prolog,
        # so strip the first one before parsing.
        if html_content.startswith('<?xml'):
            html_content = re.sub(r'<\?xml[^>]*\?>', '', html_content, count=1)

        tree = lxml_html.fromstring(html_content)

        # Find all anchor links that could be TOC links
        anchor_links = tree.xpath('//a[@href]')

        toc_sections = []
        current_part = None  # Track current part context for 10-Q filings
        part_pattern = re.compile(r'^\s*Part\s+([IVX]+)\b', re.IGNORECASE)

        for link in anchor_links:
            href = link.get('href', '').strip()
            text = (link.text_content() or '').strip()

            # Check if this link or its row represents a part header
            # Part headers in 10-Q TOCs typically appear as separate rows: "Part I", "Part II"
            part_match = part_pattern.match(text)
            if part_match:
                # Update current part context
                current_part = f"Part {part_match.group(1).upper()}"
                # Don't create a section for the part header itself
                continue

            # Look for internal anchor links
            if href.startswith('#') and text:
                anchor_id = href[1:]  # Remove #

                # Try to find item number in preceding context (for table-based TOCs)
                preceding_item = self._extract_preceding_item_label(link)

                # Check if this looks like a section reference (check text, anchor ID, and context)
                if self._is_section_link(text, anchor_id, preceding_item):
                    # Verify target exists — skip dangling links whose anchor
                    # has no matching id attribute anywhere in the document.
                    target_elements = tree.xpath(f'//*[@id="{anchor_id}"]')
                    if target_elements:
                        # Try to extract item number from: anchor ID > preceding context > text
                        normalized_name = self._normalize_section_name(text, anchor_id, preceding_item)
                        section_type, order = self._get_section_type_and_order(normalized_name)

                        toc_section = TOCSection(
                            name=text,
                            anchor_id=anchor_id,
                            normalized_name=normalized_name,
                            section_type=section_type,
                            order=order,
                            part=current_part  # Assign current part context
                        )
                        toc_sections.append(toc_section)

        # Build mapping prioritizing the most standard section names
        section_mapping = self._build_section_mapping(toc_sections)

    except Exception as e:
        # Return empty mapping on error - fallback to other methods.
        # NOTE(review): deliberate best-effort swallow; consider logging `e`
        # at debug level rather than discarding it silently.
        pass

    return section_mapping
|
||||
|
||||
def _extract_preceding_item_label(self, link_element) -> str:
    """
    Extract item/part label from preceding context.

    Handles table-based TOCs where item number is in a separate cell:
    <td>Item 1.</td><td><a href="...">Business</a></td>

    Also handles nested structures like:
    <td>Item 1.</td><td><div><span><a href="...">Business</a></span></div></td>

    Args:
        link_element: The <a> element (an lxml element supporting
            getparent/getprevious/text_content)

    Returns:
        Item label like "Item 1", "Item 1A", "Part I" or empty string
    """
    try:
        # Traverse up to find the containing <td> or <th> (up to 5 levels)
        current = link_element
        td_element = None

        for _ in range(5):
            parent = current.getparent()
            if parent is None:
                break

            if parent.tag in ['td', 'th']:
                td_element = parent
                break

            current = parent

        # If we found a <td>, check ALL preceding siblings in the row
        # This handles TOCs where item number is not in the immediately adjacent cell
        # Example: ['Business', 'I', '1', '5'] where '1' is the item number
        if td_element is not None:
            # Check all preceding siblings (rightmost to leftmost);
            # the first cell matching any label pattern wins.
            prev_sibling = td_element.getprevious()
            while prev_sibling is not None:
                if prev_sibling.tag in ['td', 'th']:
                    prev_text = (prev_sibling.text_content() or '').strip()

                    # Look for "Item X" or just "X" (bare number) pattern
                    # Match full format: "Item 1A"
                    item_match = re.match(r'(Item\s+\d+[A-Z]?)\.?\s*$', prev_text, re.IGNORECASE)
                    if item_match:
                        return item_match.group(1)

                    # Match bare item number: "1A" or "1" (only valid 10-K item numbers: 1-15)
                    # This prevents page numbers (50, 108, etc.) from being treated as items
                    bare_item_match = re.match(r'^([1-9]|1[0-5])([A-Z]?)\.?\s*$', prev_text, re.IGNORECASE)
                    if bare_item_match:
                        item_num = bare_item_match.group(1)
                        item_letter = bare_item_match.group(2)
                        return f"Item {item_num}{item_letter}"

                    # Match part: "Part I" or just "I"
                    part_match = re.match(r'(Part\s+[IVX]+)\.?\s*$', prev_text, re.IGNORECASE)
                    if part_match:
                        return part_match.group(1)

                    # Match bare part: "I", "II", etc.
                    # (case-sensitive on purpose: lowercase roman numerals in
                    # running text should not be treated as part labels)
                    bare_part_match = re.match(r'^([IVX]+)\.?\s*$', prev_text)
                    if bare_part_match:
                        return f"Part {bare_part_match.group(1)}"

                # Non-matching (or non-cell) sibling: keep scanning leftwards.
                prev_sibling = prev_sibling.getprevious()

        # Also check immediate parent's text for inline patterns (div/span structures)
        parent = link_element.getparent()
        if parent is not None and parent.tag in ['div', 'span', 'p']:
            if parent.text:
                text_before = parent.text.strip()
                item_match = re.search(r'(Item\s+\d+[A-Z]?)\.?\s*$', text_before, re.IGNORECASE)
                if item_match:
                    return item_match.group(1)

                part_match = re.search(r'(Part\s+[IVX]+)\.?\s*$', text_before, re.IGNORECASE)
                if part_match:
                    return part_match.group(1)

    except Exception:
        # Best-effort extraction: any traversal failure falls through to
        # the "no label found" result below.
        pass

    return ''
|
||||
|
||||
def _is_section_link(self, text: str, anchor_id: str = '', preceding_item: str = '') -> bool:
|
||||
"""
|
||||
Check if link represents a section reference.
|
||||
|
||||
Checks link text, anchor ID, and preceding context to handle cases where:
|
||||
- Text is descriptive (e.g., "Executive Compensation")
|
||||
- Anchor ID contains item number (e.g., "item_11_executive_compensation")
|
||||
- Item number is in preceding table cell (e.g., <td>Item 1.</td><td><a>Business</a></td>)
|
||||
|
||||
Args:
|
||||
text: Link text
|
||||
anchor_id: Anchor ID from href (without #)
|
||||
preceding_item: Item/part label from preceding context (e.g., "Item 1A")
|
||||
|
||||
Returns:
|
||||
True if this appears to be a section link
|
||||
"""
|
||||
if not text:
|
||||
return False
|
||||
|
||||
# First check if there's a preceding item label (table-based TOC)
|
||||
if preceding_item:
|
||||
return True
|
||||
|
||||
# Then check anchor ID for item/part patterns (most reliable)
|
||||
if anchor_id:
|
||||
anchor_lower = anchor_id.lower()
|
||||
# Match patterns like: item_1, item_1a, item1, item1a, part_i, part_ii, etc.
|
||||
if re.search(r'item_?\d+[a-z]?', anchor_lower):
|
||||
return True
|
||||
if re.search(r'part_?[ivx]+', anchor_lower):
|
||||
return True
|
||||
|
||||
# Then check text (with relaxed length limit for descriptive section names)
|
||||
if len(text) > 150: # Increased from 100 to accommodate longer section titles
|
||||
return False
|
||||
|
||||
# Check against known patterns
|
||||
for pattern, _ in self.section_patterns:
|
||||
if re.search(pattern, text, re.IGNORECASE):
|
||||
return True
|
||||
|
||||
# Also consider links with section keywords
|
||||
if len(text) < 100 and any(keyword in text.lower() for keyword in
|
||||
['item', 'part', 'business', 'risk', 'properties', 'legal',
|
||||
'compensation', 'ownership', 'governance', 'directors']):
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
def _normalize_section_name(self, text: str, anchor_id: str = '', preceding_item: str = '') -> str:
|
||||
"""
|
||||
Normalize section name for consistent lookup.
|
||||
|
||||
Prioritizes:
|
||||
1. Preceding item label (table-based TOC)
|
||||
2. Anchor ID pattern
|
||||
3. Text-based normalization
|
||||
|
||||
Args:
|
||||
text: Link text
|
||||
anchor_id: Anchor ID from href (without #)
|
||||
preceding_item: Item/part label from preceding context
|
||||
|
||||
Returns:
|
||||
Normalized section name (e.g., "Item 1A", "Part II")
|
||||
"""
|
||||
text = text.strip()
|
||||
|
||||
# HIGHEST PRIORITY: Use preceding item label if available (table-based TOC)
|
||||
if preceding_item:
|
||||
# Clean up and normalize the preceding item
|
||||
item_match = re.match(r'item\s+(\d+[a-z]?)', preceding_item, re.IGNORECASE)
|
||||
if item_match:
|
||||
return f"Item {item_match.group(1).upper()}"
|
||||
|
||||
part_match = re.match(r'part\s+([ivx]+)', preceding_item, re.IGNORECASE)
|
||||
if part_match:
|
||||
return f"Part {part_match.group(1).upper()}"
|
||||
|
||||
# SECOND PRIORITY: Try to extract from anchor ID
|
||||
if anchor_id:
|
||||
anchor_lower = anchor_id.lower()
|
||||
|
||||
# Match item patterns: item_1a, item1a, item_1_business, etc.
|
||||
item_match = re.search(r'item_?(\d+[a-z]?)', anchor_lower)
|
||||
if item_match:
|
||||
item_num = item_match.group(1).upper()
|
||||
return f"Item {item_num}"
|
||||
|
||||
# Match part patterns: part_i, part_ii, parti, partii, etc.
|
||||
part_match = re.search(r'part_?([ivx]+)', anchor_lower)
|
||||
if part_match:
|
||||
part_num = part_match.group(1).upper()
|
||||
return f"Part {part_num}"
|
||||
|
||||
# THIRD PRIORITY: Text-based normalization
|
||||
# Handle common Item patterns in text
|
||||
item_match = re.match(r'item\s+(\d+[a-z]?)', text, re.IGNORECASE)
|
||||
if item_match:
|
||||
return f"Item {item_match.group(1).upper()}"
|
||||
|
||||
# Handle Part patterns
|
||||
part_match = re.match(r'part\s+([ivx]+)', text, re.IGNORECASE)
|
||||
if part_match:
|
||||
return f"Part {part_match.group(1).upper()}"
|
||||
|
||||
# Handle specific known sections by text
|
||||
text_lower = text.lower()
|
||||
if 'business' in text_lower and 'item' not in text_lower:
|
||||
return "Item 1"
|
||||
elif 'risk factors' in text_lower and 'item' not in text_lower:
|
||||
return "Item 1A"
|
||||
elif 'properties' in text_lower and 'item' not in text_lower:
|
||||
return "Item 2"
|
||||
elif 'legal proceedings' in text_lower and 'item' not in text_lower:
|
||||
return "Item 3"
|
||||
elif 'management' in text_lower and 'discussion' in text_lower:
|
||||
return "Item 7"
|
||||
elif 'financial statements' in text_lower:
|
||||
return "Item 8"
|
||||
elif 'exhibits' in text_lower:
|
||||
return "Item 15"
|
||||
|
||||
return text # Return as-is if no normalization applies
|
||||
|
||||
def _get_section_type_and_order(self, text: str) -> Tuple[str, int]:
|
||||
"""Get section type and order for sorting."""
|
||||
text_lower = text.lower()
|
||||
|
||||
# Items
|
||||
item_match = re.search(r'item\s*(\d+)([a-z]?)', text_lower)
|
||||
if item_match:
|
||||
item_num = int(item_match.group(1))
|
||||
item_letter = item_match.group(2) or ''
|
||||
# Order: Item 1=1000, Item 1A=1001, Item 2=2000, etc.
|
||||
order = item_num * 1000 + (ord(item_letter.upper()) - ord('A') + 1 if item_letter else 0)
|
||||
return 'item', order
|
||||
|
||||
# Parts
|
||||
part_match = re.search(r'part\s*([ivx]+)', text_lower)
|
||||
if part_match:
|
||||
part_roman = part_match.group(1)
|
||||
part_num = self._roman_to_int(part_roman)
|
||||
return 'part', part_num * 100 # Part I=100, Part II=200, etc.
|
||||
|
||||
# Known sections without explicit item numbers
|
||||
if 'business' in text_lower:
|
||||
return 'item', 1000 # Item 1
|
||||
elif 'risk factors' in text_lower:
|
||||
return 'item', 1001 # Item 1A
|
||||
elif 'properties' in text_lower:
|
||||
return 'item', 2000 # Item 2
|
||||
elif 'legal proceedings' in text_lower:
|
||||
return 'item', 3000 # Item 3
|
||||
elif 'management' in text_lower and 'discussion' in text_lower:
|
||||
return 'item', 7000 # Item 7
|
||||
elif 'financial statements' in text_lower:
|
||||
return 'item', 8000 # Item 8
|
||||
elif 'exhibits' in text_lower:
|
||||
return 'item', 15000 # Item 15
|
||||
|
||||
return 'other', 99999
|
||||
|
||||
def _roman_to_int(self, roman: str) -> int:
|
||||
"""Convert roman numerals to integers."""
|
||||
roman_map = {'i': 1, 'v': 5, 'x': 10, 'l': 50, 'c': 100, 'd': 500, 'm': 1000}
|
||||
roman = roman.lower()
|
||||
result = 0
|
||||
prev = 0
|
||||
|
||||
for char in reversed(roman):
|
||||
value = roman_map.get(char, 0)
|
||||
if value < prev:
|
||||
result -= value
|
||||
else:
|
||||
result += value
|
||||
prev = value
|
||||
|
||||
return result
|
||||
|
||||
def _build_section_mapping(self, toc_sections: List[TOCSection]) -> Dict[str, str]:
    """Build the final section-name -> anchor-id mapping, de-duplicating entries.

    Sections are processed in document order (the input list is sorted
    in place by each section's computed order) and the first occurrence
    of every name wins. For 10-Q filings with part context, keys are
    part-aware (e.g. "part_i_item_1" vs "part_ii_item_1") so identical
    item numbers in different parts stay distinct; 10-K sections use
    the normalized name directly.
    """
    # NOTE: sorts the caller's list in place, matching original behavior.
    toc_sections.sort(key=lambda s: s.order)

    mapping = {}
    for section in toc_sections:
        if section.part:
            # "Part I" + "Item 1A" -> "part_i_item_1a"
            part_key = section.part.lower().replace(' ', '_')
            item_key = section.normalized_name.lower().replace(' ', '_')
            key = f"{part_key}_{item_key}"
        else:
            # 10-K filings: use the normalized name as-is.
            key = section.normalized_name

        # First occurrence wins; later duplicates are ignored.
        if key not in mapping:
            mapping[key] = section.anchor_id

    return mapping
|
||||
|
||||
def get_section_suggestions(self, html_content: str) -> List[str]:
    """Return the names of sections available for extraction, in document order."""
    section_map = self.analyze_toc_structure(html_content)
    return sorted(section_map, key=lambda name: self._get_section_type_and_order(name)[1])
|
||||
|
||||
|
||||
def analyze_toc_for_sections(html_content: str) -> Dict[str, str]:
    """
    Analyze a filing's table of contents and return its section mapping.

    Thin convenience wrapper around TOCAnalyzer.analyze_toc_structure.

    Args:
        html_content: Raw HTML content

    Returns:
        Dict mapping section names to anchor IDs
    """
    return TOCAnalyzer().analyze_toc_structure(html_content)
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user