"""
Configuration for the HTML parser.
"""

from dataclasses import asdict, dataclass, field
from typing import Any, Dict, List, Optional


@dataclass
class DetectionThresholds:
    """
    Configurable thresholds for section detection strategies.

    Attributes:
        min_confidence: Minimum confidence score to include a section (0.0-1.0)
        cross_validation_boost: Multiplier when multiple methods agree (>1.0)
        disagreement_penalty: Multiplier when methods disagree (<1.0)
        boundary_overlap_penalty: Multiplier for overlapping sections (<1.0)
        enable_cross_validation: Whether to run cross-validation (slower but
            more accurate)
        thresholds_by_form: Filing-specific threshold overrides

    The ranges listed above are documentation only: this class performs no
    validation and stores whatever values it is given.
    """

    min_confidence: float = 0.6
    cross_validation_boost: float = 1.2
    disagreement_penalty: float = 0.8
    boundary_overlap_penalty: float = 0.9
    enable_cross_validation: bool = False  # Disabled by default for performance

    # default_factory gives each instance its own dict, so overrides added to
    # one instance never leak into another.
    # NOTE(review): presumably keyed by form type (e.g. '10-K') mapping to
    # {threshold name: value} -- confirm against the code that reads it.
    thresholds_by_form: Dict[str, Dict[str, float]] = field(default_factory=dict)


@dataclass
class ParserConfig:
    """
    Configuration for HTML parser.

    Attributes:
        max_document_size: Maximum document size in bytes
        streaming_threshold: Document size threshold for streaming mode
        cache_size: Maximum number of cached items
        enable_parallel: Enable parallel processing for tables
        max_workers: Worker count for parallel processing; None means use
            the CPU count
        strict_mode: Fail on parsing errors vs. best effort
        extract_xbrl: Extract inline XBRL facts
        extract_styles: Extract and process CSS styles
        preserve_whitespace: Preserve original whitespace
        normalize_text: Text normalization toggle (exact normalization is
            defined by the parser, not in this module)
        extract_links: Toggle for link extraction
        extract_images: Toggle for image extraction
        optimize_for_ai: Enable AI-specific optimizations
        max_token_estimation: Maximum estimated tokens for AI optimization
        chunk_size: Chunk size used for AI optimization (unit -- tokens or
            characters -- is not stated in this module; confirm in the consumer)
        chunk_overlap: Overlap between consecutive chunks, same unit as
            chunk_size
        table_extraction: Toggle for table extraction
        detect_table_types: Toggle for table type detection
        extract_table_relationships: Toggle for table relationship extraction
        fast_table_rendering: Use the fast table renderer
        detect_sections: Toggle for section detection
        eager_section_extraction: Extract sections during parsing instead of
            on first access (default is lazy)
        form: Form type, required for section detection
            (e.g. '10-K', '10-Q', '8-K')
        detection_thresholds: Thresholds for section detection strategies
        section_patterns: Regex pattern strings per section key
        features: Feature flags for optional functionality
        header_detection_threshold: Minimum confidence for header detection
        header_detection_methods: Names of the header detection methods to use
        min_text_length: Minimum text length to keep
        merge_adjacent_nodes: Toggle for merging adjacent nodes
        merge_distance: Max distance between nodes to merge
        enable_profiling: Toggle for profiling
        log_performance: Toggle for performance logging

    Field order is part of the interface (positional construction), so new
    fields should be appended rather than inserted.
    """

    # Performance settings
    max_document_size: int = 100 * 1024 * 1024  # 100MB (handles large filings like JPM)
    streaming_threshold: int = 10 * 1024 * 1024  # 10MB
    cache_size: int = 1000
    enable_parallel: bool = True
    max_workers: Optional[int] = None  # None = use CPU count

    # Parsing settings
    strict_mode: bool = False
    extract_xbrl: bool = True
    extract_styles: bool = True
    preserve_whitespace: bool = False
    normalize_text: bool = True
    extract_links: bool = True
    extract_images: bool = False

    # AI optimization
    optimize_for_ai: bool = True
    max_token_estimation: int = 100_000
    chunk_size: int = 512
    chunk_overlap: int = 128

    # Table processing
    table_extraction: bool = True
    detect_table_types: bool = True
    extract_table_relationships: bool = True
    fast_table_rendering: bool = True  # Fast renderer is now production-ready (7-10x faster than Rich)

    # Section detection
    detect_sections: bool = True
    eager_section_extraction: bool = False  # Extract sections during parsing vs. on first access (default: lazy)
    form: Optional[str] = None  # Required for section detection (e.g. '10-K', '10-Q', '8-K')
    detection_thresholds: DetectionThresholds = field(default_factory=DetectionThresholds)
    # Patterns are written in lower case.
    # NOTE(review): presumably matched case-insensitively -- confirm in the
    # section detector.
    section_patterns: Dict[str, List[str]] = field(default_factory=lambda: {
        'business': [
            r'item\s+1\.?\s*business',
            r'business\s+overview',
            r'our\s+business'
        ],
        'risk_factors': [
            r'item\s+1a\.?\s*risk\s+factors',
            r'risk\s+factors',
            r'factors\s+that\s+may\s+affect'
        ],
        'properties': [
            r'item\s+2\.?\s*properties',
            r'properties'
        ],
        'legal_proceedings': [
            r'item\s+3\.?\s*legal\s+proceedings',
            r'legal\s+proceedings',
            r'litigation'
        ],
        'mda': [
            r'item\s+7\.?\s*management\'?s?\s+discussion',
            r'md&a',
            r'management\'?s?\s+discussion\s+and\s+analysis'
        ],
        'financial_statements': [
            r'item\s+8\.?\s*financial\s+statements',
            r'consolidated\s+financial\s+statements',
            r'financial\s+statements'
        ]
    })

    # Feature flags
    features: Dict[str, bool] = field(default_factory=lambda: {
        'ml_header_detection': True,
        'semantic_analysis': True,
        'table_understanding': True,
        'xbrl_validation': True,
        'auto_section_detection': True,
        'smart_text_extraction': True,
        'footnote_linking': True,
        'cross_reference_resolution': True
    })

    # Header detection settings
    header_detection_threshold: float = 0.6  # Minimum confidence
    header_detection_methods: List[str] = field(default_factory=lambda: [
        'style',
        'pattern',
        'structural',
        'contextual'
    ])

    # Text extraction settings
    min_text_length: int = 10  # Minimum text length to keep
    merge_adjacent_nodes: bool = True
    merge_distance: int = 2  # Max distance between nodes to merge

    # Performance monitoring
    enable_profiling: bool = False
    log_performance: bool = False

    def to_dict(self) -> Dict[str, Any]:
        """
        Convert configuration to dictionary.

        Every field is included, keyed by field name. Nested dataclasses
        (detection_thresholds) become nested dicts, and dict/list values are
        copied, so mutating the result does not affect this configuration.

        Returns:
            Dictionary with one entry per configuration field.
        """
        # asdict walks the dataclass fields, so fields added later are
        # serialized automatically instead of being silently left out.
        return asdict(self)

    @classmethod
    def for_performance(cls) -> 'ParserConfig':
        """
        Create config optimized for performance.

        Returns:
            A ParserConfig with style and XBRL extraction turned off, a
            larger cache, and the listed feature flags set to False. All
            other fields keep their defaults.
        """
        return cls(
            extract_styles=False,
            extract_xbrl=False,
            enable_parallel=True,
            cache_size=5000,
            eager_section_extraction=False,  # Skip expensive section extraction
            fast_table_rendering=True,  # Fast renderer (enabled by default now)
            # This dict replaces the default feature map entirely, so flags
            # not listed here are absent from `features`, not False.
            # NOTE(review): confirm consumers read flags with .get().
            features={
                'ml_header_detection': False,
                'semantic_analysis': False,
                'table_understanding': False,
                'xbrl_validation': False
            }
        )

    @classmethod
    def for_accuracy(cls) -> 'ParserConfig':
        """
        Create config optimized for accuracy.

        Returns:
            A ParserConfig with strict mode on, style and XBRL extraction on,
            and every feature flag set to True. All other fields keep their
            defaults.
        """
        return cls(
            strict_mode=True,
            extract_styles=True,
            extract_xbrl=True,
            enable_parallel=True,
            features={
                'ml_header_detection': True,
                'semantic_analysis': True,
                'table_understanding': True,
                'xbrl_validation': True,
                'auto_section_detection': True,
                'smart_text_extraction': True,
                'footnote_linking': True,
                'cross_reference_resolution': True
            }
        )

    @classmethod
    def for_ai(cls) -> 'ParserConfig':
        """
        Create config optimized for AI/LLM processing.

        Returns:
            A ParserConfig with AI optimization, XBRL extraction, text
            normalization and node merging on, style extraction off, and the
            listed feature flags set to True. All other fields keep their
            defaults.
        """
        return cls(
            optimize_for_ai=True,
            extract_styles=False,
            extract_xbrl=True,
            normalize_text=True,
            merge_adjacent_nodes=True,
            # This dict replaces the default feature map entirely, so flags
            # not listed here are absent from `features`, not False.
            # NOTE(review): confirm consumers read flags with .get().
            features={
                'ml_header_detection': True,
                'semantic_analysis': True,
                'smart_text_extraction': True
            }
        )