# Source file: edgartools/venv/lib/python3.10/site-packages/edgar/documents/config.py
# Snapshot: 2025-12-09 12:13:01 +01:00 — 211 lines, 7.4 KiB, Python
"""
Configuration for the HTML parser.
"""
from dataclasses import dataclass, field
from typing import List, Dict, Any, Optional
@dataclass
class DetectionThresholds:
    """
    Tunable scoring knobs for the section-detection strategies.

    Attributes:
        min_confidence: Sections scoring below this value (0.0-1.0) are dropped.
        cross_validation_boost: Score multiplier applied when several detection
            methods agree on a section (>1.0).
        disagreement_penalty: Score multiplier applied when detection methods
            disagree (<1.0).
        boundary_overlap_penalty: Score multiplier applied to sections whose
            boundaries overlap another section (<1.0).
        enable_cross_validation: Run the cross-validation pass (slower but
            more accurate).
        thresholds_by_form: Per-filing-form overrides of these values,
            keyed by form type (e.g. '10-K').
    """
    min_confidence: float = 0.6
    cross_validation_boost: float = 1.2
    disagreement_penalty: float = 0.8
    boundary_overlap_penalty: float = 0.9
    # Off by default: cross-validation trades parsing speed for accuracy.
    enable_cross_validation: bool = False
    thresholds_by_form: Dict[str, Dict[str, float]] = field(default_factory=dict)
@dataclass
class ParserConfig:
    """
    Configuration for HTML parser.

    Attributes:
        max_document_size: Maximum document size in bytes
        streaming_threshold: Document size threshold for streaming mode
        cache_size: Maximum number of cached items
        enable_parallel: Enable parallel processing for tables
        max_workers: Worker count for parallel processing (None = CPU count)
        strict_mode: Fail on parsing errors vs. best effort
        extract_xbrl: Extract inline XBRL facts
        extract_styles: Extract and process CSS styles
        preserve_whitespace: Preserve original whitespace
        normalize_text: Normalize extracted text
        extract_links: Extract hyperlinks
        extract_images: Extract images
        optimize_for_ai: Enable AI-specific optimizations
        max_token_estimation: Maximum estimated tokens for AI optimization
        chunk_size: Chunk size (tokens) used for AI chunking
        chunk_overlap: Overlap (tokens) between adjacent chunks
        table_extraction: Extract tables
        detect_table_types: Classify extracted tables by type
        extract_table_relationships: Link related tables
        fast_table_rendering: Use the fast table renderer
        detect_sections: Run section detection
        eager_section_extraction: Extract sections during parsing vs. lazily
        form: Filing form type (e.g. '10-K'); required for section detection
        detection_thresholds: Scoring thresholds for section detection
        section_patterns: Regex patterns (per section) used to locate sections
        features: Feature flags for optional functionality
        header_detection_threshold: Minimum confidence for header detection
        header_detection_methods: Ordered header-detection strategies to try
        min_text_length: Minimum text length to keep
        merge_adjacent_nodes: Merge nearby text nodes
        merge_distance: Max distance between nodes to merge
        enable_profiling: Collect profiling data
        log_performance: Log performance metrics
    """
    # Performance settings
    max_document_size: int = 100 * 1024 * 1024  # 100MB (handles large filings like JPM)
    streaming_threshold: int = 10 * 1024 * 1024  # 10MB
    cache_size: int = 1000
    enable_parallel: bool = True
    max_workers: Optional[int] = None  # None = use CPU count

    # Parsing settings
    strict_mode: bool = False
    extract_xbrl: bool = True
    extract_styles: bool = True
    preserve_whitespace: bool = False
    normalize_text: bool = True
    extract_links: bool = True
    extract_images: bool = False

    # AI optimization
    optimize_for_ai: bool = True
    max_token_estimation: int = 100_000
    chunk_size: int = 512
    chunk_overlap: int = 128

    # Table processing
    table_extraction: bool = True
    detect_table_types: bool = True
    extract_table_relationships: bool = True
    fast_table_rendering: bool = True  # Fast renderer is now production-ready (7-10x faster than Rich)

    # Section detection
    detect_sections: bool = True
    eager_section_extraction: bool = False  # Extract sections during parsing vs. on first access (default: lazy)
    form: Optional[str] = None  # Required for section detection (e.g. '10-K', '10-Q', '8-K')
    detection_thresholds: DetectionThresholds = field(default_factory=DetectionThresholds)
    # Case-insensitive regexes keyed by canonical section name; patterns are
    # tried in order, so the most specific (item-numbered) form comes first.
    section_patterns: Dict[str, List[str]] = field(default_factory=lambda: {
        'business': [
            r'item\s+1\.?\s*business',
            r'business\s+overview',
            r'our\s+business'
        ],
        'risk_factors': [
            r'item\s+1a\.?\s*risk\s+factors',
            r'risk\s+factors',
            r'factors\s+that\s+may\s+affect'
        ],
        'properties': [
            r'item\s+2\.?\s*properties',
            r'properties'
        ],
        'legal_proceedings': [
            r'item\s+3\.?\s*legal\s+proceedings',
            r'legal\s+proceedings',
            r'litigation'
        ],
        'mda': [
            r'item\s+7\.?\s*management\'?s?\s+discussion',
            r'md&a',
            r'management\'?s?\s+discussion\s+and\s+analysis'
        ],
        'financial_statements': [
            r'item\s+8\.?\s*financial\s+statements',
            r'consolidated\s+financial\s+statements',
            r'financial\s+statements'
        ]
    })

    # Feature flags
    features: Dict[str, bool] = field(default_factory=lambda: {
        'ml_header_detection': True,
        'semantic_analysis': True,
        'table_understanding': True,
        'xbrl_validation': True,
        'auto_section_detection': True,
        'smart_text_extraction': True,
        'footnote_linking': True,
        'cross_reference_resolution': True
    })

    # Header detection settings
    header_detection_threshold: float = 0.6  # Minimum confidence
    header_detection_methods: List[str] = field(default_factory=lambda: [
        'style',
        'pattern',
        'structural',
        'contextual'
    ])

    # Text extraction settings
    min_text_length: int = 10  # Minimum text length to keep
    merge_adjacent_nodes: bool = True
    merge_distance: int = 2  # Max distance between nodes to merge

    # Performance monitoring
    enable_profiling: bool = False
    log_performance: bool = False

    def to_dict(self) -> Dict[str, Any]:
        """Convert configuration to dictionary.

        Covers every dataclass field (previous versions silently omitted
        most settings, so serialized configs lost information). Dict and
        list values are shallow-copied so mutating the returned mapping
        does not alter this config.
        """
        result: Dict[str, Any] = {}
        for name in self.__dataclass_fields__:
            value = getattr(self, name)
            if isinstance(value, (dict, list)):
                value = value.copy()
            result[name] = value
        return result

    @classmethod
    def for_performance(cls) -> 'ParserConfig':
        """Create config optimized for performance."""
        return cls(
            extract_styles=False,
            extract_xbrl=False,
            enable_parallel=True,
            cache_size=5000,
            eager_section_extraction=False,  # Skip expensive section extraction
            fast_table_rendering=True,  # Fast renderer (enabled by default now)
            # NOTE(review): this replaces the default feature dict wholesale, so
            # flags not listed here (e.g. 'footnote_linking') are absent rather
            # than False — consumers should use features.get(...) with a default.
            features={
                'ml_header_detection': False,
                'semantic_analysis': False,
                'table_understanding': False,
                'xbrl_validation': False
            }
        )

    @classmethod
    def for_accuracy(cls) -> 'ParserConfig':
        """Create config optimized for accuracy."""
        return cls(
            strict_mode=True,
            extract_styles=True,
            extract_xbrl=True,
            enable_parallel=True,
            features={
                'ml_header_detection': True,
                'semantic_analysis': True,
                'table_understanding': True,
                'xbrl_validation': True,
                'auto_section_detection': True,
                'smart_text_extraction': True,
                'footnote_linking': True,
                'cross_reference_resolution': True
            }
        )

    @classmethod
    def for_ai(cls) -> 'ParserConfig':
        """Create config optimized for AI/LLM processing."""
        return cls(
            optimize_for_ai=True,
            extract_styles=False,
            extract_xbrl=True,
            normalize_text=True,
            merge_adjacent_nodes=True,
            # NOTE(review): like for_performance, unlisted flags are absent here.
            features={
                'ml_header_detection': True,
                'semantic_analysis': True,
                'smart_text_extraction': True
            }
        )