# edgartools/venv/lib/python3.10/site-packages/edgar/documents/config.py

"""
Configuration for the HTML parser.
"""

from dataclasses import asdict, dataclass, field
from typing import Any, Dict, List, Optional


@dataclass
class DetectionThresholds:
    """
    Tunable numeric knobs used by the section detection strategies.

    All multipliers are applied to a section's confidence score; the
    expected ranges below are documentation only and are not enforced here.

    Attributes:
        min_confidence: Sections scoring below this value (0.0-1.0) are
            not included.
        cross_validation_boost: Factor (>1.0) applied when several
            detection methods agree.
        disagreement_penalty: Factor (<1.0) applied when detection
            methods disagree.
        boundary_overlap_penalty: Factor (<1.0) applied to sections whose
            boundaries overlap.
        enable_cross_validation: Run cross-validation; slower but more
            accurate.
        thresholds_by_form: Per-filing-form overrides, keyed by form name
            and then by threshold name.
    """

    # Inclusion cut-off for a detected section.
    min_confidence: float = 0.6

    # Confidence multipliers.
    cross_validation_boost: float = 1.2
    disagreement_penalty: float = 0.8
    boundary_overlap_penalty: float = 0.9

    # Off unless requested, for performance.
    enable_cross_validation: bool = False

    # Built per instance so no two configs share the same override mapping.
    thresholds_by_form: Dict[str, Dict[str, float]] = field(
        default_factory=dict
    )


@dataclass
class ParserConfig:
    """
    Configuration for HTML parser.

    Attributes:
        max_document_size: Maximum document size in bytes
        streaming_threshold: Document size threshold (bytes) for streaming mode
        cache_size: Maximum number of cached items
        enable_parallel: Enable parallel processing for tables
        max_workers: Number of workers; None = use CPU count
        strict_mode: Fail on parsing errors vs. best effort
        extract_xbrl: Extract inline XBRL facts
        extract_styles: Extract and process CSS styles
        preserve_whitespace: Preserve original whitespace
        normalize_text: Flag for text normalization
        extract_links: Flag for link extraction
        extract_images: Flag for image extraction
        optimize_for_ai: Enable AI-specific optimizations
        max_token_estimation: Maximum estimated tokens for AI optimization
        chunk_size: Chunk size for AI optimization (unit not specified here)
        chunk_overlap: Overlap between chunks (same unit as chunk_size)
        table_extraction: Flag for table extraction
        detect_table_types: Flag for table type detection
        extract_table_relationships: Flag for table relationship extraction
        fast_table_rendering: Use the fast table renderer
        detect_sections: Flag for section detection
        eager_section_extraction: Extract sections during parsing instead of
            on first access (default: lazy)
        form: Filing form, required for section detection
            (e.g. '10-K', '10-Q', '8-K')
        detection_thresholds: Thresholds for section detection strategies
        section_patterns: Regex patterns per section name
        features: Feature flags for optional functionality
        header_detection_threshold: Minimum confidence for header detection
        header_detection_methods: Names of header detection methods
        min_text_length: Minimum text length to keep
        merge_adjacent_nodes: Flag for merging adjacent nodes
        merge_distance: Max distance between nodes to merge
        enable_profiling: Flag for profiling
        log_performance: Flag for performance logging
    """

    # Performance settings
    max_document_size: int = 100 * 1024 * 1024  # 100MB (handles large filings like JPM)
    streaming_threshold: int = 10 * 1024 * 1024  # 10MB
    cache_size: int = 1000
    enable_parallel: bool = True
    max_workers: Optional[int] = None  # None = use CPU count

    # Parsing settings
    strict_mode: bool = False
    extract_xbrl: bool = True
    extract_styles: bool = True
    preserve_whitespace: bool = False
    normalize_text: bool = True
    extract_links: bool = True
    extract_images: bool = False

    # AI optimization
    optimize_for_ai: bool = True
    max_token_estimation: int = 100_000
    chunk_size: int = 512
    chunk_overlap: int = 128

    # Table processing
    table_extraction: bool = True
    detect_table_types: bool = True
    extract_table_relationships: bool = True
    fast_table_rendering: bool = True  # Fast renderer is now production-ready (7-10x faster than Rich)

    # Section detection
    detect_sections: bool = True
    eager_section_extraction: bool = False  # Extract sections during parsing vs. on first access (default: lazy)
    form: Optional[str] = None  # Required for section detection (e.g. '10-K', '10-Q', '8-K')
    detection_thresholds: DetectionThresholds = field(default_factory=DetectionThresholds)
    # Patterns are listed most-specific first within each section.
    section_patterns: Dict[str, List[str]] = field(default_factory=lambda: {
        'business': [
            r'item\s+1\.?\s*business',
            r'business\s+overview',
            r'our\s+business'
        ],
        'risk_factors': [
            r'item\s+1a\.?\s*risk\s+factors',
            r'risk\s+factors',
            r'factors\s+that\s+may\s+affect'
        ],
        'properties': [
            r'item\s+2\.?\s*properties',
            r'properties'
        ],
        'legal_proceedings': [
            r'item\s+3\.?\s*legal\s+proceedings',
            r'legal\s+proceedings',
            r'litigation'
        ],
        'mda': [
            r'item\s+7\.?\s*management\'?s?\s+discussion',
            r'md&a',
            r'management\'?s?\s+discussion\s+and\s+analysis'
        ],
        'financial_statements': [
            r'item\s+8\.?\s*financial\s+statements',
            r'consolidated\s+financial\s+statements',
            r'financial\s+statements'
        ]
    })

    # Feature flags
    features: Dict[str, bool] = field(default_factory=lambda: {
        'ml_header_detection': True,
        'semantic_analysis': True,
        'table_understanding': True,
        'xbrl_validation': True,
        'auto_section_detection': True,
        'smart_text_extraction': True,
        'footnote_linking': True,
        'cross_reference_resolution': True
    })

    # Header detection settings
    header_detection_threshold: float = 0.6  # Minimum confidence
    header_detection_methods: List[str] = field(default_factory=lambda: [
        'style',
        'pattern',
        'structural',
        'contextual'
    ])

    # Text extraction settings
    min_text_length: int = 10  # Minimum text length to keep
    merge_adjacent_nodes: bool = True
    merge_distance: int = 2  # Max distance between nodes to merge

    # Performance monitoring
    enable_profiling: bool = False
    log_performance: bool = False

    def to_dict(self) -> Dict[str, Any]:
        """
        Convert configuration to dictionary.

        Every field of the config is included, keyed by field name. Nested
        dataclasses (``detection_thresholds``) are converted to plain
        dictionaries, and lists/dicts are copied, so mutating the returned
        dictionary does not affect this config.

        Returns:
            Dictionary mapping each field name to its current value.
        """
        # asdict walks all dataclass fields, so newly added settings can no
        # longer be silently dropped from the serialized form.
        return asdict(self)

    @classmethod
    def for_performance(cls) -> 'ParserConfig':
        """
        Create config optimized for performance.

        Note: the ``features`` dict passed here replaces the default dict
        entirely, so flags not listed below are absent from ``features``
        rather than set to False.
        """
        return cls(
            extract_styles=False,
            extract_xbrl=False,
            enable_parallel=True,
            cache_size=5000,
            eager_section_extraction=False,  # Skip expensive section extraction
            fast_table_rendering=True,  # Fast renderer (enabled by default now)
            features={
                'ml_header_detection': False,
                'semantic_analysis': False,
                'table_understanding': False,
                'xbrl_validation': False
            }
        )

    @classmethod
    def for_accuracy(cls) -> 'ParserConfig':
        """
        Create config optimized for accuracy.

        Enables strict mode and sets every feature flag to True.
        """
        return cls(
            strict_mode=True,
            extract_styles=True,
            extract_xbrl=True,
            enable_parallel=True,
            features={
                'ml_header_detection': True,
                'semantic_analysis': True,
                'table_understanding': True,
                'xbrl_validation': True,
                'auto_section_detection': True,
                'smart_text_extraction': True,
                'footnote_linking': True,
                'cross_reference_resolution': True
            }
        )

    @classmethod
    def for_ai(cls) -> 'ParserConfig':
        """
        Create config optimized for AI/LLM processing.

        Note: the ``features`` dict passed here replaces the default dict
        entirely, so flags not listed below are absent from ``features``
        rather than set to False.
        """
        return cls(
            optimize_for_ai=True,
            extract_styles=False,
            extract_xbrl=True,
            normalize_text=True,
            merge_adjacent_nodes=True,
            features={
                'ml_header_detection': True,
                'semantic_analysis': True,
                'smart_text_extraction': True
            }
        )