""" Multi-strategy header detection for document structure. """ import re from abc import ABC, abstractmethod from typing import Optional, List, Dict from lxml.html import HtmlElement from edgar.documents.config import ParserConfig from edgar.documents.types import HeaderInfo, ParseContext class HeaderDetector(ABC): """Abstract base class for header detectors.""" @abstractmethod def detect(self, element: HtmlElement, context: ParseContext) -> Optional[HeaderInfo]: """Detect if element is a header.""" pass @property @abstractmethod def name(self) -> str: """Detector name.""" pass class StyleBasedDetector(HeaderDetector): """Detect headers based on CSS styles.""" @property def name(self) -> str: return "style" def detect(self, element: HtmlElement, context: ParseContext) -> Optional[HeaderInfo]: """Detect headers based on style attributes.""" # Get element style style = context.get_current_style() # Skip if no style info if not style: return None # Get text content text = element.text_content().strip() if not text or len(text) > 200: # Skip very long text return None confidence = 0.0 level = 3 # Default level # Check font size if style.font_size and context.base_font_size: size_ratio = style.font_size / context.base_font_size if size_ratio >= 2.0: confidence += 0.8 level = 1 elif size_ratio >= 1.5: confidence += 0.7 level = 2 elif size_ratio >= 1.2: confidence += 0.5 level = 3 elif size_ratio >= 1.1: confidence += 0.3 level = 4 # Check font weight if style.is_bold: confidence += 0.3 if level == 3: # Adjust level for bold text level = 2 # Check text alignment if style.is_centered: confidence += 0.2 # Check for uppercase if text.isupper() and len(text.split()) <= 10: confidence += 0.2 # Check margins (headers often have larger margins) if style.margin_top and style.margin_top > 20: confidence += 0.1 if style.margin_bottom and style.margin_bottom > 10: confidence += 0.1 # Normalize confidence confidence = min(confidence, 1.0) if confidence > 0.4: # Threshold for style-based detection return HeaderInfo.from_text(text, level, confidence, self.name) return None class PatternBasedDetector(HeaderDetector): """Detect headers based on text patterns.""" # Common header patterns in SEC filings HEADER_PATTERNS = [ # Item patterns (r'^(Item|ITEM)\s+(\d+[A-Z]?)[.\s]+(.+)$', 1, 0.95), (r'^Part\s+[IVX]+[.\s]*$', 1, 0.9), (r'^PART\s+[IVX]+[.\s]*$', 1, 0.9), # Section patterns (r'^(BUSINESS|RISK FACTORS|PROPERTIES|LEGAL PROCEEDINGS)$', 2, 0.85), (r'^(Management\'?s?\s+Discussion|MD&A)', 2, 0.85), (r'^(Financial\s+Statements|Consolidated\s+Financial\s+Statements)$', 2, 0.85), # Numbered sections (r'^\d+\.\s+[A-Z][A-Za-z\s]+$', 3, 0.7), (r'^[A-Z]\.\s+[A-Z][A-Za-z\s]+$', 3, 0.7), (r'^\([a-z]\)\s+[A-Z][A-Za-z\s]+$', 4, 0.6), # Title case headers (r'^[A-Z][A-Za-z\s]+[A-Za-z]$', 3, 0.5), # All caps headers (r'^[A-Z\s]+$', 3, 0.6), ] @property def name(self) -> str: return "pattern" def detect(self, element: HtmlElement, context: ParseContext) -> Optional[HeaderInfo]: """Detect headers based on text patterns.""" text = element.text_content().strip() # Skip empty or very long text if not text or len(text) > 200: return None # Skip single punctuation - never headers if len(text) == 1 and text in '.,!?;:()[]{}': return None # Skip if text contains multiple sentences (likely paragraph) if text.count('.') > 2: return None # Check against patterns for pattern, level, base_confidence in self.HEADER_PATTERNS: match = re.match(pattern, text, re.IGNORECASE) if match: # Adjust confidence based on context confidence = base_confidence # Boost confidence if element is alone in parent if len(element.getparent()) == 1: confidence += 0.1 # Boost confidence if followed by substantial text next_elem = element.getnext() if next_elem is not None and len(next_elem.text_content()) > 100: confidence += 0.1 confidence = min(confidence, 1.0) return HeaderInfo.from_text(text, level, confidence, self.name) return None class StructuralDetector(HeaderDetector): """Detect headers based on DOM structure.""" @property def name(self) -> str: return "structural" def detect(self, element: HtmlElement, context: ParseContext) -> Optional[HeaderInfo]: """Detect headers based on structural cues.""" text = element.text_content().strip() # Skip empty or very long text if not text or len(text) > 200: return None # Skip single punctuation - never headers if len(text) == 1 and text in '.,!?;:()[]{}': return None confidence = 0.0 level = 3 # Check if element is in a header tag tag = element.tag.lower() if tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']: confidence = 1.0 level = int(tag[1]) return HeaderInfo.from_text(text, level, confidence, self.name) # Check parent structure parent = element.getparent() if parent is not None: parent_tag = parent.tag.lower() # Check if in header-like container if parent_tag in ['header', 'thead', 'caption']: confidence += 0.6 level = 2 # Check if parent has few children (isolated element) if len(parent) <= 3: confidence += 0.3 # Check if parent is centered parent_align = parent.get('align') if parent_align == 'center': confidence += 0.2 # Check element properties if tag in ['strong', 'b']: confidence += 0.3 if element.get('align') == 'center': confidence += 0.2 # Check if followed by block content next_elem = element.getnext() if next_elem is not None: next_tag = next_elem.tag.lower() if next_tag in ['p', 'div', 'table', 'ul', 'ol']: confidence += 0.2 # Check text characteristics words = text.split() if 1 <= len(words) <= 10: # Short text confidence += 0.1 # Normalize confidence confidence = min(confidence, 1.0) if confidence > 0.5: return HeaderInfo.from_text(text, level, confidence, self.name) return None class ContextualDetector(HeaderDetector): """Detect headers based on surrounding context.""" @property def name(self) -> str: return "contextual" def detect(self, element: HtmlElement, context: ParseContext) -> Optional[HeaderInfo]: """Detect headers based on contextual clues.""" text = element.text_content().strip() # Skip empty or very long text if not text or len(text) > 200: return None # Skip single punctuation - never headers if len(text) == 1 and text in '.,!?;:()[]{}': return None confidence = 0.0 level = 3 # Check if text looks like a header if self._looks_like_header(text): confidence += 0.4 # Check relationship to previous content prev_elem = element.getprevious() if prev_elem is not None: prev_text = prev_elem.text_content().strip() # Check if previous was also a header (section hierarchy) if prev_text and self._looks_like_header(prev_text): confidence += 0.3 # Adjust level based on comparison if len(text) > len(prev_text): level = 2 else: level = 3 # Check relationship to next content next_elem = element.getnext() if next_elem is not None: next_text = next_elem.text_content().strip() # Headers are often followed by longer content if len(next_text) > len(text) * 3: confidence += 0.3 # Check if next element is indented or styled differently next_style = next_elem.get('style', '') if 'margin-left' in next_style or 'padding-left' in next_style: confidence += 0.2 # Check position in document if context.current_section is None and context.depth < 5: # Early in document, more likely to be header confidence += 0.2 # Normalize confidence confidence = min(confidence, 1.0) if confidence > 0.5: return HeaderInfo.from_text(text, level, confidence, self.name) return None def _looks_like_header(self, text: str) -> bool: """Check if text looks like a header.""" # Short text if len(text.split()) > 15: return False # No ending punctuation (except colon) if text.rstrip().endswith(('.', '!', '?', ';')): return False # Title case or all caps if text.istitle() or text.isupper(): return True # Starts with capital letter if text and text[0].isupper(): return True return False class HeaderDetectionStrategy: """ Multi-strategy header detection. Combines multiple detection methods with weighted voting. """ def __init__(self, config: ParserConfig): """Initialize with configuration.""" self.config = config self.detectors = self._init_detectors() def _init_detectors(self) -> List[HeaderDetector]: """Initialize enabled detectors.""" detectors = [] # Always include basic detectors detectors.extend([ StyleBasedDetector(), PatternBasedDetector(), StructuralDetector(), ContextualDetector() ]) # Add ML detector if enabled if self.config.features.get('ml_header_detection'): # Would add MLBasedDetector here pass return detectors def detect(self, element: HtmlElement, context: ParseContext) -> Optional[HeaderInfo]: """ Detect if element is a header using multiple strategies. Args: element: HTML element to check context: Current parsing context Returns: HeaderInfo if element is detected as header, None otherwise """ # Skip if element has no text text = element.text_content().strip() if not text: return None # Collect results from all detectors results: List[HeaderInfo] = [] for detector in self.detectors: try: result = detector.detect(element, context) if result: results.append(result) except Exception: # Don't let one detector failure stop others continue if not results: return None # If only one detector fired, use its result if confident enough if len(results) == 1: if results[0].confidence >= self.config.header_detection_threshold: return results[0] return None # Multiple detectors - combine results return self._combine_results(results, text) def _combine_results(self, results: List[HeaderInfo], text: str) -> HeaderInfo: """Combine multiple detection results.""" # Weight different detectors detector_weights = { 'style': 0.3, 'pattern': 0.4, 'structural': 0.2, 'contextual': 0.1, 'ml': 0.5 # Would be highest if available } # Calculate weighted confidence total_confidence = 0.0 total_weight = 0.0 # Group by level level_votes: Dict[int, float] = {} for result in results: weight = detector_weights.get(result.detection_method, 0.1) total_confidence += result.confidence * weight total_weight += weight # Vote for level if result.level not in level_votes: level_votes[result.level] = 0.0 level_votes[result.level] += result.confidence * weight # Normalize confidence final_confidence = total_confidence / total_weight if total_weight > 0 else 0.0 # Choose most voted level final_level = max(level_votes.items(), key=lambda x: x[1])[0] # Check if any detector found this is an item is_item = any(r.is_item for r in results) item_number = next((r.item_number for r in results if r.item_number), None) return HeaderInfo( level=final_level, confidence=final_confidence, text=text, detection_method='combined', is_item=is_item, item_number=item_number )