import math
import re
from dataclasses import dataclass, fields, replace
from enum import Enum
from typing import Any, Dict, Optional, Tuple, Union

from bs4 import Tag

from edgar.core import log as logger

__all__ = ['StyleInfo', 'UnitType', 'StyleUnit', 'parse_style', 'is_heading', 'get_heading_level']

# Font size (in points) that relative font sizes and size ratios are measured against.
base_font_size = 10.0

# First define the patterns at module level for reliability
HEADING_PATTERNS = {
    # Level 1 patterns (Parts)
    'l1': re.compile(r'(?i)^part\s+[IVX0-9]+(?:\s.*)?$', re.IGNORECASE),

    # Level 2 patterns (Items, Articles, Major Sections)
    'l2': [
        re.compile(r'(?i)^item\s+[0-9]+[A-Z]?\.?(?:\s.*)?$'),
        re.compile(r'(?i)^article\s+[IVX0-9]+(?:[\s\.].*)?$'),
        re.compile(r'(?i)^section\s+[0-9]+(?:\.[0-9]+)*(?:\s.*)?$')
    ],

    # Level 3 patterns (Major subsections)
    'l3': [
        re.compile(r'^[A-Z][A-Z\s\-\&]{5,}$'),
        re.compile(r'(?i)^(?:consolidated|combined)\s+[A-Z\s]+$'),
        re.compile(r'(?i)^management[A-Z\s]+(?:discussion|analysis)$'),
        re.compile(r'(?i)^notes?\s+to\s+[A-Z\s]+$'),
        re.compile(r'(?i)^selected\s+financial\s+data$'),
        re.compile(r'(?i)^supplementary\s+information$'),
        re.compile(r'(?i)^signatures?$'),
        re.compile(r'(?i)^exhibits?\s+and\s+financial\s+statement\s+schedules$')
    ]
}

# Scoring patterns used by is_heading, in priority order: only the first
# matching pattern contributes its points.
_IS_HEADING_PATTERNS = tuple(
    (re.compile(pattern), points)
    for pattern, points in (
        # Primary document structure patterns
        (r'(?i)^part\s+[IVX0-9]+(?:\s.*)?$', 4),  # PART pattern
        (r'(?i)^section\s+[0-9]+(?:\.[0-9]+)*(?:\s.*)?$', 4),  # SECTION pattern
        (r'(?i)^article\s+[IVX0-9]+(?:[\s\.].*)?$', 4),  # ARTICLE pattern
        (r'(?i)^item\s+[0-9]+[A-Z]?\.?(?:\s.*)?$', 4),  # ITEM pattern
        # Common SEC heading patterns
        (r'(?i)^(?:consolidated|combined)\s+[A-Z\s]+$', 3),  # Financial statement heading
        (r'(?i)^management[A-Z\s]+(?:discussion|analysis)$', 3),  # MD&A heading
        (r'(?i)^notes?\s+to\s+[A-Z\s]+$', 3),  # Notes heading
        (r'(?i)^[A-Z][A-Z\s]{2,}\s+(?:and|of|to|for|from)\s+[A-Z\s]+$', 3),  # Complex heading
        # Secondary patterns
        (r'^\d+\.\s*[A-Z].*$', 3),  # Numbered pattern
        (r'^[A-Z][A-Z\s\-\&]+$', 3),  # All caps text
    )
)

# Minimum is_heading score for an element to count as a heading.
_HEADING_SCORE_THRESHOLD = 6

# Leading number plus optional unit suffix of a CSS value, e.g. "12.5pt".
_NUMERIC_VALUE_RE = re.compile(r'(-?\d*\.?\d+)([a-z%]*)')

# Trailing "!important" marker of a (lower-cased) CSS value.
_IMPORTANT_RE = re.compile(r'\s*!\s*important\s*$')

# CSS property name -> StyleInfo attribute, for plain-text properties.
_TEXT_PROPERTIES = {
    'font-weight': 'font_weight',
    'text-align': 'text_align',
    'display': 'display',
    'text-decoration': 'text_decoration',
}

# CSS property name -> StyleInfo attribute, for "<number><unit>" properties.
_UNIT_PROPERTIES = {
    'margin-top': 'margin_top',
    'margin-bottom': 'margin_bottom',
    'font-size': 'font_size',
    'line-height': 'line_height',
    'width': 'width',
}

# Keywords that mark a line as a likely section heading.
_SECTION_KEYWORDS = frozenset({
    'overview', 'background', 'business', 'operations', 'risk factors',
    'management', 'financial', 'discussion', 'analysis', 'results',
    'liquidity', 'capital resources', 'critical accounting', 'controls',
    'procedures'
})


class UnitType(Enum):
    POINT = 'pt'
    PIXEL = 'px'
    INCH = 'in'
    CM = 'cm'
    MM = 'mm'
    PERCENT = '%'
    EM = 'em'
    REM = 'rem'


# Multiplier that converts one of each unit to inches.
_INCHES_PER_UNIT = {
    UnitType.INCH: 1.0,
    UnitType.POINT: 1 / 72,  # 72 points per inch
    UnitType.PIXEL: 1 / 96,  # 96 pixels per inch
    UnitType.CM: 0.393701,  # 1 cm = 0.393701 inches
    UnitType.MM: 0.0393701,  # 1 mm = 0.0393701 inches
    UnitType.EM: 1 / 6,  # Approximate, assumes 1em = 1/6 inch
    UnitType.REM: 1 / 6,  # Same as EM
    UnitType.PERCENT: 1.0  # Not a length; to_chars handles percent separately
}

# Same table keyed by the unit's CSS suffix ('pt', 'px', ...).
_INCHES_PER_UNIT_NAME = {unit.value: factor for unit, factor in _INCHES_PER_UNIT.items()}


@dataclass
class StyleUnit:
    """A CSS measurement: a numeric value together with its unit.

    ``value`` and ``unit`` hold what was parsed from the CSS string.
    ``to_chars`` converts the measurement to a number of terminal character
    columns, and the comparison operators compare measurements after
    converting both sides to inches.
    """
    value: float
    unit: UnitType

    def __init__(self, value: float, unit: Union[str, UnitType]):
        """Create a measurement.

        Args:
            value: Numeric magnitude.
            unit: A ``UnitType`` or its CSS suffix (``'pt'``, ``'px'``, ...),
                matched case-insensitively.

        Raises:
            ValueError: If ``unit`` is a string that names no ``UnitType``.
        """
        self.value = value
        self.unit = UnitType(unit.strip().lower()) if isinstance(unit, str) else unit

    def to_chars(self, console_width: int) -> int:
        """Convert width to character count based on console width"""
        # Base conversion rates (at standard 80-char width)
        BASE_CONSOLE_WIDTH = 80  # standard width
        CHARS_PER_INCH = 12.3  # at standard width

        # Percentages are a share of the console, not a physical length.
        if self.unit == UnitType.PERCENT:
            return round(console_width * (self.value / 100))

        # Scale the inch-based width by how wide the console actually is.
        scale = console_width / BASE_CONSOLE_WIDTH
        return round(self._to_inches() * CHARS_PER_INCH * scale)

    def _to_inches(self) -> float:
        """Convert any unit to inches"""
        return self.value * _INCHES_PER_UNIT[self.unit]

    @staticmethod
    def _coerce(other: object) -> Optional['StyleUnit']:
        """Return ``other`` as a StyleUnit, or None when it is not comparable.

        Raw numbers (int or float) are assumed to be points.
        """
        if isinstance(other, StyleUnit):
            return other
        if isinstance(other, (int, float)):
            return StyleUnit(other, UnitType.POINT)
        return None

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, StyleUnit):
            return NotImplemented
        if self.unit == other.unit:
            return self.value == other.value
        # Different units: compare in inches, tolerating the rounding error
        # introduced by the conversion factors.
        return math.isclose(self._to_inches(), other._to_inches(), rel_tol=1e-9)

    def __gt__(self, other: Union['StyleUnit', float]) -> bool:
        rhs = self._coerce(other)
        if rhs is None:
            return NotImplemented
        return self._to_inches() > rhs._to_inches()

    def __ge__(self, other: Union['StyleUnit', float]) -> bool:
        rhs = self._coerce(other)
        if rhs is None:
            return NotImplemented
        return self._to_inches() >= rhs._to_inches()

    def __lt__(self, other: Union['StyleUnit', float]) -> bool:
        rhs = self._coerce(other)
        if rhs is None:
            return NotImplemented
        return self._to_inches() < rhs._to_inches()

    def __le__(self, other: Union['StyleUnit', float]) -> bool:
        rhs = self._coerce(other)
        if rhs is None:
            return NotImplemented
        return self._to_inches() <= rhs._to_inches()

    def __str__(self) -> str:
        return f"{self.value}{self.unit.value}"


@dataclass
class Width:
    """Represents a width value with its unit.

    ``unit`` may be a ``UnitType`` or the equivalent CSS suffix string
    (``'px'``, ``'%'``, ...); both forms are handled the same way.
    """
    value: float
    unit: UnitType

    def _unit_name(self) -> str:
        """Return the unit as its CSS suffix string."""
        return self.unit.value if isinstance(self.unit, UnitType) else self.unit

    def to_chars(self, console_width: int) -> int:
        """Convert width to character count based on console width.

        Percent widths are a share of ``console_width``; other units are
        converted through inches and capped at ``console_width``.

        Raises:
            KeyError: If the unit is not a known CSS suffix.
        """
        # Base conversion rates (at standard 80-char width)
        BASE_CONSOLE_WIDTH = 80  # standard width
        CHARS_PER_INCH = 12.3  # at standard width

        if self._unit_name() == '%':
            return round(console_width * (self.value / 100))

        scale = console_width / BASE_CONSOLE_WIDTH
        chars = round(self._to_inches() * CHARS_PER_INCH * scale)
        return min(chars, console_width)

    def _to_inches(self) -> float:
        """Convert any unit to inches.

        Raises:
            KeyError: If the unit is not a known CSS suffix.
        """
        return self.value * _INCHES_PER_UNIT_NAME[self._unit_name()]


@dataclass
class StyleInfo:
    """Style information with proper unit handling"""
    display: Optional[str] = None
    margin_top: Optional[StyleUnit] = None
    margin_bottom: Optional[StyleUnit] = None
    font_size: Optional[StyleUnit] = None
    font_weight: Optional[str] = None
    text_align: Optional[str] = None
    line_height: Optional[StyleUnit] = None
    width: Optional[StyleUnit] = None
    text_decoration: Optional[str] = None

    def merge(self, parent_style: Optional['StyleInfo']) -> 'StyleInfo':
        """Merge with parent style, child properties take precedence.

        Returns ``self`` unchanged when there is no parent style; otherwise a
        new StyleInfo in which every unset (falsy) property of ``self`` is
        filled from ``parent_style``.
        """
        if not parent_style:
            return self
        merged = {
            field.name: getattr(self, field.name) or getattr(parent_style, field.name)
            for field in fields(StyleInfo)
        }
        return StyleInfo(**merged)


def _is_bold_weight(font_weight: Optional[str]) -> bool:
    """Return True when a CSS font-weight value denotes bold text.

    Accepts the ``bold``/``bolder`` keywords and numeric weights of 700 or more.
    """
    if not font_weight:
        return False
    weight = font_weight.strip().lower()
    if weight in ('bold', 'bolder'):
        return True
    return weight.isdecimal() and int(weight) >= 700


def _resolve_font_size(font_size: StyleUnit) -> StyleUnit:
    """Return ``font_size`` as an absolute measurement.

    Percent, em and rem sizes are relative, so they are resolved against
    ``base_font_size`` (in points). Absolute units are returned unchanged.
    """
    if font_size.unit == UnitType.PERCENT:
        return StyleUnit(base_font_size * font_size.value / 100, UnitType.POINT)
    if font_size.unit in (UnitType.EM, UnitType.REM):
        return StyleUnit(base_font_size * font_size.value, UnitType.POINT)
    return font_size


def parse_style(style_str: str) -> StyleInfo:
    """Parse inline CSS style string into StyleInfo object with robust unit validation.

    Property names and values are lower-cased and a trailing ``!important``
    is dropped. Unknown properties, unsupported units and unparsable numbers
    are skipped silently. A number without a unit is treated as pixels.
    """
    style = StyleInfo()
    if not style_str:
        return style

    valid_units = {unit.value for unit in UnitType}

    for declaration in style_str.split(';'):
        key, separator, value = declaration.partition(':')
        if not separator:
            continue
        key = key.strip().lower()
        value = _IMPORTANT_RE.sub('', value.strip().lower())

        # Plain-text properties are stored verbatim.
        text_attr = _TEXT_PROPERTIES.get(key)
        if text_attr is not None:
            setattr(style, text_attr, value)
            continue

        # Remaining supported properties expect a number with an optional unit.
        unit_attr = _UNIT_PROPERTIES.get(key)
        if unit_attr is None:
            continue
        match = _NUMERIC_VALUE_RE.match(value)
        if not match:
            continue
        try:
            num_val = float(match.group(1))
        except (ValueError, TypeError):
            continue  # Skip this property if number parsing fails

        unit = match.group(2) or 'px'  # Default to pixels
        if unit not in valid_units:
            continue  # Skip this property if unit is invalid

        # Skip magnitudes that Python renders in exponent form (roughly below
        # 1e-4 or at/above 1e16).
        if 'e' in str(num_val).lower():
            continue

        setattr(style, unit_attr, StyleUnit(num_val, unit))

    return style


def is_heading(element: Tag, style: StyleInfo) -> bool:
    """
    Detect if an element is likely a heading based on multiple weighted factors.

    Points are awarded for matching a heading text pattern, short all-caps
    text, bold weight, a font larger than ``base_font_size`` and top margin on
    the element or its parent; medium-length text is penalised. Returns True
    if the total reaches the heading threshold. Text longer than 100
    characters is never a heading.
    """
    if not style:
        return False

    text = element.get_text(strip=True)
    if not text:
        return False

    # 1. Length checks - fail fast for long text
    text_length = len(text)
    if text_length > 100:
        return False
    score = -2 if text_length > 50 else 0

    # 2. Text patterns - only the first (highest priority) match scores
    for pattern, points in _IS_HEADING_PATTERNS:
        if pattern.match(text):
            score += points
            break

    # 3. All caps bonus for short text
    if text.isupper() and text_length <= 30 and not any(char.isdigit() for char in text):
        score += 1

    # 4. Style properties
    if _is_bold_weight(style.font_weight):
        score += 2 if text_length < 30 else 1

    if style.font_size:
        base_size = StyleUnit(base_font_size, 'pt')
        size_ratio = _resolve_font_size(style.font_size)._to_inches() / base_size._to_inches()
        if size_ratio >= 1.2:
            score += 2
        elif size_ratio >= 1.1:
            score += 1

    # Margin handling: medium (12pt) and large (18pt) top margins both score +2.
    if style.margin_top and style.margin_top >= StyleUnit(12, 'pt'):
        score += 2

    # Parent margin
    parent = element.parent
    if parent and isinstance(parent, Tag):
        parent_margin = parse_style(parent.get('style', '')).margin_top
        if parent_margin:
            if parent_margin >= StyleUnit(18, 'pt'):
                score += 2
            elif parent_margin >= StyleUnit(12, 'pt'):
                score += 1

    return score >= _HEADING_SCORE_THRESHOLD


def _get_effective_style(element: Tag, base_style: StyleInfo, debug: bool = False) -> StyleInfo:
    """Get combined styles with parent-first approach and semantic tag handling.

    Starting from ``base_style``, unset properties are filled, in order, from
    the nearest ``div`` ancestor, the nearest ``span`` ancestor and the
    element's own ``style`` attribute. An element that is, or sits inside,
    ``<strong>``/``<b>`` is given font weight ``'700'``.

    ``debug`` is accepted for interface compatibility and is not used.
    """
    effective_style = base_style or StyleInfo()

    # Only the first div ancestor is consulted to avoid going too far up.
    div_parent = element.find_parent('div')
    if div_parent is not None:
        effective_style = effective_style.merge(parse_style(div_parent.get('style', '')))

    # Get styles from span parents for font-size
    span_parent = element.find_parent('span')
    if span_parent is not None:
        effective_style = effective_style.merge(parse_style(span_parent.get('style', '')))

    # NOTE(review): merge() lets the accumulated style win, so the element's
    # own attribute only fills gaps here - confirm this precedence is intended.
    effective_style = effective_style.merge(parse_style(element.get('style', '')))

    # Handle semantic bold tags
    if element.name in ['strong', 'b'] or element.find_parent(['strong', 'b']):
        effective_style = replace(effective_style, font_weight='700')

    return effective_style


def _merge_styles(parent_style: StyleInfo, child_style: StyleInfo, debug: bool = False) -> StyleInfo:
    """
    Helper function to properly merge parent and child styles.

    Child properties take precedence; when either style is missing the other
    is returned as is. With ``debug`` set the merged style is logged.
    """
    if not parent_style:
        return child_style
    if not child_style:
        return parent_style

    merged = child_style.merge(parent_style)
    if debug:
        logger.debug("Merged style: %s", _format_style_debug(merged))
    return merged


def get_heading_level(element: Tag, style: StyleInfo, text: str, debug: bool = False) -> Optional[int]:
    """Get heading level with comprehensive debugging.

    Returns 1 for PART headers, 2 for items/articles/sections, 3 for
    prominently styled major subsections, 4 for short bold minor headings and
    None when the text is not a heading. With ``debug`` set, the decisions
    taken are written to the debug log.
    """
    debug_info: Dict[str, Any] = {'text': text, 'decisions': []}
    level = _detect_heading_level(element, style, text, debug, debug_info)
    _print_debug_info(debug_info, debug)
    return level


def _detect_heading_level(element: Tag, style: StyleInfo, text: str, debug: bool,
                          debug_info: Dict[str, Any]) -> Optional[int]:
    """Classify ``text`` as a heading level, recording decisions in ``debug_info``."""

    def log_decision(stage: str, result: bool, reason: str):
        if debug:
            debug_info['decisions'].append({
                'stage': stage,
                'result': result,
                'reason': reason
            })

    # Early return for empty or whitespace-only text
    if not text.strip():
        return None

    # Special handling for elements inside a div: a heading may be split
    # across several spans, so evaluate their combined text instead.
    parent_div = element.find_parent('div')
    if parent_div is not None:
        spans = parent_div.find_all('span')
        if len(spans) > 1:  # Only process as split heading if multiple spans
            combined_text = ' '.join(span.get_text(strip=True) for span in spans)
            if combined_text.strip():
                div_style = parse_style(parent_div.get('style', ''))

                # Bold on any span makes the combined heading bold. Read the
                # parsed font-weight so unrelated values such as
                # "width: 700px" are not mistaken for a weight.
                has_bold = any(
                    _is_bold_weight(parse_style(span.get('style', '')).font_weight)
                    for span in spans
                )
                if has_bold:
                    div_style = replace(div_style, font_weight='700')

                # NOTE(review): the recursive call repeats this check on
                # parent_div, so nested divs keep climbing to the outermost
                # div with several spans - confirm that is intended.
                level = get_heading_level(parent_div, div_style, combined_text, debug)
                log_decision("Split Heading", level is not None,
                             "Evaluated combined text of the parent div's spans")
                return level

    # Get complete style for the element
    complete_style = _get_effective_style(element, style, debug)

    # Check minimum heading traits
    has_min_traits, trait_details = _has_minimum_heading_traits(complete_style, text, return_details=True)
    if debug:
        debug_info['effective_style'] = _format_style_debug(complete_style)
        debug_info['style_traits'] = trait_details

    if not has_min_traits:
        log_decision("Style Check", False, "Does not meet minimum heading traits")
        return None

    log_decision("Style Check", True, "Meets minimum heading traits")

    text_to_check = text.strip()

    # Level 1 check (PART headers)
    if HEADING_PATTERNS['l1'].match(text_to_check):
        log_decision("Pattern Check", True, "Matches Level 1 (PART) pattern")
        return 1

    # Level 2 check (Items, Articles)
    for pattern in HEADING_PATTERNS['l2']:
        if pattern.match(text_to_check):
            log_decision("Pattern Check", True, f"Matches Level 2 pattern: {pattern.pattern}")
            return 2

    # Level 3 check (requires prominence)
    if _is_prominently_styled(complete_style, debug=debug):
        for pattern in HEADING_PATTERNS['l3']:
            if pattern.match(text_to_check):
                log_decision("Pattern Check", True, f"Matches Level 3 pattern: {pattern.pattern}")
                return 3

        # Check if it's a likely section heading even if it doesn't match exact patterns
        if _is_likely_section_heading(text_to_check, complete_style):
            log_decision("Pattern Check", True, "Matches section heading criteria")
            return 3

    # Level 4 check (minor subsections)
    # Check for basic heading traits that didn't match higher level patterns
    if (text_to_check and  # Ensure there is non-empty text
            _is_bold_weight(complete_style.font_weight) and
            len(text_to_check) < 50 and
            not text_to_check.startswith(('Note:', '*', '(', '$')) and
            not text_to_check.endswith(':')):
        log_decision("Pattern Check", True, "Matches Level 4 (minor heading) criteria")
        return 4

    log_decision("Pattern Check", False, "No heading patterns matched")
    return None


def _format_style_debug(style: StyleInfo) -> Dict[str, Optional[str]]:
    """Format style information for debugging"""
    if not style:
        return {"status": "no style"}

    return {
        "font_weight": str(style.font_weight),
        "font_size": str(style.font_size) if style.font_size else None,
        "margin_top": str(style.margin_top) if style.margin_top else None,
        "text_align": style.text_align,
        "display": style.display
    }


def _has_minimum_heading_traits(style: StyleInfo, text: str, return_details: bool = False) -> Union[
    bool, Tuple[bool, Dict[str, Any]]]:
    """
    Check for minimum heading characteristics with improved font-weight handling.

    A style qualifies when it is bold, uses a font larger than 11pt, or
    combines a top margin of at least 12pt with centered all-caps text.
    With ``return_details`` the individual trait flags are returned as well.
    """
    if not style:
        return (False, {"reason": "no style"}) if return_details else False

    details = {
        "has_bold": _is_bold_weight(style.font_weight),
        "has_large_font": bool(style.font_size and _resolve_font_size(style.font_size) > StyleUnit(11, 'pt')),
        "has_margin": bool(style.margin_top and style.margin_top >= StyleUnit(12, 'pt')),
        "has_center_caps": bool(style.text_align == 'center' and text.isupper() and len(text) > 4)
    }

    # Bold or a large font is enough on its own; centered caps additionally
    # need a margin ("margin and bold" is already covered by "bold").
    result = (details["has_bold"] or details["has_large_font"]
              or (details["has_margin"] and details["has_center_caps"]))

    if return_details:
        return result, details
    return result


def _is_prominently_styled(style: StyleInfo, debug: bool = False) -> bool:
    """Check for prominent styling.

    A style is prominent when its font is larger than 12pt, its top margin is
    at least 18pt, its text is centered, or it is bold with any top margin.

    ``debug`` is accepted for interface compatibility and is not used.
    """
    if not style:
        return False

    prominence_checks = {
        "large_font": bool(style.font_size and _resolve_font_size(style.font_size) > StyleUnit(12, 'pt')),
        "large_margin": bool(style.margin_top and style.margin_top >= StyleUnit(18, 'pt')),
        "centered": style.text_align == 'center',
        "bold_with_margin": bool(_is_bold_weight(style.font_weight) and style.margin_top)
    }
    return any(prominence_checks.values())


def _get_prominence_detail(style: StyleInfo, check: str) -> str:
    """Get detailed information about why a prominence check passed"""
    if check == "large_font" and style.font_size:
        return f"Font size: {style.font_size}"
    elif check == "large_margin" and style.margin_top:
        return f"Margin top: {style.margin_top}"
    elif check == "centered":
        return f"Text align: {style.text_align}"
    elif check == "bold_with_margin":
        return f"Font weight: {style.font_weight}, Margin top: {style.margin_top}"
    return ""


def _is_likely_minor_heading(text: str, style: StyleInfo, return_details: bool = False) -> Union[
    bool, Tuple[bool, Dict[str, Any]]]:
    """Detect minor headings with detailed output.

    A minor heading is bold, shorter than 40 characters, does not start with
    ``Note:``, ``*``, ``(`` or ``$`` and does not end with a colon.
    """
    details = {
        "length_ok": len(text) < 40,
        "has_bold": bool(style and _is_bold_weight(style.font_weight)),
        "no_exclusions": not text.startswith(('Note:', '*', '(', '$')) and not text.endswith(':'),
        "text_sample": text[:30] + ('...' if len(text) > 30 else '')
    }

    result = all([details["length_ok"], details["has_bold"], details["no_exclusions"]])

    if return_details:
        return result, details
    return result


def _print_debug_info(debug_info: Dict[str, Any], debug: bool):
    """Print formatted debug information"""
    if not debug:
        return

    logger.debug("\nHeading Detection Analysis:")
    logger.debug("-" * 50)
    logger.debug("Text: '%s'", debug_info['text'])
    logger.debug("\nStyle Information:")
    logger.debug("  %s", debug_info.get('effective_style', 'No style info'))

    if 'style_traits' in debug_info:
        logger.debug("\nStyle Traits:")
        for trait, value in debug_info['style_traits'].items():
            logger.debug("  %s: %s", trait, value)

    logger.debug("\nDecision Process:")
    for decision in debug_info['decisions']:
        result_mark = "✓" if decision['result'] else "✗"
        logger.debug("  %s %s: %s", result_mark, decision['stage'], decision['reason'])
    logger.debug("-" * 50)


def _is_likely_section_heading(text: str, style: StyleInfo) -> bool:
    """
    Check if text matches common SEC section heading patterns
    Uses heuristics based on common SEC document structure

    Text of 8 to 60 characters qualifies when it contains one of the section
    keywords as a whole word or whole phrase (case-insensitive). ``style`` is
    accepted for interface compatibility and is not used.
    """
    # Skip common false positives
    if len(text) < 8 or len(text) > 60:
        return False

    # Pad the whitespace-normalised text so that single words and multi-word
    # phrases ("risk factors") can both be matched on word boundaries.
    padded = ' ' + ' '.join(text.lower().split()) + ' '
    return any(f' {keyword} ' in padded for keyword in _SECTION_KEYWORDS)