Initial commit
"""
|
||||
Table of Contents analyzer for SEC filings.
|
||||
|
||||
This module analyzes the TOC structure to map section names to anchor IDs,
|
||||
enabling section extraction for API filings with generated anchor IDs.
|
||||
"""
|
||||
import re
|
||||
from typing import Dict, List, Optional, Set, Tuple
|
||||
from dataclasses import dataclass
|
||||
from lxml import html as lxml_html
|
||||
|
||||
|
||||


@dataclass
class TOCSection:
    """Represents a section found in the Table of Contents."""
    name: str
    anchor_id: str
    normalized_name: str
    section_type: str  # 'item', 'part', 'other'
    order: int
    part: Optional[str] = None  # "Part I", "Part II", or None for 10-K filings
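
# Illustrative instance (hypothetical values, not from a real filing):
#   TOCSection(name="Business", anchor_id="item_1_business",
#              normalized_name="Item 1", section_type="item",
#              order=1000, part="Part I")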


class TOCAnalyzer:
    """
    Analyzes Table of Contents structure to map section names to anchor IDs.

    This enables section extraction for filings where anchor IDs are generated
    rather than semantic (as with API filings vs. local HTML files).
    """

    def __init__(self):
        # SEC section patterns used for normalization
        self.section_patterns = [
            (r'(?:item|part)\s+\d+[a-z]?', 'item'),
            (r'business', 'item'),
            (r'risk\s+factors?', 'item'),
            (r'properties', 'item'),
            (r'legal\s+proceedings', 'item'),
            (r'management.*discussion', 'item'),
            (r'md&a', 'item'),
            (r'financial\s+statements?', 'item'),
            (r'exhibits?', 'item'),
            (r'signatures?', 'item'),
            (r'part\s+[ivx]+', 'part'),
        ]

    def analyze_toc_structure(self, html_content: str) -> Dict[str, str]:
        """
        Analyze HTML content to extract section mappings from the TOC.

        Args:
            html_content: Raw HTML content

        Returns:
            Dict mapping normalized section names to anchor IDs
        """
        section_mapping = {}

        try:
            # Handle XML declaration issues
            if html_content.startswith('<?xml'):
                html_content = re.sub(r'<\?xml[^>]*\?>', '', html_content, count=1)

            tree = lxml_html.fromstring(html_content)

            # Find all anchor links that could be TOC links
            anchor_links = tree.xpath('//a[@href]')

            toc_sections = []
            current_part = None  # Track the current part context for 10-Q filings
            part_pattern = re.compile(r'^\s*Part\s+([IVX]+)\b', re.IGNORECASE)

            for link in anchor_links:
                href = link.get('href', '').strip()
                text = (link.text_content() or '').strip()

                # Check whether this link represents a part header.
                # Part headers in 10-Q TOCs typically appear as separate rows: "Part I", "Part II"
                part_match = part_pattern.match(text)
                if part_match:
                    # Update the current part context
                    current_part = f"Part {part_match.group(1).upper()}"
                    # Don't create a section for the part header itself
                    continue

                # Look for internal anchor links
                if href.startswith('#') and text:
                    anchor_id = href[1:]  # Remove '#'

                    # Try to find the item number in preceding context (for table-based TOCs)
                    preceding_item = self._extract_preceding_item_label(link)

                    # Check whether this looks like a section reference (text, anchor ID, and context)
                    if self._is_section_link(text, anchor_id, preceding_item):
                        # Verify that the target exists
                        target_elements = tree.xpath(f'//*[@id="{anchor_id}"]')
                        if target_elements:
                            # Extract the item number from: anchor ID > preceding context > text
                            normalized_name = self._normalize_section_name(text, anchor_id, preceding_item)
                            section_type, order = self._get_section_type_and_order(normalized_name)

                            toc_section = TOCSection(
                                name=text,
                                anchor_id=anchor_id,
                                normalized_name=normalized_name,
                                section_type=section_type,
                                order=order,
                                part=current_part,  # Assign the current part context
                            )
                            toc_sections.append(toc_section)

            # Build the mapping, prioritizing the most standard section names
            section_mapping = self._build_section_mapping(toc_sections)

        except Exception:
            # Return an empty mapping on error; callers fall back to other methods
            pass

        return section_mapping

    def _extract_preceding_item_label(self, link_element) -> str:
        """
        Extract an item/part label from preceding context.

        Handles table-based TOCs where the item number is in a separate cell:
            <td>Item 1.</td><td><a href="...">Business</a></td>

        Also handles nested structures like:
            <td>Item 1.</td><td><div><span><a href="...">Business</a></span></div></td>

        Args:
            link_element: The <a> element

        Returns:
            Item label like "Item 1", "Item 1A", "Part I", or an empty string
        """
        try:
            # Traverse up to find the containing <td> or <th> (up to 5 levels)
            current = link_element
            td_element = None

            for _ in range(5):
                parent = current.getparent()
                if parent is None:
                    break

                if parent.tag in ['td', 'th']:
                    td_element = parent
                    break

                current = parent

            # If we found a <td>, check ALL preceding siblings in the row.
            # This handles TOCs where the item number is not in the immediately adjacent
            # cell, e.g. ['Business', 'I', '1', '5'] where '1' is the item number.
            if td_element is not None:
                # Check all preceding siblings (rightmost to leftmost)
                prev_sibling = td_element.getprevious()
                while prev_sibling is not None:
                    if prev_sibling.tag in ['td', 'th']:
                        prev_text = (prev_sibling.text_content() or '').strip()

                        # Match the full item format: "Item 1A"
                        item_match = re.match(r'(Item\s+\d+[A-Z]?)\.?\s*$', prev_text, re.IGNORECASE)
                        if item_match:
                            return item_match.group(1)

                        # Match a bare item number: "1A" or "1" (only valid 10-K item numbers: 1-15).
                        # This prevents page numbers (50, 108, etc.) from being treated as items.
                        bare_item_match = re.match(r'^([1-9]|1[0-5])([A-Z]?)\.?\s*$', prev_text, re.IGNORECASE)
                        if bare_item_match:
                            item_num = bare_item_match.group(1)
                            item_letter = bare_item_match.group(2)
                            return f"Item {item_num}{item_letter}"

                        # Match a full part label: "Part I"
                        part_match = re.match(r'(Part\s+[IVX]+)\.?\s*$', prev_text, re.IGNORECASE)
                        if part_match:
                            return part_match.group(1)

                        # Match a bare part numeral: "I", "II", etc.
                        bare_part_match = re.match(r'^([IVX]+)\.?\s*$', prev_text)
                        if bare_part_match:
                            return f"Part {bare_part_match.group(1)}"

                    prev_sibling = prev_sibling.getprevious()

            # Also check the immediate parent's text for inline patterns (div/span structures)
            parent = link_element.getparent()
            if parent is not None and parent.tag in ['div', 'span', 'p']:
                if parent.text:
                    text_before = parent.text.strip()
                    item_match = re.search(r'(Item\s+\d+[A-Z]?)\.?\s*$', text_before, re.IGNORECASE)
                    if item_match:
                        return item_match.group(1)

                    part_match = re.search(r'(Part\s+[IVX]+)\.?\s*$', text_before, re.IGNORECASE)
                    if part_match:
                        return part_match.group(1)

        except Exception:
            pass

        return ''

    def _is_section_link(self, text: str, anchor_id: str = '', preceding_item: str = '') -> bool:
        """
        Check whether a link represents a section reference.

        Checks the link text, anchor ID, and preceding context to handle cases where:
        - the text is descriptive (e.g., "Executive Compensation")
        - the anchor ID contains the item number (e.g., "item_11_executive_compensation")
        - the item number is in a preceding table cell (e.g., <td>Item 1.</td><td><a>Business</a></td>)

        Args:
            text: Link text
            anchor_id: Anchor ID from the href (without '#')
            preceding_item: Item/part label from preceding context (e.g., "Item 1A")

        Returns:
            True if this appears to be a section link
        """
        if not text:
            return False

        # First, check whether there is a preceding item label (table-based TOC)
        if preceding_item:
            return True

        # Next, check the anchor ID for item/part patterns (most reliable)
        if anchor_id:
            anchor_lower = anchor_id.lower()
            # Match patterns like: item_1, item_1a, item1, item1a, part_i, part_ii, etc.
            if re.search(r'item_?\d+[a-z]?', anchor_lower):
                return True
            if re.search(r'part_?[ivx]+', anchor_lower):
                return True

        # Finally, check the text itself
        if len(text) > 150:  # Generous limit to accommodate longer descriptive section titles
            return False

        # Check against known patterns
        for pattern, _ in self.section_patterns:
            if re.search(pattern, text, re.IGNORECASE):
                return True

        # Also consider links containing section keywords
        if len(text) < 100 and any(keyword in text.lower() for keyword in
                                   ['item', 'part', 'business', 'risk', 'properties', 'legal',
                                    'compensation', 'ownership', 'governance', 'directors']):
            return True

        return False

    def _normalize_section_name(self, text: str, anchor_id: str = '', preceding_item: str = '') -> str:
        """
        Normalize a section name for consistent lookup.

        Prioritizes:
        1. Preceding item label (table-based TOC)
        2. Anchor ID pattern
        3. Text-based normalization

        Args:
            text: Link text
            anchor_id: Anchor ID from the href (without '#')
            preceding_item: Item/part label from preceding context

        Returns:
            Normalized section name (e.g., "Item 1A", "Part II")
        """
        text = text.strip()

        # HIGHEST PRIORITY: use the preceding item label if available (table-based TOC)
        if preceding_item:
            # Clean up and normalize the preceding item
            item_match = re.match(r'item\s+(\d+[a-z]?)', preceding_item, re.IGNORECASE)
            if item_match:
                return f"Item {item_match.group(1).upper()}"

            part_match = re.match(r'part\s+([ivx]+)', preceding_item, re.IGNORECASE)
            if part_match:
                return f"Part {part_match.group(1).upper()}"

        # SECOND PRIORITY: try to extract from the anchor ID
        if anchor_id:
            anchor_lower = anchor_id.lower()

            # Match item patterns: item_1a, item1a, item_1_business, etc.
            item_match = re.search(r'item_?(\d+[a-z]?)', anchor_lower)
            if item_match:
                item_num = item_match.group(1).upper()
                return f"Item {item_num}"

            # Match part patterns: part_i, part_ii, parti, partii, etc.
            part_match = re.search(r'part_?([ivx]+)', anchor_lower)
            if part_match:
                part_num = part_match.group(1).upper()
                return f"Part {part_num}"

        # THIRD PRIORITY: text-based normalization.
        # Handle common Item patterns in the text
        item_match = re.match(r'item\s+(\d+[a-z]?)', text, re.IGNORECASE)
        if item_match:
            return f"Item {item_match.group(1).upper()}"

        # Handle Part patterns
        part_match = re.match(r'part\s+([ivx]+)', text, re.IGNORECASE)
        if part_match:
            return f"Part {part_match.group(1).upper()}"

        # Handle specific known sections by text
        text_lower = text.lower()
        if 'business' in text_lower and 'item' not in text_lower:
            return "Item 1"
        elif 'risk factors' in text_lower and 'item' not in text_lower:
            return "Item 1A"
        elif 'properties' in text_lower and 'item' not in text_lower:
            return "Item 2"
        elif 'legal proceedings' in text_lower and 'item' not in text_lower:
            return "Item 3"
        elif 'management' in text_lower and 'discussion' in text_lower:
            return "Item 7"
        elif 'financial statements' in text_lower:
            return "Item 8"
        elif 'exhibits' in text_lower:
            return "Item 15"

        return text  # Return as-is if no normalization applies
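
    # Illustrative behavior (hypothetical inputs):
    #   _normalize_section_name("Executive Compensation", anchor_id="item_11_executive_compensation") -> "Item 11"
    #   _normalize_section_name("Risk Factors") -> "Item 1A"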

    def _get_section_type_and_order(self, text: str) -> Tuple[str, int]:
        """Get the section type and an order value for sorting."""
        # Treat underscores as spaces so that part-aware keys like "part_i_item_1"
        # (produced by _build_section_mapping) still receive an item-based order.
        text_lower = text.lower().replace('_', ' ')

        # Items
        item_match = re.search(r'item\s*(\d+)([a-z]?)', text_lower)
        if item_match:
            item_num = int(item_match.group(1))
            item_letter = item_match.group(2) or ''
            # Order: Item 1=1000, Item 1A=1001, Item 2=2000, etc.
            order = item_num * 1000 + (ord(item_letter.upper()) - ord('A') + 1 if item_letter else 0)
            return 'item', order

        # Parts
        part_match = re.search(r'part\s*([ivx]+)', text_lower)
        if part_match:
            part_roman = part_match.group(1)
            part_num = self._roman_to_int(part_roman)
            return 'part', part_num * 100  # Part I=100, Part II=200, etc.

        # Known sections without explicit item numbers
        if 'business' in text_lower:
            return 'item', 1000  # Item 1
        elif 'risk factors' in text_lower:
            return 'item', 1001  # Item 1A
        elif 'properties' in text_lower:
            return 'item', 2000  # Item 2
        elif 'legal proceedings' in text_lower:
            return 'item', 3000  # Item 3
        elif 'management' in text_lower and 'discussion' in text_lower:
            return 'item', 7000  # Item 7
        elif 'financial statements' in text_lower:
            return 'item', 8000  # Item 8
        elif 'exhibits' in text_lower:
            return 'item', 15000  # Item 15

        return 'other', 99999
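
    # Example orderings (derived from the formula above):
    #   "Item 7A" -> ('item', 7001)
    #   "Part II" -> ('part', 200)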

    def _roman_to_int(self, roman: str) -> int:
        """Convert a Roman numeral to an integer."""
        roman_map = {'i': 1, 'v': 5, 'x': 10, 'l': 50, 'c': 100, 'd': 500, 'm': 1000}
        roman = roman.lower()
        result = 0
        prev = 0

        for char in reversed(roman):
            value = roman_map.get(char, 0)
            if value < prev:
                result -= value
            else:
                result += value
            prev = value

        return result
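
    # Sanity examples: _roman_to_int("iv") == 4, _roman_to_int("ix") == 9,
    # and _roman_to_int("xiv") == 14 (right-to-left subtractive rule).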

    def _build_section_mapping(self, toc_sections: List[TOCSection]) -> Dict[str, str]:
        """Build the final section mapping, handling duplicates intelligently.

        For 10-Q filings with part context, generates part-aware section names
        like "part_i_item_1" and "part_ii_item_1" to distinguish sections
        with the same item number across different parts.
        """
        # Sort sections by order
        toc_sections.sort(key=lambda x: x.order)

        mapping = {}
        seen_names = set()

        for section in toc_sections:
            # Generate a part-aware section name for 10-Q filings
            if section.part:
                # Convert "Part I" -> "part_i", "Part II" -> "part_ii"
                part_key = section.part.lower().replace(' ', '_')
                # Convert "Item 1" -> "item_1", "Item 1A" -> "item_1a"
                item_key = section.normalized_name.lower().replace(' ', '_')
                section_name = f"{part_key}_{item_key}"
            else:
                # 10-K filings: use the normalized name as-is
                section_name = section.normalized_name

            # Skip if we already have this section (prefer the first occurrence)
            if section_name in seen_names:
                continue

            mapping[section_name] = section.anchor_id
            seen_names.add(section_name)

        return mapping
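
    # Illustrative 10-Q output (hypothetical anchor IDs):
    #   {"part_i_item_1": "toc_anchor_3", "part_ii_item_1": "toc_anchor_9"}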

    def get_section_suggestions(self, html_content: str) -> List[str]:
        """Get a list of available sections that can be extracted."""
        mapping = self.analyze_toc_structure(html_content)
        return sorted(mapping.keys(), key=lambda x: self._get_section_type_and_order(x)[1])


def analyze_toc_for_sections(html_content: str) -> Dict[str, str]:
    """
    Convenience function to analyze a TOC and return the section mapping.

    Args:
        html_content: Raw HTML content

    Returns:
        Dict mapping section names to anchor IDs
    """
    analyzer = TOCAnalyzer()
    return analyzer.analyze_toc_structure(html_content)
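

if __name__ == "__main__":
    # Minimal smoke test; a sketch only. The HTML below is a synthetic
    # table-based TOC, not a real SEC filing.
    sample_html = """
    <html><body>
      <table>
        <tr><td>Item 1.</td><td><a href="#b1">Business</a></td></tr>
        <tr><td>Item 1A.</td><td><a href="#r1">Risk Factors</a></td></tr>
      </table>
      <h2 id="b1">Business</h2>
      <h2 id="r1">Risk Factors</h2>
    </body></html>
    """
    # Expected for this synthetic input: {'Item 1': 'b1', 'Item 1A': 'r1'}
    print(analyze_toc_for_sections(sample_html))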