# edgartools/venv/lib/python3.10/site-packages/edgar/documents/utils/toc_analyzer.py

"""
Table of Contents analyzer for SEC filings.

This module analyzes the TOC structure to map section names to anchor IDs,
enabling section extraction for API filings with generated anchor IDs.
"""
import logging
import re
from dataclasses import dataclass
from typing import Dict, List, Optional, Set, Tuple

from lxml import html as lxml_html


@dataclass
class TOCSection:
    """Represents a section found in the Table of Contents.

    One instance is created per internal TOC link that is recognised as a
    section reference and whose target id exists in the document.
    """
    # Link text exactly as it appears in the TOC (whitespace-stripped).
    name: str
    # Target of the link: the href with its leading '#' removed.
    anchor_id: str
    # Canonical label such as "Item 1A" or "Part II"; falls back to the raw
    # link text when no normalization rule applies.
    normalized_name: str
    # One of 'item', 'part', 'other'.
    section_type: str
    # Numeric sort key derived from the normalized name.
    order: int
    # Part header in effect when the link was seen ("Part I", "Part II", ...),
    # or None when no part header preceded the link (e.g. 10-K style TOCs).
    part: Optional[str] = None


class TOCAnalyzer:
    """
    Analyzes Table of Contents structure to map section names to anchor IDs.

    This enables section extraction for filings where anchor IDs are generated
    rather than semantic (like API filings vs local HTML files).
    """

    def __init__(self):
        # SEC section patterns for normalization
        self.section_patterns = [
            (r'(?:item|part)\s+\d+[a-z]?', 'item'),
            (r'business', 'item'),
            (r'risk\s+factors?', 'item'),
            (r'properties', 'item'),
            (r'legal\s+proceedings', 'item'),
            (r'management.*discussion', 'item'),
            (r'md&a', 'item'),
            (r'financial\s+statements?', 'item'),
            (r'exhibits?', 'item'),
            (r'signatures?', 'item'),
            (r'part\s+[ivx]+', 'part'),
        ]

    def analyze_toc_structure(self, html_content: str) -> Dict[str, str]:
        """
        Analyze HTML content to extract section mappings from TOC.

        Args:
            html_content: Raw HTML content

        Returns:
            Dict mapping normalized section names to anchor IDs
        """
        section_mapping = {}

        try:
            # Handle XML declaration issues
            if html_content.startswith('<?xml'):
                html_content = re.sub(r'<\?xml[^>]*\?>', '', html_content, count=1)

            tree = lxml_html.fromstring(html_content)

            # Find all anchor links that could be TOC links
            anchor_links = tree.xpath('//a[@href]')

            toc_sections = []
            current_part = None  # Track current part context for 10-Q filings
            part_pattern = re.compile(r'^\s*Part\s+([IVX]+)\b', re.IGNORECASE)

            for link in anchor_links:
                href = link.get('href', '').strip()
                text = (link.text_content() or '').strip()

                # Check if this link or its row represents a part header
                # Part headers in 10-Q TOCs typically appear as separate rows: "Part I", "Part II"
                part_match = part_pattern.match(text)
                if part_match:
                    # Update current part context
                    current_part = f"Part {part_match.group(1).upper()}"
                    # Don't create a section for the part header itself
                    continue

                # Look for internal anchor links
                if href.startswith('#') and text:
                    anchor_id = href[1:]  # Remove #

                    # Try to find item number in preceding context (for table-based TOCs)
                    preceding_item = self._extract_preceding_item_label(link)

                    # Check if this looks like a section reference (check text, anchor ID, and context)
                    if self._is_section_link(text, anchor_id, preceding_item):
                        # Verify target exists
                        target_elements = tree.xpath(f'//*[@id="{anchor_id}"]')
                        if target_elements:
                            # Try to extract item number from: anchor ID > preceding context > text
                            normalized_name = self._normalize_section_name(text, anchor_id, preceding_item)
                            section_type, order = self._get_section_type_and_order(normalized_name)

                            toc_section = TOCSection(
                                name=text,
                                anchor_id=anchor_id,
                                normalized_name=normalized_name,
                                section_type=section_type,
                                order=order,
                                part=current_part  # Assign current part context
                            )
                            toc_sections.append(toc_section)

            # Build mapping prioritizing the most standard section names
            section_mapping = self._build_section_mapping(toc_sections)

        except Exception as e:
            # Return empty mapping on error - fallback to other methods
            pass

        return section_mapping

    def _extract_preceding_item_label(self, link_element) -> str:
        """
        Extract item/part label from preceding context.

        Handles table-based TOCs where item number is in a separate cell:
        <td>Item 1.</td><td><a href="...">Business</a></td>

        Also handles nested structures like:
        <td>Item 1.</td><td><div><span><a href="...">Business</a></span></div></td>

        Args:
            link_element: The <a> element

        Returns:
            Item label like "Item 1", "Item 1A", "Part I" or empty string
        """
        try:
            # Traverse up to find the containing <td> or <th> (up to 5 levels)
            current = link_element
            td_element = None

            for _ in range(5):
                parent = current.getparent()
                if parent is None:
                    break

                if parent.tag in ['td', 'th']:
                    td_element = parent
                    break

                current = parent

            # If we found a <td>, check ALL preceding siblings in the row
            # This handles TOCs where item number is not in the immediately adjacent cell
            # Example: ['Business', 'I', '1', '5'] where '1' is the item number
            if td_element is not None:
                # Check all preceding siblings (rightmost to leftmost)
                prev_sibling = td_element.getprevious()
                while prev_sibling is not None:
                    if prev_sibling.tag in ['td', 'th']:
                        prev_text = (prev_sibling.text_content() or '').strip()

                        # Look for "Item X" or just "X" (bare number) pattern
                        # Match full format: "Item 1A"
                        item_match = re.match(r'(Item\s+\d+[A-Z]?)\.?\s*$', prev_text, re.IGNORECASE)
                        if item_match:
                            return item_match.group(1)

                        # Match bare item number: "1A" or "1" (only valid 10-K item numbers: 1-15)
                        # This prevents page numbers (50, 108, etc.) from being treated as items
                        bare_item_match = re.match(r'^([1-9]|1[0-5])([A-Z]?)\.?\s*$', prev_text, re.IGNORECASE)
                        if bare_item_match:
                            item_num = bare_item_match.group(1)
                            item_letter = bare_item_match.group(2)
                            return f"Item {item_num}{item_letter}"

                        # Match part: "Part I" or just "I"
                        part_match = re.match(r'(Part\s+[IVX]+)\.?\s*$', prev_text, re.IGNORECASE)
                        if part_match:
                            return part_match.group(1)

                        # Match bare part: "I", "II", etc.
                        bare_part_match = re.match(r'^([IVX]+)\.?\s*$', prev_text)
                        if bare_part_match:
                            return f"Part {bare_part_match.group(1)}"

                    prev_sibling = prev_sibling.getprevious()

            # Also check immediate parent's text for inline patterns (div/span structures)
            parent = link_element.getparent()
            if parent is not None and parent.tag in ['div', 'span', 'p']:
                if parent.text:
                    text_before = parent.text.strip()
                    item_match = re.search(r'(Item\s+\d+[A-Z]?)\.?\s*$', text_before, re.IGNORECASE)
                    if item_match:
                        return item_match.group(1)

                    part_match = re.search(r'(Part\s+[IVX]+)\.?\s*$', text_before, re.IGNORECASE)
                    if part_match:
                        return part_match.group(1)

        except Exception:
            pass

        return ''

    def _is_section_link(self, text: str, anchor_id: str = '', preceding_item: str = '') -> bool:
        """
        Check if link represents a section reference.

        Checks link text, anchor ID, and preceding context to handle cases where:
        - Text is descriptive (e.g., "Executive Compensation")
        - Anchor ID contains item number (e.g., "item_11_executive_compensation")
        - Item number is in preceding table cell (e.g., <td>Item 1.</td><td><a>Business</a></td>)

        Args:
            text: Link text
            anchor_id: Anchor ID from href (without #)
            preceding_item: Item/part label from preceding context (e.g., "Item 1A")

        Returns:
            True if this appears to be a section link
        """
        if not text:
            return False

        # First check if there's a preceding item label (table-based TOC)
        if preceding_item:
            return True

        # Then check anchor ID for item/part patterns (most reliable)
        if anchor_id:
            anchor_lower = anchor_id.lower()
            # Match patterns like: item_1, item_1a, item1, item1a, part_i, part_ii, etc.
            if re.search(r'item_?\d+[a-z]?', anchor_lower):
                return True
            if re.search(r'part_?[ivx]+', anchor_lower):
                return True

        # Then check text (with relaxed length limit for descriptive section names)
        if len(text) > 150:  # Increased from 100 to accommodate longer section titles
            return False

        # Check against known patterns
        for pattern, _ in self.section_patterns:
            if re.search(pattern, text, re.IGNORECASE):
                return True

        # Also consider links with section keywords
        if len(text) < 100 and any(keyword in text.lower() for keyword in
                                   ['item', 'part', 'business', 'risk', 'properties', 'legal',
                                    'compensation', 'ownership', 'governance', 'directors']):
            return True

        return False

    def _normalize_section_name(self, text: str, anchor_id: str = '', preceding_item: str = '') -> str:
        """
        Normalize section name for consistent lookup.

        Prioritizes:
        1. Preceding item label (table-based TOC)
        2. Anchor ID pattern
        3. Text-based normalization

        Args:
            text: Link text
            anchor_id: Anchor ID from href (without #)
            preceding_item: Item/part label from preceding context

        Returns:
            Normalized section name (e.g., "Item 1A", "Part II")
        """
        text = text.strip()

        # HIGHEST PRIORITY: Use preceding item label if available (table-based TOC)
        if preceding_item:
            # Clean up and normalize the preceding item
            item_match = re.match(r'item\s+(\d+[a-z]?)', preceding_item, re.IGNORECASE)
            if item_match:
                return f"Item {item_match.group(1).upper()}"

            part_match = re.match(r'part\s+([ivx]+)', preceding_item, re.IGNORECASE)
            if part_match:
                return f"Part {part_match.group(1).upper()}"

        # SECOND PRIORITY: Try to extract from anchor ID
        if anchor_id:
            anchor_lower = anchor_id.lower()

            # Match item patterns: item_1a, item1a, item_1_business, etc.
            item_match = re.search(r'item_?(\d+[a-z]?)', anchor_lower)
            if item_match:
                item_num = item_match.group(1).upper()
                return f"Item {item_num}"

            # Match part patterns: part_i, part_ii, parti, partii, etc.
            part_match = re.search(r'part_?([ivx]+)', anchor_lower)
            if part_match:
                part_num = part_match.group(1).upper()
                return f"Part {part_num}"

        # THIRD PRIORITY: Text-based normalization
        # Handle common Item patterns in text
        item_match = re.match(r'item\s+(\d+[a-z]?)', text, re.IGNORECASE)
        if item_match:
            return f"Item {item_match.group(1).upper()}"

        # Handle Part patterns
        part_match = re.match(r'part\s+([ivx]+)', text, re.IGNORECASE)
        if part_match:
            return f"Part {part_match.group(1).upper()}"

        # Handle specific known sections by text
        text_lower = text.lower()
        if 'business' in text_lower and 'item' not in text_lower:
            return "Item 1"
        elif 'risk factors' in text_lower and 'item' not in text_lower:
            return "Item 1A"
        elif 'properties' in text_lower and 'item' not in text_lower:
            return "Item 2"
        elif 'legal proceedings' in text_lower and 'item' not in text_lower:
            return "Item 3"
        elif 'management' in text_lower and 'discussion' in text_lower:
            return "Item 7"
        elif 'financial statements' in text_lower:
            return "Item 8"
        elif 'exhibits' in text_lower:
            return "Item 15"

        return text  # Return as-is if no normalization applies

    def _get_section_type_and_order(self, text: str) -> Tuple[str, int]:
        """Get section type and order for sorting."""
        text_lower = text.lower()

        # Items
        item_match = re.search(r'item\s*(\d+)([a-z]?)', text_lower)
        if item_match:
            item_num = int(item_match.group(1))
            item_letter = item_match.group(2) or ''
            # Order: Item 1=1000, Item 1A=1001, Item 2=2000, etc.
            order = item_num * 1000 + (ord(item_letter.upper()) - ord('A') + 1 if item_letter else 0)
            return 'item', order

        # Parts
        part_match = re.search(r'part\s*([ivx]+)', text_lower)
        if part_match:
            part_roman = part_match.group(1)
            part_num = self._roman_to_int(part_roman)
            return 'part', part_num * 100  # Part I=100, Part II=200, etc.

        # Known sections without explicit item numbers
        if 'business' in text_lower:
            return 'item', 1000  # Item 1
        elif 'risk factors' in text_lower:
            return 'item', 1001  # Item 1A
        elif 'properties' in text_lower:
            return 'item', 2000  # Item 2
        elif 'legal proceedings' in text_lower:
            return 'item', 3000  # Item 3
        elif 'management' in text_lower and 'discussion' in text_lower:
            return 'item', 7000  # Item 7
        elif 'financial statements' in text_lower:
            return 'item', 8000  # Item 8
        elif 'exhibits' in text_lower:
            return 'item', 15000  # Item 15

        return 'other', 99999

    def _roman_to_int(self, roman: str) -> int:
        """Convert roman numerals to integers."""
        roman_map = {'i': 1, 'v': 5, 'x': 10, 'l': 50, 'c': 100, 'd': 500, 'm': 1000}
        roman = roman.lower()
        result = 0
        prev = 0

        for char in reversed(roman):
            value = roman_map.get(char, 0)
            if value < prev:
                result -= value
            else:
                result += value
            prev = value

        return result

    def _build_section_mapping(self, toc_sections: List[TOCSection]) -> Dict[str, str]:
        """Build final section mapping, handling duplicates intelligently.

        For 10-Q filings with part context, generates part-aware section names
        like "part_i_item_1" and "part_ii_item_1" to distinguish sections
        with the same item number across different parts.
        """
        # Sort sections by order
        toc_sections.sort(key=lambda x: x.order)

        mapping = {}
        seen_names = set()

        for section in toc_sections:
            # Generate part-aware section name for 10-Q filings
            if section.part:
                # Convert "Part I" -> "part_i", "Part II" -> "part_ii"
                part_key = section.part.lower().replace(' ', '_')
                # Convert "Item 1" -> "item_1", "Item 1A" -> "item_1a"
                item_key = section.normalized_name.lower().replace(' ', '_')
                section_name = f"{part_key}_{item_key}"
            else:
                # 10-K filings: use normalized name as-is
                section_name = section.normalized_name

            # Skip if we already have this section (prefer first occurrence)
            if section_name in seen_names:
                continue

            mapping[section_name] = section.anchor_id
            seen_names.add(section_name)

        return mapping

    def get_section_suggestions(self, html_content: str) -> List[str]:
        """Get list of available sections that can be extracted."""
        mapping = self.analyze_toc_structure(html_content)
        return sorted(mapping.keys(), key=lambda x: self._get_section_type_and_order(x)[1])


def analyze_toc_for_sections(html_content: str) -> Dict[str, str]:
    """
    Analyze a filing's TOC with a throwaway ``TOCAnalyzer``.

    Shortcut for callers that do not need to keep an analyzer instance.

    Args:
        html_content: Raw HTML content

    Returns:
        Dict mapping section names to anchor IDs
    """
    return TOCAnalyzer().analyze_toc_structure(html_content)