edgartools/venv/lib/python3.10/site-packages/edgar/documents/document.py

"""
Document model for parsed HTML.
"""

from dataclasses import dataclass, field
from typing import Dict, List, Optional, Any, Iterator

from rich.table import Table as RichTable
from rich.console import Group
from rich.text import Text
from edgar.richtools import repr_rich

from edgar.documents.nodes import Node, SectionNode
from edgar.documents.table_nodes import TableNode
from edgar.documents.types import XBRLFact, SearchResult


@dataclass
class DocumentMetadata:
    """
    Metadata describing a parsed document.

    Holds identifying details of the source filing together with
    bookkeeping values recorded about the parsing process.
    """
    source: Optional[str] = None
    form: Optional[str] = None
    company: Optional[str] = None
    cik: Optional[str] = None
    accession_number: Optional[str] = None
    filing_date: Optional[str] = None
    report_date: Optional[str] = None
    url: Optional[str] = None
    size: int = 0
    parse_time: float = 0.0
    parser_version: str = "2.0.0"
    xbrl_data: Optional[List[XBRLFact]] = None
    preserve_whitespace: bool = False
    original_html: Optional[str] = None  # Original HTML retained for anchor analysis

    def to_dict(self) -> Dict[str, Any]:
        """
        Return the metadata as a plain dictionary.

        ``preserve_whitespace`` and ``original_html`` are not part of the
        result. ``xbrl_data`` is emitted as a list of ``fact.to_dict()``
        values, or as ``None`` when it is unset or empty.
        """
        # Attributes copied verbatim, in the order they appear in the output.
        plain_keys = (
            'source',
            'form',
            'company',
            'cik',
            'accession_number',
            'filing_date',
            'report_date',
            'url',
            'size',
            'parse_time',
            'parser_version',
        )
        payload: Dict[str, Any] = {key: getattr(self, key) for key in plain_keys}

        if self.xbrl_data:
            payload['xbrl_data'] = [fact.to_dict() for fact in self.xbrl_data]
        else:
            payload['xbrl_data'] = None

        return payload


@dataclass
class Section:
    """
    Document section representation.

    Represents a logical section of the document (e.g., Risk Factors, MD&A).

    Attributes:
        name: Section identifier (e.g., "item_1", "part_i_item_1", "risk_factors")
        title: Display title (e.g., "Item 1 - Business")
        node: Node containing section content
        start_offset: Character position where section starts
        end_offset: Character position where section ends
        confidence: Detection confidence score (0.0-1.0)
        detection_method: How section was detected ('toc', 'heading', 'pattern')
        validated: Whether section has been cross-validated
        part: Optional part identifier for 10-Q filings ("I", "II", or None for 10-K)
        item: Optional item identifier (e.g., "1", "1A", "2")
        _text_extractor: Optional callback for lazy text extraction (for TOC-based sections)
    """
    name: str
    title: str
    node: SectionNode
    start_offset: int = 0
    end_offset: int = 0
    confidence: float = 1.0  # Detection confidence (0.0-1.0)
    detection_method: str = 'unknown'  # 'toc', 'heading', 'pattern', or 'unknown'
    validated: bool = False  # Cross-validated flag
    part: Optional[str] = None  # Part identifier for 10-Q: "I", "II", or None for 10-K
    item: Optional[str] = None  # Item identifier: "1", "1A", "2", etc.
    _text_extractor: Optional[Any] = field(default=None, repr=False)  # Callback for lazy text extraction

    def text(self, **kwargs) -> str:
        """
        Extract text from section.

        Args:
            **kwargs: Forwarded to the text extractor callback when one is
                set, otherwise to the ``TextExtractor`` constructor.

        Returns:
            The section text.
        """
        # If we have a text extractor callback (TOC-based sections), use it
        if self._text_extractor is not None:
            return self._text_extractor(self.name, **kwargs)

        # Otherwise extract from node (heading/pattern-based sections)
        from edgar.documents.extractors.text_extractor import TextExtractor
        extractor = TextExtractor(**kwargs)
        return extractor.extract_from_node(self.node)

    def tables(self) -> List[TableNode]:
        """Get all tables in section."""
        return self.node.find(lambda n: isinstance(n, TableNode))

    def search(self, query: str) -> List[SearchResult]:
        """
        Search within section using a simple case-insensitive text match.

        Only the first occurrence of ``query`` is reported. The snippet is
        cut from the section text as extracted (original casing preserved)
        and covers up to 50 characters on either side of the match.

        Args:
            query: Literal text to look for (not a regular expression).

        Returns:
            A list with a single ``SearchResult`` when the query occurs in
            the section text, otherwise an empty list.
        """
        import re

        text = self.text()

        # Match on the unmodified text so that offsets line up with the
        # characters we slice out; the query is escaped to keep it literal.
        match = re.search(re.escape(query), text, re.IGNORECASE)
        if match is None:
            return []

        start = max(0, match.start() - 50)
        end = min(len(text), match.end() + 50)

        return [SearchResult(
            node=self.node,
            score=1.0,
            snippet=text[start:end],
            section=self.name
        )]

    @staticmethod
    def parse_section_name(section_name: str) -> tuple[Optional[str], Optional[str]]:
        """
        Parse section name to extract part and item identifiers.

        Handles both 10-Q part-aware names and 10-K simple names. Matching
        is case-insensitive and anchored at the start of the name only, so
        any trailing text after the item identifier is ignored.

        Args:
            section_name: Section identifier (e.g., "part_i_item_1", "item_1a", "risk_factors")

        Returns:
            Tuple of (part, item) where:
            - part: "I", "II", or None for 10-K sections
            - item: "1", "1A", "2", etc. or None if not an item section

        Examples:
            >>> Section.parse_section_name("part_i_item_1")
            ("I", "1")
            >>> Section.parse_section_name("part_ii_item_1a")
            ("II", "1A")
            >>> Section.parse_section_name("item_7")
            (None, "7")
            >>> Section.parse_section_name("risk_factors")
            (None, None)
        """
        import re

        section_lower = section_name.lower()

        # Match 10-Q format: "part_i_item_1", "part_ii_item_1a"
        part_item_match = re.match(r'part_([ivx]+)_item_(\d+[a-z]?)', section_lower)
        if part_item_match:
            part_roman = part_item_match.group(1).upper()
            item_num = part_item_match.group(2).upper()
            return (part_roman, item_num)

        # Match 10-K format: "item_1", "item_1a", "item_7"
        item_match = re.match(r'item_(\d+[a-z]?)', section_lower)
        if item_match:
            item_num = item_match.group(1).upper()
            return (None, item_num)

        # Not a structured item section
        return (None, None)


class Sections(Dict[str, Section]):
    """
    Dictionary wrapper for sections with rich display support.

    Behaves like a normal dict but provides beautiful terminal display
    via __rich__() method when printed in rich-enabled environments.

    Lookups through ``get()`` and ``[]`` additionally accept item numbers
    ("Item 1", "1A") and ``(part, item)`` tuples. Membership tests
    (``in``) are not overridden and only see the real dict keys.
    """

    @staticmethod
    def _normalize_item(item: str) -> str:
        """Strip a leading "Item " label (any casing) and upper-case the rest."""
        import re
        return re.sub(r'^\s*item\s+', '', item, flags=re.IGNORECASE).strip().upper()

    @staticmethod
    def _normalize_part(part: str) -> str:
        """Strip a leading "Part " label (any casing) and upper-case the rest."""
        import re
        return re.sub(r'^\s*part\s+', '', part, flags=re.IGNORECASE).strip().upper()

    def __rich__(self):
        """
        Return rich representation for display.

        Produces a table with one row per section, ordered by part and item,
        followed by a one-line summary. An empty collection renders as a
        dimmed "No sections detected" text.
        """
        if not self:
            return Text("No sections detected", style="dim")

        import re

        # Create summary table
        table = RichTable(title="Document Sections", show_header=True, header_style="bold magenta")
        table.add_column("Section", style="cyan", no_wrap=True)
        table.add_column("Title", style="white")
        table.add_column("Confidence", justify="right", style="green")
        table.add_column("Method", style="yellow")
        table.add_column("Part/Item", style="blue")

        # Built once rather than on every sort-key call.
        # Parts outside this table (or missing) sort as 0.
        roman_to_int = {'i': 1, 'ii': 2, 'iii': 3, 'iv': 4, 'v': 5}
        item_pattern = re.compile(r'(\d+)([a-z]?)')

        # Sort sections by part (roman numeral) and item number
        def sort_key(entry):
            name, section = entry
            part = section.part.lower() if section.part else ''
            part_num = roman_to_int.get(part, 0)

            # Extract item number and letter
            if section.item:
                match = item_pattern.match(section.item.lower())
                if match:
                    return (part_num, int(match.group(1)), match.group(2) or '')

            # Fallback to name sorting, after all numbered items of the part
            return (part_num, 999, name)

        sorted_sections = sorted(self.items(), key=sort_key)

        # Add rows for each section
        for name, section in sorted_sections:
            # Format confidence as percentage
            confidence = f"{section.confidence:.1%}"

            # Format part/item info
            part_item = ""
            if section.part and section.item:
                part_item = f"Part {section.part}, Item {section.item}"
            elif section.item:
                part_item = f"Item {section.item}"
            elif section.part:
                part_item = f"Part {section.part}"

            # Truncate title if too long
            title = section.title
            if len(title) > 50:
                title = title[:47] + "..."

            table.add_row(
                name,
                title,
                confidence,
                section.detection_method,
                part_item
            )

        # Create summary stats
        total = len(self)
        high_conf = sum(1 for s in self.values() if s.confidence >= 0.8)
        methods = {}
        for section in self.values():
            methods[section.detection_method] = methods.get(section.detection_method, 0) + 1

        summary = Text()
        summary.append(f"\nTotal: {total} sections | ", style="dim")
        summary.append(f"High confidence (≥80%): {high_conf} | ", style="dim")
        summary.append(f"Methods: {', '.join(f'{m}={c}' for m, c in methods.items())}", style="dim")

        return Group(table, summary)

    def __repr__(self):
        """Render the rich representation through ``repr_rich``."""
        return repr_rich(self.__rich__())

    def get_item(self, item: str, part: Optional[str] = None) -> Optional[Section]:
        """
        Get section by item number with optional part specification.

        Args:
            item: Item identifier (e.g., "1", "1A", "7", "Item 1", "ITEM 7A").
                  A leading "Item " label is ignored regardless of casing.
            part: Optional part specification (e.g., "I", "II", "Part I", "PART II")
                  If not specified and multiple parts contain the item, returns first match
                  in dictionary iteration order.

        Returns:
            Section object if found, None otherwise

        Examples:
            >>> sections.get_item("1")           # Returns first Item 1 (any part)
            >>> sections.get_item("1", "I")      # Returns Part I, Item 1
            >>> sections.get_item("Item 1A")     # Returns first Item 1A
            >>> sections.get_item("7A", "II")    # Returns Part II, Item 7A
        """
        item_clean = self._normalize_item(item)
        part_clean = self._normalize_part(part) if part else None

        for section in self.values():
            if not section.item or section.item.upper() != item_clean:
                continue
            if part_clean is None:
                # No part specified - return first match
                return section
            if section.part and section.part.upper() == part_clean:
                # Part matches
                return section

        return None

    def get_part(self, part: str) -> Dict[str, Section]:
        """
        Get all sections in a specific part.

        Args:
            part: Part identifier (e.g., "I", "II", "Part I", "Part II")

        Returns:
            Plain dictionary (name -> Section) of sections in that part

        Examples:
            >>> sections.get_part("I")        # All Part I sections
            >>> sections.get_part("Part II")  # All Part II sections
        """
        part_clean = self._normalize_part(part)

        return {
            name: section
            for name, section in self.items()
            if section.part and section.part.upper() == part_clean
        }

    def get(self, key, default=None):
        """
        Enhanced get method that supports flexible key formats.

        Supports:
        - Standard dict key: "part_i_item_1"
        - Item number: "Item 1", "1", "1A"
        - Part+Item: ("I", "1"), ("Part II", "7A")

        Args:
            key: Section key (string or tuple)
            default: Default value if not found

        Returns:
            Section object or default value. Keys that are neither strings
            nor 2-tuples always yield the default.
        """
        # Try standard dict lookup first
        if isinstance(key, str):
            result = super().get(key, None)
            if result is not None:
                return result

            # Try as item number
            result = self.get_item(key)
            if result is not None:
                return result

        # Try as (part, item) tuple
        elif isinstance(key, tuple) and len(key) == 2:
            part, item = key
            result = self.get_item(item, part)
            if result is not None:
                return result

        return default

    def __getitem__(self, key):
        """
        Enhanced __getitem__ that supports flexible key formats.

        Supports:
        - Standard dict key: sections["part_i_item_1"]
        - Item number: sections["Item 1"], sections["1A"]
        - Part+Item tuple: sections[("I", "1")], sections[("II", "7A")]

        Raises KeyError if not found (standard dict behavior).
        """
        # Try standard dict lookup first
        if isinstance(key, str):
            try:
                return super().__getitem__(key)
            except KeyError:
                # Try as item number
                result = self.get_item(key)
                if result is not None:
                    return result

        # Try as (part, item) tuple
        elif isinstance(key, tuple) and len(key) == 2:
            part, item = key
            result = self.get_item(item, part)
            if result is not None:
                return result

        # Not found - raise KeyError
        raise KeyError(key)


@dataclass
class Document:
    """
    Main document class.

    Represents a parsed HTML document with methods for content extraction,
    search, and transformation.
    """

    # Core properties
    root: Node
    metadata: DocumentMetadata = field(default_factory=DocumentMetadata)

    # Cached extractions (populated lazily by the matching properties/methods)
    _sections: Optional[Sections] = field(default=None, init=False, repr=False)
    _tables: Optional[List[TableNode]] = field(default=None, init=False, repr=False)
    _headings: Optional[List[Node]] = field(default=None, init=False, repr=False)
    _xbrl_facts: Optional[List[XBRLFact]] = field(default=None, init=False, repr=False)
    _text_cache: Optional[str] = field(default=None, init=False, repr=False)
    _config: Optional[Any] = field(default=None, init=False, repr=False)  # ParserConfig reference

    @property
    def sections(self) -> Sections:
        """
        Get document sections using hybrid multi-strategy detection.

        Tries detection methods in order of reliability:
        1. TOC-based (0.95 confidence)
        2. Heading-based (0.7-0.9 confidence)
        3. Pattern-based (0.6 confidence)

        Returns a Sections dictionary wrapper that provides rich terminal display
        via __rich__() method. Each section includes confidence score and detection method.

        The result is computed on first access and cached on the instance.
        """
        if self._sections is None:
            # Get form type from config, falling back to metadata when the
            # config has no form attribute or its form is unset.
            form = getattr(self._config, 'form', None) if self._config else None
            if not form and self.metadata and self.metadata.form:
                form = self.metadata.form

            # Only detect sections for supported form types (including amendments)
            # Normalize form type by removing /A suffix for amendments
            base_form = form.replace('/A', '') if form else None

            if base_form in ('10-K', '10-Q', '8-K'):
                from edgar.documents.extractors.hybrid_section_detector import HybridSectionDetector
                # Pass thresholds from config if available; a config object
                # without that attribute is treated the same as no config.
                thresholds = getattr(self._config, 'detection_thresholds', None) if self._config else None
                # Use base form type for detection (10-K/A → 10-K)
                detector = HybridSectionDetector(self, base_form, thresholds)
                detected_sections = detector.detect_sections()
            else:
                # Fallback to pattern-based for other types or unknown
                from edgar.documents.extractors.pattern_section_extractor import SectionExtractor
                extractor = SectionExtractor(form) if form else SectionExtractor()
                detected_sections = extractor.extract(self)

            # Wrap detected sections in Sections class for rich display
            self._sections = Sections(detected_sections)

        return self._sections

    @property
    def tables(self) -> List[TableNode]:
        """Get all tables in document (found once, then cached)."""
        if self._tables is None:
            self._tables = self.root.find(lambda n: isinstance(n, TableNode))
        return self._tables

    @property
    def headings(self) -> List[Node]:
        """Get all headings in document (found once, then cached)."""
        if self._headings is None:
            from edgar.documents.nodes import HeadingNode
            self._headings = self.root.find(lambda n: isinstance(n, HeadingNode))
        return self._headings

    @property
    def xbrl_facts(self) -> List[XBRLFact]:
        """Get all XBRL facts in document (extracted once, then cached)."""
        if self._xbrl_facts is None:
            self._xbrl_facts = self._extract_xbrl_facts()
        return self._xbrl_facts

    def text(self,
             clean: bool = True,
             include_tables: bool = True,
             include_metadata: bool = False,
             max_length: Optional[int] = None) -> str:
        """
        Extract text from document.

        When the document metadata has ``preserve_whitespace`` set, a
        ``clean=True`` request is downgraded to ``clean=False``, which also
        skips the navigation-link filtering step.

        Args:
            clean: Clean and normalize text
            include_tables: Include table content in text
            include_metadata: Include metadata annotations
            max_length: Maximum text length

        Returns:
            Extracted text
        """
        # The cache only ever holds the clean, table-free, unannotated,
        # unbounded variant, so it is only consulted for that combination.
        # NOTE(review): the default call (include_tables=True) is therefore
        # never served from or written to the cache - confirm this is intended.
        if (self._text_cache is not None and
            clean and not include_tables and not include_metadata and max_length is None):
            return self._text_cache

        # If whitespace was preserved during parsing and clean is default (True),
        # respect the preserve_whitespace setting
        if self.metadata.preserve_whitespace and clean:
            clean = False

        from edgar.documents.extractors.text_extractor import TextExtractor
        extractor = TextExtractor(
            clean=clean,
            include_tables=include_tables,
            include_metadata=include_metadata,
            max_length=max_length
        )
        text = extractor.extract(self)

        # Apply navigation link filtering when cleaning
        if clean:
            # Use cached/integrated navigation filtering (optimized approach)
            try:
                from edgar.documents.utils.anchor_cache import filter_with_cached_patterns
                # Use minimal cached approach (no memory overhead)
                original_html = getattr(self.metadata, 'original_html', None)
                text = filter_with_cached_patterns(text, html_content=original_html)
            except Exception:
                # Best-effort: any ordinary failure falls back to pattern-based
                # filtering. KeyboardInterrupt/SystemExit are deliberately not
                # caught so the process can still be interrupted.
                from edgar.documents.utils.toc_filter import filter_toc_links
                text = filter_toc_links(text)

        # Store the one variant the cache is allowed to hold (see above)
        if clean and not include_tables and not include_metadata and max_length is None:
            self._text_cache = text

        return text

    def search(self, query: str, top_k: int = 10) -> List[SearchResult]:
        """
        Search document for query.

        Args:
            query: Search query
            top_k: Maximum results to return

        Returns:
            List of search results
        """
        from edgar.documents.search import DocumentSearch
        searcher = DocumentSearch(self)
        return searcher.search(query, top_k=top_k)

    def get_section(self, section_name: str, part: Optional[str] = None) -> Optional[Section]:
        """
        Get section by name with optional part specification for 10-Q filings.

        Args:
            section_name: Section identifier (e.g., "item_1", "part_i_item_1")
            part: Optional part specification for 10-Q ("I", "II", "i", "ii";
                  a leading "Part " label is also accepted).
                  If provided, searches for "part_{part}_item_{item}"

        Returns:
            Section object if found, None otherwise

        Raises:
            ValueError: If no part is given, the name is not found directly,
                and part-qualified sections for that same item exist.

        Examples:
            # 10-K usage (unchanged)
            >>> doc.get_section("item_1")  # Returns Item 1

            # 10-Q usage with explicit part
            >>> doc.get_section("item_1", part="I")  # Returns Part I Item 1
            >>> doc.get_section("item_1", part="II")  # Returns Part II Item 1

            # 10-Q usage with full name
            >>> doc.get_section("part_i_item_1")  # Returns Part I Item 1
        """
        # If part is specified, construct part-aware name
        if part:
            part_token = part.strip().lower()
            if part_token.startswith("part "):
                part_token = part_token[len("part "):].strip()
            # Remove "item_" prefix if present in section_name
            item_name = section_name.replace("item_", "") if section_name.startswith("item_") else section_name
            full_name = f"part_{part_token}_item_{item_name.lower()}"
            return self.sections.get(full_name)

        # Direct lookup (works for both 10-K "item_1" and 10-Q "part_i_item_1")
        section = self.sections.get(section_name)
        if section:
            return section

        # If not found and looks like an item without part, check if we have
        # part-aware sections (10-Q) for that item and raise a helpful error.
        if section_name.replace("_", "").startswith("item"):
            # Compare on underscore-delimited token boundaries so that
            # "item_1" does not match "part_i_item_1a" or "part_i_item_10".
            needle = f"_{section_name}_"
            matching_sections = [name for name in self.sections.keys()
                                 if name.startswith("part_") and needle in f"_{name}_"]
            if matching_sections:
                # Part-qualified variants exist - user needs to specify which one
                parts = sorted({name.split("_")[1] for name in matching_sections})
                raise ValueError(
                    f"Ambiguous section '{section_name}' in 10-Q filing. "
                    f"Found in parts: {parts}. "
                    f"Please specify part: get_section('{section_name}', part='I') or part='II'"
                )

        return None

    def extract_section_text(self, section_name: str) -> Optional[str]:
        """
        Extract text from specific section.

        Returns None when the section is not found. May raise ValueError
        from get_section() when the name is ambiguous across parts.
        """
        section = self.get_section(section_name)
        if section:
            return section.text()
        return None

    def _get_section_extractor(self):
        """Return the lazily created SECSectionExtractor shared by the SEC-section helpers."""
        extractor = getattr(self, '_section_extractor', None)
        if extractor is None:
            from edgar.documents.extractors.toc_section_extractor import SECSectionExtractor
            extractor = SECSectionExtractor(self)
            self._section_extractor = extractor
        return extractor

    def get_sec_section(self, section_name: str, clean: bool = True,
                       include_subsections: bool = True) -> Optional[str]:
        """
        Extract content from a specific SEC filing section using anchor analysis.

        Args:
            section_name: Section name (e.g., "Item 1", "Item 1A", "Part I")
            clean: Whether to apply text cleaning and navigation filtering
            include_subsections: Whether to include subsections

        Returns:
            Section text content or None if section not found

        Examples:
            >>> doc.get_sec_section("Item 1")  # Business description
            >>> doc.get_sec_section("Item 1A") # Risk factors
            >>> doc.get_sec_section("Item 7")  # MD&A
        """
        return self._get_section_extractor().get_section_text(
            section_name, include_subsections, clean
        )

    def get_available_sec_sections(self) -> List[str]:
        """
        Get list of SEC sections available for extraction.

        Returns:
            List of section names that can be passed to get_sec_section()

        Example:
            >>> sections = doc.get_available_sec_sections()
            >>> print(sections)
            ['Part I', 'Item 1', 'Item 1A', 'Item 1B', 'Item 2', ...]
        """
        return self._get_section_extractor().get_available_sections()

    def get_sec_section_info(self, section_name: str) -> Optional[Dict]:
        """
        Get detailed information about an SEC section.

        Args:
            section_name: Section name to look up

        Returns:
            Dict with section metadata including anchor info
        """
        return self._get_section_extractor().get_section_info(section_name)

    def to_markdown(self) -> str:
        """Convert document to Markdown."""
        from edgar.documents.renderers.markdown_renderer import MarkdownRenderer
        renderer = MarkdownRenderer()
        return renderer.render(self)

    def to_json(self, include_content: bool = True) -> Dict[str, Any]:
        """
        Convert document to JSON.

        Args:
            include_content: When True, add per-section and per-table
                summaries (titles, lengths, counts, table shape) in addition
                to the top-level structure.

        Returns:
            JSON-serializable dictionary
        """
        result = {
            'metadata': self.metadata.to_dict(),
            'sections': list(self.sections.keys()),
            'table_count': len(self.tables),
            'xbrl_fact_count': len(self.xbrl_facts)
        }

        if include_content:
            result['sections_detail'] = {
                name: {
                    'title': section.title,
                    'text_length': len(section.text()),
                    'table_count': len(section.tables())
                }
                for name, section in self.sections.items()
            }

            result['tables'] = [
                {
                    'type': table.table_type.name,
                    'rows': len(table.rows),
                    # Column count is taken from the first header row, if any
                    'columns': len(table.headers[0]) if table.headers else 0,
                    'caption': table.caption
                }
                for table in self.tables
            ]

        return result

    def to_dataframe(self) -> 'pd.DataFrame':
        """
        Convert document tables to pandas DataFrame.

        Returns a DataFrame with all tables concatenated. Each table's rows
        are tagged with ``_table_index`` and ``_table_type``, plus
        ``_table_caption`` for tables that have a caption. An empty
        DataFrame is returned when the document has no tables.
        """
        import pandas as pd

        if not self.tables:
            return pd.DataFrame()

        # Convert each table to DataFrame
        dfs = []
        for i, table in enumerate(self.tables):
            df = table.to_dataframe()
            # Add table index
            df['_table_index'] = i
            df['_table_type'] = table.table_type.name
            if table.caption:
                df['_table_caption'] = table.caption
            dfs.append(df)

        # Concatenate all tables
        return pd.concat(dfs, ignore_index=True)

    def chunks(self, chunk_size: int = 512, overlap: int = 128) -> Iterator['DocumentChunk']:
        """
        Generate document chunks for processing.

        Args:
            chunk_size: Target chunk size in tokens
            overlap: Overlap between chunks

        Yields:
            Document chunks
        """
        from edgar.documents.extractors.chunk_extractor import ChunkExtractor
        extractor = ChunkExtractor(chunk_size=chunk_size, overlap=overlap)
        return extractor.extract(self)

    def prepare_for_llm(self,
                       max_tokens: int = 4000,
                       preserve_structure: bool = True,
                       focus_sections: Optional[List[str]] = None) -> 'LLMDocument':
        """
        Prepare document for LLM processing.

        Args:
            max_tokens: Maximum tokens
            preserve_structure: Preserve document structure
            focus_sections: Sections to focus on

        Returns:
            LLM-optimized document
        """
        from edgar.documents.ai.llm_optimizer import LLMOptimizer
        optimizer = LLMOptimizer()
        return optimizer.optimize(
            self,
            max_tokens=max_tokens,
            preserve_structure=preserve_structure,
            focus_sections=focus_sections
        )

    def extract_key_information(self) -> Dict[str, Any]:
        """
        Extract key information from document.

        Returns a summary dictionary built from metadata, the detected
        section names, table and XBRL fact counts, and the length of the
        default text extraction.
        """
        return {
            'company': self.metadata.company,
            'form': self.metadata.form,
            'filing_date': self.metadata.filing_date,
            'sections': list(self.sections.keys()),
            'financial_tables': sum(1 for t in self.tables if t.is_financial_table),
            'total_tables': len(self.tables),
            'xbrl_facts': len(self.xbrl_facts),
            'document_length': len(self.text())
        }

    def _extract_xbrl_facts(self) -> List[XBRLFact]:
        """Build one XBRLFact per node that carries 'ix_tag' metadata."""
        # Find all nodes with XBRL metadata
        xbrl_nodes = self.root.find(
            lambda n: n.get_metadata('ix_tag') is not None
        )

        return [
            XBRLFact(
                concept=node.get_metadata('ix_tag'),
                value=node.text(),
                context_ref=node.get_metadata('ix_context'),
                unit_ref=node.get_metadata('ix_unit'),
                decimals=node.get_metadata('ix_decimals'),
                scale=node.get_metadata('ix_scale')
            )
            for node in xbrl_nodes
        ]

    def __len__(self) -> int:
        """Get number of top-level nodes."""
        return len(self.root.children)

    def __iter__(self) -> Iterator[Node]:
        """Iterate over top-level nodes."""
        return iter(self.root.children)

    def __repr__(self) -> str:
        """Represent the document by its default text extraction."""
        return self.text()

    def walk(self) -> Iterator[Node]:
        """Walk entire document tree."""
        return self.root.walk()

    def find_nodes(self, predicate) -> List[Node]:
        """Find all nodes matching predicate."""
        return self.root.find(predicate)

    def find_first_node(self, predicate) -> Optional[Node]:
        """Find first node matching predicate."""
        return self.root.find_first(predicate)

    @property
    def is_empty(self) -> bool:
        """Check if document is empty (root has no children)."""
        return len(self.root.children) == 0

    @property
    def has_tables(self) -> bool:
        """Check if document has tables."""
        return len(self.tables) > 0

    @property
    def has_xbrl(self) -> bool:
        """Check if document has XBRL data."""
        return len(self.xbrl_facts) > 0

    def validate(self) -> List[str]:
        """
        Validate document structure.

        Returns list of validation issues (empty when none were found).
        """
        issues = []

        # Check for empty document
        if self.is_empty:
            issues.append("Document is empty")

        # Check for sections
        if not self.sections:
            issues.append("No sections detected")

        # Check for common sections in filings
        # NOTE(review): membership is tested against exact section keys; if
        # detectors emit keys such as "item_1" rather than "business", these
        # will always be reported as missing - confirm against the detectors.
        if self.metadata.form in ['10-K', '10-Q']:
            expected_sections = ['business', 'risk_factors', 'mda']
            missing = [s for s in expected_sections if s not in self.sections]
            if missing:
                issues.append(f"Missing expected sections: {', '.join(missing)}")

        # Check for orphaned nodes
        orphaned = self.root.find(lambda n: n.parent is None and n != self.root)
        if orphaned:
            issues.append(f"Found {len(orphaned)} orphaned nodes")

        return issues


@dataclass
class DocumentChunk:
    """A slice of a document, bounded by a start and an end node, for processing."""
    content: str
    start_node: Node
    end_node: Node
    section: Optional[str] = None
    token_count: int = 0

    def to_dict(self) -> Dict[str, Any]:
        """
        Return the chunk as a dictionary.

        The boundary nodes are represented by their ``path`` attribute
        rather than by the node objects themselves.
        """
        first_path = self.start_node.path
        last_path = self.end_node.path

        payload: Dict[str, Any] = {}
        payload['content'] = self.content
        payload['section'] = self.section
        payload['token_count'] = self.token_count
        payload['start_path'] = first_path
        payload['end_path'] = last_path
        return payload


@dataclass
class LLMDocument:
    """Document content packaged for LLM processing."""
    content: str
    metadata: Dict[str, Any]
    token_count: int
    sections: List[str]
    truncated: bool = False

    def to_prompt(self) -> str:
        """
        Render the document as a prompt string.

        The prompt starts with three header lines (form, company, filing
        date, each defaulting to 'Unknown' when absent from the metadata),
        followed by a blank line and the content. A truncation notice is
        appended when ``truncated`` is set.
        """
        meta = self.metadata

        # Metadata context, terminated by an empty line before the content
        lines = [
            f"Document: {meta.get('form', 'Unknown')}",
            f"Company: {meta.get('company', 'Unknown')}",
            f"Date: {meta.get('filing_date', 'Unknown')}",
            "",
            self.content,
        ]

        if self.truncated:
            lines.append("\n[Content truncated due to length]")

        return '\n'.join(lines)