edgartools/venv/lib/python3.10/site-packages/edgar/documents/strategies/document_builder.py

"""
Document builder that converts parsed HTML tree into document nodes.
"""

from typing import Dict, Any, Optional

from lxml.html import HtmlElement

from edgar.documents.config import ParserConfig
from edgar.documents.nodes import (
    Node, DocumentNode, TextNode, ParagraphNode, HeadingNode,
    ContainerNode, SectionNode, ListNode, ListItemNode, LinkNode, ImageNode
)
from edgar.documents.strategies.style_parser import StyleParser
from edgar.documents.table_nodes import TableNode, Cell, Row
from edgar.documents.types import Style, ParseContext, SemanticType


class DocumentBuilder:
    """
    Builds Document node tree from parsed HTML.

    Handles the conversion of HTML elements into structured nodes
    with proper hierarchy and metadata.
    """

    # Block-level elements.
    # A child carrying one of these tags makes `_is_text_only_container`
    # return False, and `_create_node_for_element` routes tags that have no
    # earlier dedicated branch through its block-handling branch.
    BLOCK_ELEMENTS = {
        'div', 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
        'ul', 'ol', 'li', 'blockquote', 'pre', 'hr',
        'table', 'form', 'fieldset', 'address', 'section',
        'article', 'aside', 'nav', 'header', 'footer', 'main'
    }

    # Inline elements.
    # `_get_element_text` keeps leading/trailing whitespace for these tags
    # and also gathers text from their children; `_create_node_for_element`
    # turns them into text nodes when they carry text (unless an earlier
    # branch, such as the one for 'a', already handled the tag).
    INLINE_ELEMENTS = {
        'span', 'a', 'em', 'strong', 'b', 'i', 'u', 's',
        'small', 'mark', 'del', 'ins', 'sub', 'sup',
        'code', 'kbd', 'var', 'samp', 'abbr', 'cite',
        'q', 'time', 'font',
        # IXBRL inline elements for simple values - should not break text flow
        'ix:nonfraction', 'ix:footnote', 'ix:fraction'
    }

    # Elements to skip.
    # `_process_element` creates no node for these (their children are never
    # visited) but still keeps their tail text; `_get_element_text` leaves
    # them out when collecting child text.
    SKIP_ELEMENTS = {
        'script', 'style', 'meta', 'link', 'noscript',
        # IXBRL exclude elements - content that should not appear in final document
        'ix:exclude'
    }

    def __init__(self, config: ParserConfig, strategies: Dict[str, Any]):
        """
        Set up a builder bound to one configuration and strategy set.

        Args:
            config: Parser configuration consulted while building
            strategies: Mapping of strategy name to strategy object
        """
        # Collaborators handed in by the caller
        self.strategies = strategies
        self.config = config

        # Helpers and mutable parse state owned by this builder
        self.style_parser = StyleParser()
        self.context = ParseContext()

        # XBRL bookkeeping: both containers start out empty
        self.xbrl_continuations = {}
        self.xbrl_context_stack = []

    def build(self, tree: HtmlElement) -> DocumentNode:
        """
        Convert a parsed HTML tree into a document node tree.

        Args:
            tree: Parsed HTML tree

        Returns:
            The root node of the built document
        """
        document = DocumentNode()

        # Start from <body> when the tree has one, otherwise from the tree itself
        body = tree.find('.//body')
        content_root = tree if body is None else body

        self._process_element(content_root, document)

        # Optional post-processing pass, controlled by configuration
        if self.config.merge_adjacent_nodes:
            self._merge_adjacent_nodes(document)

        return document

    def _process_element(self, element: HtmlElement, parent: Node) -> Optional[Node]:
        """
        Process HTML element into node.

        Args:
            element: HTML element to process
            parent: Parent node

        Returns:
            Created node or None if skipped
        """

        # Skip certain elements but preserve their tail text
        if element.tag in self.SKIP_ELEMENTS:
            # Process tail text even when skipping element
            if element.tail:
                if self.config.preserve_whitespace:
                    text_node = TextNode(content=element.tail)
                    parent.add_child(text_node)
                else:
                    if element.tail.strip():
                        text_node = TextNode(content=element.tail.strip())
                        parent.add_child(text_node)
            return None

        # Skip page number containers
        if self._is_page_number_container(element):
            return None

        # Skip page break elements
        if self._is_page_break_element(element):
            return None

        # Skip navigation containers that follow page breaks
        if self._is_page_navigation_container(element):
            return None

        # Track parsing depth
        self.context.depth += 1

        try:
            # Handle XBRL elements
            if element.tag.startswith('{'):  # Namespaced element
                self._enter_xbrl_context(element)

            # Extract style
            style = self._extract_style(element)

            # Create appropriate node based on element type
            node = self._create_node_for_element(element, style)

            if node:
                # Add XBRL metadata if in context
                if self.xbrl_context_stack:
                    node.metadata.update(self._get_current_xbrl_metadata())

                # Add to parent
                parent.add_child(node)

                # Process children for container nodes
                if self._should_process_children(element, node):
                    # Add element's direct text first
                    if element.text:
                        if self.config.preserve_whitespace:
                            if element.text:  # Don't strip whitespace
                                text_node = TextNode(content=element.text)
                                node.add_child(text_node)
                        else:
                            if element.text.strip():
                                text_node = TextNode(content=element.text.strip())
                                node.add_child(text_node)

                    # Process child elements
                    for child in element:
                        self._process_element(child, node)

                    # Process text after children
                    if element.tail:
                        if self.config.preserve_whitespace:
                            text_node = TextNode(content=element.tail)
                            parent.add_child(text_node)
                        else:
                            if element.tail.strip():
                                text_node = TextNode(content=element.tail.strip())
                                parent.add_child(text_node)
                            elif element.tail.isspace():
                                # Even if tail is just whitespace, preserve the spacing info
                                # This helps with inline element spacing decisions
                                if hasattr(node, 'set_metadata'):
                                    node.set_metadata('has_tail_whitespace', True)
                else:
                    # Node created but children not processed - still need to handle tail
                    if element.tail:
                        if self.config.preserve_whitespace:
                            text_node = TextNode(content=element.tail)
                            parent.add_child(text_node)
                        else:
                            if element.tail.strip():
                                text_node = TextNode(content=element.tail.strip())
                                parent.add_child(text_node)
                            elif element.tail.isspace():
                                # Even if tail is just whitespace, preserve the spacing info
                                if hasattr(node, 'set_metadata'):
                                    node.set_metadata('has_tail_whitespace', True)
            else:
                # No node created, process children with same parent
                for child in element:
                    self._process_element(child, parent)

                # Process tail text
                if element.tail:
                    if self.config.preserve_whitespace:
                        text_node = TextNode(content=element.tail)
                        parent.add_child(text_node)
                    else:
                        if element.tail.strip():
                            text_node = TextNode(content=element.tail.strip())
                            parent.add_child(text_node)

            # Exit XBRL context
            if element.tag.startswith('{'):
                self._exit_xbrl_context(element)

            return node

        finally:
            self.context.depth -= 1

    def _create_node_for_element(self, element: HtmlElement, style: Style) -> Optional[Node]:
        """Create appropriate node for HTML element."""
        tag = element.tag.lower() if not element.tag.startswith('{') else element.tag


        # Check for heading
        if tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
            level = int(tag[1])
            text = self._get_element_text(element)
            if text:
                return HeadingNode(content=text, level=level, style=style)

        # Handle specific elements first before header detection
        if tag == 'p':
            return ParagraphNode(style=style)

        elif tag == 'li':
            return ListItemNode(style=style)

        # Check if element might be a heading based on style/content
        # Skip header detection for certain tags that should never be headers
        skip_header_detection_tags = {
            'li', 'td', 'th', 'option', 'a', 'button', 'label',
            # IXBRL inline elements - should not be treated as headers
            'ix:nonfraction', 'ix:footnote', 'ix:fraction',
            # IXBRL elements that can contain tables and complex content
            'ix:nonNumeric', 'ix:continuation'
        }
        if tag not in skip_header_detection_tags and self.strategies.get('header_detection'):
            header_info = self.strategies['header_detection'].detect(element, self.context)
            if header_info and header_info.confidence > self.config.header_detection_threshold:
                text = self._get_element_text(element)
                if text:
                    node = HeadingNode(
                        content=text,
                        level=header_info.level,
                        style=style
                    )
                    # Add header metadata
                    node.set_metadata('detection_method', header_info.detection_method)
                    node.set_metadata('confidence', header_info.confidence)
                    if header_info.is_item:
                        node.semantic_type = SemanticType.ITEM_HEADER
                        node.set_metadata('item_number', header_info.item_number)
                    return node

        # Continue handling other specific elements
        if tag == 'table':
            if self.strategies.get('table_processing'):
                return self.strategies['table_processing'].process(element)
            else:
                return self._process_table_basic(element, style)

        elif tag in ['ul', 'ol']:
            return ListNode(ordered=(tag == 'ol'), style=style)

        elif tag == 'li':
            return ListItemNode(style=style)

        elif tag == 'a':
            href = element.get('href', '')
            title = element.get('title', '')
            text = self._get_element_text(element)
            return LinkNode(content=text, href=href, title=title, style=style)

        elif tag == 'img':
            return ImageNode(
                src=element.get('src'),
                alt=element.get('alt'),
                width=self._parse_dimension(element.get('width')),
                height=self._parse_dimension(element.get('height')),
                style=style
            )

        elif tag == 'br':
            # Line break - add as text node
            return TextNode(content='\n')

        elif tag in ['section', 'article']:
            return SectionNode(style=style)

        elif tag == 'div' or tag in self.BLOCK_ELEMENTS:
            # Check if CSS display property makes this inline
            if style.display in ['inline', 'inline-block']:
                # Treat as inline element despite being a div
                text = self._get_element_text(element)
                if text:
                    text_node = TextNode(content=text, style=style)
                    text_node.set_metadata('original_tag', tag)
                    text_node.set_metadata('inline_via_css', True)
                    return text_node
                # If no text but inline, still process children inline
                return ContainerNode(tag_name=tag, style=style)

            # Normal block behavior
            # Check if this is just a text container with only inline elements
            if self._is_text_only_container(element):
                # Create ParagraphNode for divs containing only inline elements
                # This ensures proper text concatenation for spans, etc.
                return ParagraphNode(style=style)
            else:
                return ContainerNode(tag_name=tag, style=style)

        elif tag in self.INLINE_ELEMENTS:
            # Inline elements - extract text and add to parent
            text = self._get_element_text(element)
            if text:
                text_node = TextNode(content=text, style=style)
                # Preserve inline element metadata
                text_node.set_metadata('original_tag', tag)
                return text_node

        elif tag in ['ix:nonNumeric', 'ix:continuation']:
            # IXBRL elements that can contain complex content including tables
            # Process as container to allow proper table parsing
            return ContainerNode(tag_name=tag, style=style)

        # Default: create container for unknown elements
        return ContainerNode(tag_name=tag, style=style)

    def _is_page_number_container(self, element: HtmlElement) -> bool:
        """Detect and filter page number containers across various SEC filing patterns."""
        import re

        # Get text content first - all page numbers should be short
        text_content = element.text_content().strip()

        # Must be short content (1-8 chars to handle "Page X" format)
        if len(text_content) > 8 or len(text_content) == 0:
            return False

        # Must be numeric, roman numerals, or "Page X" format
        if not self._is_page_number_content(text_content):
            return False

        # Check various patterns based on element type and styling
        tag = element.tag.lower()

        # Pattern 1: Oracle-style flexbox containers (highest confidence)
        if tag == 'div' and self._is_flexbox_page_number(element):
            return True

        # Pattern 2: Center/right aligned paragraphs (common pattern)
        if tag == 'p' and self._is_aligned_page_number(element):
            return True

        # Pattern 3: Footer-style divs with centered page numbers
        if tag == 'div' and self._is_footer_page_number(element):
            return True

        # Pattern 4: Simple divs with page break context
        if tag == 'div' and self._is_page_break_context(element):
            return True

        return False

    def _is_page_number_content(self, text: str) -> bool:
        """Check if text content looks like a page number."""
        import re

        # Simple numeric (most common)
        if text.isdigit():
            return True

        # Roman numerals
        if re.match(r'^[ivxlcdm]+$', text.lower()):
            return True

        # "Page X" or "Page X of Y" format
        if re.match(r'^page\s+\d+(\s+of\s+\d+)?$', text.lower()):
            return True

        return False

    def _is_flexbox_page_number(self, element: HtmlElement) -> bool:
        """Detect Oracle-style flexbox page number containers."""
        import re

        style_attr = element.get('style', '')
        if not style_attr:
            return False

        # Must have: display:flex, justify-content:flex-end, min-height:1in
        required_patterns = [
            r'display:\s*flex',
            r'justify-content:\s*flex-end',
            r'min-height:\s*1in'
        ]

        return all(re.search(pattern, style_attr) for pattern in required_patterns)

    def _is_aligned_page_number(self, element: HtmlElement) -> bool:
        """Detect center or right-aligned page number paragraphs."""
        import re

        style_attr = element.get('style', '')

        # Check for center or right alignment
        alignment_pattern = r'text-align:\s*(center|right)'
        if not re.search(alignment_pattern, style_attr):
            return False

        # Optional: check for smaller font size (common in page numbers)
        font_size_pattern = r'font-size:\s*([0-9]+)pt'
        font_match = re.search(font_size_pattern, style_attr)
        if font_match:
            font_size = int(font_match.group(1))
            # Page numbers often use smaller fonts (8-12pt)
            if font_size <= 12:
                return True

        return True  # Any center/right aligned short content

    def _is_footer_page_number(self, element: HtmlElement) -> bool:
        """Detect footer-style page number containers."""
        import re

        style_attr = element.get('style', '')

        # Look for bottom positioning or footer-like styling
        footer_patterns = [
            r'bottom:\s*[0-9]',
            r'position:\s*absolute',
            r'margin-bottom:\s*0',
            r'text-align:\s*center'
        ]

        # Need at least 2 footer indicators
        matches = sum(1 for pattern in footer_patterns if re.search(pattern, style_attr))
        return matches >= 2

    def _is_page_break_context(self, element: HtmlElement) -> bool:
        """Check if element is near page breaks (common page number context)."""

        # Check next sibling for page break HR
        next_elem = element.getnext()
        if next_elem is not None and next_elem.tag == 'hr':
            hr_style = next_elem.get('style', '')
            if 'page-break' in hr_style:
                return True

        # Check if element has page-break styling itself
        style_attr = element.get('style', '')
        if 'page-break' in style_attr:
            return True

        return False

    def _is_page_break_element(self, element: HtmlElement) -> bool:
        """Detect page break HR elements."""
        if element.tag.lower() != 'hr':
            return False

        style_attr = element.get('style', '')

        # Check for page-break-after:always or similar page break styles
        return 'page-break' in style_attr

    def _is_page_navigation_container(self, element: HtmlElement) -> bool:
        """Detect navigation containers that appear after page breaks."""
        if element.tag.lower() != 'div':
            return False

        style_attr = element.get('style', '')

        # Check for navigation container patterns
        # Often have: padding-top, min-height:1in, box-sizing:border-box
        nav_indicators = [
            r'padding-top:\s*0\.5in',
            r'min-height:\s*1in',
            r'box-sizing:\s*border-box'
        ]

        import re
        matches = sum(1 for pattern in nav_indicators if re.search(pattern, style_attr))

        # Need at least 2 indicators
        if matches < 2:
            return False

        # Check if it contains typical navigation content
        text_content = element.text_content().strip().lower()

        # Common navigation phrases
        nav_phrases = [
            'table of contents',
            'index to financial statements',
            'table of content',
            'index to financial statement'
        ]

        return any(phrase in text_content for phrase in nav_phrases)

    def _extract_style(self, element: HtmlElement) -> Style:
        """Extract style from element."""
        style_str = element.get('style', '')
        style = self.style_parser.parse(style_str)

        # Add tag-specific styles
        tag = element.tag.lower()
        if tag == 'b' or tag == 'strong':
            style.font_weight = 'bold'
        elif tag == 'i' or tag == 'em':
            style.font_style = 'italic'
        elif tag == 'u':
            style.text_decoration = 'underline'

        # Handle alignment
        align = element.get('align')
        if align:
            style.text_align = align

        return style

    def _get_element_text(self, element: HtmlElement) -> str:
        """Get text content from element."""
        text_parts = []

        # Get element's direct text
        if element.text:
            # For inline elements, preserve leading/trailing whitespace
            if element.tag.lower() in self.INLINE_ELEMENTS:
                text_parts.append(element.text)
            else:
                text_parts.append(element.text.strip())

        # For simple elements, get all text content
        if element.tag.lower() in self.INLINE_ELEMENTS or \
           element.tag.lower() in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
            # Get all text including from child elements
            for child in element:
                if child.tag.lower() not in self.SKIP_ELEMENTS:
                    child_text = child.text_content()
                    if child_text:
                        # For inline elements, preserve whitespace in child content too
                        if element.tag.lower() in self.INLINE_ELEMENTS:
                            text_parts.append(child_text)
                        else:
                            text_parts.append(child_text.strip())

        # For inline elements with preserved whitespace, concatenate directly
        # For others, join with spaces
        if element.tag.lower() in self.INLINE_ELEMENTS and len(text_parts) == 1:
            return text_parts[0] if text_parts else ''
        else:
            return ' '.join(text_parts)

    def _is_text_only_container(self, element: HtmlElement) -> bool:
        """Check if element contains only text and inline elements."""
        for child in element:
            if child.tag.lower() in self.BLOCK_ELEMENTS:
                return False
            if child.tag.lower() == 'table':
                return False
        return True

    def _should_process_children(self, element: HtmlElement, node: Node) -> bool:
        """
        Decide whether the builder should descend into an element's children.

        Args:
            element: The HTML element the node was created from (not consulted)
            node: The node created for the element

        Returns:
            False for text, heading and table nodes, True for every other node
        """
        # Text and heading nodes are built from already-collected text, and
        # tables are processed separately, so none of them is descended into
        childless_types = (TextNode, HeadingNode, TableNode)
        return not isinstance(node, childless_types)

    def _process_table_basic(self, element: HtmlElement, style: Style) -> TableNode:
        """
        Basic table processing without advanced strategy.

        Rows and cells are read in document order and only when they belong
        to this table: rows and cells of nested tables are left out of the
        outer table's structure (their text is still part of the enclosing
        cell's content).

        Args:
            element: The table element
            style: Style already extracted for the table

        Returns:
            A table node with caption, header cell lists and data rows
        """
        table = TableNode(style=style)

        # Set config for rendering decisions
        table._config = self.config

        # Extract caption
        caption_elem = element.find('.//caption')
        if caption_elem is not None:
            table.caption = caption_elem.text_content().strip()

        def nearest_ancestor(node, tag_name):
            """Return the closest ancestor of node with the given tag, or None."""
            ancestor = node.getparent()
            while ancestor is not None and ancestor.tag != tag_name:
                ancestor = ancestor.getparent()
            return ancestor

        def parse_span(raw_value):
            """Parse a colspan/rowspan value, treating malformed values as 1."""
            try:
                return int(raw_value)
            except (TypeError, ValueError):
                return 1

        for tr in element.iter('tr'):
            # Skip rows that belong to a table nested inside this one
            if nearest_ancestor(tr, element.tag) is not element:
                continue

            cells = []
            has_header_cell = False
            # One pass over td and th keeps the cells in column order
            for td in tr.iter('td', 'th'):
                # Skip cells of rows nested deeper (nested tables)
                if nearest_ancestor(td, tr.tag) is not tr:
                    continue
                is_header_cell = td.tag == 'th'
                has_header_cell = has_header_cell or is_header_cell
                cells.append(Cell(
                    content=td.text_content().strip(),
                    colspan=parse_span(td.get('colspan', '1')),
                    rowspan=parse_span(td.get('rowspan', '1')),
                    is_header=is_header_cell,
                    align=td.get('align')
                ))

            if not cells:
                continue

            row = Row(cells=cells, is_header=has_header_cell)

            # Rows inside <thead>, or containing a <th>, count as headers
            if tr.getparent().tag == 'thead' or has_header_cell:
                table.headers.append(cells)
            else:
                table.rows.append(row)

        return table

    def _parse_dimension(self, value: Optional[str]) -> Optional[int]:
        """Parse dimension value (width/height)."""
        if not value:
            return None

        # Remove 'px' suffix if present
        value = value.strip().rstrip('px')

        try:
            return int(value)
        except ValueError:
            return None

    def _enter_xbrl_context(self, element: HtmlElement):
        """Enter XBRL context."""
        if self.config.extract_xbrl and self.strategies.get('xbrl_extraction'):
            xbrl_data = self.strategies['xbrl_extraction'].extract_context(element)
            if xbrl_data:
                self.xbrl_context_stack.append(xbrl_data)

    def _exit_xbrl_context(self, element: HtmlElement):
        """Exit XBRL context."""
        if self.xbrl_context_stack:
            self.xbrl_context_stack.pop()

    def _get_current_xbrl_metadata(self) -> Dict[str, Any]:
        """Get current XBRL metadata."""
        if not self.xbrl_context_stack:
            return {}

        # Merge all contexts in stack
        metadata = {}
        for context in self.xbrl_context_stack:
            metadata.update(context)

        return metadata

    def _merge_adjacent_nodes(self, root: Node):
        """Merge adjacent text nodes with similar styles."""
        # Implementation would recursively merge adjacent text nodes
        # This is a placeholder for the actual implementation
        pass