"""
|
|
Semantic scoring for document structure awareness.
|
|
|
|
Provides structure-based boosting without ML/embeddings:
|
|
- Node type importance (headings, tables, XBRL)
|
|
- Cross-reference detection (gateway content)
|
|
- Section importance
|
|
- Text quality signals
|
|
|
|
This is NOT embedding-based semantic search. It's structure-aware ranking
|
|
that helps agents find investigation starting points.
|
|
"""
|
|
|
|
import re
|
|
from typing import List, Dict, Optional, TYPE_CHECKING
|
|
|
|
if TYPE_CHECKING:
|
|
from edgar.documents.nodes import Node
|
|
|
|
from edgar.documents.types import NodeType, SemanticType
|
|
|
|
|
|
# Gateway terms that indicate summary/overview content
|
|
GATEWAY_TERMS = [
|
|
'summary', 'overview', 'introduction', 'highlights',
|
|
'key points', 'executive summary', 'in summary',
|
|
'table of contents', 'index'
|
|
]
|
|
|
|
# Cross-reference patterns
CROSS_REFERENCE_PATTERNS = [
    r'\bsee\s+item\s+\d+[a-z]?\b',                   # "See Item 1A"
    r'\bsee\s+(?:part|section)\s+(?:\d+|[ivx]+)\b',  # "See Part II", "See Section 5"
    r'\brefer\s+to\s+item\s+\d+[a-z]?\b',            # "Refer to Item 7"
    r'\bas\s+discussed\s+in\s+item\s+\d+\b',         # "As discussed in Item 1"
    r'\bfor\s+(?:more|additional)\s+information\b',  # "For more information"
]

# Section importance weights
SECTION_IMPORTANCE = {
    'risk factors': 1.5,
    'management discussion': 1.4,
    'md&a': 1.4,
    'business': 1.3,
    'financial statements': 1.2,
    'controls and procedures': 1.2,
}


def compute_semantic_scores(nodes: List['Node'],
                            query: str,
                            boost_sections: Optional[List[str]] = None) -> Dict[int, float]:
    """
    Compute semantic/structure scores for nodes.

    This provides structure-aware boosting based on:
    1. Node type and semantic type (headings > tables > paragraphs)
    2. Cross-references (gateway content)
    3. Section importance
    4. Gateway terms (summaries, overviews)
    5. XBRL presence
    6. Text quality
    7. Item headers, when the query targets a specific Item

    Args:
        nodes: Nodes to score
        query: Search query (for context-aware boosting)
        boost_sections: Additional sections to boost

    Returns:
        Dictionary mapping node id to semantic score (0-1 range)
    """
    scores = {}
    boost_sections = boost_sections or []

    # Get query context
    query_lower = query.lower()
    is_item_query = bool(re.search(r'item\s+\d+[a-z]?', query_lower))

    for node in nodes:
        score = 0.0

        # 1. Node Type Boosting
        score += _get_node_type_boost(node)

        # 2. Semantic Type Boosting
        score += _get_semantic_type_boost(node)

        # 3. Cross-Reference Detection (gateway content)
        score += _detect_cross_references(node)

        # 4. Gateway Content Detection
        score += _detect_gateway_content(node, query_lower)

        # 5. Section Importance Boosting
        score += _get_section_boost(node, boost_sections)

        # 6. XBRL Fact Boosting (for financial queries)
        score += _get_xbrl_boost(node)

        # 7. Text Quality Signals
        score += _get_quality_boost(node)

        # 8. Query-Specific Boosting
        if is_item_query:
            score += _get_item_header_boost(node)

        # Normalize to 0-1 range (7.0 is a practical ceiling; higher scores clamp to 1.0)
        normalized_score = min(score / 7.0, 1.0)

        scores[id(node)] = normalized_score

    return scores

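# Worked example of the scoring above (values taken from the boost tables in this
# module): a HEADING node recognized as an ITEM_HEADER inside "Risk Factors" scores
# 2.0 (node type) + 2.0 (semantic type) + 1.5 (section) = 5.5, which normalizes to
# min(5.5 / 7.0, 1.0), roughly 0.79. For an item-style query it also receives the
# 1.5 item-header boost and reaches 1.0.
#
# Usage sketch (illustrative only; `document_nodes` stands in for whatever node list
# the caller already has, and combining with any lexical score is up to the caller):
#
#     scores = compute_semantic_scores(document_nodes, query='Item 1A risk factors',
#                                      boost_sections=['legal proceedings'])
#     ranked = sorted(document_nodes, key=lambda n: scores.get(id(n), 0.0), reverse=True)
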
def _get_node_type_boost(node: 'Node') -> float:
    """
    Boost based on node type.

    Headings and structural elements are more important for navigation.
    """
    type_boosts = {
        NodeType.HEADING: 2.0,    # Headings are key navigation points
        NodeType.SECTION: 1.5,    # Section markers
        NodeType.TABLE: 1.0,      # Tables contain structured data
        NodeType.XBRL_FACT: 0.8,  # Financial facts
        NodeType.LIST: 0.5,       # Lists
        NodeType.PARAGRAPH: 0.3,  # Regular text
        NodeType.TEXT: 0.1,       # Plain text nodes
    }

    return type_boosts.get(node.type, 0.0)

def _get_semantic_type_boost(node: 'Node') -> float:
    """
    Boost based on semantic type.

    Section headers and items are important for SEC filings.
    """
    if not hasattr(node, 'semantic_type') or node.semantic_type is None:
        return 0.0

    semantic_boosts = {
        SemanticType.ITEM_HEADER: 2.0,          # Item headers are critical
        SemanticType.SECTION_HEADER: 1.5,       # Section headers
        SemanticType.FINANCIAL_STATEMENT: 1.2,  # Financial statements
        SemanticType.TABLE_OF_CONTENTS: 1.0,    # TOC is a gateway
        SemanticType.TITLE: 0.8,
        SemanticType.HEADER: 0.6,
    }

    return semantic_boosts.get(node.semantic_type, 0.0)

def _detect_cross_references(node: 'Node') -> float:
    """
    Detect cross-references that indicate gateway content.

    Content that points to other sections is useful for navigation.
    """
    text = node.text() if hasattr(node, 'text') else ''
    if not text:
        return 0.0

    text_lower = text.lower()

    # Check each pattern
    matches = 0
    for pattern in CROSS_REFERENCE_PATTERNS:
        if re.search(pattern, text_lower):
            matches += 1

    # Boost increases with number of cross-references
    return min(matches * 0.5, 1.5)  # Cap at 1.5

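# Example: "See Item 1A. For more information, refer to Item 7." matches three of the
# patterns above (see-item, for-more-information, refer-to-item), so the boost is
# min(3 * 0.5, 1.5) = 1.5.
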
def _detect_gateway_content(node: 'Node', query_lower: str) -> float:
    """
    Detect gateway content (summaries, overviews, introductions).

    These are excellent starting points for investigation.
    """
    text = node.text() if hasattr(node, 'text') else ''
    if not text:
        return 0.0

    text_lower = text.lower()

    # Check for gateway terms in text
    for term in GATEWAY_TERMS:
        if term in text_lower:
            return 1.0

    # Check if this looks like a short introductory paragraph (20-200 chars)
    if 20 < len(text) < 200:
        # Short intro paragraphs are often summaries
        if any(word in text_lower for word in ['provides', 'describes', 'includes', 'contains']):
            return 0.5

    return 0.0

def _get_section_boost(node: 'Node', boost_sections: List[str]) -> float:
    """
    Boost nodes in important sections.

    Some SEC sections are more relevant for certain queries.
    """
    # Try to determine section from node or ancestors
    section_name = _get_node_section(node)
    if not section_name:
        return 0.0

    section_lower = section_name.lower()

    # Check built-in importance
    for key, boost in SECTION_IMPORTANCE.items():
        if key in section_lower:
            return boost

    # Check user-specified sections
    for boost_section in boost_sections:
        if boost_section.lower() in section_lower:
            return 1.5

    return 0.0

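# Example: a node whose enclosing section resolves to "Item 7. Management Discussion
# and Analysis" matches the 'management discussion' key in SECTION_IMPORTANCE and gets
# 1.4, while a node in a caller-supplied boost_section (e.g. 'legal proceedings') gets
# a flat 1.5. Built-in importance is checked before the caller's list.
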
def _get_xbrl_boost(node: 'Node') -> float:
    """
    Boost XBRL facts and tables with XBRL data.

    Financial data is important for financial queries.
    """
    if node.type == NodeType.XBRL_FACT:
        return 0.8

    # Check if table contains XBRL facts
    if node.type == NodeType.TABLE:
        # Check metadata for XBRL indicator
        if hasattr(node, 'metadata') and node.metadata.get('has_xbrl'):
            return 0.6

    return 0.0

def _get_quality_boost(node: 'Node') -> float:
    """
    Boost based on text quality signals.

    Higher quality content tends to be more useful:
    - Appropriate length (not too short, not too long)
    - Good structure (sentences, punctuation)
    - Substantive content (not just formatting)
    """
    text = node.text() if hasattr(node, 'text') else ''
    if not text:
        return 0.0

    score = 0.0

    # Length signal
    text_len = len(text)
    if 50 <= text_len <= 1000:
        score += 0.3  # Good length
    elif text_len > 1000:
        score += 0.1  # Long but might be comprehensive
    else:
        score += 0.0  # Too short, likely not substantive

    # Sentence structure
    sentence_count = text.count('.') + text.count('?') + text.count('!')
    if sentence_count >= 2:
        score += 0.2  # Multiple sentences indicate substantive content

    # Avoid pure formatting/navigation
    if text.strip() in ['...', '—', '-', 'Table of Contents', 'Page', '']:
        return 0.0  # Skip pure formatting

    return score

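# Example: a 300-character paragraph containing three sentences gets 0.3 (length) +
# 0.2 (sentence structure) = 0.5, the maximum quality boost; a bare "Table of Contents"
# line falls through to the final formatting check and returns 0.0.
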
def _get_item_header_boost(node: 'Node') -> float:
    """
    Boost Item headers when query is about items.

    Item-specific queries (e.g., "Item 1A") should prioritize Item headers.
    """
    if node.type != NodeType.HEADING:
        return 0.0

    text = node.text() if hasattr(node, 'text') else ''
    if not text:
        return 0.0

    # Check if this is an Item header
    if re.match(r'^\s*item\s+\d+[a-z]?[:\.\s]', text, re.IGNORECASE):
        return 1.5

    return 0.0

def _get_node_section(node: 'Node') -> Optional[str]:
    """
    Get section name for a node by walking up the tree.

    Returns:
        Section name if found, None otherwise
    """
    # Check if node has section in metadata
    if hasattr(node, 'metadata') and 'section' in node.metadata:
        return node.metadata['section']

    # Walk up tree looking for section marker
    current = node
    while current:
        if hasattr(current, 'semantic_type'):
            if current.semantic_type in (SemanticType.SECTION_HEADER, SemanticType.ITEM_HEADER):
                return current.text() if hasattr(current, 'text') else None

        current = current.parent if hasattr(current, 'parent') else None

    return None

def get_section_importance_names() -> List[str]:
    """
    Get list of important section names for reference.

    Returns:
        List of section names with built-in importance boosts
    """
    return list(SECTION_IMPORTANCE.keys())

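if __name__ == '__main__':
    # Minimal demo sketch. _DemoNode is a stand-in defined only here so the demo can
    # run without a parsed filing; it is not part of edgar, and real callers pass
    # nodes produced by the edgar document parser.
    class _DemoNode:
        def __init__(self, node_type, semantic_type, content, parent=None):
            self.type = node_type
            self.semantic_type = semantic_type
            self._content = content
            self.metadata = {}
            self.parent = parent

        def text(self) -> str:
            return self._content

    heading = _DemoNode(NodeType.HEADING, SemanticType.ITEM_HEADER, 'Item 1A. Risk Factors')
    paragraph = _DemoNode(NodeType.PARAGRAPH, None,
                          'This section provides an overview of the principal risks. '
                          'See Item 7 for related discussion.',
                          parent=heading)

    demo_scores = compute_semantic_scores([heading, paragraph], query='Item 1A risk factors')
    for demo_node in (heading, paragraph):
        # The heading scores higher than the paragraph: it collects the node-type,
        # semantic-type, section, and item-header boosts.
        print(f'{demo_node.text()[:40]!r}: {demo_scores[id(demo_node)]:.2f}')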