Initial commit

2025-12-09 12:13:01 +01:00
commit 8e654ed209
13332 changed files with 2695056 additions and 0 deletions
--- a/venv/lib/python3.10/site-packages/edgar/documents/ranking/semantic.py
+++ b/venv/lib/python3.10/site-packages/edgar/documents/ranking/semantic.py
@@ -0,0 +1,333 @@
+"""
+Semantic scoring for document structure awareness.
+
+Provides structure-based boosting without ML/embeddings:
+- Node type importance (headings, tables, XBRL)
+- Cross-reference detection (gateway content)
+- Section importance
+- Text quality signals
+
+This is NOT embedding-based semantic search. It's structure-aware ranking
+that helps agents find investigation starting points.
+"""
+
+import re
+from typing import List, Dict, Optional, TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from edgar.documents.nodes import Node
+
+from edgar.documents.types import NodeType, SemanticType
+
+
+# Gateway terms that indicate summary/overview content.
+# They are matched as plain substrings of the lower-cased node text.
+GATEWAY_TERMS = [
+    'summary', 'overview', 'introduction', 'highlights',
+    'key points', 'executive summary', 'in summary',
+    'table of contents', 'index'
+]
+
+# Cross-reference patterns.
+# Written in lower case: they are searched against lower-cased text
+# without re.IGNORECASE.
+CROSS_REFERENCE_PATTERNS = [
+    r'\bsee\s+item\s+\d+[a-z]?\b',                    # "See Item 1A"
+    # Parts are numbered with Roman numerals, so accept digits or i/v/x runs.
+    r'\bsee\s+(?:part|section)\s+(?:\d+|[ivx]+)\b',   # "See Part II", "See Section 3"
+    r'\brefer\s+to\s+item\s+\d+[a-z]?\b',             # "Refer to Item 7"
+    # Optional letter suffix so that "Item 1A" matches like the patterns above.
+    r'\bas\s+discussed\s+in\s+item\s+\d+[a-z]?\b',    # "As discussed in Item 1A"
+    r'\bfor\s+(?:more|additional)\s+information\b',   # "For more information"
+]
+
+# Section importance weights.
+# Keys are matched as substrings of the lower-cased section name; the first
+# matching key (in insertion order) wins.
+SECTION_IMPORTANCE = {
+    'risk factors': 1.5,
+    'management discussion': 1.4,
+    # Possessive spellings, with a straight and a typographic apostrophe:
+    # 'management discussion' is not a substring of "management's discussion".
+    "management's discussion": 1.4,
+    'management\u2019s discussion': 1.4,
+    'md&a': 1.4,
+    'business': 1.3,
+    'financial statements': 1.2,
+    'controls and procedures': 1.2,
+}
+
+
+def compute_semantic_scores(nodes: List['Node'],
+                           query: str,
+                           boost_sections: Optional[List[str]] = None) -> Dict[int, float]:
+    """
+    Score every node from structural signals and return normalized totals.
+
+    For each node the following boosts are added, in this order:
+    node type, semantic type, cross-references, gateway content,
+    section importance, XBRL, and text quality. When the query mentions
+    an item ("item <number>"), an item-header boost is added as well.
+    The raw total is divided by 7.0 and capped at 1.0.
+
+    Args:
+        nodes: Nodes to score
+        query: Search query; it is lower-cased before use
+        boost_sections: Optional extra section names passed on to the
+            section boost (treated as empty when omitted)
+
+    Returns:
+        Dictionary keyed by ``id(node)`` holding each node's normalized
+        score (never above 1.0)
+    """
+    extra_sections = boost_sections or []
+
+    lowered_query = query.lower()
+    # Decided once per call: does the query refer to a numbered item?
+    item_query = re.search(r'item\s+\d+[a-z]?', lowered_query) is not None
+
+    result = {}
+    for candidate in nodes:
+        # Individual signals, evaluated in a fixed order.
+        components = [
+            _get_node_type_boost(candidate),
+            _get_semantic_type_boost(candidate),
+            _detect_cross_references(candidate),
+            _detect_gateway_content(candidate, lowered_query),
+            _get_section_boost(candidate, extra_sections),
+            _get_xbrl_boost(candidate),
+            _get_quality_boost(candidate),
+        ]
+        if item_query:
+            components.append(_get_item_header_boost(candidate))
+
+        raw_total = 0.0
+        for part in components:
+            raw_total += part
+
+        # 7.0 is the normalization divisor; larger raw totals clamp to 1.0.
+        result[id(candidate)] = min(raw_total / 7.0, 1.0)
+
+    return result
+
+
+def _get_node_type_boost(node: 'Node') -> float:
+    """
+    Return the weight assigned to the node's type.
+
+    Headings and section markers carry the largest weights; a type that
+    has no entry in the table yields 0.0.
+    """
+    weight_by_type = {
+        NodeType.HEADING: 2.0,
+        NodeType.SECTION: 1.5,
+        NodeType.TABLE: 1.0,
+        NodeType.XBRL_FACT: 0.8,
+        NodeType.LIST: 0.5,
+        NodeType.PARAGRAPH: 0.3,
+        NodeType.TEXT: 0.1,
+    }
+
+    kind = node.type
+    if kind in weight_by_type:
+        return weight_by_type[kind]
+    return 0.0
+
+
+def _get_semantic_type_boost(node: 'Node') -> float:
+    """
+    Return the weight assigned to the node's semantic type.
+
+    Nodes without a ``semantic_type`` attribute, with it set to None, or
+    with a semantic type missing from the table yield 0.0.
+    """
+    semantic = getattr(node, 'semantic_type', None)
+    if semantic is None:
+        return 0.0
+
+    weight_by_semantic = {
+        SemanticType.ITEM_HEADER: 2.0,
+        SemanticType.SECTION_HEADER: 1.5,
+        SemanticType.FINANCIAL_STATEMENT: 1.2,
+        SemanticType.TABLE_OF_CONTENTS: 1.0,
+        SemanticType.TITLE: 0.8,
+        SemanticType.HEADER: 0.6,
+    }
+    return weight_by_semantic.get(semantic, 0.0)
+
+
+def _detect_cross_references(node: 'Node') -> float:
+    """
+    Score how many cross-reference patterns appear in the node's text.
+
+    Each pattern in CROSS_REFERENCE_PATTERNS that matches the lower-cased
+    text contributes 0.5 (a pattern counts once, however often it occurs);
+    the result is capped at 1.5. Nodes without text yield 0.0.
+    """
+    if not hasattr(node, 'text'):
+        return 0.0
+
+    raw = node.text()
+    if not raw:
+        return 0.0
+
+    lowered = raw.lower()
+    matched = [pattern for pattern in CROSS_REFERENCE_PATTERNS
+               if re.search(pattern, lowered)]
+
+    return min(len(matched) * 0.5, 1.5)
+
+
+def _detect_gateway_content(node: 'Node', query_lower: str) -> float:
+    """
+    Score gateway content such as summaries, overviews and introductions.
+
+    Returns 1.0 when the lower-cased text contains any GATEWAY_TERMS entry,
+    0.5 when the text is between 21 and 199 characters long and contains
+    one of the words 'provides', 'describes', 'includes' or 'contains',
+    and 0.0 otherwise (including for nodes without text).
+
+    ``query_lower`` is accepted but not used by this function.
+    """
+    content = node.text() if hasattr(node, 'text') else ''
+    if not content:
+        return 0.0
+
+    lowered = content.lower()
+
+    # Any gateway term anywhere in the text is enough for the full boost.
+    if any(term in lowered for term in GATEWAY_TERMS):
+        return 1.0
+
+    # Short passages using descriptive verbs get a smaller boost.
+    descriptive_words = ['provides', 'describes', 'includes', 'contains']
+    if 20 < len(content) < 200 and any(word in lowered for word in descriptive_words):
+        return 0.5
+
+    return 0.0
+
+
+def _get_section_boost(node: 'Node', boost_sections: List[str]) -> float:
+    """
+    Return the boost for the section a node belongs to.
+
+    The section name comes from _get_node_section. Built-in weights from
+    SECTION_IMPORTANCE are checked first (first key contained in the
+    lower-cased section name wins); otherwise any caller-supplied section
+    contained in the name yields 1.5. No section or no match yields 0.0.
+    """
+    name = _get_node_section(node)
+    if not name:
+        return 0.0
+
+    lowered = name.lower()
+
+    # Built-in weights take precedence over caller-supplied sections.
+    for keyword, weight in SECTION_IMPORTANCE.items():
+        if keyword in lowered:
+            return weight
+
+    if any(extra.lower() in lowered for extra in boost_sections):
+        return 1.5
+
+    return 0.0
+
+
+def _get_xbrl_boost(node: 'Node') -> float:
+    """
+    Return the boost for XBRL-related nodes.
+
+    XBRL fact nodes get 0.8. Table nodes whose metadata has a truthy
+    'has_xbrl' entry get 0.6. Everything else gets 0.0.
+    """
+    kind = node.type
+    if kind == NodeType.XBRL_FACT:
+        return 0.8
+
+    # Only tables are inspected for the metadata flag.
+    flagged_table = (
+        kind == NodeType.TABLE
+        and hasattr(node, 'metadata')
+        and node.metadata.get('has_xbrl')
+    )
+    return 0.6 if flagged_table else 0.0
+
+
+def _get_quality_boost(node: 'Node') -> float:
+    """
+    Score simple text-quality signals for a node.
+
+    The score is the sum of:
+    - a length signal: 0.3 for 50-1000 characters, 0.1 for longer text,
+      nothing for shorter text
+    - a sentence signal: 0.2 when the text contains at least two of the
+      characters '.', '?' and '!'
+
+    Nodes without text, and text that is only formatting/navigation
+    (after stripping whitespace), score 0.0.
+    """
+    content = node.text() if hasattr(node, 'text') else ''
+    if not content:
+        return 0.0
+
+    # Pure formatting/navigation text never scores, whatever its shape.
+    if content.strip() in ['...', '—', '-', 'Table of Contents', 'Page', '']:
+        return 0.0
+
+    size = len(content)
+    if size < 50:
+        length_signal = 0.0   # too short to be substantive
+    elif size <= 1000:
+        length_signal = 0.3   # good length
+    else:
+        length_signal = 0.1   # long, but possibly comprehensive
+
+    # Count sentence-ending punctuation marks.
+    terminators = sum(content.count(mark) for mark in ('.', '?', '!'))
+    sentence_signal = 0.2 if terminators >= 2 else 0.0
+
+    return length_signal + sentence_signal
+
+
+def _get_item_header_boost(node: 'Node') -> float:
+    """
+    Boost Item headers when the query is about items.
+
+    Any heading whose text starts with "Item <number>" (optionally followed
+    by a single letter, e.g. "Item 1A") receives the boost. The item number
+    is not compared with the query here.
+
+    Args:
+        node: Node to inspect; only NodeType.HEADING nodes can be boosted
+
+    Returns:
+        1.5 for an Item-style heading, 0.0 otherwise
+    """
+    if node.type != NodeType.HEADING:
+        return 0.0
+
+    text = node.text() if hasattr(node, 'text') else ''
+    if not text:
+        return 0.0
+
+    # The item label must end at a non-alphanumeric character or at the end
+    # of the text. Requiring a trailing ':', '.' or whitespace missed
+    # headings whose whole text is just "Item 1A".
+    if re.match(r'^\s*item\s+\d+[a-z]?(?![a-z0-9])', text, re.IGNORECASE):
+        return 1.5
+
+    return 0.0
+
+
+def _get_node_section(node: 'Node') -> Optional[str]:
+    """
+    Get the section name for a node.
+
+    The node's own metadata is consulted first: if it has a 'section'
+    entry, that value is returned. Otherwise the node and then its
+    ancestors (via ``parent``) are inspected, and the text of the first one
+    whose semantic type is a section header or item header is returned.
+
+    Args:
+        node: Node whose section should be determined
+
+    Returns:
+        Section name if found, None otherwise
+    """
+    # metadata may be absent or None; only a non-empty mapping can hold
+    # a 'section' entry, so anything falsy is skipped instead of raising.
+    metadata = getattr(node, 'metadata', None)
+    if metadata and 'section' in metadata:
+        return metadata['section']
+
+    # Walk up the tree looking for a section marker. Compare against None
+    # explicitly so that a node that happens to be falsy (e.g. one that
+    # defines __len__) does not end the walk early.
+    current = node
+    while current is not None:
+        if hasattr(current, 'semantic_type'):
+            if current.semantic_type in (SemanticType.SECTION_HEADER, SemanticType.ITEM_HEADER):
+                return current.text() if hasattr(current, 'text') else None
+
+        current = getattr(current, 'parent', None)
+
+    return None
+
+
+def get_section_importance_names() -> List[str]:
+    """
+    List the section names that carry a built-in importance boost.
+
+    Returns:
+        A new list with the keys of SECTION_IMPORTANCE, in their
+        definition order
+    """
+    return [section_name for section_name in SECTION_IMPORTANCE]