""" Semantic scoring for document structure awareness. Provides structure-based boosting without ML/embeddings: - Node type importance (headings, tables, XBRL) - Cross-reference detection (gateway content) - Section importance - Text quality signals This is NOT embedding-based semantic search. It's structure-aware ranking that helps agents find investigation starting points. """ import re from typing import List, Dict, Optional, TYPE_CHECKING if TYPE_CHECKING: from edgar.documents.nodes import Node from edgar.documents.types import NodeType, SemanticType # Gateway terms that indicate summary/overview content GATEWAY_TERMS = [ 'summary', 'overview', 'introduction', 'highlights', 'key points', 'executive summary', 'in summary', 'table of contents', 'index' ] # Cross-reference patterns CROSS_REFERENCE_PATTERNS = [ r'\bsee\s+item\s+\d+[a-z]?\b', # "See Item 1A" r'\bsee\s+(?:part|section)\s+\d+\b', # "See Part II" r'\brefer\s+to\s+item\s+\d+[a-z]?\b', # "Refer to Item 7" r'\bas\s+discussed\s+in\s+item\s+\d+\b', # "As discussed in Item 1" r'\bfor\s+(?:more|additional)\s+information\b', # "For more information" ] # Section importance weights SECTION_IMPORTANCE = { 'risk factors': 1.5, 'management discussion': 1.4, 'md&a': 1.4, 'business': 1.3, 'financial statements': 1.2, 'controls and procedures': 1.2, } def compute_semantic_scores(nodes: List['Node'], query: str, boost_sections: Optional[List[str]] = None) -> Dict[int, float]: """ Compute semantic/structure scores for nodes. This provides structure-aware boosting based on: 1. Node type (headings > tables > paragraphs) 2. Cross-references (gateway content) 3. Section importance 4. Gateway terms (summaries, overviews) 5. XBRL presence 6. Text quality Args: nodes: Nodes to score query: Search query (for context-aware boosting) boost_sections: Additional sections to boost Returns: Dictionary mapping node id to semantic score (0-1 range) """ scores = {} boost_sections = boost_sections or [] # Get query context query_lower = query.lower() is_item_query = bool(re.search(r'item\s+\d+[a-z]?', query_lower)) for node in nodes: score = 0.0 # 1. Node Type Boosting score += _get_node_type_boost(node) # 2. Semantic Type Boosting score += _get_semantic_type_boost(node) # 3. Cross-Reference Detection (gateway content) score += _detect_cross_references(node) # 4. Gateway Content Detection score += _detect_gateway_content(node, query_lower) # 5. Section Importance Boosting score += _get_section_boost(node, boost_sections) # 6. XBRL Fact Boosting (for financial queries) score += _get_xbrl_boost(node) # 7. Text Quality Signals score += _get_quality_boost(node) # 8. Query-Specific Boosting if is_item_query: score += _get_item_header_boost(node) # Normalize to 0-1 range (max possible score is ~7.0) normalized_score = min(score / 7.0, 1.0) scores[id(node)] = normalized_score return scores def _get_node_type_boost(node: 'Node') -> float: """ Boost based on node type. Headings and structural elements are more important for navigation. """ type_boosts = { NodeType.HEADING: 2.0, # Headings are key navigation points NodeType.SECTION: 1.5, # Section markers NodeType.TABLE: 1.0, # Tables contain structured data NodeType.XBRL_FACT: 0.8, # Financial facts NodeType.LIST: 0.5, # Lists NodeType.PARAGRAPH: 0.3, # Regular text NodeType.TEXT: 0.1, # Plain text nodes } return type_boosts.get(node.type, 0.0) def _get_semantic_type_boost(node: 'Node') -> float: """ Boost based on semantic type. Section headers and items are important for SEC filings. """ if not hasattr(node, 'semantic_type') or node.semantic_type is None: return 0.0 semantic_boosts = { SemanticType.ITEM_HEADER: 2.0, # Item headers are critical SemanticType.SECTION_HEADER: 1.5, # Section headers SemanticType.FINANCIAL_STATEMENT: 1.2, # Financial statements SemanticType.TABLE_OF_CONTENTS: 1.0, # TOC is a gateway SemanticType.TITLE: 0.8, SemanticType.HEADER: 0.6, } return semantic_boosts.get(node.semantic_type, 0.0) def _detect_cross_references(node: 'Node') -> float: """ Detect cross-references that indicate gateway content. Content that points to other sections is useful for navigation. """ text = node.text() if hasattr(node, 'text') else '' if not text: return 0.0 text_lower = text.lower() # Check each pattern matches = 0 for pattern in CROSS_REFERENCE_PATTERNS: if re.search(pattern, text_lower): matches += 1 # Boost increases with number of cross-references return min(matches * 0.5, 1.5) # Cap at 1.5 def _detect_gateway_content(node: 'Node', query_lower: str) -> float: """ Detect gateway content (summaries, overviews, introductions). These are excellent starting points for investigation. """ text = node.text() if hasattr(node, 'text') else '' if not text: return 0.0 text_lower = text.lower() # Check for gateway terms in text for term in GATEWAY_TERMS: if term in text_lower: return 1.0 # Check if this is an introductory paragraph (first ~200 chars) if len(text) < 200 and len(text) > 20: # Short intro paragraphs are often summaries if any(word in text_lower for word in ['provides', 'describes', 'includes', 'contains']): return 0.5 return 0.0 def _get_section_boost(node: 'Node', boost_sections: List[str]) -> float: """ Boost nodes in important sections. Some SEC sections are more relevant for certain queries. """ # Try to determine section from node or ancestors section_name = _get_node_section(node) if not section_name: return 0.0 section_lower = section_name.lower() # Check built-in importance for key, boost in SECTION_IMPORTANCE.items(): if key in section_lower: return boost # Check user-specified sections for boost_section in boost_sections: if boost_section.lower() in section_lower: return 1.5 return 0.0 def _get_xbrl_boost(node: 'Node') -> float: """ Boost XBRL facts and tables with XBRL data. Financial data is important for financial queries. """ if node.type == NodeType.XBRL_FACT: return 0.8 # Check if table contains XBRL facts if node.type == NodeType.TABLE: # Check metadata for XBRL indicator if hasattr(node, 'metadata') and node.metadata.get('has_xbrl'): return 0.6 return 0.0 def _get_quality_boost(node: 'Node') -> float: """ Boost based on text quality signals. Higher quality content tends to be more useful: - Appropriate length (not too short, not too long) - Good structure (sentences, punctuation) - Substantive content (not just formatting) """ text = node.text() if hasattr(node, 'text') else '' if not text: return 0.0 score = 0.0 # Length signal text_len = len(text) if 50 <= text_len <= 1000: score += 0.3 # Good length elif text_len > 1000: score += 0.1 # Long but might be comprehensive else: score += 0.0 # Too short, likely not substantive # Sentence structure sentence_count = text.count('.') + text.count('?') + text.count('!') if sentence_count >= 2: score += 0.2 # Multiple sentences indicate substantive content # Avoid pure formatting/navigation if text.strip() in ['...', '—', '-', 'Table of Contents', 'Page', '']: return 0.0 # Skip pure formatting return score def _get_item_header_boost(node: 'Node') -> float: """ Boost Item headers when query is about items. "Item 1A" queries should prioritize Item 1A headers. """ if node.type != NodeType.HEADING: return 0.0 text = node.text() if hasattr(node, 'text') else '' if not text: return 0.0 # Check if this is an Item header if re.match(r'^\s*item\s+\d+[a-z]?[:\.\s]', text, re.IGNORECASE): return 1.5 return 0.0 def _get_node_section(node: 'Node') -> Optional[str]: """ Get section name for a node by walking up the tree. Returns: Section name if found, None otherwise """ # Check if node has section in metadata if hasattr(node, 'metadata') and 'section' in node.metadata: return node.metadata['section'] # Walk up tree looking for section marker current = node while current: if hasattr(current, 'semantic_type'): if current.semantic_type in (SemanticType.SECTION_HEADER, SemanticType.ITEM_HEADER): return current.text() if hasattr(current, 'text') else None current = current.parent if hasattr(current, 'parent') else None return None def get_section_importance_names() -> List[str]: """ Get list of important section names for reference. Returns: List of section names with built-in importance boosts """ return list(SECTION_IMPORTANCE.keys())