Files
2025-12-09 12:13:01 +01:00

334 lines
9.4 KiB
Python

"""
Semantic scoring for document structure awareness.
Provides structure-based boosting without ML/embeddings:
- Node type importance (headings, tables, XBRL)
- Cross-reference detection (gateway content)
- Section importance
- Text quality signals
This is NOT embedding-based semantic search. It's structure-aware ranking
that helps agents find investigation starting points.
"""
import re
from typing import List, Dict, Optional, TYPE_CHECKING
if TYPE_CHECKING:
from edgar.documents.nodes import Node
from edgar.documents.types import NodeType, SemanticType
# Phrases that mark a node as summary/overview ("gateway") content.
# _detect_gateway_content looks for these as plain substrings of the
# lowercased node text, so every entry is written in lowercase.
GATEWAY_TERMS = [
    'summary',
    'overview',
    'introduction',
    'highlights',
    'key points',
    'executive summary',
    'in summary',
    'table of contents',
    'index',
]
# Regular expressions for phrases that point the reader to another part
# of the filing. _detect_cross_references runs them with re.search
# against lowercased text, so the patterns only need lowercase letters.
CROSS_REFERENCE_PATTERNS = [
    # "See Item 1A"
    r'\bsee\s+item\s+\d+[a-z]?\b',
    # "See Part II"
    r'\bsee\s+(?:part|section)\s+\d+\b',
    # "Refer to Item 7"
    r'\brefer\s+to\s+item\s+\d+[a-z]?\b',
    # "As discussed in Item 1"
    r'\bas\s+discussed\s+in\s+item\s+\d+\b',
    # "For more information"
    r'\bfor\s+(?:more|additional)\s+information\b',
]
# Boost applied to nodes that sit inside well-known filing sections.
# Keys are lowercase fragments matched as substrings of the lowercased
# section name; _get_section_boost returns the weight of the first key
# that matches, so the insertion order below is significant.
SECTION_IMPORTANCE = {
    'risk factors': 1.5,
    'management discussion': 1.4,
    'md&a': 1.4,
    'business': 1.3,
    'financial statements': 1.2,
    'controls and procedures': 1.2,
}
def compute_semantic_scores(nodes: List['Node'],
                            query: str,
                            boost_sections: Optional[List[str]] = None) -> Dict[int, float]:
    """
    Rank nodes by structural signals instead of embeddings.

    Every node receives the sum of these independent boosts:
    1. Node type
    2. Semantic type
    3. Cross-references
    4. Gateway terms (summaries, overviews)
    5. Section importance
    6. XBRL presence
    7. Text quality
    8. Item-header boost, only when the query mentions "item <number>"

    The sum is divided by 7.0 and capped at 1.0, so every node whose raw
    total exceeds 7.0 saturates at the same top score.

    Args:
        nodes: Nodes to score
        query: Search query; lowercased before it is inspected
        boost_sections: Extra section names to boost (None means none)

    Returns:
        Dictionary mapping ``id(node)`` to a score no greater than 1.0
    """
    extra_sections = boost_sections if boost_sections else []

    lowered_query = query.lower()
    mentions_item = re.search(r'item\s+\d+[a-z]?', lowered_query) is not None

    result = {}
    for node in nodes:
        # Collect each signal in a fixed order.
        contributions = [
            _get_node_type_boost(node),                     # node type
            _get_semantic_type_boost(node),                 # semantic type
            _detect_cross_references(node),                 # cross-references
            _detect_gateway_content(node, lowered_query),   # gateway content
            _get_section_boost(node, extra_sections),       # section importance
            _get_xbrl_boost(node),                          # XBRL facts
            _get_quality_boost(node),                       # text quality
        ]
        if mentions_item:
            # Query-specific signal: favour "Item N" headings.
            contributions.append(_get_item_header_boost(node))

        # Add left to right with plain += so float rounding is that of a
        # simple sequential sum.
        raw_total = 0.0
        for part in contributions:
            raw_total += part

        result[id(node)] = min(raw_total / 7.0, 1.0)

    return result
def _get_node_type_boost(node: 'Node') -> float:
    """
    Return the boost associated with the node's structural type.

    Headings and section markers rank highest because they are the
    natural navigation points; plain text ranks lowest. A type that is
    not listed contributes nothing.

    Args:
        node: Node whose ``type`` attribute is looked up

    Returns:
        Boost for ``node.type``, or 0.0 for an unlisted type
    """
    boost_by_type = {
        NodeType.HEADING: 2.0,     # key navigation points
        NodeType.SECTION: 1.5,     # section markers
        NodeType.TABLE: 1.0,       # structured data
        NodeType.XBRL_FACT: 0.8,   # financial facts
        NodeType.LIST: 0.5,        # lists
        NodeType.PARAGRAPH: 0.3,   # regular text
        NodeType.TEXT: 0.1,        # plain text nodes
    }
    kind = node.type
    try:
        return boost_by_type[kind]
    except KeyError:
        return 0.0
def _get_semantic_type_boost(node: 'Node') -> float:
"""
Boost based on semantic type.
Section headers and items are important for SEC filings.
"""
if not hasattr(node, 'semantic_type') or node.semantic_type is None:
return 0.0
semantic_boosts = {
SemanticType.ITEM_HEADER: 2.0, # Item headers are critical
SemanticType.SECTION_HEADER: 1.5, # Section headers
SemanticType.FINANCIAL_STATEMENT: 1.2, # Financial statements
SemanticType.TABLE_OF_CONTENTS: 1.0, # TOC is a gateway
SemanticType.TITLE: 0.8,
SemanticType.HEADER: 0.6,
}
return semantic_boosts.get(node.semantic_type, 0.0)
def _detect_cross_references(node: 'Node') -> float:
"""
Detect cross-references that indicate gateway content.
Content that points to other sections is useful for navigation.
"""
text = node.text() if hasattr(node, 'text') else ''
if not text:
return 0.0
text_lower = text.lower()
# Check each pattern
matches = 0
for pattern in CROSS_REFERENCE_PATTERNS:
if re.search(pattern, text_lower):
matches += 1
# Boost increases with number of cross-references
return min(matches * 0.5, 1.5) # Cap at 1.5
def _detect_gateway_content(node: 'Node', query_lower: str) -> float:
"""
Detect gateway content (summaries, overviews, introductions).
These are excellent starting points for investigation.
"""
text = node.text() if hasattr(node, 'text') else ''
if not text:
return 0.0
text_lower = text.lower()
# Check for gateway terms in text
for term in GATEWAY_TERMS:
if term in text_lower:
return 1.0
# Check if this is an introductory paragraph (first ~200 chars)
if len(text) < 200 and len(text) > 20:
# Short intro paragraphs are often summaries
if any(word in text_lower for word in ['provides', 'describes', 'includes', 'contains']):
return 0.5
return 0.0
def _get_section_boost(node: 'Node', boost_sections: List[str]) -> float:
    """
    Boost a node according to the section it belongs to.

    The section name comes from _get_node_section. Built-in weights in
    SECTION_IMPORTANCE are consulted first; caller-supplied sections are
    only considered when no built-in key matches.

    Args:
        node: Node to locate within the document
        boost_sections: Caller-supplied section names (matched
            case-insensitively as substrings of the section name)

    Returns:
        The built-in weight of the first matching key, 1.5 for a
        caller-supplied match, or 0.0 when nothing matches or the
        section is unknown
    """
    section = _get_node_section(node)
    if not section:
        return 0.0

    haystack = section.lower()

    # Built-in weights take precedence, in dictionary order.
    for keyword, weight in SECTION_IMPORTANCE.items():
        if keyword in haystack:
            return weight

    # Fall back to sections the caller asked for.
    if any(requested.lower() in haystack for requested in boost_sections):
        return 1.5

    return 0.0
def _get_xbrl_boost(node: 'Node') -> float:
    """
    Boost XBRL facts and tables flagged as containing XBRL data.

    Args:
        node: Node whose ``type`` (and, for tables, ``metadata``) is read

    Returns:
        0.8 for an XBRL fact node, 0.6 for a table whose metadata has a
        truthy ``has_xbrl`` entry, otherwise 0.0
    """
    if node.type == NodeType.XBRL_FACT:
        return 0.8

    # Tables only qualify when their metadata carries the XBRL marker.
    flagged_table = (
        node.type == NodeType.TABLE
        and hasattr(node, 'metadata')
        and node.metadata.get('has_xbrl')
    )
    return 0.6 if flagged_table else 0.0
def _get_quality_boost(node: 'Node') -> float:
"""
Boost based on text quality signals.
Higher quality content tends to be more useful:
- Appropriate length (not too short, not too long)
- Good structure (sentences, punctuation)
- Substantive content (not just formatting)
"""
text = node.text() if hasattr(node, 'text') else ''
if not text:
return 0.0
score = 0.0
# Length signal
text_len = len(text)
if 50 <= text_len <= 1000:
score += 0.3 # Good length
elif text_len > 1000:
score += 0.1 # Long but might be comprehensive
else:
score += 0.0 # Too short, likely not substantive
# Sentence structure
sentence_count = text.count('.') + text.count('?') + text.count('!')
if sentence_count >= 2:
score += 0.2 # Multiple sentences indicate substantive content
# Avoid pure formatting/navigation
if text.strip() in ['...', '', '-', 'Table of Contents', 'Page', '']:
return 0.0 # Skip pure formatting
return score
def _get_item_header_boost(node: 'Node') -> float:
    """
    Boost "Item N" headings when the query is about items.

    Only heading nodes are considered. The heading text must start
    (after optional whitespace) with "item", a number and an optional
    single letter, e.g. "Item 1A", "Item 7. MD&A" or "ITEM 1A: Risk".
    The boost is the same for every item heading; it does not depend on
    which item the query named.

    Args:
        node: Node to test

    Returns:
        1.5 for a heading that starts with an item designator, else 0.0
    """
    if node.type != NodeType.HEADING:
        return 0.0

    text = node.text() if hasattr(node, 'text') else ''
    if not text:
        return 0.0

    # A word boundary ends the designator, not a mandatory ':', '.' or
    # whitespace character. That way a heading that is exactly "Item 1A"
    # (nothing after the designator), or one followed by other
    # punctuation such as a dash, is still recognised, while "Item 1AB"
    # is not.
    if re.match(r'^\s*item\s+\d+[a-z]?\b', text, re.IGNORECASE):
        return 1.5

    return 0.0
def _get_node_section(node: 'Node') -> Optional[str]:
"""
Get section name for a node by walking up the tree.
Returns:
Section name if found, None otherwise
"""
# Check if node has section in metadata
if hasattr(node, 'metadata') and 'section' in node.metadata:
return node.metadata['section']
# Walk up tree looking for section marker
current = node
while current:
if hasattr(current, 'semantic_type'):
if current.semantic_type in (SemanticType.SECTION_HEADER, SemanticType.ITEM_HEADER):
return current.text() if hasattr(current, 'text') else None
current = current.parent if hasattr(current, 'parent') else None
return None
def get_section_importance_names() -> List[str]:
    """
    List the section names that carry a built-in importance boost.

    Returns:
        A new list of the SECTION_IMPORTANCE keys, in dictionary order
    """
    return [name for name in SECTION_IMPORTANCE]