"""
|
|
Semantic scoring for document structure awareness.
|
|
|
|
Provides structure-based boosting without ML/embeddings:
|
|
- Node type importance (headings, tables, XBRL)
|
|
- Cross-reference detection (gateway content)
|
|
- Section importance
|
|
- Text quality signals
|
|
|
|
This is NOT embedding-based semantic search. It's structure-aware ranking
|
|
that helps agents find investigation starting points.
|
|
"""
|
|
|
|
import re
|
|
from typing import List, Dict, Optional, TYPE_CHECKING
|
|
|
|
if TYPE_CHECKING:
|
|
from edgar.documents.nodes import Node
|
|
|
|
from edgar.documents.types import NodeType, SemanticType
|
|
|
|
|
|
# Gateway terms that indicate summary/overview content
|
|
GATEWAY_TERMS = [
|
|
'summary', 'overview', 'introduction', 'highlights',
|
|
'key points', 'executive summary', 'in summary',
|
|
'table of contents', 'index'
|
|
]
|
|
|
|
# Cross-reference patterns
CROSS_REFERENCE_PATTERNS = [
    r'\bsee\s+item\s+\d+[a-z]?\b',                   # "See Item 1A"
    r'\bsee\s+(?:part|section)\s+(?:\d+|[ivx]+)\b',  # "See Part II", "See Section 5"
    r'\brefer\s+to\s+item\s+\d+[a-z]?\b',            # "Refer to Item 7"
    r'\bas\s+discussed\s+in\s+item\s+\d+\b',         # "As discussed in Item 1"
    r'\bfor\s+(?:more|additional)\s+information\b',  # "For more information"
]

# Section importance weights
SECTION_IMPORTANCE = {
    'risk factors': 1.5,
    'management discussion': 1.4,
    'md&a': 1.4,
    'business': 1.3,
    'financial statements': 1.2,
    'controls and procedures': 1.2,
}


def compute_semantic_scores(nodes: List['Node'],
                            query: str,
                            boost_sections: Optional[List[str]] = None) -> Dict[int, float]:
    """
    Compute semantic/structure scores for nodes.

    This provides structure-aware boosting based on:
    1. Node type and semantic type (headings > tables > paragraphs)
    2. Cross-references (gateway content)
    3. Section importance
    4. Gateway terms (summaries, overviews)
    5. XBRL presence
    6. Text quality
    7. Item headers, when the query targets a specific Item

    Args:
        nodes: Nodes to score
        query: Search query (for context-aware boosting)
        boost_sections: Additional sections to boost

    Returns:
        Dictionary mapping node id to semantic score (0-1 range)
    """
    scores = {}
    boost_sections = boost_sections or []

    # Get query context
    query_lower = query.lower()
    is_item_query = bool(re.search(r'item\s+\d+[a-z]?', query_lower))

    for node in nodes:
        score = 0.0

        # 1. Node Type Boosting
        score += _get_node_type_boost(node)

        # 2. Semantic Type Boosting
        score += _get_semantic_type_boost(node)

        # 3. Cross-Reference Detection (gateway content)
        score += _detect_cross_references(node)

        # 4. Gateway Content Detection
        score += _detect_gateway_content(node, query_lower)

        # 5. Section Importance Boosting
        score += _get_section_boost(node, boost_sections)

        # 6. XBRL Fact Boosting (for financial queries)
        score += _get_xbrl_boost(node)

        # 7. Text Quality Signals
        score += _get_quality_boost(node)

        # 8. Query-Specific Boosting
        if is_item_query:
            score += _get_item_header_boost(node)

        # Normalize to 0-1 range (7.0 is a practical ceiling; higher scores clamp to 1.0)
        normalized_score = min(score / 7.0, 1.0)

        scores[id(node)] = normalized_score

    return scores

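# Worked example of the scoring above (values taken from the boost tables in this
# module): a HEADING node recognized as an ITEM_HEADER inside "Risk Factors" scores
# 2.0 (node type) + 2.0 (semantic type) + 1.5 (section) = 5.5, which normalizes to
# min(5.5 / 7.0, 1.0), roughly 0.79. For an item-style query it also receives the
# 1.5 item-header boost and reaches 1.0.
#
# Usage sketch (illustrative only; `document_nodes` stands in for whatever node list
# the caller already has, and combining with any lexical score is up to the caller):
#
#     scores = compute_semantic_scores(document_nodes, query='Item 1A risk factors',
#                                      boost_sections=['legal proceedings'])
#     ranked = sorted(document_nodes, key=lambda n: scores.get(id(n), 0.0), reverse=True)
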
def _get_node_type_boost(node: 'Node') -> float:
    """
    Boost based on node type.

    Headings and structural elements are more important for navigation.
    """
    type_boosts = {
        NodeType.HEADING: 2.0,    # Headings are key navigation points
        NodeType.SECTION: 1.5,    # Section markers
        NodeType.TABLE: 1.0,      # Tables contain structured data
        NodeType.XBRL_FACT: 0.8,  # Financial facts
        NodeType.LIST: 0.5,       # Lists
        NodeType.PARAGRAPH: 0.3,  # Regular text
        NodeType.TEXT: 0.1,       # Plain text nodes
    }

    return type_boosts.get(node.type, 0.0)

def _get_semantic_type_boost(node: 'Node') -> float:
    """
    Boost based on semantic type.

    Section headers and items are important for SEC filings.
    """
    if not hasattr(node, 'semantic_type') or node.semantic_type is None:
        return 0.0

    semantic_boosts = {
        SemanticType.ITEM_HEADER: 2.0,          # Item headers are critical
        SemanticType.SECTION_HEADER: 1.5,       # Section headers
        SemanticType.FINANCIAL_STATEMENT: 1.2,  # Financial statements
        SemanticType.TABLE_OF_CONTENTS: 1.0,    # TOC is a gateway
        SemanticType.TITLE: 0.8,
        SemanticType.HEADER: 0.6,
    }

    return semantic_boosts.get(node.semantic_type, 0.0)

def _detect_cross_references(node: 'Node') -> float:
    """
    Detect cross-references that indicate gateway content.

    Content that points to other sections is useful for navigation.
    """
    text = node.text() if hasattr(node, 'text') else ''
    if not text:
        return 0.0

    text_lower = text.lower()

    # Check each pattern
    matches = 0
    for pattern in CROSS_REFERENCE_PATTERNS:
        if re.search(pattern, text_lower):
            matches += 1

    # Boost increases with number of cross-references
    return min(matches * 0.5, 1.5)  # Cap at 1.5

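# Example: "See Item 1A. For more information, refer to Item 7." matches three of the
# patterns above (see-item, for-more-information, refer-to-item), so the boost is
# min(3 * 0.5, 1.5) = 1.5.
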
def _detect_gateway_content(node: 'Node', query_lower: str) -> float:
    """
    Detect gateway content (summaries, overviews, introductions).

    These are excellent starting points for investigation.
    """
    text = node.text() if hasattr(node, 'text') else ''
    if not text:
        return 0.0

    text_lower = text.lower()

    # Check for gateway terms in text
    for term in GATEWAY_TERMS:
        if term in text_lower:
            return 1.0

    # Check if this looks like a short introductory paragraph (20-200 chars)
    if 20 < len(text) < 200:
        # Short intro paragraphs are often summaries
        if any(word in text_lower for word in ['provides', 'describes', 'includes', 'contains']):
            return 0.5

    return 0.0

def _get_section_boost(node: 'Node', boost_sections: List[str]) -> float:
    """
    Boost nodes in important sections.

    Some SEC sections are more relevant for certain queries.
    """
    # Try to determine section from node or ancestors
    section_name = _get_node_section(node)
    if not section_name:
        return 0.0

    section_lower = section_name.lower()

    # Check built-in importance
    for key, boost in SECTION_IMPORTANCE.items():
        if key in section_lower:
            return boost

    # Check user-specified sections
    for boost_section in boost_sections:
        if boost_section.lower() in section_lower:
            return 1.5

    return 0.0

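# Example: a node whose enclosing section resolves to "Item 7. Management Discussion
# and Analysis" matches the 'management discussion' key in SECTION_IMPORTANCE and gets
# 1.4, while a node in a caller-supplied boost_section (e.g. 'legal proceedings') gets
# a flat 1.5. Built-in importance is checked before the caller's list.
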
def _get_xbrl_boost(node: 'Node') -> float:
    """
    Boost XBRL facts and tables with XBRL data.

    Financial data is important for financial queries.
    """
    if node.type == NodeType.XBRL_FACT:
        return 0.8

    # Check if table contains XBRL facts
    if node.type == NodeType.TABLE:
        # Check metadata for XBRL indicator
        if hasattr(node, 'metadata') and node.metadata.get('has_xbrl'):
            return 0.6

    return 0.0

def _get_quality_boost(node: 'Node') -> float:
    """
    Boost based on text quality signals.

    Higher quality content tends to be more useful:
    - Appropriate length (not too short, not too long)
    - Good structure (sentences, punctuation)
    - Substantive content (not just formatting)
    """
    text = node.text() if hasattr(node, 'text') else ''
    if not text:
        return 0.0

    score = 0.0

    # Length signal
    text_len = len(text)
    if 50 <= text_len <= 1000:
        score += 0.3  # Good length
    elif text_len > 1000:
        score += 0.1  # Long but might be comprehensive
    else:
        score += 0.0  # Too short, likely not substantive

    # Sentence structure
    sentence_count = text.count('.') + text.count('?') + text.count('!')
    if sentence_count >= 2:
        score += 0.2  # Multiple sentences indicate substantive content

    # Avoid pure formatting/navigation
    if text.strip() in ['...', '—', '-', 'Table of Contents', 'Page', '']:
        return 0.0  # Skip pure formatting

    return score

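# Example: a 300-character paragraph containing three sentences gets 0.3 (length) +
# 0.2 (sentence structure) = 0.5, the maximum quality boost; a bare "Table of Contents"
# line falls through to the final formatting check and returns 0.0.
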
def _get_item_header_boost(node: 'Node') -> float:
    """
    Boost Item headers when query is about items.

    Item-specific queries (e.g., "Item 1A") should prioritize Item headers.
    """
    if node.type != NodeType.HEADING:
        return 0.0

    text = node.text() if hasattr(node, 'text') else ''
    if not text:
        return 0.0

    # Check if this is an Item header
    if re.match(r'^\s*item\s+\d+[a-z]?[:\.\s]', text, re.IGNORECASE):
        return 1.5

    return 0.0

def _get_node_section(node: 'Node') -> Optional[str]:
    """
    Get section name for a node by walking up the tree.

    Returns:
        Section name if found, None otherwise
    """
    # Check if node has section in metadata
    if hasattr(node, 'metadata') and 'section' in node.metadata:
        return node.metadata['section']

    # Walk up tree looking for section marker
    current = node
    while current:
        if hasattr(current, 'semantic_type'):
            if current.semantic_type in (SemanticType.SECTION_HEADER, SemanticType.ITEM_HEADER):
                return current.text() if hasattr(current, 'text') else None

        current = current.parent if hasattr(current, 'parent') else None

    return None

def get_section_importance_names() -> List[str]:
    """
    Get list of important section names for reference.

    Returns:
        List of section names with built-in importance boosts
    """
    return list(SECTION_IMPORTANCE.keys())

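if __name__ == '__main__':
    # Minimal demo sketch. _DemoNode is a stand-in defined only here so the demo can
    # run without a parsed filing; it is not part of edgar, and real callers pass
    # nodes produced by the edgar document parser.
    class _DemoNode:
        def __init__(self, node_type, semantic_type, content, parent=None):
            self.type = node_type
            self.semantic_type = semantic_type
            self._content = content
            self.metadata = {}
            self.parent = parent

        def text(self) -> str:
            return self._content

    heading = _DemoNode(NodeType.HEADING, SemanticType.ITEM_HEADER, 'Item 1A. Risk Factors')
    paragraph = _DemoNode(NodeType.PARAGRAPH, None,
                          'This section provides an overview of the principal risks. '
                          'See Item 7 for related discussion.',
                          parent=heading)

    demo_scores = compute_semantic_scores([heading, paragraph], query='Item 1A risk factors')
    for demo_node in (heading, paragraph):
        # The heading scores higher than the paragraph: it collects the node-type,
        # semantic-type, section, and item-header boosts.
        print(f'{demo_node.text()[:40]!r}: {demo_scores[id(demo_node)]:.2f}')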