"""
Text preprocessing for search.
Provides tokenization and text normalization for BM25 and semantic analysis.
"""
import re
from typing import List, Set
# Common English stopwords (deliberately minimal for financial documents).
# Keeping the list small avoids discarding words that carry meaning in filings
# even though they might be treated as stopwords elsewhere.
STOPWORDS: Set[str] = {
'a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'for',
'from', 'has', 'he', 'in', 'is', 'it', 'its', 'of', 'on',
'that', 'the', 'to', 'was', 'will', 'with'
}
def preprocess_text(text: str,
lowercase: bool = True,
remove_punctuation: bool = False) -> str:
"""
Preprocess text for search.
Args:
text: Raw text
lowercase: Convert to lowercase
remove_punctuation: Remove punctuation (keep for financial data)
Returns:
Preprocessed text
"""
if not text:
return ""
# Normalize whitespace
text = ' '.join(text.split())
# Lowercase (important for BM25 matching)
if lowercase:
text = text.lower()
# Optionally remove punctuation (usually keep for "$5B", "Item 1A", etc.)
if remove_punctuation:
text = re.sub(r'[^\w\s]', ' ', text)
text = ' '.join(text.split()) # Clean up extra spaces
return text
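
# Illustrative behavior of preprocess_text (example strings only; outputs
# assume the default arguments unless shown otherwise):
#   preprocess_text("  Total   Revenue: $5B  ")
#       -> "total revenue: $5b"
#   preprocess_text("Total Revenue: $5B", remove_punctuation=True)
#       -> "total revenue 5b"
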
def tokenize(text: str,
remove_stopwords: bool = False,
min_token_length: int = 2) -> List[str]:
"""
Tokenize text for BM25 indexing.
Args:
text: Text to tokenize
remove_stopwords: Remove common stopwords
min_token_length: Minimum token length to keep
Returns:
List of tokens
"""
if not text:
return []
    # Split on whitespace and punctuation boundaries.
    # Keep alphanumeric characters plus '$' and '%' so financial tokens
    # like "$5b" and "15%" survive intact (word-boundary anchors would strip them).
    tokens = re.findall(r'[\w$%]+', text.lower())
# Filter by length
tokens = [t for t in tokens if len(t) >= min_token_length]
# Optionally remove stopwords
if remove_stopwords:
tokens = [t for t in tokens if t not in STOPWORDS]
return tokens
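
# Illustrative behavior of tokenize, assuming the permissive token pattern above
# (example strings only):
#   tokenize("Revenue grew 15% to $5B in FY2023")
#       -> ['revenue', 'grew', '15%', 'to', '$5b', 'in', 'fy2023']
#   tokenize("Revenue grew 15% to $5B in FY2023", remove_stopwords=True)
#       -> ['revenue', 'grew', '15%', '$5b', 'fy2023']
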
def extract_query_terms(query: str) -> List[str]:
"""
Extract important terms from query for boosting.
Identifies key financial terms, numbers, and important phrases.
Args:
query: Search query
Returns:
List of important query terms
"""
# Tokenize
tokens = tokenize(query, remove_stopwords=True)
# Extract important patterns
important = []
# Financial amounts: $5B, $1.2M, etc.
amounts = re.findall(r'\$[\d,.]+[BMK]?', query, re.IGNORECASE)
important.extend(amounts)
# Percentages: 15%, 3.5%
percentages = re.findall(r'\d+\.?\d*%', query)
important.extend(percentages)
    # Years: 2023, 2024 (non-capturing group so findall returns the full year,
    # not just the "19"/"20" prefix)
    years = re.findall(r'\b(?:19|20)\d{2}\b', query)
important.extend(years)
# Item references: Item 1A, Item 7
items = re.findall(r'item\s+\d+[a-z]?', query, re.IGNORECASE)
important.extend(items)
# Add all tokens
important.extend(tokens)
# Remove duplicates while preserving order
seen = set()
result = []
for term in important:
term_lower = term.lower()
if term_lower not in seen:
seen.add(term_lower)
result.append(term)
return result
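
# Illustrative behavior of extract_query_terms (example query only): pattern
# matches are appended before the plain tokens, and duplicates are collapsed
# case-insensitively while preserving first-seen order.
#   extract_query_terms("Item 7 revenue growth in 2023")
#       -> ['2023', 'Item 7', 'item', 'revenue', 'growth']
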
def normalize_financial_term(term: str) -> str:
"""
Normalize financial terms for consistent matching.
Examples:
"$5 billion" -> "$5b"
"5,000,000" -> "5000000"
"Item 1A" -> "item1a"
Args:
term: Financial term
Returns:
Normalized term
"""
term = term.lower().strip()
# Remove commas from numbers
term = term.replace(',', '')
# Normalize billion/million/thousand
term = re.sub(r'\s*billion\b', 'b', term)
term = re.sub(r'\s*million\b', 'm', term)
term = re.sub(r'\s*thousand\b', 'k', term)
# Remove spaces in compound terms
term = re.sub(r'(item|section|part)\s+(\d+[a-z]?)', r'\1\2', term)
# Remove extra whitespace
term = ' '.join(term.split())
return term
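
# Illustrative behavior of normalize_financial_term (example terms only):
#   normalize_financial_term("$5 Billion")   -> "$5b"
#   normalize_financial_term("Item 1A")      -> "item1a"
#   normalize_financial_term("1,250,000")    -> "1250000"
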
def get_ngrams(tokens: List[str], n: int = 2) -> List[str]:
"""
Generate n-grams from tokens.
Useful for phrase matching in BM25.
Args:
tokens: List of tokens
n: N-gram size
Returns:
List of n-grams as strings
"""
if len(tokens) < n:
return []
ngrams = []
for i in range(len(tokens) - n + 1):
ngram = ' '.join(tokens[i:i + n])
ngrams.append(ngram)
return ngrams
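

if __name__ == "__main__":
    # Minimal usage sketch showing how the helpers compose for BM25 indexing.
    # The sample sentence and query below are illustrative only, not test fixtures.
    sample = "Net revenue grew 15% to $5.2 billion in fiscal 2023 (see Item 7)."
    clean = preprocess_text(sample)
    tokens = tokenize(clean, remove_stopwords=True)
    bigrams = get_ngrams(tokens, n=2)
    print("tokens: ", tokens)
    print("bigrams:", bigrams)
    print("query terms:", extract_query_terms("What drove revenue growth in 2023?"))
    print("normalized:", normalize_financial_term("$5.2 Billion"))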