"""
Text preprocessing for search.
Provides tokenization and text normalization for BM25 and semantic analysis.
"""
import re
from typing import List, Set
# Common English stopwords (deliberately minimal for financial documents).
# Keeping the list small avoids discarding words that carry meaning in filings
# even though they might be treated as stopwords elsewhere.
STOPWORDS: Set[str] = {
'a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'for',
'from', 'has', 'he', 'in', 'is', 'it', 'its', 'of', 'on',
'that', 'the', 'to', 'was', 'will', 'with'
}
def preprocess_text(text: str,
lowercase: bool = True,
remove_punctuation: bool = False) -> str:
"""
Preprocess text for search.
Args:
text: Raw text
lowercase: Convert to lowercase
remove_punctuation: Remove punctuation (keep for financial data)
Returns:
Preprocessed text
"""
if not text:
return ""
# Normalize whitespace
text = ' '.join(text.split())
# Lowercase (important for BM25 matching)
if lowercase:
text = text.lower()
# Optionally remove punctuation (usually keep for "$5B", "Item 1A", etc.)
if remove_punctuation:
text = re.sub(r'[^\w\s]', ' ', text)
text = ' '.join(text.split()) # Clean up extra spaces
return text
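
# Illustrative behavior of preprocess_text (example strings only; outputs
# assume the default arguments unless shown otherwise):
#   preprocess_text("  Total   Revenue: $5B  ")
#       -> "total revenue: $5b"
#   preprocess_text("Total Revenue: $5B", remove_punctuation=True)
#       -> "total revenue 5b"
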
def tokenize(text: str,
remove_stopwords: bool = False,
min_token_length: int = 2) -> List[str]:
"""
Tokenize text for BM25 indexing.
Args:
text: Text to tokenize
remove_stopwords: Remove common stopwords
min_token_length: Minimum token length to keep
Returns:
List of tokens
"""
if not text:
return []
    # Split on whitespace and punctuation boundaries.
    # Keep alphanumeric characters plus '$' and '%' so financial tokens
    # like "$5b" and "15%" survive intact (word-boundary anchors would strip them).
    tokens = re.findall(r'[\w$%]+', text.lower())
# Filter by length
tokens = [t for t in tokens if len(t) >= min_token_length]
# Optionally remove stopwords
if remove_stopwords:
tokens = [t for t in tokens if t not in STOPWORDS]
return tokens
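
# Illustrative behavior of tokenize, assuming the permissive token pattern above
# (example strings only):
#   tokenize("Revenue grew 15% to $5B in FY2023")
#       -> ['revenue', 'grew', '15%', 'to', '$5b', 'in', 'fy2023']
#   tokenize("Revenue grew 15% to $5B in FY2023", remove_stopwords=True)
#       -> ['revenue', 'grew', '15%', '$5b', 'fy2023']
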
def extract_query_terms(query: str) -> List[str]:
"""
Extract important terms from query for boosting.
Identifies key financial terms, numbers, and important phrases.
Args:
query: Search query
Returns:
List of important query terms
"""
# Tokenize
tokens = tokenize(query, remove_stopwords=True)
# Extract important patterns
important = []
# Financial amounts: $5B, $1.2M, etc.
amounts = re.findall(r'\$[\d,.]+[BMK]?', query, re.IGNORECASE)
important.extend(amounts)
# Percentages: 15%, 3.5%
percentages = re.findall(r'\d+\.?\d*%', query)
important.extend(percentages)
    # Years: 2023, 2024 (non-capturing group so findall returns the full year,
    # not just the "19"/"20" prefix)
    years = re.findall(r'\b(?:19|20)\d{2}\b', query)
important.extend(years)
# Item references: Item 1A, Item 7
items = re.findall(r'item\s+\d+[a-z]?', query, re.IGNORECASE)
important.extend(items)
# Add all tokens
important.extend(tokens)
# Remove duplicates while preserving order
seen = set()
result = []
for term in important:
term_lower = term.lower()
if term_lower not in seen:
seen.add(term_lower)
result.append(term)
return result
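
# Illustrative behavior of extract_query_terms (example query only): pattern
# matches are appended before the plain tokens, and duplicates are collapsed
# case-insensitively while preserving first-seen order.
#   extract_query_terms("Item 7 revenue growth in 2023")
#       -> ['2023', 'Item 7', 'item', 'revenue', 'growth']
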
def normalize_financial_term(term: str) -> str:
"""
Normalize financial terms for consistent matching.
Examples:
"$5 billion" -> "$5b"
"5,000,000" -> "5000000"
"Item 1A" -> "item1a"
Args:
term: Financial term
Returns:
Normalized term
"""
term = term.lower().strip()
# Remove commas from numbers
term = term.replace(',', '')
# Normalize billion/million/thousand
term = re.sub(r'\s*billion\b', 'b', term)
term = re.sub(r'\s*million\b', 'm', term)
term = re.sub(r'\s*thousand\b', 'k', term)
# Remove spaces in compound terms
term = re.sub(r'(item|section|part)\s+(\d+[a-z]?)', r'\1\2', term)
# Remove extra whitespace
term = ' '.join(term.split())
return term
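
# Illustrative behavior of normalize_financial_term (example terms only):
#   normalize_financial_term("$5 Billion")   -> "$5b"
#   normalize_financial_term("Item 1A")      -> "item1a"
#   normalize_financial_term("1,250,000")    -> "1250000"
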
def get_ngrams(tokens: List[str], n: int = 2) -> List[str]:
"""
Generate n-grams from tokens.
Useful for phrase matching in BM25.
Args:
tokens: List of tokens
n: N-gram size
Returns:
List of n-grams as strings
"""
if len(tokens) < n:
return []
ngrams = []
for i in range(len(tokens) - n + 1):
ngram = ' '.join(tokens[i:i + n])
ngrams.append(ngram)
return ngrams
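

if __name__ == "__main__":
    # Minimal usage sketch showing how the helpers compose for BM25 indexing.
    # The sample sentence and query below are illustrative only, not test fixtures.
    sample = "Net revenue grew 15% to $5.2 billion in fiscal 2023 (see Item 7)."
    clean = preprocess_text(sample)
    tokens = tokenize(clean, remove_stopwords=True)
    bigrams = get_ngrams(tokens, n=2)
    print("tokens: ", tokens)
    print("bigrams:", bigrams)
    print("query terms:", extract_query_terms("What drove revenue growth in 2023?"))
    print("normalized:", normalize_financial_term("$5.2 Billion"))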