"""
|
|
Ranking engines for document search.
|
|
|
|
Provides BM25-based ranking with optional semantic structure boosting.
|
|
"""
|
|
|
|
from abc import ABC, abstractmethod
|
|
from dataclasses import dataclass, field
|
|
from enum import Enum, auto
|
|
from typing import List, Optional, Dict, Any, TYPE_CHECKING
|
|
|
|
from rank_bm25 import BM25Okapi
|
|
|
|
if TYPE_CHECKING:
|
|
from edgar.documents.nodes import Node
|
|
|
|
|
|
class RankingAlgorithm(Enum):
    """Supported ranking algorithms."""
    BM25 = auto()      # Classic BM25 (Okapi variant)
    HYBRID = auto()    # BM25 + semantic structure boosting
    SEMANTIC = auto()  # Pure structure-aware scoring


@dataclass
class RankedResult:
    """
    A search result with ranking score.

    Attributes:
        node: Document node containing the match
        score: Relevance score (higher is better)
        rank: Position in results (1-indexed)
        text: Matched text content
        bm25_score: Raw BM25 score (if applicable)
        semantic_score: Semantic boost score (if applicable)
        metadata: Additional result metadata
    """
    node: 'Node'
    score: float
    rank: int
    text: str
    bm25_score: Optional[float] = None
    semantic_score: Optional[float] = None
    metadata: Dict[str, Any] = field(default_factory=dict)

    @property
    def snippet(self) -> str:
        """Get text snippet (first 200 chars)."""
        if len(self.text) <= 200:
            return self.text
        return self.text[:197] + "..."


class RankingEngine(ABC):
    """Abstract base class for ranking engines."""

    @abstractmethod
    def rank(self, query: str, nodes: List['Node']) -> List[RankedResult]:
        """
        Rank nodes by relevance to query.

        Args:
            query: Search query
            nodes: Nodes to rank

        Returns:
            List of ranked results sorted by relevance (best first)
        """
        pass

    @abstractmethod
    def get_algorithm_name(self) -> str:
        """Get name of ranking algorithm."""
        pass


class BM25Engine(RankingEngine):
    """
    BM25 ranking engine using the Okapi variant.

    BM25 is a probabilistic retrieval function that ranks documents based on
    query term frequency and inverse document frequency. It is well suited to
    financial documents, where exact term matching is important.

    Parameters:
        k1: Term frequency saturation parameter (default: 1.5).
            Controls how quickly the impact of term frequency plateaus.
        b: Length normalization parameter (default: 0.75).
            0 = no normalization, 1 = full normalization.
    """

    def __init__(self, k1: float = 1.5, b: float = 0.75):
        """
        Initialize BM25 engine.

        Args:
            k1: Term frequency saturation (1.2-2.0 typical)
            b: Length normalization (0.75 is standard)
        """
        self.k1 = k1
        self.b = b
        self._bm25: Optional[BM25Okapi] = None
        self._corpus_nodes: Optional[List['Node']] = None
        self._tokenized_corpus: Optional[List[List[str]]] = None

    def rank(self, query: str, nodes: List['Node']) -> List[RankedResult]:
        """
        Rank nodes using BM25 algorithm.

        Args:
            query: Search query
            nodes: Nodes to rank

        Returns:
            Ranked results sorted by BM25 score
        """
        if not nodes:
            return []

        # Import preprocessing here to avoid circular dependency
        from edgar.documents.ranking.preprocessing import preprocess_text, tokenize

        # Build index if needed or if nodes changed
        if self._corpus_nodes != nodes:
            self._build_index(nodes)

        # Tokenize and preprocess query
        query_tokens = tokenize(preprocess_text(query))

        if not query_tokens:
            return []

        # Get BM25 scores
        scores = self._bm25.get_scores(query_tokens)

        # Create ranked results; only include nodes with positive scores
        results = []
        for node, score in zip(nodes, scores):
            if score > 0:
                text = node.text() if hasattr(node, 'text') else str(node)
                results.append(RankedResult(
                    node=node,
                    score=float(score),
                    rank=0,  # Will be set after sorting
                    text=text,
                    bm25_score=float(score),
                    metadata={'algorithm': 'BM25'}
                ))

        # Sort by score (highest first) and assign ranks
        results.sort(key=lambda r: r.score, reverse=True)
        for rank, result in enumerate(results, start=1):
            result.rank = rank

        return results

    def _build_index(self, nodes: List['Node']):
        """Build BM25 index from nodes."""
        from edgar.documents.ranking.preprocessing import preprocess_text, tokenize

        # Store corpus
        self._corpus_nodes = nodes

        # Tokenize all nodes
        self._tokenized_corpus = []
        for node in nodes:
            text = node.text() if hasattr(node, 'text') else str(node)
            processed = preprocess_text(text)
            tokens = tokenize(processed)
            self._tokenized_corpus.append(tokens)

        # Build BM25 index with custom parameters
        self._bm25 = BM25Okapi(
            self._tokenized_corpus,
            k1=self.k1,
            b=self.b
        )

    def get_index_data(self) -> Dict[str, Any]:
        """
        Serialize index data for caching.

        Returns:
            Dictionary with serializable index data
        """
        return {
            'tokenized_corpus': self._tokenized_corpus,
            'k1': self.k1,
            'b': self.b,
            'algorithm': 'BM25'
        }

    def load_index_data(self, index_data: Dict[str, Any], nodes: List['Node']) -> None:
        """
        Load index from cached data.

        Args:
            index_data: Serialized index data
            nodes: Nodes corresponding to the index
        """
        self._corpus_nodes = nodes
        self._tokenized_corpus = index_data['tokenized_corpus']
        self.k1 = index_data['k1']
        self.b = index_data['b']

        # Rebuild BM25 index from tokenized corpus
        self._bm25 = BM25Okapi(
            self._tokenized_corpus,
            k1=self.k1,
            b=self.b
        )
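
    # Cache round-trip sketch: get_index_data() captures the tokenized corpus
    # and parameters so a later process can skip re-tokenization, provided it
    # passes the same nodes back in. Illustrative only; `nodes` stands in for
    # whatever node list the original index was built from.
    #
    #   cached = engine.get_index_data()
    #   restored = BM25Engine()
    #   restored.load_index_data(cached, nodes)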

    def get_algorithm_name(self) -> str:
        """Get algorithm name."""
        return "BM25"


class HybridEngine(RankingEngine):
    """
    Hybrid ranking engine: BM25 + semantic structure boosting.

    Combines classic BM25 text matching with semantic structure awareness:
    - BM25 provides strong exact-match ranking for financial terms
    - Semantic scoring boosts results based on document structure:
        * Headings and section markers
        * Cross-references ("See Item X")
        * Gateway content (summaries, overviews)
        * Table and XBRL importance

    This approach is agent-friendly: it surfaces starting points for
    investigation rather than fragmented chunks.

    Parameters:
        bm25_weight: Weight for BM25 score (default: 0.8)
        semantic_weight: Weight for semantic score (default: 0.2)
        k1: BM25 term frequency saturation
        b: BM25 length normalization
    """

    def __init__(self,
                 bm25_weight: float = 0.8,
                 semantic_weight: float = 0.2,
                 k1: float = 1.5,
                 b: float = 0.75,
                 boost_sections: Optional[List[str]] = None):
        """
        Initialize hybrid engine.

        Args:
            bm25_weight: Weight for BM25 component (0-1)
            semantic_weight: Weight for semantic component (0-1)
            k1: BM25 k1 parameter
            b: BM25 b parameter
            boost_sections: Section names to boost (e.g., ["Risk Factors"])
        """
        self.bm25_engine = BM25Engine(k1=k1, b=b)
        self.bm25_weight = bm25_weight
        self.semantic_weight = semantic_weight
        self.boost_sections = boost_sections or []

        # Validate weights
        total_weight = bm25_weight + semantic_weight
        if not (0.99 <= total_weight <= 1.01):  # Allow small floating point error
            raise ValueError(f"Weights must sum to 1.0, got {total_weight}")

    def rank(self, query: str, nodes: List['Node']) -> List[RankedResult]:
        """
        Rank nodes using hybrid approach.

        Args:
            query: Search query
            nodes: Nodes to rank

        Returns:
            Ranked results with combined BM25 + semantic scores
        """
        if not nodes:
            return []

        # Get BM25 results
        bm25_results = self.bm25_engine.rank(query, nodes)

        if not bm25_results:
            return []

        # Import semantic scoring
        from edgar.documents.ranking.semantic import compute_semantic_scores

        # Get semantic scores for all nodes
        semantic_scores_dict = compute_semantic_scores(
            nodes=nodes,
            query=query,
            boost_sections=self.boost_sections
        )

        # Normalize BM25 scores to 0-1 range
        max_bm25 = max(r.bm25_score for r in bm25_results)
        if max_bm25 > 0:
            for result in bm25_results:
                result.bm25_score = result.bm25_score / max_bm25

        # Combine scores
        for result in bm25_results:
            semantic_score = semantic_scores_dict.get(id(result.node), 0.0)
            result.semantic_score = semantic_score

            # Weighted combination
            result.score = (
                self.bm25_weight * result.bm25_score +
                self.semantic_weight * semantic_score
            )

            result.metadata['algorithm'] = 'Hybrid'
            result.metadata['bm25_weight'] = self.bm25_weight
            result.metadata['semantic_weight'] = self.semantic_weight

        # Re-sort by combined score
        bm25_results.sort(key=lambda r: r.score, reverse=True)

        # Update ranks
        for rank, result in enumerate(bm25_results, start=1):
            result.rank = rank

        return bm25_results

    def get_algorithm_name(self) -> str:
        """Get algorithm name."""
        return "Hybrid"


class SemanticEngine(RankingEngine):
    """
    Pure semantic/structure-based ranking (no text matching).

    Ranks nodes purely by structural importance:
    - Section headings
    - Cross-references
    - Gateway content
    - Document structure position

    Useful for understanding document organization without specific queries.
    """

    def __init__(self, boost_sections: Optional[List[str]] = None):
        """
        Initialize semantic engine.

        Args:
            boost_sections: Section names to boost
        """
        self.boost_sections = boost_sections or []

    def rank(self, query: str, nodes: List['Node']) -> List[RankedResult]:
        """
        Rank nodes by semantic importance.

        Args:
            query: Search query (used for context)
            nodes: Nodes to rank

        Returns:
            Ranked results by structural importance
        """
        if not nodes:
            return []

        from edgar.documents.ranking.semantic import compute_semantic_scores

        # Get semantic scores
        semantic_scores = compute_semantic_scores(
            nodes=nodes,
            query=query,
            boost_sections=self.boost_sections
        )

        # Create results
        results = []
        for node in nodes:
            score = semantic_scores.get(id(node), 0.0)
            if score > 0:
                text = node.text() if hasattr(node, 'text') else str(node)
                results.append(RankedResult(
                    node=node,
                    score=score,
                    rank=0,
                    text=text,
                    semantic_score=score,
                    metadata={'algorithm': 'Semantic'}
                ))

        # Sort and rank
        results.sort(key=lambda r: r.score, reverse=True)
        for rank, result in enumerate(results, start=1):
            result.rank = rank

        return results

    def get_algorithm_name(self) -> str:
        """Get algorithm name."""
        return "Semantic"
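

if __name__ == "__main__":
    # Minimal smoke test / usage sketch. Real callers pass parsed document
    # Node objects; the stand-in below only needs a text() method, which is
    # all BM25Engine actually relies on. Illustrative only, assuming
    # edgar.documents.ranking.preprocessing is importable at runtime.
    class _StubNode:
        def __init__(self, content: str):
            self._content = content

        def text(self) -> str:
            return self._content

    corpus = [
        _StubNode("Revenue increased due to higher product sales."),
        _StubNode("Risk factors include competition and regulation."),
        _StubNode("Operating expenses grew faster than revenue."),
    ]

    for result in BM25Engine().rank("revenue growth", corpus):
        print(result.rank, round(result.score, 3), result.snippet)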