"""
Search index caching for performance optimization.
Provides memory and disk caching with LRU eviction and TTL expiration.
"""
from dataclasses import dataclass, field
from datetime import datetime, timedelta
from pathlib import Path
from typing import Optional, Dict, Any, List
import hashlib
import pickle
import logging

logger = logging.getLogger(__name__)


@dataclass
class CacheEntry:
    """
    Cached search index entry.

    Stores pre-built search indices for a document along with metadata
    for cache management (access tracking, TTL).
    """
    document_hash: str
    index_data: Dict[str, Any]  # Serialized BM25 index data
    created_at: datetime
    access_count: int = 0
    last_accessed: Optional[datetime] = None
    metadata: Dict[str, Any] = field(default_factory=dict)


class SearchIndexCache:
    """
    Manages search index caching with memory + disk storage.

    Features:
    - In-memory LRU cache for fast access
    - Optional disk persistence for reuse across sessions
    - TTL-based expiration
    - Access statistics tracking

    Parameters:
        memory_cache_size: Maximum entries in memory (default: 10)
        disk_cache_enabled: Enable disk persistence (default: True)
        cache_dir: Directory for disk cache (default: ~/.edgar_cache/search)
        ttl_hours: Time-to-live for cached entries (default: 24)
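
    Example (illustrative sketch; the index_data payload below is a
    placeholder for real serialized BM25 data):

        >>> cache = SearchIndexCache(memory_cache_size=5, disk_cache_enabled=False)
        >>> key = cache.compute_document_hash("doc-1", "sample content")
        >>> cache.put(key, CacheEntry(document_hash=key,
        ...                           index_data={"terms": {}},
        ...                           created_at=datetime.now()))
        >>> cache.get(key) is not None
        True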
"""
def __init__(self,
memory_cache_size: int = 10,
disk_cache_enabled: bool = True,
cache_dir: Optional[Path] = None,
ttl_hours: int = 24):
"""Initialize cache."""
self.memory_cache_size = memory_cache_size
self.disk_cache_enabled = disk_cache_enabled
self.cache_dir = cache_dir or Path.home() / ".edgar_cache" / "search"
self.ttl = timedelta(hours=ttl_hours)
# In-memory cache (LRU)
self._memory_cache: Dict[str, CacheEntry] = {}
self._access_order: List[str] = []
# Statistics
self._hits = 0
self._misses = 0
# Create cache directory
if disk_cache_enabled:
self.cache_dir.mkdir(parents=True, exist_ok=True)

    def compute_document_hash(self, document_id: str, content_sample: str) -> str:
        """
        Compute cache key from document identifiers.

        Uses document ID (e.g., accession number) and a content sample
        to create a unique, stable hash.

        Args:
            document_id: Unique document identifier
            content_sample: Sample of document content for verification

        Returns:
            16-character hex hash
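
        Example (illustrative identifiers; the exact digest depends on the
        inputs):

            >>> cache = SearchIndexCache(disk_cache_enabled=False)
            >>> key = cache.compute_document_hash("0000000000-24-000001", "sample content")
            >>> len(key)
            16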
"""
content = f"{document_id}:{content_sample}"
return hashlib.sha256(content.encode()).hexdigest()[:16]

    def get(self, document_hash: str) -> Optional[CacheEntry]:
        """
        Get cached entry.

        Tries memory cache first, then disk cache. Updates LRU order
        and access statistics.

        Args:
            document_hash: Cache key

        Returns:
            CacheEntry if found and valid, None otherwise
        """
        # Try memory cache first
        if document_hash in self._memory_cache:
            entry = self._memory_cache[document_hash]

            # Check TTL
            if datetime.now() - entry.created_at > self.ttl:
                # Expired - remove from cache
                self._evict_memory(document_hash)
                self._misses += 1
                return None

            # Update access tracking
            entry.access_count += 1
            entry.last_accessed = datetime.now()

            # Update LRU order
            if document_hash in self._access_order:
                self._access_order.remove(document_hash)
            self._access_order.append(document_hash)

            self._hits += 1
            logger.debug(f"Cache hit (memory): {document_hash}")
            return entry

        # Try disk cache
        if self.disk_cache_enabled:
            entry = self._load_from_disk(document_hash)
            if entry:
                # Check TTL
                if datetime.now() - entry.created_at > self.ttl:
                    # Expired - delete file
                    self._delete_from_disk(document_hash)
                    self._misses += 1
                    return None

                # Load into memory cache
                self._put_memory(document_hash, entry)
                self._hits += 1
                logger.debug(f"Cache hit (disk): {document_hash}")
                return entry

        self._misses += 1
        logger.debug(f"Cache miss: {document_hash}")
        return None

    def put(self, document_hash: str, entry: CacheEntry) -> None:
        """
        Cache entry in memory and optionally on disk.

        Args:
            document_hash: Cache key
            entry: Entry to cache
        """
        # Put in memory cache
        self._put_memory(document_hash, entry)

        # Put in disk cache
        if self.disk_cache_enabled:
            self._save_to_disk(document_hash, entry)

        logger.debug(f"Cached entry: {document_hash}")

    def _put_memory(self, document_hash: str, entry: CacheEntry) -> None:
        """Put entry in memory cache with LRU eviction."""
        # Re-inserting an existing key refreshes its LRU position instead of
        # adding a duplicate to the access order or evicting another entry.
        if document_hash in self._memory_cache:
            self._evict_memory(document_hash)

        # Evict least-recently-used entries while the cache is full
        while len(self._memory_cache) >= self.memory_cache_size:
            if self._access_order:
                oldest = self._access_order.pop(0)
                self._evict_memory(oldest)
            else:
                break

        self._memory_cache[document_hash] = entry
        self._access_order.append(document_hash)

    def _evict_memory(self, document_hash: str) -> None:
        """Evict entry from memory cache."""
        if document_hash in self._memory_cache:
            del self._memory_cache[document_hash]
            logger.debug(f"Evicted from memory: {document_hash}")
        # Keep the LRU order in sync so expired keys do not linger
        if document_hash in self._access_order:
            self._access_order.remove(document_hash)

    def _load_from_disk(self, document_hash: str) -> Optional[CacheEntry]:
        """Load entry from disk cache."""
        cache_file = self.cache_dir / f"{document_hash}.pkl"
        if not cache_file.exists():
            return None

        try:
            with open(cache_file, 'rb') as f:
                entry = pickle.load(f)
            return entry
        except Exception as e:
            logger.warning(f"Failed to load cache from disk: {e}")
            # Delete corrupted file
            try:
                cache_file.unlink()
            except OSError:
                pass
            return None

    def _save_to_disk(self, document_hash: str, entry: CacheEntry) -> None:
        """Save entry to disk cache."""
        cache_file = self.cache_dir / f"{document_hash}.pkl"
        try:
            with open(cache_file, 'wb') as f:
                pickle.dump(entry, f)
        except Exception as e:
            logger.warning(f"Failed to save cache to disk: {e}")

    def _delete_from_disk(self, document_hash: str) -> None:
        """Delete entry from disk cache."""
        cache_file = self.cache_dir / f"{document_hash}.pkl"
        try:
            if cache_file.exists():
                cache_file.unlink()
        except Exception as e:
            logger.warning(f"Failed to delete cache file: {e}")

    def clear(self, memory_only: bool = False) -> None:
        """
        Clear cache.

        Args:
            memory_only: If True, only clear memory cache (keep disk)
        """
        self._memory_cache.clear()
        self._access_order.clear()
        logger.info("Cleared memory cache")

        if not memory_only and self.disk_cache_enabled:
            try:
                for cache_file in self.cache_dir.glob("*.pkl"):
                    cache_file.unlink()
                logger.info("Cleared disk cache")
            except Exception as e:
                logger.warning(f"Failed to clear disk cache: {e}")

    def get_stats(self) -> Dict[str, Any]:
        """
        Get cache statistics.

        Returns:
            Dictionary with cache statistics
        """
        disk_entries = 0
        if self.disk_cache_enabled:
            try:
                disk_entries = len(list(self.cache_dir.glob("*.pkl")))
            except OSError:
                pass

        total_requests = self._hits + self._misses
        hit_rate = self._hits / total_requests if total_requests > 0 else 0.0

        return {
            "memory_entries": len(self._memory_cache),
            "disk_entries": disk_entries,
            "total_accesses": sum(e.access_count for e in self._memory_cache.values()),
            "cache_hits": self._hits,
            "cache_misses": self._misses,
            "hit_rate": hit_rate,
            "memory_size_mb": self._estimate_cache_size()
        }

    def _estimate_cache_size(self) -> float:
        """Estimate memory cache size in MB."""
        try:
            import sys
            # getsizeof only measures the top-level dict, so this is a rough
            # lower bound on the true footprint of the cached index data.
            total_bytes = sum(
                sys.getsizeof(entry.index_data)
                for entry in self._memory_cache.values()
            )
            return total_bytes / (1024 * 1024)
        except Exception:
            # Rough estimate if sys.getsizeof fails
            return len(self._memory_cache) * 5.0  # Assume ~5MB per entry


# Global cache instance
_global_cache: Optional[SearchIndexCache] = None


def get_search_cache() -> SearchIndexCache:
    """
    Get global search cache instance.

    Creates a singleton cache instance on first call.

    Returns:
        Global SearchIndexCache instance
    """
    global _global_cache
    if _global_cache is None:
        _global_cache = SearchIndexCache()
    return _global_cache


def set_search_cache(cache: Optional[SearchIndexCache]) -> None:
    """
    Set global search cache instance.

    Useful for testing or custom cache configuration.

    Args:
        cache: Cache instance to use globally (None to disable)
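
    Example (illustrative; uses a temporary directory so the default
    ~/.edgar_cache location is left untouched):

        >>> import tempfile
        >>> test_cache = SearchIndexCache(cache_dir=Path(tempfile.mkdtemp()))
        >>> set_search_cache(test_cache)
        >>> get_search_cache() is test_cache
        True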
"""
global _global_cache
_global_cache = cache
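

if __name__ == "__main__":
    # Minimal usage sketch (illustrative only): build a cache key, store a
    # dummy entry, read it back, and print statistics. The index_data dict
    # is a placeholder for real serialized BM25 index data, and the document
    # ID is a made-up example value. Disk persistence is disabled here so the
    # demo leaves no files behind.
    cache = SearchIndexCache(memory_cache_size=5, disk_cache_enabled=False)

    key = cache.compute_document_hash("example-doc-id", "first few hundred characters of the filing")
    entry = CacheEntry(
        document_hash=key,
        index_data={"terms": {"revenue": [0, 4, 9]}},  # placeholder payload
        created_at=datetime.now(),
    )

    cache.put(key, entry)
    cached = cache.get(key)
    print("hit:", cached is not None)
    print("stats:", cache.get_stats())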