""" Search index caching for performance optimization. Provides memory and disk caching with LRU eviction and TTL expiration. """ from dataclasses import dataclass, field from datetime import datetime, timedelta from pathlib import Path from typing import Optional, Dict, Any, List import hashlib import pickle import logging logger = logging.getLogger(__name__) @dataclass class CacheEntry: """ Cached search index entry. Stores pre-built search indices for a document along with metadata for cache management (access tracking, TTL). """ document_hash: str index_data: Dict[str, Any] # Serialized BM25 index data created_at: datetime access_count: int = 0 last_accessed: Optional[datetime] = None metadata: Dict[str, Any] = field(default_factory=dict) class SearchIndexCache: """ Manages search index caching with memory + disk storage. Features: - In-memory LRU cache for fast access - Optional disk persistence for reuse across sessions - TTL-based expiration - Access statistics tracking Parameters: memory_cache_size: Maximum entries in memory (default: 10) disk_cache_enabled: Enable disk persistence (default: True) cache_dir: Directory for disk cache (default: ~/.edgar_cache/search) ttl_hours: Time-to-live for cached entries (default: 24) """ def __init__(self, memory_cache_size: int = 10, disk_cache_enabled: bool = True, cache_dir: Optional[Path] = None, ttl_hours: int = 24): """Initialize cache.""" self.memory_cache_size = memory_cache_size self.disk_cache_enabled = disk_cache_enabled self.cache_dir = cache_dir or Path.home() / ".edgar_cache" / "search" self.ttl = timedelta(hours=ttl_hours) # In-memory cache (LRU) self._memory_cache: Dict[str, CacheEntry] = {} self._access_order: List[str] = [] # Statistics self._hits = 0 self._misses = 0 # Create cache directory if disk_cache_enabled: self.cache_dir.mkdir(parents=True, exist_ok=True) def compute_document_hash(self, document_id: str, content_sample: str) -> str: """ Compute cache key from document identifiers. Uses document ID (e.g., accession number) and a content sample to create a unique, stable hash. Args: document_id: Unique document identifier content_sample: Sample of document content for verification Returns: 16-character hex hash """ content = f"{document_id}:{content_sample}" return hashlib.sha256(content.encode()).hexdigest()[:16] def get(self, document_hash: str) -> Optional[CacheEntry]: """ Get cached entry. Tries memory cache first, then disk cache. Updates LRU order and access statistics. 

class SearchIndexCache:
    """
    Manages search index caching with memory + disk storage.

    Features:
    - In-memory LRU cache for fast access
    - Optional disk persistence for reuse across sessions
    - TTL-based expiration
    - Access statistics tracking

    Parameters:
        memory_cache_size: Maximum entries in memory (default: 10)
        disk_cache_enabled: Enable disk persistence (default: True)
        cache_dir: Directory for disk cache (default: ~/.edgar_cache/search)
        ttl_hours: Time-to-live for cached entries (default: 24)
    """

    def __init__(self,
                 memory_cache_size: int = 10,
                 disk_cache_enabled: bool = True,
                 cache_dir: Optional[Path] = None,
                 ttl_hours: int = 24):
        """Initialize cache."""
        self.memory_cache_size = memory_cache_size
        self.disk_cache_enabled = disk_cache_enabled
        self.cache_dir = cache_dir or Path.home() / ".edgar_cache" / "search"
        self.ttl = timedelta(hours=ttl_hours)

        # In-memory cache (LRU)
        self._memory_cache: Dict[str, CacheEntry] = {}
        self._access_order: List[str] = []

        # Statistics
        self._hits = 0
        self._misses = 0

        # Create cache directory
        if disk_cache_enabled:
            self.cache_dir.mkdir(parents=True, exist_ok=True)

    def compute_document_hash(self, document_id: str, content_sample: str) -> str:
        """
        Compute cache key from document identifiers.

        Uses document ID (e.g., accession number) and a content sample
        to create a unique, stable hash.

        Args:
            document_id: Unique document identifier
            content_sample: Sample of document content for verification

        Returns:
            16-character hex hash
        """
        content = f"{document_id}:{content_sample}"
        return hashlib.sha256(content.encode()).hexdigest()[:16]

    def get(self, document_hash: str) -> Optional[CacheEntry]:
        """
        Get cached entry.

        Tries memory cache first, then disk cache. Updates LRU order
        and access statistics.

        Args:
            document_hash: Cache key

        Returns:
            CacheEntry if found and valid, None otherwise
        """
        # Try memory cache first
        if document_hash in self._memory_cache:
            entry = self._memory_cache[document_hash]

            # Check TTL
            if datetime.now() - entry.created_at > self.ttl:
                # Expired - remove from cache
                self._evict_memory(document_hash)
                self._misses += 1
                return None

            # Update access tracking
            entry.access_count += 1
            entry.last_accessed = datetime.now()

            # Update LRU order (move to most-recently-used position)
            if document_hash in self._access_order:
                self._access_order.remove(document_hash)
            self._access_order.append(document_hash)

            self._hits += 1
            logger.debug(f"Cache hit (memory): {document_hash}")
            return entry

        # Try disk cache
        if self.disk_cache_enabled:
            entry = self._load_from_disk(document_hash)
            if entry is not None:
                # Check TTL
                if datetime.now() - entry.created_at > self.ttl:
                    # Expired - delete file
                    self._delete_from_disk(document_hash)
                    self._misses += 1
                    return None

                # Load into memory cache
                self._put_memory(document_hash, entry)

                self._hits += 1
                logger.debug(f"Cache hit (disk): {document_hash}")
                return entry

        self._misses += 1
        logger.debug(f"Cache miss: {document_hash}")
        return None

    def put(self, document_hash: str, entry: CacheEntry) -> None:
        """
        Cache entry in memory and optionally on disk.

        Args:
            document_hash: Cache key
            entry: Entry to cache
        """
        # Put in memory cache
        self._put_memory(document_hash, entry)

        # Put in disk cache
        if self.disk_cache_enabled:
            self._save_to_disk(document_hash, entry)

        logger.debug(f"Cached entry: {document_hash}")

    def _put_memory(self, document_hash: str, entry: CacheEntry) -> None:
        """Put entry in memory cache with LRU eviction."""
        if document_hash in self._memory_cache:
            # Re-inserting an existing key: refresh its LRU position
            # instead of evicting an unrelated entry.
            self._access_order.remove(document_hash)
        else:
            # Evict oldest entries until there is room
            while len(self._memory_cache) >= self.memory_cache_size:
                if self._access_order:
                    oldest = self._access_order.pop(0)
                    self._evict_memory(oldest)
                else:
                    break

        self._memory_cache[document_hash] = entry
        self._access_order.append(document_hash)

    def _evict_memory(self, document_hash: str) -> None:
        """Evict entry from memory cache and LRU bookkeeping."""
        if document_hash in self._memory_cache:
            del self._memory_cache[document_hash]
            logger.debug(f"Evicted from memory: {document_hash}")
        # Keep the LRU list consistent (e.g., after TTL expiry in get())
        if document_hash in self._access_order:
            self._access_order.remove(document_hash)

    def _load_from_disk(self, document_hash: str) -> Optional[CacheEntry]:
        """Load entry from disk cache."""
        cache_file = self.cache_dir / f"{document_hash}.pkl"

        if not cache_file.exists():
            return None

        try:
            # Note: pickle is only safe here because the cache directory
            # is local and trusted; never unpickle untrusted data.
            with open(cache_file, 'rb') as f:
                entry = pickle.load(f)
            return entry
        except Exception as e:
            logger.warning(f"Failed to load cache from disk: {e}")
            # Delete corrupted file
            try:
                cache_file.unlink()
            except OSError:
                pass
            return None

    def _save_to_disk(self, document_hash: str, entry: CacheEntry) -> None:
        """Save entry to disk cache."""
        cache_file = self.cache_dir / f"{document_hash}.pkl"

        try:
            with open(cache_file, 'wb') as f:
                pickle.dump(entry, f)
        except Exception as e:
            logger.warning(f"Failed to save cache to disk: {e}")

    def _delete_from_disk(self, document_hash: str) -> None:
        """Delete entry from disk cache."""
        cache_file = self.cache_dir / f"{document_hash}.pkl"

        try:
            if cache_file.exists():
                cache_file.unlink()
        except Exception as e:
            logger.warning(f"Failed to delete cache file: {e}")
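
    # LRU behavior sketch (illustrative; entry_a/entry_b/entry_c stand in
    # for real CacheEntry objects). With memory_cache_size=2 and the disk
    # layer off, inserting a third entry evicts the least recently used key:
    #
    #     cache = SearchIndexCache(memory_cache_size=2, disk_cache_enabled=False)
    #     cache.put("a", entry_a)
    #     cache.put("b", entry_b)
    #     cache.get("a")           # "a" becomes most recently used
    #     cache.put("c", entry_c)  # evicts "b", the least recently used entry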

    def clear(self, memory_only: bool = False) -> None:
        """
        Clear cache.

        Args:
            memory_only: If True, only clear memory cache (keep disk)
        """
        self._memory_cache.clear()
        self._access_order.clear()
        logger.info("Cleared memory cache")

        if not memory_only and self.disk_cache_enabled:
            try:
                for cache_file in self.cache_dir.glob("*.pkl"):
                    cache_file.unlink()
                logger.info("Cleared disk cache")
            except Exception as e:
                logger.warning(f"Failed to clear disk cache: {e}")

    def get_stats(self) -> Dict[str, Any]:
        """
        Get cache statistics.

        Returns:
            Dictionary with cache statistics
        """
        disk_entries = 0
        if self.disk_cache_enabled:
            try:
                disk_entries = len(list(self.cache_dir.glob("*.pkl")))
            except OSError:
                pass

        total_requests = self._hits + self._misses
        hit_rate = self._hits / total_requests if total_requests > 0 else 0.0

        return {
            "memory_entries": len(self._memory_cache),
            "disk_entries": disk_entries,
            "total_accesses": sum(e.access_count for e in self._memory_cache.values()),
            "cache_hits": self._hits,
            "cache_misses": self._misses,
            "hit_rate": hit_rate,
            "memory_size_mb": self._estimate_cache_size()
        }

    def _estimate_cache_size(self) -> float:
        """Estimate memory cache size in MB (shallow sizes only)."""
        try:
            # sys.getsizeof is shallow: it does not follow nested
            # containers, so this underestimates the true footprint.
            total_bytes = sum(
                sys.getsizeof(entry.index_data)
                for entry in self._memory_cache.values()
            )
            return total_bytes / (1024 * 1024)
        except Exception:
            # Rough fallback estimate: assume ~5 MB per entry
            return len(self._memory_cache) * 5.0


# Global cache instance
_global_cache: Optional[SearchIndexCache] = None


def get_search_cache() -> SearchIndexCache:
    """
    Get global search cache instance.

    Creates a singleton cache instance on first call.

    Returns:
        Global SearchIndexCache instance
    """
    global _global_cache
    if _global_cache is None:
        _global_cache = SearchIndexCache()
    return _global_cache


def set_search_cache(cache: Optional[SearchIndexCache]) -> None:
    """
    Set global search cache instance.

    Useful for testing or custom cache configuration.

    Args:
        cache: Cache instance to use globally (None to disable)
    """
    global _global_cache
    _global_cache = cache
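

# Minimal usage sketch (illustrative: the accession number, content sample,
# and index_data payload below are made up; real callers would pass a
# serialized index built for an actual filing):
if __name__ == "__main__":
    cache = get_search_cache()

    key = cache.compute_document_hash(
        document_id="0000320193-24-000001",        # hypothetical accession number
        content_sample="UNITED STATES SECURITIES"  # first bytes of the filing
    )

    if cache.get(key) is None:
        # Build the index elsewhere, then cache it for later sessions
        entry = CacheEntry(
            document_hash=key,
            index_data={"tokens": ["example"]},  # placeholder payload
            created_at=datetime.now(),
        )
        cache.put(key, entry)

    print(cache.get_stats())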