edgartools/venv/lib/python3.10/site-packages/edgar/documents/utils/cache.py
"""
Cache utilities for performance optimization.
"""
import weakref
from collections import OrderedDict
from typing import Any, Dict, Optional, Callable, TypeVar, Generic
from functools import wraps
import time
import threading
from dataclasses import dataclass, field
from datetime import datetime, timedelta
T = TypeVar('T')
@dataclass
class CacheStats:
"""Statistics for cache performance monitoring."""
hits: int = 0
misses: int = 0
evictions: int = 0
total_time: float = 0.0
last_reset: datetime = field(default_factory=datetime.now)
@property
def hit_rate(self) -> float:
"""Calculate cache hit rate."""
total = self.hits + self.misses
return self.hits / total if total > 0 else 0.0
@property
def avg_access_time(self) -> float:
"""Calculate average access time."""
total = self.hits + self.misses
return self.total_time / total if total > 0 else 0.0
def reset(self):
"""Reset statistics."""
self.hits = 0
self.misses = 0
self.evictions = 0
self.total_time = 0.0
self.last_reset = datetime.now()
class LRUCache(Generic[T]):
"""
Thread-safe LRU cache implementation.
Used for caching expensive operations like style parsing
and header detection results.
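
    Example (minimal usage sketch; keys and values here are arbitrary):

        >>> cache = LRUCache[int](max_size=2)
        >>> cache.put("a", 1)
        >>> cache.put("b", 2)
        >>> cache.get("a")
        1
        >>> cache.put("c", 3)       # evicts "b", the least recently used entry
        >>> cache.get("b") is None
        True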
"""
def __init__(self, max_size: int = 1000):
"""
Initialize LRU cache.
Args:
max_size: Maximum number of items to cache
"""
self.max_size = max_size
self._cache: OrderedDict[str, T] = OrderedDict()
self._lock = threading.RLock()
self.stats = CacheStats()
def get(self, key: str) -> Optional[T]:
"""
Get item from cache.
Args:
key: Cache key
Returns:
Cached value or None if not found
"""
start_time = time.time()
with self._lock:
if key in self._cache:
# Move to end (most recently used)
self._cache.move_to_end(key)
self.stats.hits += 1
self.stats.total_time += time.time() - start_time
return self._cache[key]
self.stats.misses += 1
self.stats.total_time += time.time() - start_time
return None
def put(self, key: str, value: T) -> None:
"""
Put item in cache.
Args:
key: Cache key
value: Value to cache
"""
with self._lock:
if key in self._cache:
# Update existing
self._cache.move_to_end(key)
self._cache[key] = value
else:
# Add new
self._cache[key] = value
# Evict oldest if over capacity
if len(self._cache) > self.max_size:
self._cache.popitem(last=False)
self.stats.evictions += 1
def clear(self) -> None:
"""Clear all cached items."""
with self._lock:
self._cache.clear()
def size(self) -> int:
"""Get current cache size."""
with self._lock:
return len(self._cache)
class WeakCache:
"""
Weak reference cache for parsed nodes.
Allows garbage collection of unused nodes while
maintaining references to actively used ones.
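
    Example (minimal usage sketch; values must support weak references, so
    plain dicts, lists and ints cannot be stored directly):

        >>> class Node:
        ...     pass
        >>> cache = WeakCache()
        >>> node = Node()
        >>> cache.put("root", node)
        >>> cache.get("root") is node
        True
        >>> del node                   # no strong references remain
        >>> cache.get("root") is None  # reclaimed immediately under CPython
        True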
"""
def __init__(self):
"""Initialize weak cache."""
self._cache: Dict[str, weakref.ref] = {}
self._lock = threading.RLock()
self.stats = CacheStats()
def get(self, key: str) -> Optional[Any]:
"""
Get item from cache.
Args:
key: Cache key
Returns:
Cached object or None if not found or collected
"""
start_time = time.time()
with self._lock:
ref = self._cache.get(key)
if ref is not None:
obj = ref()
if obj is not None:
self.stats.hits += 1
self.stats.total_time += time.time() - start_time
return obj
else:
# Object was garbage collected
del self._cache[key]
self.stats.misses += 1
self.stats.total_time += time.time() - start_time
return None
def put(self, key: str, value: Any) -> None:
"""
Put item in cache with weak reference.
Args:
key: Cache key
value: Object to cache
"""
with self._lock:
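            # weakref.ref() raises TypeError for objects that do not support weak
            # references (e.g. plain dict, list, int), so only weak-referenceable
            # objects such as parsed nodes should be stored here.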
self._cache[key] = weakref.ref(value)
def clear(self) -> None:
"""Clear all cached references."""
with self._lock:
self._cache.clear()
def cleanup(self) -> int:
"""
Remove dead references.
Returns:
Number of references removed
"""
with self._lock:
dead_keys = [
key for key, ref in self._cache.items()
if ref() is None
]
for key in dead_keys:
del self._cache[key]
return len(dead_keys)
class TimeBasedCache(Generic[T]):
"""
Time-based expiring cache.
Items expire after a specified duration.
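
    Example (minimal usage sketch; key and value are arbitrary):

        >>> cache = TimeBasedCache[str](ttl_seconds=3600)
        >>> cache.put("filing", "10-K")
        >>> cache.get("filing")
        '10-K'
        >>> cache.get("missing") is None
        True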
"""
def __init__(self, ttl_seconds: int = 3600):
"""
Initialize time-based cache.
Args:
ttl_seconds: Time to live in seconds
"""
self.ttl = timedelta(seconds=ttl_seconds)
self._cache: Dict[str, tuple[T, datetime]] = {}
self._lock = threading.RLock()
self.stats = CacheStats()
def get(self, key: str) -> Optional[T]:
"""
Get item from cache if not expired.
Args:
key: Cache key
Returns:
Cached value or None if not found or expired
"""
start_time = time.time()
with self._lock:
if key in self._cache:
value, timestamp = self._cache[key]
if datetime.now() - timestamp < self.ttl:
self.stats.hits += 1
self.stats.total_time += time.time() - start_time
return value
else:
# Expired
del self._cache[key]
self.stats.evictions += 1
self.stats.misses += 1
self.stats.total_time += time.time() - start_time
return None
def put(self, key: str, value: T) -> None:
"""
Put item in cache with timestamp.
Args:
key: Cache key
value: Value to cache
"""
with self._lock:
self._cache[key] = (value, datetime.now())
def clear(self) -> None:
"""Clear all cached items."""
with self._lock:
self._cache.clear()
def cleanup(self) -> int:
"""
Remove expired items.
Returns:
Number of items removed
"""
with self._lock:
now = datetime.now()
expired_keys = [
key for key, (_, timestamp) in self._cache.items()
if now - timestamp >= self.ttl
]
for key in expired_keys:
del self._cache[key]
self.stats.evictions += 1
return len(expired_keys)
def cached(cache: LRUCache, key_func: Optional[Callable] = None):
"""
Decorator for caching function results.
Args:
cache: Cache instance to use
key_func: Function to generate cache key from arguments
Returns:
Decorated function
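
    Example (minimal usage sketch; parse_style and the key_func are illustrative
    stand-ins for a real expensive operation):

        >>> style_cache = LRUCache[str](max_size=16)
        >>> @cached(style_cache, key_func=lambda s: s.strip().lower())
        ... def parse_style(style: str) -> str:
        ...     return style.strip().lower()
        >>> parse_style("Color: RED ")
        'color: red'
        >>> style_cache.size()
        1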
"""
def decorator(func):
@wraps(func)
def wrapper(*args, **kwargs):
# Generate cache key
if key_func:
key = key_func(*args, **kwargs)
else:
# Default key generation
key = f"{func.__name__}:{str(args)}:{str(sorted(kwargs.items()))}"
# Check cache
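            # Note: LRUCache.get() returns None on a miss, so a genuinely None
            # result is never cached and will be recomputed on every call.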
result = cache.get(key)
if result is not None:
return result
# Compute and cache result
result = func(*args, **kwargs)
cache.put(key, result)
return result
return wrapper
return decorator
class CacheManager:
"""
Manages multiple caches for the parser.
Provides centralized cache management and monitoring.
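
    Example (minimal usage sketch; the style string and parsed dict are
    illustrative):

        >>> manager = CacheManager()
        >>> manager.style_cache.put("font-weight:bold", {"font_weight": "bold"})
        >>> manager.style_cache.get("font-weight:bold")
        {'font_weight': 'bold'}
        >>> sorted(manager.get_stats().keys())
        ['header', 'node', 'pattern', 'regex', 'style']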
"""
def __init__(self):
"""Initialize cache manager."""
# Style parsing cache
self.style_cache = LRUCache[dict](max_size=5000)
# Header detection cache
self.header_cache = LRUCache[bool](max_size=2000)
# Pattern matching cache
self.pattern_cache = LRUCache[bool](max_size=10000)
# Node reference cache
self.node_cache = WeakCache()
# Compiled regex cache
self.regex_cache = LRUCache[Any](max_size=500)
# All caches for management
self._caches = {
'style': self.style_cache,
'header': self.header_cache,
'pattern': self.pattern_cache,
'node': self.node_cache,
'regex': self.regex_cache
}
def get_stats(self) -> Dict[str, CacheStats]:
"""Get statistics for all caches."""
return {
name: cache.stats
for name, cache in self._caches.items()
if hasattr(cache, 'stats')
}
def reset_stats(self) -> None:
"""Reset statistics for all caches."""
for cache in self._caches.values():
if hasattr(cache, 'stats'):
cache.stats.reset()
def clear_all(self) -> None:
"""Clear all caches."""
for cache in self._caches.values():
cache.clear()
def cleanup(self) -> Dict[str, int]:
"""
Cleanup expired/dead entries in all caches.
Returns:
Number of entries cleaned up per cache
"""
        cleanup_counts = {}
        # Call cleanup() on any cache that supports it (currently the weak node cache)
        for name, cache in self._caches.items():
            if hasattr(cache, 'cleanup'):
                cleanup_counts[name] = cache.cleanup()
        return cleanup_counts
def get_memory_usage(self) -> Dict[str, int]:
"""
Estimate memory usage of caches.
Returns:
Approximate memory usage in bytes per cache
"""
import sys
usage = {}
for name, cache in self._caches.items():
if hasattr(cache, '_cache'):
                # Rough, shallow estimate: sys.getsizeof does not follow nested
                # containers, and for WeakCache it measures the weakref wrapper
                # rather than the referenced object.
                size = 0
                if isinstance(cache._cache, dict):
                    for key, value in cache._cache.items():
                        size += sys.getsizeof(key)
                        size += sys.getsizeof(value)
usage[name] = size
return usage
# Global cache manager instance
_cache_manager: Optional[CacheManager] = None
def get_cache_manager() -> CacheManager:
"""Get global cache manager instance."""
global _cache_manager
if _cache_manager is None:
_cache_manager = CacheManager()
return _cache_manager
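

# Minimal usage sketch (illustrative only): exercise the shared manager and
# print per-cache statistics.
if __name__ == "__main__":
    manager = get_cache_manager()
    manager.style_cache.put("text-align:center", {"text_align": "center"})
    manager.style_cache.get("text-align:center")   # hit
    manager.style_cache.get("margin:0")            # miss
    for name, stats in manager.get_stats().items():
        print(f"{name}: hits={stats.hits} misses={stats.misses} "
              f"hit_rate={stats.hit_rate:.2f}")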