Initial commit

venv/lib/python3.10/site-packages/edgar/storage_management.py
@@ -0,0 +1,717 @@
"""
|
||||
Storage Management for EdgarTools
|
||||
|
||||
This module provides visibility, analytics, and management capabilities for EdgarTools'
|
||||
local storage. It helps users understand what data is downloaded locally and provides
|
||||
tools to optimize and clean up storage.
|
||||
|
||||
Functions:
|
||||
storage_info() - Get overview of local storage with statistics
|
||||
check_filing() - Check if a filing is available locally
|
||||
check_filings_batch() - Check multiple filings efficiently
|
||||
availability_summary() - Get summary of filing availability
|
||||
analyze_storage() - Analyze storage with optimization recommendations
|
||||
optimize_storage() - Compress uncompressed files
|
||||
cleanup_storage() - Remove old files (dry-run by default)
|
||||
clear_cache() - Clear HTTP cache directories (with obsolete cache detection)
|
||||
|
||||
Classes:
|
||||
StorageInfo - Storage statistics dataclass with Rich display
|
||||
StorageAnalysis - Storage analysis with recommendations
|
||||
"""

from dataclasses import dataclass
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional, Tuple, TYPE_CHECKING
import time
from edgar.richtools import repr_rich

if TYPE_CHECKING:
    from edgar._filings import Filing

# Cache storage statistics for 60 seconds
_storage_cache: Optional[Tuple['StorageInfo', float]] = None
_CACHE_TTL = 60.0


@dataclass
class StorageAnalysis:
    """Analysis of storage with optimization recommendations"""
    storage_info: 'StorageInfo'
    issues: List[str]
    recommendations: List[str]
    potential_savings_bytes: int

    def __rich__(self):
        """Rich Panel display with analysis and recommendations"""
        from rich.panel import Panel
        from rich.table import Table
        from rich.text import Text

        # Create main table
        analysis = Table(show_header=False, box=None, padding=(0, 2))
        analysis.add_column(style="dim")
        analysis.add_column()

        # Storage summary
        total_gb = self.storage_info.total_size_bytes / (1024**3)
        compressed_gb = self.storage_info.total_size_compressed / (1024**3)
        potential_gb = self.potential_savings_bytes / (1024**3)

        analysis.add_row("📊 Current Size:", f"{compressed_gb:.2f} GB")
        analysis.add_row("💾 Total Files:", f"{self.storage_info.file_count:,}")

        if self.potential_savings_bytes > 0:
            analysis.add_row("💰 Potential Savings:", f"{potential_gb:.2f} GB")

        # Issues section
        if self.issues:
            analysis.add_row("", "")  # Spacer
            analysis.add_row("[bold red]⚠️ Issues Found:[/bold red]", "")
            for issue in self.issues:
                analysis.add_row("", f"• {issue}")

        # Recommendations section
        if self.recommendations:
            analysis.add_row("", "")  # Spacer
            analysis.add_row("[bold green]💡 Recommendations:[/bold green]", "")
            for rec in self.recommendations:
                analysis.add_row("", f"• {rec}")

        # All good message
        if not self.issues and not self.recommendations:
            analysis.add_row("", "")
            analysis.add_row("[bold green]✅ Storage is optimized[/bold green]", "")

        return Panel(
            analysis,
            title="[bold]Storage Analysis[/bold]",
            border_style="blue",
            padding=(1, 2)
        )

    def __repr__(self):
        return repr_rich(self.__rich__())


@dataclass
class StorageInfo:
    """Statistics about EdgarTools local storage"""
    total_size_bytes: int
    total_size_compressed: int  # Actual disk usage
    file_count: int
    filing_count: int
    compression_savings_bytes: int
    compression_ratio: float
    by_type: Dict[str, int]  # {'filings': 1247, 'companyfacts': 18839, ...}
    by_form: Dict[str, int]  # {'10-K': 234, '10-Q': 456, ...} (future)
    by_year: Dict[int, int]  # {2025: 45, 2024: 1202, ...} (int year keys, as produced by _scan_storage)
    last_updated: datetime
    storage_path: Path

    def __rich__(self):
        """Rich Panel display"""
        from rich.panel import Panel
        from rich.table import Table

        # Create statistics table
        stats = Table(show_header=False, box=None, padding=(0, 2))
        stats.add_column(style="dim", justify="right")
        stats.add_column(style="bold")

        # Format sizes
        total_gb = self.total_size_bytes / (1024**3)
        compressed_gb = self.total_size_compressed / (1024**3)
        savings_gb = self.compression_savings_bytes / (1024**3)

        stats.add_row("Total Size:", f"{total_gb:.2f} GB (uncompressed)")
        stats.add_row("Disk Usage:", f"{compressed_gb:.2f} GB (compressed)")
        stats.add_row("Space Saved:", f"{savings_gb:.2f} GB ({self.compression_ratio:.1%})")
        stats.add_row("Total Files:", f"{self.file_count:,}")
        stats.add_row("Filings:", f"{self.filing_count:,}")
        stats.add_row("Location:", str(self.storage_path))

        # Create breakdown by type with descriptive labels
        if self.by_type:
            stats.add_row("", "")  # Spacer

            # Define labels for cache directories
            cache_labels = {
                '_tcache': '_tcache (HTTP cache):',
                '_pcache': '_pcache (obsolete cache):',
                '_cache': '_cache (legacy cache):'
            }

            for data_type, count in sorted(self.by_type.items()):
                # Use descriptive label for cache directories
                label = cache_labels.get(data_type, f"{data_type}:")
                stats.add_row(label, f"{count:,} files")

        return Panel(
            stats,
            title="[bold]EdgarTools Local Storage[/bold]",
            border_style="blue",
            padding=(1, 2)
        )

    def __repr__(self):
        return repr_rich(self.__rich__())


def _scan_storage(force_refresh: bool = False) -> StorageInfo:
    """
    Scan .edgar directory and collect storage statistics.
    Results are cached for 60 seconds unless force_refresh=True.
    """
    global _storage_cache

    # Check cache
    if not force_refresh and _storage_cache is not None:
        info, timestamp = _storage_cache
        if time.time() - timestamp < _CACHE_TTL:
            return info

    from edgar.core import get_edgar_data_directory
    storage_path = get_edgar_data_directory()

    # Initialize counters
    total_size_bytes = 0
    total_size_compressed = 0
    file_count = 0
    filing_count = 0
    by_type = {}
    by_form = {}
    by_year = {}

    # Scan subdirectories
    for subdir in ['filings', 'companyfacts', 'submissions', 'reference', '_cache', '_pcache', '_tcache']:
        subdir_path = storage_path / subdir
        if not subdir_path.exists():
            continue

        type_files = 0

        # Recursively scan files
        for file_path in subdir_path.rglob('*'):
            if not file_path.is_file():
                continue

            file_size = file_path.stat().st_size
            type_files += 1
            file_count += 1
            total_size_compressed += file_size

            # Calculate uncompressed size
            if str(file_path).endswith('.gz'):
                # Estimate: compressed files are typically 70% smaller
                # For accuracy, could decompress header, but that's expensive
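                # Worked example of the heuristic below: a 3 MB .gz file is
                # counted as ~10 MB uncompressed (3 MB / 0.3), since a 70%
                # reduction means the compressed size is ~30% of the original.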
                estimated_uncompressed = file_size / 0.3  # Assuming 70% compression
                total_size_bytes += estimated_uncompressed
            else:
                total_size_bytes += file_size

            # Count filings specifically
            if subdir == 'filings' and (file_path.suffix == '.nc' or file_path.name.endswith('.nc.gz')):
                filing_count += 1

                # Extract year from path (filings/YYYYMMDD/*.nc)
                date_dir = file_path.parent.name
                if len(date_dir) == 8 and date_dir.isdigit():
                    year = int(date_dir[:4])
                    by_year[year] = by_year.get(year, 0) + 1

        by_type[subdir] = type_files

    # Calculate compression savings
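    # Numeric sketch (illustrative): ~10 GB estimated uncompressed vs 3 GB on
    # disk gives savings of 7 GB and a compression_ratio of 0.7 (70%).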
    compression_savings = total_size_bytes - total_size_compressed
    compression_ratio = compression_savings / total_size_bytes if total_size_bytes > 0 else 0.0

    # Create info object
    info = StorageInfo(
        total_size_bytes=int(total_size_bytes),
        total_size_compressed=int(total_size_compressed),
        file_count=file_count,
        filing_count=filing_count,
        compression_savings_bytes=int(compression_savings),
        compression_ratio=compression_ratio,
        by_type=by_type,
        by_form={},  # Phase 2: parse form types from filenames
        by_year=by_year,
        last_updated=datetime.now(),
        storage_path=storage_path
    )

    # Update cache
    _storage_cache = (info, time.time())

    return info


def storage_info(force_refresh: bool = False) -> StorageInfo:
    """
    Get overview of EdgarTools local storage.

    Returns statistics about total size, file counts, compression ratios,
    and breakdown by data type.

    Args:
        force_refresh: If True, bypass cache and rescan filesystem

    Returns:
        StorageInfo: Storage statistics with Rich display support

    Example:
        >>> from edgar.storage_management import storage_info
        >>> info = storage_info()
        >>> print(info)  # Rich-formatted panel
        >>> print(f"Total size: {info.total_size_bytes / 1e9:.2f} GB")
    """
    return _scan_storage(force_refresh=force_refresh)


def check_filing(filing: 'Filing') -> bool:
    """
    Check if a filing is available in local storage.

    Args:
        filing: Filing object to check

    Returns:
        bool: True if filing exists locally, False otherwise

    Example:
        >>> from edgar import Company
        >>> from edgar.storage_management import check_filing
        >>> filing = Company("AAPL").latest("10-K")
        >>> if check_filing(filing):
        ...     print("Available offline!")
    """
    from edgar.storage import local_filing_path

    local_path = local_filing_path(
        filing_date=str(filing.filing_date),
        accession_number=filing.accession_no
    )

    return local_path.exists()


def check_filings_batch(filings: List['Filing']) -> Dict[str, bool]:
    """
    Efficiently check availability of multiple filings.

    Args:
        filings: List of Filing objects to check

    Returns:
        Dict mapping accession number to availability (True/False)

    Example:
        >>> from edgar import get_filings
        >>> from edgar.storage_management import check_filings_batch
        >>> filings = get_filings(filing_date="2025-01-15").sample(10)
        >>> availability = check_filings_batch(filings)
        >>> available = [f for f in filings if availability[f.accession_no]]
        >>> print(f"{len(available)} of {len(filings)} available offline")
    """
    from edgar.storage import local_filing_path

    availability = {}
    for filing in filings:
        local_path = local_filing_path(
            filing_date=str(filing.filing_date),
            accession_number=filing.accession_no
        )
        availability[filing.accession_no] = local_path.exists()

    return availability


def availability_summary(filings: List['Filing']) -> str:
    """
    Get a summary string of filing availability.

    Args:
        filings: List of Filing objects

    Returns:
        str: Summary like "45 of 100 filings available offline (45%)"

    Example:
        >>> from edgar import get_filings
        >>> from edgar.storage_management import availability_summary
        >>> filings = get_filings(filing_date="2025-01-15").head(100)
        >>> print(availability_summary(filings))
        45 of 100 filings available offline (45%)
    """
    availability = check_filings_batch(filings)
    available_count = sum(availability.values())
    total_count = len(filings)
    percentage = (available_count / total_count * 100) if total_count > 0 else 0

    return f"{available_count} of {total_count} filings available offline ({percentage:.0f}%)"


def analyze_storage(force_refresh: bool = False) -> StorageAnalysis:
    """
    Analyze storage and provide optimization recommendations.

    Scans local storage for potential issues and suggests improvements like:
    - Compressing uncompressed files
    - Cleaning up old cache files
    - Identifying duplicate or orphaned data

    Args:
        force_refresh: If True, bypass cache and rescan filesystem

    Returns:
        StorageAnalysis: Analysis with issues and recommendations

    Example:
        >>> from edgar.storage_management import analyze_storage
        >>> analysis = analyze_storage()
        >>> print(analysis)  # Rich-formatted panel with recommendations
        >>> if analysis.potential_savings_bytes > 1e9:
        ...     print(f"Can save {analysis.potential_savings_bytes / 1e9:.1f} GB")
    """
    from edgar.core import get_edgar_data_directory

    info = storage_info(force_refresh=force_refresh)
    storage_path = get_edgar_data_directory()

    issues = []
    recommendations = []
    potential_savings = 0

    # Check for uncompressed files
    uncompressed_count = 0
    uncompressed_size = 0

    for subdir in ['filings', 'companyfacts', 'submissions']:
        subdir_path = storage_path / subdir
        if not subdir_path.exists():
            continue

        for file_path in subdir_path.rglob('*'):
            if not file_path.is_file():
                continue

            # Check if file should be compressed
            if file_path.suffix in ['.json', '.xml', '.txt', '.nc'] and not str(file_path).endswith('.gz'):
                uncompressed_count += 1
                file_size = file_path.stat().st_size
                uncompressed_size += file_size
                # Estimate 70% compression savings
                potential_savings += int(file_size * 0.7)

    if uncompressed_count > 0:
        issues.append(f"Found {uncompressed_count:,} uncompressed files ({uncompressed_size / 1e9:.2f} GB)")
        recommendations.append(f"Run optimize_storage() to compress files and save ~{potential_savings / 1e9:.1f} GB")

    # Check for obsolete _pcache directory (replaced by _tcache in commit 3bfba7e)
    pcache_path = storage_path / '_pcache'
    pcache_size = 0
    pcache_files = 0
    if pcache_path.exists():
        for file_path in pcache_path.rglob('*'):
            if file_path.is_file():
                pcache_files += 1
                pcache_size += file_path.stat().st_size

    if pcache_files > 0:
        issues.append(f"Obsolete _pcache directory contains {pcache_files:,} files ({pcache_size / 1e9:.2f} GB)")
        recommendations.append(f"Run clear_cache(obsolete_only=True) to remove old cache and free {pcache_size / 1e9:.1f} GB")

    # Check for large cache directories
    cache_size = 0
    cache_files = 0
    for cache_dir in ['_cache', '_tcache']:  # Only check active cache directories
        cache_path = storage_path / cache_dir
        if cache_path.exists():
            for file_path in cache_path.rglob('*'):
                if file_path.is_file():
                    cache_files += 1
                    cache_size += file_path.stat().st_size

    if cache_size > 1e9:  # More than 1 GB
        issues.append(f"Cache directories contain {cache_files:,} files ({cache_size / 1e9:.2f} GB)")
        recommendations.append(f"Run clear_cache() to free up {cache_size / 1e9:.1f} GB")

    # Check for old filings (over 1 year old) - only if many exist
    from datetime import datetime, timedelta
    old_threshold = datetime.now() - timedelta(days=365)
    old_filings = 0
    old_filings_size = 0

    filings_dir = storage_path / 'filings'
    if filings_dir.exists():
        for date_dir in filings_dir.iterdir():
            if not date_dir.is_dir():
                continue

            # Parse date from directory name (YYYYMMDD)
            if len(date_dir.name) == 8 and date_dir.name.isdigit():
                try:
                    dir_date = datetime.strptime(date_dir.name, '%Y%m%d')
                    if dir_date < old_threshold:
                        for file_path in date_dir.rglob('*'):
                            if file_path.is_file():
                                old_filings += 1
                                old_filings_size += file_path.stat().st_size
                except ValueError:
                    continue

    if old_filings > 100:  # Only flag if substantial
        recommendations.append(
            f"Consider cleanup_storage(days=365) to remove {old_filings:,} old filings "
            f"({old_filings_size / 1e9:.1f} GB)"
        )

    # Overall health check
    if not issues:
        recommendations.append("Storage is well-optimized!")

    return StorageAnalysis(
        storage_info=info,
        issues=issues,
        recommendations=recommendations,
        potential_savings_bytes=potential_savings
    )


def optimize_storage(dry_run: bool = True) -> Dict[str, int]:
    """
    Compress uncompressed files to save disk space.

    Compresses .json, .xml, .txt, and .nc files in filings, companyfacts,
    and submissions directories using gzip. Original files are replaced with
    .gz versions.

    Args:
        dry_run: If True, only report what would be done without making changes

    Returns:
        Dict with 'files_compressed', 'bytes_saved', 'errors'

    Example:
        >>> from edgar.storage_management import optimize_storage
        >>> # First see what would happen
        >>> result = optimize_storage(dry_run=True)
        >>> print(f"Would compress {result['files_compressed']} files")
        >>> # Then do it
        >>> result = optimize_storage(dry_run=False)
        >>> print(f"Saved {result['bytes_saved'] / 1e9:.1f} GB")
    """
    import gzip
    import shutil
    from edgar.core import get_edgar_data_directory

    storage_path = get_edgar_data_directory()
    files_compressed = 0
    bytes_saved = 0
    errors = 0

    for subdir in ['filings', 'companyfacts', 'submissions']:
        subdir_path = storage_path / subdir
        if not subdir_path.exists():
            continue

        for file_path in subdir_path.rglob('*'):
            if not file_path.is_file():
                continue

            # Check if file should be compressed
            if file_path.suffix in ['.json', '.xml', '.txt', '.nc'] and not str(file_path).endswith('.gz'):
                try:
                    original_size = file_path.stat().st_size

                    if not dry_run:
                        # Compress file
                        gz_path = Path(str(file_path) + '.gz')
                        with open(file_path, 'rb') as f_in:
                            with gzip.open(gz_path, 'wb') as f_out:
                                shutil.copyfileobj(f_in, f_out)

                        # Verify compressed file exists
                        if gz_path.exists():
                            compressed_size = gz_path.stat().st_size
                            bytes_saved += (original_size - compressed_size)
                            file_path.unlink()  # Remove original
                        else:
                            errors += 1
                            continue
                    else:
                        # Estimate 70% compression
                        bytes_saved += int(original_size * 0.7)

                    files_compressed += 1

                except Exception:
                    errors += 1
                    continue

    return {
        'files_compressed': files_compressed,
        'bytes_saved': bytes_saved,
        'errors': errors
    }


def cleanup_storage(days: int = 365, dry_run: bool = True) -> Dict[str, int]:
    """
    Remove old filings from local storage.

    Deletes filing files older than the specified number of days. This helps
    free up space for users who only need recent filings.

    Args:
        days: Remove filings older than this many days (default: 365)
        dry_run: If True, only report what would be deleted without making changes

    Returns:
        Dict with 'files_deleted', 'bytes_freed', 'errors'

    Example:
        >>> from edgar.storage_management import cleanup_storage
        >>> # First see what would be deleted
        >>> result = cleanup_storage(days=365, dry_run=True)
        >>> print(f"Would delete {result['files_deleted']} files")
        >>> # Then do it
        >>> result = cleanup_storage(days=365, dry_run=False)
        >>> print(f"Freed {result['bytes_freed'] / 1e9:.1f} GB")
    """
    from datetime import datetime, timedelta
    from edgar.core import get_edgar_data_directory

    storage_path = get_edgar_data_directory()
    cutoff_date = datetime.now() - timedelta(days=days)

    files_deleted = 0
    bytes_freed = 0
    errors = 0

    filings_dir = storage_path / 'filings'
    if not filings_dir.exists():
        return {'files_deleted': 0, 'bytes_freed': 0, 'errors': 0}

    for date_dir in filings_dir.iterdir():
        if not date_dir.is_dir():
            continue

        # Parse date from directory name (YYYYMMDD)
        if len(date_dir.name) == 8 and date_dir.name.isdigit():
            try:
                dir_date = datetime.strptime(date_dir.name, '%Y%m%d')

                if dir_date < cutoff_date:
                    # Delete all files in this directory
                    for file_path in date_dir.rglob('*'):
                        if file_path.is_file():
                            try:
                                file_size = file_path.stat().st_size
                                bytes_freed += file_size

                                if not dry_run:
                                    file_path.unlink()

                                files_deleted += 1
                            except Exception:
                                errors += 1
                                continue

                    # Remove empty directory
                    if not dry_run:
                        try:
                            # Remove all empty subdirectories
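                            # (reversed() so deeper paths come first and child
                            # directories are removed before their parents)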
                            for subdir in reversed(list(date_dir.rglob('*'))):
                                if subdir.is_dir() and not list(subdir.iterdir()):
                                    subdir.rmdir()
                            # Remove date directory if empty
                            if not list(date_dir.iterdir()):
                                date_dir.rmdir()
                        except Exception:
                            errors += 1

            except ValueError:
                continue

    return {
        'files_deleted': files_deleted,
        'bytes_freed': bytes_freed,
        'errors': errors
    }


def clear_cache(dry_run: bool = True, obsolete_only: bool = False) -> Dict[str, int]:
    """
    Clear HTTP cache directories to free up space.

    Removes cached HTTP responses from cache directories. By default clears all
    cache directories (_cache, _tcache). Use obsolete_only=True to only remove
    the obsolete _pcache directory (replaced by _tcache in Aug 2025).

    Args:
        dry_run: If True, only report what would be deleted without making changes
        obsolete_only: If True, only clear obsolete _pcache directory

    Returns:
        Dict with 'files_deleted', 'bytes_freed', 'errors'

    Example:
        >>> from edgar.storage_management import clear_cache
        >>> # Clear obsolete cache only
        >>> result = clear_cache(obsolete_only=True, dry_run=False)
        >>> print(f"Freed {result['bytes_freed'] / 1e9:.1f} GB")
        >>> # Clear all caches
        >>> result = clear_cache(dry_run=False)
        >>> print(f"Cleared {result['files_deleted']} cache files")
    """
    from edgar.core import get_edgar_data_directory

    storage_path = get_edgar_data_directory()
    files_deleted = 0
    bytes_freed = 0
    errors = 0

    # Determine which cache directories to clear
    if obsolete_only:
        cache_dirs = ['_pcache']  # Only obsolete cache
    else:
        cache_dirs = ['_cache', '_tcache']  # Active caches only

    for cache_dir_name in cache_dirs:
        cache_dir = storage_path / cache_dir_name
        if not cache_dir.exists():
            continue

        for file_path in cache_dir.rglob('*'):
            if file_path.is_file():
                try:
                    file_size = file_path.stat().st_size
                    bytes_freed += file_size

                    if not dry_run:
                        file_path.unlink()

                    files_deleted += 1
                except Exception:
                    errors += 1
                    continue

        # Remove empty directories
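        # (same reversed() pattern as cleanup_storage: deepest dirs are removed first)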
        if not dry_run:
            try:
                for subdir in reversed(list(cache_dir.rglob('*'))):
                    if subdir.is_dir() and not list(subdir.iterdir()):
                        subdir.rmdir()
                # Remove the cache directory itself if empty
                if not list(cache_dir.iterdir()):
                    cache_dir.rmdir()
            except Exception:
                errors += 1

    return {
        'files_deleted': files_deleted,
        'bytes_freed': bytes_freed,
        'errors': errors
    }
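

# Illustrative dry-run walkthrough (a sketch, not part of the public API): it
# only reads local storage and prints reports, using functions defined above.
if __name__ == "__main__":
    print(storage_info())                      # overview panel
    print(analyze_storage())                   # issues + recommendations
    preview = optimize_storage(dry_run=True)   # no files are modified
    print(f"Compression would touch {preview['files_compressed']:,} files "
          f"and save ~{preview['bytes_saved'] / 1e9:.2f} GB")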