Initial commit

kdusek
2025-12-09 12:13:01 +01:00
commit 8e654ed209
13332 changed files with 2695056 additions and 0 deletions


@@ -0,0 +1,717 @@
"""
Storage Management for EdgarTools
This module provides visibility, analytics, and management capabilities for EdgarTools'
local storage. It helps users understand what data is downloaded locally and provides
tools to optimize and clean up storage.
Functions:
storage_info() - Get overview of local storage with statistics
check_filing() - Check if a filing is available locally
check_filings_batch() - Check multiple filings efficiently
availability_summary() - Get summary of filing availability
analyze_storage() - Analyze storage with optimization recommendations
optimize_storage() - Compress uncompressed files
cleanup_storage() - Remove old files (dry-run by default)
clear_cache() - Clear HTTP cache directories (with obsolete cache detection)
Classes:
StorageInfo - Storage statistics dataclass with Rich display
StorageAnalysis - Storage analysis with recommendations
"""
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional, Tuple, TYPE_CHECKING
import time

from edgar.richtools import repr_rich

if TYPE_CHECKING:
    from edgar._filings import Filing

# Cache storage statistics for 60 seconds
_storage_cache: Optional[Tuple['StorageInfo', float]] = None
_CACHE_TTL = 60.0
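
# The module-level cache above is a simple (result, timestamp) pair: repeated calls to
# storage_info() within _CACHE_TTL seconds reuse the last scan instead of re-walking the
# filesystem. An illustrative (hypothetical) session, assuming local storage is populated:
#
#   >>> info = storage_info()                      # scans the .edgar directory
#   >>> info = storage_info()                      # within 60s: served from the cache
#   >>> info = storage_info(force_refresh=True)    # bypasses the cache and rescans
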
@dataclass
class StorageAnalysis:
    """Analysis of storage with optimization recommendations"""
    storage_info: 'StorageInfo'
    issues: List[str]
    recommendations: List[str]
    potential_savings_bytes: int

    def __rich__(self):
        """Rich Panel display with analysis and recommendations"""
        from rich.panel import Panel
        from rich.table import Table

        # Create main table
        analysis = Table(show_header=False, box=None, padding=(0, 2))
        analysis.add_column(style="dim")
        analysis.add_column()

        # Storage summary
        compressed_gb = self.storage_info.total_size_compressed / (1024**3)
        potential_gb = self.potential_savings_bytes / (1024**3)

        analysis.add_row("📊 Current Size:", f"{compressed_gb:.2f} GB")
        analysis.add_row("💾 Total Files:", f"{self.storage_info.file_count:,}")
        if self.potential_savings_bytes > 0:
            analysis.add_row("💰 Potential Savings:", f"{potential_gb:.2f} GB")

        # Issues section
        if self.issues:
            analysis.add_row("", "")  # Spacer
            analysis.add_row("[bold red]⚠️ Issues Found:[/bold red]", "")
            for issue in self.issues:
                analysis.add_row("", f"{issue}")

        # Recommendations section
        if self.recommendations:
            analysis.add_row("", "")  # Spacer
            analysis.add_row("[bold green]💡 Recommendations:[/bold green]", "")
            for rec in self.recommendations:
                analysis.add_row("", f"{rec}")

        # All good message
        if not self.issues and not self.recommendations:
            analysis.add_row("", "")
            analysis.add_row("[bold green]✅ Storage is optimized[/bold green]", "")

        return Panel(
            analysis,
            title="[bold]Storage Analysis[/bold]",
            border_style="blue",
            padding=(1, 2)
        )

    def __repr__(self):
        return repr_rich(self.__rich__())

@dataclass
class StorageInfo:
    """Statistics about EdgarTools local storage"""
    total_size_bytes: int
    total_size_compressed: int  # Actual disk usage
    file_count: int
    filing_count: int
    compression_savings_bytes: int
    compression_ratio: float
    by_type: Dict[str, int]  # {'filings': 1247, 'companyfacts': 18839, ...}
    by_form: Dict[str, int]  # {'10-K': 234, '10-Q': 456, ...} (future)
    by_year: Dict[int, int]  # {2025: 45, 2024: 1202, ...}
    last_updated: datetime
    storage_path: Path

    def __rich__(self):
        """Rich Panel display"""
        from rich.panel import Panel
        from rich.table import Table

        # Create statistics table
        stats = Table(show_header=False, box=None, padding=(0, 2))
        stats.add_column(style="dim", justify="right")
        stats.add_column(style="bold")

        # Format sizes
        total_gb = self.total_size_bytes / (1024**3)
        compressed_gb = self.total_size_compressed / (1024**3)
        savings_gb = self.compression_savings_bytes / (1024**3)

        stats.add_row("Total Size:", f"{total_gb:.2f} GB (uncompressed)")
        stats.add_row("Disk Usage:", f"{compressed_gb:.2f} GB (compressed)")
        stats.add_row("Space Saved:", f"{savings_gb:.2f} GB ({self.compression_ratio:.1%})")
        stats.add_row("Total Files:", f"{self.file_count:,}")
        stats.add_row("Filings:", f"{self.filing_count:,}")
        stats.add_row("Location:", str(self.storage_path))

        # Create breakdown by type with descriptive labels
        if self.by_type:
            stats.add_row("", "")  # Spacer
            # Define labels for cache directories
            cache_labels = {
                '_tcache': '_tcache (HTTP cache):',
                '_pcache': '_pcache (obsolete cache):',
                '_cache': '_cache (legacy cache):'
            }
            for data_type, count in sorted(self.by_type.items()):
                # Use descriptive label for cache directories
                label = cache_labels.get(data_type, f"{data_type}:")
                stats.add_row(label, f"{count:,} files")

        return Panel(
            stats,
            title="[bold]EdgarTools Local Storage[/bold]",
            border_style="blue",
            padding=(1, 2)
        )

    def __repr__(self):
        return repr_rich(self.__rich__())

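
# StorageInfo is a plain dataclass, so the underlying numbers are available directly when
# the Rich panel is not wanted. An illustrative (hypothetical) example:
#
#   >>> info = storage_info()
#   >>> info.by_type.get('filings', 0)     # file count under the filings/ subdirectory
#   >>> sorted(info.by_year.items())       # filing counts keyed by year
#   >>> info.compression_ratio             # fraction of space saved by compression, 0.0-1.0
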
def _scan_storage(force_refresh: bool = False) -> StorageInfo:
    """
    Scan .edgar directory and collect storage statistics.

    Results are cached for 60 seconds unless force_refresh=True.
    """
    global _storage_cache

    # Check cache
    if not force_refresh and _storage_cache is not None:
        info, timestamp = _storage_cache
        if time.time() - timestamp < _CACHE_TTL:
            return info

    from edgar.core import get_edgar_data_directory
    storage_path = get_edgar_data_directory()

    # Initialize counters
    total_size_bytes = 0
    total_size_compressed = 0
    file_count = 0
    filing_count = 0
    by_type = {}
    by_form = {}
    by_year = {}

    # Scan subdirectories
    for subdir in ['filings', 'companyfacts', 'submissions', 'reference', '_cache', '_pcache', '_tcache']:
        subdir_path = storage_path / subdir
        if not subdir_path.exists():
            continue

        type_files = 0

        # Recursively scan files
        for file_path in subdir_path.rglob('*'):
            if not file_path.is_file():
                continue

            file_size = file_path.stat().st_size
            type_files += 1
            file_count += 1
            total_size_compressed += file_size

            # Calculate uncompressed size
            if str(file_path).endswith('.gz'):
                # Estimate: compressed files are typically 70% smaller
                # For accuracy, could decompress header, but that's expensive
                estimated_uncompressed = file_size / 0.3  # Assuming 70% compression
                total_size_bytes += estimated_uncompressed
            else:
                total_size_bytes += file_size

            # Count filings specifically
            if subdir == 'filings' and (file_path.suffix == '.nc' or file_path.name.endswith('.nc.gz')):
                filing_count += 1

                # Extract year from path (filings/YYYYMMDD/*.nc)
                date_dir = file_path.parent.name
                if len(date_dir) == 8 and date_dir.isdigit():
                    year = int(date_dir[:4])
                    by_year[year] = by_year.get(year, 0) + 1

        by_type[subdir] = type_files

    # Calculate compression savings
    compression_savings = total_size_bytes - total_size_compressed
    compression_ratio = compression_savings / total_size_bytes if total_size_bytes > 0 else 0.0

    # Create info object
    info = StorageInfo(
        total_size_bytes=int(total_size_bytes),
        total_size_compressed=int(total_size_compressed),
        file_count=file_count,
        filing_count=filing_count,
        compression_savings_bytes=int(compression_savings),
        compression_ratio=compression_ratio,
        by_type=by_type,
        by_form={},  # Phase 2: parse form types from filenames
        by_year=by_year,
        last_updated=datetime.now(),
        storage_path=storage_path
    )

    # Update cache
    _storage_cache = (info, time.time())
    return info
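
# The scan above only estimates the uncompressed size of .gz files (the "70% smaller"
# heuristic noted in the loop). If an exact figure were ever needed, the gzip format stores
# the uncompressed length modulo 2**32 in the last four bytes (ISIZE) of each member, which
# can be read without decompressing. A hypothetical helper, valid for single-member .gz files:
#
#   def _gzip_uncompressed_size(path: Path) -> int:
#       with open(path, 'rb') as f:
#           f.seek(-4, 2)                              # seek to the 4-byte ISIZE trailer
#           return int.from_bytes(f.read(4), 'little')
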
def storage_info(force_refresh: bool = False) -> StorageInfo:
    """
    Get overview of EdgarTools local storage.

    Returns statistics about total size, file counts, compression ratios,
    and breakdown by data type.

    Args:
        force_refresh: If True, bypass cache and rescan filesystem

    Returns:
        StorageInfo: Storage statistics with Rich display support

    Example:
        >>> from edgar.storage_management import storage_info
        >>> info = storage_info()
        >>> print(info)  # Rich-formatted panel
        >>> print(f"Total size: {info.total_size_bytes / 1e9:.2f} GB")
    """
    return _scan_storage(force_refresh=force_refresh)

def check_filing(filing: 'Filing') -> bool:
    """
    Check if a filing is available in local storage.

    Args:
        filing: Filing object to check

    Returns:
        bool: True if filing exists locally, False otherwise

    Example:
        >>> from edgar import Company
        >>> from edgar.storage_management import check_filing
        >>> filing = Company("AAPL").latest("10-K")
        >>> if check_filing(filing):
        ...     print("Available offline!")
    """
    from edgar.storage import local_filing_path

    local_path = local_filing_path(
        filing_date=str(filing.filing_date),
        accession_number=filing.accession_no
    )
    return local_path.exists()

def check_filings_batch(filings: List['Filing']) -> Dict[str, bool]:
    """
    Efficiently check availability of multiple filings.

    Args:
        filings: List of Filing objects to check

    Returns:
        Dict mapping accession number to availability (True/False)

    Example:
        >>> from edgar import get_filings
        >>> from edgar.storage_management import check_filings_batch
        >>> filings = get_filings(filing_date="2025-01-15").sample(10)
        >>> availability = check_filings_batch(filings)
        >>> available = [f for f in filings if availability[f.accession_no]]
        >>> print(f"{len(available)} of {len(filings)} available offline")
    """
    from edgar.storage import local_filing_path

    availability = {}
    for filing in filings:
        local_path = local_filing_path(
            filing_date=str(filing.filing_date),
            accession_number=filing.accession_no
        )
        availability[filing.accession_no] = local_path.exists()

    return availability

def availability_summary(filings: List['Filing']) -> str:
    """
    Get a summary string of filing availability.

    Args:
        filings: List of Filing objects

    Returns:
        str: Summary like "45 of 100 filings available offline (45%)"

    Example:
        >>> from edgar import get_filings
        >>> from edgar.storage_management import availability_summary
        >>> filings = get_filings(filing_date="2025-01-15").head(100)
        >>> print(availability_summary(filings))
        45 of 100 filings available offline (45%)
    """
    availability = check_filings_batch(filings)
    available_count = sum(availability.values())
    total_count = len(filings)
    percentage = (available_count / total_count * 100) if total_count > 0 else 0

    return f"{available_count} of {total_count} filings available offline ({percentage:.0f}%)"

def analyze_storage(force_refresh: bool = False) -> StorageAnalysis:
    """
    Analyze storage and provide optimization recommendations.

    Scans local storage for potential issues and suggests improvements like:
    - Compressing uncompressed files
    - Cleaning up old cache files
    - Identifying duplicate or orphaned data

    Args:
        force_refresh: If True, bypass cache and rescan filesystem

    Returns:
        StorageAnalysis: Analysis with issues and recommendations

    Example:
        >>> from edgar.storage_management import analyze_storage
        >>> analysis = analyze_storage()
        >>> print(analysis)  # Rich-formatted panel with recommendations
        >>> if analysis.potential_savings_bytes > 1e9:
        ...     print(f"Can save {analysis.potential_savings_bytes / 1e9:.1f} GB")
    """
    from edgar.core import get_edgar_data_directory

    info = storage_info(force_refresh=force_refresh)
    storage_path = get_edgar_data_directory()

    issues = []
    recommendations = []
    potential_savings = 0

    # Check for uncompressed files
    uncompressed_count = 0
    uncompressed_size = 0
    for subdir in ['filings', 'companyfacts', 'submissions']:
        subdir_path = storage_path / subdir
        if not subdir_path.exists():
            continue

        for file_path in subdir_path.rglob('*'):
            if not file_path.is_file():
                continue

            # Check if file should be compressed
            if file_path.suffix in ['.json', '.xml', '.txt', '.nc'] and not str(file_path).endswith('.gz'):
                uncompressed_count += 1
                file_size = file_path.stat().st_size
                uncompressed_size += file_size
                # Estimate 70% compression savings
                potential_savings += int(file_size * 0.7)

    if uncompressed_count > 0:
        issues.append(f"Found {uncompressed_count:,} uncompressed files ({uncompressed_size / 1e9:.2f} GB)")
        recommendations.append(f"Run optimize_storage() to compress files and save ~{potential_savings / 1e9:.1f} GB")

    # Check for obsolete _pcache directory (replaced by _tcache in commit 3bfba7e)
    pcache_path = storage_path / '_pcache'
    pcache_size = 0
    pcache_files = 0
    if pcache_path.exists():
        for file_path in pcache_path.rglob('*'):
            if file_path.is_file():
                pcache_files += 1
                pcache_size += file_path.stat().st_size

    if pcache_files > 0:
        issues.append(f"Obsolete _pcache directory contains {pcache_files:,} files ({pcache_size / 1e9:.2f} GB)")
        recommendations.append(f"Run clear_cache(obsolete_only=True) to remove old cache and free {pcache_size / 1e9:.1f} GB")

    # Check for large cache directories
    cache_size = 0
    cache_files = 0
    for cache_dir in ['_cache', '_tcache']:  # Only check active cache directories
        cache_path = storage_path / cache_dir
        if cache_path.exists():
            for file_path in cache_path.rglob('*'):
                if file_path.is_file():
                    cache_files += 1
                    cache_size += file_path.stat().st_size

    if cache_size > 1e9:  # More than 1 GB
        issues.append(f"Cache directories contain {cache_files:,} files ({cache_size / 1e9:.2f} GB)")
        recommendations.append(f"Run clear_cache() to free up {cache_size / 1e9:.1f} GB")

    # Check for old filings (over 1 year old) - only if many exist
    from datetime import datetime, timedelta
    old_threshold = datetime.now() - timedelta(days=365)
    old_filings = 0
    old_filings_size = 0

    filings_dir = storage_path / 'filings'
    if filings_dir.exists():
        for date_dir in filings_dir.iterdir():
            if not date_dir.is_dir():
                continue

            # Parse date from directory name (YYYYMMDD)
            if len(date_dir.name) == 8 and date_dir.name.isdigit():
                try:
                    dir_date = datetime.strptime(date_dir.name, '%Y%m%d')
                    if dir_date < old_threshold:
                        for file_path in date_dir.rglob('*'):
                            if file_path.is_file():
                                old_filings += 1
                                old_filings_size += file_path.stat().st_size
                except ValueError:
                    continue

    if old_filings > 100:  # Only flag if substantial
        recommendations.append(
            f"Consider cleanup_storage(days=365) to remove {old_filings:,} old filings "
            f"({old_filings_size / 1e9:.1f} GB)"
        )

    # Overall health check
    if not issues:
        recommendations.append("Storage is well-optimized!")

    return StorageAnalysis(
        storage_info=info,
        issues=issues,
        recommendations=recommendations,
        potential_savings_bytes=potential_savings
    )
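
# The recommendations above are plain strings intended for display; callers that want to act
# on them programmatically can branch on the numeric fields instead. An illustrative
# (hypothetical) pattern, previewing before touching any files:
#
#   >>> analysis = analyze_storage()
#   >>> if analysis.potential_savings_bytes > 1e9:
#   ...     optimize_storage(dry_run=True)     # preview first
#   ...     optimize_storage(dry_run=False)    # then compress for real
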
def optimize_storage(dry_run: bool = True) -> Dict[str, int]:
    """
    Compress uncompressed files to save disk space.

    Compresses .json, .xml, .txt, and .nc files in filings, companyfacts,
    and submissions directories using gzip. Original files are replaced with
    .gz versions.

    Args:
        dry_run: If True, only report what would be done without making changes

    Returns:
        Dict with 'files_compressed', 'bytes_saved', 'errors'

    Example:
        >>> from edgar.storage_management import optimize_storage
        >>> # First see what would happen
        >>> result = optimize_storage(dry_run=True)
        >>> print(f"Would compress {result['files_compressed']} files")
        >>> # Then do it
        >>> result = optimize_storage(dry_run=False)
        >>> print(f"Saved {result['bytes_saved'] / 1e9:.1f} GB")
    """
    import gzip
    import shutil
    from edgar.core import get_edgar_data_directory

    storage_path = get_edgar_data_directory()
    files_compressed = 0
    bytes_saved = 0
    errors = 0

    for subdir in ['filings', 'companyfacts', 'submissions']:
        subdir_path = storage_path / subdir
        if not subdir_path.exists():
            continue

        for file_path in subdir_path.rglob('*'):
            if not file_path.is_file():
                continue

            # Check if file should be compressed
            if file_path.suffix in ['.json', '.xml', '.txt', '.nc'] and not str(file_path).endswith('.gz'):
                try:
                    original_size = file_path.stat().st_size

                    if not dry_run:
                        # Compress file
                        gz_path = Path(str(file_path) + '.gz')
                        with open(file_path, 'rb') as f_in:
                            with gzip.open(gz_path, 'wb') as f_out:
                                shutil.copyfileobj(f_in, f_out)

                        # Verify compressed file exists
                        if gz_path.exists():
                            compressed_size = gz_path.stat().st_size
                            bytes_saved += (original_size - compressed_size)
                            file_path.unlink()  # Remove original
                        else:
                            errors += 1
                            continue
                    else:
                        # Estimate 70% compression
                        bytes_saved += int(original_size * 0.7)

                    files_compressed += 1
                except Exception:
                    errors += 1
                    continue

    return {
        'files_compressed': files_compressed,
        'bytes_saved': bytes_saved,
        'errors': errors
    }
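
# Note that in dry-run mode 'bytes_saved' is only the 70% heuristic used above, while a real
# run reports the measured difference between original and compressed sizes. A hypothetical
# way to compare the two after the fact:
#
#   >>> estimate = optimize_storage(dry_run=True)['bytes_saved']
#   >>> actual = optimize_storage(dry_run=False)['bytes_saved']
#   >>> print(f"estimated {estimate / 1e9:.1f} GB, actually saved {actual / 1e9:.1f} GB")
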
def cleanup_storage(days: int = 365, dry_run: bool = True) -> Dict[str, int]:
    """
    Remove old filings from local storage.

    Deletes filing files older than the specified number of days. This helps
    free up space for users who only need recent filings.

    Args:
        days: Remove filings older than this many days (default: 365)
        dry_run: If True, only report what would be deleted without making changes

    Returns:
        Dict with 'files_deleted', 'bytes_freed', 'errors'

    Example:
        >>> from edgar.storage_management import cleanup_storage
        >>> # First see what would be deleted
        >>> result = cleanup_storage(days=365, dry_run=True)
        >>> print(f"Would delete {result['files_deleted']} files")
        >>> # Then do it
        >>> result = cleanup_storage(days=365, dry_run=False)
        >>> print(f"Freed {result['bytes_freed'] / 1e9:.1f} GB")
    """
    from datetime import datetime, timedelta
    from edgar.core import get_edgar_data_directory

    storage_path = get_edgar_data_directory()
    cutoff_date = datetime.now() - timedelta(days=days)

    files_deleted = 0
    bytes_freed = 0
    errors = 0

    filings_dir = storage_path / 'filings'
    if not filings_dir.exists():
        return {'files_deleted': 0, 'bytes_freed': 0, 'errors': 0}

    for date_dir in filings_dir.iterdir():
        if not date_dir.is_dir():
            continue

        # Parse date from directory name (YYYYMMDD)
        if len(date_dir.name) == 8 and date_dir.name.isdigit():
            try:
                dir_date = datetime.strptime(date_dir.name, '%Y%m%d')
                if dir_date < cutoff_date:
                    # Delete all files in this directory
                    for file_path in date_dir.rglob('*'):
                        if file_path.is_file():
                            try:
                                file_size = file_path.stat().st_size
                                bytes_freed += file_size

                                if not dry_run:
                                    file_path.unlink()

                                files_deleted += 1
                            except Exception:
                                errors += 1
                                continue

                    # Remove empty directory
                    if not dry_run:
                        try:
                            # Remove all empty subdirectories
                            for subdir in reversed(list(date_dir.rglob('*'))):
                                if subdir.is_dir() and not list(subdir.iterdir()):
                                    subdir.rmdir()
                            # Remove date directory if empty
                            if not list(date_dir.iterdir()):
                                date_dir.rmdir()
                        except Exception:
                            errors += 1
            except ValueError:
                continue

    return {
        'files_deleted': files_deleted,
        'bytes_freed': bytes_freed,
        'errors': errors
    }
def clear_cache(dry_run: bool = True, obsolete_only: bool = False) -> Dict[str, int]:
    """
    Clear HTTP cache directories to free up space.

    Removes cached HTTP responses from cache directories. By default clears the
    active cache directories (_cache and _tcache). Use obsolete_only=True to only
    remove the obsolete _pcache directory (replaced by _tcache in Aug 2025).

    Args:
        dry_run: If True, only report what would be deleted without making changes
        obsolete_only: If True, only clear obsolete _pcache directory

    Returns:
        Dict with 'files_deleted', 'bytes_freed', 'errors'

    Example:
        >>> from edgar.storage_management import clear_cache
        >>> # Clear obsolete cache only
        >>> result = clear_cache(obsolete_only=True, dry_run=False)
        >>> print(f"Freed {result['bytes_freed'] / 1e9:.1f} GB")
        >>> # Clear all caches
        >>> result = clear_cache(dry_run=False)
        >>> print(f"Cleared {result['files_deleted']} cache files")
    """
    from edgar.core import get_edgar_data_directory

    storage_path = get_edgar_data_directory()
    files_deleted = 0
    bytes_freed = 0
    errors = 0

    # Determine which cache directories to clear
    if obsolete_only:
        cache_dirs = ['_pcache']  # Only obsolete cache
    else:
        cache_dirs = ['_cache', '_tcache']  # Active caches only

    for cache_dir_name in cache_dirs:
        cache_dir = storage_path / cache_dir_name
        if not cache_dir.exists():
            continue

        for file_path in cache_dir.rglob('*'):
            if file_path.is_file():
                try:
                    file_size = file_path.stat().st_size
                    bytes_freed += file_size

                    if not dry_run:
                        file_path.unlink()

                    files_deleted += 1
                except Exception:
                    errors += 1
                    continue

        # Remove empty directories
        if not dry_run:
            try:
                for subdir in reversed(list(cache_dir.rglob('*'))):
                    if subdir.is_dir() and not list(subdir.iterdir()):
                        subdir.rmdir()
                # Remove the cache directory itself if empty
                if not list(cache_dir.iterdir()):
                    cache_dir.rmdir()
            except Exception:
                errors += 1

    return {
        'files_deleted': files_deleted,
        'bytes_freed': bytes_freed,
        'errors': errors
    }
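
# A hypothetical end-to-end maintenance routine built from the functions above, keeping the
# dry-run defaults so nothing is deleted until the previews look right:
#
#   >>> print(storage_info())                           # what is on disk
#   >>> print(analyze_storage())                        # issues and recommendations
#   >>> clear_cache(obsolete_only=True, dry_run=False)  # drop the obsolete _pcache
#   >>> cleanup_storage(days=365, dry_run=True)         # preview old-filing cleanup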