Initial commit

kdusek
2025-12-09 12:13:01 +01:00
commit 8e654ed209
13332 changed files with 2695056 additions and 0 deletions


@@ -0,0 +1,717 @@
"""
Storage Management for EdgarTools
This module provides visibility, analytics, and management capabilities for EdgarTools'
local storage. It helps users understand what data is downloaded locally and provides
tools to optimize and clean up storage.
Functions:
storage_info() - Get overview of local storage with statistics
check_filing() - Check if a filing is available locally
check_filings_batch() - Check multiple filings efficiently
availability_summary() - Get summary of filing availability
analyze_storage() - Analyze storage with optimization recommendations
optimize_storage() - Compress uncompressed files
cleanup_storage() - Remove old files (dry-run by default)
clear_cache() - Clear HTTP cache directories (with obsolete cache detection)
Classes:
StorageInfo - Storage statistics dataclass with Rich display
StorageAnalysis - Storage analysis with recommendations
"""
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional, Tuple, TYPE_CHECKING
import time

from edgar.richtools import repr_rich

if TYPE_CHECKING:
    from edgar._filings import Filing

# Cache storage statistics for 60 seconds
_storage_cache: Optional[Tuple['StorageInfo', float]] = None
_CACHE_TTL = 60.0
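
# The module-level cache above is a simple (result, timestamp) pair: repeated calls to
# storage_info() within _CACHE_TTL seconds reuse the last scan instead of re-walking the
# filesystem. An illustrative (hypothetical) session, assuming local storage is populated:
#
#   >>> info = storage_info()                      # scans the .edgar directory
#   >>> info = storage_info()                      # within 60s: served from the cache
#   >>> info = storage_info(force_refresh=True)    # bypasses the cache and rescans
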
@dataclass
class StorageAnalysis:
    """Analysis of storage with optimization recommendations"""
    storage_info: 'StorageInfo'
    issues: List[str]
    recommendations: List[str]
    potential_savings_bytes: int

    def __rich__(self):
        """Rich Panel display with analysis and recommendations"""
        from rich.panel import Panel
        from rich.table import Table

        # Create main table
        analysis = Table(show_header=False, box=None, padding=(0, 2))
        analysis.add_column(style="dim")
        analysis.add_column()

        # Storage summary
        compressed_gb = self.storage_info.total_size_compressed / (1024**3)
        potential_gb = self.potential_savings_bytes / (1024**3)

        analysis.add_row("📊 Current Size:", f"{compressed_gb:.2f} GB")
        analysis.add_row("💾 Total Files:", f"{self.storage_info.file_count:,}")
        if self.potential_savings_bytes > 0:
            analysis.add_row("💰 Potential Savings:", f"{potential_gb:.2f} GB")

        # Issues section
        if self.issues:
            analysis.add_row("", "")  # Spacer
            analysis.add_row("[bold red]⚠️ Issues Found:[/bold red]", "")
            for issue in self.issues:
                analysis.add_row("", f"{issue}")

        # Recommendations section
        if self.recommendations:
            analysis.add_row("", "")  # Spacer
            analysis.add_row("[bold green]💡 Recommendations:[/bold green]", "")
            for rec in self.recommendations:
                analysis.add_row("", f"{rec}")

        # All good message
        if not self.issues and not self.recommendations:
            analysis.add_row("", "")
            analysis.add_row("[bold green]✅ Storage is optimized[/bold green]", "")

        return Panel(
            analysis,
            title="[bold]Storage Analysis[/bold]",
            border_style="blue",
            padding=(1, 2)
        )

    def __repr__(self):
        return repr_rich(self.__rich__())

@dataclass
class StorageInfo:
    """Statistics about EdgarTools local storage"""
    total_size_bytes: int
    total_size_compressed: int  # Actual disk usage
    file_count: int
    filing_count: int
    compression_savings_bytes: int
    compression_ratio: float
    by_type: Dict[str, int]  # {'filings': 1247, 'companyfacts': 18839, ...}
    by_form: Dict[str, int]  # {'10-K': 234, '10-Q': 456, ...} (future)
    by_year: Dict[int, int]  # {2025: 45, 2024: 1202, ...}
    last_updated: datetime
    storage_path: Path

    def __rich__(self):
        """Rich Panel display"""
        from rich.panel import Panel
        from rich.table import Table

        # Create statistics table
        stats = Table(show_header=False, box=None, padding=(0, 2))
        stats.add_column(style="dim", justify="right")
        stats.add_column(style="bold")

        # Format sizes
        total_gb = self.total_size_bytes / (1024**3)
        compressed_gb = self.total_size_compressed / (1024**3)
        savings_gb = self.compression_savings_bytes / (1024**3)

        stats.add_row("Total Size:", f"{total_gb:.2f} GB (uncompressed)")
        stats.add_row("Disk Usage:", f"{compressed_gb:.2f} GB (compressed)")
        stats.add_row("Space Saved:", f"{savings_gb:.2f} GB ({self.compression_ratio:.1%})")
        stats.add_row("Total Files:", f"{self.file_count:,}")
        stats.add_row("Filings:", f"{self.filing_count:,}")
        stats.add_row("Location:", str(self.storage_path))

        # Create breakdown by type with descriptive labels
        if self.by_type:
            stats.add_row("", "")  # Spacer
            # Define labels for cache directories
            cache_labels = {
                '_tcache': '_tcache (HTTP cache):',
                '_pcache': '_pcache (obsolete cache):',
                '_cache': '_cache (legacy cache):'
            }
            for data_type, count in sorted(self.by_type.items()):
                # Use descriptive label for cache directories
                label = cache_labels.get(data_type, f"{data_type}:")
                stats.add_row(label, f"{count:,} files")

        return Panel(
            stats,
            title="[bold]EdgarTools Local Storage[/bold]",
            border_style="blue",
            padding=(1, 2)
        )

    def __repr__(self):
        return repr_rich(self.__rich__())

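
# StorageInfo is a plain dataclass, so the underlying numbers are available directly when
# the Rich panel is not wanted. An illustrative (hypothetical) example:
#
#   >>> info = storage_info()
#   >>> info.by_type.get('filings', 0)     # file count under the filings/ subdirectory
#   >>> sorted(info.by_year.items())       # filing counts keyed by year
#   >>> info.compression_ratio             # fraction of space saved by compression, 0.0-1.0
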
def _scan_storage(force_refresh: bool = False) -> StorageInfo:
    """
    Scan .edgar directory and collect storage statistics.

    Results are cached for 60 seconds unless force_refresh=True.
    """
    global _storage_cache

    # Check cache
    if not force_refresh and _storage_cache is not None:
        info, timestamp = _storage_cache
        if time.time() - timestamp < _CACHE_TTL:
            return info

    from edgar.core import get_edgar_data_directory
    storage_path = get_edgar_data_directory()

    # Initialize counters
    total_size_bytes = 0
    total_size_compressed = 0
    file_count = 0
    filing_count = 0
    by_type = {}
    by_form = {}
    by_year = {}

    # Scan subdirectories
    for subdir in ['filings', 'companyfacts', 'submissions', 'reference', '_cache', '_pcache', '_tcache']:
        subdir_path = storage_path / subdir
        if not subdir_path.exists():
            continue

        type_files = 0

        # Recursively scan files
        for file_path in subdir_path.rglob('*'):
            if not file_path.is_file():
                continue

            file_size = file_path.stat().st_size
            type_files += 1
            file_count += 1
            total_size_compressed += file_size

            # Calculate uncompressed size
            if str(file_path).endswith('.gz'):
                # Estimate: compressed files are typically 70% smaller
                # For accuracy, could decompress header, but that's expensive
                estimated_uncompressed = file_size / 0.3  # Assuming 70% compression
                total_size_bytes += estimated_uncompressed
            else:
                total_size_bytes += file_size

            # Count filings specifically
            if subdir == 'filings' and (file_path.suffix == '.nc' or file_path.name.endswith('.nc.gz')):
                filing_count += 1

                # Extract year from path (filings/YYYYMMDD/*.nc)
                date_dir = file_path.parent.name
                if len(date_dir) == 8 and date_dir.isdigit():
                    year = int(date_dir[:4])
                    by_year[year] = by_year.get(year, 0) + 1

        by_type[subdir] = type_files

    # Calculate compression savings
    compression_savings = total_size_bytes - total_size_compressed
    compression_ratio = compression_savings / total_size_bytes if total_size_bytes > 0 else 0.0

    # Create info object
    info = StorageInfo(
        total_size_bytes=int(total_size_bytes),
        total_size_compressed=int(total_size_compressed),
        file_count=file_count,
        filing_count=filing_count,
        compression_savings_bytes=int(compression_savings),
        compression_ratio=compression_ratio,
        by_type=by_type,
        by_form={},  # Phase 2: parse form types from filenames
        by_year=by_year,
        last_updated=datetime.now(),
        storage_path=storage_path
    )

    # Update cache
    _storage_cache = (info, time.time())
    return info
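
# The scan above only estimates the uncompressed size of .gz files (the "70% smaller"
# heuristic noted in the loop). If an exact figure were ever needed, the gzip format stores
# the uncompressed length modulo 2**32 in the last four bytes (ISIZE) of each member, which
# can be read without decompressing. A hypothetical helper, valid for single-member .gz files:
#
#   def _gzip_uncompressed_size(path: Path) -> int:
#       with open(path, 'rb') as f:
#           f.seek(-4, 2)                              # seek to the 4-byte ISIZE trailer
#           return int.from_bytes(f.read(4), 'little')
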
def storage_info(force_refresh: bool = False) -> StorageInfo:
    """
    Get overview of EdgarTools local storage.

    Returns statistics about total size, file counts, compression ratios,
    and breakdown by data type.

    Args:
        force_refresh: If True, bypass cache and rescan filesystem

    Returns:
        StorageInfo: Storage statistics with Rich display support

    Example:
        >>> from edgar.storage_management import storage_info
        >>> info = storage_info()
        >>> print(info)  # Rich-formatted panel
        >>> print(f"Total size: {info.total_size_bytes / 1e9:.2f} GB")
    """
    return _scan_storage(force_refresh=force_refresh)

def check_filing(filing: 'Filing') -> bool:
    """
    Check if a filing is available in local storage.

    Args:
        filing: Filing object to check

    Returns:
        bool: True if filing exists locally, False otherwise

    Example:
        >>> from edgar import Company
        >>> from edgar.storage_management import check_filing
        >>> filing = Company("AAPL").latest("10-K")
        >>> if check_filing(filing):
        ...     print("Available offline!")
    """
    from edgar.storage import local_filing_path

    local_path = local_filing_path(
        filing_date=str(filing.filing_date),
        accession_number=filing.accession_no
    )
    return local_path.exists()

def check_filings_batch(filings: List['Filing']) -> Dict[str, bool]:
    """
    Efficiently check availability of multiple filings.

    Args:
        filings: List of Filing objects to check

    Returns:
        Dict mapping accession number to availability (True/False)

    Example:
        >>> from edgar import get_filings
        >>> from edgar.storage_management import check_filings_batch
        >>> filings = get_filings(filing_date="2025-01-15").sample(10)
        >>> availability = check_filings_batch(filings)
        >>> available = [f for f in filings if availability[f.accession_no]]
        >>> print(f"{len(available)} of {len(filings)} available offline")
    """
    from edgar.storage import local_filing_path

    availability = {}
    for filing in filings:
        local_path = local_filing_path(
            filing_date=str(filing.filing_date),
            accession_number=filing.accession_no
        )
        availability[filing.accession_no] = local_path.exists()

    return availability

def availability_summary(filings: List['Filing']) -> str:
    """
    Get a summary string of filing availability.

    Args:
        filings: List of Filing objects

    Returns:
        str: Summary like "45 of 100 filings available offline (45%)"

    Example:
        >>> from edgar import get_filings
        >>> from edgar.storage_management import availability_summary
        >>> filings = get_filings(filing_date="2025-01-15").head(100)
        >>> print(availability_summary(filings))
        45 of 100 filings available offline (45%)
    """
    availability = check_filings_batch(filings)
    available_count = sum(availability.values())
    total_count = len(filings)
    percentage = (available_count / total_count * 100) if total_count > 0 else 0

    return f"{available_count} of {total_count} filings available offline ({percentage:.0f}%)"

def analyze_storage(force_refresh: bool = False) -> StorageAnalysis:
    """
    Analyze storage and provide optimization recommendations.

    Scans local storage for potential issues and suggests improvements like:
    - Compressing uncompressed files
    - Cleaning up old cache files
    - Identifying duplicate or orphaned data

    Args:
        force_refresh: If True, bypass cache and rescan filesystem

    Returns:
        StorageAnalysis: Analysis with issues and recommendations

    Example:
        >>> from edgar.storage_management import analyze_storage
        >>> analysis = analyze_storage()
        >>> print(analysis)  # Rich-formatted panel with recommendations
        >>> if analysis.potential_savings_bytes > 1e9:
        ...     print(f"Can save {analysis.potential_savings_bytes / 1e9:.1f} GB")
    """
    from edgar.core import get_edgar_data_directory

    info = storage_info(force_refresh=force_refresh)
    storage_path = get_edgar_data_directory()

    issues = []
    recommendations = []
    potential_savings = 0

    # Check for uncompressed files
    uncompressed_count = 0
    uncompressed_size = 0
    for subdir in ['filings', 'companyfacts', 'submissions']:
        subdir_path = storage_path / subdir
        if not subdir_path.exists():
            continue

        for file_path in subdir_path.rglob('*'):
            if not file_path.is_file():
                continue

            # Check if file should be compressed
            if file_path.suffix in ['.json', '.xml', '.txt', '.nc'] and not str(file_path).endswith('.gz'):
                uncompressed_count += 1
                file_size = file_path.stat().st_size
                uncompressed_size += file_size
                # Estimate 70% compression savings
                potential_savings += int(file_size * 0.7)

    if uncompressed_count > 0:
        issues.append(f"Found {uncompressed_count:,} uncompressed files ({uncompressed_size / 1e9:.2f} GB)")
        recommendations.append(f"Run optimize_storage() to compress files and save ~{potential_savings / 1e9:.1f} GB")

    # Check for obsolete _pcache directory (replaced by _tcache in commit 3bfba7e)
    pcache_path = storage_path / '_pcache'
    pcache_size = 0
    pcache_files = 0
    if pcache_path.exists():
        for file_path in pcache_path.rglob('*'):
            if file_path.is_file():
                pcache_files += 1
                pcache_size += file_path.stat().st_size

    if pcache_files > 0:
        issues.append(f"Obsolete _pcache directory contains {pcache_files:,} files ({pcache_size / 1e9:.2f} GB)")
        recommendations.append(f"Run clear_cache(obsolete_only=True) to remove old cache and free {pcache_size / 1e9:.1f} GB")

    # Check for large cache directories
    cache_size = 0
    cache_files = 0
    for cache_dir in ['_cache', '_tcache']:  # Only check active cache directories
        cache_path = storage_path / cache_dir
        if cache_path.exists():
            for file_path in cache_path.rglob('*'):
                if file_path.is_file():
                    cache_files += 1
                    cache_size += file_path.stat().st_size

    if cache_size > 1e9:  # More than 1 GB
        issues.append(f"Cache directories contain {cache_files:,} files ({cache_size / 1e9:.2f} GB)")
        recommendations.append(f"Run clear_cache() to free up {cache_size / 1e9:.1f} GB")

    # Check for old filings (over 1 year old) - only if many exist
    from datetime import datetime, timedelta
    old_threshold = datetime.now() - timedelta(days=365)
    old_filings = 0
    old_filings_size = 0

    filings_dir = storage_path / 'filings'
    if filings_dir.exists():
        for date_dir in filings_dir.iterdir():
            if not date_dir.is_dir():
                continue

            # Parse date from directory name (YYYYMMDD)
            if len(date_dir.name) == 8 and date_dir.name.isdigit():
                try:
                    dir_date = datetime.strptime(date_dir.name, '%Y%m%d')
                    if dir_date < old_threshold:
                        for file_path in date_dir.rglob('*'):
                            if file_path.is_file():
                                old_filings += 1
                                old_filings_size += file_path.stat().st_size
                except ValueError:
                    continue

    if old_filings > 100:  # Only flag if substantial
        recommendations.append(
            f"Consider cleanup_storage(days=365) to remove {old_filings:,} old filings "
            f"({old_filings_size / 1e9:.1f} GB)"
        )

    # Overall health check
    if not issues:
        recommendations.append("Storage is well-optimized!")

    return StorageAnalysis(
        storage_info=info,
        issues=issues,
        recommendations=recommendations,
        potential_savings_bytes=potential_savings
    )
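
# The recommendations above are plain strings intended for display; callers that want to act
# on them programmatically can branch on the numeric fields instead. An illustrative
# (hypothetical) pattern, previewing before touching any files:
#
#   >>> analysis = analyze_storage()
#   >>> if analysis.potential_savings_bytes > 1e9:
#   ...     optimize_storage(dry_run=True)     # preview first
#   ...     optimize_storage(dry_run=False)    # then compress for real
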
def optimize_storage(dry_run: bool = True) -> Dict[str, int]:
    """
    Compress uncompressed files to save disk space.

    Compresses .json, .xml, .txt, and .nc files in filings, companyfacts,
    and submissions directories using gzip. Original files are replaced with
    .gz versions.

    Args:
        dry_run: If True, only report what would be done without making changes

    Returns:
        Dict with 'files_compressed', 'bytes_saved', 'errors'

    Example:
        >>> from edgar.storage_management import optimize_storage
        >>> # First see what would happen
        >>> result = optimize_storage(dry_run=True)
        >>> print(f"Would compress {result['files_compressed']} files")
        >>> # Then do it
        >>> result = optimize_storage(dry_run=False)
        >>> print(f"Saved {result['bytes_saved'] / 1e9:.1f} GB")
    """
    import gzip
    import shutil
    from edgar.core import get_edgar_data_directory

    storage_path = get_edgar_data_directory()
    files_compressed = 0
    bytes_saved = 0
    errors = 0

    for subdir in ['filings', 'companyfacts', 'submissions']:
        subdir_path = storage_path / subdir
        if not subdir_path.exists():
            continue

        for file_path in subdir_path.rglob('*'):
            if not file_path.is_file():
                continue

            # Check if file should be compressed
            if file_path.suffix in ['.json', '.xml', '.txt', '.nc'] and not str(file_path).endswith('.gz'):
                try:
                    original_size = file_path.stat().st_size

                    if not dry_run:
                        # Compress file
                        gz_path = Path(str(file_path) + '.gz')
                        with open(file_path, 'rb') as f_in:
                            with gzip.open(gz_path, 'wb') as f_out:
                                shutil.copyfileobj(f_in, f_out)

                        # Verify compressed file exists
                        if gz_path.exists():
                            compressed_size = gz_path.stat().st_size
                            bytes_saved += (original_size - compressed_size)
                            file_path.unlink()  # Remove original
                        else:
                            errors += 1
                            continue
                    else:
                        # Estimate 70% compression
                        bytes_saved += int(original_size * 0.7)

                    files_compressed += 1
                except Exception:
                    errors += 1
                    continue

    return {
        'files_compressed': files_compressed,
        'bytes_saved': bytes_saved,
        'errors': errors
    }
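
# Note that in dry-run mode 'bytes_saved' is only the 70% heuristic used above, while a real
# run reports the measured difference between original and compressed sizes. A hypothetical
# way to compare the two after the fact:
#
#   >>> estimate = optimize_storage(dry_run=True)['bytes_saved']
#   >>> actual = optimize_storage(dry_run=False)['bytes_saved']
#   >>> print(f"estimated {estimate / 1e9:.1f} GB, actually saved {actual / 1e9:.1f} GB")
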
def cleanup_storage(days: int = 365, dry_run: bool = True) -> Dict[str, int]:
    """
    Remove old filings from local storage.

    Deletes filing files older than the specified number of days. This helps
    free up space for users who only need recent filings.

    Args:
        days: Remove filings older than this many days (default: 365)
        dry_run: If True, only report what would be deleted without making changes

    Returns:
        Dict with 'files_deleted', 'bytes_freed', 'errors'

    Example:
        >>> from edgar.storage_management import cleanup_storage
        >>> # First see what would be deleted
        >>> result = cleanup_storage(days=365, dry_run=True)
        >>> print(f"Would delete {result['files_deleted']} files")
        >>> # Then do it
        >>> result = cleanup_storage(days=365, dry_run=False)
        >>> print(f"Freed {result['bytes_freed'] / 1e9:.1f} GB")
    """
    from datetime import datetime, timedelta
    from edgar.core import get_edgar_data_directory

    storage_path = get_edgar_data_directory()
    cutoff_date = datetime.now() - timedelta(days=days)

    files_deleted = 0
    bytes_freed = 0
    errors = 0

    filings_dir = storage_path / 'filings'
    if not filings_dir.exists():
        return {'files_deleted': 0, 'bytes_freed': 0, 'errors': 0}

    for date_dir in filings_dir.iterdir():
        if not date_dir.is_dir():
            continue

        # Parse date from directory name (YYYYMMDD)
        if len(date_dir.name) == 8 and date_dir.name.isdigit():
            try:
                dir_date = datetime.strptime(date_dir.name, '%Y%m%d')
                if dir_date < cutoff_date:
                    # Delete all files in this directory
                    for file_path in date_dir.rglob('*'):
                        if file_path.is_file():
                            try:
                                file_size = file_path.stat().st_size
                                bytes_freed += file_size

                                if not dry_run:
                                    file_path.unlink()

                                files_deleted += 1
                            except Exception:
                                errors += 1
                                continue

                    # Remove empty directory
                    if not dry_run:
                        try:
                            # Remove all empty subdirectories
                            for subdir in reversed(list(date_dir.rglob('*'))):
                                if subdir.is_dir() and not list(subdir.iterdir()):
                                    subdir.rmdir()
                            # Remove date directory if empty
                            if not list(date_dir.iterdir()):
                                date_dir.rmdir()
                        except Exception:
                            errors += 1
            except ValueError:
                continue

    return {
        'files_deleted': files_deleted,
        'bytes_freed': bytes_freed,
        'errors': errors
    }
def clear_cache(dry_run: bool = True, obsolete_only: bool = False) -> Dict[str, int]:
    """
    Clear HTTP cache directories to free up space.

    Removes cached HTTP responses from cache directories. By default clears the
    active cache directories (_cache and _tcache). Use obsolete_only=True to only
    remove the obsolete _pcache directory (replaced by _tcache in Aug 2025).

    Args:
        dry_run: If True, only report what would be deleted without making changes
        obsolete_only: If True, only clear obsolete _pcache directory

    Returns:
        Dict with 'files_deleted', 'bytes_freed', 'errors'

    Example:
        >>> from edgar.storage_management import clear_cache
        >>> # Clear obsolete cache only
        >>> result = clear_cache(obsolete_only=True, dry_run=False)
        >>> print(f"Freed {result['bytes_freed'] / 1e9:.1f} GB")
        >>> # Clear all caches
        >>> result = clear_cache(dry_run=False)
        >>> print(f"Cleared {result['files_deleted']} cache files")
    """
    from edgar.core import get_edgar_data_directory

    storage_path = get_edgar_data_directory()
    files_deleted = 0
    bytes_freed = 0
    errors = 0

    # Determine which cache directories to clear
    if obsolete_only:
        cache_dirs = ['_pcache']  # Only obsolete cache
    else:
        cache_dirs = ['_cache', '_tcache']  # Active caches only

    for cache_dir_name in cache_dirs:
        cache_dir = storage_path / cache_dir_name
        if not cache_dir.exists():
            continue

        for file_path in cache_dir.rglob('*'):
            if file_path.is_file():
                try:
                    file_size = file_path.stat().st_size
                    bytes_freed += file_size

                    if not dry_run:
                        file_path.unlink()

                    files_deleted += 1
                except Exception:
                    errors += 1
                    continue

        # Remove empty directories
        if not dry_run:
            try:
                for subdir in reversed(list(cache_dir.rglob('*'))):
                    if subdir.is_dir() and not list(subdir.iterdir()):
                        subdir.rmdir()
                # Remove the cache directory itself if empty
                if not list(cache_dir.iterdir()):
                    cache_dir.rmdir()
            except Exception:
                errors += 1

    return {
        'files_deleted': files_deleted,
        'bytes_freed': bytes_freed,
        'errors': errors
    }
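
# A hypothetical end-to-end maintenance routine built from the functions above, keeping the
# dry-run defaults so nothing is deleted until the previews look right:
#
#   >>> print(storage_info())                           # what is on disk
#   >>> print(analyze_storage())                        # issues and recommendations
#   >>> clear_cache(obsolete_only=True, dry_run=False)  # drop the obsolete _pcache
#   >>> cleanup_storage(days=365, dry_run=True)         # preview old-filing cleanup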