import asyncio
import os
import re
from datetime import date, datetime
from pathlib import Path
from typing import TYPE_CHECKING, List, Optional, Union

if TYPE_CHECKING:
    from edgar._filings import Filings

import pandas as pd
from bs4 import BeautifulSoup
from httpx import AsyncClient, HTTPStatusError
from tqdm.auto import tqdm

from edgar.core import filing_date_to_year_quarters, get_edgar_data_directory, log, strtobool
from edgar.dates import extract_dates
from edgar.httprequests import download_bulk_data, download_datafile, download_text
from edgar.reference.tickers import company_tickers_exchange_url, company_tickers_json_url, mutual_fund_tickers_url, ticker_txt_url

__all__ = ['download_edgar_data',
           'get_edgar_data_directory',
           'use_local_storage',
           'is_using_local_storage',
           'set_local_storage_path',
           'download_filings',
           'local_filing_path',
           'check_filings_exist_locally',
           '_filter_extracted_files',
           'compress_filing',
           'decompress_filing',
           'compress_all_filings',
           'is_compressed_file']


class DirectoryBrowsingNotAllowed(Exception):

    def __init__(self, url: str, message: str = "Directory browsing is not allowed for this URL."):
        super().__init__(f"{message} \nurl: {url}")
        self.url = url


def use_local_storage(path_or_enable: Union[bool, str, Path, None] = True, use_local: Optional[bool] = None):
    """
    Enable or disable local storage, optionally setting the storage path.

    This function supports multiple calling patterns for convenience.

    Args:
        path_or_enable: Can be:
            - bool: Enable (True) or disable (False) local storage
            - str/Path: Path to the storage directory (enables local storage)
            - None: Use default behavior
        use_local: Optional boolean to explicitly set the enable/disable state.
                   Only used when path_or_enable is a path.

    Raises:
        FileNotFoundError: If a path is provided but does not exist.
        NotADirectoryError: If the path exists but is not a directory.

    Examples:
        >>> # Simple enable/disable (backward compatible)
        >>> use_local_storage(True)
        >>> use_local_storage(False)
        >>> use_local_storage()  # defaults to True

        >>> # Set path and enable (new intuitive syntax)
        >>> use_local_storage("~/Documents/edgar")
        >>> use_local_storage("/tmp/edgar_data")
        >>> use_local_storage(Path.home() / "edgar")

        >>> # Set path and explicitly control enable/disable
        >>> use_local_storage("/tmp/edgar", True)   # enable
        >>> use_local_storage("/tmp/edgar", False)  # set path but disable
    """
    # Determine the actual values based on parameter types
    if isinstance(path_or_enable, bool):
        # Backward compatible: use_local_storage(True/False)
        enable = path_or_enable
        path = None
    elif isinstance(path_or_enable, (str, Path)):
        # New syntax: use_local_storage("/path/to/storage")
        path = path_or_enable
        enable = use_local if use_local is not None else True
    elif path_or_enable is None:
        # use_local_storage() - default behavior
        enable = True
        path = None
    else:
        raise TypeError(f"First parameter must be bool, str, Path, or None, got {type(path_or_enable)}")

    # If a path is provided, set it even when local storage is being disabled,
    # so that use_local_storage(path, False) still records the path as documented
    if path is not None:
        set_local_storage_path(path)

    # Set the local storage flag
    os.environ['EDGAR_USE_LOCAL_DATA'] = "1" if enable else "0"


def is_using_local_storage() -> bool:
    """
    Returns True if using local storage
    """
    return strtobool(os.getenv('EDGAR_USE_LOCAL_DATA', "False"))


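# Illustrative usage sketch (not part of the original module): point the library at a local
# cache directory and confirm the flag took effect. The directory name is an assumption for
# the example and must already exist before it is passed in.
#
#     os.makedirs("/tmp/edgar_cache", exist_ok=True)
#     use_local_storage("/tmp/edgar_cache")
#     assert is_using_local_storage()
#     use_local_storage(False)            # switch back to fetching directly from the SEC
#     assert not is_using_local_storage()

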
def set_local_storage_path(path: Union[str, Path]) -> None:
    """
    Set the local storage path for Edgar data.

    This function provides a programmatic way to set the local storage directory,
    equivalent to setting the EDGAR_LOCAL_DATA_DIR environment variable.

    Args:
        path: Path to the directory where Edgar data should be stored.
              Can be a string or Path object. The directory must already exist.

    Raises:
        FileNotFoundError: If the specified directory does not exist.
        NotADirectoryError: If the path exists but is not a directory.

    Example:
        >>> # First create the directory
        >>> os.makedirs("/tmp/edgar_data", exist_ok=True)
        >>> set_local_storage_path("/tmp/edgar_data")
        >>>
        >>> # Or use an existing directory
        >>> set_local_storage_path(Path.home() / "Documents")
    """
    # Convert to a Path object and resolve to an absolute path
    storage_path = Path(path).expanduser().resolve()

    # Validate that the directory exists
    if not storage_path.exists():
        raise FileNotFoundError(f"Directory does not exist: {storage_path}")

    # Validate that it's actually a directory
    if not storage_path.is_dir():
        raise NotADirectoryError(f"Path exists but is not a directory: {storage_path}")

    # Set the environment variable
    os.environ['EDGAR_LOCAL_DATA_DIR'] = str(storage_path)


async def download_facts_async(client: Optional[AsyncClient]) -> Path:
    """
    Download company facts
    """
    log.info(f"Downloading Company facts to {get_edgar_data_directory()}/companyfacts")
    return await download_bulk_data(client=client, url="https://www.sec.gov/Archives/edgar/daily-index/xbrl/companyfacts.zip")


def download_facts() -> Path:
    """
    Download company facts
    """
    return asyncio.run(download_facts_async(client=None))


async def download_submissions_async(client: Optional[AsyncClient]) -> Path:
    """
    Download company submissions
    """
    log.info(f"Downloading Company submissions to {get_edgar_data_directory()}/submissions")
    return await download_bulk_data(client=client, url="https://www.sec.gov/Archives/edgar/daily-index/bulkdata/submissions.zip")


def download_submissions() -> Path:
    """
    Download company submissions
    """
    return asyncio.run(download_submissions_async(client=None))


def download_ticker_data(reference_data_directory: Path):
    """
    Download ticker reference data files from the SEC website.
    """
    log.info(f"Downloading ticker data to {reference_data_directory}")
    download_datafile(ticker_txt_url, reference_data_directory)
    download_datafile(company_tickers_json_url, reference_data_directory)
    download_datafile(mutual_fund_tickers_url, reference_data_directory)
    download_datafile(company_tickers_exchange_url, reference_data_directory)


def download_reference_data():
    """
    Download reference data from the SEC website.
    """
    log.info(f"Downloading reference data to {get_edgar_data_directory()}")
    reference_directory = get_edgar_data_directory() / "reference"
    reference_directory.mkdir(exist_ok=True)
    download_ticker_data(reference_directory)


def download_edgar_data(submissions: bool = True,
                        facts: bool = True,
                        reference: bool = True):
    """
    Download Edgar data to the local storage directory

    :param submissions: Download submissions
    :param facts: Download facts
    :param reference: Download reference data
    """
    if submissions:
        download_submissions()
    if facts:
        download_facts()
    if reference:
        download_reference_data()


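# Illustrative usage sketch (not part of the original module): enable local storage, then
# pull only the pieces you need. Skipping company facts keeps the download smaller; the
# directory path below is an assumption for the example and must already exist.
#
#     use_local_storage("~/edgar-data")
#     download_edgar_data(facts=False)                          # submissions + reference data only
#     download_edgar_data(submissions=False, reference=False)   # company facts only, later

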
def download_filings(filing_date: Optional[str] = None,
                     data_directory: Optional[str] = None,
                     overwrite_existing: bool = False,
                     filings: Optional['Filings'] = None,
                     compress: bool = True,
                     compression_level: int = 6):
    """
    Download feed files for the specified date or date range, or for specific filings.
    Optionally compresses the extracted files to save disk space.

    Examples

        download_filings('2025-01-03:')
        download_filings('2025-01-03', overwrite_existing=False)
        download_filings('2024-01-01:2025-01-05', overwrite_existing=True)
        download_filings(filings=my_filings_object)
        download_filings('2025-01-03', compress=True, compression_level=9)  # Maximum compression

    Args:
        filing_date: String in format 'YYYY-MM-DD', 'YYYY-MM-DD:', ':YYYY-MM-DD',
            or 'YYYY-MM-DD:YYYY-MM-DD'. If both filing_date and filings are provided,
            filing_date takes precedence for determining which feed files to download.
        data_directory: Directory to save the downloaded files. Defaults to the Edgar data directory.
        overwrite_existing: If True, overwrite existing files. Default is False.
        filings: Optional Filings object. If provided, will download only filings with matching accession numbers.
        compress: Whether to compress the extracted files to save disk space. Default is True.
        compression_level: Compression level for gzip (1-9, with 9 being highest compression). Default is 6.
    """
    if not data_directory:
        data_directory = get_edgar_data_directory() / 'filings'
    log.info('Using data directory: %s', data_directory)

    # If a Filings object is provided, extract its accession numbers
    accession_numbers = None
    if filings is not None:
        log.info('Using provided Filings object with %d filings', len(filings))
        accession_numbers = filings.data['accession_number'].to_pylist()

        # If both filing_date and filings are provided, let the user know which takes precedence
        if filing_date:
            log.info('Both filing_date and filings parameters provided. Using filing_date %s for determining feed files to download.', filing_date)
        # Use the date range from the filings object if no filing_date was specified
        else:
            start_date, end_date = filings.date_range
            filing_date = f"{start_date}:{end_date}"
            log.info('Using date range from filings: %s', filing_date)

    # Use the default date if not specified
    if not filing_date:
        filing_date = latest_filing_date()
        log.info('No filing date specified. Using latest filing date: %s', filing_date)

    # Get start and end dates for filtering
    start_date_tm, end_date_tm, is_range = extract_dates(filing_date)

    # Get quarters to process
    year_and_quarters = filing_date_to_year_quarters(filing_date)

    # Track statistics
    total_feed_files_downloaded = 0
    total_filings_kept = 0

    for year, quarter in year_and_quarters:
        log.info('Downloading feed files for %d Q%d', year, quarter)
        # Get the list of feed files for this quarter
        feed_files = list_filing_feed_files_for_quarter(year, quarter)

        log.info('Found %d total feed files', len(feed_files))

        # Filter files based on the date range
        filtered_files = feed_files[
            feed_files['Name'].apply(
                lambda x: is_feed_file_in_date_range(x, start_date_tm, end_date_tm)
            )
        ]
        log.info('Found %d feed files in date range', len(filtered_files))

        if not filtered_files.empty:
            # Process the filtered files
            for _, row in tqdm(filtered_files.iterrows(), desc='Downloading feed file(s)'):
                bulk_filing_file = row['File']
                bulk_file_directory = data_directory / row['Name'][:8]
                filing_date_str = row['Name'][:8]  # Extract YYYYMMDD from the filename

                if not overwrite_existing:
                    if bulk_file_directory.exists():
                        log.warning('Skipping %s. Already exists', bulk_file_directory)
                        continue

                # Optimization: if we have specific accession numbers, check whether all the ones
                # for this specific filing date already exist locally
                if accession_numbers and filings is not None:
                    # Convert YYYYMMDD to YYYY-MM-DD format
                    formatted_date = f"{filing_date_str[:4]}-{filing_date_str[4:6]}-{filing_date_str[6:8]}"

                    # Filter accession numbers to only those for this specific filing date
                    date_filtered_filings = filings.filter(filing_date=formatted_date)
                    if not date_filtered_filings.empty:
                        date_accession_numbers = date_filtered_filings.data['accession_number'].to_pylist()
                        if check_filings_exist_locally(formatted_date, date_accession_numbers):
                            log.warning('Not downloading for %s. All %d filings for this date already exist in local %s',
                                        formatted_date, len(date_accession_numbers), bulk_file_directory)
                            continue

                # Track existing files before extraction so they are preserved during filtering
                existing_files = set()
                if accession_numbers and bulk_file_directory.exists():
                    existing_files = {str(f) for f in bulk_file_directory.glob('*.nc')}

                path = asyncio.run(download_bulk_data(client=None, url=bulk_filing_file, data_directory=data_directory))
                log.info('Downloaded feed file to %s', path)
                total_feed_files_downloaded += 1

                # If we have specific accession numbers, filter the extracted files
                if accession_numbers and path.exists():
                    _filter_extracted_files._existing_files = existing_files

                    log.info('Filtering extracted files to keep only specified accession numbers')
                    filings_kept = _filter_extracted_files(path, accession_numbers, compress=compress, compression_level=compression_level)
                    total_filings_kept += filings_kept

                    # Clean up the tracking attribute
                    if hasattr(_filter_extracted_files, '_existing_files'):
                        delattr(_filter_extracted_files, '_existing_files')
                # If we don't have specific accession numbers but compression is enabled, compress all files
                elif compress and path.exists():
                    log.info('Compressing all extracted files')
                    for file_path in path.glob('*.nc'):
                        if not is_compressed_file(file_path):
                            try:
                                compress_filing(file_path, compression_level=compression_level)
                            except Exception as e:
                                log.warning(f"Failed to compress {file_path}: {e}")
        else:
            log.info('No feed files found for %d Q%d in date range %s', year, quarter, filing_date)

    # Log summary statistics
    log.info('Download complete. Downloaded %d feed files.', total_feed_files_downloaded)
    if accession_numbers:
        log.info('Kept %d filings out of %d requested.', total_filings_kept, len(accession_numbers))


def _filter_extracted_files(directory_path: Path, accession_numbers: List[str], compress: bool = True, compression_level: int = 6) -> int:
    """
    Filter files in the extracted directory to keep only those matching the specified accession numbers.
    Files from the current extraction that don't match are removed to save disk space.
    Files that existed before this extraction (from previous downloads) are preserved.

    Args:
        directory_path: Path to the directory containing extracted files
        accession_numbers: List of accession numbers to keep
        compress: Whether to compress the kept files (default: True)
        compression_level: Compression level for gzip (1-9, with 9 being highest compression)

    Returns:
        int: Number of filings kept
    """
    if not directory_path.is_dir():
        return 0

    # Convert accession numbers to the format used in filenames (removing dashes)
    normalized_accession_numbers = [an.replace('-', '') for an in accession_numbers]

    # Count how many filings were kept
    filings_kept = 0

    # Get the set of files that existed before this extraction.
    # These are preserved even if they don't match our filter.
    existing_files = getattr(_filter_extracted_files, '_existing_files', set())

    # Find all .nc files in the directory
    for file_path in directory_path.glob('*.nc'):
        # Extract the accession number from the filename
        file_accession = file_path.stem
        undashed_accession = file_accession.replace('-', '')

        # Check if this file matches our filter
        matches_filter = (undashed_accession in normalized_accession_numbers or
                          file_accession in accession_numbers)

        # Check if this file existed before this extraction
        was_preexisting = str(file_path) in existing_files

        if matches_filter:
            filings_kept += 1
            # Compress the file if requested
            if compress and not is_compressed_file(file_path):
                try:
                    compress_filing(file_path, compression_level=compression_level)
                    log.debug(f"Compressed {file_path}")
                except Exception as e:
                    log.warning(f"Failed to compress {file_path}: {e}")
        elif not was_preexisting:
            # Remove files from the current extraction that don't match the filter,
            # but preserve files that existed before this extraction
            try:
                file_path.unlink()
                log.debug(f"Removed non-matching file from current extraction: {file_path}")
            except Exception as e:
                log.warning(f"Failed to remove {file_path}: {e}")

    return filings_kept


def is_feed_file_in_date_range(filename: str,
                               start_date: Optional[datetime],
                               end_date: Optional[datetime]) -> bool:
    """
    Check if a feed file falls within the specified date range.
    Feed files are named like '20240102.nc.tar.gz'
    """
    # Extract the date from the filename
    match = re.search(r'(\d{8})\.nc\.tar\.gz', filename)
    if not match:
        return False

    date_str = match.group(1)
    file_date = datetime.strptime(date_str, '%Y%m%d')

    # For a single date (not a range)
    if start_date and not end_date:
        return (file_date.year == start_date.year and
                file_date.month == start_date.month and
                file_date.day == start_date.day)

    # For a date range
    if start_date:
        if file_date < start_date:
            return False

    if end_date:
        if file_date > end_date:
            return False

    return True


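# Illustrative sketch (not part of the original module): how the filename filter behaves.
#
#     is_feed_file_in_date_range('20240102.nc.tar.gz', datetime(2024, 1, 2), None)   # True (exact day)
#     is_feed_file_in_date_range('20240102.nc.tar.gz', datetime(2024, 1, 1),
#                                datetime(2024, 1, 31))                              # True (inside range)
#     is_feed_file_in_date_range('index.json', datetime(2024, 1, 1), None)           # False (not a feed file)

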
def list_filing_feed_files_for_quarter(year: int, quarter: int) -> pd.DataFrame:
    assert quarter in (1, 2, 3, 4), "Quarter must be between 1 and 4"
    url = f"https://www.sec.gov/Archives/edgar/Feed/{year}/QTR{quarter}/"
    return list_filing_feed_files(url)


def get_sec_file_listing(url: str) -> pd.DataFrame:
    """
    Reads an SEC EDGAR file listing directory and returns file information as a DataFrame.

    Args:
        url (str): URL of the SEC EDGAR feed directory (e.g., 'https://www.sec.gov/Archives/edgar/Feed/2024/QTR1/')

    Returns:
        pd.DataFrame: DataFrame containing file information with columns:
            - Name: str, filename
            - File: str, full URL of the file
            - Size: int, file size in bytes
            - Modified: datetime, last modification timestamp

    Raises:
        ValueError: If URL is invalid or doesn't point to SEC EDGAR
        ConnectionError: If unable to download the page
        RuntimeError: If page structure is invalid or no table found
    """
    try:
        html = download_text(url)
    except HTTPStatusError as e:
        if e.response.status_code == 403:
            log.warning(f"There are no feed files for url {url}")
            return pd.DataFrame(columns=['Name', 'File', 'Size', 'Modified'])
        elif e.response.status_code == 404:
            raise FileNotFoundError(f"Page not found: {url}") from None
        raise ConnectionError(f"Failed to download page: {str(e)}") from e

    if "Directory Browsing Not Allowed" in html:
        log.warning(f"Directory browsing is not allowed for {url}")
        raise DirectoryBrowsingNotAllowed(url, """
        Directory browsing is not allowed here.
        This is unexpected; the SEC has likely changed its policy for viewing the bulk filing files.
        """)

    soup = BeautifulSoup(html, 'html.parser')
    table = soup.find('table')

    if not table:
        raise RuntimeError("No table found in the page")

    records = []

    # Process table rows, skipping the header row
    for row in table.find_all('tr')[1:]:
        cells = row.find_all('td')

        # Skip if the row structure is invalid
        if len(cells) != 3:
            continue

        name = cells[0].text.strip()

        # Skip parent directory entries
        if name in ('.', '..'):
            continue
        feed_file_url = f"{url}{name}"

        # Parse file size (convert "1.2K", "3.4M" etc. to bytes)
        size_text = cells[1].text.strip()
        size = parse_file_size(size_text)

        # Parse the modification date
        modified_text = cells[2].text.strip()
        try:
            modified = datetime.strptime(modified_text, '%m/%d/%Y %I:%M:%S %p')
        except ValueError:
            modified = None

        records.append((name, feed_file_url, size, modified))

    df = pd.DataFrame(records, columns=['Name', 'File', 'Size', 'Modified'])
    return df


def list_filing_feed_files(url: str) -> pd.DataFrame:
    """
    Reads the SEC EDGAR filing feed directory and returns file information as a DataFrame.

    Args:
        url (str): URL of the SEC EDGAR feed directory (e.g., 'https://www.sec.gov/Archives/edgar/Feed/2024/QTR1/')

    Returns:
        pd.DataFrame: DataFrame containing file information with columns:
            - Name: str, filename
            - File: str, full URL of the file
            - Size: int, file size in bytes
            - Modified: datetime, last modification timestamp

    Raises:
        ValueError: If URL is invalid or doesn't point to SEC EDGAR
        ConnectionError: If unable to download the page
        RuntimeError: If page structure is invalid or no table found
    """
    # Validate the URL
    if not url.startswith('https://www.sec.gov/Archives/edgar/Feed/'):
        raise ValueError("URL must be an SEC EDGAR feed directory")
    return get_sec_file_listing(url)


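# Illustrative sketch (not part of the original module): list the feed files for a quarter
# and inspect the ones in a date window. Requires network access to the SEC.
#
#     feed_files = list_filing_feed_files_for_quarter(2024, 1)
#     january = feed_files[feed_files['Name'].str.startswith('202401')]
#     print(january[['Name', 'Size', 'Modified']].head())

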
def parse_file_size(size_text: str) -> Optional[int]:
    """Convert size string to bytes (e.g., "1.2K" -> 1228)"""
    if not size_text:
        return None

    units = {'B': 1, 'K': 1024, 'M': 1024 * 1024, 'G': 1024 * 1024 * 1024}
    pattern = r'(\d+\.?\d*)\s*([BKMG])?'
    match = re.match(pattern, size_text.upper())

    if not match:
        return None

    number = float(match.group(1))
    unit = match.group(2) or 'B'

    return int(number * units[unit])


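# Illustrative sketch (not part of the original module): how the size parser behaves.
#
#     parse_file_size("1.2K")   # -> 1228      (1.2 * 1024, truncated to int)
#     parse_file_size("3M")     # -> 3145728
#     parse_file_size("512")    # -> 512       (no unit defaults to bytes)
#     parse_file_size("")       # -> None

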
def latest_filing_date():
    """Get the latest filing date"""
    from edgar import get_filings
    return get_filings().end_date


def is_compressed_file(file_path: Path) -> bool:
    """
    Check if a file is gzip-compressed by examining its extension.

    Args:
        file_path: Path to the file

    Returns:
        bool: True if the file has a .gz extension, False otherwise
    """
    return str(file_path).endswith('.gz')


def compress_filing(file_path: Path, compression_level: int = 6, delete_original: bool = True) -> Path:
    """
    Compress a filing file using gzip and optionally delete the original.

    Args:
        file_path: Path to the file to compress
        compression_level: Compression level (1-9, with 9 being highest compression)
        delete_original: Whether to delete the original file after compression

    Returns:
        Path to the compressed file

    Raises:
        FileNotFoundError: If the file does not exist
        ValueError: If the file is already compressed
    """
    import gzip
    import shutil

    if not file_path.exists():
        raise FileNotFoundError(f"File not found: {file_path}")

    if is_compressed_file(file_path):
        raise ValueError(f"File is already compressed: {file_path}")

    compressed_path = Path(f"{file_path}.gz")

    # Compress the file
    with file_path.open('rb') as f_in:
        with gzip.open(compressed_path, 'wb', compresslevel=compression_level) as f_out:
            shutil.copyfileobj(f_in, f_out)

    # Delete the original file if requested
    if delete_original:
        file_path.unlink()

    return compressed_path


def decompress_filing(file_path: Path, output_path: Optional[Path] = None, delete_original: bool = False) -> Path:
    """
    Decompress a gzip-compressed filing file.

    Args:
        file_path: Path to the compressed file
        output_path: Path to save the decompressed file (if None, use the original path without .gz)
        delete_original: Whether to delete the original compressed file

    Returns:
        Path to the decompressed file

    Raises:
        FileNotFoundError: If the file does not exist
        ValueError: If the file is not compressed
        gzip.BadGzipFile: If the file is not a valid gzip file
    """
    import gzip
    import shutil

    if not file_path.exists():
        raise FileNotFoundError(f"File not found: {file_path}")

    if not is_compressed_file(file_path):
        raise ValueError(f"File is not compressed: {file_path}")

    # Determine the output path if not provided
    if output_path is None:
        # Remove the .gz extension
        output_path = Path(str(file_path)[:-3])

    # Decompress the file
    with gzip.open(file_path, 'rb') as f_in:
        with output_path.open('wb') as f_out:
            shutil.copyfileobj(f_in, f_out)

    # Delete the original compressed file if requested
    if delete_original:
        file_path.unlink()

    return output_path


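# Illustrative sketch (not part of the original module): compress a local filing and restore it.
# The path below is an assumption for the example; any uncompressed .nc file would do.
#
#     nc_file = Path("/tmp/edgar_cache/filings/20250103/0000000000-25-000001.nc")
#     gz_file = compress_filing(nc_file, compression_level=9)      # nc_file is deleted by default
#     restored = decompress_filing(gz_file, delete_original=False)
#     assert restored == nc_file

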
def compress_all_filings(data_directory: Optional[Path] = None, compression_level: int = 6) -> int:
    """
    Compress all uncompressed filing files in the data directory.

    Args:
        data_directory: Path to the data directory (defaults to the Edgar data directory)
        compression_level: Compression level (1-9, with 9 being highest compression)

    Returns:
        Number of files compressed
    """
    if data_directory is None:
        data_directory = get_edgar_data_directory() / 'filings'

    # Find all .nc files (not already compressed)
    files_compressed = 0
    for file_path in tqdm(list(data_directory.glob('**/*.nc')), desc="Compressing files"):
        if not is_compressed_file(file_path) and file_path.is_file():
            try:
                compress_filing(file_path, compression_level=compression_level)
                files_compressed += 1
            except Exception as e:
                log.warning(f"Failed to compress {file_path}: {e}")

    return files_compressed


def check_filings_exist_locally(filing_date: Union[str, date], accession_numbers: List[str]) -> bool:
    """
    Check if all specified accession numbers already exist locally for a given filing date.

    Args:
        filing_date: The filing date (YYYY-MM-DD format)
        accession_numbers: List of accession numbers to check

    Returns:
        bool: True if all filings exist locally, False otherwise
    """
    if not accession_numbers:
        return False

    for accession_number in accession_numbers:
        # local_filing_path resolves to the compressed (.gz) version if present,
        # otherwise to the uncompressed path
        filing_path = local_filing_path(filing_date, accession_number)
        if not filing_path.exists():
            return False

    return True


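# Illustrative sketch (not part of the original module): skip a re-download when the filings
# for a date are already cached. The accession numbers are made up for the example.
#
#     wanted = ["0001234567-25-000001", "0001234567-25-000002"]
#     if not check_filings_exist_locally("2025-01-03", wanted):
#         download_filings("2025-01-03")

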
def local_filing_path(filing_date: Union[str, date],
                      accession_number: str,
                      correction: bool = False) -> Path:
    """
    Get the local path for a filing.
    If correction is True, will look for the corrected filing with extension 'corr'

    Returns the compressed version (.gz) if it exists, otherwise returns the uncompressed path.
    """
    ext = 'corr' if correction else 'nc'
    if isinstance(filing_date, date):
        filing_date = filing_date.strftime('%Y-%m-%d')
    filing_date = filing_date.replace('-', '')

    # Base path without the compression extension
    base_path = get_edgar_data_directory() / 'filings' / filing_date / f"{accession_number}.{ext}"

    # Check for the compressed version first
    compressed_path = Path(f"{base_path}.gz")
    if compressed_path.exists():
        return compressed_path

    # Fall back to the uncompressed version
    return base_path
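

# Illustrative sketch (not part of the original module): resolve where a filing would live
# in the local store. The accession number is made up for the example.
#
#     path = local_filing_path("2025-01-03", "0001234567-25-000001")
#     # e.g. <EDGAR_LOCAL_DATA_DIR>/filings/20250103/0001234567-25-000001.nc
#     # (or the same path with a .gz suffix if a compressed copy exists)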