import asyncio
import os
import re
from datetime import date, datetime
from pathlib import Path
from typing import TYPE_CHECKING, List, Optional, Union

if TYPE_CHECKING:
    from edgar._filings import Filings

import pandas as pd
from bs4 import BeautifulSoup
from httpx import AsyncClient, HTTPStatusError
from tqdm.auto import tqdm

from edgar.core import filing_date_to_year_quarters, get_edgar_data_directory, log, strtobool
from edgar.dates import extract_dates
from edgar.httprequests import download_bulk_data, download_datafile, download_text
from edgar.reference.tickers import company_tickers_exchange_url, company_tickers_json_url, mutual_fund_tickers_url, ticker_txt_url

__all__ = ['download_edgar_data',
           'get_edgar_data_directory',
           'use_local_storage',
           'is_using_local_storage',
           'set_local_storage_path',
           'download_filings',
           'local_filing_path',
           'check_filings_exist_locally',
           '_filter_extracted_files',
           'compress_filing',
           'decompress_filing',
           'compress_all_filings',
           'is_compressed_file']


class DirectoryBrowsingNotAllowed(Exception):

    def __init__(self, url: str, message: str = "Directory browsing is not allowed for this URL."):
        super().__init__(f"{message} \nurl: {url}")
        self.url = url


def use_local_storage(path_or_enable: Union[bool, str, Path, None] = True,
                      use_local: Optional[bool] = None):
    """
    Enable or disable local storage, optionally setting the storage path.

    This function supports multiple calling patterns for convenience:

    Args:
        path_or_enable: Can be:
            - bool: Enable (True) or disable (False) local storage
            - str/Path: Path to the storage directory (enables local storage)
            - None: Use default behavior
        use_local: Optional boolean to explicitly set the enable/disable state.
                   Only used when path_or_enable is a path.

    Raises:
        FileNotFoundError: If a path is provided but does not exist.
        NotADirectoryError: If the path exists but is not a directory.

    Examples:
        >>> # Simple enable/disable (backward compatible)
        >>> use_local_storage(True)
        >>> use_local_storage(False)
        >>> use_local_storage()  # defaults to True

        >>> # Set path and enable (new intuitive syntax)
        >>> use_local_storage("~/Documents/edgar")
        >>> use_local_storage("/tmp/edgar_data")
        >>> use_local_storage(Path.home() / "edgar")

        >>> # Set path and explicitly control enable/disable
        >>> use_local_storage("/tmp/edgar", True)   # enable
        >>> use_local_storage("/tmp/edgar", False)  # set path but disable
    """
    # Determine the actual values based on parameter types
    if isinstance(path_or_enable, bool):
        # Backward compatible: use_local_storage(True/False)
        enable = path_or_enable
        path = None
    elif isinstance(path_or_enable, (str, Path)):
        # New syntax: use_local_storage("/path/to/storage")
        path = path_or_enable
        enable = use_local if use_local is not None else True
    elif path_or_enable is None:
        # use_local_storage() - default behavior
        enable = True
        path = None
    else:
        raise TypeError(f"First parameter must be bool, str, Path, or None, got {type(path_or_enable)}")

    # If a path is provided and we're enabling local storage, set the path first
    if path is not None and enable:
        set_local_storage_path(path)

    # Set the local storage flag
    os.environ['EDGAR_USE_LOCAL_DATA'] = "1" if enable else "0"


def is_using_local_storage() -> bool:
    """
    Returns True if using local storage
    """
    return strtobool(os.getenv('EDGAR_USE_LOCAL_DATA', "False"))
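

# Illustrative sketch (not part of the library's API): enabling local storage
# against a scratch directory and checking the flag. The temporary directory is
# a stand-in for a real storage location such as "~/edgar".
def _example_enable_local_storage():
    import tempfile
    storage_dir = tempfile.mkdtemp(prefix="edgar_local_")  # hypothetical scratch directory
    use_local_storage(storage_dir)   # sets EDGAR_LOCAL_DATA_DIR and enables local data
    assert is_using_local_storage()
    use_local_storage(False)         # disable again; the stored path is left untouched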


def set_local_storage_path(path: Union[str, Path]) -> None:
    """
    Set the local storage path for Edgar data.

    This function provides a programmatic way to set the local storage directory,
    equivalent to setting the EDGAR_LOCAL_DATA_DIR environment variable.

    Args:
        path: Path to the directory where Edgar data should be stored.
              Can be a string or Path object. The directory must already exist.

    Raises:
        FileNotFoundError: If the specified directory does not exist.
        NotADirectoryError: If the path exists but is not a directory.

    Example:
        >>> # First create the directory
        >>> os.makedirs("/tmp/edgar_data", exist_ok=True)
        >>> set_local_storage_path("/tmp/edgar_data")
        >>>
        >>> # Or use an existing directory
        >>> set_local_storage_path(Path.home() / "Documents")
    """
    # Convert to a Path object and resolve to an absolute path
    storage_path = Path(path).expanduser().resolve()

    # Validate that the directory exists
    if not storage_path.exists():
        raise FileNotFoundError(f"Directory does not exist: {storage_path}")

    # Validate that it's actually a directory
    if not storage_path.is_dir():
        raise NotADirectoryError(f"Path exists but is not a directory: {storage_path}")

    # Set the environment variable
    os.environ['EDGAR_LOCAL_DATA_DIR'] = str(storage_path)


async def download_facts_async(client: Optional[AsyncClient]) -> Path:
    """
    Download company facts
    """
    log.info(f"Downloading Company facts to {get_edgar_data_directory()}/companyfacts")
    return await download_bulk_data(client=client, url="https://www.sec.gov/Archives/edgar/daily-index/xbrl/companyfacts.zip")


def download_facts() -> Path:
    """
    Download company facts
    """
    return asyncio.run(download_facts_async(client=None))


async def download_submissions_async(client: Optional[AsyncClient]) -> Path:
    """
    Download company submissions
    """
    log.info(f"Downloading Company submissions to {get_edgar_data_directory()}/submissions")
    return await download_bulk_data(client=client, url="https://www.sec.gov/Archives/edgar/daily-index/bulkdata/submissions.zip")


def download_submissions() -> Path:
    """
    Download company submissions
    """
    return asyncio.run(download_submissions_async(client=None))


def download_ticker_data(reference_data_directory: Path):
    """
    Download reference data from the SEC website.
    """
    log.info(f"Downloading ticker data to {reference_data_directory}")
    download_datafile(ticker_txt_url, reference_data_directory)
    download_datafile(company_tickers_json_url, reference_data_directory)
    download_datafile(mutual_fund_tickers_url, reference_data_directory)
    download_datafile(company_tickers_exchange_url, reference_data_directory)


def download_reference_data():
    """
    Download reference data from the SEC website.
    """
    log.info(f"Downloading reference data to {get_edgar_data_directory()}")
    reference_directory = get_edgar_data_directory() / "reference"
    reference_directory.mkdir(exist_ok=True)
    download_ticker_data(reference_directory)


def download_edgar_data(submissions: bool = True,
                        facts: bool = True,
                        reference: bool = True):
    """
    Download Edgar data to the local storage directory

    :param submissions: Download submissions
    :param facts: Download facts
    :param reference: Download reference data
    """
    if submissions:
        download_submissions()
    if facts:
        download_facts()
    if reference:
        download_reference_data()
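

# Illustrative sketch (not part of the library's API): download_edgar_data can be
# called selectively. Here only the reference data (ticker files) is fetched,
# skipping the much larger submissions and company facts archives. Requires
# network access to sec.gov.
def _example_download_reference_only():
    download_edgar_data(submissions=False, facts=False, reference=True)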


def download_filings(filing_date: Optional[str] = None,
                     data_directory: Optional[str] = None,
                     overwrite_existing: bool = False,
                     filings: Optional['Filings'] = None,
                     compress: bool = True,
                     compression_level: int = 6):
    """
    Download feed files for the specified date or date range, or for specific filings.
    Optionally compresses the extracted files to save disk space.

    Examples:
        download_filings('2025-01-03:')
        download_filings('2025-01-03', overwrite_existing=False)
        download_filings('2024-01-01:2025-01-05', overwrite_existing=True)
        download_filings(filings=my_filings_object)
        download_filings('2025-01-03', compress=True, compression_level=9)  # Maximum compression

    Args:
        filing_date: String in format 'YYYY-MM-DD', 'YYYY-MM-DD:', ':YYYY-MM-DD', or 'YYYY-MM-DD:YYYY-MM-DD'.
                     If both filing_date and filings are provided, filing_date takes precedence
                     for determining which feed files to download.
        data_directory: Directory to save the downloaded files. Defaults to the Edgar data directory.
        overwrite_existing: If True, overwrite existing files. Default is False.
        filings: Optional Filings object. If provided, will download only filings with matching accession numbers.
        compress: Whether to compress the extracted files to save disk space. Default is True.
        compression_level: Compression level for gzip (1-9, with 9 being highest compression). Default is 6.
    """
    if not data_directory:
        data_directory = get_edgar_data_directory() / 'filings'
        log.info('Using data directory: %s', data_directory)

    # If a Filings object is provided, extract its accession numbers
    accession_numbers = None
    if filings is not None:
        log.info('Using provided Filings object with %d filings', len(filings))
        accession_numbers = filings.data['accession_number'].to_pylist()

        # If both filing_date and filings are provided, let the user know which takes precedence
        if filing_date:
            log.info('Both filing_date and filings parameters provided. Using filing_date %s for determining feed files to download.', filing_date)
        # Use the date range from the filings object if no filing_date is specified
        else:
            start_date, end_date = filings.date_range
            filing_date = f"{start_date}:{end_date}"
            log.info('Using date range from filings: %s', filing_date)

    # Use the default date if not specified
    if not filing_date:
        filing_date = latest_filing_date()
        log.info('No filing date specified. Using latest filing date: %s', filing_date)

    # Get start and end dates for filtering
    start_date_tm, end_date_tm, is_range = extract_dates(filing_date)

    # Get quarters to process
    year_and_quarters = filing_date_to_year_quarters(filing_date)

    # Track statistics
    total_feed_files_downloaded = 0
    total_filings_kept = 0

    for year, quarter in year_and_quarters:
        log.info('Downloading feed files for %d Q%d', year, quarter)

        # Get the list of feed files for this quarter
        feed_files = list_filing_feed_files_for_quarter(year, quarter)
        log.info('Found %d total feed files', len(feed_files))

        # Filter files based on the date range
        filtered_files = feed_files[
            feed_files['Name'].apply(
                lambda x: is_feed_file_in_date_range(x, start_date_tm, end_date_tm)
            )
        ]
        log.info('Found %d feed files in date range', len(filtered_files))

        if not filtered_files.empty:
            # Process the filtered files...
            for _, row in tqdm(filtered_files.iterrows(), desc='Downloading feed file(s)'):
                bulk_filing_file = row['File']
                bulk_file_directory = data_directory / row['Name'][:8]
                filing_date_str = row['Name'][:8]  # Extract YYYYMMDD from the filename

                if not overwrite_existing:
                    if bulk_file_directory.exists():
                        log.warning('Skipping %s. Already exists', bulk_file_directory)
                        continue

                    # Optimization: If we have specific accession numbers, check if all the ones
                    # for this specific filing date already exist locally
                    if accession_numbers and filings is not None:
                        # Convert YYYYMMDD to YYYY-MM-DD format
                        formatted_date = f"{filing_date_str[:4]}-{filing_date_str[4:6]}-{filing_date_str[6:8]}"

                        # Filter accession numbers to only those for this specific filing date
                        date_filtered_filings = filings.filter(filing_date=formatted_date)
                        if not date_filtered_filings.empty:
                            date_accession_numbers = date_filtered_filings.data['accession_number'].to_pylist()
                            if check_filings_exist_locally(formatted_date, date_accession_numbers):
                                log.warning('Not downloading for %s. All %d filings for this date already exist in local %s',
                                            formatted_date, len(date_accession_numbers), bulk_file_directory)
                                continue

                # Track existing files before extraction to preserve them during filtering
                existing_files = set()
                if accession_numbers and bulk_file_directory.exists():
                    existing_files = {str(f) for f in bulk_file_directory.glob('*.nc')}

                path = asyncio.run(download_bulk_data(client=None, url=bulk_filing_file, data_directory=data_directory))
                log.info('Downloaded feed file to %s', path)
                total_feed_files_downloaded += 1

                # If we have specific accession numbers, filter the extracted files
                if accession_numbers and path.exists():
                    _filter_extracted_files._existing_files = existing_files
                    log.info('Filtering extracted files to keep only specified accession numbers')
                    filings_kept = _filter_extracted_files(path, accession_numbers,
                                                           compress=compress,
                                                           compression_level=compression_level)
                    total_filings_kept += filings_kept

                    # Clean up the tracking
                    if hasattr(_filter_extracted_files, '_existing_files'):
                        delattr(_filter_extracted_files, '_existing_files')
                # If we don't have specific accession numbers but compression is enabled, compress all files
                elif compress and path.exists():
                    log.info('Compressing all extracted files')
                    for file_path in path.glob('*.nc'):
                        if not is_compressed_file(file_path):
                            try:
                                compress_filing(file_path, compression_level=compression_level)
                            except Exception as e:
                                log.warning(f"Failed to compress {file_path}: {e}")
        else:
            log.info('No feed files found for %d Q%d in date range %s', year, quarter, filing_date)

    # Log summary statistics
    log.info('Download complete. Downloaded %d feed files.', total_feed_files_downloaded)
    if accession_numbers:
        log.info('Kept %d filings out of %d requested.', total_filings_kept, len(accession_numbers))
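

# Illustrative sketch (not part of the library's API): downloading the feed files
# for a short date range with maximum compression. The dates are placeholders;
# any 'YYYY-MM-DD' or 'YYYY-MM-DD:YYYY-MM-DD' value accepted by download_filings works.
def _example_download_date_range():
    download_filings('2025-01-02:2025-01-03',
                     overwrite_existing=False,
                     compress=True,
                     compression_level=9)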


def _filter_extracted_files(directory_path: Path,
                            accession_numbers: List[str],
                            compress: bool = True,
                            compression_level: int = 6) -> int:
    """
    Filter files in the extracted directory to keep only those matching the specified accession numbers.
    Files from the current extraction that don't match are removed to save disk space.
    Files that existed before this extraction (from previous downloads) are preserved.

    Args:
        directory_path: Path to the directory containing extracted files
        accession_numbers: List of accession numbers to keep
        compress: Whether to compress the kept files (default: True)
        compression_level: Compression level for gzip (1-9, with 9 being highest compression)

    Returns:
        int: Number of filings kept
    """
    if not directory_path.is_dir():
        return 0

    # Convert accession numbers to the format used in filenames (removing dashes)
    normalized_accession_numbers = [an.replace('-', '') for an in accession_numbers]

    # Keep track of which filings were found
    filings_kept = 0

    # Get the list of files that existed before this extraction.
    # We'll preserve these even if they don't match our filter.
    existing_files = getattr(_filter_extracted_files, '_existing_files', set())

    # Find all .nc files in the directory
    for file_path in directory_path.glob('*.nc'):
        # Extract the accession number from the filename
        file_accession = file_path.stem
        undashed_accession = file_accession.replace('-', '')

        # Check if this file matches our filter
        matches_filter = (undashed_accession in normalized_accession_numbers or
                          file_accession in accession_numbers)

        # Check if this file existed before this extraction
        was_preexisting = str(file_path) in existing_files

        if matches_filter:
            filings_kept += 1
            # Compress the file if requested
            if compress and not is_compressed_file(file_path):
                try:
                    compress_filing(file_path, compression_level=compression_level)
                    log.debug(f"Compressed {file_path}")
                except Exception as e:
                    log.warning(f"Failed to compress {file_path}: {e}")
        elif not was_preexisting:
            # Remove files from the current extraction that don't match the filter,
            # but preserve files that existed before this extraction
            try:
                file_path.unlink()
                log.debug(f"Removed non-matching file from current extraction: {file_path}")
            except Exception as e:
                log.warning(f"Failed to remove {file_path}: {e}")

    return filings_kept


def is_feed_file_in_date_range(filename: str,
                               start_date: Optional[datetime],
                               end_date: Optional[datetime]) -> bool:
    """
    Check if a feed file falls within the specified date range.
    Feed files are named like '20240102.nc.tar.gz'
    """
    # Extract the date from the filename
    match = re.search(r'(\d{8})\.nc\.tar\.gz', filename)
    if not match:
        return False

    date_str = match.group(1)
    file_date = datetime.strptime(date_str, '%Y%m%d')

    # For a single date (not a range)
    if start_date and not end_date:
        return (file_date.year == start_date.year and
                file_date.month == start_date.month and
                file_date.day == start_date.day)

    # For a date range
    if start_date:
        if file_date < start_date:
            return False
    if end_date:
        if file_date > end_date:
            return False
    return True


def list_filing_feed_files_for_quarter(year: int, quarter: int) -> pd.DataFrame:
    assert quarter in (1, 2, 3, 4), "Quarter must be between 1 and 4"
    url = f"https://www.sec.gov/Archives/edgar/Feed/{year}/QTR{quarter}/"
    return list_filing_feed_files(url)
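

# Illustrative sketch (not part of the library's API): listing the feed files for
# one quarter and narrowing them to a single day with is_feed_file_in_date_range.
# The year, quarter and date below are placeholders; network access is required.
def _example_list_feed_files_for_day() -> pd.DataFrame:
    feed_files = list_filing_feed_files_for_quarter(2024, 1)
    single_day = datetime(2024, 1, 2)
    # Passing only a start date makes is_feed_file_in_date_range match that exact day
    return feed_files[feed_files['Name'].apply(
        lambda name: is_feed_file_in_date_range(name, single_day, None))]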


def get_sec_file_listing(url: str) -> pd.DataFrame:
    """
    Reads an SEC EDGAR file listing directory and returns file information as a DataFrame.

    Args:
        url (str): URL of the SEC EDGAR feed directory
                   (e.g., 'https://www.sec.gov/Archives/edgar/Feed/2024/QTR1/')

    Returns:
        pd.DataFrame: DataFrame containing file information with columns:
            - Name: str, filename
            - Size: int, file size in bytes
            - Modified: datetime, last modification timestamp

    Raises:
        FileNotFoundError: If the page is not found (404)
        ConnectionError: If unable to download the page
        DirectoryBrowsingNotAllowed: If the SEC does not allow browsing the directory
        RuntimeError: If the page structure is invalid or no table is found
    """
    try:
        html = download_text(url)
    except HTTPStatusError as e:
        if e.response.status_code == 403:
            log.warning(f"There are no feed files for url {url}")
            return pd.DataFrame(columns=['Name', 'File', 'Size', 'Modified'])
        elif e.response.status_code == 404:
            raise FileNotFoundError(f"Page not found: {url}") from None
        raise ConnectionError(f"Failed to download page: {str(e)}") from e

    if "Directory Browsing Not Allowed" in html:
        log.warning(f"Directory browsing is not allowed for {url}")
        raise DirectoryBrowsingNotAllowed(
            url,
            "Directory browsing is not allowed here. This is unexpected; the SEC has likely "
            "changed its policy for viewing the bulk filing files."
        )

    soup = BeautifulSoup(html, 'html.parser')
    table = soup.find('table')
    if not table:
        raise RuntimeError("No table found in the page")

    records = []

    # Process table rows, skipping the header row
    for row in table.find_all('tr')[1:]:
        cells = row.find_all('td')
        # Skip if the row structure is invalid
        if len(cells) != 3:
            continue

        name = cells[0].text.strip()
        # Skip parent directory entries
        if name in ('.', '..'):
            continue

        feed_file_url = f"{url}{name}"

        # Parse the file size (convert "1.2K", "3.4M" etc. to bytes)
        size_text = cells[1].text.strip()
        size = parse_file_size(size_text)

        # Parse the modification date
        modified_text = cells[2].text.strip()
        try:
            modified = datetime.strptime(modified_text, '%m/%d/%Y %I:%M:%S %p')
        except ValueError:
            modified = None

        records.append((name, feed_file_url, size, modified))

    df = pd.DataFrame(records, columns=['Name', 'File', 'Size', 'Modified'])
    return df


def list_filing_feed_files(url: str) -> pd.DataFrame:
    """
    Reads the SEC EDGAR filing feed directory and returns file information as a DataFrame.

    Args:
        url (str): URL of the SEC EDGAR feed directory
                   (e.g., 'https://www.sec.gov/Archives/edgar/Feed/2024/QTR1/')

    Returns:
        pd.DataFrame: DataFrame containing file information with columns:
            - Name: str, filename
            - Size: int, file size in bytes
            - Modified: datetime, last modification timestamp

    Raises:
        ValueError: If the URL is invalid or doesn't point to SEC EDGAR
        ConnectionError: If unable to download the page
        RuntimeError: If the page structure is invalid or no table is found
    """
    # Validate the URL
    if not url.startswith('https://www.sec.gov/Archives/edgar/Feed/'):
        raise ValueError("URL must be an SEC EDGAR feed directory")
    return get_sec_file_listing(url)


def parse_file_size(size_text: str) -> Optional[int]:
    """Convert a size string to bytes (e.g., "1.2K" -> 1228)"""
    if not size_text:
        return None

    units = {'B': 1, 'K': 1024, 'M': 1024 * 1024, 'G': 1024 * 1024 * 1024}
    pattern = r'(\d+\.?\d*)\s*([BKMG])?'
    match = re.match(pattern, size_text.upper())
    if not match:
        return None

    number = float(match.group(1))
    unit = match.group(2) or 'B'
    return int(number * units[unit])


def latest_filing_date():
    """Get the latest filing date"""
    from edgar import get_filings
    return get_filings().end_date
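

# Illustrative sketch (not part of the library's API): parse_file_size converts the
# human-readable sizes shown in SEC directory listings into bytes.
def _example_parse_sizes():
    assert parse_file_size("512") == 512               # plain byte count
    assert parse_file_size("1.2K") == 1228             # 1.2 * 1024, truncated to int
    assert parse_file_size("3M") == 3 * 1024 * 1024    # megabytes
    assert parse_file_size("") is None                 # empty input is not parseable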


def is_compressed_file(file_path: Path) -> bool:
    """
    Check if a file is gzip-compressed by examining its extension.

    Args:
        file_path: Path to the file

    Returns:
        bool: True if the file has a .gz extension, False otherwise
    """
    return str(file_path).endswith('.gz')


def compress_filing(file_path: Path,
                    compression_level: int = 6,
                    delete_original: bool = True) -> Path:
    """
    Compress a filing file using gzip and optionally delete the original.

    Args:
        file_path: Path to the file to compress
        compression_level: Compression level (1-9, with 9 being highest compression)
        delete_original: Whether to delete the original file after compression

    Returns:
        Path to the compressed file

    Raises:
        FileNotFoundError: If the file does not exist
        ValueError: If the file is already compressed
    """
    import gzip
    import shutil

    if not file_path.exists():
        raise FileNotFoundError(f"File not found: {file_path}")

    if is_compressed_file(file_path):
        raise ValueError(f"File is already compressed: {file_path}")

    compressed_path = Path(f"{file_path}.gz")

    # Compress the file
    with file_path.open('rb') as f_in:
        with gzip.open(compressed_path, 'wb', compresslevel=compression_level) as f_out:
            shutil.copyfileobj(f_in, f_out)

    # Delete the original file if requested
    if delete_original:
        file_path.unlink()

    return compressed_path


def decompress_filing(file_path: Path,
                      output_path: Optional[Path] = None,
                      delete_original: bool = False) -> Path:
    """
    Decompress a gzip-compressed filing file.

    Args:
        file_path: Path to the compressed file
        output_path: Path to save the decompressed file (if None, use the original path without .gz)
        delete_original: Whether to delete the original compressed file

    Returns:
        Path to the decompressed file

    Raises:
        FileNotFoundError: If the file does not exist
        ValueError: If the file is not compressed
        gzip.BadGzipFile: If the file is not a valid gzip file
    """
    import gzip
    import shutil

    if not file_path.exists():
        raise FileNotFoundError(f"File not found: {file_path}")

    if not is_compressed_file(file_path):
        raise ValueError(f"File is not compressed: {file_path}")

    # Determine the output path if not provided
    if output_path is None:
        # Remove the .gz extension
        output_path = Path(str(file_path)[:-3])

    # Decompress the file
    with gzip.open(file_path, 'rb') as f_in:
        with output_path.open('wb') as f_out:
            shutil.copyfileobj(f_in, f_out)

    # Delete the original compressed file if requested
    if delete_original:
        file_path.unlink()

    return output_path


def compress_all_filings(data_directory: Optional[Path] = None, compression_level: int = 6) -> int:
    """
    Compress all uncompressed filing files in the data directory.

    Args:
        data_directory: Path to the data directory (defaults to the Edgar data directory)
        compression_level: Compression level (1-9, with 9 being highest compression)

    Returns:
        Number of files compressed
    """
    if data_directory is None:
        data_directory = get_edgar_data_directory() / 'filings'

    # Find all .nc files (not already compressed)
    files_compressed = 0
    for file_path in tqdm(list(data_directory.glob('**/*.nc')), desc="Compressing files"):
        if not is_compressed_file(file_path) and file_path.is_file():
            try:
                compress_filing(file_path, compression_level=compression_level)
                files_compressed += 1
            except Exception as e:
                log.warning(f"Failed to compress {file_path}: {e}")

    return files_compressed
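

# Illustrative sketch (not part of the library's API): round-tripping a file through
# compress_filing and decompress_filing. The .nc file and its contents are synthetic,
# created only for the demonstration.
def _example_compress_roundtrip():
    import tempfile
    demo_dir = Path(tempfile.mkdtemp(prefix="edgar_demo_"))
    sample = demo_dir / "0000000000-25-000001.nc"                # hypothetical accession number
    sample.write_text("<SEC-DOCUMENT>...</SEC-DOCUMENT>")
    gz_path = compress_filing(sample, compression_level=9)       # deletes the original by default
    restored = decompress_filing(gz_path, delete_original=True)  # removes the .gz afterwards
    assert restored.read_text().startswith("<SEC-DOCUMENT>")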


def check_filings_exist_locally(filing_date: Union[str, date], accession_numbers: List[str]) -> bool:
    """
    Check if all specified accession numbers already exist locally for a given filing date.

    Args:
        filing_date: The filing date (YYYY-MM-DD format)
        accession_numbers: List of accession numbers to check

    Returns:
        bool: True if all filings exist locally, False otherwise
    """
    if not accession_numbers:
        return False

    for accession_number in accession_numbers:
        # Check both compressed and uncompressed versions (local_filing_path prefers the .gz if present)
        filing_path = local_filing_path(filing_date, accession_number)
        if not filing_path.exists():
            return False

    return True


def local_filing_path(filing_date: Union[str, date],
                      accession_number: str,
                      correction: bool = False) -> Path:
    """
    Get the local path for a filing.

    If correction is True, will look for the corrected filing with extension 'corr'.
    Returns the compressed version (.gz) if it exists, otherwise returns the uncompressed path.
    """
    ext = 'corr' if correction else 'nc'
    if isinstance(filing_date, date):
        filing_date = filing_date.strftime('%Y-%m-%d')
    filing_date = filing_date.replace('-', '')

    # Base path without compression extension
    base_path = get_edgar_data_directory() / 'filings' / filing_date / f"{accession_number}.{ext}"

    # Check for the compressed version first
    compressed_path = Path(f"{base_path}.gz")
    if compressed_path.exists():
        return compressed_path

    # Fall back to the uncompressed version
    return base_path
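

# Illustrative sketch (not part of the library's API): resolving where a filing would
# be stored locally and checking whether a batch is already present. The date and
# accession number are hypothetical placeholders.
def _example_local_lookup():
    accession = '0000000000-25-000001'
    path = local_filing_path('2025-01-03', accession)  # returns the .gz path if a compressed copy exists
    already_stored = check_filings_exist_locally('2025-01-03', [accession])
    return path, already_stored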