import asyncio
import datetime
import logging.config
import os
import random
import re
import sys
import threading
from _thread import interrupt_main
from concurrent.futures import ThreadPoolExecutor
from dataclasses import dataclass
from datetime import date
from functools import lru_cache, partial, wraps
from pathlib import Path
from typing import Callable, Iterable, List, Optional, Tuple, TypeVar, Union

import httpx
import pandas as pd
import pyarrow as pa
import pytz
from pandas.tseries.offsets import BDay
from rich.logging import RichHandler
from rich.prompt import Prompt

from edgar.datatools import PagingState

log = logging.getLogger(__name__)


def _leading_int(component: str) -> int:
    """Return the integer formed by the leading digits of *component*, or 0 if there are none."""
    digits = re.match(r"\d+", component)
    return int(digits.group()) if digits else 0


def parse_pandas_version():
    """
    Parse the installed pandas version without external dependencies.

    Only the leading digits of each dotted component are used, so pre-release,
    dev and build suffixes (e.g. ``2.2.0rc1``, ``3.0.0a1``, ``2.1.0.dev0+123``)
    do not raise. Missing components default to 0.

    Returns:
        tuple: ``(major, minor, patch)`` as integers
    """
    components = pd.__version__.split('.')
    # Pad so that short versions such as "2" or "2.1" still yield three numbers
    major, minor, patch = (_leading_int(part) for part in (components + ['0', '0'])[:3])
    return (major, minor, patch)


pandas_version = parse_pandas_version()

# Python version as (major, minor, micro). sys.version_info is already numeric,
# unlike the sys.version text which can carry suffixes such as "rc1" or "+".
python_version = tuple(sys.version_info[:3])

__all__ = [
    'log',
    'Result',
    'get_bool',
    'edgar_mode',
    'NORMAL',
    'CRAWL',
    'CAUTION',
    'sec_edgar',
    'IntString',
    'sec_dot_gov',
    'get_identity',
    'python_version',
    'set_identity',
    'strtobool',
    'listify',
    'decode_content',
    'cache_except_none',
    'text_extensions',
    'binary_extensions',
    'ask_for_identity',
    'is_start_of_quarter',
    'run_async_or_sync',
    'get_edgar_data_directory',
    'is_probably_html',
    'has_html_content',
    'default_page_size',
    'parse_acceptance_datetime',
    'PagingState',
    'Years',
    'Quarters',
    'YearAndQuarter',
    'YearAndQuarters',
    'quarters_in_year',
    'parallel_thread_map',
    'pandas_version'
]

IntString = Union[str, int]
quarters_in_year: List[int] = list(range(1, 5))

YearAndQuarter = Tuple[int, int]
YearAndQuarters = List[YearAndQuarter]
Years = Union[int, List[int], range]
Quarters = Union[int, List[int], range]

# Date patterns
YYYY_MM_DD = "\\d{4}-\\d{2}-\\d{2}"
DATE_PATTERN = re.compile(YYYY_MM_DD)
DATE_RANGE_PATTERN = re.compile(f"^({YYYY_MM_DD}(:({YYYY_MM_DD})?)?|:({YYYY_MM_DD}))$")

default_http_timeout: int = 12
default_page_size = 50
default_max_connections = 10
default_retries = 3

limits = httpx.Limits(max_connections=default_max_connections)


def strtobool(val: str):
    """
    Convert a string representation of truth to a boolean.

    True values are case insensitive 'y', 'yes', 't', 'true', 'on', and '1'.
    Every other value - including the recognised false values 'n', 'no', 'f',
    'false', 'off', '0', empty/None input and unrecognised text - yields False.
    No exception is raised for unrecognised text.
    """
    if not val:
        return False
    return val.lower() in ('y', 'yes', 't', 'true', 'on', '1')


@dataclass
class EdgarSettings:
    """Connection settings (timeout, connection count, retries) for one Edgar access mode."""
    http_timeout: int
    max_connections: int
    retries: int = 3

    @property
    @lru_cache(maxsize=8)
    def limits(self):
        """The httpx connection limits for this settings object, honouring its max_connections."""
        # The cache is keyed on the instance via __hash__/__eq__ below; it is sized
        # so that the handful of mode instances do not evict each other.
        return httpx.Limits(max_connections=self.max_connections)

    def __eq__(self, othr):
        return (isinstance(othr, type(self))
                and (self.http_timeout, self.max_connections, self.retries) ==
                (othr.http_timeout, othr.max_connections, othr.retries))

    def __hash__(self):
        return hash((self.http_timeout, self.max_connections, self.retries))


# Modes of accessing edgar

# The normal mode of accessing edgar
NORMAL = EdgarSettings(http_timeout=15, max_connections=10)

# A bit more cautious mode of accessing edgar
CAUTION = EdgarSettings(http_timeout=20, max_connections=5)

# Use this setting when you have long-running jobs and want to avoid breaching Edgar limits
CRAWL = EdgarSettings(http_timeout=25, max_connections=2, retries=2)

# The access mode is chosen once, at import time, from EDGAR_ACCESS_MODE.
# Any value other than 'CAUTION' or 'CRAWL' selects NORMAL.
edgar_access_mode = os.getenv('EDGAR_ACCESS_MODE', 'NORMAL')
edgar_mode = {'CAUTION': CAUTION, 'CRAWL': CRAWL}.get(edgar_access_mode, NORMAL)

edgar_identity = 'EDGAR_IDENTITY'

# SEC urls
sec_dot_gov = "https://www.sec.gov"
sec_edgar = "https://www.sec.gov/Archives/edgar"

# Local storage directory.
edgar_data_dir = os.path.join(os.path.expanduser("~"), ".edgar")


def set_identity(user_identity: str):
    """
    Set the environment variable EDGAR_IDENTITY to the identity used when calling Edgar.

    The identity is a name plus contact email, e.g. "Sample Company Name admin@example.com".
    See https://www.sec.gov/os/accessing-edgar-data

    :param user_identity: the identity string to store in EDGAR_IDENTITY
    """
    os.environ[edgar_identity] = user_identity
    log.info("Identity of the Edgar REST client set to [%s]", user_identity)

    # Imported inside the function rather than at module level
    from edgar.httpclient import close_clients
    close_clients()  # close any httpx clients, to reset the identity.


identity_prompt = """
[bold turquoise4]Identify your client to SEC Edgar[/bold turquoise4]
------------------------------------------------------------------------------

Before running [bold]edgartools[/bold] it needs to know the UserAgent string to send to Edgar.
See https://www.sec.gov/os/accessing-edgar-data

This can be set in the environment variable [bold green]EDGAR_IDENTITY[/bold green].

1. Set an OS environment variable
    [bold]EDGAR_IDENTITY=[green]Name email@domain.com[/green][/bold]
2. Or a Python environment variable
    import os
    [bold]os.environ['EDGAR_IDENTITY']=[green]"Name email@domain.com"[/green][/bold]
3. Or use [bold magenta]edgartools.set_identity[/bold magenta]
    from edgar import set_identity
    [bold]set_identity([green]'Name email@domain.com'[/green])[/bold]

But since you are already using [bold]edgartools[/bold] you can set it here

Enter your [bold green]EDGAR_IDENTITY[/bold green] e.g. [bold italic green]Name email@domain.com[/bold italic green]
"""


def ask_for_identity(user_prompt: str = identity_prompt,
                     timeout: int = 60):
    """
    Prompt the user on the console for their Edgar identity.

    A timer interrupts the main thread after ``timeout`` seconds; the resulting
    KeyboardInterrupt (or a manual Ctrl-C) is reported as a TimeoutError.

    :param user_prompt: the rich-markup text shown to the user
    :param timeout: seconds to wait for input
    :return: the entered identity with surrounding whitespace stripped
    :raises TimeoutError: if no input arrives before the prompt is interrupted
    """
    timer = threading.Timer(timeout, interrupt_main)
    timer.start()

    try:
        # Prompt the user for input
        input_str = Prompt.ask(user_prompt)

        # Strip the newline character from the end of the input string
        input_str = input_str.strip()
    except KeyboardInterrupt:
        # If the timeout is reached, raise a TimeoutError exception
        message = "You did not enter your Edgar user identity. Try again .. or set environment variable EDGAR_IDENTITY"
        log.warning(message)
        raise TimeoutError(message) from None
    finally:
        # Cancel the timer to prevent it from interrupting the main thread
        timer.cancel()

    return input_str


def get_identity() -> str:
    """
    Get the sec identity used to set the UserAgent string.

    Reads EDGAR_IDENTITY from the environment; when it is unset or empty the
    user is prompted and the answer is stored back into the environment.

    :return: the identity string
    """
    identity = os.environ.get(edgar_identity)
    if not identity:
        identity = ask_for_identity()
        os.environ[edgar_identity] = identity
    return identity


def decode_content(content: bytes):
    """Decode bytes as UTF-8, falling back to latin-1 when the bytes are not valid UTF-8."""
    try:
        return content.decode('utf-8')
    except UnicodeDecodeError:
        return content.decode('latin-1')


# File extensions, each including the leading dot.
# This module previously defined decode_content and these two tuples twice with
# identical contents; they are now defined once.
text_extensions = (".txt", ".htm", ".html", ".xsd", ".xml", ".XML", ".json", ".idx", ".paper")
binary_extensions = (".pdf", ".jpg", ".jpeg", ".png", ".gif", ".tif", ".tiff", ".bmp", ".ico", ".svg", ".webp",
                     ".avif", ".apng")


def get_bool(value: str = None) -> Optional[bool]:
    """Return True when the value is one of 1, "1", "Y", "true", "True", "TRUE"; otherwise False."""
    truthy_values = [1, "1", "Y", "true", "True", "TRUE"]
    return value in truthy_values


class Result:
    """
    This class represents the result of an operation which can succeed or fail.
    It allows for handling the failures more gracefully than using error handling
    """

    def __init__(self,
                 success: bool,
                 error: Optional[str] = None,
                 value: Optional[object] = None):
        self.success = success
        self.error = error
        self.value = value

    @property
    def failure(self) -> bool:
        """:return True if the operation failed"""
        return not self.success

    def __str__(self):
        return '[Success]' if self.success else f'[Failure] "{self.error}"'

    def __repr__(self):
        if self.success:
            return f"Result (success={self.success})"
        return f'Result (success={self.success}, message="{self.error}")'

    @classmethod
    def Fail(cls, error: str):
        """Create a Result for a failed operation"""
        return cls(False, error=error, value=None)

    @classmethod
    def Ok(cls, value: object):
        """Create a Result for a successful operation"""
        return cls(success=True, value=value, error=None)


def get_resource(file: str):
    """
    Return ``importlib.resources.path`` for a file packaged inside ``edgar``.

    :param file: the resource file name within the edgar package
    """
    # The submodule must be imported explicitly: "import importlib" alone does
    # not guarantee that the "resources" attribute is available.
    import importlib.resources

    import edgar
    return importlib.resources.path(edgar, file)


def get_edgar_data_directory() -> Path:
    """
    Get the edgar data directory, creating it if it does not exist.

    Uses EDGAR_LOCAL_DATA_DIR when set, otherwise ``~/.edgar``.
    """
    default_local_data_dir = Path(os.path.join(os.path.expanduser("~"), ".edgar"))
    data_directory = Path(os.getenv('EDGAR_LOCAL_DATA_DIR', default_local_data_dir))
    os.makedirs(data_directory, exist_ok=True)
    return data_directory


class TooManyRequestsException(Exception):
    """Exception carrying a message about too many requests."""

    def __init__(self, message: str):
        super().__init__(message)


def filing_date_to_year_quarters(filing_date: str) -> List[Tuple[int, int]]:
    """
    Convert a filing date, or a date range, to the (year, quarter) pairs it covers.

    :param filing_date: "YYYY-MM-DD", or a range "START:END" in which either side
        may be empty. An empty start means "1994-06-01"; an empty end means today.
    :return: list of (year, quarter) tuples in chronological order
    """
    if ":" not in filing_date:
        year, month, _ = map(int, filing_date.split("-"))
        return [(year, (month - 1) // 3 + 1)]

    start_date, end_date = filing_date.split(":")
    start_date = start_date or "1994-06-01"
    end_date = end_date or date.today().strftime("%Y-%m-%d")

    start_year, start_month, _ = map(int, start_date.split("-"))
    end_year, end_month, _ = map(int, end_date.split("-"))
    start_quarter = (start_month - 1) // 3 + 1
    end_quarter = (end_month - 1) // 3 + 1

    result = []
    for year in range(start_year, end_year + 1):
        # Only the first and last years are partial; years in between are whole
        first_quarter = start_quarter if year == start_year else 1
        last_quarter = end_quarter if year == end_year else 4
        result.extend((year, quarter) for quarter in range(first_quarter, last_quarter + 1))
    return result


def current_year_and_quarter() -> Tuple[int, int]:
    """Return the current (year, quarter) according to the America/New_York timezone."""
    eastern = pytz.timezone('America/New_York')
    now_eastern = datetime.datetime.now(eastern)
    return now_eastern.year, (now_eastern.month - 1) // 3 + 1


def filter_by_date(data: pa.Table,
                   date: Union[str, datetime.datetime],
                   date_col: str) -> pa.Table:
    # NOTE(review): the rest of this function appears to be missing from this
    # file - as written it only normalises `date` and then returns None.
    # Restore the filtering logic before relying on the return value.

    # If datetime convert to string (datetime.datetime is a subclass of datetime.date)
    if isinstance(date, datetime.date):
        date = date.strftime('%Y-%m-%d')


class DataPager:
    """Pages through a pyarrow Table or pandas DataFrame in fixed-size pages (1-based page numbers)."""

    def __init__(self,
                 data: Union[pa.Table, pd.DataFrame],
                 page_size=default_page_size):
        """
        :param data: the table or dataframe to page through
        :param page_size: number of rows per page; must be at least 1
        :raises ValueError: if page_size is less than 1
        """
        if page_size < 1:
            raise ValueError(f"page_size must be at least 1, got {page_size}")
        self.data: Union[pa.Table, pd.DataFrame] = data
        self.page_size = page_size
        # Ceiling division, so an exact multiple of page_size does not create an
        # extra empty page. Empty data still counts as one (empty) page.
        self.total_pages = max(1, -(-len(self.data) // page_size))
        self.current_page = 1

    def has_next(self):
        """:return True if there is a page after the current one"""
        return self.current_page < self.total_pages

    def has_previous(self):
        """:return True if there is a page before the current one"""
        return self.current_page > 1

    def next(self):
        """Get the next page of data, or None when already on the last page"""
        if not self.has_next():
            return None
        self.current_page += 1
        return self.current()

    def previous(self):
        """Get the previous page of data, or None when already on the first page"""
        if not self.has_previous():
            return None
        self.current_page -= 1
        return self.current()

    @property
    def _current_range(self) -> Tuple[int, int]:
        """Get the current start and end index for the data"""
        start_index = (self.current_page - 1) * self.page_size
        end_index = min(len(self.data), start_index + self.page_size)
        return start_index, end_index

    def current(self) -> Union[pa.Table, pd.DataFrame]:
        """
        Get the current data page, as the same type as the underlying data

        :return: a slice of the pyarrow Table, or the matching rows of the DataFrame
        """
        start_index = (self.current_page - 1) * self.page_size
        if isinstance(self.data, pa.Table):
            return self.data.slice(offset=start_index, length=self.page_size)
        return self.data.iloc[start_index:start_index + self.page_size]

    @property
    def start_index(self):
        """Index of the first row on the current page"""
        return (self.current_page - 1) * self.page_size

    @property
    def end_index(self):
        """Index one past the last row on the current page, never beyond the data length"""
        return min(len(self.data), self.start_index + self.page_size)


# NOTE(review): this definition replaces the PagingState name imported from
# edgar.datatools at the top of the module - confirm which one is intended.
@dataclass
class PagingState:
    page_start: int
    num_records: int


def parse_acceptance_datetime(acceptance_datetime: str) -> datetime.datetime:
    """Parse an ISO formatted datetime string, treating a 'Z' suffix as UTC (+00:00)."""
    return datetime.datetime.fromisoformat(acceptance_datetime.replace('Z', '+00:00'))


def sample_table(table, n=None, frac=None, replace=False, random_state=None):
    """
    Take a sample from a pyarrow Table

    :param table: the table to sample from
    :param n: number of rows to take; capped at the table length when sampling without replacement
    :param frac: fraction of the rows to take; takes precedence over n when given
    :param replace: if True the same row may be picked more than once
    :param random_state: optional seed for a reproducible sample (0 is a valid seed)
    :return: the result of ``table.take`` on the chosen row indices. When neither
        n nor frac is given, all rows are returned in shuffled order.
    """
    # A dedicated generator is used when seeded, so the global random state is left alone
    rng = random if random_state is None else random.Random(random_state)
    row_count = len(table)

    if frac is not None:
        n = int(row_count * frac)

    if n is None:
        indices = rng.sample(range(row_count), row_count)
    elif replace:
        indices = [rng.randint(0, row_count - 1) for _ in range(n)]
    else:
        indices = rng.sample(range(row_count), min(n, row_count))

    return table.take(indices)


def run_async_or_sync(coroutine):
    """
    Run a coroutine to completion and return its result.

    When IPython has not been imported this is plain ``asyncio.run``. When it has,
    the coroutine is run on the current event loop, applying ``nest_asyncio`` if
    that loop is already running or no loop could be obtained.

    Exceptions raised by the coroutine propagate to the caller unchanged.
    """
    if 'IPython' not in sys.modules:
        # We're not in an IPython environment, use asyncio.run()
        return asyncio.run(coroutine)

    # try is needed for ipython console
    try:
        loop = asyncio.get_event_loop()
    except RuntimeError:
        import nest_asyncio
        nest_asyncio.apply()
        loop = asyncio.get_event_loop()

    if loop.is_running():
        # We're in a notebook with an active event loop
        import nest_asyncio
        nest_asyncio.apply()

    return loop.run_until_complete(coroutine)


def listify(value):
    """
    Convert the input to a list if it's not already a list.

    Args:
        value: Any type of input

    Returns:
        list: The same list when given a list, the expanded values when given
        a range, otherwise a single-element list containing the input
    """
    if isinstance(value, list):
        return value
    if isinstance(value, range):
        return list(value)
    return [value]


def is_start_of_quarter():
    """
    Return True when today (local time) is within the first 5 days of a quarter's
    first month and no later than one business day after the 1st of that month.
    """
    today = datetime.datetime.now().date()

    # Only the first few days of January, April, July and October qualify
    if today.month not in [1, 4, 7, 10] or today.day > 5:
        return False

    first_day_of_quarter = datetime.datetime(today.year, today.month, 1).date()

    # One business day after the start of the quarter
    one_business_day_after = (first_day_of_quarter + BDay(1)).date()

    return today <= one_business_day_after


def cache_except_none(maxsize=128):
    """
    A decorator that caches the result of a function, but only if the result is not None.

    None results are returned to the caller but never stored, so the wrapped
    function is called again next time for the same arguments. Other cached
    entries are unaffected. The wrapper exposes ``cache_info`` and ``cache_clear``.
    """

    def decorator(func):

        class _NoneResult(Exception):
            """Raised internally to carry a None result out of lru_cache, which never caches exceptions."""

        @lru_cache(maxsize=maxsize)
        def cached_func(*args, **kwargs):
            result = func(*args, **kwargs)
            if result is None:
                raise _NoneResult
            return result

        @wraps(func)
        def wrapper(*args, **kwargs):
            try:
                return cached_func(*args, **kwargs)
            except _NoneResult:
                return None

        # Preserve cache methods
        wrapper.cache_info = cached_func.cache_info
        wrapper.cache_clear = cached_func.cache_clear

        return wrapper

    return decorator


def is_probably_html(content: str) -> bool:
    """Does it have html tags. Bytes input is decoded as UTF-8, ignoring errors."""
    if isinstance(content, bytes):
        content = content.decode('utf-8', errors='ignore')

    # Check for common HTML tags
    html_tags = ['<html>', '<body>', '<head>', '<title>', '<div', '<span', '<p>']
    lowered = content.lower()
    return any(tag in lowered for tag in html_tags)


def has_html_content(content: str) -> bool:
    """
    Check if the content is HTML or inline XBRL HTML

    :param content: text or bytes (decoded as UTF-8, ignoring errors); None gives False
    """
    if content is None:
        return False

    if isinstance(content, bytes):
        content = content.decode('utf-8', errors='ignore')

    # Strip only leading whitespace and get first 200 chars for doctype check
    content = content.lstrip()
    first_200_lower = content[:200].lower()

    # Check for XHTML doctype declarations
    doctype_markers = (
        '<!doctype html public "-//w3c//dtd xhtml',
        '<!doctype html system "http://www.w3.org/tr/xhtml1/dtd/',
        '<!doctype html public "-//w3c//dtd html 4.01 transitional//en"',
    )
    if any(marker in first_200_lower for marker in doctype_markers):
        return True

    # Look for common XML/HTML indicators in first 1000 chars
    first_1000 = content[:1000]

    # Check for standard XHTML namespace
    if 'xmlns="http://www.w3.org/1999/xhtml"' in first_1000:
        return True

    # Any <html ...> root element counts, with or without inline XBRL namespaces.
    # This catches cases like <html style="..."> too. The match is case sensitive.
    if '<html' in first_1000:
        return True

    # Just check for straightforward HTML. Because this comparison is done on
    # lower-cased text it also accepts documents written as <HTML> ... </HTML>.
    return first_200_lower.startswith('<html>') and content[-7:].lower().startswith('</html>')


T = TypeVar('T')
R = TypeVar('R')


def parallel_thread_map(func: Callable[[T], R],
                        items: Iterable[T],
                        **kwargs) -> List[R]:
    """
    Run a function in parallel across multiple items using ThreadPoolExecutor.

    This is a replacement for fastcore's parallel function, supporting only the
    threadpool execution mode. It does not include progress bars.

    Args:
        func: The function to apply to each item
        items: The items to process
        **kwargs: Additional keyword arguments to pass to func. The special key
            ``n_workers`` sets the number of threads and is not passed to func.

    Returns:
        List of results from applying func to each item, in the order of items
    """
    # Default to min(32, cores+4) which is a good balance for I/O-bound tasks
    max_workers = kwargs.pop('n_workers', None) or min(32, (os.cpu_count() or 1) + 4)

    # Convert items to a list for easier handling
    items_list = list(items)

    # Bind the remaining keyword arguments, if any, to every call
    worker = partial(func, **kwargs) if kwargs else func

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        return list(executor.map(worker, items_list))


def initialize_rich_logging():
    """Configure root logging to use a RichHandler at INFO level and quieten third party loggers."""
    # Rich logging
    logging.basicConfig(
        level="INFO",
        format="%(message)s",
        datefmt="[%X]",
        handlers=[RichHandler(rich_tracebacks=True)]
    )

    # Turn down 3rd party logging
    logging.getLogger("httpx").setLevel(logging.WARNING)
    logging.getLogger("httpxthrottlecache").setLevel(logging.WARNING)
    # TODO: Temporary, until next pyrate_limiter update that reduces the spurious "async" message
    logging.getLogger("pyrate_limiter").setLevel(logging.CRITICAL)


# Turn on rich logging if the environment variable is set
if os.getenv('EDGAR_USE_RICH_LOGGING', '0') == '1':
    initialize_rich_logging()