Files
2025-12-09 12:13:01 +01:00

164 lines
5.5 KiB
Python

import locale
import os
from contextlib import asynccontextmanager, contextmanager
from pathlib import Path
from typing import AsyncGenerator, Generator, Literal, Optional
import httpx
# Fix for issue #457: Force C locale for httpxthrottlecache to avoid locale-dependent date parsing
# httpxthrottlecache uses time.strptime() which is locale-dependent. On non-English systems
# (Chinese, Japanese, German, etc.), HTTP date headers fail to parse because month/day names
# are in the local language. Setting LC_TIME to 'C' ensures English date parsing.
# See: https://github.com/dgunning/edgartools/issues/457
try:
locale.setlocale(locale.LC_TIME, 'C')
except (locale.Error, ValueError):
# If 'C' locale is not available, try to continue anyway
# This shouldn't happen on most systems, but better safe than sorry
pass
from httpxthrottlecache import HttpxThrottleCache
from edgar.core import get_identity, strtobool
from .core import edgar_data_dir
MAX_SUBMISSIONS_AGE_SECONDS = 30 # Check for submissions every 30 seconds (reduced from 10 min for Issue #471)
MAX_INDEX_AGE_SECONDS = 30 * 60 # Check for updates to index (ie: daily-index) every 30 minutes
# rules are regular expressions matching the request url path:
# The value determines whether it is cached or not:
# - int > 0: how many seconds it'll be considered valid. During this time, the cached object will not be revalidated.
# - False or 0: Do not cache
# - True: Cache forever, never revalidate
# - None: Determine cachability using response cache headers only.
#
# Note that: revalidation consumes rate limit "hit", but will be served from cache if the data hasn't changed.
CACHE_RULES = {
r".*\.sec\.gov": {
"/submissions.*": MAX_SUBMISSIONS_AGE_SECONDS,
r"/include/ticker\.txt.*": MAX_SUBMISSIONS_AGE_SECONDS,
r"/files/company_tickers\.json.*": MAX_SUBMISSIONS_AGE_SECONDS,
".*index/.*": MAX_INDEX_AGE_SECONDS,
"/Archives/edgar/data": True, # cache forever
}
}
def get_cache_directory() -> str:
cachedir = Path(edgar_data_dir) / "_tcache"
cachedir.mkdir(parents=True, exist_ok=True)
return str(cachedir)
def get_edgar_verify_ssl():
"""
Returns True if using SSL verification on http requests
"""
if "EDGAR_VERIFY_SSL" in os.environ:
return strtobool(os.environ["EDGAR_VERIFY_SSL"])
else:
return True
def get_http_mgr(cache_enabled: bool = True, request_per_sec_limit: int = 9) -> HttpxThrottleCache:
cache_mode: Literal[False, "Disabled", "Hishel-S3", "Hishel-File", "FileCache"]
if cache_enabled:
cache_dir = get_cache_directory()
cache_mode = "Hishel-File"
else:
cache_dir = None
cache_mode = "Disabled"
http_mgr = HttpxThrottleCache(
user_agent_factory=get_identity, cache_dir=cache_dir, cache_mode=cache_mode, request_per_sec_limit=request_per_sec_limit,
cache_rules = CACHE_RULES
)
http_mgr.httpx_params["verify"] = get_edgar_verify_ssl()
return http_mgr
@asynccontextmanager
async def async_http_client(client: Optional[httpx.AsyncClient] = None, **kwargs) -> AsyncGenerator[httpx.AsyncClient, None]:
async with HTTP_MGR.async_http_client(client=client, **kwargs) as client:
yield client
@contextmanager
def http_client(**kwargs) -> Generator[httpx.Client, None, None]:
with HTTP_MGR.http_client(**kwargs) as client:
yield client
def get_http_params():
return HTTP_MGR._populate_user_agent(HTTP_MGR.httpx_params.copy())
def close_clients():
HTTP_MGR.close()
HTTP_MGR = get_http_mgr()
def clear_locale_corrupted_cache():
"""
One-time cache clearing function to remove locale-corrupted cache files from Issue #457.
This function addresses a specific issue where cache files created with non-English locales
(Chinese, Japanese, German, etc.) contain timestamps that cannot be deserialized after
the locale fix was applied in v4.19.0.
The function:
1. Checks for a marker file to avoid repeated clearing
2. Clears the HTTP cache directory if marker doesn't exist
3. Creates a marker file to prevent future clearing
This is safe to call multiple times - it will only clear cache once per installation.
Returns:
bool: True if cache was cleared, False if already cleared previously
"""
import logging
import shutil
from pathlib import Path
try:
cache_dir = Path(get_cache_directory())
marker_file = cache_dir / ".locale_fix_457_applied"
# If marker exists, cache was already cleared
try:
if marker_file.exists():
return False
except (PermissionError, OSError):
# If we can't check marker file, assume we need to proceed
pass
# Clear the cache directory if it exists
if cache_dir.exists():
# Remove all cache files
shutil.rmtree(cache_dir)
# Recreate the directory
cache_dir.mkdir(parents=True, exist_ok=True)
# Create marker file
marker_file.touch()
return True
else:
# No cache exists, just create marker
cache_dir.mkdir(parents=True, exist_ok=True)
marker_file.touch()
return False
except Exception as e:
# Log error but don't fail - worst case user still has cache issues
logging.getLogger(__name__).warning(
f"Failed to clear locale-corrupted cache: {e}. "
"You may need to manually delete ~/.edgar/_tcache directory."
)
return False