Files
2025-12-09 12:13:01 +01:00

476 lines
16 KiB
Python

import json
import os
import re
from enum import Enum
from functools import lru_cache
from io import StringIO
from typing import Any, Dict, List, Optional, Union
import pandas as pd
import pyarrow as pa
from httpx import HTTPStatusError
from edgar.core import get_edgar_data_directory, listify, log
from edgar.httprequests import download_file, download_json
from edgar.reference.data.common import read_csv_from_package, read_parquet_from_package
# Public API of this module. Each exported name is listed exactly once.
__all__ = [
    'cusip_ticker_mapping',
    'get_ticker_from_cusip',
    'get_company_tickers',
    'get_icon_from_ticker',
    'find_cik',
    'get_cik_tickers',
    'get_company_ticker_name_exchange',
    'get_companies_by_exchange',
    'popular_us_stocks',
    'get_mutual_fund_tickers',
    'find_mutual_fund_cik',
    'list_all_tickers',
    'find_ticker',
    'find_ticker_safe',
    'get_cik_ticker_lookup',
    'get_company_cik_lookup',
    'get_cik_tickers_from_ticker_txt',
    'ticker_txt_url',
    'company_tickers_json_url',
    'mutual_fund_tickers_url',
    'company_tickers_exchange_url',
    'Exchange',
]

# SEC endpoints used as data sources by the functions in this module.
# Tab-separated ticker/CIK text file.
ticker_txt_url = "https://www.sec.gov/include/ticker.txt"
# JSON with one entry per company (cik_str, ticker, title).
company_tickers_json_url = "https://www.sec.gov/files/company_tickers.json"
# JSON of mutual fund tickers.
mutual_fund_tickers_url = "https://www.sec.gov/files/company_tickers_mf.json"
# JSON of company tickers including the exchange (has 'fields' and 'data' keys).
company_tickers_exchange_url = "https://www.sec.gov/files/company_tickers_exchange.json"
@lru_cache(maxsize=1)
def cusip_ticker_mapping(allow_duplicate_cusips: bool = True) -> pd.DataFrame:
    """
    Load the CUSIP to ticker mapping that ships with the package.

    The table is read from the packaged ``ct.pq`` parquet file and indexed
    by its ``Cusip`` column.

    Args:
        allow_duplicate_cusips (bool): If True (default) the table is returned
            as-is, so a CUSIP may appear on several rows. If False only the
            first row for each CUSIP is kept.

    Returns:
        pd.DataFrame: The mapping indexed by CUSIP.
    """
    mapping = read_parquet_from_package('ct.pq').set_index('Cusip')
    if allow_duplicate_cusips:
        return mapping
    # Mark every repeat of an already-seen CUSIP and keep the rest
    repeated = mapping.index.duplicated(keep='first')
    return mapping[~repeated]
def load_tickers_from_local() -> Optional[Dict[str, Any]]:
    """
    Read the company tickers JSON from the local edgar data directory.

    The file is looked up under ``<edgar data dir>/reference`` using the file
    name taken from ``company_tickers_json_url``.

    Returns:
        Optional[Dict[str, Any]]: The parsed JSON, or None when the reference
        directory or the file does not exist.
    """
    reference_dir = get_edgar_data_directory() / "reference"
    local_copy = reference_dir / os.path.basename(company_tickers_json_url)
    if reference_dir.exists() and local_copy.exists():
        return json.loads(local_copy.read_text())
    return None
@lru_cache(maxsize=1)
def get_company_tickers(
        as_dataframe: bool = True,
        clean_name: bool = True,
        clean_suffix: bool = False
) -> Union[pd.DataFrame, pa.Table]:
    """
    Build the table of company CIKs, tickers and names from the SEC tickers JSON.

    When the ``EDGAR_USE_LOCAL_DATA`` environment variable is set, the local
    copy is tried first; the JSON is downloaded when no local data is found
    or the variable is not set.

    Args:
        as_dataframe (bool): If True, returns pandas DataFrame; if False, returns pyarrow Table
        clean_name (bool): If True, each title is passed through clean_company_name
        clean_suffix (bool): If True, each title is passed through clean_company_suffix

    Returns:
        Union[pd.DataFrame, pa.Table]: Columns ``cik``, ``ticker`` and ``company``

    Raises:
        Exception: Any error while loading or processing is logged and re-raised.
    """
    table_schema = pa.schema([
        ('cik', pa.int64()),
        ('ticker', pa.string()),
        ('company', pa.string())
    ])

    try:
        # Local data is only consulted when explicitly enabled
        tickers_json = load_tickers_from_local() if os.getenv("EDGAR_USE_LOCAL_DATA") else None
        if not tickers_json:
            tickers_json = download_json(company_tickers_json_url)

        ciks, tickers, companies = [], [], []
        for entry in tickers_json.values():
            title = entry['title']
            if clean_name:
                title = clean_company_name(title)
            if clean_suffix:
                title = clean_company_suffix(title)

            ciks.append(int(entry['cik_str']))
            tickers.append(entry['ticker'])
            companies.append(title)

        if as_dataframe:
            return pd.DataFrame({'cik': ciks, 'ticker': tickers, 'company': companies})

        columns = [
            pa.array(ciks, type=pa.int64()),
            pa.array(tickers, type=pa.string()),
            pa.array(companies, type=pa.string()),
        ]
        return pa.Table.from_arrays(columns, schema=table_schema)

    except Exception as e:
        log.error(f"Error fetching company tickers from [{company_tickers_json_url}]: {str(e)}")
        raise
def load_cik_tickers_from_local() -> Optional[str]:
    """
    Read the ticker text file from the local edgar data directory.

    The file is looked up under ``<edgar data dir>/reference`` using the file
    name taken from ``ticker_txt_url``.

    Returns:
        Optional[str]: The file contents, or None when the reference directory
        or the file does not exist.
    """
    reference_dir = get_edgar_data_directory() / "reference"
    local_copy = reference_dir / os.path.basename(ticker_txt_url)
    if reference_dir.exists() and local_copy.exists():
        return local_copy.read_text()
    return None
def get_cik_tickers_from_ticker_txt() -> Optional[pd.DataFrame]:
    """
    Get CIK and ticker data from the tab-separated ticker.txt file.

    When the ``EDGAR_USE_LOCAL_DATA`` environment variable is set, the local
    copy is tried first; the file is downloaded when no local data is found
    or the variable is not set.

    Returns:
        Optional[pd.DataFrame]: Columns ``ticker`` (upper-cased) and ``cik``
        (int64). Rows with an empty ticker or a missing/non-numeric CIK are
        dropped. Returns None (after logging) if loading or parsing fails.
    """
    try:
        ticker_txt = load_cik_tickers_from_local() if os.getenv("EDGAR_USE_LOCAL_DATA") else None
        if not ticker_txt:
            ticker_txt = download_file(ticker_txt_url, as_text=True)

        data = pd.read_csv(
            StringIO(ticker_txt),
            sep='\t',
            header=None,
            names=['ticker', 'cik'],
            dtype={'ticker': str},
            # pandas' default NA tokens include 'nan', 'null', 'n/a', ... which are
            # valid ticker spellings; treat only the empty field as missing so
            # such tickers are not silently dropped below.
            keep_default_na=False,
            na_values=[''],
        )
        # Anything that is not a number in the CIK column becomes NaN and is dropped
        data['cik'] = pd.to_numeric(data['cik'], errors='coerce')
        data = data.dropna()
        # Dropping rows can leave the column as float; keep CIKs as integers
        data['cik'] = data['cik'].astype('int64')
        data['ticker'] = data['ticker'].str.upper()
        return data
    except Exception as e:
        log.error(f"Error fetching company tickers from [{ticker_txt_url}]: {str(e)}")
        return None
@lru_cache(maxsize=1)
def get_cik_tickers():
    """
    Combine the ticker/CIK records from ticker.txt and the company tickers JSON.

    Returns:
        A DataFrame with ``ticker`` and ``cik`` columns. When only one source is
        available it is returned unchanged; otherwise both are concatenated
        and duplicate (ticker, cik) pairs are removed.

    Raises:
        Exception: If neither source could be loaded.
    """
    from_txt = get_cik_tickers_from_ticker_txt()
    try:
        from_json = get_company_tickers(clean_name=False, clean_suffix=False)[['ticker', 'cik']]
    except Exception:
        # The JSON source is best-effort; the text source may still be usable
        from_json = None

    available = [frame for frame in (from_txt, from_json) if frame is not None]
    if not available:
        raise Exception("Both data sources are unavailable")
    if len(available) == 1:
        return available[0]

    combined = pd.concat(available, ignore_index=True)
    return combined.drop_duplicates(subset=['ticker', 'cik'])
@lru_cache(maxsize=None)
def list_all_tickers():
    """Return the ``ticker`` column of the merged ticker/CIK data as a list."""
    ticker_column = get_cik_tickers()['ticker']
    return ticker_column.tolist()
@lru_cache(maxsize=None)
def get_company_cik_lookup():
    """
    Build a dictionary mapping ticker symbols to CIKs.

    Every ticker in the merged data maps to its CIK (a later row overrides an
    earlier one for the same ticker). In addition, the part of a ticker before
    the first '-' is registered as a key unless that key is already present.
    """
    frame = get_cik_tickers()
    cik_by_ticker = {}
    for symbol, cik in zip(frame['ticker'], frame['cik'], strict=False):
        cik_by_ticker[symbol] = cik
        # Base symbol never replaces an existing entry
        cik_by_ticker.setdefault(symbol.split('-')[0], cik)
    return cik_by_ticker
@lru_cache(maxsize=None)
def get_cik_ticker_lookup():
    """Build a dictionary mapping each CIK to a single base ticker symbol.

    The base symbol is the part of a ticker before the first '-'. When a CIK
    has several candidates the shortest one wins; among equally short
    candidates the one encountered first is kept.
    """
    shortest_by_cik = {}
    for symbol, cik in get_company_cik_lookup().items():
        candidate = symbol.split('-')[0]
        current = shortest_by_cik.get(cik)
        if current is None or len(candidate) < len(current):
            shortest_by_cik[cik] = candidate
    return shortest_by_cik
@lru_cache(maxsize=128)
def find_ticker(cik: Union[int, str]) -> str:
    """Look up the ticker symbol registered for a CIK.

    Args:
        cik: Central Index Key (CIK) as integer or string; leading zeros are ignored

    Returns:
        str: Ticker symbol, or an empty string when the CIK cannot be parsed
        as an integer or has no ticker
    """
    try:
        # Strip zero padding, then normalise to int for the dictionary lookup
        cik_number = int(str(cik).lstrip('0'))
        ticker_by_cik = get_cik_ticker_lookup()
        return ticker_by_cik.get(cik_number, "")
    except (ValueError, TypeError):
        return ""
def find_ticker_safe(cik: Union[int, str]) -> Optional[str]:
    """Look up the ticker for a CIK using only data that is already cached.

    The lookup is attempted only when the caches of get_cik_ticker_lookup,
    get_company_cik_lookup and get_cik_tickers all hold an entry, so calling
    this function does not itself trigger loading of the ticker data. This is
    intended for callers that must stay fast and must not block on I/O, such
    as rich display methods.

    Args:
        cik: Central Index Key (CIK) as integer or string

    Returns:
        Optional[str]: Ticker symbol, empty string if the CIK has no ticker,
        or None if the data is not cached or any error occurs
    """
    try:
        required_caches = (get_cik_ticker_lookup, get_company_cik_lookup, get_cik_tickers)
        if not all(cached.cache_info().currsize > 0 for cached in required_caches):
            # Something would have to be loaded first; give up instead
            return None

        cik_number = int(str(cik).lstrip('0'))
        # Served from the cache, so this call is cheap
        return get_cik_ticker_lookup().get(cik_number, "")
    except Exception:
        # Any failure is reported as "unknown" rather than propagated
        return None
@lru_cache(maxsize=None)
def get_company_ticker_name_exchange():
    """
    Return a DataFrame with columns [cik name ticker exchange]

    The data is downloaded from ``company_tickers_exchange_url``; the column
    names are taken from the ``fields`` entry of the JSON and the rows from
    its ``data`` entry.
    """
    # Use the module-level constant so the endpoint is defined in one place
    data = download_json(company_tickers_exchange_url)
    return pd.DataFrame(data['data'], columns=data['fields'])
def get_companies_by_exchange(exchange: Union[List[str], str]):
    """
    Get companies listed on one or more exchanges.

    The comparison is case-insensitive. Each requested exchange is converted
    with ``str()`` before comparison, so members of the ``Exchange`` enum
    (whose string form is their value) are accepted as well as plain strings.

    :param exchange: String like 'Nasdaq' or 'NYSE', or a list of them
    :return: DataFrame with companies listed on the specified exchange(s)
    with columns [cik name ticker exchange] and a fresh 0..n-1 index
    """
    df = get_company_ticker_name_exchange()
    wanted = [str(ex).lower() for ex in listify(exchange)]
    return df[df['exchange'].str.lower().isin(wanted)].reset_index(drop=True)
@lru_cache(maxsize=None)
def get_mutual_fund_tickers():
    """
    Get mutual fund tickers.

    The data is downloaded from ``mutual_fund_tickers_url`` and the rows of
    its ``data`` entry are returned as a dataframe with columns
    cik seriesId classId ticker
    """
    # Use the module-level constant so the endpoint is defined in one place
    data = download_json(mutual_fund_tickers_url)
    return pd.DataFrame(data['data'], columns=['cik', 'seriesId', 'classId', 'ticker'])
@lru_cache(maxsize=None)
def get_mutual_fund_lookup():
    """Build a dictionary mapping mutual fund ticker to CIK.

    If a ticker occurs more than once, the last row wins.
    """
    funds = get_mutual_fund_tickers()
    return {symbol: cik for symbol, cik in zip(funds['ticker'], funds['cik'], strict=False)}
def find_mutual_fund_cik(ticker):
    """
    Look up the CIK for a ticker in the mutual fund ticker data.

    :param ticker: String, the ticker symbol to look up (case-insensitive)
    :return: The CIK for the given ticker, or None if not found
    """
    cik_by_ticker = get_mutual_fund_lookup()
    symbol = ticker.upper()
    return cik_by_ticker.get(symbol)
def find_company_cik(ticker):
    """
    Look up the CIK for a company ticker.

    The ticker is upper-cased and any '.' is replaced with '-' before the
    lookup (e.g. a symbol written as 'abc.d' is looked up as 'ABC-D').

    :param ticker: String, the ticker symbol to look up
    :return: The CIK for the given ticker, or None if not found
    """
    cik_by_ticker = get_company_cik_lookup()
    return cik_by_ticker.get(ticker.upper().replace('.', '-'))
def find_company_ticker(cik: Union[int, str]) -> Union[str, List[str], None]:
    """
    Find the ticker(s) for a given CIK in the merged ticker/CIK data.

    :param cik (int or str): The CIK to look up; leading zeros are ignored
    :return Union[str, List[str], None]: A single ticker string if exactly one ticker is found,
    a list of ticker strings otherwise,
    or None if the CIK cannot be parsed as an integer, the data lacks the
    expected columns, or no row matches the CIK.
    """
    try:
        cik_number = int(str(cik).lstrip('0'))
    except (ValueError, TypeError):
        return None

    mapping = get_cik_tickers()
    if not {'cik', 'ticker'}.issubset(mapping.columns):
        return None

    matches = mapping.loc[mapping['cik'] == cik_number, 'ticker']
    if matches.empty:
        return None

    # Rows whose ticker is None are skipped
    found = [symbol for symbol in matches.to_numpy() if symbol is not None]
    return found[0] if len(found) == 1 else found
def find_cik(ticker):
    """
    Find the CIK for a given ticker, checking company data first and
    mutual fund data second.

    :param ticker: String, the ticker symbol to look up
    :return: The CIK for the given ticker, or None if not found in either source
    """
    company_cik = find_company_cik(ticker)
    # The mutual fund data is only consulted when the company lookup misses
    return company_cik if company_cik is not None else find_mutual_fund_cik(ticker)
@lru_cache(maxsize=128)
def get_ticker_from_cusip(cusip: str):
    """
    Get the ticker symbol for a given Cusip.

    If the Cusip appears on several rows of the mapping, the ticker of the
    first row is returned.

    :param cusip: String, the Cusip to look up
    :return: The ticker from the ``Ticker`` column of the mapping
    :raises KeyError: If the Cusip is not present in the mapping
    """
    data = cusip_ticker_mapping()
    # Selecting with a list label always yields a Series of tickers, whether
    # the Cusip is unique or duplicated and however many columns the table has.
    tickers = data.loc[[cusip], 'Ticker']
    return tickers.iloc[0]
def clean_company_name(name: str) -> str:
    """
    Strip a trailing slash-delimited tag from a company name.

    A tag is a '/' or '\\' followed by upper-case letters and an optional
    closing '/' or '\\' at the very end of the name (e.g. ' /DE/').
    Surrounding whitespace is removed from the result.
    """
    trailing_tag = re.compile(r'[/\\][A-Z]+[/\\]?$')
    without_tag = trailing_tag.sub('', name)
    return without_tag.strip()
def clean_company_suffix(name: str) -> str:
    """
    Remove a common corporate suffix from the company name.

    Steps, in order:
      1. Trailing '/' characters are removed.
      2. Every "& CO" (optionally followed by '.') is removed, e.g.
         "JPMORGAN CHASE & CO" or "ELI LILLY & Co".
      3. One trailing suffix out of Inc, Co, Corp, PLC, Ltd, Limited or L.P
         (optionally followed by '.') is removed.

    Matching is case-insensitive. Only a single trailing suffix is removed,
    so "FOO LTD CO" becomes "FOO LTD".
    """
    name = name.rstrip('/')

    name = re.sub(r'\s*&\s*CO\b\.?', '', name, flags=re.IGNORECASE).strip()

    # The optional final '.' is matched once, after the word boundary. Putting the
    # dot inside an alternative (as in 'L\.P\.') would require a word boundary
    # between '.' and the end of the string, which can never hold.
    name = re.sub(r'\b(?:Inc|CO|CORP|PLC|LTD|LIMITED|L\.P)\b\.?$', '', name, flags=re.IGNORECASE).strip()
    return name
def get_ticker_icon_url(ticker: str) -> str:
    """
    Build the URL of the PNG icon for a ticker in the nvstly/icons repository.

    The ticker is upper-cased before it is placed in the URL.
    """
    symbol = ticker.upper()
    return f"https://raw.githubusercontent.com/nvstly/icons/main/ticker_icons/{symbol}.png"
@lru_cache(maxsize=4)
def get_icon_from_ticker(ticker: str) -> Optional[bytes]:
    """
    Download an icon for a given ticker as a PNG image, if available.

    WARNING: This function uses the nvstly/icons repository on GitHub to fetch the icons.
    The icons are not guaranteed to be available for all tickers.

    Args:
        ticker (str): Ticker symbol consisting of alphabetic characters only

    Returns:
        Optional[bytes]: The downloaded icon, or None if the server answers 404

    Raises:
        ValueError: If ticker is not a string or contains non-alphabetic characters
        HTTPStatusError: For any HTTP error status other than 404
    """
    if not isinstance(ticker, str):
        raise ValueError("The ticker must be a valid string.")

    if not ticker.isalpha():
        raise ValueError("The ticker must only contain alphabetic characters.")

    try:
        # Build the URL with the shared helper so the template lives in one place
        return download_file(get_ticker_icon_url(ticker), as_text=False)
    except HTTPStatusError as e:
        # A 404 means there is no icon for this ticker
        if e.response.status_code == 404:
            return None
        raise
def popular_us_stocks():
    """
    Load the packaged ``popular_us_stocks.csv`` file.

    The ``Cik`` column is read as int and used as the index of the
    returned DataFrame.
    """
    stocks = read_csv_from_package('popular_us_stocks.csv', dtype={'Cik': int})
    return stocks.set_index('Cik')
class Exchange(Enum):
    """Exchange names; each member's value and string form is the exchange name."""

    Nasdaq = "Nasdaq"
    NYSE = "NYSE"
    OTC = "OTC"
    CBOE = "CBOE"

    def __str__(self) -> str:
        # Print as the plain exchange name rather than 'Exchange.<member>'
        exchange_name = self.value
        return exchange_name