import json
import os
import re
from enum import Enum
from functools import lru_cache
from io import StringIO
from typing import Any, Dict, List, Optional, Union

import pandas as pd
import pyarrow as pa
from httpx import HTTPStatusError

from edgar.core import get_edgar_data_directory, listify, log
from edgar.httprequests import download_file, download_json
from edgar.reference.data.common import read_csv_from_package, read_parquet_from_package

__all__ = [
    'cusip_ticker_mapping',
    'get_ticker_from_cusip',
    'get_company_tickers',
    'get_icon_from_ticker',
    'find_cik',
    'get_cik_tickers',
    'get_company_ticker_name_exchange',
    'get_companies_by_exchange',
    'popular_us_stocks',
    'get_mutual_fund_tickers',
    'find_mutual_fund_cik',
    'list_all_tickers',
    'find_ticker',
    'find_ticker_safe',
    'get_cik_ticker_lookup',
    'get_company_cik_lookup',
    'get_cik_tickers_from_ticker_txt',
    'ticker_txt_url',
    'company_tickers_json_url',
    'mutual_fund_tickers_url',
    'company_tickers_exchange_url',
    'Exchange',
]

ticker_txt_url = "https://www.sec.gov/include/ticker.txt"
company_tickers_json_url = "https://www.sec.gov/files/company_tickers.json"
mutual_fund_tickers_url = "https://www.sec.gov/files/company_tickers_mf.json"
company_tickers_exchange_url = "https://www.sec.gov/files/company_tickers_exchange.json"


@lru_cache(maxsize=1)
def cusip_ticker_mapping(allow_duplicate_cusips: bool = True) -> pd.DataFrame:
    """
    Load the Cusip to Ticker mapping from the packaged 'ct.pq' parquet file.

    The result is a DataFrame indexed by Cusip. A Cusip can occur more than once;
    pass allow_duplicate_cusips=False to keep only the first occurrence of each.
    The first occurrence of the Cusip will also be most likely to be mapped to a
    Ticker that is linked to a cik.

    Args:
        allow_duplicate_cusips (bool): If False, drop all but the first row per Cusip

    Returns:
        pd.DataFrame: The mapping, indexed by Cusip
    """
    df = read_parquet_from_package('ct.pq').set_index('Cusip')
    if not allow_duplicate_cusips:
        df = df[~df.index.duplicated(keep='first')]
    return df


def load_tickers_from_local() -> Optional[Dict[str, Any]]:
    """
    Load the company tickers JSON from the local "reference" data directory.

    Returns:
        Optional[Dict[str, Any]]: The parsed JSON, or None if the directory or
        the file does not exist
    """
    reference_dir = get_edgar_data_directory() / "reference"
    if not reference_dir.exists():
        return None
    # The local file carries the same base name as the remote one
    company_tickers_file = reference_dir / os.path.basename(company_tickers_json_url)
    if not company_tickers_file.exists():
        return None
    return json.loads(company_tickers_file.read_text())


@lru_cache(maxsize=1)
def get_company_tickers(
        as_dataframe: bool = True,
        clean_name: bool = True,
        clean_suffix: bool = False
) -> Union[pd.DataFrame, pa.Table]:
    """
    Fetch and process company ticker data from SEC.

    When the EDGAR_USE_LOCAL_DATA environment variable is set the local copy is
    tried first; the data is downloaded if no usable local copy is found.

    Args:
        as_dataframe (bool): If True, returns pandas DataFrame; if False, returns pyarrow Table
        clean_name (bool): If True, cleans company names
        clean_suffix (bool): If True, removes common company suffixes

    Returns:
        Union[pd.DataFrame, pa.Table]: Processed company data with columns
        cik, ticker and company

    Raises:
        Exception: Any error raised while loading or processing the data is
        logged and re-raised
    """
    try:
        tickers_json = None
        if os.getenv("EDGAR_USE_LOCAL_DATA"):
            tickers_json = load_tickers_from_local()
        if not tickers_json:
            # No local copy requested or available: download JSON data
            tickers_json = download_json(company_tickers_json_url)

        ciks = []
        tickers = []
        companies = []

        # Process JSON data
        for item in tickers_json.values():
            company_name = item['title']

            # Apply name cleaning if requested
            if clean_name:
                company_name = clean_company_name(company_name)
            if clean_suffix:
                company_name = clean_company_suffix(company_name)

            ciks.append(int(item['cik_str']))
            tickers.append(item['ticker'])
            companies.append(company_name)

        if as_dataframe:
            # Create DataFrame directly from lists
            return pd.DataFrame({
                'cik': ciks,
                'ticker': tickers,
                'company': companies
            })

        # The schema is only needed on the Arrow path
        schema = pa.schema([
            ('cik', pa.int64()),
            ('ticker', pa.string()),
            ('company', pa.string())
        ])
        return pa.Table.from_arrays(
            [
                pa.array(ciks, type=pa.int64()),
                pa.array(tickers, type=pa.string()),
                pa.array(companies, type=pa.string()),
            ],
            schema=schema
        )
    except Exception as e:
        log.error(f"Error fetching company tickers from [{company_tickers_json_url}]: {str(e)}")
        raise


def load_cik_tickers_from_local() -> Optional[str]:
    """
    Load ticker.txt from the local "reference" data directory.

    Returns:
        Optional[str]: The file contents, or None if the directory or the file
        does not exist
    """
    reference_dir = get_edgar_data_directory() / "reference"
    if not reference_dir.exists():
        return None
    tickers_txt_file = reference_dir / os.path.basename(ticker_txt_url)
    if not tickers_txt_file.exists():
        return None
    return tickers_txt_file.read_text()


def get_cik_tickers_from_ticker_txt() -> Optional[pd.DataFrame]:
    """
    Get CIK and ticker data from ticker.txt file.

    When the EDGAR_USE_LOCAL_DATA environment variable is set the local copy is
    tried first; the file is downloaded if no usable local copy is found.

    Returns:
        Optional[pd.DataFrame]: DataFrame with columns ticker (upper-cased) and
        cik, or None if loading or parsing failed (the error is logged)
    """
    try:
        ticker_txt = None
        if os.getenv("EDGAR_USE_LOCAL_DATA"):
            ticker_txt = load_cik_tickers_from_local()
        if not ticker_txt:
            ticker_txt = download_file(ticker_txt_url, as_text=True)

        # The file is tab separated with no header row
        source = StringIO(ticker_txt)
        data = pd.read_csv(source, sep='\t', header=None, names=['ticker', 'cik']).dropna()
        data['ticker'] = data['ticker'].str.upper()
        return data
    except Exception as e:
        log.error(f"Error fetching company tickers from [{ticker_txt_url}]: {str(e)}")
        return None


@lru_cache(maxsize=1)
def get_cik_tickers() -> pd.DataFrame:
    """
    Merge unique records from both sources (ticker.txt and the company tickers JSON).

    Returns:
        pd.DataFrame: DataFrame with columns ticker and cik. If only one source
        is available its data is returned as is.

    Raises:
        Exception: If both data sources are unavailable
    """
    txt_data = get_cik_tickers_from_ticker_txt()
    try:
        json_data = get_company_tickers(clean_name=False, clean_suffix=False)[['ticker', 'cik']]
    except Exception:
        # Best effort: fall back to the ticker.txt data alone
        json_data = None

    if txt_data is None and json_data is None:
        raise Exception("Both data sources are unavailable")
    if txt_data is None:
        return json_data
    if json_data is None:
        return txt_data

    # Merge both dataframes and keep unique records
    merged_data = pd.concat([txt_data, json_data], ignore_index=True)
    return merged_data.drop_duplicates(subset=['ticker', 'cik'])


@lru_cache(maxsize=None)
def list_all_tickers():
    """List all tickers from the merged data"""
    return get_cik_tickers()['ticker'].tolist()


@lru_cache(maxsize=None)
def get_company_cik_lookup():
    """
    Build a dict mapping ticker to CIK from the merged ticker data.

    Every ticker maps to its CIK. The base ticker (the part before '-') is also
    added, but only if that key is not already present, so the first entry
    registered under a base ticker wins.
    """
    df = get_cik_tickers()

    lookup = {}
    for ticker, cik in zip(df['ticker'], df['cik'], strict=False):
        # Add original ticker
        lookup[ticker] = cik

        # Add base ticker (part before '-')
        base_ticker = ticker.split('-')[0]
        if base_ticker not in lookup:
            lookup[base_ticker] = cik

    return lookup


@lru_cache(maxsize=None)
def get_cik_ticker_lookup():
    """Create a mapping of CIK to base ticker symbols.

    For CIKs with multiple tickers, uses the shortest ticker (usually the base symbol).
    On equal length the ticker seen first is kept.
    """
    company_lookup = get_company_cik_lookup()

    cik_to_tickers = {}
    for ticker, cik in company_lookup.items():
        # Prefer the base ticker (without share class)
        base_ticker = ticker.split('-')[0]
        if cik not in cik_to_tickers or len(base_ticker) < len(cik_to_tickers[cik]):
            cik_to_tickers[cik] = base_ticker

    return cik_to_tickers


@lru_cache(maxsize=128)
def find_ticker(cik: Union[int, str]) -> str:
    """Find the ticker symbol for a given CIK. Returns empty string if no ticker is found.

    Args:
        cik: Central Index Key (CIK) as integer or string

    Returns:
        str: Ticker symbol or empty string if not found. A value that cannot be
        converted to an integer after stripping leading zeros also yields an
        empty string.
    """
    try:
        # Ensure cik is an integer
        cik = int(str(cik).lstrip('0'))
        return get_cik_ticker_lookup().get(cik, "")
    except (ValueError, TypeError):
        return ""


def find_ticker_safe(cik: Union[int, str]) -> Optional[str]:
    """Find the ticker symbol for a given CIK without making network calls.

    Returns None if data is not already cached and would require a network call.
    Returns empty string if CIK is found but has no ticker.

    This function is designed for use cases where network calls should be avoided,
    such as in rich display methods that should be fast and not block on I/O.

    Args:
        cik: Central Index Key (CIK) as integer or string

    Returns:
        Optional[str]: Ticker symbol, empty string if no ticker found, or None if
        network call would be required
    """
    try:
        # Only proceed if all the underlying lookups are already cached
        caches_populated = (
            get_cik_ticker_lookup.cache_info().currsize > 0
            and get_company_cik_lookup.cache_info().currsize > 0
            and get_cik_tickers.cache_info().currsize > 0
        )
        if not caches_populated:
            # Not all required data is cached, return None to avoid network calls
            return None

        cik = int(str(cik).lstrip('0'))
        # This should be fast since data is cached
        return get_cik_ticker_lookup().get(cik, "")
    except Exception:
        # Any error (including potential network errors) returns None
        return None


@lru_cache(maxsize=None)
def get_company_ticker_name_exchange():
    """
    Return a DataFrame with columns [cik name ticker exchange]
    """
    data = download_json(company_tickers_exchange_url)
    return pd.DataFrame(data['data'], columns=data['fields'])


def get_companies_by_exchange(exchange: Union[List[str], str]):
    """
    Get companies listed on a specific exchange.

    The comparison is case-insensitive.

    :param exchange: String, like 'Nasdaq' or 'NYSE', or a list of such strings
    :return: DataFrame with companies listed on the specified exchange
             with columns [cik name ticker exchange]
    """
    df = get_company_ticker_name_exchange()
    exchanges = [ex.lower() for ex in listify(exchange)]
    return df[df['exchange'].str.lower().isin(exchanges)].reset_index(drop=True)


@lru_cache(maxsize=None)
def get_mutual_fund_tickers():
    """
    Get mutual fund tickers.
    This returns a dataframe with columns
        cik seriesId classId ticker
    """
    data = download_json(mutual_fund_tickers_url)
    return pd.DataFrame(data['data'], columns=['cik', 'seriesId', 'classId', 'ticker'])


@lru_cache(maxsize=None)
def get_mutual_fund_lookup():
    """Build a dict mapping mutual fund ticker to CIK."""
    df = get_mutual_fund_tickers()
    return dict(zip(df['ticker'], df['cik'], strict=False))


def find_mutual_fund_cik(ticker):
    """
    Find the CIK for a given mutual fund or ETF ticker.

    :param ticker: String, the ticker symbol to look up
    :return: Integer, the CIK for the given ticker, or None if not found
    """
    lookup = get_mutual_fund_lookup()
    return lookup.get(ticker.upper())


def find_company_cik(ticker):
    """
    Find the CIK for a given company ticker.

    The ticker is upper-cased and '.' is replaced by '-' before the lookup.

    :param ticker: String, the ticker symbol to look up
    :return: The CIK for the given ticker, or None if not found
    """
    lookup = get_company_cik_lookup()
    ticker = ticker.upper().replace('.', '-')
    return lookup.get(ticker)


def find_company_ticker(cik: Union[int, str]) -> Union[str, List[str], None]:
    """
    Find the ticker for a given CIK.

    :param cik (int or str): The CIK to look up
    :return Union[str, List[str], None]: A single ticker string if only one ticker is found,
             a list of ticker strings if multiple tickers are found,
             or None if the CIK is not a valid integer or no tickers are found.
    """
    try:
        # Ensure cik is a string without leading zeros, then convert to int
        cik = int(str(cik).lstrip('0'))
    except (ValueError, TypeError):
        return None

    # Get DataFrame of CIK-Ticker mappings
    df = get_cik_tickers()

    # Ensure 'cik' and 'ticker' columns exist
    if 'cik' not in df.columns or 'ticker' not in df.columns:
        return None

    # Filter DataFrame for the given CIK
    ticker_series = df[df['cik'] == cik]['ticker']

    # If no tickers found, return None
    if ticker_series.empty:
        return None

    # Filter out None values from tickers
    tickers = [ticker for ticker in ticker_series.to_numpy() if ticker is not None]

    # Return a single ticker if only one found
    if len(tickers) == 1:
        return tickers[0]
    return tickers


def find_cik(ticker):
    """
    Find the CIK for a given ticker, checking both company and mutual fund/ETF data.

    :param ticker: String, the ticker symbol to look up
    :return: Integer, the CIK for the given ticker, or None if not found
    """
    # First, check company CIKs
    cik = find_company_cik(ticker)
    if cik is not None:
        return cik

    # If not found, check mutual fund/ETF CIKs
    return find_mutual_fund_cik(ticker)


@lru_cache(maxsize=128)
def get_ticker_from_cusip(cusip: str):
    """
    Get the ticker symbol for a given Cusip.

    :param cusip: String, the Cusip to look up
    :return: The ticker for the Cusip (the first one if the Cusip occurs more
             than once), or None if the Cusip is not in the mapping
    """
    data = cusip_ticker_mapping()
    try:
        results = data.loc[cusip]
    except KeyError:
        # Unknown Cusip: report "not found" the same way as an empty result
        return None

    if len(results) == 1:
        return results.iloc[0]
    elif len(results) > 1:
        return results.iloc[0].Ticker
    return None


def clean_company_name(name: str) -> str:
    """
    Remove a trailing slash-delimited uppercase tag from a company name.

    For example "SOME CORP /DE/" becomes "SOME CORP".
    """
    # Regular expression to match unwanted patterns at the end of the company name
    cleaned_name = re.sub(r'[/\\][A-Z]+[/\\]?$', '', name)
    return cleaned_name.strip()


def clean_company_suffix(name: str) -> str:
    """Remove common suffixes from the company name, taking care of special cases."""
    # Remove trailing slashes
    name = name.rstrip('/')
    # Handle cases like "JPMORGAN CHASE & CO" or "ELI LILLY & Co"
    name = re.sub(r'\s*&\s*CO\b\.?', '', name, flags=re.IGNORECASE).strip()
    # Remove one other common suffix at the end, such as "INC", "CORP", "PLC", "LTD" or "LIMITED"
    name = re.sub(r'\b(?:Inc\.?|CO|CORP|PLC|LTD|LIMITED|L\.P\.)\b\.?$', '', name, flags=re.IGNORECASE).strip()
    return name


def get_ticker_icon_url(ticker: str) -> str:
    """
    Get the URL for the icon of a company with the given ticker.
    """
    return f"https://raw.githubusercontent.com/nvstly/icons/main/ticker_icons/{ticker.upper()}.png"


@lru_cache(maxsize=4)
def get_icon_from_ticker(ticker: str) -> Optional[bytes]:
    """
    Download an icon for a given ticker as a PNG image, if available.

    WARNING: This function uses the nvstly/icons repository on GitHub to fetch the icons.
    The icons are not guaranteed to be available for all tickers.

    :param ticker: String, the ticker symbol; must contain only alphabetic characters
    :return: The downloaded icon, or None if the server answers 404
    :raises ValueError: If the ticker is not a string or is not purely alphabetic
    :raises HTTPStatusError: For any HTTP error status other than 404
    """
    if not isinstance(ticker, str):
        raise ValueError("The ticker must be a valid string.")

    if not ticker.isalpha():
        raise ValueError("The ticker must only contain alphabetic characters.")

    try:
        return download_file(get_ticker_icon_url(ticker), as_text=False)
    except HTTPStatusError as e:
        # If the status code is 404, the icon is not available
        if e.response.status_code == 404:
            return None
        raise


def popular_us_stocks():
    """
    Load the packaged 'popular_us_stocks.csv' as a DataFrame indexed by Cik.
    """
    df = (read_csv_from_package('popular_us_stocks.csv', dtype={'Cik': int})
          .set_index('Cik')
          )
    return df


class Exchange(Enum):
    """Exchange names; str() of a member gives its value."""

    Nasdaq = "Nasdaq"
    NYSE = "NYSE"
    OTC = "OTC"
    CBOE = "CBOE"

    def __str__(self):
        return self.value