Initial commit
This commit is contained in:
109
venv/lib/python3.10/site-packages/edgar/reference/__init__.py
Normal file
109
venv/lib/python3.10/site-packages/edgar/reference/__init__.py
Normal file
@@ -0,0 +1,109 @@
|
||||
|
||||
from edgar.reference.company_subsets import (
|
||||
# Classes and Enums
|
||||
CompanySubset,
|
||||
MarketCapTier,
|
||||
PopularityTier,
|
||||
# Core Functions
|
||||
get_all_companies,
|
||||
get_companies_by_exchanges,
|
||||
get_popular_companies,
|
||||
# Industry and State Filtering (Comprehensive Mode)
|
||||
get_companies_by_industry,
|
||||
get_companies_by_state,
|
||||
# Sampling and Filtering
|
||||
get_random_sample,
|
||||
get_stratified_sample,
|
||||
get_top_companies_by_metric,
|
||||
filter_companies,
|
||||
exclude_companies,
|
||||
# Set Operations
|
||||
combine_company_sets,
|
||||
intersect_company_sets,
|
||||
# Convenience Functions - General
|
||||
get_faang_companies,
|
||||
get_tech_giants,
|
||||
get_dow_jones_sample,
|
||||
# Convenience Functions - Industry Specific
|
||||
get_pharmaceutical_companies,
|
||||
get_biotechnology_companies,
|
||||
get_software_companies,
|
||||
get_semiconductor_companies,
|
||||
get_banking_companies,
|
||||
get_investment_companies,
|
||||
get_insurance_companies,
|
||||
get_real_estate_companies,
|
||||
get_oil_gas_companies,
|
||||
get_retail_companies,
|
||||
)
|
||||
from edgar.reference.company_dataset import (
|
||||
get_company_dataset,
|
||||
build_company_dataset_parquet,
|
||||
build_company_dataset_duckdb,
|
||||
is_individual_from_json,
|
||||
to_duckdb,
|
||||
)
|
||||
from edgar.reference.forms import describe_form
|
||||
from edgar.reference.tickers import cusip_ticker_mapping, get_icon_from_ticker, get_ticker_from_cusip
|
||||
|
||||
# A dict of state abbreviations and their full names
|
||||
states = {
|
||||
|
||||
"AL": "Alabama",
|
||||
"AK": "Alaska",
|
||||
"AZ": "Arizona",
|
||||
"AR": "Arkansas",
|
||||
"CA": "California",
|
||||
"CO": "Colorado",
|
||||
"CT": "Connecticut",
|
||||
"DE": "Delaware",
|
||||
"FL": "Florida",
|
||||
"GA": "Georgia",
|
||||
"HI": "Hawaii",
|
||||
"ID": "Idaho",
|
||||
"IL": "Illinois",
|
||||
"IN": "Indiana",
|
||||
"IA": "Iowa",
|
||||
"KS": "Kansas",
|
||||
"KY": "Kentucky",
|
||||
"LA": "Louisiana",
|
||||
"ME": "Maine",
|
||||
"MD": "Maryland",
|
||||
"MA": "Massachusetts",
|
||||
"MI": "Michigan",
|
||||
"MN": "Minnesota",
|
||||
"MS": "Mississippi",
|
||||
"MO": "Missouri",
|
||||
"MT": "Montana",
|
||||
"NE": "Nebraska",
|
||||
"NV": "Nevada",
|
||||
"NH": "New Hampshire",
|
||||
"NJ": "New Jersey",
|
||||
"NM": "New Mexico",
|
||||
"NY": "New York",
|
||||
"NC": "North Carolina",
|
||||
"ND": "North Dakota",
|
||||
"OH": "Ohio",
|
||||
"OK": "Oklahoma",
|
||||
"OR": "Oregon",
|
||||
"PA": "Pennsylvania",
|
||||
"RI": "Rhode Island",
|
||||
"SC": "South Carolina",
|
||||
"SD": "South Dakota",
|
||||
"TN": "Tennessee",
|
||||
"TX": "Texas",
|
||||
"UT": "Utah",
|
||||
"VT": "Vermont",
|
||||
"VA": "Virginia",
|
||||
"WA": "Washington",
|
||||
"WV": "West Virginia",
|
||||
"WI": "Wisconsin",
|
||||
"WY": "Wyoming",
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
111
venv/lib/python3.10/site-packages/edgar/reference/_codes.py
Normal file
111
venv/lib/python3.10/site-packages/edgar/reference/_codes.py
Normal file
@@ -0,0 +1,111 @@
|
||||
ACRONYMS = {
|
||||
"CCC": "CIK Confirmation Code",
|
||||
"CIK": "Central Index Key",
|
||||
"EDGAR": "Electronic Data Gathering, Analysis, and Retrieval",
|
||||
"SEC": "Securities and Exchange Commission",
|
||||
}
|
||||
|
||||
INVESTMENT_CATEGORIES = {
|
||||
"ABS": "Asset-backed securities",
|
||||
"ACMO": "Agency collateralized mortgage obligations",
|
||||
"ACMBS": "Agency debentures and agency strips",
|
||||
"AMBS": "Agency mortgage-backed securities",
|
||||
"UST": " U.S. Treasuries (including strips)",
|
||||
"N/A": "Not applicable"
|
||||
}
|
||||
|
||||
ISO_STATES_AND_OUTLYING_AREAS = {
|
||||
"US-AL": "ALABAMA",
|
||||
"US-AK": "ALASKA",
|
||||
"US-AZ": "ARIZONA",
|
||||
"US-AR": "ARKANSAS",
|
||||
"US-CA": "CALIFORNIA",
|
||||
"US-CO": "COLORADO",
|
||||
"US-CT": "CONNECTICUT",
|
||||
"US-DE": "DELAWARE",
|
||||
"US-DC": "DISTRICT OF COLUMBIA",
|
||||
|
||||
}
|
||||
|
||||
ISO_COUNTRY_CODES = {
|
||||
"AF": " AFGHANISTAN",
|
||||
"AX": "ALAND ISLANDS",
|
||||
"AL": "ALBANIA",
|
||||
"DZ": "ALGERIA",
|
||||
"AS": "AMERICAN SAMOA",
|
||||
"AD": "ANDORRA",
|
||||
"AO": "ANGOLA",
|
||||
"AI": "ANGUILLA",
|
||||
"AQ": "ANTARCTICA",
|
||||
"AG": "ANTIGUA AND BARBUDA",
|
||||
"AR": "ARGENTINA",
|
||||
"AM": "ARMENIA",
|
||||
"AW": "ARUBA",
|
||||
"AU": "AUSTRALIA",
|
||||
"AT": "AUSTRIA",
|
||||
"AZ": "AZERBAIJAN",
|
||||
"BS": "BAHAMAS",
|
||||
"BH": "BAHRAIN",
|
||||
"BD": "BANGLADESH",
|
||||
"BB": "BARBADOS",
|
||||
"BY": "BELARUS",
|
||||
"BE": "BELGIUM",
|
||||
"BZ": "BELIZE",
|
||||
"BJ": "BENIN",
|
||||
"BM": "BERMUDA",
|
||||
"BT": "BHUTAN",
|
||||
"BO": "BOLIVIA (PLURINATIONAL STATE OF)",
|
||||
"BQ": "BONAIRE, SINT EUSTATIUS AND SABA",
|
||||
"BA": "BOSNIA AND HERZEGOVINA",
|
||||
"BW": "BOTSWANA",
|
||||
"BV": "BOUVET ISLAND",
|
||||
"BR": "BRAZIL",
|
||||
"IO": "BRITISH INDIAN OCEAN TERRITORY",
|
||||
"BN": "BRUNEI DARUSSALAM",
|
||||
"BG": "BULGARIA",
|
||||
"BF": "BURKINA FASO",
|
||||
"BI": "BURUNDI",
|
||||
"CV": "CABO VERDE",
|
||||
"KH": "CAMBODIA",
|
||||
"CM": "CAMEROON",
|
||||
"CA": "CANADA",
|
||||
"KY": "CAYMAN ISLANDS",
|
||||
"CF": "CENTRAL AFRICAN REPUBLIC",
|
||||
"TD": "CHAD",
|
||||
"CL": "CHILE",
|
||||
"CN": "CHINA",
|
||||
"CX": "CHRISTMAS ISLAND",
|
||||
"CC": "COCOS (KEELING) ISLANDS",
|
||||
"CO": "COLOMBIA",
|
||||
"KM": "COMOROS",
|
||||
"CG": "CONGO",
|
||||
"CD": "COOK ISLANDS",
|
||||
"CR": "COSTA RICA",
|
||||
"CI": "COTE D'IVOIRE",
|
||||
"HR": "CROATIA",
|
||||
"CU": "CUBA",
|
||||
"CW": "CURACAO",
|
||||
"CY": "CYPRUS",
|
||||
"CZ": "CZECHIA",
|
||||
"DK": "DENMARK",
|
||||
"DJ": "DJIBOUTI",
|
||||
"DM": "DOMINICA",
|
||||
"DO": "DOMINICAN REPUBLIC",
|
||||
"EC": "ECUADOR",
|
||||
"EG": "EGYPT",
|
||||
"SV": "EL SALVADOR",
|
||||
"GQ": "EQUATORIAL GUINEA",
|
||||
"ER": "ERITREA",
|
||||
"EE": "ESTONIA",
|
||||
"ET": "ETHIOPIA",
|
||||
"FK": "FALKLAND ISLANDS (MALVINAS)",
|
||||
"FO": "FAROE ISLANDS",
|
||||
"FJ": "FIJI",
|
||||
"FI": "FINLAND",
|
||||
"FR": "FRANCE",
|
||||
"GF": "FRENCH GUIANA",
|
||||
"PF": "FRENCH POLYNESIA",
|
||||
"TF": "FRENCH SOUTHERN TERRITORIES",
|
||||
"GA": "GABON",
|
||||
"GM": "GAMBIA",
|
||||
}
|
||||
@@ -0,0 +1,606 @@
|
||||
"""
|
||||
Company Dataset Builder for EdgarTools
|
||||
|
||||
Builds high-performance company datasets from SEC submissions data with two output formats:
|
||||
1. PyArrow Parquet (5-20 MB) - Fast filtering with PyArrow compute API
|
||||
2. DuckDB (287 MB) - Optional SQL interface for power users
|
||||
|
||||
Performance:
|
||||
- Build time: ~30 seconds (optimized with orjson + company filtering)
|
||||
- Records: ~562,413 companies (40% individual filers filtered)
|
||||
- Query speed: <1ms (DuckDB) or <100ms (Parquet)
|
||||
|
||||
Example:
|
||||
>>> from edgar.reference import get_company_dataset
|
||||
>>> import pyarrow.compute as pc
|
||||
>>>
|
||||
>>> # Load dataset (builds on first use)
|
||||
>>> companies = get_company_dataset()
|
||||
>>>
|
||||
>>> # Filter pharmaceutical companies
|
||||
>>> pharma = companies.filter(pc.field('sic').between(2834, 2836))
|
||||
>>> print(f"Found {len(pharma)} pharma companies")
|
||||
"""
|
||||
|
||||
from pathlib import Path
|
||||
from typing import Optional, Union
|
||||
import logging
|
||||
|
||||
import pyarrow as pa
|
||||
import pyarrow.parquet as pq
|
||||
from tqdm import tqdm
|
||||
|
||||
from edgar.core import get_edgar_data_directory, log
|
||||
|
||||
# Try to import orjson for performance, fall back to stdlib json
|
||||
try:
|
||||
import orjson
|
||||
|
||||
def load_json(path: Path) -> dict:
|
||||
"""Load JSON file using orjson (1.55x faster)"""
|
||||
return orjson.loads(path.read_bytes())
|
||||
|
||||
JSON_PARSER = "orjson"
|
||||
except ImportError:
|
||||
import json
|
||||
|
||||
def load_json(path: Path) -> dict:
|
||||
"""Load JSON file using stdlib json"""
|
||||
with open(path, 'r', encoding='utf-8') as f:
|
||||
return json.load(f)
|
||||
|
||||
JSON_PARSER = "json (stdlib)"
|
||||
|
||||
|
||||
# Company dataset schema
|
||||
COMPANY_SCHEMA = pa.schema([
|
||||
('cik', pa.string()), # Keep as string to preserve leading zeros
|
||||
('name', pa.string()),
|
||||
('sic', pa.int32()), # Nullable - some companies have no SIC
|
||||
('sic_description', pa.string()),
|
||||
('tickers', pa.string()), # Pipe-delimited (e.g., "AAPL|APPLE")
|
||||
('exchanges', pa.string()), # Pipe-delimited (e.g., "Nasdaq|NYSE")
|
||||
('state_of_incorporation', pa.string()),
|
||||
('state_of_incorporation_description', pa.string()),
|
||||
('fiscal_year_end', pa.string()), # MMDD format
|
||||
('entity_type', pa.string()),
|
||||
('ein', pa.string()),
|
||||
])
|
||||
|
||||
|
||||
def is_individual_from_json(data: dict) -> bool:
|
||||
"""
|
||||
Determine if entity is an individual filer vs a company.
|
||||
|
||||
Uses the same logic as edgar.entity.data:478 (is_individual property).
|
||||
|
||||
Companies typically have:
|
||||
- Tickers or exchanges
|
||||
- State of incorporation
|
||||
- Entity type other than '' or 'other'
|
||||
- Company-specific filings (10-K, 10-Q, 8-K, etc.)
|
||||
|
||||
Args:
|
||||
data: Parsed JSON submission data
|
||||
|
||||
Returns:
|
||||
True if individual filer, False if company
|
||||
|
||||
Example:
|
||||
>>> data = {'cik': '0001318605', 'tickers': ['TSLA']}
|
||||
>>> is_individual_from_json(data)
|
||||
False
|
||||
|
||||
>>> data = {'cik': '0001078519', 'name': 'JOHN DOE'}
|
||||
>>> is_individual_from_json(data)
|
||||
True
|
||||
"""
|
||||
# Has ticker or exchange → company
|
||||
if data.get('tickers') or data.get('exchanges'):
|
||||
return False
|
||||
|
||||
# Has state of incorporation → company (with exceptions)
|
||||
state = data.get('stateOfIncorporation', '')
|
||||
if state and state != '':
|
||||
# Reed Hastings exception (individual with state of incorporation)
|
||||
if data.get('cik') == '0001033331':
|
||||
return True
|
||||
return False
|
||||
|
||||
# Has entity type (not '' or 'other') → company
|
||||
entity_type = data.get('entityType', '')
|
||||
if entity_type and entity_type not in ['', 'other']:
|
||||
return False
|
||||
|
||||
# Files company forms (10-K, 10-Q, etc.) → company
|
||||
filings = data.get('filings', {})
|
||||
if filings:
|
||||
recent = filings.get('recent', {})
|
||||
forms = recent.get('form', [])
|
||||
company_forms = {'10-K', '10-Q', '8-K', '10-K/A', '10-Q/A', '20-F', 'S-1'}
|
||||
if any(form in company_forms for form in forms):
|
||||
return False
|
||||
|
||||
# Default: individual
|
||||
return True
|
||||
|
||||
|
||||
def build_company_dataset_parquet(
|
||||
submissions_dir: Path,
|
||||
output_path: Path,
|
||||
filter_individuals: bool = True,
|
||||
show_progress: bool = True
|
||||
) -> pa.Table:
|
||||
"""
|
||||
Build PyArrow Parquet dataset from submissions directory (companies only).
|
||||
|
||||
This function processes all CIK*.json files in the submissions directory,
|
||||
filters out individual filers (optional), and creates a compressed Parquet file.
|
||||
|
||||
Performance:
|
||||
- ~30 seconds for 562,413 companies (with orjson + filtering)
|
||||
- Output size: ~5-20 MB (zstd compressed)
|
||||
- Memory usage: ~100-200 MB during build
|
||||
|
||||
Args:
|
||||
submissions_dir: Directory containing CIK*.json files
|
||||
output_path: Where to save the .pq file
|
||||
filter_individuals: Skip individual filers (default: True)
|
||||
show_progress: Show progress bar (default: True)
|
||||
|
||||
Returns:
|
||||
PyArrow Table with company data
|
||||
|
||||
Raises:
|
||||
FileNotFoundError: If submissions_dir doesn't exist
|
||||
|
||||
Example:
|
||||
>>> from pathlib import Path
|
||||
>>> submissions_dir = Path.home() / '.edgar' / 'submissions'
|
||||
>>> output_path = Path.home() / '.edgar' / 'companies.pq'
|
||||
>>> table = build_company_dataset_parquet(submissions_dir, output_path)
|
||||
>>> print(f"Built dataset: {len(table):,} companies")
|
||||
"""
|
||||
if not submissions_dir.exists():
|
||||
raise FileNotFoundError(
|
||||
f"Submissions directory not found: {submissions_dir}\n\n"
|
||||
"Please download submissions data first:\n"
|
||||
" from edgar.storage import download_submissions\n"
|
||||
" download_submissions()\n"
|
||||
)
|
||||
|
||||
# Get all submission JSON files
|
||||
json_files = list(submissions_dir.glob("CIK*.json"))
|
||||
if len(json_files) == 0:
|
||||
raise FileNotFoundError(
|
||||
f"No submission files found in: {submissions_dir}\n"
|
||||
"Expected CIK*.json files"
|
||||
)
|
||||
|
||||
log.info(f"Building company dataset from {len(json_files):,} submission files")
|
||||
log.info(f"Using JSON parser: {JSON_PARSER}")
|
||||
|
||||
companies = []
|
||||
errors = 0
|
||||
individuals_skipped = 0
|
||||
|
||||
# Process each file with progress bar
|
||||
iterator = tqdm(json_files, desc="Processing submissions", disable=not show_progress)
|
||||
|
||||
for json_file in iterator:
|
||||
try:
|
||||
data = load_json(json_file)
|
||||
|
||||
# Skip individuals if filtering enabled
|
||||
if filter_individuals and is_individual_from_json(data):
|
||||
individuals_skipped += 1
|
||||
continue
|
||||
|
||||
# Extract SIC (handle empty strings)
|
||||
sic = data.get('sic')
|
||||
sic_int = int(sic) if sic and sic != '' else None
|
||||
|
||||
# Extract tickers and exchanges (filter None values)
|
||||
tickers = data.get('tickers', [])
|
||||
exchanges = data.get('exchanges', [])
|
||||
|
||||
companies.append({
|
||||
'cik': data.get('cik'),
|
||||
'name': data.get('name'),
|
||||
'sic': sic_int,
|
||||
'sic_description': data.get('sicDescription'),
|
||||
'tickers': '|'.join(filter(None, tickers)) if tickers else None,
|
||||
'exchanges': '|'.join(filter(None, exchanges)) if exchanges else None,
|
||||
'state_of_incorporation': data.get('stateOfIncorporation'),
|
||||
'state_of_incorporation_description': data.get('stateOfIncorporationDescription'),
|
||||
'fiscal_year_end': data.get('fiscalYearEnd'),
|
||||
'entity_type': data.get('entityType'),
|
||||
'ein': data.get('ein'),
|
||||
})
|
||||
|
||||
except Exception as e:
|
||||
errors += 1
|
||||
log.debug(f"Error processing {json_file.name}: {e}")
|
||||
continue
|
||||
|
||||
# Log statistics
|
||||
log.info(f"Processed {len(json_files):,} files:")
|
||||
log.info(f" - Companies: {len(companies):,}")
|
||||
if filter_individuals:
|
||||
log.info(f" - Individuals skipped: {individuals_skipped:,}")
|
||||
if errors > 0:
|
||||
log.warning(f" - Errors: {errors:,}")
|
||||
|
||||
# Create PyArrow Table
|
||||
table = pa.Table.from_pylist(companies, schema=COMPANY_SCHEMA)
|
||||
|
||||
# Write to Parquet with compression
|
||||
output_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
pq.write_table(
|
||||
table,
|
||||
output_path,
|
||||
compression='zstd',
|
||||
compression_level=9,
|
||||
use_dictionary=True
|
||||
)
|
||||
|
||||
file_size_mb = output_path.stat().st_size / (1024 * 1024)
|
||||
log.info(f"Saved Parquet file: {output_path} ({file_size_mb:.1f} MB)")
|
||||
|
||||
return table
|
||||
|
||||
|
||||
def build_company_dataset_duckdb(
|
||||
submissions_dir: Path,
|
||||
output_path: Path,
|
||||
filter_individuals: bool = True,
|
||||
create_indexes: bool = True,
|
||||
show_progress: bool = True
|
||||
) -> None:
|
||||
"""
|
||||
Build DuckDB database from submissions directory (companies only).
|
||||
|
||||
This function creates a DuckDB database with a 'companies' table and
|
||||
optional indexes on key columns for fast querying.
|
||||
|
||||
Performance:
|
||||
- ~30 seconds for 562,413 companies (with orjson + filtering)
|
||||
- Output size: ~287 MB
|
||||
- Query speed: <1ms with indexes
|
||||
|
||||
Args:
|
||||
submissions_dir: Directory containing CIK*.json files
|
||||
output_path: Where to save the .duckdb file
|
||||
filter_individuals: Skip individual filers (default: True)
|
||||
create_indexes: Create indexes on cik, sic, name (default: True)
|
||||
show_progress: Show progress bar (default: True)
|
||||
|
||||
Raises:
|
||||
FileNotFoundError: If submissions_dir doesn't exist
|
||||
ImportError: If duckdb package not installed
|
||||
|
||||
Example:
|
||||
>>> from pathlib import Path
|
||||
>>> submissions_dir = Path.home() / '.edgar' / 'submissions'
|
||||
>>> output_path = Path.home() / '.edgar' / 'companies.duckdb'
|
||||
>>> build_company_dataset_duckdb(submissions_dir, output_path)
|
||||
>>>
|
||||
>>> import duckdb
|
||||
>>> con = duckdb.connect(str(output_path))
|
||||
>>> result = con.execute("SELECT COUNT(*) FROM companies").fetchone()
|
||||
>>> print(f"Companies: {result[0]:,}")
|
||||
"""
|
||||
try:
|
||||
import duckdb
|
||||
except ImportError:
|
||||
raise ImportError(
|
||||
"DuckDB export requires duckdb package.\n"
|
||||
"Install with: pip install duckdb"
|
||||
)
|
||||
|
||||
if not submissions_dir.exists():
|
||||
raise FileNotFoundError(
|
||||
f"Submissions directory not found: {submissions_dir}\n\n"
|
||||
"Please download submissions data first:\n"
|
||||
" from edgar.storage import download_submissions\n"
|
||||
" download_submissions()\n"
|
||||
)
|
||||
|
||||
# Get all submission JSON files
|
||||
json_files = list(submissions_dir.glob("CIK*.json"))
|
||||
if len(json_files) == 0:
|
||||
raise FileNotFoundError(
|
||||
f"No submission files found in: {submissions_dir}\n"
|
||||
"Expected CIK*.json files"
|
||||
)
|
||||
|
||||
log.info(f"Building DuckDB database from {len(json_files):,} submission files")
|
||||
log.info(f"Using JSON parser: {JSON_PARSER}")
|
||||
|
||||
companies = []
|
||||
errors = 0
|
||||
individuals_skipped = 0
|
||||
|
||||
# Process each file with progress bar
|
||||
iterator = tqdm(json_files, desc="Processing submissions", disable=not show_progress)
|
||||
|
||||
for json_file in iterator:
|
||||
try:
|
||||
data = load_json(json_file)
|
||||
|
||||
# Skip individuals if filtering enabled
|
||||
if filter_individuals and is_individual_from_json(data):
|
||||
individuals_skipped += 1
|
||||
continue
|
||||
|
||||
# Extract SIC (handle empty strings)
|
||||
sic = data.get('sic')
|
||||
sic_int = int(sic) if sic and sic != '' else None
|
||||
|
||||
# Extract tickers and exchanges (filter None values)
|
||||
tickers = data.get('tickers', [])
|
||||
exchanges = data.get('exchanges', [])
|
||||
|
||||
companies.append({
|
||||
'cik': data.get('cik'),
|
||||
'name': data.get('name'),
|
||||
'sic': sic_int,
|
||||
'sic_description': data.get('sicDescription'),
|
||||
'tickers': '|'.join(filter(None, tickers)) if tickers else None,
|
||||
'exchanges': '|'.join(filter(None, exchanges)) if exchanges else None,
|
||||
'state_of_incorporation': data.get('stateOfIncorporation'),
|
||||
'state_of_incorporation_description': data.get('stateOfIncorporationDescription'),
|
||||
'fiscal_year_end': data.get('fiscalYearEnd'),
|
||||
'entity_type': data.get('entityType'),
|
||||
'ein': data.get('ein'),
|
||||
})
|
||||
|
||||
except Exception as e:
|
||||
errors += 1
|
||||
log.debug(f"Error processing {json_file.name}: {e}")
|
||||
continue
|
||||
|
||||
# Log statistics
|
||||
log.info(f"Processed {len(json_files):,} files:")
|
||||
log.info(f" - Companies: {len(companies):,}")
|
||||
if filter_individuals:
|
||||
log.info(f" - Individuals skipped: {individuals_skipped:,}")
|
||||
if errors > 0:
|
||||
log.warning(f" - Errors: {errors:,}")
|
||||
|
||||
# Create DuckDB database
|
||||
import pandas as pd
|
||||
|
||||
output_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
con = duckdb.connect(str(output_path))
|
||||
|
||||
# Create table from DataFrame
|
||||
df = pd.DataFrame(companies)
|
||||
con.execute("CREATE TABLE companies AS SELECT * FROM df")
|
||||
|
||||
# Create indexes
|
||||
if create_indexes:
|
||||
log.info("Creating indexes...")
|
||||
con.execute("CREATE INDEX idx_cik ON companies(cik)")
|
||||
con.execute("CREATE INDEX idx_sic ON companies(sic)")
|
||||
con.execute("CREATE INDEX idx_name ON companies(name)")
|
||||
|
||||
# Add metadata table
|
||||
con.execute("""
|
||||
CREATE TABLE metadata AS
|
||||
SELECT
|
||||
CURRENT_TIMESTAMP as created_at,
|
||||
COUNT(*) as total_companies,
|
||||
COUNT(DISTINCT sic) as unique_sic_codes,
|
||||
COUNT(DISTINCT CASE WHEN tickers IS NOT NULL THEN 1 END) as companies_with_tickers,
|
||||
COUNT(DISTINCT CASE WHEN exchanges IS NOT NULL THEN 1 END) as companies_with_exchanges
|
||||
FROM companies
|
||||
""")
|
||||
|
||||
con.close()
|
||||
|
||||
file_size_mb = output_path.stat().st_size / (1024 * 1024)
|
||||
log.info(f"Saved DuckDB database: {output_path} ({file_size_mb:.1f} MB)")
|
||||
|
||||
|
||||
def load_company_dataset_parquet(parquet_path: Path) -> pa.Table:
|
||||
"""
|
||||
Load company dataset from Parquet file.
|
||||
|
||||
This is a simple wrapper around pyarrow.parquet.read_table() with
|
||||
logging for consistency.
|
||||
|
||||
Performance: <100ms for typical dataset
|
||||
|
||||
Args:
|
||||
parquet_path: Path to .pq file
|
||||
|
||||
Returns:
|
||||
PyArrow Table with company data
|
||||
|
||||
Example:
|
||||
>>> from pathlib import Path
|
||||
>>> path = Path.home() / '.edgar' / 'companies.pq'
|
||||
>>> companies = load_company_dataset_parquet(path)
|
||||
>>> print(f"Loaded {len(companies):,} companies")
|
||||
"""
|
||||
if not parquet_path.exists():
|
||||
raise FileNotFoundError(f"Parquet file not found: {parquet_path}")
|
||||
|
||||
table = pq.read_table(parquet_path)
|
||||
log.debug(f"Loaded {len(table):,} companies from {parquet_path}")
|
||||
|
||||
return table
|
||||
|
||||
|
||||
def to_duckdb(
|
||||
parquet_path: Path,
|
||||
duckdb_path: Path,
|
||||
create_indexes: bool = True
|
||||
) -> None:
|
||||
"""
|
||||
Convert Parquet dataset to DuckDB database.
|
||||
|
||||
This provides an easy way to export the Parquet dataset to DuckDB
|
||||
for users who want SQL query capabilities.
|
||||
|
||||
Performance: <5 seconds for typical dataset
|
||||
|
||||
Args:
|
||||
parquet_path: Path to source .pq file
|
||||
duckdb_path: Path to output .duckdb file
|
||||
create_indexes: Create indexes on key columns (default: True)
|
||||
|
||||
Example:
|
||||
>>> from pathlib import Path
|
||||
>>> parquet_path = Path.home() / '.edgar' / 'companies.pq'
|
||||
>>> duckdb_path = Path.home() / '.edgar' / 'companies.duckdb'
|
||||
>>> to_duckdb(parquet_path, duckdb_path)
|
||||
>>>
|
||||
>>> import duckdb
|
||||
>>> con = duckdb.connect(str(duckdb_path))
|
||||
>>> result = con.execute(
|
||||
... "SELECT * FROM companies WHERE sic = 2834"
|
||||
... ).fetchdf()
|
||||
"""
|
||||
try:
|
||||
import duckdb
|
||||
except ImportError:
|
||||
raise ImportError(
|
||||
"DuckDB export requires duckdb package.\n"
|
||||
"Install with: pip install duckdb"
|
||||
)
|
||||
|
||||
if not parquet_path.exists():
|
||||
raise FileNotFoundError(f"Parquet file not found: {parquet_path}")
|
||||
|
||||
log.info(f"Converting Parquet to DuckDB: {parquet_path} -> {duckdb_path}")
|
||||
|
||||
# Read Parquet file and convert to pandas
|
||||
table = pq.read_table(parquet_path)
|
||||
import pandas as pd
|
||||
df = table.to_pandas()
|
||||
|
||||
# Create DuckDB database
|
||||
duckdb_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
con = duckdb.connect(str(duckdb_path))
|
||||
|
||||
# Create table from DataFrame
|
||||
con.execute("CREATE TABLE companies AS SELECT * FROM df")
|
||||
|
||||
# Create indexes
|
||||
if create_indexes:
|
||||
log.info("Creating indexes...")
|
||||
con.execute("CREATE INDEX idx_cik ON companies(cik)")
|
||||
con.execute("CREATE INDEX idx_sic ON companies(sic)")
|
||||
con.execute("CREATE INDEX idx_name ON companies(name)")
|
||||
|
||||
# Add metadata
|
||||
con.execute("""
|
||||
CREATE TABLE metadata AS
|
||||
SELECT
|
||||
CURRENT_TIMESTAMP as created_at,
|
||||
COUNT(*) as total_companies,
|
||||
COUNT(DISTINCT sic) as unique_sic_codes,
|
||||
COUNT(DISTINCT CASE WHEN tickers IS NOT NULL THEN 1 END) as companies_with_tickers,
|
||||
COUNT(DISTINCT CASE WHEN exchanges IS NOT NULL THEN 1 END) as companies_with_exchanges
|
||||
FROM companies
|
||||
""")
|
||||
|
||||
con.close()
|
||||
|
||||
file_size_mb = duckdb_path.stat().st_size / (1024 * 1024)
|
||||
log.info(f"Exported to DuckDB: {duckdb_path} ({file_size_mb:.1f} MB)")
|
||||
|
||||
|
||||
# In-memory cache for dataset
|
||||
_CACHE = {}
|
||||
|
||||
|
||||
def get_company_dataset(rebuild: bool = False) -> pa.Table:
|
||||
"""
|
||||
Get company dataset, building from submissions if needed.
|
||||
|
||||
This function checks for a cached dataset at ~/.edgar/companies.pq.
|
||||
If not found, it automatically builds the dataset from submissions data.
|
||||
|
||||
On first use, this will take ~30 seconds to build the dataset. Subsequent
|
||||
calls load from cache in <100ms.
|
||||
|
||||
Args:
|
||||
rebuild: Force rebuild even if cache exists (default: False)
|
||||
|
||||
Returns:
|
||||
PyArrow Table with company data (~562,413 companies)
|
||||
|
||||
Raises:
|
||||
FileNotFoundError: If submissions directory not found or incomplete
|
||||
|
||||
Performance:
|
||||
- First use: ~30 seconds (builds dataset)
|
||||
- Cached: <100ms (loads from disk)
|
||||
- Memory: ~20-50 MB
|
||||
|
||||
Example:
|
||||
>>> from edgar.reference import get_company_dataset
|
||||
>>> import pyarrow.compute as pc
|
||||
>>>
|
||||
>>> # First call builds dataset (takes ~30s)
|
||||
>>> companies = get_company_dataset()
|
||||
>>> print(f"Loaded {len(companies):,} companies")
|
||||
>>>
|
||||
>>> # Subsequent calls are fast (<100ms)
|
||||
>>> companies = get_company_dataset()
|
||||
>>>
|
||||
>>> # Filter pharmaceutical companies (SIC 2834-2836)
|
||||
>>> pharma = companies.filter(
|
||||
... pc.field('sic').between(2834, 2836)
|
||||
... )
|
||||
>>> print(f"Found {len(pharma)} pharma companies")
|
||||
>>>
|
||||
>>> # Filter by exchange
|
||||
>>> nasdaq = companies.filter(
|
||||
... pc.field('exchanges').contains('Nasdaq')
|
||||
... )
|
||||
>>>
|
||||
>>> # Force rebuild with latest data
|
||||
>>> companies = get_company_dataset(rebuild=True)
|
||||
"""
|
||||
# Check in-memory cache first
|
||||
if not rebuild and 'companies' in _CACHE:
|
||||
return _CACHE['companies']
|
||||
|
||||
# Check disk cache
|
||||
cache_path = get_edgar_data_directory() / 'companies.pq'
|
||||
|
||||
if cache_path.exists() and not rebuild:
|
||||
# Load from cache
|
||||
log.info(f"Loading company dataset from cache: {cache_path}")
|
||||
table = load_company_dataset_parquet(cache_path)
|
||||
_CACHE['companies'] = table
|
||||
return table
|
||||
|
||||
# Need to build dataset
|
||||
log.info("Building company dataset from submissions (this may take ~30 seconds)...")
|
||||
|
||||
submissions_dir = get_edgar_data_directory() / 'submissions'
|
||||
if not submissions_dir.exists() or len(list(submissions_dir.glob('CIK*.json'))) < 100000:
|
||||
raise FileNotFoundError(
|
||||
f"Submissions directory not found or incomplete: {submissions_dir}\n\n"
|
||||
"Please download submissions data first:\n"
|
||||
" from edgar.storage import download_submissions\n"
|
||||
" download_submissions()\n\n"
|
||||
"This is a one-time download (~500 MB compressed)."
|
||||
)
|
||||
|
||||
# Build dataset
|
||||
table = build_company_dataset_parquet(
|
||||
submissions_dir,
|
||||
cache_path,
|
||||
filter_individuals=True
|
||||
)
|
||||
|
||||
log.info(f"✅ Built dataset: {len(table):,} companies, cached at {cache_path}")
|
||||
|
||||
_CACHE['companies'] = table
|
||||
return table
|
||||
@@ -0,0 +1,991 @@
|
||||
"""
|
||||
Company subset selection utilities for analysis and learning tasks.
|
||||
|
||||
This module provides flexible ways to create subsets of companies from SEC reference data
|
||||
for educational, research, and analysis purposes. It offers exchange-based selection,
|
||||
popularity-based filtering, sampling capabilities, and composition utilities.
|
||||
|
||||
Key features:
|
||||
- Exchange-based selection (NYSE, NASDAQ, OTC, CBOE)
|
||||
- Popularity-based selection (popular stocks, market cap tiers)
|
||||
- Sampling capabilities (random, stratified, top N)
|
||||
- Filtering and combination utilities
|
||||
- Consistent DataFrame output format
|
||||
|
||||
All functions return a standardized DataFrame with columns: ['cik', 'ticker', 'name', 'exchange']
|
||||
"""
|
||||
|
||||
from enum import Enum
|
||||
from functools import lru_cache
|
||||
from typing import Callable, List, Optional, Union
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from edgar.core import log
|
||||
from edgar.reference.tickers import get_company_ticker_name_exchange, popular_us_stocks
|
||||
|
||||
__all__ = [
|
||||
# Classes and Enums
|
||||
'CompanySubset',
|
||||
'MarketCapTier',
|
||||
'PopularityTier',
|
||||
# Core Functions
|
||||
'get_all_companies',
|
||||
'get_companies_by_exchanges',
|
||||
'get_popular_companies',
|
||||
# Industry and State Filtering (Comprehensive Mode)
|
||||
'get_companies_by_industry',
|
||||
'get_companies_by_state',
|
||||
# Sampling and Filtering
|
||||
'get_random_sample',
|
||||
'get_stratified_sample',
|
||||
'get_top_companies_by_metric',
|
||||
'filter_companies',
|
||||
'exclude_companies',
|
||||
# Set Operations
|
||||
'combine_company_sets',
|
||||
'intersect_company_sets',
|
||||
# Convenience Functions - General
|
||||
'get_faang_companies',
|
||||
'get_tech_giants',
|
||||
'get_dow_jones_sample',
|
||||
# Convenience Functions - Industry Specific
|
||||
'get_pharmaceutical_companies',
|
||||
'get_biotechnology_companies',
|
||||
'get_software_companies',
|
||||
'get_semiconductor_companies',
|
||||
'get_banking_companies',
|
||||
'get_investment_companies',
|
||||
'get_insurance_companies',
|
||||
'get_real_estate_companies',
|
||||
'get_oil_gas_companies',
|
||||
'get_retail_companies',
|
||||
]
|
||||
|
||||
|
||||
class MarketCapTier(Enum):
|
||||
"""Market cap tiers for company classification."""
|
||||
LARGE_CAP = "large_cap" # Usually > $10B
|
||||
MID_CAP = "mid_cap" # Usually $2B - $10B
|
||||
SMALL_CAP = "small_cap" # Usually $300M - $2B
|
||||
MICRO_CAP = "micro_cap" # Usually < $300M
|
||||
|
||||
|
||||
class PopularityTier(Enum):
|
||||
"""Popularity tiers based on trading activity and recognition."""
|
||||
MEGA_CAP = "mega_cap" # Top 10 most valuable companies
|
||||
POPULAR = "popular" # Popular stocks list
|
||||
MAINSTREAM = "mainstream" # Well-known companies
|
||||
EMERGING = "emerging" # Smaller but notable companies
|
||||
|
||||
|
||||
class CompanySubset:
|
||||
"""
|
||||
Fluent interface for building company subsets with chainable operations.
|
||||
|
||||
Example:
|
||||
# Get 50 random NYSE companies excluding financial sector
|
||||
companies = (CompanySubset()
|
||||
.from_exchange('NYSE')
|
||||
.exclude_tickers(['JPM', 'GS', 'C'])
|
||||
.sample(50)
|
||||
.get())
|
||||
|
||||
# Get pharmaceutical companies with comprehensive metadata
|
||||
pharma = (CompanySubset(use_comprehensive=True)
|
||||
.from_industry(sic_range=(2834, 2836))
|
||||
.sample(100)
|
||||
.get())
|
||||
"""
|
||||
|
||||
def __init__(self, companies: Optional[pd.DataFrame] = None, use_comprehensive: bool = False):
|
||||
"""
|
||||
Initialize with optional starting dataset.
|
||||
|
||||
Args:
|
||||
companies: Optional DataFrame to start with. If None, loads from get_all_companies()
|
||||
use_comprehensive: If True and companies is None, load comprehensive dataset
|
||||
with rich metadata (SIC, state, entity type, etc.)
|
||||
"""
|
||||
if companies is not None:
|
||||
self._companies = companies
|
||||
else:
|
||||
self._companies = get_all_companies(use_comprehensive=use_comprehensive)
|
||||
self._use_comprehensive = use_comprehensive
|
||||
|
||||
def from_exchange(self, exchanges: Union[str, List[str]]) -> 'CompanySubset':
|
||||
"""Filter companies by exchange(s)."""
|
||||
self._companies = get_companies_by_exchanges(exchanges)
|
||||
return self
|
||||
|
||||
def from_popular(self, tier: Optional[PopularityTier] = None) -> 'CompanySubset':
|
||||
"""Filter to popular companies."""
|
||||
self._companies = get_popular_companies(tier)
|
||||
return self
|
||||
|
||||
def from_industry(
|
||||
self,
|
||||
sic: Optional[Union[int, List[int]]] = None,
|
||||
sic_range: Optional[tuple[int, int]] = None,
|
||||
sic_description_contains: Optional[str] = None
|
||||
) -> 'CompanySubset':
|
||||
"""
|
||||
Filter companies by industry (SIC code).
|
||||
|
||||
Automatically enables comprehensive mode to access industry metadata.
|
||||
|
||||
Args:
|
||||
sic: Single SIC code or list of SIC codes to match exactly
|
||||
sic_range: Tuple of (min_sic, max_sic) for range filtering
|
||||
sic_description_contains: String to search within SIC description
|
||||
|
||||
Returns:
|
||||
CompanySubset with industry filter applied
|
||||
|
||||
Example:
|
||||
>>> # Pharmaceutical companies
|
||||
>>> pharma = CompanySubset().from_industry(sic=2834)
|
||||
|
||||
>>> # Biotech sector
|
||||
>>> biotech = CompanySubset().from_industry(sic_range=(2833, 2836))
|
||||
"""
|
||||
self._companies = get_companies_by_industry(
|
||||
sic=sic,
|
||||
sic_range=sic_range,
|
||||
sic_description_contains=sic_description_contains
|
||||
)
|
||||
self._use_comprehensive = True
|
||||
return self
|
||||
|
||||
def from_state(self, states: Union[str, List[str]]) -> 'CompanySubset':
|
||||
"""
|
||||
Filter companies by state of incorporation.
|
||||
|
||||
Automatically enables comprehensive mode to access state metadata.
|
||||
|
||||
Args:
|
||||
states: Single state code or list of state codes (e.g., 'DE', 'CA')
|
||||
|
||||
Returns:
|
||||
CompanySubset with state filter applied
|
||||
|
||||
Example:
|
||||
>>> # Delaware corporations
|
||||
>>> de_corps = CompanySubset().from_state('DE')
|
||||
|
||||
>>> # Delaware or Nevada corporations
|
||||
>>> de_nv = CompanySubset().from_state(['DE', 'NV'])
|
||||
"""
|
||||
self._companies = get_companies_by_state(states)
|
||||
self._use_comprehensive = True
|
||||
return self
|
||||
|
||||
def filter_by(self, condition: Callable[[pd.DataFrame], pd.DataFrame]) -> 'CompanySubset':
|
||||
"""Apply custom filter function."""
|
||||
self._companies = condition(self._companies)
|
||||
return self
|
||||
|
||||
def exclude_tickers(self, tickers: List[str]) -> 'CompanySubset':
|
||||
"""Exclude specific tickers."""
|
||||
self._companies = exclude_companies(self._companies, tickers)
|
||||
return self
|
||||
|
||||
def include_tickers(self, tickers: List[str]) -> 'CompanySubset':
|
||||
"""Include only specific tickers."""
|
||||
self._companies = filter_companies(self._companies, ticker_list=tickers)
|
||||
return self
|
||||
|
||||
def sample(self, n: int, random_state: Optional[int] = None) -> 'CompanySubset':
|
||||
"""Take random sample of n companies."""
|
||||
self._companies = get_random_sample(self._companies, n, random_state)
|
||||
return self
|
||||
|
||||
def top(self, n: int, by: str = 'name') -> 'CompanySubset':
|
||||
"""Take top n companies by specified column."""
|
||||
self._companies = get_top_companies_by_metric(self._companies, n, by)
|
||||
return self
|
||||
|
||||
def combine_with(self, other: 'CompanySubset') -> 'CompanySubset':
|
||||
"""Combine with another subset (union)."""
|
||||
self._companies = combine_company_sets([self._companies, other.get()])
|
||||
return self
|
||||
|
||||
def intersect_with(self, other: 'CompanySubset') -> 'CompanySubset':
|
||||
"""Intersect with another subset."""
|
||||
self._companies = intersect_company_sets([self._companies, other.get()])
|
||||
return self
|
||||
|
||||
def get(self) -> pd.DataFrame:
|
||||
"""Get the final DataFrame."""
|
||||
return self._companies.copy()
|
||||
|
||||
def __len__(self) -> int:
|
||||
"""Return number of companies in subset."""
|
||||
return len(self._companies)
|
||||
|
||||
def __repr__(self) -> str:
|
||||
"""String representation showing count and sample."""
|
||||
count = len(self._companies)
|
||||
if count == 0:
|
||||
return "CompanySubset(empty)"
|
||||
|
||||
sample_size = min(3, count)
|
||||
sample_tickers = self._companies['ticker'].head(sample_size).tolist()
|
||||
sample_str = ', '.join(sample_tickers)
|
||||
|
||||
if count > sample_size:
|
||||
sample_str += f", ... +{count - sample_size} more"
|
||||
|
||||
return f"CompanySubset({count} companies: {sample_str})"
|
||||
|
||||
|
||||
def _get_comprehensive_companies() -> pd.DataFrame:
    """
    Load the comprehensive company dataset from the company_dataset module.

    Loads the full SEC submissions dataset (~562K companies per its docs) with
    rich metadata: SIC codes, state of incorporation, entity types, and more.

    Returns:
        DataFrame with the extended schema:
        ['cik', 'ticker', 'name', 'exchange', 'sic', 'sic_description',
         'state_of_incorporation', 'state_of_incorporation_description',
         'fiscal_year_end', 'entity_type', 'ein']

    Note:
        - First call may take ~30 seconds to build the dataset
        - Subsequent calls use a cached Parquet file (<100ms load time)
        - Primary ticker/exchange come from the first entry of the
          pipe-delimited 'tickers'/'exchanges' source fields
        - On any failure, logs the error and returns an empty frame with the
          same schema instead of raising
    """
    extended_schema = [
        'cik', 'ticker', 'name', 'exchange',
        'sic', 'sic_description',
        'state_of_incorporation', 'state_of_incorporation_description',
        'fiscal_year_end', 'entity_type', 'ein'
    ]

    try:
        from edgar.reference.company_dataset import get_company_dataset

        # PyArrow Table -> pandas for the filtering done elsewhere in this module.
        df = get_company_dataset().to_pandas()

        def _first_entry(raw):
            """First element of a pipe-delimited string, or None if missing/empty."""
            if raw is None or pd.isna(raw):
                return None
            pieces = str(raw).split('|')
            return pieces[0] if pieces and pieces[0] else None

        # Collapse multi-listing fields down to a single primary value.
        df['ticker'] = df['tickers'].apply(_first_entry)
        df['exchange'] = df['exchanges'].apply(_first_entry)
        df = df.drop(columns=['tickers', 'exchanges'])

        # Standard column order: core columns first, metadata after.
        return df[extended_schema]

    except Exception as e:
        log.error(f"Error fetching comprehensive company data: {e}")
        # Degrade gracefully: empty frame, same extended schema.
        return pd.DataFrame(columns=extended_schema)
|
||||
|
||||
|
||||
@lru_cache(maxsize=2)
def get_all_companies(use_comprehensive: bool = False) -> pd.DataFrame:
    """
    Get all companies from SEC reference data in a standardized format.

    Cached per flag value (maxsize=2 covers both modes), so repeat calls are
    cheap. NOTE(review): the cached DataFrame is returned directly — callers
    are expected not to mutate it in place.

    Args:
        use_comprehensive: If True, load the comprehensive dataset (~562K
            companies) with rich metadata (SIC, state, entity type, etc.).
            If False (default), load the ticker-only dataset (~13K companies).

    Returns:
        DataFrame with columns ['cik', 'ticker', 'name', 'exchange'].
        With use_comprehensive=True, additionally:
        ['sic', 'sic_description', 'state_of_incorporation',
         'state_of_incorporation_description', 'fiscal_year_end',
         'entity_type', 'ein']

    Note:
        - The default keeps backward compatibility
        - Comprehensive mode adds ~30 seconds of build time on first call
        - On failure in standard mode, returns an empty 4-column frame

    Example:
        >>> companies = get_all_companies()            # ~13K rows, fast
        >>> full = get_all_companies(use_comprehensive=True)  # ~562K rows
        >>> 'sic' in full.columns
        True
    """
    if use_comprehensive:
        return _get_comprehensive_companies()

    try:
        base = get_company_ticker_name_exchange().copy()
        # Enforce the module's standard column order.
        return base[['cik', 'ticker', 'name', 'exchange']]
    except Exception as e:
        log.error(f"Error fetching company data: {e}")
        # Degrade gracefully with the standard (empty) structure.
        return pd.DataFrame(columns=['cik', 'ticker', 'name', 'exchange'])
|
||||
|
||||
|
||||
def get_companies_by_exchanges(exchanges: Union[str, List[str]]) -> pd.DataFrame:
    """
    Get companies listed on specific exchange(s).

    Args:
        exchanges: Single exchange string or list of exchanges
                   ('NYSE', 'Nasdaq', 'OTC', 'CBOE')

    Returns:
        DataFrame of companies whose 'exchange' value matches one of the
        requested exchanges (empty 4-column frame on failure)

    Example:
        >>> nyse_companies = get_companies_by_exchanges('NYSE')
        >>> major_exchanges = get_companies_by_exchanges(['NYSE', 'Nasdaq'])
    """
    # Normalize a bare string into a one-element list.
    if isinstance(exchanges, str):
        exchanges = [exchanges]

    try:
        universe = get_all_companies()
        mask = universe['exchange'].isin(exchanges)
        return universe[mask].reset_index(drop=True)
    except Exception as e:
        log.error(f"Error filtering companies by exchanges {exchanges}: {e}")
        return pd.DataFrame(columns=['cik', 'ticker', 'name', 'exchange'])
|
||||
|
||||
|
||||
def get_popular_companies(tier: Optional[PopularityTier] = None) -> pd.DataFrame:
    """
    Get popular companies, optionally limited by popularity tier.

    Args:
        tier: Popularity tier (MEGA_CAP, POPULAR, MAINSTREAM, EMERGING).
              None returns all popular companies.

    Returns:
        DataFrame with columns ['cik', 'ticker', 'name', 'exchange'];
        missing exchange values are filled with 'Unknown'

    Example:
        >>> mega_cap = get_popular_companies(PopularityTier.MEGA_CAP)
        >>> all_popular = get_popular_companies()
    """
    try:
        # Popular-stocks list is indexed by CIK; surface it as a column and
        # rename to this module's standard lowercase schema.
        popular_df = popular_us_stocks().reset_index()
        popular_df = popular_df.rename(columns={'Cik': 'cik', 'Ticker': 'ticker', 'Company': 'name'})

        # Attach exchange info from the full company universe.
        result = popular_df.merge(
            get_all_companies()[['cik', 'exchange']],
            on='cik',
            how='left'
        )
        result['exchange'] = result['exchange'].fillna('Unknown')

        # Tier -> number of rows kept; the source CSV is ordered by market cap.
        tier_limits = {
            PopularityTier.MEGA_CAP: 10,
            PopularityTier.POPULAR: 50,
            PopularityTier.MAINSTREAM: 100,
        }
        limit = tier_limits.get(tier)
        if limit is not None:
            result = result.head(limit)
        # EMERGING or None keeps the full list.

        return result[['cik', 'ticker', 'name', 'exchange']].reset_index(drop=True)

    except Exception as e:
        log.error(f"Error fetching popular companies: {e}")
        return pd.DataFrame(columns=['cik', 'ticker', 'name', 'exchange'])
|
||||
|
||||
|
||||
def get_random_sample(
    companies: Optional[pd.DataFrame] = None,
    n: int = 100,
    random_state: Optional[int] = None
) -> pd.DataFrame:
    """
    Get a random sample of companies.

    Args:
        companies: DataFrame to sample from (None means all companies)
        n: Number of companies to sample (capped at the available count)
        random_state: Random seed for reproducibility

    Returns:
        DataFrame with up to n randomly selected companies; falls back to the
        first rows if sampling itself fails

    Example:
        >>> random_100 = get_random_sample(n=100, random_state=42)
        >>> nasdaq_sample = get_random_sample(get_companies_by_exchanges('Nasdaq'), n=50)
    """
    if companies is None:
        companies = get_all_companies()

    if len(companies) == 0:
        return companies.copy()

    # Never request more rows than exist.
    take = min(n, len(companies))

    try:
        picked = companies.sample(n=take, random_state=random_state)
        return picked.reset_index(drop=True)
    except Exception as e:
        log.error(f"Error sampling companies: {e}")
        # Deterministic fallback: just the first rows.
        return companies.head(take).reset_index(drop=True)
|
||||
|
||||
|
||||
def get_stratified_sample(
    companies: Optional[pd.DataFrame] = None,
    n: int = 100,
    stratify_by: str = 'exchange',
    random_state: Optional[int] = None
) -> pd.DataFrame:
    """
    Get a stratified sample of companies maintaining proportions by a column.

    Args:
        companies: DataFrame to sample from (None means all companies)
        n: Total number of companies to sample
        stratify_by: Column to stratify by (default: 'exchange')
        random_state: Random seed for reproducibility

    Returns:
        DataFrame with the stratified sample; falls back to a plain random
        sample if the stratify column is missing or stratification fails

    Example:
        >>> # Sample maintaining exchange proportions
        >>> stratified = get_stratified_sample(n=200, stratify_by='exchange')
    """
    if companies is None:
        companies = get_all_companies()

    if len(companies) == 0 or stratify_by not in companies.columns:
        return get_random_sample(companies, n, random_state)

    try:
        # Share of each category in the population.
        proportions = companies[stratify_by].value_counts(normalize=True)

        samples = []
        remaining_n = n

        for category, prop in proportions.items():
            category_companies = companies[companies[stratify_by] == category]

            # Calculate sample size for this category
            if category == proportions.index[-1]:
                # Last category absorbs the rounding remainder. BUG FIX:
                # clamp at zero — the max(1, ...) floor below can overshoot n
                # when there are many small categories, which previously made
                # remaining_n negative and requested a negative sample size.
                category_n = max(0, remaining_n)
            else:
                category_n = max(1, int(n * prop))  # At least 1 company per category
                remaining_n -= category_n

            # Sample from this category (skip when nothing is requested/available).
            if category_n > 0 and len(category_companies) > 0:
                category_sample = get_random_sample(
                    category_companies,
                    min(category_n, len(category_companies)),
                    random_state
                )
                samples.append(category_sample)

        # Combine all per-category samples.
        if samples:
            result = pd.concat(samples, ignore_index=True)
            # The per-category minimum of 1 can push the total over n;
            # trim back down with a random selection.
            if len(result) > n:
                result = get_random_sample(result, n, random_state)
            return result
        else:
            return pd.DataFrame(columns=['cik', 'ticker', 'name', 'exchange'])

    except Exception as e:
        log.error(f"Error creating stratified sample: {e}")
        return get_random_sample(companies, n, random_state)
|
||||
|
||||
|
||||
def get_top_companies_by_metric(
    companies: Optional[pd.DataFrame] = None,
    n: int = 100,
    metric: str = 'name',
    ascending: bool = True
) -> pd.DataFrame:
    """
    Get the top N companies sorted by a column.

    Args:
        companies: DataFrame to select from (None means all companies)
        n: Number of top companies to return
        metric: Column to sort by (default: 'name' for alphabetical)
        ascending: Sort order (True for ascending, False for descending)

    Returns:
        DataFrame with the top N companies by the metric; if the column is
        missing or sorting fails, the first N rows unsorted

    Example:
        >>> top_alpha = get_top_companies_by_metric(n=50, metric='name')
        >>> top_tickers = get_top_companies_by_metric(
        ...     get_popular_companies(), n=100, metric='ticker', ascending=False)
    """
    if companies is None:
        companies = get_all_companies()

    # Unknown metric (or empty frame): degrade to the leading rows as-is.
    if len(companies) == 0 or metric not in companies.columns:
        return companies.head(n).copy()

    try:
        ranked = companies.sort_values(by=metric, ascending=ascending)
        return ranked.head(n).reset_index(drop=True)
    except Exception as e:
        log.error(f"Error sorting companies by {metric}: {e}")
        return companies.head(n).copy()
|
||||
|
||||
|
||||
def filter_companies(
    companies: pd.DataFrame,
    ticker_list: Optional[List[str]] = None,
    name_contains: Optional[str] = None,
    cik_list: Optional[List[int]] = None,
    custom_filter: Optional[Callable[[pd.DataFrame], pd.DataFrame]] = None
) -> pd.DataFrame:
    """
    Keep only companies matching the given criteria (all criteria are ANDed).

    Args:
        companies: DataFrame to filter
        ticker_list: Tickers to keep (matched case-insensitively)
        name_contains: Substring the company name must contain (case-insensitive)
        cik_list: CIKs to keep
        custom_filter: Custom function applied last; takes and returns a DataFrame

    Returns:
        Filtered DataFrame with a fresh index; on failure, whatever filters
        succeeded before the error

    Example:
        >>> faang = filter_companies(
        ...     companies, ticker_list=['AAPL', 'AMZN', 'NFLX', 'GOOGL', 'META'])
        >>> inc_companies = filter_companies(companies, name_contains='Inc')
    """
    result = companies.copy()

    try:
        if ticker_list is not None:
            # Case-insensitive ticker match.
            wanted = [t.upper() for t in ticker_list]
            result = result[result['ticker'].str.upper().isin(wanted)]

        if name_contains is not None:
            name_mask = result['name'].str.contains(name_contains, case=False, na=False)
            result = result[name_mask]

        if cik_list is not None:
            result = result[result['cik'].isin(cik_list)]

        if custom_filter is not None:
            result = custom_filter(result)

        return result.reset_index(drop=True)

    except Exception as e:
        log.error(f"Error filtering companies: {e}")
        return result
|
||||
|
||||
|
||||
def exclude_companies(
    companies: pd.DataFrame,
    ticker_list: Optional[List[str]] = None,
    name_contains: Optional[str] = None,
    cik_list: Optional[List[int]] = None
) -> pd.DataFrame:
    """
    Drop companies matching any of the given criteria.

    Args:
        companies: DataFrame to filter
        ticker_list: Tickers to exclude (matched case-insensitively)
        name_contains: Exclude companies whose name contains this substring
            (case-insensitive)
        cik_list: CIKs to exclude

    Returns:
        DataFrame with the matching companies removed and a fresh index; on
        failure, whatever exclusions succeeded before the error

    Example:
        >>> non_financial = exclude_companies(
        ...     companies, ticker_list=['JPM', 'GS', 'C', 'BAC'])
        >>> non_corp = exclude_companies(companies, name_contains='Corp')
    """
    result = companies.copy()

    try:
        if ticker_list is not None:
            # Case-insensitive ticker match, negated.
            banned = [t.upper() for t in ticker_list]
            result = result[~result['ticker'].str.upper().isin(banned)]

        if name_contains is not None:
            name_mask = result['name'].str.contains(name_contains, case=False, na=False)
            result = result[~name_mask]

        if cik_list is not None:
            result = result[~result['cik'].isin(cik_list)]

        return result.reset_index(drop=True)

    except Exception as e:
        log.error(f"Error excluding companies: {e}")
        return result
|
||||
|
||||
|
||||
def combine_company_sets(company_sets: List[pd.DataFrame]) -> pd.DataFrame:
    """
    Union multiple company DataFrames, deduplicating on CIK.

    Args:
        company_sets: List of company DataFrames to combine

    Returns:
        Combined DataFrame with duplicate CIKs removed (first occurrence wins)

    Example:
        >>> nyse = get_companies_by_exchanges('NYSE')
        >>> popular = get_popular_companies()
        >>> combined = combine_company_sets([nyse, popular])
    """
    if not company_sets:
        return pd.DataFrame(columns=['cik', 'ticker', 'name', 'exchange'])

    try:
        stacked = pd.concat(company_sets, ignore_index=True)
        # CIK is the primary key, so dedupe on it.
        return stacked.drop_duplicates(subset=['cik']).reset_index(drop=True)
    except Exception as e:
        log.error(f"Error combining company sets: {e}")
        # Fall back to the first set untouched.
        return company_sets[0].copy() if company_sets else pd.DataFrame(columns=['cik', 'ticker', 'name', 'exchange'])
|
||||
|
||||
|
||||
def intersect_company_sets(company_sets: List[pd.DataFrame]) -> pd.DataFrame:
    """
    Intersect multiple company DataFrames on CIK.

    Args:
        company_sets: List of company DataFrames to intersect

    Returns:
        DataFrame containing only companies (by CIK) present in every set;
        rows come from the first set

    Example:
        >>> nyse = get_companies_by_exchanges('NYSE')
        >>> popular = get_popular_companies()
        >>> nyse_popular = intersect_company_sets([nyse, popular])
    """
    if not company_sets:
        return pd.DataFrame(columns=['cik', 'ticker', 'name', 'exchange'])

    if len(company_sets) == 1:
        return company_sets[0].copy()

    try:
        # Narrow the first set against every other set's CIKs.
        result = company_sets[0].copy()
        for other in company_sets[1:]:
            shared = set(result['cik']) & set(other['cik'])
            result = result[result['cik'].isin(shared)]

        return result.reset_index(drop=True)

    except Exception as e:
        log.error(f"Error intersecting company sets: {e}")
        return company_sets[0].copy() if company_sets else pd.DataFrame(columns=['cik', 'ticker', 'name', 'exchange'])
|
||||
|
||||
|
||||
def get_companies_by_industry(
    sic: Optional[Union[int, List[int]]] = None,
    sic_range: Optional[tuple[int, int]] = None,
    sic_description_contains: Optional[str] = None
) -> pd.DataFrame:
    """
    Get companies by industry using SIC (Standard Industrial Classification) codes.

    Always loads the comprehensive dataset (use_comprehensive=True), since only
    it carries industry metadata. All given criteria are ANDed together.

    Args:
        sic: Single SIC code or list of SIC codes to match exactly
        sic_range: Tuple of (min_sic, max_sic) for range filtering (inclusive)
        sic_description_contains: Substring to search within the SIC
            description (case-insensitive)

    Returns:
        DataFrame of companies matching the criteria, with comprehensive
        metadata columns; an empty frame with the same columns on failure

    Example:
        >>> pharma = get_companies_by_industry(sic=2834)
        >>> biotech = get_companies_by_industry(sic_range=(2833, 2836))
        >>> software = get_companies_by_industry(sic_description_contains='software')
        >>> healthcare = get_companies_by_industry(sic=[2834, 2835, 2836])

    Note:
        SIC Code Ranges:
        - 0100-0999: Agriculture, Forestry, Fishing
        - 1000-1499: Mining
        - 1500-1799: Construction
        - 2000-3999: Manufacturing
        - 4000-4999: Transportation, Communications, Utilities
        - 5000-5199: Wholesale Trade
        - 5200-5999: Retail Trade
        - 6000-6799: Finance, Insurance, Real Estate
        - 7000-8999: Services
        - 9100-9729: Public Administration
    """
    # Industry metadata only exists in the comprehensive dataset.
    companies = get_all_companies(use_comprehensive=True)
    result = companies.copy()

    try:
        # Exact SIC code(s).
        if sic is not None:
            codes = [sic] if isinstance(sic, int) else sic
            result = result[result['sic'].isin(codes)]

        # Inclusive SIC range.
        if sic_range is not None:
            low, high = sic_range
            result = result[(result['sic'] >= low) & (result['sic'] <= high)]

        # Case-insensitive description search.
        if sic_description_contains is not None:
            desc_mask = result['sic_description'].str.contains(
                sic_description_contains,
                case=False,
                na=False
            )
            result = result[desc_mask]

        return result.reset_index(drop=True)

    except Exception as e:
        log.error(f"Error filtering companies by industry: {e}")
        return pd.DataFrame(columns=companies.columns)
|
||||
|
||||
|
||||
def get_companies_by_state(
    states: Union[str, List[str]],
    include_description: bool = True
) -> pd.DataFrame:
    """
    Get companies by state of incorporation.

    Always loads the comprehensive dataset (use_comprehensive=True), since only
    it carries state metadata.

    Args:
        states: Single state code or list of state codes (e.g., 'DE', 'CA',
            ['DE', 'NV']); matched case-insensitively
        include_description: If True (default), the output keeps the
            state_of_incorporation_description column; if False, it is dropped

    Returns:
        DataFrame with companies incorporated in the specified state(s);
        an empty frame with the dataset's columns on failure

    Example:
        >>> de_corps = get_companies_by_state('DE')
        >>> de_nv = get_companies_by_state(['DE', 'NV'])
        >>> ca_corps = get_companies_by_state('CA')

    Note:
        Common states of incorporation:
        - DE: Delaware (most common for public companies)
        - NV: Nevada (popular for tax benefits)
        - CA: California
        - NY: New York
        - TX: Texas
    """
    if isinstance(states, str):
        states = [states]

    # Auto-enable comprehensive mode for state filtering
    companies = get_all_companies(use_comprehensive=True)

    try:
        # Normalize state codes to uppercase
        states_upper = [s.upper() for s in states]

        result = companies[
            companies['state_of_incorporation'].str.upper().isin(states_upper)
        ].reset_index(drop=True)

        # BUG FIX: include_description was previously accepted but ignored.
        # Default True preserves the old output exactly.
        if not include_description and 'state_of_incorporation_description' in result.columns:
            result = result.drop(columns=['state_of_incorporation_description'])

        return result

    except Exception as e:
        log.error(f"Error filtering companies by state {states}: {e}")
        return pd.DataFrame(columns=companies.columns)
|
||||
|
||||
|
||||
# Convenience functions for common use cases
|
||||
|
||||
def get_faang_companies() -> pd.DataFrame:
    """Get FAANG companies (Facebook/Meta, Apple, Amazon, Netflix, Google)."""
    faang_tickers = ['META', 'AAPL', 'AMZN', 'NFLX', 'GOOGL']
    return filter_companies(get_all_companies(), ticker_list=faang_tickers)
|
||||
|
||||
|
||||
def get_tech_giants() -> pd.DataFrame:
    """Get major technology companies (fixed, curated ticker list)."""
    giants = [
        'AAPL', 'MSFT', 'GOOGL', 'AMZN', 'META', 'TSLA',
        'NVDA', 'NFLX', 'ADBE', 'CRM', 'ORCL', 'INTC', 'CSCO',
    ]
    return filter_companies(get_all_companies(), ticker_list=giants)
|
||||
|
||||
|
||||
def get_dow_jones_sample() -> pd.DataFrame:
    """Get a sample of Dow Jones Industrial Average companies (fixed ticker list)."""
    dow_members = [
        'AAPL', 'MSFT', 'UNH', 'GS', 'HD', 'CAT',
        'MCD', 'V', 'AXP', 'BA', 'TRV', 'JPM',
        'IBM', 'JNJ', 'WMT', 'CVX', 'NKE', 'MRK',
        'KO', 'DIS', 'MMM', 'DOW', 'CSCO', 'VZ',
        'INTC', 'WBA', 'CRM', 'HON', 'AMGN', 'PG',
    ]
    return filter_companies(get_all_companies(), ticker_list=dow_members)
|
||||
|
||||
|
||||
# Industry-specific convenience functions (require comprehensive dataset)
|
||||
|
||||
def get_pharmaceutical_companies() -> pd.DataFrame:
    """
    Get pharmaceutical preparation companies (SIC 2834).

    Covers the pharmaceutical preparations industry: prescription drugs,
    biologics, and vaccines.
    """
    pharma_sic = 2834
    return get_companies_by_industry(sic=pharma_sic)
|
||||
|
||||
|
||||
def get_biotechnology_companies() -> pd.DataFrame:
    """
    Get biotechnology companies (SIC 2833-2836).

    Covers biotech and the adjacent pharmaceutical industries.
    """
    biotech_range = (2833, 2836)
    return get_companies_by_industry(sic_range=biotech_range)
|
||||
|
||||
|
||||
def get_software_companies() -> pd.DataFrame:
    """
    Get software and computer programming companies (SIC 7371-7379).

    Covers software publishing, programming, and related services.
    """
    software_range = (7371, 7379)
    return get_companies_by_industry(sic_range=software_range)
|
||||
|
||||
|
||||
def get_semiconductor_companies() -> pd.DataFrame:
    """
    Get semiconductor and electronic component companies (SIC 3674).

    Covers manufacturers of semiconductors and related devices.
    """
    semiconductor_sic = 3674
    return get_companies_by_industry(sic=semiconductor_sic)
|
||||
|
||||
|
||||
def get_banking_companies() -> pd.DataFrame:
    """
    Get commercial banking companies (SIC 6020-6029).

    Covers national and state commercial banks.
    """
    banking_range = (6020, 6029)
    return get_companies_by_industry(sic_range=banking_range)
|
||||
|
||||
|
||||
def get_investment_companies() -> pd.DataFrame:
    """
    Get investment companies and funds (SIC 6200-6299).

    Covers securities brokers, dealers, investment advisors, and funds.
    """
    investment_range = (6200, 6299)
    return get_companies_by_industry(sic_range=investment_range)
|
||||
|
||||
|
||||
def get_insurance_companies() -> pd.DataFrame:
    """
    Get insurance companies (SIC 6300-6399).

    Covers life, health, property, and casualty insurers.
    """
    insurance_range = (6300, 6399)
    return get_companies_by_industry(sic_range=insurance_range)
|
||||
|
||||
|
||||
def get_real_estate_companies() -> pd.DataFrame:
    """
    Get real estate companies (SIC 6500-6599).

    Covers REITs, real estate operators, and developers.
    """
    real_estate_range = (6500, 6599)
    return get_companies_by_industry(sic_range=real_estate_range)
|
||||
|
||||
|
||||
def get_oil_gas_companies() -> pd.DataFrame:
    """
    Get oil and gas extraction companies (SIC 1300-1399).

    Covers crude petroleum, natural gas, and oil/gas field services.
    """
    oil_gas_range = (1300, 1399)
    return get_companies_by_industry(sic_range=oil_gas_range)
|
||||
|
||||
|
||||
def get_retail_companies() -> pd.DataFrame:
    """
    Get retail trade companies (SIC 5200-5999).

    Covers general merchandise, apparel, food, and other retail stores.
    """
    retail_range = (5200, 5999)
    return get_companies_by_industry(sic_range=retail_range)
|
||||
@@ -0,0 +1,300 @@
|
||||
# Portfolio Manager Database - Manual Maintenance Guide
|
||||
|
||||
This guide explains how to manually add, update, and maintain portfolio manager information in the EdgarTools database.
|
||||
|
||||
## File Location
|
||||
**Database File**: `edgar/data/portfolio_managers.json` (relative to the repository root)
|
||||
|
||||
## Database Structure
|
||||
|
||||
The JSON file has two main sections:
|
||||
|
||||
### 1. Metadata Section
|
||||
```json
|
||||
{
|
||||
"metadata": {
|
||||
"version": "2024.12.01",
|
||||
"description": "Curated database of portfolio managers for major 13F filing institutions",
|
||||
"total_companies": 15,
|
||||
"total_managers": 25,
|
||||
"last_updated": "2024-12-01",
|
||||
"sources": ["company_websites", "sec_filings", "press_releases", "public_records"]
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
**Update when adding managers:**
|
||||
- Increment `total_companies` when adding new companies
|
||||
- Increment `total_managers` when adding new individual managers
|
||||
- Update `last_updated` to current date
|
||||
|
||||
### 2. Managers Section
|
||||
Each company entry follows this structure:
|
||||
|
||||
```json
|
||||
{
|
||||
"managers": {
|
||||
"company_key": {
|
||||
"company_name": "Full Legal Company Name",
|
||||
"aum_billions": 123,
|
||||
"match_patterns": ["pattern1", "pattern2", "pattern3"],
|
||||
"website": "https://www.company.com",
|
||||
"managers": [
|
||||
{
|
||||
"name": "Manager Full Name",
|
||||
"title": "Official Title",
|
||||
"status": "active|retired|deceased|former",
|
||||
"confidence": "high|medium|low",
|
||||
"sources": ["source1", "source2"],
|
||||
"start_date": "YYYY-MM-DD",
|
||||
"end_date": "YYYY-MM-DD",
|
||||
"last_verified": "YYYY-MM-DD",
|
||||
"note": "Additional context or details"
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## Adding New Companies
|
||||
|
||||
### Step 1: Choose Company Key
|
||||
Use lowercase, underscore-separated format:
|
||||
- ✅ Good: `berkshire_hathaway`, `goldman_sachs`, `two_sigma`
|
||||
- ❌ Bad: `Berkshire-Hathaway`, `goldmanSachs`, `TwoSigma`
|
||||
|
||||
### Step 2: Research Company Information
|
||||
Gather the following data:
|
||||
|
||||
**Required:**
|
||||
- Full legal company name (from SEC filings)
|
||||
- Current AUM in billions (approximate is fine)
|
||||
- Company website URL
|
||||
- Portfolio manager names and titles
|
||||
|
||||
**Recommended Sources:**
|
||||
1. Company website "Leadership" or "Team" pages
|
||||
2. Latest 10-K filing (Item 10 - Directors, Executive Officers and Corporate Governance)
|
||||
3. Latest DEF 14A proxy statement
|
||||
4. Recent press releases
|
||||
5. Financial news articles
|
||||
|
||||
### Step 3: Add Company Entry
|
||||
```json
|
||||
{
|
||||
"new_company": {
|
||||
"company_name": "New Company Inc",
|
||||
"aum_billions": 50,
|
||||
"match_patterns": ["new company", "newco", "nc inc"],
|
||||
"website": "https://www.newcompany.com",
|
||||
"managers": []
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
**Match Patterns Tips:**
|
||||
- Include common variations of company name
|
||||
- Include stock ticker symbols if applicable
|
||||
- Include abbreviations commonly used
|
||||
- All patterns should be lowercase
|
||||
|
||||
### Step 4: Add Manager Information
|
||||
```json
|
||||
{
|
||||
"managers": [
|
||||
{
|
||||
"name": "Jane Smith",
|
||||
"title": "Chief Investment Officer",
|
||||
"status": "active",
|
||||
"confidence": "high",
|
||||
"sources": ["company_website", "sec_filing_2024"],
|
||||
"start_date": "2020-01-01",
|
||||
"last_verified": "2024-12-01",
|
||||
"note": "Former Goldman Sachs managing director"
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
## Manager Status Definitions
|
||||
|
||||
- **active**: Currently in active management role
|
||||
- **retired**: Retired but may retain advisory role
|
||||
- **deceased**: Deceased (include year in status like "deceased_2023")
|
||||
- **former**: No longer with the organization
|
||||
|
||||
## Confidence Levels
|
||||
|
||||
- **high**: Verified from multiple official sources (company website + SEC filing)
|
||||
- **medium**: Verified from single official source
|
||||
- **low**: Approximate or historical information
|
||||
|
||||
## Common Sources
|
||||
|
||||
**Primary (High Confidence):**
|
||||
- `company_website` - Official leadership pages
|
||||
- `sec_filings` - 10-K, DEF 14A proxy statements
|
||||
- `annual_report_2024` - Latest annual report
|
||||
|
||||
**Secondary (Medium Confidence):**
|
||||
- `press_releases` - Official company announcements
|
||||
- `financial_press` - WSJ, FT, Bloomberg articles
|
||||
- `industry_publications` - Trade publications
|
||||
|
||||
**Tertiary (Low Confidence):**
|
||||
- `linkedin_profile` - Professional profiles
|
||||
- `wikipedia` - Publicly edited sources
|
||||
- `interview_transcript` - Media interviews
|
||||
|
||||
## Example: Adding a New Manager
|
||||
|
||||
Let's add a new company "Example Capital Management":
|
||||
|
||||
```json
|
||||
{
|
||||
"example_capital": {
|
||||
"company_name": "Example Capital Management LLC",
|
||||
"aum_billions": 25,
|
||||
"match_patterns": ["example capital", "example", "ecm"],
|
||||
"website": "https://www.examplecapital.com",
|
||||
"managers": [
|
||||
{
|
||||
"name": "John Doe",
|
||||
"title": "Founder & Chief Investment Officer",
|
||||
"status": "active",
|
||||
"confidence": "high",
|
||||
"sources": ["company_website", "sec_filing_2024"],
|
||||
"start_date": "2015-01-01",
|
||||
"last_verified": "2024-12-01",
|
||||
"note": "Former hedge fund analyst at Two Sigma"
|
||||
},
|
||||
{
|
||||
"name": "Sarah Wilson",
|
||||
"title": "Portfolio Manager",
|
||||
"status": "active",
|
||||
"confidence": "medium",
|
||||
"sources": ["company_website"],
|
||||
"start_date": "2018-06-01",
|
||||
"last_verified": "2024-12-01",
|
||||
"note": "Specializes in technology sector investments"
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## Data Validation Checklist
|
||||
|
||||
Before adding entries, verify:
|
||||
|
||||
- [ ] Company key is lowercase with underscores
|
||||
- [ ] Company name matches legal entity in SEC filings
|
||||
- [ ] AUM is reasonable (check recent 13F filings)
|
||||
- [ ] Match patterns are comprehensive and lowercase
|
||||
- [ ] Manager names are spelled correctly (double-check sources)
|
||||
- [ ] Status is appropriate (active/retired/deceased/former)
|
||||
- [ ] Confidence level matches quality of sources
|
||||
- [ ] Dates are in YYYY-MM-DD format
|
||||
- [ ] Sources are specific and verifiable
|
||||
- [ ] Notes provide helpful context
|
||||
|
||||
## Updating Existing Entries
|
||||
|
||||
### Manager Status Changes
|
||||
When a manager retires, is promoted, or leaves:
|
||||
|
||||
```json
|
||||
{
|
||||
"name": "John Smith",
|
||||
"title": "Former CEO",
|
||||
"status": "retired",
|
||||
"end_date": "2024-06-30",
|
||||
"note": "Retired June 2024, remains on board of directors"
|
||||
}
|
||||
```
|
||||
|
||||
### Adding New Managers to Existing Companies
|
||||
Simply add to the managers array:
|
||||
|
||||
```json
|
||||
{
|
||||
"managers": [
|
||||
// ... existing managers ...
|
||||
{
|
||||
"name": "New Manager Name",
|
||||
"title": "Chief Investment Officer",
|
||||
"status": "active",
|
||||
// ... complete manager entry
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
## Testing Your Changes
|
||||
|
||||
After making changes, test the functionality:
|
||||
|
||||
```python
|
||||
import edgar
|
||||
|
||||
# Test with a company you added/modified
|
||||
company = edgar.Company("COMPANY_TICKER")
|
||||
filing = company.get_filings(form="13F-HR").head(1)[0]
|
||||
thirteen_f = filing.obj()
|
||||
|
||||
# Check if your managers are returned
|
||||
managers = thirteen_f.get_portfolio_managers()
|
||||
print(f"Found managers: {managers}")
|
||||
|
||||
# Test manager info summary
|
||||
summary = thirteen_f.get_manager_info_summary()
|
||||
print(f"Manager count: {summary['external_sources']['manager_count']}")
|
||||
```
|
||||
|
||||
## Common Mistakes to Avoid
|
||||
|
||||
1. **Inconsistent naming**: Use exact legal names from SEC filings
|
||||
2. **Missing match patterns**: Add common abbreviations and variations
|
||||
3. **Outdated information**: Always verify against recent sources
|
||||
4. **Low confidence data**: Avoid unverified Wikipedia or blog sources
|
||||
5. **JSON syntax errors**: Use a JSON validator before saving
|
||||
6. **Forgetting metadata**: Update total counts and last_updated date
|
||||
|
||||
## Priority Companies to Add
|
||||
|
||||
Focus on top 13F filers by AUM:
|
||||
|
||||
1. **Immediate Priority (AUM > $100B):**
|
||||
- Already added: BlackRock, Vanguard, Fidelity, State Street
|
||||
- Still needed: T. Rowe Price, Capital Group, Invesco
|
||||
|
||||
2. **High Priority (AUM $50-100B):**
|
||||
- Already added: AQR, Citadel, Two Sigma, Renaissance
|
||||
- Still needed: Millennium, D.E. Shaw, Baupost Group
|
||||
|
||||
3. **Medium Priority (AUM $20-50B):**
|
||||
- Already added: Elliott, Pershing Square, Icahn
|
||||
- Still needed: Third Point, ValueAct, Jana Partners
|
||||
|
||||
This systematic approach will provide coverage for the majority of institutional investment assets tracked in 13F filings.
|
||||
|
||||
---
|
||||
|
||||
## Enhancement Planning
|
||||
|
||||
**Current Status**: As of January 2025, this database covers 21 companies with verified CIKs (53.8% by count, 63.5% by AUM).
|
||||
|
||||
**Enhancement Roadmap**: See `docs-internal/features/FEAT-021-portfolio-manager-enhancement-followup.md` for:
|
||||
- Systematic expansion plans to reach 85% AUM coverage
|
||||
- Quarterly maintenance automation
|
||||
- International firm integration strategy
|
||||
- Historical manager tracking capabilities
|
||||
|
||||
**Priority Targets for Next Expansion**:
|
||||
1. **Vanguard Group** ($8.1T AUM) - Research filing patterns
|
||||
2. **Capital Group Companies** ($2.8T AUM) - American Funds family
|
||||
3. **T. Rowe Price Group** ($1.6T AUM) - Major active manager
|
||||
4. **Wellington Management** ($1.3T AUM) - Institutional specialist
|
||||
|
||||
For enhancement requests or database improvements, see the follow-up planning document and contribute via GitHub issues.
|
||||
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,41 @@
|
||||
import sys
|
||||
from functools import lru_cache
|
||||
|
||||
import pandas as pd
|
||||
import pyarrow.parquet as pq
|
||||
|
||||
# Dynamic import based on Python version
|
||||
if sys.version_info >= (3, 9):
|
||||
from importlib import resources
|
||||
else:
|
||||
import importlib_resources as resources
|
||||
|
||||
__all__ = ['read_parquet_from_package', 'read_pyarrow_from_package', 'read_csv_from_package']
|
||||
|
||||
|
||||
@lru_cache(maxsize=1)
def read_parquet_from_package(parquet_filename: str):
    """
    Read a parquet file bundled in the ``edgar.reference.data`` package
    into a pandas DataFrame.

    NOTE: the cache holds only the single most recently requested file
    (maxsize=1), so alternating between two filenames re-reads each time.
    """
    # Resolve the packaged file to a real filesystem path for the read.
    with resources.path('edgar.reference.data', parquet_filename) as parquet_path:
        return pd.read_parquet(parquet_path)
|
||||
|
||||
|
||||
def read_pyarrow_from_package(parquet_filename: str):
    """
    Read a parquet file bundled in the ``edgar.reference.data`` package
    as a pyarrow Table (uncached, unlike ``read_parquet_from_package``).
    """
    with resources.path('edgar.reference.data', parquet_filename) as packaged_path:
        return pq.read_table(packaged_path)
|
||||
|
||||
|
||||
def read_csv_from_package(csv_filename: str, **pandas_kwargs):
    """
    Read a CSV file bundled in the ``edgar.reference.data`` package into a
    pandas DataFrame, forwarding ``pandas_kwargs`` to ``pd.read_csv``.
    """
    with resources.path('edgar.reference.data', csv_filename) as packaged_csv:
        return pd.read_csv(packaged_csv, **pandas_kwargs)
|
||||
BIN
venv/lib/python3.10/site-packages/edgar/reference/data/ct.pq
Normal file
BIN
venv/lib/python3.10/site-packages/edgar/reference/data/ct.pq
Normal file
Binary file not shown.
@@ -0,0 +1,37 @@
|
||||
Exhibit No.,Description,Form Types Involved,Regex
|
||||
1,Underwriting Agreement,"S-1, S-3, F-1, F-3, S-8, S-11, etc.",^EX-1\b
|
||||
2,"Plan of Acquisition, Reorganization, Arrangement, Liquidation or Succession","Commonly used across various forms including S-4, S-1, S-11, 10-K, etc.",^EX-2\b
|
||||
3,Articles of Incorporation and Bylaws,"S-1, S-3, F-1, F-3, S-8, S-11, etc.",^EX-3(\.\d+)?\b
|
||||
4,"Instruments Defining the Rights of Security Holders, including Indentures",Required across various form types,^EX-4\b
|
||||
5,Opinion regarding Legality,Typically required across all form types,^EX-5\b
|
||||
6,Reserved,N/A,^EX-6\b
|
||||
7,Correspondence from Independent Accountants,Limited use (specific forms only),^EX-7\b
|
||||
8,Opinion re Tax Matters,"S-11, F-1, F-3, S-3, S-8",^EX-8\b
|
||||
9,Voting Trust Agreements,Mostly required in S-4 and other specific forms,^EX-9\b
|
||||
10,Material Contracts,Required widely across forms for significant contracts,^EX-10(\.\d+)?\b
|
||||
11,Statement re Computation of Per Share Earnings,Commonly required where applicable,^EX-11\b
|
||||
12,Statements re Computation of Ratios,Required in forms where ratios are relevant,^EX-12\b
|
||||
13,Annual Report to Security Holders,Typically part of 10-K or annual disclosures,^EX-13\b
|
||||
14,Code of Ethics,Required disclosure for most forms,^EX-14\b
|
||||
15,Letter re Unaudited Interim Financial Information,Used in specific situations across various forms,^EX-15\b
|
||||
16,Letter re Change in Certifying Accountant,Used primarily in 10-K and 10-Q,^EX-16\b
|
||||
17,Correspondence on Departure of Director,"Occasionally required, depending on the circumstances",^EX-17\b
|
||||
18,Letter re Change in Accounting Principles,Used when significant changes in accounting principles occur,^EX-18\b
|
||||
19,Report Furnished to Security Holders,Often part of 10-Q or similar reports,^EX-19\b
|
||||
20,Other Documents or Statements to Security Holders,"As applicable, varies by form and content required",^EX-20\b
|
||||
21,Subsidiaries of the Registrant,Required across various forms depending on the structure of the registrant,^EX-21\b
|
||||
22,Published Report Regarding Matters Submitted to Vote of Security Holders,As applicable to the voting matters,^EX-22\b
|
||||
23,Consents of Experts and Counsel,Required across various forms when expert consents are necessary,^EX-23(\.\d+)?\b
|
||||
24,Power of Attorney,"As required, often associated with filings involving multiple signatories",^EX-24\b
|
||||
25,Statement of Eligibility of Trustee,Required in filings involving indentures under the Trust Indenture Act,^EX-25\b
|
||||
26,Invitation for Competitive Bids,Required in specific cases involving competitive bids,^EX-26\b
|
||||
27-30,Reserved,N/A,^EX-(27|28|29|30)\b
|
||||
31,Rule 13a-14(a)/15d-14(a) Certifications,Common certification required across various forms,^EX-31(\.\d+)?\b
|
||||
32,Section 1350 Certifications,Required under specific legal stipulations,^EX-32\b
|
||||
33-34,Assessment and Attestation Reports regarding Compliance,Specific to asset-backed securities,^EX-(33|34)\b
|
||||
35-36,Servicer Compliance Statement and Depositor Certification,Specific to asset-backed securities,^EX-(35|36)\b
|
||||
95,Mine Safety Disclosure Exhibit,Specific to registrants involved in mining operations,^EX-95\b
|
||||
99,Additional Exhibits,As required by specific circumstances or regulatory demands,^EX-99(\.\d+)?\b
|
||||
100-101,XBRL-Related Documents and Interactive Data File,Required for electronic data submission,^EX-(100|101)\b
|
||||
102-103,Asset Data File and Asset Related Documents,Specific to asset-backed securities filings,^EX-(102|103)\b
|
||||
104-106,Reserved/Static Pool PDF,N/A or specific to asset-backed securities,^EX-(104|105|106)\b
|
||||
|
@@ -0,0 +1,86 @@
|
||||
Ticker,Company,Cik
|
||||
AAPL,Apple Inc.,320193
|
||||
MSFT,Microsoft Corporation,789019
|
||||
AMZN,"Amazon.com, Inc.",1018724
|
||||
NVDA,NVIDIA Corporation,1045810
|
||||
TSLA,"Tesla, Inc.",1318605
|
||||
GOOGL,Alphabet Inc. Class A,1652044
|
||||
META,"Meta Platforms, Inc.",1326801
|
||||
AMD,"Advanced Micro Devices, Inc.",2488
|
||||
NFLX,"Netflix, Inc.",1065280
|
||||
BRK.B,Berkshire Hathaway Inc.,1067983
|
||||
V,Visa Inc.,1403161
|
||||
JNJ,Johnson & Johnson,200406
|
||||
PG,Procter & Gamble Co.,80424
|
||||
JPM,JPMorgan Chase & Co.,19617
|
||||
UNH,UnitedHealth Group Incorporated,731766
|
||||
DIS,The Walt Disney Company,1744489
|
||||
HD,"Home Depot, Inc.",354950
|
||||
XOM,Exxon Mobil Corporation,34088
|
||||
KO,Coca-Cola Company,21344
|
||||
PEP,"PepsiCo, Inc.",77476
|
||||
PFE,Pfizer Inc.,78003
|
||||
MA,Mastercard Incorporated,1141391
|
||||
ADBE,Adobe Inc.,796343
|
||||
CRM,"Salesforce, Inc.",1108524
|
||||
INTC,Intel Corporation,50863
|
||||
CSCO,"Cisco Systems, Inc.",858877
|
||||
NKE,"Nike, Inc.",320187
|
||||
T,AT&T Inc.,732717
|
||||
CMCSA,Comcast Corporation,1166691
|
||||
VZ,Verizon Communications Inc.,732712
|
||||
CVX,Chevron Corporation,93410
|
||||
ABBV,AbbVie Inc.,1551152
|
||||
MRK,"Merck & Co., Inc.",310158
|
||||
BMY,Bristol-Myers Squibb Company,14272
|
||||
WMT,Walmart Inc.,104169
|
||||
MCD,McDonald's Corporation,63908
|
||||
SBUX,Starbucks Corporation,829224
|
||||
GS,"Goldman Sachs Group, Inc.",886982
|
||||
MS,Morgan Stanley,895421
|
||||
AXP,American Express Company,4962
|
||||
C,Citigroup Inc.,831001
|
||||
BA,Boeing Company,12927
|
||||
DAL,"Delta Air Lines, Inc.",27904
|
||||
LUV,Southwest Airlines Co.,92380
|
||||
MAR,"Marriott International, Inc.",1048286
|
||||
HLT,Hilton Worldwide Holdings Inc.,1585689
|
||||
BKNG,Booking Holdings Inc.,1075531
|
||||
PYPL,"PayPal Holdings, Inc.",1633917
|
||||
SQ,"Square, Inc.",1512673
|
||||
ZM,"Zoom Video Communications, Inc.",1585521
|
||||
SNOW,Snowflake Inc.,1640147
|
||||
UBER,"Uber Technologies, Inc.",1543151
|
||||
LYFT,"Lyft, Inc.",1759509
|
||||
ROKU,"Roku, Inc.",1428439
|
||||
SPOT,Spotify Technology S.A.,1639920
|
||||
SHOP,Shopify Inc.,1594805
|
||||
EBAY,eBay Inc.,1065088
|
||||
TWTR,"Twitter, Inc.",1418091
|
||||
SNAP,Snap Inc.,1564408
|
||||
PINS,"Pinterest, Inc.",1506293
|
||||
PLTR,Palantir Technologies Inc.,1321655
|
||||
ZI,ZoomInfo Technologies Inc.,1794515
|
||||
DOCU,"DocuSign, Inc.",1261333
|
||||
TWLO,Twilio Inc.,1447669
|
||||
CRWD,"CrowdStrike Holdings, Inc.",1535527
|
||||
NET,"Cloudflare, Inc.",1477333
|
||||
DDOG,"Datadog, Inc.",1561550
|
||||
MDB,"MongoDB, Inc.",1441816
|
||||
ZS,"Zscaler, Inc.",1713683
|
||||
OKTA,"Okta, Inc.",1660134
|
||||
DBX,"Dropbox, Inc.",1467623
|
||||
SMAR,Smartsheet Inc.,1366561
|
||||
ASAN,"Asana, Inc.",1477720
|
||||
RNG,"RingCentral, Inc.",1384905
|
||||
PTON,"Peloton Interactive, Inc.",1639825
|
||||
TTD,"The Trade Desk, Inc.",1671933
|
||||
HUBS,"HubSpot, Inc.",1404655
|
||||
COUP,Coupa Software Incorporated,1385867
|
||||
AYX,"Alteryx, Inc.",1689923
|
||||
SPLK,Splunk Inc.,1353283
|
||||
NEWR,"New Relic, Inc.",1448056
|
||||
DT,"Dynatrace, Inc.",1773383
|
||||
NOW,"ServiceNow, Inc.",1373715
|
||||
WDAY,"Workday, Inc.",1327811
|
||||
ADSK,"Autodesk, Inc.",769397
|
||||
|
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,312 @@
|
||||
Form,Description
|
||||
1-A POS,Reg A Offering Amendment
|
||||
1-A-W,Reg A Offering Withdrawal
|
||||
1-E,Notification filing for small business investment companies
|
||||
1-E AD,Sales material for small business investment companies
|
||||
1-K,Annual report for Regulation A issuers
|
||||
1-SA,Semiannual report for Regulation A issuers
|
||||
1-U,Current report for Regulation A issuers
|
||||
1-Z,Exit report for terminated Regulation A offerings
|
||||
1-Z-W,Withdrawal of Regulation A exit report
|
||||
2-E,Report of securities sales
|
||||
10-12B,Registration of a class of securities
|
||||
10-12G,Registration of a class of securities
|
||||
10-D,Periodic distribution reports for asset-backed securities
|
||||
10-K,Annual report for public companies
|
||||
10-KT,Transition report with change in fiscal year
|
||||
10-Q,Quarterly report for public companies
|
||||
10-QT,Quarterly transition report with change in fiscal year
|
||||
11-K,Annual report for employee stock plans
|
||||
11-KT,Transition report for employee stock plans
|
||||
13F-CTR,Confidential treatment request by institutional managers
|
||||
13F-HR,Initial quarterly holdings report by institutional managers
|
||||
13F-NT,Initial quarterly notice by institutional managers
|
||||
13H,Registration for large traders
|
||||
144,Notice of proposed sale
|
||||
15-12G,Securities registration termination
|
||||
15-15D,Suspension of reporting obligations
|
||||
15F-12B,Foreign private issuer equity securities termination
|
||||
15F-12G,Securities registration termination by foreign private issuer
|
||||
15F-15D,Foreign private issuer reporting suspension
|
||||
17HACON,Confidential annual broker-dealer report
|
||||
17HQCON,Confidential quarterly broker-dealer report
|
||||
18-12B,Securities registration by foreign governments
|
||||
18-12G,Securities registration by foreign governments
|
||||
18-K,Annual report for foreign governments
|
||||
20-F,Annual report for foreign companies
|
||||
20FR12B,Foreign private issuer securities registration
|
||||
20FR12G,Foreign private issuer securities registration
|
||||
24F-2NT,Rule 24F-2 notice for investment companies
|
||||
25,Securities delisting
|
||||
25-NSE,Notice of matured/redeemed/retired securities by exchanges
|
||||
3,Initial statement of beneficial ownership
|
||||
305B2,Application for new trustee
|
||||
4,Statement of changes in beneficial ownership
|
||||
40-6B,Application by employees' securities company
|
||||
40-17F1,Custody report for management investment companies
|
||||
40-17F2,Custody report for management investment companies
|
||||
40-17G,Fidelity bond filing for investment companies
|
||||
40-17GCS,Claims and settlements under investment company fidelity bond
|
||||
40-24B2,Sales literature filing for investment companies
|
||||
40-33,Investment company shareholder derivative actions
|
||||
40-8B25,Investment company report or document
|
||||
40-8F-2,Application for deregistration by investment companies
|
||||
40-APP,Applications under Investment Company/Advisers Acts
|
||||
40-F,Annual report (Canadian)
|
||||
40FR12B,Securities registration by certain Canadian issuers
|
||||
40FR12G,Securities registration by certain Canadian issuers
|
||||
40-OIP,Applications under Investment Company/Advisers Acts reviewed by insurance office
|
||||
424A,Prospectus outlining the details of securities offered by a company
|
||||
424B1,Initial primary offering
|
||||
424B2,Primary offering prospectus
|
||||
424B3,Prospectus supplement
|
||||
424B4,Prospectus supplement with pricing
|
||||
424B5,Supplement to primary offering
|
||||
424B7,Prospectus with material changes
|
||||
424B8,Final prospectus changes
|
||||
424H,Preliminary prospectus
|
||||
425,Prospectus in business combination transactions
|
||||
424I,Prospectus filed under Rule 424(i)(1)
|
||||
485APOS,Post effective amendment
|
||||
485BPOS,Post effective amendment
|
||||
485BXT,Amendment to designate new effective date
|
||||
486APOS,Post-effective amendment
|
||||
486BPOS,Post-effective amendment
|
||||
486BXT,Amendment to designate new effective date
|
||||
487,Pre-effective pricing amendment under Rule 487
|
||||
497,Fund prospectus
|
||||
497AD,Rule 482 ads filed under Rule 497
|
||||
497H2,Filings under Rule 497(h)(2)
|
||||
497J,Certification of no changes to prospectus
|
||||
497K,Summary fund prospectus
|
||||
497VPI,Variable contracts summary prospectus
|
||||
497VPSUB,Substitution-related supplement for variable contracts
|
||||
497VPU,Updated summary prospectus for variable contracts
|
||||
5,Annual statement of beneficial ownership changes
|
||||
6-K,Foreign issuer current report
|
||||
8-A12B,Registration of securities
|
||||
8-A12G,Registration of securities
|
||||
8-K,Current report
|
||||
8-K12B,Successor issuer registration
|
||||
8-K12G3,Successor issuer registration
|
||||
8-K15D5,Successor issuer reporting
|
||||
ABS-15G,Asset-backed securities report
|
||||
ABS-EE,Electronic exhibits for asset-backed securities offerings
|
||||
ANNLRPT,Annual development bank report
|
||||
APP WD,Withdrawal of exemptive relief application
|
||||
ARS,Annual report to security holders
|
||||
ATS-N,Initial Alternative Trading System (ATS) notice
|
||||
ATS-N/CA,Correcting amendment to ATS notice
|
||||
ATS-N/MA,Material amendment to ATS notice
|
||||
ATS-N/OFA,Order display and fair access amendment to ATS notice
|
||||
ATS-N/UA,Updating amendment to ATS notice
|
||||
ATS-N-C,Notice of ATS cessation
|
||||
ATS-N-W,Withdrawal of ATS notice
|
||||
AW,Withdrawal of Securities Act registration amendment
|
||||
AW WD,Withdrawal request for registration amendment withdrawal
|
||||
BULK,Bulk submission
|
||||
C,Offering statement
|
||||
C-W,Withdrawal of offering statement
|
||||
C/A-W,Withdrawal of offering statement amendment
|
||||
C-U,Progress update
|
||||
C-U-W,Withdrawal of progress update
|
||||
C-AR,Annual report
|
||||
C-AR-W,Withdrawal of annual report
|
||||
C-AR/A-W,Withdrawal of annual report amendment
|
||||
C-TR,Termination of reporting
|
||||
C-TR-W,Withdrawal of termination of reporting
|
||||
CB,Notice for certain foreign issuer transactions
|
||||
CERT,Exchange certification of listing approval
|
||||
CFPORTAL,Funding portal registration application
|
||||
CFPORTAL-W,Withdrawal of funding portal registration
|
||||
CORRESP,Correspondence with the SEC
|
||||
D,Notice of exempt Regulation D offering
|
||||
DEF 14A,Definitive proxy statement
|
||||
DEF 14C,Definitive information statement
|
||||
DEFA14A,Additional definitive proxy materials
|
||||
DEFA14C,Additional definitive information statement materials
|
||||
DEFC14A,Definitive proxy statement - contested solicitation
|
||||
DEFC14C,Definitive information statement - contested solicitation
|
||||
DEFM14A,Definitive proxy statement for merger/acquisition
|
||||
DEFM14C,Definitive information statement for merger/acquisition
|
||||
DEFN14A,Definitive proxy statement by non-management
|
||||
DEFR14A,Revised definitive proxy materials
|
||||
DEFR14C,Revised definitive information statement materials
|
||||
DEL AM,Delaying amendment for registration statement
|
||||
DFAN14A,Additional proxy materials by non-management
|
||||
DFRN14A,Revised proxy statement by non-management
|
||||
DOS,Draft offering statement under Regulation A
|
||||
DOSLTR,Draft offering statement letter
|
||||
DRS,Draft registration statement
|
||||
DRSLTR,Draft registration statement letter
|
||||
DSTRBRPT,Distribution report for development bank
|
||||
F-1,Securities registration by foreign private issuers
|
||||
F-10,Securities registration by certain Canadian issuers
|
||||
F-10EF,Auto-effective registration by certain Canadian issuers
|
||||
F-10POS,Amendment to F-10EF registration
|
||||
F-1MEF,Additional securities registered to prior F-1
|
||||
F-3,Foreign private securities registration
|
||||
F-3ASR,Foreign private securities registration
|
||||
F-3D,Foreign private securities registration
|
||||
F-3DPOS,Amendment to F-3D registration
|
||||
F-3MEF,Additional securities registered to prior F-3
|
||||
F-4,Business combination for foreign issuers
|
||||
F-4MEF,Additional securities registered to prior F-4
|
||||
F-6 POS,Amendment to F-6EF registration
|
||||
F-6,Depositary receipts by foreign private issuers
|
||||
F-6EF,Depositary receipts by foreign private issuers
|
||||
F-7 POS,Amended F-7 registration
|
||||
F-7,Canadian rights offerings
|
||||
F-8 POS,Amendment to F-8 registration
|
||||
F-8,Canadian business combination
|
||||
F-80,Canadian business combination
|
||||
F-80POS,Amendment to F-80 registration
|
||||
F-N,Appointment of agent for service by foreign institutions
|
||||
FWP,Filing of free writing prospectuses
|
||||
F-X,Appointment of agent for service by foreign issuers
|
||||
IRANNOTICE,Notice of Iran/Syria disclosures in periodic reports
|
||||
MA,Municipal advisor registration
|
||||
MA-I,Natural persons engaged in municipal advisory activities
|
||||
MA-W,Withdrawal from municipal advisor registration
|
||||
MODULE,Module submission
|
||||
N-14 8C,Initial registration statement by closed-end funds for business combinations
|
||||
N-14,Initial registration statement by open-end funds for business combinations
|
||||
N-14MEF,Additional securities registered by closed-end funds
|
||||
N-18F1,Election of terms for future filings
|
||||
N-1A,Initial registration statement for open-end funds
|
||||
N-2,Closed-end fund registration
|
||||
N-2ASR,Closed-end fund automatic registration
|
||||
N-2 POSASR,Amendment to N-2ASR registration
|
||||
N-23C-2,Notice of closed-end fund's intention to call or redeem securities
|
||||
N-23C3A,Closed-end fund periodic repurchase offer notice
|
||||
N-23C3B,Filing under Rule 23c-3(b) by closed-end funds
|
||||
N-23C3C,Filings under Rule 23c-3(b) and (c) by closed-end funds
|
||||
N-27D-1,Accounting report for segregated trust accounts
|
||||
N-2MEF,Additional securities registered to prior N-2
|
||||
N-3,Initial registration for separate accounts of management companies
|
||||
N-30B-2,Periodic reports (other than annual/semi-annual) by management companies
|
||||
N-30D,Annual and semi-annual reports by management companies
|
||||
N-4,Initial registration for separate accounts of unit trusts
|
||||
N-5,Registration statement for small business investment companies
|
||||
N-54A,Election filing by business development companies
|
||||
N-54C,Withdrawal filing by business development companies
|
||||
N-6,Registration statement for separate accounts of unit trusts
|
||||
N-6F,Notice by business development companies electing to be subject to Sections 55-65
|
||||
N-8A,Initial notification of registration
|
||||
N-8B-2,Initial registration statement for unit investment trusts
|
||||
N-8B-3,Initial registration statement for periodic payment plans
|
||||
N-8B-4,Initial registration statement for face-amount certificate companies
|
||||
N-8F,Application for deregistration
|
||||
N-CEN,Annual report for registered investment companies
|
||||
N-CR,Current report for money market funds
|
||||
N-CSR,Certified annual shareholder report
|
||||
N-CSRS,Certified semi-annual shareholder report
|
||||
N-MFP2/A,Monthly portfolio holdings for money market funds
|
||||
N-MFP3,Monthly portfolio holdings for money market funds
|
||||
NPORT-EX,Portfolio holdings exhibit to Form N-PORT
|
||||
NPORT-NP,Non-public monthly portfolio investments report
|
||||
NPORT-P,Public monthly portfolio investments report
|
||||
N-PX,Annual proxy voting record report
|
||||
N-PX CTR,Confidential treatment request for Form N-PX
|
||||
N-RN,Current report for registered funds and BDCs
|
||||
NRSRO-UPD,Registration update by credit rating agencies
|
||||
NRSRO-CE,Annual certification by credit rating agencies
|
||||
NRSRO-FR,Annual reports for statistical rating organizations
|
||||
NRSRO-WCLS,Withdrawal from credit rating class for nationally recognized statistical rating organizations
|
||||
NRSRO-WREG,Withdrawal from registration as a nationally recognized statistical rating organization
|
||||
NT 10-K,Late filing of 10-K
|
||||
NT 10-D,Late filing of 10-D
|
||||
NT 10-Q,Late filing of 10-Q
|
||||
NT 11-K,Late filing of 11-K
|
||||
NT 15D2,Late filing of special report
|
||||
NT 20-F,Late filing of Form 20-F
|
||||
NT-NCEN,Late filing of Form N-CEN
|
||||
NT-NCSR,Late filing of Form N-CSR
|
||||
N-VP,Notice document for certain variable contracts
|
||||
N-VPFS,Financial statements for certain variable contracts
|
||||
POS 8C,Post-effective amendment for closed-end funds
|
||||
POS AM,Post-effective amendment to a registration statement
|
||||
POS AMI,Post-effective amendment for investment company filings
|
||||
POSASR,Post-effective amendment to automatic shelf registration
|
||||
POS EX,Post-effective amendment adding exhibits
|
||||
POS462B,Post-effective amendment filed
|
||||
POS462C,Post-effective amendment filed
|
||||
PRE 14A,Preliminary proxy statement
|
||||
PRE 14C,Preliminary information statement
|
||||
PREC14A,Preliminary proxy statement for contested solicitations
|
||||
PREC14C,Preliminary information statement for contested solicitations
|
||||
PREM14A,Preliminary merger proxy statement
|
||||
PREM14C,Preliminary merger information statement
|
||||
PREN14A,Preliminary proxy statement filed by non-management
|
||||
PRER14A,Preliminary revised proxy materials
|
||||
PRER14C,Preliminary revised information statements
|
||||
PRRN14A,Revised preliminary proxy statement non-management
|
||||
PX14A6G,Exempt solicitation
|
||||
PX14A6N,Exempt solicitation for roll-up transaction
|
||||
QRTLYRPT,Development banks quarterly report
|
||||
RW,Registration withdrawal
|
||||
RW WD,Withdrawal of registration withdrawal
|
||||
S-1,Securities registration
|
||||
S-11,Real estate securities registration
|
||||
S-11MEF,Registration statement for prior Form S-11
|
||||
S-1MEF,Registration statement for prior Form S-1
|
||||
S-20,Standardized options registration
|
||||
S-3,Simplified securities registration
|
||||
S-3ASR,Automatic shelf registration
|
||||
S-3D,Dividend reinvestment plans automatic securities registration
|
||||
S-3DPOS,Post-effective amendment to Form S-3D
|
||||
S-3MEF,Registration statement filed relating to prior Form S-3
|
||||
S-4 POS,Post-effective amendment to Form S-4
|
||||
S-4,Business acquisitions registration
|
||||
S-4EF,Bank/S&L loan registration
|
||||
S-4MEF,Registration statement filed relating to prior Form S-4
|
||||
S-6,Initial registration statement for unit investment trusts
|
||||
S-8,Employee securities registration
|
||||
S-8 POS,Post-effective amendment to Form S-8
|
||||
S-B,Foreign governments securities registration
|
||||
S-BMEF,Registration statement filed relating to prior Form S-B
|
||||
SBSE,Security-based swap dealer registration
|
||||
SBSE-A,Abbreviated application for SEC-registered swap entities also registered with CFTC
|
||||
SBSE-BD,Application for broker-dealer security-based swap dealers/major participants
|
||||
SBSE-C,Certifications for security-based swap dealer/major participant registration
|
||||
SBSE-W,Request to withdraw registration as security-based swap dealer/major participant
|
||||
SBSE-DISPUTE NOTICE,Notice of valuation dispute by a security-based swap entity
|
||||
SBSE-CCO-RPT,Annual compliance report for security-based swap dealers
|
||||
SC 13D,Ownership for control disclosure
|
||||
SCHEDULE 13D,Disclosure of beneficial ownership over 5% (XML)
|
||||
SC 13E1,Issuer statement for going private transactions
|
||||
SC 13E3,Schedule for going private transactions
|
||||
SC 13G,Beneficial ownership
|
||||
SCHEDULE 13G,Beneficial ownership by passive investors/institutions
|
||||
SC 14D9,Solicitation/recommendation statement for third-party tender offers
|
||||
SC 14F1,Statement for changes to majority of directors
|
||||
SC 14N,Information by nominating shareholders
|
||||
SC 14N-S,Solicitation relating to Rule 14a-11 nominating groups
|
||||
SC TO-C,Written communication relating to tender offers
|
||||
SC TO-I,Tender offer by issuer
|
||||
SC TO-T,Tender offer by third party
|
||||
SC13E4F,Foreign issuer tender
|
||||
SC14D1F,3rd party tender offer by foreign issuer
|
||||
SC14D9C,Subject company communication relating to third-party tender offer
|
||||
SC14D9F,Solicitation/recommendation statement by foreign issuers for third-party tender offers
|
||||
SD,Specialized disclosure report on conflict minerals or resource extraction payments
|
||||
SDR,Registration for security-based swap data repositories
|
||||
SDR-CCO,Compliance and financial reports for security-based swap data repositories
|
||||
SDR-W,Withdrawal from registration as security-based swap data repository
|
||||
SF-1,Asset-backed securities registration
|
||||
SF-1MEF,Registration statement filed relating to prior Form SF-1
|
||||
SF-3,Asset-backed securities shelf offerings
|
||||
SF-3MEF,Registration statement filed relating to prior Form SF-3
|
||||
SH-ER,Weekly entries report by institutional investment managers
|
||||
SH-NT,Weekly notice report by institutional investment managers
|
||||
SP 15D2,Special financial report
|
||||
SPDSCL,Specialized disclosure filing
|
||||
SUPPL,Supplemental material filed by foreign private issuers
|
||||
T-3,Initial application for trust indenture qualification
|
||||
T-6,Application for foreign entity to act as institutional trustee
|
||||
TA-1,Initial application for transfer agent registration
|
||||
TA-2,Annual report by registered transfer agents
|
||||
TA-W,Notice of withdrawal from transfer agent registration
|
||||
UPLOAD,Submission of documents
|
||||
UNDER,Initial undertaking to file reports
|
||||
X-17A-5,Reports required of brokers and dealers
|
||||
|
@@ -0,0 +1,7 @@
|
||||
from edgar.httprequests import download_file

# Base URL for SEC DERA (Division of Economic and Risk Analysis) data sets.
dera_data_url = 'https://www.sec.gov/dera/data'
# Path segment under dera_data_url for the quarterly financial statement data sets.
financial_statement_datasets='financial-statement-data-sets'

if __name__ == '__main__':
    # Example usage: download the 2024 Q1 financial statement data set archive.
    download_file('https://www.sec.gov/files/dera/data/financial-statement-data-sets/2024q1.zip')
|
||||
33
venv/lib/python3.10/site-packages/edgar/reference/forms.py
Normal file
33
venv/lib/python3.10/site-packages/edgar/reference/forms.py
Normal file
@@ -0,0 +1,33 @@
|
||||
from functools import lru_cache
|
||||
|
||||
from edgar.reference.data.common import read_csv_from_package
|
||||
|
||||
# Table of SEC form codes and their descriptions, loaded once at import time
# from the packaged secforms.csv (columns used below: Form, Description).
sec_form_data = read_csv_from_package('secforms.csv')
|
||||
|
||||
|
||||
@lru_cache(maxsize=64)
def describe_form(form: str,
                  prepend_form: bool = True) -> str:
    """
    Get the description of a form from the form descriptions file.

    Amendment filings (forms ending in "/A") are described as the base form
    with " Amendment" appended. Unknown forms fall back to "Form <FORM>".

    Args:
        form: The SEC form type, e.g. "10-K" or "10-k/a" (case-insensitive).
        prepend_form: If True, prefix the description with "Form <FORM>: ".

    Returns:
        str: A human-readable description of the form.
    """
    # Uppercase BEFORE the amendment check so lowercase input like "10-k/a"
    # is recognized as an amendment (the old order missed it).
    form = form.upper()
    is_amendment = form.endswith("/A")
    if is_amendment:
        form = form[:-2]

    matches = sec_form_data.loc[sec_form_data.Form == form]
    if len(matches) == 0:
        # Unknown form: no description available (amendment flag intentionally omitted,
        # matching the original behavior).
        return f"Form {form}"

    description = matches.Description.iloc[0]
    if prepend_form:
        return f"Form {form}{' Amendment' if is_amendment else ''}: {description}"
    return description
|
||||
|
||||
|
||||
# Form types that act as prospectuses / registration statements (offering
# documents), as opposed to periodic or current reports.
PROSPECTUSES = ["S-1", "S-3", "S-4", "S-8", "S-11", "F-1", "F-3", "F-4", "F-6", "F-10", "424B1",
                "424B2", "424B3", "424B4", "424B5", "424B7", "424B8", "485BPOS", "486BPOS", "497", "N-2", "N-14",
                "POS AM", "POSASR", "POS EX", "10", "20-F", "8-A", "SF-1", "SF-3"
                ]
|
||||
475
venv/lib/python3.10/site-packages/edgar/reference/tickers.py
Normal file
475
venv/lib/python3.10/site-packages/edgar/reference/tickers.py
Normal file
@@ -0,0 +1,475 @@
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
from enum import Enum
|
||||
from functools import lru_cache
|
||||
from io import StringIO
|
||||
from typing import Any, Dict, List, Optional, Union
|
||||
|
||||
import pandas as pd
|
||||
import pyarrow as pa
|
||||
from httpx import HTTPStatusError
|
||||
|
||||
from edgar.core import get_edgar_data_directory, listify, log
|
||||
from edgar.httprequests import download_file, download_json
|
||||
from edgar.reference.data.common import read_csv_from_package, read_parquet_from_package
|
||||
|
||||
# Public API of this module. Deduplicated: 'get_cik_tickers' and
# 'get_company_tickers' were each listed twice.
__all__ = ['cusip_ticker_mapping', 'get_ticker_from_cusip', 'get_company_tickers', 'get_icon_from_ticker', 'find_cik',
           'get_cik_tickers', 'get_company_ticker_name_exchange', 'get_companies_by_exchange', 'popular_us_stocks',
           'get_mutual_fund_tickers', 'find_mutual_fund_cik', 'list_all_tickers', 'find_ticker', 'find_ticker_safe',
           'get_cik_ticker_lookup', 'get_company_cik_lookup', 'get_cik_tickers_from_ticker_txt',
           'ticker_txt_url', 'company_tickers_json_url', 'mutual_fund_tickers_url', 'company_tickers_exchange_url',
           'Exchange'
           ]

# SEC reference-data endpoints used throughout this module.
ticker_txt_url = "https://www.sec.gov/include/ticker.txt"
company_tickers_json_url = "https://www.sec.gov/files/company_tickers.json"
mutual_fund_tickers_url = "https://www.sec.gov/files/company_tickers_mf.json"
company_tickers_exchange_url = "https://www.sec.gov/files/company_tickers_exchange.json"
|
||||
|
||||
|
||||
@lru_cache(maxsize=1)
def cusip_ticker_mapping(allow_duplicate_cusips: bool = True) -> pd.DataFrame:
    """
    Load the CUSIP-to-ticker mapping bundled with the package.

    Returns a DataFrame indexed by Cusip with the ticker data as columns.
    A CUSIP can appear more than once; pass allow_duplicate_cusips=False to
    keep only the first occurrence of each (the first occurrence is the one
    most likely to be linked to a CIK).
    """
    mapping = read_parquet_from_package('ct.pq').set_index('Cusip')
    if allow_duplicate_cusips:
        return mapping
    # Drop all but the first row for each duplicated CUSIP.
    return mapping[~mapping.index.duplicated(keep='first')]
|
||||
|
||||
|
||||
def load_tickers_from_local() -> Optional[Dict[str, Any]]:
    """
    Read the locally cached company_tickers.json, or None when no cache exists.
    """
    cached = get_edgar_data_directory() / "reference" / os.path.basename(company_tickers_json_url)
    if not cached.exists():
        # Covers both a missing reference directory and a missing file.
        return None
    return json.loads(cached.read_text())
|
||||
|
||||
|
||||
@lru_cache(maxsize=1)
def get_company_tickers(
        as_dataframe: bool = True,
        clean_name: bool = True,
        clean_suffix: bool = False
) -> Union[pd.DataFrame, pa.Table]:
    """
    Fetch and process company ticker data from SEC.

    The result is cached for the life of the process (lru_cache(maxsize=1)).
    When the EDGAR_USE_LOCAL_DATA environment variable is set, a locally
    cached company_tickers.json is preferred and the SEC endpoint is used
    only as a fallback.

    Args:
        as_dataframe (bool): If True, returns pandas DataFrame; if False, returns pyarrow Table
        clean_name (bool): If True, cleans company names
        clean_suffix (bool): If True, removes common company suffixes

    Returns:
        Union[pd.DataFrame, pa.Table]: Processed company data with columns
        cik (int), ticker (str), company (str)

    Raises:
        Exception: any download/parse failure is logged and re-raised.
    """

    # Pre-define schema for better performance
    SCHEMA = pa.schema([
        ('cik', pa.int64()),
        ('ticker', pa.string()),
        ('company', pa.string())
    ])

    try:
        if os.getenv("EDGAR_USE_LOCAL_DATA"):
            tickers_json = load_tickers_from_local()
            # Fall back to the network when there is no local cache.
            if not tickers_json:
                tickers_json = download_json(company_tickers_json_url)
        else:
            # Download JSON data
            tickers_json = download_json(company_tickers_json_url)

        # Pre-allocate lists for better memory efficiency
        ciks = []
        tickers = []
        companies = []

        # Process JSON data
        for item in tickers_json.values():
            company_name = item['title']

            # Apply name cleaning if requested
            if clean_name or clean_suffix:
                if clean_name:
                    company_name = clean_company_name(company_name)
                if clean_suffix:
                    company_name = clean_company_suffix(company_name)

            # Append to respective lists
            ciks.append(int(item['cik_str']))
            tickers.append(item['ticker'])
            companies.append(company_name)

        if as_dataframe:
            # Create DataFrame directly from lists
            return pd.DataFrame({
                'cik': ciks,
                'ticker': tickers,
                'company': companies
            })

        # Create pyarrow arrays
        cik_array = pa.array(ciks, type=pa.int64())
        ticker_array = pa.array(tickers, type=pa.string())
        company_array = pa.array(companies, type=pa.string())

        # Create and return pyarrow Table
        return pa.Table.from_arrays(
            [cik_array, ticker_array, company_array],
            schema=SCHEMA
        )

    except Exception as e:
        log.error(f"Error fetching company tickers from [{company_tickers_json_url}]: {str(e)}")
        raise
|
||||
|
||||
def load_cik_tickers_from_local() -> Optional[str]:
    """
    Read the locally cached ticker.txt contents, or None when no cache exists.
    """
    cached = get_edgar_data_directory() / "reference" / os.path.basename(ticker_txt_url)
    # A missing reference directory also makes the file check fail.
    return cached.read_text() if cached.exists() else None
|
||||
|
||||
def get_cik_tickers_from_ticker_txt():
    """Load the ticker-to-CIK table from the SEC ticker.txt file.

    Prefers the local cache when EDGAR_USE_LOCAL_DATA is set, otherwise
    downloads. Returns a DataFrame with columns [ticker, cik], tickers
    uppercased, or None if fetching/parsing fails.
    """
    try:
        text = load_cik_tickers_from_local() if os.getenv("EDGAR_USE_LOCAL_DATA") else None
        if not text:
            # No usable local copy: fetch from the SEC.
            text = download_file(ticker_txt_url, as_text=True)
        frame = pd.read_csv(StringIO(text),
                            sep='\t',
                            header=None,
                            names=['ticker', 'cik']).dropna()
        frame['ticker'] = frame['ticker'].str.upper()
        return frame
    except Exception as e:
        log.error(f"Error fetching company tickers from [{ticker_txt_url}]: {str(e)}")
        return None
|
||||
|
||||
@lru_cache(maxsize=1)
def get_cik_tickers():
    """Combine ticker/CIK records from ticker.txt and company_tickers.json.

    Falls back to whichever source is available; raises when both fail.
    The merged result keeps one row per unique (ticker, cik) pair.
    """
    txt_frame = get_cik_tickers_from_ticker_txt()
    try:
        json_frame = get_company_tickers(clean_name=False, clean_suffix=False)[['ticker', 'cik']]
    except Exception:
        json_frame = None

    if txt_frame is None and json_frame is None:
        raise Exception("Both data sources are unavailable")
    if txt_frame is None:
        return json_frame
    if json_frame is None:
        return txt_frame

    # Both sources succeeded: union the rows and de-duplicate.
    combined = pd.concat([txt_frame, json_frame], ignore_index=True)
    return combined.drop_duplicates(subset=['ticker', 'cik'])
|
||||
|
||||
@lru_cache(maxsize=None)
def list_all_tickers():
    """Return every known ticker symbol as a plain list."""
    return list(get_cik_tickers()['ticker'])
|
||||
|
||||
|
||||
@lru_cache(maxsize=None)
def get_company_cik_lookup():
    """Build a ticker -> CIK dict, also mapping each base symbol.

    For share-class tickers like "BRK-B" the base symbol ("BRK") is added
    too, but only if it is not already taken by an earlier entry.
    """
    frame = get_cik_tickers()
    lookup = {}
    for symbol, cik in zip(frame['ticker'], frame['cik'], strict=False):
        lookup[symbol] = cik
        # First-come-first-served for the base symbol before '-'.
        root = symbol.split('-')[0]
        lookup.setdefault(root, cik)
    return lookup
|
||||
|
||||
|
||||
@lru_cache(maxsize=None)
def get_cik_ticker_lookup():
    """Create a mapping of CIK to base ticker symbols.

    For CIKs with multiple tickers, uses the shortest ticker (usually the
    base symbol).
    """
    cik_to_ticker = {}
    for symbol, cik in get_company_cik_lookup().items():
        # Strip any share-class suffix before comparing lengths.
        candidate = symbol.split('-')[0]
        current = cik_to_ticker.get(cik)
        if current is None or len(candidate) < len(current):
            cik_to_ticker[cik] = candidate
    return cik_to_ticker
|
||||
|
||||
|
||||
@lru_cache(maxsize=128)
def find_ticker(cik: Union[int, str]) -> str:
    """Find the ticker symbol for a given CIK.
    Returns empty string if no ticker is found.

    Args:
        cik: Central Index Key (CIK) as integer or string

    Returns:
        str: Ticker symbol or empty string if not found
    """
    try:
        # int() already tolerates leading zeros ("0000320193" -> 320193).
        # The previous str(cik).lstrip('0') approach raised ValueError for
        # CIK 0 / "0" because int("") is invalid.
        cik = int(cik)
    except (ValueError, TypeError):
        return ""
    return get_cik_ticker_lookup().get(cik, "")
|
||||
|
||||
|
||||
def find_ticker_safe(cik: Union[int, str]) -> Optional[str]:
    """Find the ticker symbol for a given CIK without making network calls.
    Returns None if data is not already cached and would require a network call.
    Returns empty string if CIK is found but has no ticker.

    This function is designed for use cases where network calls should be avoided,
    such as in rich display methods that should be fast and not block on I/O.

    Args:
        cik: Central Index Key (CIK) as integer or string

    Returns:
        Optional[str]: Ticker symbol, empty string if no ticker found, or None if network call would be required
    """
    try:
        # Simple approach: check if all required cache functions have data.
        # cache_info().currsize > 0 means each lru_cache already holds a result,
        # so calling the function again is a pure in-memory lookup (no I/O).
        if (get_cik_ticker_lookup.cache_info().currsize > 0 and
            get_company_cik_lookup.cache_info().currsize > 0 and
            get_cik_tickers.cache_info().currsize > 0):

            # If we have cached data, try to use it
            # NOTE(review): int(str(cik).lstrip('0')) raises for CIK "0"/0;
            # the broad except below converts that to None.
            cik = int(str(cik).lstrip('0'))

            # This should be fast since data is cached
            lookup_dict = get_cik_ticker_lookup()
            return lookup_dict.get(cik, "")
        else:
            # Not all required data is cached, return None to avoid network calls
            return None

    except Exception:
        # Any error (including potential network errors) returns None
        # This ensures we never trigger network calls
        return None
|
||||
|
||||
@lru_cache(maxsize=None)
def get_company_ticker_name_exchange():
    """
    Return a DataFrame with columns [cik name ticker exchange].

    Uses the module-level company_tickers_exchange_url constant instead of
    repeating the URL literal (the original hard-coded the same URL twice).
    """
    data = download_json(company_tickers_exchange_url)
    return pd.DataFrame(data['data'], columns=data['fields'])
|
||||
|
||||
|
||||
def get_companies_by_exchange(exchange: Union[List[str], str]):
    """
    Get companies listed on a specific exchange.

    :param exchange: String, like 'Nasdaq' or 'NYSE', or a list of such names
    :return: DataFrame with companies listed on the specified exchange
             with columns [cik name ticker exchange]
    """
    # Case-insensitive match against one or more exchange names.
    wanted = {name.lower() for name in listify(exchange)}
    companies = get_company_ticker_name_exchange()
    mask = companies['exchange'].str.lower().isin(wanted)
    return companies[mask].reset_index(drop=True)
|
||||
|
||||
|
||||
@lru_cache(maxsize=None)
def get_mutual_fund_tickers():
    """
    Get mutual fund tickers.

    This returns a dataframe with columns
        cik seriesId classId ticker

    Uses the module-level mutual_fund_tickers_url constant instead of
    repeating the URL literal (the original hard-coded the same URL twice).
    """
    data = download_json(mutual_fund_tickers_url)
    return pd.DataFrame(data['data'], columns=['cik', 'seriesId', 'classId', 'ticker'])
|
||||
|
||||
|
||||
@lru_cache(maxsize=None)
def get_mutual_fund_lookup():
    """Build and cache a ticker -> CIK dict for mutual funds and ETFs."""
    frame = get_mutual_fund_tickers()
    return dict(zip(frame['ticker'], frame['cik'], strict=False))
|
||||
|
||||
|
||||
def find_mutual_fund_cik(ticker):
    """
    Find the CIK for a given mutual fund or ETF ticker.

    :param ticker: String, the ticker symbol to look up (case-insensitive)
    :return: Integer, the CIK for the given ticker, or None if not found
    """
    return get_mutual_fund_lookup().get(ticker.upper())
|
||||
|
||||
|
||||
def find_company_cik(ticker):
    """Look up a company CIK by ticker, normalizing '.' share-class separators to '-'."""
    normalized = ticker.upper().replace('.', '-')
    return get_company_cik_lookup().get(normalized)
|
||||
|
||||
def find_company_ticker(cik: Union[int, str]) -> Union[str, List[str], None]:
    """
    Find the ticker for a given CIK.

    :param cik (int or str): The CIK to look up
    :return Union[str, List[str]]: A single ticker string if only one ticker is found,
                                   a list of ticker strings if multiple tickers are found,
                                   or None if the CIK is invalid or has no tickers.
    """
    try:
        # int() already tolerates leading zeros ("0000320193" -> 320193);
        # the previous lstrip('0')-then-int approach raised on CIK 0 / "0"
        # because int("") is invalid.
        cik = int(cik)
    except (ValueError, TypeError):
        return None

    # Get DataFrame of CIK-Ticker mappings
    df = get_cik_tickers()

    # Ensure 'cik' and 'ticker' columns exist
    if 'cik' not in df.columns or 'ticker' not in df.columns:
        return None

    # Filter DataFrame for the given CIK
    ticker_series = df[df['cik'] == cik]['ticker']
    if ticker_series.empty:
        return None

    # Filter out None values from tickers
    tickers = [ticker for ticker in ticker_series.to_numpy() if ticker is not None]

    # Return a single ticker if only one found, otherwise the full list
    if len(tickers) == 1:
        return tickers[0]
    return tickers
|
||||
|
||||
def find_cik(ticker):
    """
    Find the CIK for a given ticker, checking both company and mutual fund/ETF data.

    :param ticker: String, the ticker symbol to look up
    :return: Integer, the CIK for the given ticker, or None if not found
    """
    company_cik = find_company_cik(ticker)
    # Companies take precedence; fall back to fund/ETF data only on a miss.
    return company_cik if company_cik is not None else find_mutual_fund_cik(ticker)
|
||||
|
||||
|
||||
@lru_cache(maxsize=128)
def get_ticker_from_cusip(cusip: str):
    """
    Get the ticker symbol for a given Cusip.

    Returns None for an unknown CUSIP (the original let DataFrame.loc raise
    KeyError in that case).
    """
    data = cusip_ticker_mapping()
    if cusip not in data.index:
        # Unknown CUSIP: report "not found" instead of raising KeyError.
        return None
    results = data.loc[cusip]
    if len(results) == 1:
        # Single match: .loc returned the row as a Series whose only field is the ticker.
        return results.iloc[0]
    elif len(results) > 1:
        # Duplicate CUSIPs: .loc returned a DataFrame; take the first row's Ticker.
        return results.iloc[0].Ticker
|
||||
|
||||
|
||||
def clean_company_name(name: str) -> str:
    """Return *name* with any trailing state-of-incorporation marker removed.

    Markers look like "/DE" or "\\MD\\" at the end of SEC company names.
    """
    return re.sub(r'[/\\][A-Z]+[/\\]?$', '', name).strip()
|
||||
|
||||
|
||||
def clean_company_suffix(name: str) -> str:
    """Remove common suffixes from the company name, taking care of special cases."""
    # Drop trailing slashes left over from SEC name formatting.
    without_slash = name.rstrip('/')
    # "& CO" style names, e.g. "JPMORGAN CHASE & CO" or "ELI LILLY & Co".
    no_ampersand_co = re.sub(r'\s*&\s*CO\b\.?', '', without_slash, flags=re.IGNORECASE).strip()
    # Other common trailing suffixes: PLC, LTD, LIMITED, CORP, Inc, L.P., ...
    return re.sub(r'\b(?:Inc\.?|CO|CORP|PLC|LTD|LIMITED|L\.P\.)\b\.?$', '', no_ampersand_co, flags=re.IGNORECASE).strip()
|
||||
|
||||
|
||||
def get_ticker_icon_url(ticker: str) -> str:
    """
    Build the raw GitHub URL of the PNG icon for *ticker* (nvstly/icons repo).
    """
    symbol = ticker.upper()
    return f"https://raw.githubusercontent.com/nvstly/icons/main/ticker_icons/{symbol}.png"
|
||||
|
||||
@lru_cache(maxsize=4)
def get_icon_from_ticker(ticker: str) -> Optional[bytes]:
    """
    Download an icon for a given ticker as a PNG image, if available.

    Returns the raw PNG bytes, or None when the icon does not exist (HTTP 404).

    WARNING: This function uses the nvstly/icons repository on GitHub to fetch the icons.
    The icons are not guaranteed to be available for all tickers.

    Raises:
        ValueError: if ticker is not a purely alphabetic string.
        HTTPStatusError: for any HTTP failure other than 404.
    """

    if not isinstance(ticker, str):
        raise ValueError("The ticker must be a valid string.")

    if not ticker.isalpha():
        raise ValueError("The ticker must only contain alphabetic characters.")

    try:
        # Reuse get_ticker_icon_url so the icon location is defined in one
        # place (the original duplicated the URL literal).
        return download_file(get_ticker_icon_url(ticker), as_text=False)
    except HTTPStatusError as e:
        # If the status code is 404, the icon is not available
        if e.response.status_code == 404:
            return None
        raise
|
||||
|
||||
def popular_us_stocks():
    """Return the packaged table of popular US stocks, indexed by CIK."""
    return read_csv_from_package('popular_us_stocks.csv', dtype={'Cik': int}).set_index('Cik')
|
||||
|
||||
class Exchange(Enum):
    """Stock exchanges distinguished in the SEC company ticker data."""

    Nasdaq = "Nasdaq"
    NYSE = "NYSE"
    OTC = "OTC"
    CBOE = "CBOE"

    def __str__(self) -> str:
        # Render as the plain exchange name rather than "Exchange.NYSE".
        return self.value
|
||||
|
||||
|
||||
Reference in New Issue
Block a user