"""
|
|
Company Dataset Builder for EdgarTools
|
|
|
|
Builds high-performance company datasets from SEC submissions data with two output formats:
|
|
1. PyArrow Parquet (5-20 MB) - Fast filtering with PyArrow compute API
|
|
2. DuckDB (287 MB) - Optional SQL interface for power users
|
|
|
|
Performance:
|
|
- Build time: ~30 seconds (optimized with orjson + company filtering)
|
|
- Records: ~562,413 companies (40% individual filers filtered)
|
|
- Query speed: <1ms (DuckDB) or <100ms (Parquet)
|
|
|
|
Example:
|
|
>>> from edgar.reference import get_company_dataset
|
|
>>> import pyarrow.compute as pc
|
|
>>>
|
|
>>> # Load dataset (builds on first use)
|
|
>>> companies = get_company_dataset()
|
|
>>>
|
|
>>> # Filter pharmaceutical companies
|
|
>>> pharma = companies.filter(pc.field('sic').between(2834, 2836))
|
|
>>> print(f"Found {len(pharma)} pharma companies")
|
|
"""
|
|
|
|

from pathlib import Path
from typing import Optional, Union
import logging

import pyarrow as pa
import pyarrow.parquet as pq
from tqdm import tqdm

from edgar.core import get_edgar_data_directory, log

# Try to import orjson for performance, fall back to stdlib json
try:
    import orjson

    def load_json(path: Path) -> dict:
        """Load JSON file using orjson (1.55x faster)"""
        return orjson.loads(path.read_bytes())

    JSON_PARSER = "orjson"
except ImportError:
    import json

    def load_json(path: Path) -> dict:
        """Load JSON file using stdlib json"""
        with open(path, 'r', encoding='utf-8') as f:
            return json.load(f)

    JSON_PARSER = "json (stdlib)"


# Company dataset schema
COMPANY_SCHEMA = pa.schema([
    ('cik', pa.string()),              # Keep as string to preserve leading zeros
    ('name', pa.string()),
    ('sic', pa.int32()),               # Nullable - some companies have no SIC
    ('sic_description', pa.string()),
    ('tickers', pa.string()),          # Pipe-delimited (e.g., "AAPL|APPLE")
    ('exchanges', pa.string()),        # Pipe-delimited (e.g., "Nasdaq|NYSE")
    ('state_of_incorporation', pa.string()),
    ('state_of_incorporation_description', pa.string()),
    ('fiscal_year_end', pa.string()),  # MMDD format
    ('entity_type', pa.string()),
    ('ein', pa.string()),
])
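
# Note: 'tickers' and 'exchanges' are stored as pipe-delimited strings, so the
# simplest way to filter on them after loading is a substring match. A minimal,
# illustrative sketch (not part of the build pipeline; 'table' stands for a loaded
# companies table):
#
#     import pyarrow.compute as pc
#     nasdaq_listed = table.filter(pc.match_substring(table['exchanges'], 'Nasdaq'))
#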


def is_individual_from_json(data: dict) -> bool:
    """
    Determine if entity is an individual filer vs a company.

    Uses the same logic as edgar.entity.data:478 (is_individual property).

    Companies typically have:
    - Tickers or exchanges
    - State of incorporation
    - Entity type other than '' or 'other'
    - Company-specific filings (10-K, 10-Q, 8-K, etc.)

    Args:
        data: Parsed JSON submission data

    Returns:
        True if individual filer, False if company

    Example:
        >>> data = {'cik': '0001318605', 'tickers': ['TSLA']}
        >>> is_individual_from_json(data)
        False

        >>> data = {'cik': '0001078519', 'name': 'JOHN DOE'}
        >>> is_individual_from_json(data)
        True
    """
    # Has ticker or exchange → company
    if data.get('tickers') or data.get('exchanges'):
        return False

    # Has state of incorporation → company (with exceptions)
    state = data.get('stateOfIncorporation', '')
    if state and state != '':
        # Reed Hastings exception (individual with state of incorporation)
        if data.get('cik') == '0001033331':
            return True
        return False

    # Has entity type (not '' or 'other') → company
    entity_type = data.get('entityType', '')
    if entity_type and entity_type not in ['', 'other']:
        return False

    # Files company forms (10-K, 10-Q, etc.) → company
    filings = data.get('filings', {})
    if filings:
        recent = filings.get('recent', {})
        forms = recent.get('form', [])
        company_forms = {'10-K', '10-Q', '8-K', '10-K/A', '10-Q/A', '20-F', 'S-1'}
        if any(form in company_forms for form in forms):
            return False

    # Default: individual
    return True


def build_company_dataset_parquet(
    submissions_dir: Path,
    output_path: Path,
    filter_individuals: bool = True,
    show_progress: bool = True
) -> pa.Table:
    """
    Build PyArrow Parquet dataset from submissions directory (companies only).

    This function processes all CIK*.json files in the submissions directory,
    filters out individual filers (optional), and creates a compressed Parquet file.

    Performance:
    - ~30 seconds for 562,413 companies (with orjson + filtering)
    - Output size: ~5-20 MB (zstd compressed)
    - Memory usage: ~100-200 MB during build

    Args:
        submissions_dir: Directory containing CIK*.json files
        output_path: Where to save the .pq file
        filter_individuals: Skip individual filers (default: True)
        show_progress: Show progress bar (default: True)

    Returns:
        PyArrow Table with company data

    Raises:
        FileNotFoundError: If submissions_dir doesn't exist

    Example:
        >>> from pathlib import Path
        >>> submissions_dir = Path.home() / '.edgar' / 'submissions'
        >>> output_path = Path.home() / '.edgar' / 'companies.pq'
        >>> table = build_company_dataset_parquet(submissions_dir, output_path)
        >>> print(f"Built dataset: {len(table):,} companies")
    """
    if not submissions_dir.exists():
        raise FileNotFoundError(
            f"Submissions directory not found: {submissions_dir}\n\n"
            "Please download submissions data first:\n"
            "  from edgar.storage import download_submissions\n"
            "  download_submissions()\n"
        )

    # Get all submission JSON files
    json_files = list(submissions_dir.glob("CIK*.json"))
    if len(json_files) == 0:
        raise FileNotFoundError(
            f"No submission files found in: {submissions_dir}\n"
            "Expected CIK*.json files"
        )

    log.info(f"Building company dataset from {len(json_files):,} submission files")
    log.info(f"Using JSON parser: {JSON_PARSER}")

    companies = []
    errors = 0
    individuals_skipped = 0

    # Process each file with progress bar
    iterator = tqdm(json_files, desc="Processing submissions", disable=not show_progress)

    for json_file in iterator:
        try:
            data = load_json(json_file)

            # Skip individuals if filtering enabled
            if filter_individuals and is_individual_from_json(data):
                individuals_skipped += 1
                continue

            # Extract SIC (handle empty strings)
            sic = data.get('sic')
            sic_int = int(sic) if sic and sic != '' else None

            # Extract tickers and exchanges (filter None values)
            tickers = data.get('tickers', [])
            exchanges = data.get('exchanges', [])

            companies.append({
                'cik': data.get('cik'),
                'name': data.get('name'),
                'sic': sic_int,
                'sic_description': data.get('sicDescription'),
                'tickers': '|'.join(filter(None, tickers)) if tickers else None,
                'exchanges': '|'.join(filter(None, exchanges)) if exchanges else None,
                'state_of_incorporation': data.get('stateOfIncorporation'),
                'state_of_incorporation_description': data.get('stateOfIncorporationDescription'),
                'fiscal_year_end': data.get('fiscalYearEnd'),
                'entity_type': data.get('entityType'),
                'ein': data.get('ein'),
            })

        except Exception as e:
            errors += 1
            log.debug(f"Error processing {json_file.name}: {e}")
            continue

    # Log statistics
    log.info(f"Processed {len(json_files):,} files:")
    log.info(f"  - Companies: {len(companies):,}")
    if filter_individuals:
        log.info(f"  - Individuals skipped: {individuals_skipped:,}")
    if errors > 0:
        log.warning(f"  - Errors: {errors:,}")

    # Create PyArrow Table
    table = pa.Table.from_pylist(companies, schema=COMPANY_SCHEMA)

    # Write to Parquet with compression
    output_path.parent.mkdir(parents=True, exist_ok=True)
    pq.write_table(
        table,
        output_path,
        compression='zstd',
        compression_level=9,
        use_dictionary=True
    )

    file_size_mb = output_path.stat().st_size / (1024 * 1024)
    log.info(f"Saved Parquet file: {output_path} ({file_size_mb:.1f} MB)")

    return table
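
# A small illustrative follow-up (not called by the builder): because the output is a
# single zstd-compressed Parquet file, downstream code can project only the columns it
# needs rather than loading the full table. Using the default cache path that
# get_company_dataset() writes to:
#
#     subset = pq.read_table(Path.home() / '.edgar' / 'companies.pq',
#                            columns=['cik', 'name', 'sic'])
#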


def build_company_dataset_duckdb(
    submissions_dir: Path,
    output_path: Path,
    filter_individuals: bool = True,
    create_indexes: bool = True,
    show_progress: bool = True
) -> None:
    """
    Build DuckDB database from submissions directory (companies only).

    This function creates a DuckDB database with a 'companies' table and
    optional indexes on key columns for fast querying.

    Performance:
    - ~30 seconds for 562,413 companies (with orjson + filtering)
    - Output size: ~287 MB
    - Query speed: <1ms with indexes

    Args:
        submissions_dir: Directory containing CIK*.json files
        output_path: Where to save the .duckdb file
        filter_individuals: Skip individual filers (default: True)
        create_indexes: Create indexes on cik, sic, name (default: True)
        show_progress: Show progress bar (default: True)

    Raises:
        FileNotFoundError: If submissions_dir doesn't exist
        ImportError: If duckdb package not installed

    Example:
        >>> from pathlib import Path
        >>> submissions_dir = Path.home() / '.edgar' / 'submissions'
        >>> output_path = Path.home() / '.edgar' / 'companies.duckdb'
        >>> build_company_dataset_duckdb(submissions_dir, output_path)
        >>>
        >>> import duckdb
        >>> con = duckdb.connect(str(output_path))
        >>> result = con.execute("SELECT COUNT(*) FROM companies").fetchone()
        >>> print(f"Companies: {result[0]:,}")
    """
    try:
        import duckdb
    except ImportError:
        raise ImportError(
            "DuckDB export requires duckdb package.\n"
            "Install with: pip install duckdb"
        )

    if not submissions_dir.exists():
        raise FileNotFoundError(
            f"Submissions directory not found: {submissions_dir}\n\n"
            "Please download submissions data first:\n"
            "  from edgar.storage import download_submissions\n"
            "  download_submissions()\n"
        )

    # Get all submission JSON files
    json_files = list(submissions_dir.glob("CIK*.json"))
    if len(json_files) == 0:
        raise FileNotFoundError(
            f"No submission files found in: {submissions_dir}\n"
            "Expected CIK*.json files"
        )

    log.info(f"Building DuckDB database from {len(json_files):,} submission files")
    log.info(f"Using JSON parser: {JSON_PARSER}")

    companies = []
    errors = 0
    individuals_skipped = 0

    # Process each file with progress bar
    iterator = tqdm(json_files, desc="Processing submissions", disable=not show_progress)

    for json_file in iterator:
        try:
            data = load_json(json_file)

            # Skip individuals if filtering enabled
            if filter_individuals and is_individual_from_json(data):
                individuals_skipped += 1
                continue

            # Extract SIC (handle empty strings)
            sic = data.get('sic')
            sic_int = int(sic) if sic and sic != '' else None

            # Extract tickers and exchanges (filter None values)
            tickers = data.get('tickers', [])
            exchanges = data.get('exchanges', [])

            companies.append({
                'cik': data.get('cik'),
                'name': data.get('name'),
                'sic': sic_int,
                'sic_description': data.get('sicDescription'),
                'tickers': '|'.join(filter(None, tickers)) if tickers else None,
                'exchanges': '|'.join(filter(None, exchanges)) if exchanges else None,
                'state_of_incorporation': data.get('stateOfIncorporation'),
                'state_of_incorporation_description': data.get('stateOfIncorporationDescription'),
                'fiscal_year_end': data.get('fiscalYearEnd'),
                'entity_type': data.get('entityType'),
                'ein': data.get('ein'),
            })

        except Exception as e:
            errors += 1
            log.debug(f"Error processing {json_file.name}: {e}")
            continue

    # Log statistics
    log.info(f"Processed {len(json_files):,} files:")
    log.info(f"  - Companies: {len(companies):,}")
    if filter_individuals:
        log.info(f"  - Individuals skipped: {individuals_skipped:,}")
    if errors > 0:
        log.warning(f"  - Errors: {errors:,}")

    # Create DuckDB database
    import pandas as pd

    output_path.parent.mkdir(parents=True, exist_ok=True)
    con = duckdb.connect(str(output_path))

    # Create table from DataFrame (DuckDB resolves the local 'df' via replacement scan)
    df = pd.DataFrame(companies)
    con.execute("CREATE TABLE companies AS SELECT * FROM df")

    # Create indexes
    if create_indexes:
        log.info("Creating indexes...")
        con.execute("CREATE INDEX idx_cik ON companies(cik)")
        con.execute("CREATE INDEX idx_sic ON companies(sic)")
        con.execute("CREATE INDEX idx_name ON companies(name)")

    # Add metadata table (plain COUNT over the CASE expressions counts matching rows)
    con.execute("""
        CREATE TABLE metadata AS
        SELECT
            CURRENT_TIMESTAMP as created_at,
            COUNT(*) as total_companies,
            COUNT(DISTINCT sic) as unique_sic_codes,
            COUNT(CASE WHEN tickers IS NOT NULL THEN 1 END) as companies_with_tickers,
            COUNT(CASE WHEN exchanges IS NOT NULL THEN 1 END) as companies_with_exchanges
        FROM companies
    """)

    con.close()

    file_size_mb = output_path.stat().st_size / (1024 * 1024)
    log.info(f"Saved DuckDB database: {output_path} ({file_size_mb:.1f} MB)")
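
# Illustrative only (not executed by the builder): once the database exists, the
# 'metadata' table gives a quick summary of what was captured, e.g.:
#
#     import duckdb
#     con = duckdb.connect(str(Path.home() / '.edgar' / 'companies.duckdb'))
#     print(con.execute("SELECT * FROM metadata").fetchdf())
#     con.close()
#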


def load_company_dataset_parquet(parquet_path: Path) -> pa.Table:
    """
    Load company dataset from Parquet file.

    This is a simple wrapper around pyarrow.parquet.read_table() with
    logging for consistency.

    Performance: <100ms for typical dataset

    Args:
        parquet_path: Path to .pq file

    Returns:
        PyArrow Table with company data

    Raises:
        FileNotFoundError: If parquet_path doesn't exist

    Example:
        >>> from pathlib import Path
        >>> path = Path.home() / '.edgar' / 'companies.pq'
        >>> companies = load_company_dataset_parquet(path)
        >>> print(f"Loaded {len(companies):,} companies")
    """
    if not parquet_path.exists():
        raise FileNotFoundError(f"Parquet file not found: {parquet_path}")

    table = pq.read_table(parquet_path)
    log.debug(f"Loaded {len(table):,} companies from {parquet_path}")

    return table


def to_duckdb(
    parquet_path: Path,
    duckdb_path: Path,
    create_indexes: bool = True
) -> None:
    """
    Convert Parquet dataset to DuckDB database.

    This provides an easy way to export the Parquet dataset to DuckDB
    for users who want SQL query capabilities.

    Performance: <5 seconds for typical dataset

    Args:
        parquet_path: Path to source .pq file
        duckdb_path: Path to output .duckdb file
        create_indexes: Create indexes on key columns (default: True)

    Example:
        >>> from pathlib import Path
        >>> parquet_path = Path.home() / '.edgar' / 'companies.pq'
        >>> duckdb_path = Path.home() / '.edgar' / 'companies.duckdb'
        >>> to_duckdb(parquet_path, duckdb_path)
        >>>
        >>> import duckdb
        >>> con = duckdb.connect(str(duckdb_path))
        >>> result = con.execute(
        ...     "SELECT * FROM companies WHERE sic = 2834"
        ... ).fetchdf()
    """
    try:
        import duckdb
    except ImportError:
        raise ImportError(
            "DuckDB export requires duckdb package.\n"
            "Install with: pip install duckdb"
        )

    if not parquet_path.exists():
        raise FileNotFoundError(f"Parquet file not found: {parquet_path}")

    log.info(f"Converting Parquet to DuckDB: {parquet_path} -> {duckdb_path}")

    # Read Parquet file and convert to pandas
    table = pq.read_table(parquet_path)
    import pandas as pd
    df = table.to_pandas()

    # Create DuckDB database
    duckdb_path.parent.mkdir(parents=True, exist_ok=True)
    con = duckdb.connect(str(duckdb_path))

    # Create table from DataFrame (DuckDB resolves the local 'df' via replacement scan)
    con.execute("CREATE TABLE companies AS SELECT * FROM df")

    # Create indexes
    if create_indexes:
        log.info("Creating indexes...")
        con.execute("CREATE INDEX idx_cik ON companies(cik)")
        con.execute("CREATE INDEX idx_sic ON companies(sic)")
        con.execute("CREATE INDEX idx_name ON companies(name)")

    # Add metadata (plain COUNT over the CASE expressions counts matching rows)
    con.execute("""
        CREATE TABLE metadata AS
        SELECT
            CURRENT_TIMESTAMP as created_at,
            COUNT(*) as total_companies,
            COUNT(DISTINCT sic) as unique_sic_codes,
            COUNT(CASE WHEN tickers IS NOT NULL THEN 1 END) as companies_with_tickers,
            COUNT(CASE WHEN exchanges IS NOT NULL THEN 1 END) as companies_with_exchanges
        FROM companies
    """)

    con.close()

    file_size_mb = duckdb_path.stat().st_size / (1024 * 1024)
    log.info(f"Exported to DuckDB: {duckdb_path} ({file_size_mb:.1f} MB)")
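
# Worth noting (an alternative sketch, not used by this module): DuckDB can also query
# the Parquet file directly, without converting it to a .duckdb database first:
#
#     import duckdb
#     pq_path = Path.home() / '.edgar' / 'companies.pq'
#     duckdb.sql(f"SELECT name, tickers FROM read_parquet('{pq_path}') WHERE sic = 2834").show()
#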


# In-memory cache for dataset
_CACHE = {}


def get_company_dataset(rebuild: bool = False) -> pa.Table:
    """
    Get company dataset, building from submissions if needed.

    This function checks for a cached dataset at ~/.edgar/companies.pq.
    If not found, it automatically builds the dataset from submissions data.

    On first use, this will take ~30 seconds to build the dataset. Subsequent
    calls load from cache in <100ms.

    Args:
        rebuild: Force rebuild even if cache exists (default: False)

    Returns:
        PyArrow Table with company data (~562,413 companies)

    Raises:
        FileNotFoundError: If submissions directory not found or incomplete

    Performance:
    - First use: ~30 seconds (builds dataset)
    - Cached: <100ms (loads from disk)
    - Memory: ~20-50 MB

    Example:
        >>> from edgar.reference import get_company_dataset
        >>> import pyarrow.compute as pc
        >>>
        >>> # First call builds dataset (takes ~30s)
        >>> companies = get_company_dataset()
        >>> print(f"Loaded {len(companies):,} companies")
        >>>
        >>> # Subsequent calls are fast (<100ms)
        >>> companies = get_company_dataset()
        >>>
        >>> # Filter pharmaceutical companies (SIC 2834-2836)
        >>> pharma = companies.filter(
        ...     pc.field('sic').between(2834, 2836)
        ... )
        >>> print(f"Found {len(pharma)} pharma companies")
        >>>
        >>> # Filter by exchange
        >>> nasdaq = companies.filter(
        ...     pc.field('exchanges').contains('Nasdaq')
        ... )
        >>>
        >>> # Force rebuild with latest data
        >>> companies = get_company_dataset(rebuild=True)
    """
    # Check in-memory cache first
    if not rebuild and 'companies' in _CACHE:
        return _CACHE['companies']

    # Check disk cache
    cache_path = get_edgar_data_directory() / 'companies.pq'

    if cache_path.exists() and not rebuild:
        # Load from cache
        log.info(f"Loading company dataset from cache: {cache_path}")
        table = load_company_dataset_parquet(cache_path)
        _CACHE['companies'] = table
        return table

    # Need to build dataset
    log.info("Building company dataset from submissions (this may take ~30 seconds)...")

    submissions_dir = get_edgar_data_directory() / 'submissions'
    if not submissions_dir.exists() or len(list(submissions_dir.glob('CIK*.json'))) < 100000:
        raise FileNotFoundError(
            f"Submissions directory not found or incomplete: {submissions_dir}\n\n"
            "Please download submissions data first:\n"
            "  from edgar.storage import download_submissions\n"
            "  download_submissions()\n\n"
            "This is a one-time download (~500 MB compressed)."
        )

    # Build dataset
    table = build_company_dataset_parquet(
        submissions_dir,
        cache_path,
        filter_individuals=True
    )

    log.info(f"✅ Built dataset: {len(table):,} companies, cached at {cache_path}")

    _CACHE['companies'] = table
    return table
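

# A minimal, hypothetical smoke-test entry point (not part of the original module; added
# purely for illustration). It builds or loads the dataset and shows where the optional
# DuckDB export would slot in, using only functions defined above.
if __name__ == "__main__":
    companies = get_company_dataset()
    print(f"Company dataset ready: {len(companies):,} rows")

    # Uncomment to also produce the optional DuckDB export:
    # to_duckdb(
    #     get_edgar_data_directory() / 'companies.pq',
    #     get_edgar_data_directory() / 'companies.duckdb',
    # )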