Initial commit
This commit is contained in:
109
venv/lib/python3.10/site-packages/edgar/reference/__init__.py
Normal file
109
venv/lib/python3.10/site-packages/edgar/reference/__init__.py
Normal file
@@ -0,0 +1,109 @@
|
||||
|
||||
from edgar.reference.company_subsets import (
|
||||
# Classes and Enums
|
||||
CompanySubset,
|
||||
MarketCapTier,
|
||||
PopularityTier,
|
||||
# Core Functions
|
||||
get_all_companies,
|
||||
get_companies_by_exchanges,
|
||||
get_popular_companies,
|
||||
# Industry and State Filtering (Comprehensive Mode)
|
||||
get_companies_by_industry,
|
||||
get_companies_by_state,
|
||||
# Sampling and Filtering
|
||||
get_random_sample,
|
||||
get_stratified_sample,
|
||||
get_top_companies_by_metric,
|
||||
filter_companies,
|
||||
exclude_companies,
|
||||
# Set Operations
|
||||
combine_company_sets,
|
||||
intersect_company_sets,
|
||||
# Convenience Functions - General
|
||||
get_faang_companies,
|
||||
get_tech_giants,
|
||||
get_dow_jones_sample,
|
||||
# Convenience Functions - Industry Specific
|
||||
get_pharmaceutical_companies,
|
||||
get_biotechnology_companies,
|
||||
get_software_companies,
|
||||
get_semiconductor_companies,
|
||||
get_banking_companies,
|
||||
get_investment_companies,
|
||||
get_insurance_companies,
|
||||
get_real_estate_companies,
|
||||
get_oil_gas_companies,
|
||||
get_retail_companies,
|
||||
)
|
||||
from edgar.reference.company_dataset import (
|
||||
get_company_dataset,
|
||||
build_company_dataset_parquet,
|
||||
build_company_dataset_duckdb,
|
||||
is_individual_from_json,
|
||||
to_duckdb,
|
||||
)
|
||||
from edgar.reference.forms import describe_form
|
||||
from edgar.reference.tickers import cusip_ticker_mapping, get_icon_from_ticker, get_ticker_from_cusip
|
||||
|
||||
# A dict of state abbreviations and their full names
|
||||
states = {
|
||||
|
||||
"AL": "Alabama",
|
||||
"AK": "Alaska",
|
||||
"AZ": "Arizona",
|
||||
"AR": "Arkansas",
|
||||
"CA": "California",
|
||||
"CO": "Colorado",
|
||||
"CT": "Connecticut",
|
||||
"DE": "Delaware",
|
||||
"FL": "Florida",
|
||||
"GA": "Georgia",
|
||||
"HI": "Hawaii",
|
||||
"ID": "Idaho",
|
||||
"IL": "Illinois",
|
||||
"IN": "Indiana",
|
||||
"IA": "Iowa",
|
||||
"KS": "Kansas",
|
||||
"KY": "Kentucky",
|
||||
"LA": "Louisiana",
|
||||
"ME": "Maine",
|
||||
"MD": "Maryland",
|
||||
"MA": "Massachusetts",
|
||||
"MI": "Michigan",
|
||||
"MN": "Minnesota",
|
||||
"MS": "Mississippi",
|
||||
"MO": "Missouri",
|
||||
"MT": "Montana",
|
||||
"NE": "Nebraska",
|
||||
"NV": "Nevada",
|
||||
"NH": "New Hampshire",
|
||||
"NJ": "New Jersey",
|
||||
"NM": "New Mexico",
|
||||
"NY": "New York",
|
||||
"NC": "North Carolina",
|
||||
"ND": "North Dakota",
|
||||
"OH": "Ohio",
|
||||
"OK": "Oklahoma",
|
||||
"OR": "Oregon",
|
||||
"PA": "Pennsylvania",
|
||||
"RI": "Rhode Island",
|
||||
"SC": "South Carolina",
|
||||
"SD": "South Dakota",
|
||||
"TN": "Tennessee",
|
||||
"TX": "Texas",
|
||||
"UT": "Utah",
|
||||
"VT": "Vermont",
|
||||
"VA": "Virginia",
|
||||
"WA": "Washington",
|
||||
"WV": "West Virginia",
|
||||
"WI": "Wisconsin",
|
||||
"WY": "Wyoming",
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
111
venv/lib/python3.10/site-packages/edgar/reference/_codes.py
Normal file
111
venv/lib/python3.10/site-packages/edgar/reference/_codes.py
Normal file
@@ -0,0 +1,111 @@
|
||||
ACRONYMS = {
|
||||
"CCC": "CIK Confirmation Code",
|
||||
"CIK": "Central Index Key",
|
||||
"EDGAR": "Electronic Data Gathering, Analysis, and Retrieval",
|
||||
"SEC": "Securities and Exchange Commission",
|
||||
}
|
||||
|
||||
INVESTMENT_CATEGORIES = {
|
||||
"ABS": "Asset-backed securities",
|
||||
"ACMO": "Agency collateralized mortgage obligations",
|
||||
"ACMBS": "Agency debentures and agency strips",
|
||||
"AMBS": "Agency mortgage-backed securities",
|
||||
"UST": " U.S. Treasuries (including strips)",
|
||||
"N/A": "Not applicable"
|
||||
}
|
||||
|
||||
ISO_STATES_AND_OUTLYING_AREAS = {
|
||||
"US-AL": "ALABAMA",
|
||||
"US-AK": "ALASKA",
|
||||
"US-AZ": "ARIZONA",
|
||||
"US-AR": "ARKANSAS",
|
||||
"US-CA": "CALIFORNIA",
|
||||
"US-CO": "COLORADO",
|
||||
"US-CT": "CONNECTICUT",
|
||||
"US-DE": "DELAWARE",
|
||||
"US-DC": "DISTRICT OF COLUMBIA",
|
||||
|
||||
}
|
||||
|
||||
ISO_COUNTRY_CODES = {
|
||||
"AF": " AFGHANISTAN",
|
||||
"AX": "ALAND ISLANDS",
|
||||
"AL": "ALBANIA",
|
||||
"DZ": "ALGERIA",
|
||||
"AS": "AMERICAN SAMOA",
|
||||
"AD": "ANDORRA",
|
||||
"AO": "ANGOLA",
|
||||
"AI": "ANGUILLA",
|
||||
"AQ": "ANTARCTICA",
|
||||
"AG": "ANTIGUA AND BARBUDA",
|
||||
"AR": "ARGENTINA",
|
||||
"AM": "ARMENIA",
|
||||
"AW": "ARUBA",
|
||||
"AU": "AUSTRALIA",
|
||||
"AT": "AUSTRIA",
|
||||
"AZ": "AZERBAIJAN",
|
||||
"BS": "BAHAMAS",
|
||||
"BH": "BAHRAIN",
|
||||
"BD": "BANGLADESH",
|
||||
"BB": "BARBADOS",
|
||||
"BY": "BELARUS",
|
||||
"BE": "BELGIUM",
|
||||
"BZ": "BELIZE",
|
||||
"BJ": "BENIN",
|
||||
"BM": "BERMUDA",
|
||||
"BT": "BHUTAN",
|
||||
"BO": "BOLIVIA (PLURINATIONAL STATE OF)",
|
||||
"BQ": "BONAIRE, SINT EUSTATIUS AND SABA",
|
||||
"BA": "BOSNIA AND HERZEGOVINA",
|
||||
"BW": "BOTSWANA",
|
||||
"BV": "BOUVET ISLAND",
|
||||
"BR": "BRAZIL",
|
||||
"IO": "BRITISH INDIAN OCEAN TERRITORY",
|
||||
"BN": "BRUNEI DARUSSALAM",
|
||||
"BG": "BULGARIA",
|
||||
"BF": "BURKINA FASO",
|
||||
"BI": "BURUNDI",
|
||||
"CV": "CABO VERDE",
|
||||
"KH": "CAMBODIA",
|
||||
"CM": "CAMEROON",
|
||||
"CA": "CANADA",
|
||||
"KY": "CAYMAN ISLANDS",
|
||||
"CF": "CENTRAL AFRICAN REPUBLIC",
|
||||
"TD": "CHAD",
|
||||
"CL": "CHILE",
|
||||
"CN": "CHINA",
|
||||
"CX": "CHRISTMAS ISLAND",
|
||||
"CC": "COCOS (KEELING) ISLANDS",
|
||||
"CO": "COLOMBIA",
|
||||
"KM": "COMOROS",
|
||||
"CG": "CONGO",
|
||||
"CD": "COOK ISLANDS",
|
||||
"CR": "COSTA RICA",
|
||||
"CI": "COTE D'IVOIRE",
|
||||
"HR": "CROATIA",
|
||||
"CU": "CUBA",
|
||||
"CW": "CURACAO",
|
||||
"CY": "CYPRUS",
|
||||
"CZ": "CZECHIA",
|
||||
"DK": "DENMARK",
|
||||
"DJ": "DJIBOUTI",
|
||||
"DM": "DOMINICA",
|
||||
"DO": "DOMINICAN REPUBLIC",
|
||||
"EC": "ECUADOR",
|
||||
"EG": "EGYPT",
|
||||
"SV": "EL SALVADOR",
|
||||
"GQ": "EQUATORIAL GUINEA",
|
||||
"ER": "ERITREA",
|
||||
"EE": "ESTONIA",
|
||||
"ET": "ETHIOPIA",
|
||||
"FK": "FALKLAND ISLANDS (MALVINAS)",
|
||||
"FO": "FAROE ISLANDS",
|
||||
"FJ": "FIJI",
|
||||
"FI": "FINLAND",
|
||||
"FR": "FRANCE",
|
||||
"GF": "FRENCH GUIANA",
|
||||
"PF": "FRENCH POLYNESIA",
|
||||
"TF": "FRENCH SOUTHERN TERRITORIES",
|
||||
"GA": "GABON",
|
||||
"GM": "GAMBIA",
|
||||
}
|
||||
@@ -0,0 +1,606 @@
|
||||
"""
|
||||
Company Dataset Builder for EdgarTools
|
||||
|
||||
Builds high-performance company datasets from SEC submissions data with two output formats:
|
||||
1. PyArrow Parquet (5-20 MB) - Fast filtering with PyArrow compute API
|
||||
2. DuckDB (287 MB) - Optional SQL interface for power users
|
||||
|
||||
Performance:
|
||||
- Build time: ~30 seconds (optimized with orjson + company filtering)
|
||||
- Records: ~562,413 companies (40% individual filers filtered)
|
||||
- Query speed: <1ms (DuckDB) or <100ms (Parquet)
|
||||
|
||||
Example:
|
||||
>>> from edgar.reference import get_company_dataset
|
||||
>>> import pyarrow.compute as pc
|
||||
>>>
|
||||
>>> # Load dataset (builds on first use)
|
||||
>>> companies = get_company_dataset()
|
||||
>>>
|
||||
>>> # Filter pharmaceutical companies
|
||||
>>> pharma = companies.filter(pc.field('sic').between(2834, 2836))
|
||||
>>> print(f"Found {len(pharma)} pharma companies")
|
||||
"""
|
||||
|
||||
from pathlib import Path
|
||||
from typing import Optional, Union
|
||||
import logging
|
||||
|
||||
import pyarrow as pa
|
||||
import pyarrow.parquet as pq
|
||||
from tqdm import tqdm
|
||||
|
||||
from edgar.core import get_edgar_data_directory, log
|
||||
|
||||
# Try to import orjson for performance, fall back to stdlib json
|
||||
try:
|
||||
import orjson
|
||||
|
||||
def load_json(path: Path) -> dict:
|
||||
"""Load JSON file using orjson (1.55x faster)"""
|
||||
return orjson.loads(path.read_bytes())
|
||||
|
||||
JSON_PARSER = "orjson"
|
||||
except ImportError:
|
||||
import json
|
||||
|
||||
def load_json(path: Path) -> dict:
|
||||
"""Load JSON file using stdlib json"""
|
||||
with open(path, 'r', encoding='utf-8') as f:
|
||||
return json.load(f)
|
||||
|
||||
JSON_PARSER = "json (stdlib)"
|
||||
|
||||
|
||||
# Company dataset schema
|
||||
COMPANY_SCHEMA = pa.schema([
|
||||
('cik', pa.string()), # Keep as string to preserve leading zeros
|
||||
('name', pa.string()),
|
||||
('sic', pa.int32()), # Nullable - some companies have no SIC
|
||||
('sic_description', pa.string()),
|
||||
('tickers', pa.string()), # Pipe-delimited (e.g., "AAPL|APPLE")
|
||||
('exchanges', pa.string()), # Pipe-delimited (e.g., "Nasdaq|NYSE")
|
||||
('state_of_incorporation', pa.string()),
|
||||
('state_of_incorporation_description', pa.string()),
|
||||
('fiscal_year_end', pa.string()), # MMDD format
|
||||
('entity_type', pa.string()),
|
||||
('ein', pa.string()),
|
||||
])
|
||||
|
||||
|
||||
def is_individual_from_json(data: dict) -> bool:
|
||||
"""
|
||||
Determine if entity is an individual filer vs a company.
|
||||
|
||||
Uses the same logic as edgar.entity.data:478 (is_individual property).
|
||||
|
||||
Companies typically have:
|
||||
- Tickers or exchanges
|
||||
- State of incorporation
|
||||
- Entity type other than '' or 'other'
|
||||
- Company-specific filings (10-K, 10-Q, 8-K, etc.)
|
||||
|
||||
Args:
|
||||
data: Parsed JSON submission data
|
||||
|
||||
Returns:
|
||||
True if individual filer, False if company
|
||||
|
||||
Example:
|
||||
>>> data = {'cik': '0001318605', 'tickers': ['TSLA']}
|
||||
>>> is_individual_from_json(data)
|
||||
False
|
||||
|
||||
>>> data = {'cik': '0001078519', 'name': 'JOHN DOE'}
|
||||
>>> is_individual_from_json(data)
|
||||
True
|
||||
"""
|
||||
# Has ticker or exchange → company
|
||||
if data.get('tickers') or data.get('exchanges'):
|
||||
return False
|
||||
|
||||
# Has state of incorporation → company (with exceptions)
|
||||
state = data.get('stateOfIncorporation', '')
|
||||
if state and state != '':
|
||||
# Reed Hastings exception (individual with state of incorporation)
|
||||
if data.get('cik') == '0001033331':
|
||||
return True
|
||||
return False
|
||||
|
||||
# Has entity type (not '' or 'other') → company
|
||||
entity_type = data.get('entityType', '')
|
||||
if entity_type and entity_type not in ['', 'other']:
|
||||
return False
|
||||
|
||||
# Files company forms (10-K, 10-Q, etc.) → company
|
||||
filings = data.get('filings', {})
|
||||
if filings:
|
||||
recent = filings.get('recent', {})
|
||||
forms = recent.get('form', [])
|
||||
company_forms = {'10-K', '10-Q', '8-K', '10-K/A', '10-Q/A', '20-F', 'S-1'}
|
||||
if any(form in company_forms for form in forms):
|
||||
return False
|
||||
|
||||
# Default: individual
|
||||
return True
|
||||
|
||||
|
||||
def build_company_dataset_parquet(
|
||||
submissions_dir: Path,
|
||||
output_path: Path,
|
||||
filter_individuals: bool = True,
|
||||
show_progress: bool = True
|
||||
) -> pa.Table:
|
||||
"""
|
||||
Build PyArrow Parquet dataset from submissions directory (companies only).
|
||||
|
||||
This function processes all CIK*.json files in the submissions directory,
|
||||
filters out individual filers (optional), and creates a compressed Parquet file.
|
||||
|
||||
Performance:
|
||||
- ~30 seconds for 562,413 companies (with orjson + filtering)
|
||||
- Output size: ~5-20 MB (zstd compressed)
|
||||
- Memory usage: ~100-200 MB during build
|
||||
|
||||
Args:
|
||||
submissions_dir: Directory containing CIK*.json files
|
||||
output_path: Where to save the .pq file
|
||||
filter_individuals: Skip individual filers (default: True)
|
||||
show_progress: Show progress bar (default: True)
|
||||
|
||||
Returns:
|
||||
PyArrow Table with company data
|
||||
|
||||
Raises:
|
||||
FileNotFoundError: If submissions_dir doesn't exist
|
||||
|
||||
Example:
|
||||
>>> from pathlib import Path
|
||||
>>> submissions_dir = Path.home() / '.edgar' / 'submissions'
|
||||
>>> output_path = Path.home() / '.edgar' / 'companies.pq'
|
||||
>>> table = build_company_dataset_parquet(submissions_dir, output_path)
|
||||
>>> print(f"Built dataset: {len(table):,} companies")
|
||||
"""
|
||||
if not submissions_dir.exists():
|
||||
raise FileNotFoundError(
|
||||
f"Submissions directory not found: {submissions_dir}\n\n"
|
||||
"Please download submissions data first:\n"
|
||||
" from edgar.storage import download_submissions\n"
|
||||
" download_submissions()\n"
|
||||
)
|
||||
|
||||
# Get all submission JSON files
|
||||
json_files = list(submissions_dir.glob("CIK*.json"))
|
||||
if len(json_files) == 0:
|
||||
raise FileNotFoundError(
|
||||
f"No submission files found in: {submissions_dir}\n"
|
||||
"Expected CIK*.json files"
|
||||
)
|
||||
|
||||
log.info(f"Building company dataset from {len(json_files):,} submission files")
|
||||
log.info(f"Using JSON parser: {JSON_PARSER}")
|
||||
|
||||
companies = []
|
||||
errors = 0
|
||||
individuals_skipped = 0
|
||||
|
||||
# Process each file with progress bar
|
||||
iterator = tqdm(json_files, desc="Processing submissions", disable=not show_progress)
|
||||
|
||||
for json_file in iterator:
|
||||
try:
|
||||
data = load_json(json_file)
|
||||
|
||||
# Skip individuals if filtering enabled
|
||||
if filter_individuals and is_individual_from_json(data):
|
||||
individuals_skipped += 1
|
||||
continue
|
||||
|
||||
# Extract SIC (handle empty strings)
|
||||
sic = data.get('sic')
|
||||
sic_int = int(sic) if sic and sic != '' else None
|
||||
|
||||
# Extract tickers and exchanges (filter None values)
|
||||
tickers = data.get('tickers', [])
|
||||
exchanges = data.get('exchanges', [])
|
||||
|
||||
companies.append({
|
||||
'cik': data.get('cik'),
|
||||
'name': data.get('name'),
|
||||
'sic': sic_int,
|
||||
'sic_description': data.get('sicDescription'),
|
||||
'tickers': '|'.join(filter(None, tickers)) if tickers else None,
|
||||
'exchanges': '|'.join(filter(None, exchanges)) if exchanges else None,
|
||||
'state_of_incorporation': data.get('stateOfIncorporation'),
|
||||
'state_of_incorporation_description': data.get('stateOfIncorporationDescription'),
|
||||
'fiscal_year_end': data.get('fiscalYearEnd'),
|
||||
'entity_type': data.get('entityType'),
|
||||
'ein': data.get('ein'),
|
||||
})
|
||||
|
||||
except Exception as e:
|
||||
errors += 1
|
||||
log.debug(f"Error processing {json_file.name}: {e}")
|
||||
continue
|
||||
|
||||
# Log statistics
|
||||
log.info(f"Processed {len(json_files):,} files:")
|
||||
log.info(f" - Companies: {len(companies):,}")
|
||||
if filter_individuals:
|
||||
log.info(f" - Individuals skipped: {individuals_skipped:,}")
|
||||
if errors > 0:
|
||||
log.warning(f" - Errors: {errors:,}")
|
||||
|
||||
# Create PyArrow Table
|
||||
table = pa.Table.from_pylist(companies, schema=COMPANY_SCHEMA)
|
||||
|
||||
# Write to Parquet with compression
|
||||
output_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
pq.write_table(
|
||||
table,
|
||||
output_path,
|
||||
compression='zstd',
|
||||
compression_level=9,
|
||||
use_dictionary=True
|
||||
)
|
||||
|
||||
file_size_mb = output_path.stat().st_size / (1024 * 1024)
|
||||
log.info(f"Saved Parquet file: {output_path} ({file_size_mb:.1f} MB)")
|
||||
|
||||
return table
|
||||
|
||||
|
||||
def build_company_dataset_duckdb(
|
||||
submissions_dir: Path,
|
||||
output_path: Path,
|
||||
filter_individuals: bool = True,
|
||||
create_indexes: bool = True,
|
||||
show_progress: bool = True
|
||||
) -> None:
|
||||
"""
|
||||
Build DuckDB database from submissions directory (companies only).
|
||||
|
||||
This function creates a DuckDB database with a 'companies' table and
|
||||
optional indexes on key columns for fast querying.
|
||||
|
||||
Performance:
|
||||
- ~30 seconds for 562,413 companies (with orjson + filtering)
|
||||
- Output size: ~287 MB
|
||||
- Query speed: <1ms with indexes
|
||||
|
||||
Args:
|
||||
submissions_dir: Directory containing CIK*.json files
|
||||
output_path: Where to save the .duckdb file
|
||||
filter_individuals: Skip individual filers (default: True)
|
||||
create_indexes: Create indexes on cik, sic, name (default: True)
|
||||
show_progress: Show progress bar (default: True)
|
||||
|
||||
Raises:
|
||||
FileNotFoundError: If submissions_dir doesn't exist
|
||||
ImportError: If duckdb package not installed
|
||||
|
||||
Example:
|
||||
>>> from pathlib import Path
|
||||
>>> submissions_dir = Path.home() / '.edgar' / 'submissions'
|
||||
>>> output_path = Path.home() / '.edgar' / 'companies.duckdb'
|
||||
>>> build_company_dataset_duckdb(submissions_dir, output_path)
|
||||
>>>
|
||||
>>> import duckdb
|
||||
>>> con = duckdb.connect(str(output_path))
|
||||
>>> result = con.execute("SELECT COUNT(*) FROM companies").fetchone()
|
||||
>>> print(f"Companies: {result[0]:,}")
|
||||
"""
|
||||
try:
|
||||
import duckdb
|
||||
except ImportError:
|
||||
raise ImportError(
|
||||
"DuckDB export requires duckdb package.\n"
|
||||
"Install with: pip install duckdb"
|
||||
)
|
||||
|
||||
if not submissions_dir.exists():
|
||||
raise FileNotFoundError(
|
||||
f"Submissions directory not found: {submissions_dir}\n\n"
|
||||
"Please download submissions data first:\n"
|
||||
" from edgar.storage import download_submissions\n"
|
||||
" download_submissions()\n"
|
||||
)
|
||||
|
||||
# Get all submission JSON files
|
||||
json_files = list(submissions_dir.glob("CIK*.json"))
|
||||
if len(json_files) == 0:
|
||||
raise FileNotFoundError(
|
||||
f"No submission files found in: {submissions_dir}\n"
|
||||
"Expected CIK*.json files"
|
||||
)
|
||||
|
||||
log.info(f"Building DuckDB database from {len(json_files):,} submission files")
|
||||
log.info(f"Using JSON parser: {JSON_PARSER}")
|
||||
|
||||
companies = []
|
||||
errors = 0
|
||||
individuals_skipped = 0
|
||||
|
||||
# Process each file with progress bar
|
||||
iterator = tqdm(json_files, desc="Processing submissions", disable=not show_progress)
|
||||
|
||||
for json_file in iterator:
|
||||
try:
|
||||
data = load_json(json_file)
|
||||
|
||||
# Skip individuals if filtering enabled
|
||||
if filter_individuals and is_individual_from_json(data):
|
||||
individuals_skipped += 1
|
||||
continue
|
||||
|
||||
# Extract SIC (handle empty strings)
|
||||
sic = data.get('sic')
|
||||
sic_int = int(sic) if sic and sic != '' else None
|
||||
|
||||
# Extract tickers and exchanges (filter None values)
|
||||
tickers = data.get('tickers', [])
|
||||
exchanges = data.get('exchanges', [])
|
||||
|
||||
companies.append({
|
||||
'cik': data.get('cik'),
|
||||
'name': data.get('name'),
|
||||
'sic': sic_int,
|
||||
'sic_description': data.get('sicDescription'),
|
||||
'tickers': '|'.join(filter(None, tickers)) if tickers else None,
|
||||
'exchanges': '|'.join(filter(None, exchanges)) if exchanges else None,
|
||||
'state_of_incorporation': data.get('stateOfIncorporation'),
|
||||
'state_of_incorporation_description': data.get('stateOfIncorporationDescription'),
|
||||
'fiscal_year_end': data.get('fiscalYearEnd'),
|
||||
'entity_type': data.get('entityType'),
|
||||
'ein': data.get('ein'),
|
||||
})
|
||||
|
||||
except Exception as e:
|
||||
errors += 1
|
||||
log.debug(f"Error processing {json_file.name}: {e}")
|
||||
continue
|
||||
|
||||
# Log statistics
|
||||
log.info(f"Processed {len(json_files):,} files:")
|
||||
log.info(f" - Companies: {len(companies):,}")
|
||||
if filter_individuals:
|
||||
log.info(f" - Individuals skipped: {individuals_skipped:,}")
|
||||
if errors > 0:
|
||||
log.warning(f" - Errors: {errors:,}")
|
||||
|
||||
# Create DuckDB database
|
||||
import pandas as pd
|
||||
|
||||
output_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
con = duckdb.connect(str(output_path))
|
||||
|
||||
# Create table from DataFrame
|
||||
df = pd.DataFrame(companies)
|
||||
con.execute("CREATE TABLE companies AS SELECT * FROM df")
|
||||
|
||||
# Create indexes
|
||||
if create_indexes:
|
||||
log.info("Creating indexes...")
|
||||
con.execute("CREATE INDEX idx_cik ON companies(cik)")
|
||||
con.execute("CREATE INDEX idx_sic ON companies(sic)")
|
||||
con.execute("CREATE INDEX idx_name ON companies(name)")
|
||||
|
||||
# Add metadata table
|
||||
con.execute("""
|
||||
CREATE TABLE metadata AS
|
||||
SELECT
|
||||
CURRENT_TIMESTAMP as created_at,
|
||||
COUNT(*) as total_companies,
|
||||
COUNT(DISTINCT sic) as unique_sic_codes,
|
||||
COUNT(DISTINCT CASE WHEN tickers IS NOT NULL THEN 1 END) as companies_with_tickers,
|
||||
COUNT(DISTINCT CASE WHEN exchanges IS NOT NULL THEN 1 END) as companies_with_exchanges
|
||||
FROM companies
|
||||
""")
|
||||
|
||||
con.close()
|
||||
|
||||
file_size_mb = output_path.stat().st_size / (1024 * 1024)
|
||||
log.info(f"Saved DuckDB database: {output_path} ({file_size_mb:.1f} MB)")
|
||||
|
||||
|
||||
def load_company_dataset_parquet(parquet_path: Path) -> pa.Table:
|
||||
"""
|
||||
Load company dataset from Parquet file.
|
||||
|
||||
This is a simple wrapper around pyarrow.parquet.read_table() with
|
||||
logging for consistency.
|
||||
|
||||
Performance: <100ms for typical dataset
|
||||
|
||||
Args:
|
||||
parquet_path: Path to .pq file
|
||||
|
||||
Returns:
|
||||
PyArrow Table with company data
|
||||
|
||||
Example:
|
||||
>>> from pathlib import Path
|
||||
>>> path = Path.home() / '.edgar' / 'companies.pq'
|
||||
>>> companies = load_company_dataset_parquet(path)
|
||||
>>> print(f"Loaded {len(companies):,} companies")
|
||||
"""
|
||||
if not parquet_path.exists():
|
||||
raise FileNotFoundError(f"Parquet file not found: {parquet_path}")
|
||||
|
||||
table = pq.read_table(parquet_path)
|
||||
log.debug(f"Loaded {len(table):,} companies from {parquet_path}")
|
||||
|
||||
return table
|
||||
|
||||
|
||||
def to_duckdb(
|
||||
parquet_path: Path,
|
||||
duckdb_path: Path,
|
||||
create_indexes: bool = True
|
||||
) -> None:
|
||||
"""
|
||||
Convert Parquet dataset to DuckDB database.
|
||||
|
||||
This provides an easy way to export the Parquet dataset to DuckDB
|
||||
for users who want SQL query capabilities.
|
||||
|
||||
Performance: <5 seconds for typical dataset
|
||||
|
||||
Args:
|
||||
parquet_path: Path to source .pq file
|
||||
duckdb_path: Path to output .duckdb file
|
||||
create_indexes: Create indexes on key columns (default: True)
|
||||
|
||||
Example:
|
||||
>>> from pathlib import Path
|
||||
>>> parquet_path = Path.home() / '.edgar' / 'companies.pq'
|
||||
>>> duckdb_path = Path.home() / '.edgar' / 'companies.duckdb'
|
||||
>>> to_duckdb(parquet_path, duckdb_path)
|
||||
>>>
|
||||
>>> import duckdb
|
||||
>>> con = duckdb.connect(str(duckdb_path))
|
||||
>>> result = con.execute(
|
||||
... "SELECT * FROM companies WHERE sic = 2834"
|
||||
... ).fetchdf()
|
||||
"""
|
||||
try:
|
||||
import duckdb
|
||||
except ImportError:
|
||||
raise ImportError(
|
||||
"DuckDB export requires duckdb package.\n"
|
||||
"Install with: pip install duckdb"
|
||||
)
|
||||
|
||||
if not parquet_path.exists():
|
||||
raise FileNotFoundError(f"Parquet file not found: {parquet_path}")
|
||||
|
||||
log.info(f"Converting Parquet to DuckDB: {parquet_path} -> {duckdb_path}")
|
||||
|
||||
# Read Parquet file and convert to pandas
|
||||
table = pq.read_table(parquet_path)
|
||||
import pandas as pd
|
||||
df = table.to_pandas()
|
||||
|
||||
# Create DuckDB database
|
||||
duckdb_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
con = duckdb.connect(str(duckdb_path))
|
||||
|
||||
# Create table from DataFrame
|
||||
con.execute("CREATE TABLE companies AS SELECT * FROM df")
|
||||
|
||||
# Create indexes
|
||||
if create_indexes:
|
||||
log.info("Creating indexes...")
|
||||
con.execute("CREATE INDEX idx_cik ON companies(cik)")
|
||||
con.execute("CREATE INDEX idx_sic ON companies(sic)")
|
||||
con.execute("CREATE INDEX idx_name ON companies(name)")
|
||||
|
||||
# Add metadata
|
||||
con.execute("""
|
||||
CREATE TABLE metadata AS
|
||||
SELECT
|
||||
CURRENT_TIMESTAMP as created_at,
|
||||
COUNT(*) as total_companies,
|
||||
COUNT(DISTINCT sic) as unique_sic_codes,
|
||||
COUNT(DISTINCT CASE WHEN tickers IS NOT NULL THEN 1 END) as companies_with_tickers,
|
||||
COUNT(DISTINCT CASE WHEN exchanges IS NOT NULL THEN 1 END) as companies_with_exchanges
|
||||
FROM companies
|
||||
""")
|
||||
|
||||
con.close()
|
||||
|
||||
file_size_mb = duckdb_path.stat().st_size / (1024 * 1024)
|
||||
log.info(f"Exported to DuckDB: {duckdb_path} ({file_size_mb:.1f} MB)")
|
||||
|
||||
|
||||
# In-memory cache for dataset
|
||||
_CACHE = {}
|
||||
|
||||
|
||||
def get_company_dataset(rebuild: bool = False) -> pa.Table:
|
||||
"""
|
||||
Get company dataset, building from submissions if needed.
|
||||
|
||||
This function checks for a cached dataset at ~/.edgar/companies.pq.
|
||||
If not found, it automatically builds the dataset from submissions data.
|
||||
|
||||
On first use, this will take ~30 seconds to build the dataset. Subsequent
|
||||
calls load from cache in <100ms.
|
||||
|
||||
Args:
|
||||
rebuild: Force rebuild even if cache exists (default: False)
|
||||
|
||||
Returns:
|
||||
PyArrow Table with company data (~562,413 companies)
|
||||
|
||||
Raises:
|
||||
FileNotFoundError: If submissions directory not found or incomplete
|
||||
|
||||
Performance:
|
||||
- First use: ~30 seconds (builds dataset)
|
||||
- Cached: <100ms (loads from disk)
|
||||
- Memory: ~20-50 MB
|
||||
|
||||
Example:
|
||||
>>> from edgar.reference import get_company_dataset
|
||||
>>> import pyarrow.compute as pc
|
||||
>>>
|
||||
>>> # First call builds dataset (takes ~30s)
|
||||
>>> companies = get_company_dataset()
|
||||
>>> print(f"Loaded {len(companies):,} companies")
|
||||
>>>
|
||||
>>> # Subsequent calls are fast (<100ms)
|
||||
>>> companies = get_company_dataset()
|
||||
>>>
|
||||
>>> # Filter pharmaceutical companies (SIC 2834-2836)
|
||||
>>> pharma = companies.filter(
|
||||
... pc.field('sic').between(2834, 2836)
|
||||
... )
|
||||
>>> print(f"Found {len(pharma)} pharma companies")
|
||||
>>>
|
||||
>>> # Filter by exchange
|
||||
>>> nasdaq = companies.filter(
|
||||
... pc.field('exchanges').contains('Nasdaq')
|
||||
... )
|
||||
>>>
|
||||
>>> # Force rebuild with latest data
|
||||
>>> companies = get_company_dataset(rebuild=True)
|
||||
"""
|
||||
# Check in-memory cache first
|
||||
if not rebuild and 'companies' in _CACHE:
|
||||
return _CACHE['companies']
|
||||
|
||||
# Check disk cache
|
||||
cache_path = get_edgar_data_directory() / 'companies.pq'
|
||||
|
||||
if cache_path.exists() and not rebuild:
|
||||
# Load from cache
|
||||
log.info(f"Loading company dataset from cache: {cache_path}")
|
||||
table = load_company_dataset_parquet(cache_path)
|
||||
_CACHE['companies'] = table
|
||||
return table
|
||||
|
||||
# Need to build dataset
|
||||
log.info("Building company dataset from submissions (this may take ~30 seconds)...")
|
||||
|
||||
submissions_dir = get_edgar_data_directory() / 'submissions'
|
||||
if not submissions_dir.exists() or len(list(submissions_dir.glob('CIK*.json'))) < 100000:
|
||||
raise FileNotFoundError(
|
||||
f"Submissions directory not found or incomplete: {submissions_dir}\n\n"
|
||||
"Please download submissions data first:\n"
|
||||
" from edgar.storage import download_submissions\n"
|
||||
" download_submissions()\n\n"
|
||||
"This is a one-time download (~500 MB compressed)."
|
||||
)
|
||||
|
||||
# Build dataset
|
||||
table = build_company_dataset_parquet(
|
||||
submissions_dir,
|
||||
cache_path,
|
||||
filter_individuals=True
|
||||
)
|
||||
|
||||
log.info(f"✅ Built dataset: {len(table):,} companies, cached at {cache_path}")
|
||||
|
||||
_CACHE['companies'] = table
|
||||
return table
|
||||
@@ -0,0 +1,991 @@
|
||||
"""
|
||||
Company subset selection utilities for analysis and learning tasks.
|
||||
|
||||
This module provides flexible ways to create subsets of companies from SEC reference data
|
||||
for educational, research, and analysis purposes. It offers exchange-based selection,
|
||||
popularity-based filtering, sampling capabilities, and composition utilities.
|
||||
|
||||
Key features:
|
||||
- Exchange-based selection (NYSE, NASDAQ, OTC, CBOE)
|
||||
- Popularity-based selection (popular stocks, market cap tiers)
|
||||
- Sampling capabilities (random, stratified, top N)
|
||||
- Filtering and combination utilities
|
||||
- Consistent DataFrame output format
|
||||
|
||||
All functions return a standardized DataFrame with columns: ['cik', 'ticker', 'name', 'exchange']
|
||||
"""
|
||||
|
||||
from enum import Enum
|
||||
from functools import lru_cache
|
||||
from typing import Callable, List, Optional, Union
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from edgar.core import log
|
||||
from edgar.reference.tickers import get_company_ticker_name_exchange, popular_us_stocks
|
||||
|
||||
__all__ = [
|
||||
# Classes and Enums
|
||||
'CompanySubset',
|
||||
'MarketCapTier',
|
||||
'PopularityTier',
|
||||
# Core Functions
|
||||
'get_all_companies',
|
||||
'get_companies_by_exchanges',
|
||||
'get_popular_companies',
|
||||
# Industry and State Filtering (Comprehensive Mode)
|
||||
'get_companies_by_industry',
|
||||
'get_companies_by_state',
|
||||
# Sampling and Filtering
|
||||
'get_random_sample',
|
||||
'get_stratified_sample',
|
||||
'get_top_companies_by_metric',
|
||||
'filter_companies',
|
||||
'exclude_companies',
|
||||
# Set Operations
|
||||
'combine_company_sets',
|
||||
'intersect_company_sets',
|
||||
# Convenience Functions - General
|
||||
'get_faang_companies',
|
||||
'get_tech_giants',
|
||||
'get_dow_jones_sample',
|
||||
# Convenience Functions - Industry Specific
|
||||
'get_pharmaceutical_companies',
|
||||
'get_biotechnology_companies',
|
||||
'get_software_companies',
|
||||
'get_semiconductor_companies',
|
||||
'get_banking_companies',
|
||||
'get_investment_companies',
|
||||
'get_insurance_companies',
|
||||
'get_real_estate_companies',
|
||||
'get_oil_gas_companies',
|
||||
'get_retail_companies',
|
||||
]
|
||||
|
||||
|
||||
class MarketCapTier(Enum):
|
||||
"""Market cap tiers for company classification."""
|
||||
LARGE_CAP = "large_cap" # Usually > $10B
|
||||
MID_CAP = "mid_cap" # Usually $2B - $10B
|
||||
SMALL_CAP = "small_cap" # Usually $300M - $2B
|
||||
MICRO_CAP = "micro_cap" # Usually < $300M
|
||||
|
||||
|
||||
class PopularityTier(Enum):
|
||||
"""Popularity tiers based on trading activity and recognition."""
|
||||
MEGA_CAP = "mega_cap" # Top 10 most valuable companies
|
||||
POPULAR = "popular" # Popular stocks list
|
||||
MAINSTREAM = "mainstream" # Well-known companies
|
||||
EMERGING = "emerging" # Smaller but notable companies
|
||||
|
||||
|
||||
class CompanySubset:
|
||||
"""
|
||||
Fluent interface for building company subsets with chainable operations.
|
||||
|
||||
Example:
|
||||
# Get 50 random NYSE companies excluding financial sector
|
||||
companies = (CompanySubset()
|
||||
.from_exchange('NYSE')
|
||||
.exclude_tickers(['JPM', 'GS', 'C'])
|
||||
.sample(50)
|
||||
.get())
|
||||
|
||||
# Get pharmaceutical companies with comprehensive metadata
|
||||
pharma = (CompanySubset(use_comprehensive=True)
|
||||
.from_industry(sic_range=(2834, 2836))
|
||||
.sample(100)
|
||||
.get())
|
||||
"""
|
||||
|
||||
def __init__(self, companies: Optional[pd.DataFrame] = None, use_comprehensive: bool = False):
|
||||
"""
|
||||
Initialize with optional starting dataset.
|
||||
|
||||
Args:
|
||||
companies: Optional DataFrame to start with. If None, loads from get_all_companies()
|
||||
use_comprehensive: If True and companies is None, load comprehensive dataset
|
||||
with rich metadata (SIC, state, entity type, etc.)
|
||||
"""
|
||||
if companies is not None:
|
||||
self._companies = companies
|
||||
else:
|
||||
self._companies = get_all_companies(use_comprehensive=use_comprehensive)
|
||||
self._use_comprehensive = use_comprehensive
|
||||
|
||||
def from_exchange(self, exchanges: Union[str, List[str]]) -> 'CompanySubset':
|
||||
"""Filter companies by exchange(s)."""
|
||||
self._companies = get_companies_by_exchanges(exchanges)
|
||||
return self
|
||||
|
||||
def from_popular(self, tier: Optional[PopularityTier] = None) -> 'CompanySubset':
|
||||
"""Filter to popular companies."""
|
||||
self._companies = get_popular_companies(tier)
|
||||
return self
|
||||
|
||||
def from_industry(
|
||||
self,
|
||||
sic: Optional[Union[int, List[int]]] = None,
|
||||
sic_range: Optional[tuple[int, int]] = None,
|
||||
sic_description_contains: Optional[str] = None
|
||||
) -> 'CompanySubset':
|
||||
"""
|
||||
Filter companies by industry (SIC code).
|
||||
|
||||
Automatically enables comprehensive mode to access industry metadata.
|
||||
|
||||
Args:
|
||||
sic: Single SIC code or list of SIC codes to match exactly
|
||||
sic_range: Tuple of (min_sic, max_sic) for range filtering
|
||||
sic_description_contains: String to search within SIC description
|
||||
|
||||
Returns:
|
||||
CompanySubset with industry filter applied
|
||||
|
||||
Example:
|
||||
>>> # Pharmaceutical companies
|
||||
>>> pharma = CompanySubset().from_industry(sic=2834)
|
||||
|
||||
>>> # Biotech sector
|
||||
>>> biotech = CompanySubset().from_industry(sic_range=(2833, 2836))
|
||||
"""
|
||||
self._companies = get_companies_by_industry(
|
||||
sic=sic,
|
||||
sic_range=sic_range,
|
||||
sic_description_contains=sic_description_contains
|
||||
)
|
||||
self._use_comprehensive = True
|
||||
return self
|
||||
|
||||
def from_state(self, states: Union[str, List[str]]) -> 'CompanySubset':
|
||||
"""
|
||||
Filter companies by state of incorporation.
|
||||
|
||||
Automatically enables comprehensive mode to access state metadata.
|
||||
|
||||
Args:
|
||||
states: Single state code or list of state codes (e.g., 'DE', 'CA')
|
||||
|
||||
Returns:
|
||||
CompanySubset with state filter applied
|
||||
|
||||
Example:
|
||||
>>> # Delaware corporations
|
||||
>>> de_corps = CompanySubset().from_state('DE')
|
||||
|
||||
>>> # Delaware or Nevada corporations
|
||||
>>> de_nv = CompanySubset().from_state(['DE', 'NV'])
|
||||
"""
|
||||
self._companies = get_companies_by_state(states)
|
||||
self._use_comprehensive = True
|
||||
return self
|
||||
|
||||
def filter_by(self, condition: Callable[[pd.DataFrame], pd.DataFrame]) -> 'CompanySubset':
|
||||
"""Apply custom filter function."""
|
||||
self._companies = condition(self._companies)
|
||||
return self
|
||||
|
||||
def exclude_tickers(self, tickers: List[str]) -> 'CompanySubset':
|
||||
"""Exclude specific tickers."""
|
||||
self._companies = exclude_companies(self._companies, tickers)
|
||||
return self
|
||||
|
||||
def include_tickers(self, tickers: List[str]) -> 'CompanySubset':
|
||||
"""Include only specific tickers."""
|
||||
self._companies = filter_companies(self._companies, ticker_list=tickers)
|
||||
return self
|
||||
|
||||
def sample(self, n: int, random_state: Optional[int] = None) -> 'CompanySubset':
|
||||
"""Take random sample of n companies."""
|
||||
self._companies = get_random_sample(self._companies, n, random_state)
|
||||
return self
|
||||
|
||||
def top(self, n: int, by: str = 'name') -> 'CompanySubset':
|
||||
"""Take top n companies by specified column."""
|
||||
self._companies = get_top_companies_by_metric(self._companies, n, by)
|
||||
return self
|
||||
|
||||
def combine_with(self, other: 'CompanySubset') -> 'CompanySubset':
|
||||
"""Combine with another subset (union)."""
|
||||
self._companies = combine_company_sets([self._companies, other.get()])
|
||||
return self
|
||||
|
||||
def intersect_with(self, other: 'CompanySubset') -> 'CompanySubset':
|
||||
"""Intersect with another subset."""
|
||||
self._companies = intersect_company_sets([self._companies, other.get()])
|
||||
return self
|
||||
|
||||
def get(self) -> pd.DataFrame:
|
||||
"""Get the final DataFrame."""
|
||||
return self._companies.copy()
|
||||
|
||||
def __len__(self) -> int:
|
||||
"""Return number of companies in subset."""
|
||||
return len(self._companies)
|
||||
|
||||
def __repr__(self) -> str:
|
||||
"""String representation showing count and sample."""
|
||||
count = len(self._companies)
|
||||
if count == 0:
|
||||
return "CompanySubset(empty)"
|
||||
|
||||
sample_size = min(3, count)
|
||||
sample_tickers = self._companies['ticker'].head(sample_size).tolist()
|
||||
sample_str = ', '.join(sample_tickers)
|
||||
|
||||
if count > sample_size:
|
||||
sample_str += f", ... +{count - sample_size} more"
|
||||
|
||||
return f"CompanySubset({count} companies: {sample_str})"
|
||||
|
||||
|
||||
def _get_comprehensive_companies() -> pd.DataFrame:
    """
    Load the comprehensive company dataset from the company_dataset module.

    Loads the full SEC submissions dataset (~562K companies per its docs) with
    rich metadata: SIC codes, state of incorporation, entity types, and more.

    Returns:
        DataFrame with the extended schema:
        ['cik', 'ticker', 'name', 'exchange', 'sic', 'sic_description',
         'state_of_incorporation', 'state_of_incorporation_description',
         'fiscal_year_end', 'entity_type', 'ein']

    Note:
        - First call may take ~30 seconds to build the dataset
        - Subsequent calls use a cached Parquet file (<100ms load time)
        - Primary ticker/exchange come from the first entry of the
          pipe-delimited 'tickers'/'exchanges' source fields
        - On any failure, logs the error and returns an empty frame with the
          same schema instead of raising
    """
    extended_schema = [
        'cik', 'ticker', 'name', 'exchange',
        'sic', 'sic_description',
        'state_of_incorporation', 'state_of_incorporation_description',
        'fiscal_year_end', 'entity_type', 'ein'
    ]

    try:
        from edgar.reference.company_dataset import get_company_dataset

        # PyArrow Table -> pandas for the filtering done elsewhere in this module.
        df = get_company_dataset().to_pandas()

        def _first_entry(raw):
            """First element of a pipe-delimited string, or None if missing/empty."""
            if raw is None or pd.isna(raw):
                return None
            pieces = str(raw).split('|')
            return pieces[0] if pieces and pieces[0] else None

        # Collapse multi-listing fields down to a single primary value.
        df['ticker'] = df['tickers'].apply(_first_entry)
        df['exchange'] = df['exchanges'].apply(_first_entry)
        df = df.drop(columns=['tickers', 'exchanges'])

        # Standard column order: core columns first, metadata after.
        return df[extended_schema]

    except Exception as e:
        log.error(f"Error fetching comprehensive company data: {e}")
        # Degrade gracefully: empty frame, same extended schema.
        return pd.DataFrame(columns=extended_schema)
|
||||
|
||||
|
||||
@lru_cache(maxsize=2)
def get_all_companies(use_comprehensive: bool = False) -> pd.DataFrame:
    """
    Get all companies from SEC reference data in a standardized format.

    Cached per flag value (maxsize=2 covers both modes), so repeat calls are
    cheap. NOTE(review): the cached DataFrame is returned directly — callers
    are expected not to mutate it in place.

    Args:
        use_comprehensive: If True, load the comprehensive dataset (~562K
            companies) with rich metadata (SIC, state, entity type, etc.).
            If False (default), load the ticker-only dataset (~13K companies).

    Returns:
        DataFrame with columns ['cik', 'ticker', 'name', 'exchange'].
        With use_comprehensive=True, additionally:
        ['sic', 'sic_description', 'state_of_incorporation',
         'state_of_incorporation_description', 'fiscal_year_end',
         'entity_type', 'ein']

    Note:
        - The default keeps backward compatibility
        - Comprehensive mode adds ~30 seconds of build time on first call
        - On failure in standard mode, returns an empty 4-column frame

    Example:
        >>> companies = get_all_companies()            # ~13K rows, fast
        >>> full = get_all_companies(use_comprehensive=True)  # ~562K rows
        >>> 'sic' in full.columns
        True
    """
    if use_comprehensive:
        return _get_comprehensive_companies()

    try:
        base = get_company_ticker_name_exchange().copy()
        # Enforce the module's standard column order.
        return base[['cik', 'ticker', 'name', 'exchange']]
    except Exception as e:
        log.error(f"Error fetching company data: {e}")
        # Degrade gracefully with the standard (empty) structure.
        return pd.DataFrame(columns=['cik', 'ticker', 'name', 'exchange'])
|
||||
|
||||
|
||||
def get_companies_by_exchanges(exchanges: Union[str, List[str]]) -> pd.DataFrame:
    """
    Get companies listed on specific exchange(s).

    Args:
        exchanges: Single exchange string or list of exchanges
                   ('NYSE', 'Nasdaq', 'OTC', 'CBOE')

    Returns:
        DataFrame of companies whose 'exchange' value matches one of the
        requested exchanges (empty 4-column frame on failure)

    Example:
        >>> nyse_companies = get_companies_by_exchanges('NYSE')
        >>> major_exchanges = get_companies_by_exchanges(['NYSE', 'Nasdaq'])
    """
    # Normalize a bare string into a one-element list.
    if isinstance(exchanges, str):
        exchanges = [exchanges]

    try:
        universe = get_all_companies()
        mask = universe['exchange'].isin(exchanges)
        return universe[mask].reset_index(drop=True)
    except Exception as e:
        log.error(f"Error filtering companies by exchanges {exchanges}: {e}")
        return pd.DataFrame(columns=['cik', 'ticker', 'name', 'exchange'])
|
||||
|
||||
|
||||
def get_popular_companies(tier: Optional[PopularityTier] = None) -> pd.DataFrame:
    """
    Get popular companies, optionally limited by popularity tier.

    Args:
        tier: Popularity tier (MEGA_CAP, POPULAR, MAINSTREAM, EMERGING).
              None returns all popular companies.

    Returns:
        DataFrame with columns ['cik', 'ticker', 'name', 'exchange'];
        missing exchange values are filled with 'Unknown'

    Example:
        >>> mega_cap = get_popular_companies(PopularityTier.MEGA_CAP)
        >>> all_popular = get_popular_companies()
    """
    try:
        # Popular-stocks list is indexed by CIK; surface it as a column and
        # rename to this module's standard lowercase schema.
        popular_df = popular_us_stocks().reset_index()
        popular_df = popular_df.rename(columns={'Cik': 'cik', 'Ticker': 'ticker', 'Company': 'name'})

        # Attach exchange info from the full company universe.
        result = popular_df.merge(
            get_all_companies()[['cik', 'exchange']],
            on='cik',
            how='left'
        )
        result['exchange'] = result['exchange'].fillna('Unknown')

        # Tier -> number of rows kept; the source CSV is ordered by market cap.
        tier_limits = {
            PopularityTier.MEGA_CAP: 10,
            PopularityTier.POPULAR: 50,
            PopularityTier.MAINSTREAM: 100,
        }
        limit = tier_limits.get(tier)
        if limit is not None:
            result = result.head(limit)
        # EMERGING or None keeps the full list.

        return result[['cik', 'ticker', 'name', 'exchange']].reset_index(drop=True)

    except Exception as e:
        log.error(f"Error fetching popular companies: {e}")
        return pd.DataFrame(columns=['cik', 'ticker', 'name', 'exchange'])
|
||||
|
||||
|
||||
def get_random_sample(
    companies: Optional[pd.DataFrame] = None,
    n: int = 100,
    random_state: Optional[int] = None
) -> pd.DataFrame:
    """
    Get a random sample of companies.

    Args:
        companies: DataFrame to sample from (None means all companies)
        n: Number of companies to sample (capped at the available count)
        random_state: Random seed for reproducibility

    Returns:
        DataFrame with up to n randomly selected companies; falls back to the
        first rows if sampling itself fails

    Example:
        >>> random_100 = get_random_sample(n=100, random_state=42)
        >>> nasdaq_sample = get_random_sample(get_companies_by_exchanges('Nasdaq'), n=50)
    """
    if companies is None:
        companies = get_all_companies()

    if len(companies) == 0:
        return companies.copy()

    # Never request more rows than exist.
    take = min(n, len(companies))

    try:
        picked = companies.sample(n=take, random_state=random_state)
        return picked.reset_index(drop=True)
    except Exception as e:
        log.error(f"Error sampling companies: {e}")
        # Deterministic fallback: just the first rows.
        return companies.head(take).reset_index(drop=True)
|
||||
|
||||
|
||||
def get_stratified_sample(
    companies: Optional[pd.DataFrame] = None,
    n: int = 100,
    stratify_by: str = 'exchange',
    random_state: Optional[int] = None
) -> pd.DataFrame:
    """
    Get a stratified sample of companies maintaining proportions by a column.

    Args:
        companies: DataFrame to sample from (None means all companies)
        n: Total number of companies to sample
        stratify_by: Column to stratify by (default: 'exchange')
        random_state: Random seed for reproducibility

    Returns:
        DataFrame with the stratified sample; falls back to a plain random
        sample if the stratify column is missing or stratification fails

    Example:
        >>> # Sample maintaining exchange proportions
        >>> stratified = get_stratified_sample(n=200, stratify_by='exchange')
    """
    if companies is None:
        companies = get_all_companies()

    if len(companies) == 0 or stratify_by not in companies.columns:
        return get_random_sample(companies, n, random_state)

    try:
        # Share of each category in the population.
        proportions = companies[stratify_by].value_counts(normalize=True)

        samples = []
        remaining_n = n

        for category, prop in proportions.items():
            category_companies = companies[companies[stratify_by] == category]

            # Calculate sample size for this category
            if category == proportions.index[-1]:
                # Last category absorbs the rounding remainder. BUG FIX:
                # clamp at zero — the max(1, ...) floor below can overshoot n
                # when there are many small categories, which previously made
                # remaining_n negative and requested a negative sample size.
                category_n = max(0, remaining_n)
            else:
                category_n = max(1, int(n * prop))  # At least 1 company per category
                remaining_n -= category_n

            # Sample from this category (skip when nothing is requested/available).
            if category_n > 0 and len(category_companies) > 0:
                category_sample = get_random_sample(
                    category_companies,
                    min(category_n, len(category_companies)),
                    random_state
                )
                samples.append(category_sample)

        # Combine all per-category samples.
        if samples:
            result = pd.concat(samples, ignore_index=True)
            # The per-category minimum of 1 can push the total over n;
            # trim back down with a random selection.
            if len(result) > n:
                result = get_random_sample(result, n, random_state)
            return result
        else:
            return pd.DataFrame(columns=['cik', 'ticker', 'name', 'exchange'])

    except Exception as e:
        log.error(f"Error creating stratified sample: {e}")
        return get_random_sample(companies, n, random_state)
|
||||
|
||||
|
||||
def get_top_companies_by_metric(
    companies: Optional[pd.DataFrame] = None,
    n: int = 100,
    metric: str = 'name',
    ascending: bool = True
) -> pd.DataFrame:
    """
    Get the top N companies sorted by a column.

    Args:
        companies: DataFrame to select from (None means all companies)
        n: Number of top companies to return
        metric: Column to sort by (default: 'name' for alphabetical)
        ascending: Sort order (True for ascending, False for descending)

    Returns:
        DataFrame with the top N companies by the metric; if the column is
        missing or sorting fails, the first N rows unsorted

    Example:
        >>> top_alpha = get_top_companies_by_metric(n=50, metric='name')
        >>> top_tickers = get_top_companies_by_metric(
        ...     get_popular_companies(), n=100, metric='ticker', ascending=False)
    """
    if companies is None:
        companies = get_all_companies()

    # Unknown metric (or empty frame): degrade to the leading rows as-is.
    if len(companies) == 0 or metric not in companies.columns:
        return companies.head(n).copy()

    try:
        ranked = companies.sort_values(by=metric, ascending=ascending)
        return ranked.head(n).reset_index(drop=True)
    except Exception as e:
        log.error(f"Error sorting companies by {metric}: {e}")
        return companies.head(n).copy()
|
||||
|
||||
|
||||
def filter_companies(
    companies: pd.DataFrame,
    ticker_list: Optional[List[str]] = None,
    name_contains: Optional[str] = None,
    cik_list: Optional[List[int]] = None,
    custom_filter: Optional[Callable[[pd.DataFrame], pd.DataFrame]] = None
) -> pd.DataFrame:
    """
    Keep only companies matching the given criteria (all criteria are ANDed).

    Args:
        companies: DataFrame to filter
        ticker_list: Tickers to keep (matched case-insensitively)
        name_contains: Substring the company name must contain (case-insensitive)
        cik_list: CIKs to keep
        custom_filter: Custom function applied last; takes and returns a DataFrame

    Returns:
        Filtered DataFrame with a fresh index; on failure, whatever filters
        succeeded before the error

    Example:
        >>> faang = filter_companies(
        ...     companies, ticker_list=['AAPL', 'AMZN', 'NFLX', 'GOOGL', 'META'])
        >>> inc_companies = filter_companies(companies, name_contains='Inc')
    """
    result = companies.copy()

    try:
        if ticker_list is not None:
            # Case-insensitive ticker match.
            wanted = [t.upper() for t in ticker_list]
            result = result[result['ticker'].str.upper().isin(wanted)]

        if name_contains is not None:
            name_mask = result['name'].str.contains(name_contains, case=False, na=False)
            result = result[name_mask]

        if cik_list is not None:
            result = result[result['cik'].isin(cik_list)]

        if custom_filter is not None:
            result = custom_filter(result)

        return result.reset_index(drop=True)

    except Exception as e:
        log.error(f"Error filtering companies: {e}")
        return result
|
||||
|
||||
|
||||
def exclude_companies(
    companies: pd.DataFrame,
    ticker_list: Optional[List[str]] = None,
    name_contains: Optional[str] = None,
    cik_list: Optional[List[int]] = None
) -> pd.DataFrame:
    """
    Drop companies matching any of the given criteria.

    Args:
        companies: DataFrame to filter
        ticker_list: Tickers to exclude (matched case-insensitively)
        name_contains: Exclude companies whose name contains this substring
            (case-insensitive)
        cik_list: CIKs to exclude

    Returns:
        DataFrame with the matching companies removed and a fresh index; on
        failure, whatever exclusions succeeded before the error

    Example:
        >>> non_financial = exclude_companies(
        ...     companies, ticker_list=['JPM', 'GS', 'C', 'BAC'])
        >>> non_corp = exclude_companies(companies, name_contains='Corp')
    """
    result = companies.copy()

    try:
        if ticker_list is not None:
            # Case-insensitive ticker match, negated.
            banned = [t.upper() for t in ticker_list]
            result = result[~result['ticker'].str.upper().isin(banned)]

        if name_contains is not None:
            name_mask = result['name'].str.contains(name_contains, case=False, na=False)
            result = result[~name_mask]

        if cik_list is not None:
            result = result[~result['cik'].isin(cik_list)]

        return result.reset_index(drop=True)

    except Exception as e:
        log.error(f"Error excluding companies: {e}")
        return result
|
||||
|
||||
|
||||
def combine_company_sets(company_sets: List[pd.DataFrame]) -> pd.DataFrame:
    """
    Union multiple company DataFrames, deduplicating on CIK.

    Args:
        company_sets: List of company DataFrames to combine

    Returns:
        Combined DataFrame with duplicate CIKs removed (first occurrence wins)

    Example:
        >>> nyse = get_companies_by_exchanges('NYSE')
        >>> popular = get_popular_companies()
        >>> combined = combine_company_sets([nyse, popular])
    """
    if not company_sets:
        return pd.DataFrame(columns=['cik', 'ticker', 'name', 'exchange'])

    try:
        stacked = pd.concat(company_sets, ignore_index=True)
        # CIK is the primary key, so dedupe on it.
        return stacked.drop_duplicates(subset=['cik']).reset_index(drop=True)
    except Exception as e:
        log.error(f"Error combining company sets: {e}")
        # Fall back to the first set untouched.
        return company_sets[0].copy() if company_sets else pd.DataFrame(columns=['cik', 'ticker', 'name', 'exchange'])
|
||||
|
||||
|
||||
def intersect_company_sets(company_sets: List[pd.DataFrame]) -> pd.DataFrame:
    """
    Intersect multiple company DataFrames on CIK.

    Args:
        company_sets: List of company DataFrames to intersect

    Returns:
        DataFrame containing only companies (by CIK) present in every set;
        rows come from the first set

    Example:
        >>> nyse = get_companies_by_exchanges('NYSE')
        >>> popular = get_popular_companies()
        >>> nyse_popular = intersect_company_sets([nyse, popular])
    """
    if not company_sets:
        return pd.DataFrame(columns=['cik', 'ticker', 'name', 'exchange'])

    if len(company_sets) == 1:
        return company_sets[0].copy()

    try:
        # Narrow the first set against every other set's CIKs.
        result = company_sets[0].copy()
        for other in company_sets[1:]:
            shared = set(result['cik']) & set(other['cik'])
            result = result[result['cik'].isin(shared)]

        return result.reset_index(drop=True)

    except Exception as e:
        log.error(f"Error intersecting company sets: {e}")
        return company_sets[0].copy() if company_sets else pd.DataFrame(columns=['cik', 'ticker', 'name', 'exchange'])
|
||||
|
||||
|
||||
def get_companies_by_industry(
    sic: Optional[Union[int, List[int]]] = None,
    sic_range: Optional[tuple[int, int]] = None,
    sic_description_contains: Optional[str] = None
) -> pd.DataFrame:
    """
    Get companies by industry using SIC (Standard Industrial Classification) codes.

    Always loads the comprehensive dataset (use_comprehensive=True), since only
    it carries industry metadata. All given criteria are ANDed together.

    Args:
        sic: Single SIC code or list of SIC codes to match exactly
        sic_range: Tuple of (min_sic, max_sic) for range filtering (inclusive)
        sic_description_contains: Substring to search within the SIC
            description (case-insensitive)

    Returns:
        DataFrame of companies matching the criteria, with comprehensive
        metadata columns; an empty frame with the same columns on failure

    Example:
        >>> pharma = get_companies_by_industry(sic=2834)
        >>> biotech = get_companies_by_industry(sic_range=(2833, 2836))
        >>> software = get_companies_by_industry(sic_description_contains='software')
        >>> healthcare = get_companies_by_industry(sic=[2834, 2835, 2836])

    Note:
        SIC Code Ranges:
        - 0100-0999: Agriculture, Forestry, Fishing
        - 1000-1499: Mining
        - 1500-1799: Construction
        - 2000-3999: Manufacturing
        - 4000-4999: Transportation, Communications, Utilities
        - 5000-5199: Wholesale Trade
        - 5200-5999: Retail Trade
        - 6000-6799: Finance, Insurance, Real Estate
        - 7000-8999: Services
        - 9100-9729: Public Administration
    """
    # Industry metadata only exists in the comprehensive dataset.
    companies = get_all_companies(use_comprehensive=True)
    result = companies.copy()

    try:
        # Exact SIC code(s).
        if sic is not None:
            codes = [sic] if isinstance(sic, int) else sic
            result = result[result['sic'].isin(codes)]

        # Inclusive SIC range.
        if sic_range is not None:
            low, high = sic_range
            result = result[(result['sic'] >= low) & (result['sic'] <= high)]

        # Case-insensitive description search.
        if sic_description_contains is not None:
            desc_mask = result['sic_description'].str.contains(
                sic_description_contains,
                case=False,
                na=False
            )
            result = result[desc_mask]

        return result.reset_index(drop=True)

    except Exception as e:
        log.error(f"Error filtering companies by industry: {e}")
        return pd.DataFrame(columns=companies.columns)
|
||||
|
||||
|
||||
def get_companies_by_state(
    states: Union[str, List[str]],
    include_description: bool = True
) -> pd.DataFrame:
    """
    Get companies by state of incorporation.

    Always loads the comprehensive dataset (use_comprehensive=True), since only
    it carries state metadata.

    Args:
        states: Single state code or list of state codes (e.g., 'DE', 'CA',
            ['DE', 'NV']); matched case-insensitively
        include_description: If True (default), the output keeps the
            state_of_incorporation_description column; if False, it is dropped

    Returns:
        DataFrame with companies incorporated in the specified state(s);
        an empty frame with the dataset's columns on failure

    Example:
        >>> de_corps = get_companies_by_state('DE')
        >>> de_nv = get_companies_by_state(['DE', 'NV'])
        >>> ca_corps = get_companies_by_state('CA')

    Note:
        Common states of incorporation:
        - DE: Delaware (most common for public companies)
        - NV: Nevada (popular for tax benefits)
        - CA: California
        - NY: New York
        - TX: Texas
    """
    if isinstance(states, str):
        states = [states]

    # Auto-enable comprehensive mode for state filtering
    companies = get_all_companies(use_comprehensive=True)

    try:
        # Normalize state codes to uppercase
        states_upper = [s.upper() for s in states]

        result = companies[
            companies['state_of_incorporation'].str.upper().isin(states_upper)
        ].reset_index(drop=True)

        # BUG FIX: include_description was previously accepted but ignored.
        # Default True preserves the old output exactly.
        if not include_description and 'state_of_incorporation_description' in result.columns:
            result = result.drop(columns=['state_of_incorporation_description'])

        return result

    except Exception as e:
        log.error(f"Error filtering companies by state {states}: {e}")
        return pd.DataFrame(columns=companies.columns)
|
||||
|
||||
|
||||
# Convenience functions for common use cases
|
||||
|
||||
def get_faang_companies() -> pd.DataFrame:
    """Get FAANG companies (Facebook/Meta, Apple, Amazon, Netflix, Google)."""
    faang_tickers = ['META', 'AAPL', 'AMZN', 'NFLX', 'GOOGL']
    return filter_companies(get_all_companies(), ticker_list=faang_tickers)
|
||||
|
||||
|
||||
def get_tech_giants() -> pd.DataFrame:
    """Get major technology companies (fixed, curated ticker list)."""
    giants = [
        'AAPL', 'MSFT', 'GOOGL', 'AMZN', 'META', 'TSLA',
        'NVDA', 'NFLX', 'ADBE', 'CRM', 'ORCL', 'INTC', 'CSCO',
    ]
    return filter_companies(get_all_companies(), ticker_list=giants)
|
||||
|
||||
|
||||
def get_dow_jones_sample() -> pd.DataFrame:
    """Get a sample of Dow Jones Industrial Average companies (fixed ticker list)."""
    dow_members = [
        'AAPL', 'MSFT', 'UNH', 'GS', 'HD', 'CAT',
        'MCD', 'V', 'AXP', 'BA', 'TRV', 'JPM',
        'IBM', 'JNJ', 'WMT', 'CVX', 'NKE', 'MRK',
        'KO', 'DIS', 'MMM', 'DOW', 'CSCO', 'VZ',
        'INTC', 'WBA', 'CRM', 'HON', 'AMGN', 'PG',
    ]
    return filter_companies(get_all_companies(), ticker_list=dow_members)
|
||||
|
||||
|
||||
# Industry-specific convenience functions (require comprehensive dataset)
|
||||
|
||||
def get_pharmaceutical_companies() -> pd.DataFrame:
    """
    Get pharmaceutical preparation companies (SIC 2834).

    Covers the pharmaceutical preparations industry: prescription drugs,
    biologics, and vaccines.
    """
    pharma_sic = 2834
    return get_companies_by_industry(sic=pharma_sic)
|
||||
|
||||
|
||||
def get_biotechnology_companies() -> pd.DataFrame:
    """
    Get biotechnology companies (SIC 2833-2836).

    Covers biotech and the adjacent pharmaceutical industries.
    """
    biotech_range = (2833, 2836)
    return get_companies_by_industry(sic_range=biotech_range)
|
||||
|
||||
|
||||
def get_software_companies() -> pd.DataFrame:
    """
    Get software and computer programming companies (SIC 7371-7379).

    Covers software publishing, programming, and related services.
    """
    software_range = (7371, 7379)
    return get_companies_by_industry(sic_range=software_range)
|
||||
|
||||
|
||||
def get_semiconductor_companies() -> pd.DataFrame:
    """
    Get semiconductor and electronic component companies (SIC 3674).

    Covers manufacturers of semiconductors and related devices.
    """
    semiconductor_sic = 3674
    return get_companies_by_industry(sic=semiconductor_sic)
|
||||
|
||||
|
||||
def get_banking_companies() -> pd.DataFrame:
    """
    Get commercial banking companies (SIC 6020-6029).

    Covers national and state commercial banks.
    """
    banking_range = (6020, 6029)
    return get_companies_by_industry(sic_range=banking_range)
|
||||
|
||||
|
||||
def get_investment_companies() -> pd.DataFrame:
    """
    Get investment companies and funds (SIC 6200-6299).

    Covers securities brokers, dealers, investment advisors, and funds.
    """
    investment_range = (6200, 6299)
    return get_companies_by_industry(sic_range=investment_range)
|
||||
|
||||
|
||||
def get_insurance_companies() -> pd.DataFrame:
    """
    Get insurance companies (SIC 6300-6399).

    Covers life, health, property, and casualty insurers.
    """
    insurance_range = (6300, 6399)
    return get_companies_by_industry(sic_range=insurance_range)
|
||||
|
||||
|
||||
def get_real_estate_companies() -> pd.DataFrame:
    """
    Get real estate companies (SIC 6500-6599).

    Covers REITs, real estate operators, and developers.
    """
    real_estate_range = (6500, 6599)
    return get_companies_by_industry(sic_range=real_estate_range)
|
||||
|
||||
|
||||
def get_oil_gas_companies() -> pd.DataFrame:
    """
    Get oil and gas extraction companies (SIC 1300-1399).

    Covers crude petroleum, natural gas, and oil/gas field services.
    """
    oil_gas_range = (1300, 1399)
    return get_companies_by_industry(sic_range=oil_gas_range)
|
||||
|
||||
|
||||
def get_retail_companies() -> pd.DataFrame:
    """
    Get retail trade companies (SIC 5200-5999).

    Covers general merchandise, apparel, food, and other retail stores.
    """
    retail_range = (5200, 5999)
    return get_companies_by_industry(sic_range=retail_range)
|
||||
@@ -0,0 +1,300 @@
|
||||
# Portfolio Manager Database - Manual Maintenance Guide
|
||||
|
||||
This guide explains how to manually add, update, and maintain portfolio manager information in the EdgarTools database.
|
||||
|
||||
## File Location
|
||||
**Database File**: `edgar/data/portfolio_managers.json` (relative to the repository root)
|
||||
|
||||
## Database Structure
|
||||
|
||||
The JSON file has two main sections:
|
||||
|
||||
### 1. Metadata Section
|
||||
```json
|
||||
{
|
||||
"metadata": {
|
||||
"version": "2024.12.01",
|
||||
"description": "Curated database of portfolio managers for major 13F filing institutions",
|
||||
"total_companies": 15,
|
||||
"total_managers": 25,
|
||||
"last_updated": "2024-12-01",
|
||||
"sources": ["company_websites", "sec_filings", "press_releases", "public_records"]
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
**Update when adding managers:**
|
||||
- Increment `total_companies` when adding new companies
|
||||
- Increment `total_managers` when adding new individual managers
|
||||
- Update `last_updated` to current date
|
||||
|
||||
### 2. Managers Section
|
||||
Each company entry follows this structure:
|
||||
|
||||
```json
|
||||
{
|
||||
"managers": {
|
||||
"company_key": {
|
||||
"company_name": "Full Legal Company Name",
|
||||
"aum_billions": 123,
|
||||
"match_patterns": ["pattern1", "pattern2", "pattern3"],
|
||||
"website": "https://www.company.com",
|
||||
"managers": [
|
||||
{
|
||||
"name": "Manager Full Name",
|
||||
"title": "Official Title",
|
||||
"status": "active|retired|deceased|former",
|
||||
"confidence": "high|medium|low",
|
||||
"sources": ["source1", "source2"],
|
||||
"start_date": "YYYY-MM-DD",
|
||||
"end_date": "YYYY-MM-DD",
|
||||
"last_verified": "YYYY-MM-DD",
|
||||
"note": "Additional context or details"
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## Adding New Companies
|
||||
|
||||
### Step 1: Choose Company Key
|
||||
Use lowercase, underscore-separated format:
|
||||
- ✅ Good: `berkshire_hathaway`, `goldman_sachs`, `two_sigma`
|
||||
- ❌ Bad: `Berkshire-Hathaway`, `goldmanSachs`, `TwoSigma`
|
||||
|
||||
### Step 2: Research Company Information
|
||||
Gather the following data:
|
||||
|
||||
**Required:**
|
||||
- Full legal company name (from SEC filings)
|
||||
- Current AUM in billions (approximate is fine)
|
||||
- Company website URL
|
||||
- Portfolio manager names and titles
|
||||
|
||||
**Recommended Sources:**
|
||||
1. Company website "Leadership" or "Team" pages
|
||||
2. Latest 10-K filing (Item 10 - Directors, Executive Officers and Corporate Governance)
|
||||
3. Latest DEF 14A proxy statement
|
||||
4. Recent press releases
|
||||
5. Financial news articles
|
||||
|
||||
### Step 3: Add Company Entry
|
||||
```json
|
||||
{
|
||||
"new_company": {
|
||||
"company_name": "New Company Inc",
|
||||
"aum_billions": 50,
|
||||
"match_patterns": ["new company", "newco", "nc inc"],
|
||||
"website": "https://www.newcompany.com",
|
||||
"managers": []
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
**Match Patterns Tips:**
|
||||
- Include common variations of company name
|
||||
- Include stock ticker symbols if applicable
|
||||
- Include abbreviations commonly used
|
||||
- All patterns should be lowercase
|
||||
|
||||
### Step 4: Add Manager Information
|
||||
```json
|
||||
{
|
||||
"managers": [
|
||||
{
|
||||
"name": "Jane Smith",
|
||||
"title": "Chief Investment Officer",
|
||||
"status": "active",
|
||||
"confidence": "high",
|
||||
"sources": ["company_website", "sec_filing_2024"],
|
||||
"start_date": "2020-01-01",
|
||||
"last_verified": "2024-12-01",
|
||||
"note": "Former Goldman Sachs managing director"
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
## Manager Status Definitions
|
||||
|
||||
- **active**: Currently in active management role
|
||||
- **retired**: Retired but may retain advisory role
|
||||
- **deceased**: Deceased (include year in status like "deceased_2023")
|
||||
- **former**: No longer with the organization
|
||||
|
||||
## Confidence Levels
|
||||
|
||||
- **high**: Verified from multiple official sources (company website + SEC filing)
|
||||
- **medium**: Verified from single official source
|
||||
- **low**: Approximate or historical information
|
||||
|
||||
## Common Sources
|
||||
|
||||
**Primary (High Confidence):**
|
||||
- `company_website` - Official leadership pages
|
||||
- `sec_filings` - 10-K, DEF 14A proxy statements
|
||||
- `annual_report_2024` - Latest annual report
|
||||
|
||||
**Secondary (Medium Confidence):**
|
||||
- `press_releases` - Official company announcements
|
||||
- `financial_press` - WSJ, FT, Bloomberg articles
|
||||
- `industry_publications` - Trade publications
|
||||
|
||||
**Tertiary (Low Confidence):**
|
||||
- `linkedin_profile` - Professional profiles
|
||||
- `wikipedia` - Publicly edited sources
|
||||
- `interview_transcript` - Media interviews
|
||||
|
||||
## Example: Adding a New Manager
|
||||
|
||||
Let's add a new company "Example Capital Management":
|
||||
|
||||
```json
|
||||
{
|
||||
"example_capital": {
|
||||
"company_name": "Example Capital Management LLC",
|
||||
"aum_billions": 25,
|
||||
"match_patterns": ["example capital", "example", "ecm"],
|
||||
"website": "https://www.examplecapital.com",
|
||||
"managers": [
|
||||
{
|
||||
"name": "John Doe",
|
||||
"title": "Founder & Chief Investment Officer",
|
||||
"status": "active",
|
||||
"confidence": "high",
|
||||
"sources": ["company_website", "sec_filing_2024"],
|
||||
"start_date": "2015-01-01",
|
||||
"last_verified": "2024-12-01",
|
||||
"note": "Former hedge fund analyst at Two Sigma"
|
||||
},
|
||||
{
|
||||
"name": "Sarah Wilson",
|
||||
"title": "Portfolio Manager",
|
||||
"status": "active",
|
||||
"confidence": "medium",
|
||||
"sources": ["company_website"],
|
||||
"start_date": "2018-06-01",
|
||||
"last_verified": "2024-12-01",
|
||||
"note": "Specializes in technology sector investments"
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## Data Validation Checklist
|
||||
|
||||
Before adding entries, verify:
|
||||
|
||||
- [ ] Company key is lowercase with underscores
|
||||
- [ ] Company name matches legal entity in SEC filings
|
||||
- [ ] AUM is reasonable (check recent 13F filings)
|
||||
- [ ] Match patterns are comprehensive and lowercase
|
||||
- [ ] Manager names are spelled correctly (double-check sources)
|
||||
- [ ] Status is appropriate (active/retired/deceased/former)
|
||||
- [ ] Confidence level matches quality of sources
|
||||
- [ ] Dates are in YYYY-MM-DD format
|
||||
- [ ] Sources are specific and verifiable
|
||||
- [ ] Notes provide helpful context
|
||||
|
||||
## Updating Existing Entries
|
||||
|
||||
### Manager Status Changes
|
||||
When a manager retires, is promoted, or leaves:
|
||||
|
||||
```json
|
||||
{
|
||||
"name": "John Smith",
|
||||
"title": "Former CEO",
|
||||
"status": "retired",
|
||||
"end_date": "2024-06-30",
|
||||
"note": "Retired June 2024, remains on board of directors"
|
||||
}
|
||||
```
|
||||
|
||||
### Adding New Managers to Existing Companies
|
||||
Simply add to the managers array:
|
||||
|
||||
```json
|
||||
{
|
||||
"managers": [
|
||||
// ... existing managers ...
|
||||
{
|
||||
"name": "New Manager Name",
|
||||
"title": "Chief Investment Officer",
|
||||
"status": "active",
|
||||
// ... complete manager entry
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
## Testing Your Changes
|
||||
|
||||
After making changes, test the functionality:
|
||||
|
||||
```python
|
||||
import edgar
|
||||
|
||||
# Test with a company you added/modified
|
||||
company = edgar.Company("COMPANY_TICKER")
|
||||
filing = company.get_filings(form="13F-HR").head(1)[0]
|
||||
thirteen_f = filing.obj()
|
||||
|
||||
# Check if your managers are returned
|
||||
managers = thirteen_f.get_portfolio_managers()
|
||||
print(f"Found managers: {managers}")
|
||||
|
||||
# Test manager info summary
|
||||
summary = thirteen_f.get_manager_info_summary()
|
||||
print(f"Manager count: {summary['external_sources']['manager_count']}")
|
||||
```
|
||||
|
||||
## Common Mistakes to Avoid
|
||||
|
||||
1. **Inconsistent naming**: Use exact legal names from SEC filings
|
||||
2. **Missing match patterns**: Add common abbreviations and variations
|
||||
3. **Outdated information**: Always verify against recent sources
|
||||
4. **Low confidence data**: Avoid unverified Wikipedia or blog sources
|
||||
5. **JSON syntax errors**: Use a JSON validator before saving
|
||||
6. **Forgetting metadata**: Update total counts and last_updated date
|
||||
|
||||
## Priority Companies to Add
|
||||
|
||||
Focus on top 13F filers by AUM:
|
||||
|
||||
1. **Immediate Priority (AUM > $100B):**
|
||||
- Already added: BlackRock, Vanguard, Fidelity, State Street
|
||||
- Still needed: T. Rowe Price, Capital Group, Invesco
|
||||
|
||||
2. **High Priority (AUM $50-100B):**
|
||||
- Already added: AQR, Citadel, Two Sigma, Renaissance
|
||||
- Still needed: Millennium, D.E. Shaw, Baupost Group
|
||||
|
||||
3. **Medium Priority (AUM $20-50B):**
|
||||
- Already added: Elliott, Pershing Square, Icahn
|
||||
- Still needed: Third Point, ValueAct, Jana Partners
|
||||
|
||||
This systematic approach will provide coverage for the majority of institutional investment assets tracked in 13F filings.
|
||||
|
||||
---
|
||||
|
||||
## Enhancement Planning
|
||||
|
||||
**Current Status**: As of January 2025, this database covers 21 companies with verified CIKs (53.8% by count, 63.5% by AUM).
|
||||
|
||||
**Enhancement Roadmap**: See `docs-internal/features/FEAT-021-portfolio-manager-enhancement-followup.md` for:
|
||||
- Systematic expansion plans to reach 85% AUM coverage
|
||||
- Quarterly maintenance automation
|
||||
- International firm integration strategy
|
||||
- Historical manager tracking capabilities
|
||||
|
||||
**Priority Targets for Next Expansion**:
|
||||
1. **Vanguard Group** ($8.1T AUM) - Research filing patterns
|
||||
2. **Capital Group Companies** ($2.8T AUM) - American Funds family
|
||||
3. **T. Rowe Price Group** ($1.6T AUM) - Major active manager
|
||||
4. **Wellington Management** ($1.3T AUM) - Institutional specialist
|
||||
|
||||
For enhancement requests or database improvements, see the follow-up planning document and contribute via GitHub issues.
|
||||
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,41 @@
|
||||
import sys
|
||||
from functools import lru_cache
|
||||
|
||||
import pandas as pd
|
||||
import pyarrow.parquet as pq
|
||||
|
||||
# Dynamic import based on Python version
|
||||
if sys.version_info >= (3, 9):
|
||||
from importlib import resources
|
||||
else:
|
||||
import importlib_resources as resources
|
||||
|
||||
__all__ = ['read_parquet_from_package', 'read_pyarrow_from_package', 'read_csv_from_package']
|
||||
|
||||
|
||||
@lru_cache(maxsize=1)
def read_parquet_from_package(parquet_filename: str):
    """
    Read a parquet file bundled in the ``edgar.reference.data`` package
    into a pandas DataFrame.

    NOTE: the cache holds only the single most recently requested file
    (maxsize=1), so alternating between two filenames re-reads each time.
    """
    # Resolve the packaged file to a real filesystem path for the read.
    with resources.path('edgar.reference.data', parquet_filename) as parquet_path:
        return pd.read_parquet(parquet_path)
|
||||
|
||||
|
||||
def read_pyarrow_from_package(parquet_filename: str):
    """
    Read a parquet file bundled in the ``edgar.reference.data`` package
    as a pyarrow Table (uncached, unlike ``read_parquet_from_package``).
    """
    with resources.path('edgar.reference.data', parquet_filename) as packaged_path:
        return pq.read_table(packaged_path)
|
||||
|
||||
|
||||
def read_csv_from_package(csv_filename: str, **pandas_kwargs):
    """
    Read a CSV file bundled in the ``edgar.reference.data`` package into a
    pandas DataFrame, forwarding ``pandas_kwargs`` to ``pd.read_csv``.
    """
    with resources.path('edgar.reference.data', csv_filename) as packaged_csv:
        return pd.read_csv(packaged_csv, **pandas_kwargs)
|
||||
BIN
venv/lib/python3.10/site-packages/edgar/reference/data/ct.pq
Normal file
BIN
venv/lib/python3.10/site-packages/edgar/reference/data/ct.pq
Normal file
Binary file not shown.
@@ -0,0 +1,37 @@
|
||||
Exhibit No.,Description,Form Types Involved,Regex
|
||||
1,Underwriting Agreement,"S-1, S-3, F-1, F-3, S-8, S-11, etc.",^EX-1\b
|
||||
2,"Plan of Acquisition, Reorganization, Arrangement, Liquidation or Succession","Commonly used across various forms including S-4, S-1, S-11, 10-K, etc.",^EX-2\b
|
||||
3,Articles of Incorporation and Bylaws,"S-1, S-3, F-1, F-3, S-8, S-11, etc.",^EX-3(\.\d+)?\b
|
||||
4,"Instruments Defining the Rights of Security Holders, including Indentures",Required across various form types,^EX-4\b
|
||||
5,Opinion regarding Legality,Typically required across all form types,^EX-5\b
|
||||
6,Reserved,N/A,^EX-6\b
|
||||
7,Correspondence from Independent Accountants,Limited use (specific forms only),^EX-7\b
|
||||
8,Opinion re Tax Matters,"S-11, F-1, F-3, S-3, S-8",^EX-8\b
|
||||
9,Voting Trust Agreements,Mostly required in S-4 and other specific forms,^EX-9\b
|
||||
10,Material Contracts,Required widely across forms for significant contracts,^EX-10(\.\d+)?\b
|
||||
11,Statement re Computation of Per Share Earnings,Commonly required where applicable,^EX-11\b
|
||||
12,Statements re Computation of Ratios,Required in forms where ratios are relevant,^EX-12\b
|
||||
13,Annual Report to Security Holders,Typically part of 10-K or annual disclosures,^EX-13\b
|
||||
14,Code of Ethics,Required disclosure for most forms,^EX-14\b
|
||||
15,Letter re Unaudited Interim Financial Information,Used in specific situations across various forms,^EX-15\b
|
||||
16,Letter re Change in Certifying Accountant,Used primarily in 10-K and 10-Q,^EX-16\b
|
||||
17,Correspondence on Departure of Director,"Occasionally required, depending on the circumstances",^EX-17\b
|
||||
18,Letter re Change in Accounting Principles,Used when significant changes in accounting principles occur,^EX-18\b
|
||||
19,Report Furnished to Security Holders,Often part of 10-Q or similar reports,^EX-19\b
|
||||
20,Other Documents or Statements to Security Holders,"As applicable, varies by form and content required",^EX-20\b
|
||||
21,Subsidiaries of the Registrant,Required across various forms depending on the structure of the registrant,^EX-21\b
|
||||
22,Published Report Regarding Matters Submitted to Vote of Security Holders,As applicable to the voting matters,^EX-22\b
|
||||
23,Consents of Experts and Counsel,Required across various forms when expert consents are necessary,^EX-23(\.\d+)?\b
|
||||
24,Power of Attorney,"As required, often associated with filings involving multiple signatories",^EX-24\b
|
||||
25,Statement of Eligibility of Trustee,Required in filings involving indentures under the Trust Indenture Act,^EX-25\b
|
||||
26,Invitation for Competitive Bids,Required in specific cases involving competitive bids,^EX-26\b
|
||||
27-30,Reserved,N/A,^EX-(27|28|29|30)\b
|
||||
31,Rule 13a-14(a)/15d-14(a) Certifications,Common certification required across various forms,^EX-31(\.\d+)?\b
|
||||
32,Section 1350 Certifications,Required under specific legal stipulations,^EX-32\b
|
||||
33-34,Assessment and Attestation Reports regarding Compliance,Specific to asset-backed securities,^EX-(33|34)\b
|
||||
35-36,Servicer Compliance Statement and Depositor Certification,Specific to asset-backed securities,^EX-(35|36)\b
|
||||
95,Mine Safety Disclosure Exhibit,Specific to registrants involved in mining operations,^EX-95\b
|
||||
99,Additional Exhibits,As required by specific circumstances or regulatory demands,^EX-99(\.\d+)?\b
|
||||
100-101,XBRL-Related Documents and Interactive Data File,Required for electronic data submission,^EX-(100|101)\b
|
||||
102-103,Asset Data File and Asset Related Documents,Specific to asset-backed securities filings,^EX-(102|103)\b
|
||||
104-106,Reserved/Static Pool PDF,N/A or specific to asset-backed securities,^EX-(104|105|106)\b
|
||||
|
@@ -0,0 +1,86 @@
|
||||
Ticker,Company,Cik
|
||||
AAPL,Apple Inc.,320193
|
||||
MSFT,Microsoft Corporation,789019
|
||||
AMZN,"Amazon.com, Inc.",1018724
|
||||
NVDA,NVIDIA Corporation,1045810
|
||||
TSLA,"Tesla, Inc.",1318605
|
||||
GOOGL,Alphabet Inc. Class A,1652044
|
||||
META,"Meta Platforms, Inc.",1326801
|
||||
AMD,"Advanced Micro Devices, Inc.",2488
|
||||
NFLX,"Netflix, Inc.",1065280
|
||||
BRK.B,Berkshire Hathaway Inc.,1067983
|
||||
V,Visa Inc.,1403161
|
||||
JNJ,Johnson & Johnson,200406
|
||||
PG,Procter & Gamble Co.,80424
|
||||
JPM,JPMorgan Chase & Co.,19617
|
||||
UNH,UnitedHealth Group Incorporated,731766
|
||||
DIS,The Walt Disney Company,1744489
|
||||
HD,"Home Depot, Inc.",354950
|
||||
XOM,Exxon Mobil Corporation,34088
|
||||
KO,Coca-Cola Company,21344
|
||||
PEP,"PepsiCo, Inc.",77476
|
||||
PFE,Pfizer Inc.,78003
|
||||
MA,Mastercard Incorporated,1141391
|
||||
ADBE,Adobe Inc.,796343
|
||||
CRM,"Salesforce, Inc.",1108524
|
||||
INTC,Intel Corporation,50863
|
||||
CSCO,"Cisco Systems, Inc.",858877
|
||||
NKE,"Nike, Inc.",320187
|
||||
T,AT&T Inc.,732717
|
||||
CMCSA,Comcast Corporation,1166691
|
||||
VZ,Verizon Communications Inc.,732712
|
||||
CVX,Chevron Corporation,93410
|
||||
ABBV,AbbVie Inc.,1551152
|
||||
MRK,"Merck & Co., Inc.",310158
|
||||
BMY,Bristol-Myers Squibb Company,14272
|
||||
WMT,Walmart Inc.,104169
|
||||
MCD,McDonald's Corporation,63908
|
||||
SBUX,Starbucks Corporation,829224
|
||||
GS,"Goldman Sachs Group, Inc.",886982
|
||||
MS,Morgan Stanley,895421
|
||||
AXP,American Express Company,4962
|
||||
C,Citigroup Inc.,831001
|
||||
BA,Boeing Company,12927
|
||||
DAL,"Delta Air Lines, Inc.",27904
|
||||
LUV,Southwest Airlines Co.,92380
|
||||
MAR,"Marriott International, Inc.",1048286
|
||||
HLT,Hilton Worldwide Holdings Inc.,1585689
|
||||
BKNG,Booking Holdings Inc.,1075531
|
||||
PYPL,"PayPal Holdings, Inc.",1633917
|
||||
SQ,"Square, Inc.",1512673
|
||||
ZM,"Zoom Video Communications, Inc.",1585521
|
||||
SNOW,Snowflake Inc.,1640147
|
||||
UBER,"Uber Technologies, Inc.",1543151
|
||||
LYFT,"Lyft, Inc.",1759509
|
||||
ROKU,"Roku, Inc.",1428439
|
||||
SPOT,Spotify Technology S.A.,1639920
|
||||
SHOP,Shopify Inc.,1594805
|
||||
EBAY,eBay Inc.,1065088
|
||||
TWTR,"Twitter, Inc.",1418091
|
||||
SNAP,Snap Inc.,1564408
|
||||
PINS,"Pinterest, Inc.",1506293
|
||||
PLTR,Palantir Technologies Inc.,1321655
|
||||
ZI,ZoomInfo Technologies Inc.,1794515
|
||||
DOCU,"DocuSign, Inc.",1261333
|
||||
TWLO,Twilio Inc.,1447669
|
||||
CRWD,"CrowdStrike Holdings, Inc.",1535527
|
||||
NET,"Cloudflare, Inc.",1477333
|
||||
DDOG,"Datadog, Inc.",1561550
|
||||
MDB,"MongoDB, Inc.",1441816
|
||||
ZS,"Zscaler, Inc.",1713683
|
||||
OKTA,"Okta, Inc.",1660134
|
||||
DBX,"Dropbox, Inc.",1467623
|
||||
SMAR,Smartsheet Inc.,1366561
|
||||
ASAN,"Asana, Inc.",1477720
|
||||
RNG,"RingCentral, Inc.",1384905
|
||||
PTON,"Peloton Interactive, Inc.",1639825
|
||||
TTD,"The Trade Desk, Inc.",1671933
|
||||
HUBS,"HubSpot, Inc.",1404655
|
||||
COUP,Coupa Software Incorporated,1385867
|
||||
AYX,"Alteryx, Inc.",1689923
|
||||
SPLK,Splunk Inc.,1353283
|
||||
NEWR,"New Relic, Inc.",1448056
|
||||
DT,"Dynatrace, Inc.",1773383
|
||||
NOW,"ServiceNow, Inc.",1373715
|
||||
WDAY,"Workday, Inc.",1327811
|
||||
ADSK,"Autodesk, Inc.",769397
|
||||
|
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,312 @@
|
||||
Form,Description
|
||||
1-A POS,Reg A Offering Amendment
|
||||
1-A-W,Reg A Offering Withdrawal
|
||||
1-E,Notification filing for small business investment companies
|
||||
1-E AD,Sales material for small business investment companies
|
||||
1-K,Annual report for Regulation A issuers
|
||||
1-SA,Semiannual report for Regulation A issuers
|
||||
1-U,Current report for Regulation A issuers
|
||||
1-Z,Exit report for terminated Regulation A offerings
|
||||
1-Z-W,Withdrawal of Regulation A exit report
|
||||
2-E,Report of securities sales
|
||||
10-12B,Registration of a class of securities
|
||||
10-12G,Registration of a class of securities
|
||||
10-D,Periodic distribution reports for asset-backed securities
|
||||
10-K,Annual report for public companies
|
||||
10-KT,Transition report with change in fiscal year
|
||||
10-Q,Quarterly report for public companies
|
||||
10-QT,Quarterly transition report with change in fiscal year
|
||||
11-K,Annual report for employee stock plans
|
||||
11-KT,Transition report for employee stock plans
|
||||
13F-CTR,Confidential treatment request by institutional managers
|
||||
13F-HR,Initial quarterly holdings report by institutional managers
|
||||
13F-NT,Initial quarterly notice by institutional managers
|
||||
13H,Registration for large traders
|
||||
144,Notice of proposed sale
|
||||
15-12G,Securities registration termination
|
||||
15-15D,Suspension of reporting obligations
|
||||
15F-12B,Foreign private issuer equity securities termination
|
||||
15F-12G,Securities registration termination by foreign private issuer
|
||||
15F-15D,Foreign private issuer reporting suspension
|
||||
17HACON,Confidential annual broker-dealer report
|
||||
17HQCON,Confidential quarterly broker-dealer report
|
||||
18-12B,Securities registration by foreign governments
|
||||
18-12G,Securities registration by foreign governments
|
||||
18-K,Annual report for foreign governments
|
||||
20-F,Annual report for foreign companies
|
||||
20FR12B,Foreign private issuer securities registration
|
||||
20FR12G,Foreign private issuer securities registration
|
||||
24F-2NT,Rule 24F-2 notice for investment companies
|
||||
25,Securities delisting
|
||||
25-NSE,Notice of matured/redeemed/retired securities by exchanges
|
||||
3,Initial statement of beneficial ownership
|
||||
305B2,Application for new trustee
|
||||
4,Statement of changes in beneficial ownership
|
||||
40-6B,Application by employees' securities company
|
||||
40-17F1,Custody report for management investment companies
|
||||
40-17F2,Custody report for management investment companies
|
||||
40-17G,Fidelity bond filing for investment companies
|
||||
40-17GCS,Claims and settlements under investment company fidelity bond
|
||||
40-24B2,Sales literature filing for investment companies
|
||||
40-33,Investment company shareholder derivative actions
|
||||
40-8B25,Investment company report or document
|
||||
40-8F-2,Application for deregistration by investment companies
|
||||
40-APP,Applications under Investment Company/Advisers Acts
|
||||
40-F,Annual report (Canadian)
|
||||
40FR12B,Securities registration by certain Canadian issuers
|
||||
40FR12G,Securities registration by certain Canadian issuers
|
||||
40-OIP,Applications under Investment Company/Advisers Acts reviewed by insurance office
|
||||
424A,Prospectus outlining the details of securities offered by a company
|
||||
424B1,Initial primary offering
|
||||
424B2,Primary offering prospectus
|
||||
424B3,Prospectus supplement
|
||||
424B4,Prospectus supplement with pricing
|
||||
424B5,Supplement to primary offering
|
||||
424B7,Prospectus with material changes
|
||||
424B8,Final prospectus changes
|
||||
424H,Preliminary prospectus
|
||||
425,Prospectus in business combination transactions
|
||||
424I,Prospectus filed under Rule 424(i)(1)
|
||||
485APOS,Post effective amendment
|
||||
485BPOS,Post effective amendment
|
||||
485BXT,Amendment to designate new effective date
|
||||
486APOS,Post-effective amendment
|
||||
486BPOS,Post-effective amendment
|
||||
486BXT,Amendment to designate new effective date
|
||||
487,Pre-effective pricing amendment under Rule 487
|
||||
497,Fund prospectus
|
||||
497AD,Rule 482 ads filed under Rule 497
|
||||
497H2,Filings under Rule 497(h)(2)
|
||||
497J,Certification of no changes to prospectus
|
||||
497K,Summary fund prospectus
|
||||
497VPI,Variable contracts summary prospectus
|
||||
497VPSUB,Substitution-related supplement for variable contracts
|
||||
497VPU,Updated summary prospectus for variable contracts
|
||||
5,Annual statement of beneficial ownership changes
|
||||
6-K,Foreign issuer current report
|
||||
8-A12B,Registration of securities
|
||||
8-A12G,Registration of securities
|
||||
8-K,Current report
|
||||
8-K12B,Successor issuer registration
|
||||
8-K12G3,Successor issuer registration
|
||||
8-K15D5,Successor issuer reporting
|
||||
ABS-15G,Asset-backed securities report
|
||||
ABS-EE,Electronic exhibits for asset-backed securities offerings
|
||||
ANNLRPT,Annual development bank report
|
||||
APP WD,Withdrawal of exemptive relief application
|
||||
ARS,Annual report to security holders
|
||||
ATS-N,Initial Alternative Trading System (ATS) notice
|
||||
ATS-N/CA,Correcting amendment to ATS notice
|
||||
ATS-N/MA,Material amendment to ATS notice
|
||||
ATS-N/OFA,Order display and fair access amendment to ATS notice
|
||||
ATS-N/UA,Updating amendment to ATS notice
|
||||
ATS-N-C,Notice of ATS cessation
|
||||
ATS-N-W,Withdrawal of ATS notice
|
||||
AW,Withdrawal of Securities Act registration amendment
|
||||
AW WD,Withdrawal request for registration amendment withdrawal
|
||||
BULK,Bulk submission
|
||||
C,Offering statement
|
||||
C-W,Withdrawal of offering statement
|
||||
C/A-W,Withdrawal of offering statement amendment
|
||||
C-U,Progress update
|
||||
C-U-W,Withdrawal of progress update
|
||||
C-AR,Annual report
|
||||
C-AR-W,Withdrawal of annual report
|
||||
C-AR/A-W,Withdrawal of annual report amendment
|
||||
C-TR,Termination of reporting
|
||||
C-TR-W,Withdrawal of termination of reporting
|
||||
CB,Notice for certain foreign issuer transactions
|
||||
CERT,Exchange certification of listing approval
|
||||
CFPORTAL,Funding portal registration application
|
||||
CFPORTAL-W,Withdrawal of funding portal registration
|
||||
CORRESP,Correspondence with the SEC
|
||||
D,Notice of exempt Regulation D offering
|
||||
DEF 14A,Definitive proxy statement
|
||||
DEF 14C,Definitive information statement
|
||||
DEFA14A,Additional definitive proxy materials
|
||||
DEFA14C,Additional definitive information statement materials
|
||||
DEFC14A,Definitive proxy statement - contested solicitation
|
||||
DEFC14C,Definitive information statement - contested solicitation
|
||||
DEFM14A,Definitive proxy statement for merger/acquisition
|
||||
DEFM14C,Definitive information statement for merger/acquisition
|
||||
DEFN14A,Definitive proxy statement by non-management
|
||||
DEFR14A,Revised definitive proxy materials
|
||||
DEFR14C,Revised definitive information statement materials
|
||||
DEL AM,Delaying amendment for registration statement
|
||||
DFAN14A,Additional proxy materials by non-management
|
||||
DFRN14A,Revised proxy statement by non-management
|
||||
DOS,Draft offering statement under Regulation A
|
||||
DOSLTR,Draft offering statement letter
|
||||
DRS,Draft registration statement
|
||||
DRSLTR,Draft registration statement letter
|
||||
DSTRBRPT,Distribution report for development bank
|
||||
F-1,Securities registration by foreign private issuers
|
||||
F-10,Securities registration by certain Canadian issuers
|
||||
F-10EF,Auto-effective registration by certain Canadian issuers
|
||||
F-10POS,Amendment to F-10EF registration
|
||||
F-1MEF,Additional securities registered to prior F-1
|
||||
F-3,Foreign private securities registration
|
||||
F-3ASR,Foreign private securities registration
|
||||
F-3D,Foreign private securities registration
|
||||
F-3DPOS,Amendment to F-3D registration
|
||||
F-3MEF,Additional securities registered to prior F-3
|
||||
F-4,Business combination for foreign issuers
|
||||
F-4MEF,Additional securities registered to prior F-4
|
||||
F-6 POS,Amendment to F-6EF registration
|
||||
F-6,Depositary receipts by foreign private issuers
|
||||
F-6EF,Depositary receipts by foreign private issuers
|
||||
F-7 POS,Amended F-7 registration
|
||||
F-7,Canadian rights offerings
|
||||
F-8 POS,Amendment to F-8 registration
|
||||
F-8,Canadian business combination
|
||||
F-80,Canadian business combination
|
||||
F-80POS,Amendment to F-80 registration
|
||||
F-N,Appointment of agent for service by foreign institutions
|
||||
FWP,Filing of free writing prospectuses
|
||||
F-X,Appointment of agent for service by foreign issuers
|
||||
IRANNOTICE,Notice of Iran/Syria disclosures in periodic reports
|
||||
MA,Municipal advisor registration
|
||||
MA-I,Natural persons engaged in municipal advisory activities
|
||||
MA-W,Withdrawal from municipal advisor registration
|
||||
MODULE,Module submission
|
||||
N-14 8C,Initial registration statement by closed-end funds for business combinations
|
||||
N-14,Initial registration statement by open-end funds for business combinations
|
||||
N-14MEF,Additional securities registered by closed-end funds
|
||||
N-18F1,Election of terms for future filings
|
||||
N-1A,Initial registration statement for open-end funds
|
||||
N-2,Closed-end fund registration
|
||||
N-2ASR,Closed-end fund automatic registration
|
||||
N-2 POSASR,Amendment to N-2ASR registration
|
||||
N-23C-2,Notice of closed-end fund's intention to call or redeem securities
|
||||
N-23C3A,Closed-end fund periodic repurchase offer notice
|
||||
N-23C3B,Filing under Rule 23c-3(b) by closed-end funds
|
||||
N-23C3C,Filings under Rule 23c-3(b) and (c) by closed-end funds
|
||||
N-27D-1,Accounting report for segregated trust accounts
|
||||
N-2MEF,Additional securities registered to prior N-2
|
||||
N-3,Initial registration for separate accounts of management companies
|
||||
N-30B-2,Periodic reports (other than annual/semi-annual) by management companies
|
||||
N-30D,Annual and semi-annual reports by management companies
|
||||
N-4,Initial registration for separate accounts of unit trusts
|
||||
N-5,Registration statement for small business investment companies
|
||||
N-54A,Election filing by business development companies
|
||||
N-54C,Withdrawal filing by business development companies
|
||||
N-6,Registration statement for separate accounts of unit trusts
|
||||
N-6F,Notice by business development companies electing to be subject to Sections 55-65
|
||||
N-8A,Initial notification of registration
|
||||
N-8B-2,Initial registration statement for unit investment trusts
|
||||
N-8B-3,Initial registration statement for periodic payment plans
|
||||
N-8B-4,Initial registration statement for face-amount certificate companies
|
||||
N-8F,Application for deregistration
|
||||
N-CEN,Annual report for registered investment companies
|
||||
N-CR,Current report for money market funds
|
||||
N-CSR,Certified annual shareholder report
|
||||
N-CSRS,Certified semi-annual shareholder report
|
||||
N-MFP2/A,Monthly portfolio holdings for money market funds
|
||||
N-MFP3,Monthly portfolio holdings for money market funds
|
||||
NPORT-EX,Portfolio holdings exhibit to Form N-PORT
|
||||
NPORT-NP,Non-public monthly portfolio investments report
|
||||
NPORT-P,Public monthly portfolio investments report
|
||||
N-PX,Annual proxy voting record report
|
||||
N-PX CTR,Confidential treatment request for Form N-PX
|
||||
N-RN,Current report for registered funds and BDCs
|
||||
NRSRO-UPD,Registration update by credit rating agencies
|
||||
NRSRO-CE,Annual certification by credit rating agencies
|
||||
NRSRO-FR,Annual reports for statistical rating organizations
|
||||
NRSRO-WCLS,Withdrawal from credit rating class for nationally recognized statistical rating organizations
|
||||
NRSRO-WREG,Withdrawal from registration as a nationally recognized statistical rating organization
|
||||
NT 10-K,Late filing of 10-K
|
||||
NT 10-D,Late filing of 10-D
|
||||
NT 10-Q,Late filing of 10-Q
|
||||
NT 11-K,Late filing of 11-K
|
||||
NT 15D2,Late filing of special report
|
||||
NT 20-F,Late filing of Form 20-F
|
||||
NT-NCEN,Late filing of Form N-CEN
|
||||
NT-NCSR,Late filing of Form N-CSR
|
||||
N-VP,Notice document for certain variable contracts
|
||||
N-VPFS,Financial statements for certain variable contracts
|
||||
POS 8C,Post-effective amendment for closed-end funds
|
||||
POS AM,Post-effective amendment to a registration statement
|
||||
POS AMI,Post-effective amendment for investment company filings
|
||||
POSASR,Post-effective amendment to automatic shelf registration
|
||||
POS EX,Post-effective amendment adding exhibits
|
||||
POS462B,Post-effective amendment filed
|
||||
POS462C,Post-effective amendment filed
|
||||
PRE 14A,Preliminary proxy statement
|
||||
PRE 14C,Preliminary information statement
|
||||
PREC14A,Preliminary proxy statement for contested solicitations
|
||||
PREC14C,Preliminary information statement for contested solicitations
|
||||
PREM14A,Preliminary merger proxy statement
|
||||
PREM14C,Preliminary merger information statement
|
||||
PREN14A,Preliminary proxy statement filed by non-management
|
||||
PRER14A,Preliminary revised proxy materials
|
||||
PRER14C,Preliminary revised information statements
|
||||
PRRN14A,Revised preliminary proxy statement non-management
|
||||
PX14A6G,Exempt solicitation
|
||||
PX14A6N,Exempt solicitation for roll-up transaction
|
||||
QRTLYRPT,Development banks quarterly report
|
||||
RW,Registration withdrawal
|
||||
RW WD,Withdrawal of registration withdrawal
|
||||
S-1,Securities registration
|
||||
S-11,Real estate securities registration
|
||||
S-11MEF,Registration statement for prior Form S-11
|
||||
S-1MEF,Registration statement for prior Form S-1
|
||||
S-20,Standardized options registration
|
||||
S-3,Simplified securities registration
|
||||
S-3ASR,Automatic shelf registration
|
||||
S-3D,Dividend reinvestment plans automatic securities registration
|
||||
S-3DPOS,Post-effective amendment to Form S-3D
|
||||
S-3MEF,Registration statement filed relating to prior Form S-3
|
||||
S-4 POS,Post-effective amendment to Form S-4
|
||||
S-4,Business acquisitions registration
|
||||
S-4EF,Bank/S&L loan registration
|
||||
S-4MEF,Registration statement filed relating to prior Form S-4
|
||||
S-6,Initial registration statement for unit investment trusts
|
||||
S-8,Employee securities registration
|
||||
S-8 POS,Post-effective amendment to Form S-8
|
||||
S-B,Foreign governments securities registration
|
||||
S-BMEF,Registration statement filed relating to prior Form S-B
|
||||
SBSE,Security-based swap dealer registration
|
||||
SBSE-A,Abbreviated application for SEC-registered swap entities also registered with CFTC
|
||||
SBSE-BD,Application for broker-dealer security-based swap dealers/major participants
|
||||
SBSE-C,Certifications for security-based swap dealer/major participant registration
|
||||
SBSE-W,Request to withdraw registration as security-based swap dealer/major participant
|
||||
SBSE-DISPUTE NOTICE,Notice of valuation dispute by a security-based swap entity
|
||||
SBSE-CCO-RPT,Annual compliance report for security-based swap dealers
|
||||
SC 13D,Ownership for control disclosure
|
||||
SCHEDULE 13D,Disclosure of beneficial ownership over 5% (XML)
|
||||
SC 13E1,Issuer statement for going private transactions
|
||||
SC 13E3,Schedule for going private transactions
|
||||
SC 13G,Beneficial ownership
|
||||
SCHEDULE 13G,Beneficial ownership by passive investors/institutions
|
||||
SC 14D9,Solicitation/recommendation statement for third-party tender offers
|
||||
SC 14F1,Statement for changes to majority of directors
|
||||
SC 14N,Information by nominating shareholders
|
||||
SC 14N-S,Solicitation relating to Rule 14a-11 nominating groups
|
||||
SC TO-C,Written communication relating to tender offers
|
||||
SC TO-I,Tender offer by issuer
|
||||
SC TO-T,Tender offer by third party
|
||||
SC13E4F,Foreign issuer tender
|
||||
SC14D1F,3rd party tender offer by foreign issuer
|
||||
SC14D9C,Subject company communication relating to third-party tender offer
|
||||
SC14D9F,Solicitation/recommendation statement by foreign issuers for third-party tender offers
|
||||
SD,Specialized disclosure report on conflict minerals or resource extraction payments
|
||||
SDR,Registration for security-based swap data repositories
|
||||
SDR-CCO,Compliance and financial reports for security-based swap data repositories
|
||||
SDR-W,Withdrawal from registration as security-based swap data repository
|
||||
SF-1,Asset-backed securities registration
|
||||
SF-1MEF,Registration statement filed relating to prior Form SF-1
|
||||
SF-3,Asset-backed securities shelf offerings
|
||||
SF-3MEF,Registration statement filed relating to prior Form SF-3
|
||||
SH-ER,Weekly entries report by institutional investment managers
|
||||
SH-NT,Weekly notice report by institutional investment managers
|
||||
SP 15D2,Special financial report
|
||||
SPDSCL,Specialized disclosure filing
|
||||
SUPPL,Supplemental material filed by foreign private issuers
|
||||
T-3,Initial application for trust indenture qualification
|
||||
T-6,Application for foreign entity to act as institutional trustee
|
||||
TA-1,Initial application for transfer agent registration
|
||||
TA-2,Annual report by registered transfer agents
|
||||
TA-W,Notice of withdrawal from transfer agent registration
|
||||
UPLOAD,Submission of documents
|
||||
UNDER,Initial undertaking to file reports
|
||||
X-17A-5,Reports required of brokers and dealers
|
||||
|
@@ -0,0 +1,7 @@
|
||||
from edgar.httprequests import download_file

# Base URL for SEC DERA (Division of Economic and Risk Analysis) data sets.
dera_data_url = 'https://www.sec.gov/dera/data'
# Path segment under dera_data_url for the quarterly financial statement data sets.
financial_statement_datasets='financial-statement-data-sets'

if __name__ == '__main__':
    # Example usage: download the 2024 Q1 financial statement data set archive.
    download_file('https://www.sec.gov/files/dera/data/financial-statement-data-sets/2024q1.zip')
|
||||
33
venv/lib/python3.10/site-packages/edgar/reference/forms.py
Normal file
33
venv/lib/python3.10/site-packages/edgar/reference/forms.py
Normal file
@@ -0,0 +1,33 @@
|
||||
from functools import lru_cache
|
||||
|
||||
from edgar.reference.data.common import read_csv_from_package
|
||||
|
||||
# Table of SEC form codes and their descriptions, loaded once at import time
# from the packaged secforms.csv (columns used below: Form, Description).
sec_form_data = read_csv_from_package('secforms.csv')
|
||||
|
||||
|
||||
@lru_cache(maxsize=64)
def describe_form(form: str,
                  prepend_form: bool = True) -> str:
    """
    Get the description of a form from the form descriptions file.

    Amendment filings (forms ending in "/A") are described as the base form
    with " Amendment" appended. Unknown forms fall back to "Form <FORM>".

    Args:
        form: The SEC form type, e.g. "10-K" or "10-k/a" (case-insensitive).
        prepend_form: If True, prefix the description with "Form <FORM>: ".

    Returns:
        str: A human-readable description of the form.
    """
    # Uppercase BEFORE the amendment check so lowercase input like "10-k/a"
    # is recognized as an amendment (the old order missed it).
    form = form.upper()
    is_amendment = form.endswith("/A")
    if is_amendment:
        form = form[:-2]

    matches = sec_form_data.loc[sec_form_data.Form == form]
    if len(matches) == 0:
        # Unknown form: no description available (amendment flag intentionally omitted,
        # matching the original behavior).
        return f"Form {form}"

    description = matches.Description.iloc[0]
    if prepend_form:
        return f"Form {form}{' Amendment' if is_amendment else ''}: {description}"
    return description
|
||||
|
||||
|
||||
# Form types that act as prospectuses / registration statements (offering
# documents), as opposed to periodic or current reports.
PROSPECTUSES = ["S-1", "S-3", "S-4", "S-8", "S-11", "F-1", "F-3", "F-4", "F-6", "F-10", "424B1",
                "424B2", "424B3", "424B4", "424B5", "424B7", "424B8", "485BPOS", "486BPOS", "497", "N-2", "N-14",
                "POS AM", "POSASR", "POS EX", "10", "20-F", "8-A", "SF-1", "SF-3"
                ]
|
||||
475
venv/lib/python3.10/site-packages/edgar/reference/tickers.py
Normal file
475
venv/lib/python3.10/site-packages/edgar/reference/tickers.py
Normal file
@@ -0,0 +1,475 @@
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
from enum import Enum
|
||||
from functools import lru_cache
|
||||
from io import StringIO
|
||||
from typing import Any, Dict, List, Optional, Union
|
||||
|
||||
import pandas as pd
|
||||
import pyarrow as pa
|
||||
from httpx import HTTPStatusError
|
||||
|
||||
from edgar.core import get_edgar_data_directory, listify, log
|
||||
from edgar.httprequests import download_file, download_json
|
||||
from edgar.reference.data.common import read_csv_from_package, read_parquet_from_package
|
||||
|
||||
# Public API of this module. Deduplicated: 'get_cik_tickers' and
# 'get_company_tickers' were each listed twice.
__all__ = ['cusip_ticker_mapping', 'get_ticker_from_cusip', 'get_company_tickers', 'get_icon_from_ticker', 'find_cik',
           'get_cik_tickers', 'get_company_ticker_name_exchange', 'get_companies_by_exchange', 'popular_us_stocks',
           'get_mutual_fund_tickers', 'find_mutual_fund_cik', 'list_all_tickers', 'find_ticker', 'find_ticker_safe',
           'get_cik_ticker_lookup', 'get_company_cik_lookup', 'get_cik_tickers_from_ticker_txt',
           'ticker_txt_url', 'company_tickers_json_url', 'mutual_fund_tickers_url', 'company_tickers_exchange_url',
           'Exchange'
           ]

# SEC reference-data endpoints used throughout this module.
ticker_txt_url = "https://www.sec.gov/include/ticker.txt"
company_tickers_json_url = "https://www.sec.gov/files/company_tickers.json"
mutual_fund_tickers_url = "https://www.sec.gov/files/company_tickers_mf.json"
company_tickers_exchange_url = "https://www.sec.gov/files/company_tickers_exchange.json"
|
||||
|
||||
|
||||
@lru_cache(maxsize=1)
def cusip_ticker_mapping(allow_duplicate_cusips: bool = True) -> pd.DataFrame:
    """
    Load the CUSIP-to-ticker mapping bundled with the package.

    Returns a DataFrame indexed by Cusip with the ticker data as columns.
    A CUSIP can appear more than once; pass allow_duplicate_cusips=False to
    keep only the first occurrence of each (the first occurrence is the one
    most likely to be linked to a CIK).
    """
    mapping = read_parquet_from_package('ct.pq').set_index('Cusip')
    if allow_duplicate_cusips:
        return mapping
    # Drop all but the first row for each duplicated CUSIP.
    return mapping[~mapping.index.duplicated(keep='first')]
|
||||
|
||||
|
||||
def load_tickers_from_local() -> Optional[Dict[str, Any]]:
    """
    Read the locally cached company_tickers.json, or None when no cache exists.
    """
    cached = get_edgar_data_directory() / "reference" / os.path.basename(company_tickers_json_url)
    if not cached.exists():
        # Covers both a missing reference directory and a missing file.
        return None
    return json.loads(cached.read_text())
|
||||
|
||||
|
||||
@lru_cache(maxsize=1)
def get_company_tickers(
        as_dataframe: bool = True,
        clean_name: bool = True,
        clean_suffix: bool = False
) -> Union[pd.DataFrame, pa.Table]:
    """
    Fetch and process company ticker data from SEC.

    The result is cached for the life of the process (lru_cache(maxsize=1)).
    When the EDGAR_USE_LOCAL_DATA environment variable is set, a locally
    cached company_tickers.json is preferred and the SEC endpoint is used
    only as a fallback.

    Args:
        as_dataframe (bool): If True, returns pandas DataFrame; if False, returns pyarrow Table
        clean_name (bool): If True, cleans company names
        clean_suffix (bool): If True, removes common company suffixes

    Returns:
        Union[pd.DataFrame, pa.Table]: Processed company data with columns
        cik (int), ticker (str), company (str)

    Raises:
        Exception: any download/parse failure is logged and re-raised.
    """

    # Pre-define schema for better performance
    SCHEMA = pa.schema([
        ('cik', pa.int64()),
        ('ticker', pa.string()),
        ('company', pa.string())
    ])

    try:
        if os.getenv("EDGAR_USE_LOCAL_DATA"):
            tickers_json = load_tickers_from_local()
            # Fall back to the network when there is no local cache.
            if not tickers_json:
                tickers_json = download_json(company_tickers_json_url)
        else:
            # Download JSON data
            tickers_json = download_json(company_tickers_json_url)

        # Pre-allocate lists for better memory efficiency
        ciks = []
        tickers = []
        companies = []

        # Process JSON data
        for item in tickers_json.values():
            company_name = item['title']

            # Apply name cleaning if requested
            if clean_name or clean_suffix:
                if clean_name:
                    company_name = clean_company_name(company_name)
                if clean_suffix:
                    company_name = clean_company_suffix(company_name)

            # Append to respective lists
            ciks.append(int(item['cik_str']))
            tickers.append(item['ticker'])
            companies.append(company_name)

        if as_dataframe:
            # Create DataFrame directly from lists
            return pd.DataFrame({
                'cik': ciks,
                'ticker': tickers,
                'company': companies
            })

        # Create pyarrow arrays
        cik_array = pa.array(ciks, type=pa.int64())
        ticker_array = pa.array(tickers, type=pa.string())
        company_array = pa.array(companies, type=pa.string())

        # Create and return pyarrow Table
        return pa.Table.from_arrays(
            [cik_array, ticker_array, company_array],
            schema=SCHEMA
        )

    except Exception as e:
        log.error(f"Error fetching company tickers from [{company_tickers_json_url}]: {str(e)}")
        raise
|
||||
|
||||
def load_cik_tickers_from_local() -> Optional[str]:
    """
    Read the locally cached ticker.txt contents, or None when no cache exists.
    """
    cached = get_edgar_data_directory() / "reference" / os.path.basename(ticker_txt_url)
    # A missing reference directory also makes the file check fail.
    return cached.read_text() if cached.exists() else None
|
||||
|
||||
def get_cik_tickers_from_ticker_txt():
    """Load the ticker-to-CIK table from the SEC ticker.txt file.

    Prefers the local cache when EDGAR_USE_LOCAL_DATA is set, otherwise
    downloads. Returns a DataFrame with columns [ticker, cik], tickers
    uppercased, or None if fetching/parsing fails.
    """
    try:
        text = load_cik_tickers_from_local() if os.getenv("EDGAR_USE_LOCAL_DATA") else None
        if not text:
            # No usable local copy: fetch from the SEC.
            text = download_file(ticker_txt_url, as_text=True)
        frame = pd.read_csv(StringIO(text),
                            sep='\t',
                            header=None,
                            names=['ticker', 'cik']).dropna()
        frame['ticker'] = frame['ticker'].str.upper()
        return frame
    except Exception as e:
        log.error(f"Error fetching company tickers from [{ticker_txt_url}]: {str(e)}")
        return None
|
||||
|
||||
@lru_cache(maxsize=1)
def get_cik_tickers():
    """Combine ticker/CIK records from ticker.txt and company_tickers.json.

    Falls back to whichever source is available; raises when both fail.
    The merged result keeps one row per unique (ticker, cik) pair.
    """
    txt_frame = get_cik_tickers_from_ticker_txt()
    try:
        json_frame = get_company_tickers(clean_name=False, clean_suffix=False)[['ticker', 'cik']]
    except Exception:
        json_frame = None

    if txt_frame is None and json_frame is None:
        raise Exception("Both data sources are unavailable")
    if txt_frame is None:
        return json_frame
    if json_frame is None:
        return txt_frame

    # Both sources succeeded: union the rows and de-duplicate.
    combined = pd.concat([txt_frame, json_frame], ignore_index=True)
    return combined.drop_duplicates(subset=['ticker', 'cik'])
|
||||
|
||||
@lru_cache(maxsize=None)
def list_all_tickers():
    """Return every known ticker symbol as a plain list."""
    return list(get_cik_tickers()['ticker'])
|
||||
|
||||
|
||||
@lru_cache(maxsize=None)
def get_company_cik_lookup():
    """Build a ticker -> CIK dict, also mapping each base symbol.

    For share-class tickers like "BRK-B" the base symbol ("BRK") is added
    too, but only if it is not already taken by an earlier entry.
    """
    frame = get_cik_tickers()
    lookup = {}
    for symbol, cik in zip(frame['ticker'], frame['cik'], strict=False):
        lookup[symbol] = cik
        # First-come-first-served for the base symbol before '-'.
        root = symbol.split('-')[0]
        lookup.setdefault(root, cik)
    return lookup
|
||||
|
||||
|
||||
@lru_cache(maxsize=None)
def get_cik_ticker_lookup():
    """Create a mapping of CIK to base ticker symbols.

    For CIKs with multiple tickers, uses the shortest ticker (usually the
    base symbol).
    """
    cik_to_ticker = {}
    for symbol, cik in get_company_cik_lookup().items():
        # Strip any share-class suffix before comparing lengths.
        candidate = symbol.split('-')[0]
        current = cik_to_ticker.get(cik)
        if current is None or len(candidate) < len(current):
            cik_to_ticker[cik] = candidate
    return cik_to_ticker
|
||||
|
||||
|
||||
@lru_cache(maxsize=128)
def find_ticker(cik: Union[int, str]) -> str:
    """Find the ticker symbol for a given CIK.
    Returns empty string if no ticker is found.

    Args:
        cik: Central Index Key (CIK) as integer or string

    Returns:
        str: Ticker symbol or empty string if not found
    """
    try:
        # int() already tolerates leading zeros ("0000320193" -> 320193).
        # The previous str(cik).lstrip('0') approach raised ValueError for
        # CIK 0 / "0" because int("") is invalid.
        cik = int(cik)
    except (ValueError, TypeError):
        return ""
    return get_cik_ticker_lookup().get(cik, "")
|
||||
|
||||
|
||||
def find_ticker_safe(cik: Union[int, str]) -> Optional[str]:
    """Find the ticker symbol for a given CIK without making network calls.
    Returns None if data is not already cached and would require a network call.
    Returns empty string if CIK is found but has no ticker.

    This function is designed for use cases where network calls should be avoided,
    such as in rich display methods that should be fast and not block on I/O.

    Args:
        cik: Central Index Key (CIK) as integer or string

    Returns:
        Optional[str]: Ticker symbol, empty string if no ticker found, or None if network call would be required
    """
    try:
        # Simple approach: check if all required cache functions have data.
        # cache_info().currsize > 0 means each lru_cache already holds a result,
        # so calling the function again is a pure in-memory lookup (no I/O).
        if (get_cik_ticker_lookup.cache_info().currsize > 0 and
            get_company_cik_lookup.cache_info().currsize > 0 and
            get_cik_tickers.cache_info().currsize > 0):

            # If we have cached data, try to use it
            # NOTE(review): int(str(cik).lstrip('0')) raises for CIK "0"/0;
            # the broad except below converts that to None.
            cik = int(str(cik).lstrip('0'))

            # This should be fast since data is cached
            lookup_dict = get_cik_ticker_lookup()
            return lookup_dict.get(cik, "")
        else:
            # Not all required data is cached, return None to avoid network calls
            return None

    except Exception:
        # Any error (including potential network errors) returns None
        # This ensures we never trigger network calls
        return None
|
||||
|
||||
@lru_cache(maxsize=None)
def get_company_ticker_name_exchange():
    """
    Return a DataFrame with columns [cik name ticker exchange].

    Uses the module-level company_tickers_exchange_url constant instead of
    repeating the URL literal (the original hard-coded the same URL twice).
    """
    data = download_json(company_tickers_exchange_url)
    return pd.DataFrame(data['data'], columns=data['fields'])
|
||||
|
||||
|
||||
def get_companies_by_exchange(exchange: Union[List[str], str]):
    """
    Get companies listed on a specific exchange.

    :param exchange: String, like 'Nasdaq' or 'NYSE', or a list of such names
    :return: DataFrame with companies listed on the specified exchange
             with columns [cik name ticker exchange]
    """
    # Case-insensitive match against one or more exchange names.
    wanted = {name.lower() for name in listify(exchange)}
    companies = get_company_ticker_name_exchange()
    mask = companies['exchange'].str.lower().isin(wanted)
    return companies[mask].reset_index(drop=True)
|
||||
|
||||
|
||||
@lru_cache(maxsize=None)
def get_mutual_fund_tickers():
    """
    Get mutual fund tickers.

    This returns a dataframe with columns
        cik seriesId classId ticker

    Uses the module-level mutual_fund_tickers_url constant instead of
    repeating the URL literal (the original hard-coded the same URL twice).
    """
    data = download_json(mutual_fund_tickers_url)
    return pd.DataFrame(data['data'], columns=['cik', 'seriesId', 'classId', 'ticker'])
|
||||
|
||||
|
||||
@lru_cache(maxsize=None)
def get_mutual_fund_lookup():
    """Build and cache a ticker -> CIK dict for mutual funds and ETFs."""
    frame = get_mutual_fund_tickers()
    return dict(zip(frame['ticker'], frame['cik'], strict=False))
|
||||
|
||||
|
||||
def find_mutual_fund_cik(ticker):
    """
    Find the CIK for a given mutual fund or ETF ticker.

    :param ticker: String, the ticker symbol to look up (case-insensitive)
    :return: Integer, the CIK for the given ticker, or None if not found
    """
    return get_mutual_fund_lookup().get(ticker.upper())
|
||||
|
||||
|
||||
def find_company_cik(ticker):
    """Look up a company CIK by ticker, normalizing '.' share-class separators to '-'."""
    normalized = ticker.upper().replace('.', '-')
    return get_company_cik_lookup().get(normalized)
|
||||
|
||||
def find_company_ticker(cik: Union[int, str]) -> Union[str, List[str], None]:
    """
    Find the ticker for a given CIK.

    :param cik (int or str): The CIK to look up
    :return Union[str, List[str]]: A single ticker string if only one ticker is found,
                                   a list of ticker strings if multiple tickers are found,
                                   or None if the CIK is invalid or has no tickers.
    """
    try:
        # int() already tolerates leading zeros ("0000320193" -> 320193);
        # the previous lstrip('0')-then-int approach raised on CIK 0 / "0"
        # because int("") is invalid.
        cik = int(cik)
    except (ValueError, TypeError):
        return None

    # Get DataFrame of CIK-Ticker mappings
    df = get_cik_tickers()

    # Ensure 'cik' and 'ticker' columns exist
    if 'cik' not in df.columns or 'ticker' not in df.columns:
        return None

    # Filter DataFrame for the given CIK
    ticker_series = df[df['cik'] == cik]['ticker']
    if ticker_series.empty:
        return None

    # Filter out None values from tickers
    tickers = [ticker for ticker in ticker_series.to_numpy() if ticker is not None]

    # Return a single ticker if only one found, otherwise the full list
    if len(tickers) == 1:
        return tickers[0]
    return tickers
|
||||
|
||||
def find_cik(ticker):
    """
    Find the CIK for a given ticker, checking both company and mutual fund/ETF data.

    :param ticker: String, the ticker symbol to look up
    :return: Integer, the CIK for the given ticker, or None if not found
    """
    company_cik = find_company_cik(ticker)
    # Companies take precedence; fall back to fund/ETF data only on a miss.
    return company_cik if company_cik is not None else find_mutual_fund_cik(ticker)
|
||||
|
||||
|
||||
@lru_cache(maxsize=128)
def get_ticker_from_cusip(cusip: str):
    """
    Get the ticker symbol for a given Cusip.

    Returns None for an unknown CUSIP (the original let DataFrame.loc raise
    KeyError in that case).
    """
    data = cusip_ticker_mapping()
    if cusip not in data.index:
        # Unknown CUSIP: report "not found" instead of raising KeyError.
        return None
    results = data.loc[cusip]
    if len(results) == 1:
        # Single match: .loc returned the row as a Series whose only field is the ticker.
        return results.iloc[0]
    elif len(results) > 1:
        # Duplicate CUSIPs: .loc returned a DataFrame; take the first row's Ticker.
        return results.iloc[0].Ticker
|
||||
|
||||
|
||||
def clean_company_name(name: str) -> str:
    """Return *name* with any trailing state-of-incorporation marker removed.

    Markers look like "/DE" or "\\MD\\" at the end of SEC company names.
    """
    return re.sub(r'[/\\][A-Z]+[/\\]?$', '', name).strip()
|
||||
|
||||
|
||||
def clean_company_suffix(name: str) -> str:
    """Remove common suffixes from the company name, taking care of special cases."""
    # Drop trailing slashes left over from SEC name formatting.
    without_slash = name.rstrip('/')
    # "& CO" style names, e.g. "JPMORGAN CHASE & CO" or "ELI LILLY & Co".
    no_ampersand_co = re.sub(r'\s*&\s*CO\b\.?', '', without_slash, flags=re.IGNORECASE).strip()
    # Other common trailing suffixes: PLC, LTD, LIMITED, CORP, Inc, L.P., ...
    return re.sub(r'\b(?:Inc\.?|CO|CORP|PLC|LTD|LIMITED|L\.P\.)\b\.?$', '', no_ampersand_co, flags=re.IGNORECASE).strip()
|
||||
|
||||
|
||||
def get_ticker_icon_url(ticker: str) -> str:
    """
    Build the raw GitHub URL of the PNG icon for *ticker* (nvstly/icons repo).
    """
    symbol = ticker.upper()
    return f"https://raw.githubusercontent.com/nvstly/icons/main/ticker_icons/{symbol}.png"
|
||||
|
||||
@lru_cache(maxsize=4)
def get_icon_from_ticker(ticker: str) -> Optional[bytes]:
    """
    Download an icon for a given ticker as a PNG image, if available.

    Returns the raw PNG bytes, or None when the icon does not exist (HTTP 404).

    WARNING: This function uses the nvstly/icons repository on GitHub to fetch the icons.
    The icons are not guaranteed to be available for all tickers.

    Raises:
        ValueError: if ticker is not a purely alphabetic string.
        HTTPStatusError: for any HTTP failure other than 404.
    """

    if not isinstance(ticker, str):
        raise ValueError("The ticker must be a valid string.")

    if not ticker.isalpha():
        raise ValueError("The ticker must only contain alphabetic characters.")

    try:
        # Reuse get_ticker_icon_url so the icon location is defined in one
        # place (the original duplicated the URL literal).
        return download_file(get_ticker_icon_url(ticker), as_text=False)
    except HTTPStatusError as e:
        # If the status code is 404, the icon is not available
        if e.response.status_code == 404:
            return None
        raise
|
||||
|
||||
def popular_us_stocks():
    """Return the packaged table of popular US stocks, indexed by CIK."""
    return read_csv_from_package('popular_us_stocks.csv', dtype={'Cik': int}).set_index('Cik')
|
||||
|
||||
class Exchange(Enum):
    """Stock exchanges distinguished in the SEC company ticker data."""

    Nasdaq = "Nasdaq"
    NYSE = "NYSE"
    OTC = "OTC"
    CBOE = "CBOE"

    def __str__(self) -> str:
        # Render as the plain exchange name rather than "Exchange.NYSE".
        return self.value
|
||||
|
||||
|
||||
Reference in New Issue
Block a user