# edgartools/venv/lib/python3.10/site-packages/edgar/reference/company_subsets.py

"""
Company subset selection utilities for analysis and learning tasks.

This module provides flexible ways to create subsets of companies from SEC reference data
for educational, research, and analysis purposes. It offers exchange-based selection,
popularity-based filtering, sampling capabilities, and composition utilities.

Key features:
- Exchange-based selection (NYSE, Nasdaq, OTC, CBOE)
- Popularity-based selection (popular stocks, market cap tiers)
- Sampling capabilities (random, stratified, top N)
- Filtering and combination utilities
- Consistent DataFrame output format

All functions return a DataFrame containing at least the columns
['cik', 'ticker', 'name', 'exchange']; comprehensive-mode results add SIC,
state of incorporation, and entity metadata columns.
"""

from enum import Enum
from functools import lru_cache
from typing import Callable, List, Optional, Union

import pandas as pd

from edgar.core import log
from edgar.reference.tickers import get_company_ticker_name_exchange, popular_us_stocks

# Public API of this module: the names exported by a star-import.
# Entries are grouped by purpose; the groups mirror the order of the
# definitions further down the file.
__all__ = [
    # Classes and Enums
    'CompanySubset',
    'MarketCapTier',
    'PopularityTier',
    # Core Functions
    'get_all_companies',
    'get_companies_by_exchanges',
    'get_popular_companies',
    # Industry and State Filtering (Comprehensive Mode)
    'get_companies_by_industry',
    'get_companies_by_state',
    # Sampling and Filtering
    'get_random_sample',
    'get_stratified_sample',
    'get_top_companies_by_metric',
    'filter_companies',
    'exclude_companies',
    # Set Operations
    'combine_company_sets',
    'intersect_company_sets',
    # Convenience Functions - General
    'get_faang_companies',
    'get_tech_giants',
    'get_dow_jones_sample',
    # Convenience Functions - Industry Specific
    'get_pharmaceutical_companies',
    'get_biotechnology_companies',
    'get_software_companies',
    'get_semiconductor_companies',
    'get_banking_companies',
    'get_investment_companies',
    'get_insurance_companies',
    'get_real_estate_companies',
    'get_oil_gas_companies',
    'get_retail_companies',
]


class MarketCapTier(Enum):
    """Size buckets used to classify companies by market capitalisation."""

    # Usually > $10B
    LARGE_CAP = "large_cap"
    # Usually $2B - $10B
    MID_CAP = "mid_cap"
    # Usually $300M - $2B
    SMALL_CAP = "small_cap"
    # Usually < $300M
    MICRO_CAP = "micro_cap"


class PopularityTier(Enum):
    """Recognition / trading-activity buckets used to pick popular companies."""

    # Top 10 most valuable companies
    MEGA_CAP = "mega_cap"
    # Popular stocks list
    POPULAR = "popular"
    # Well-known companies
    MAINSTREAM = "mainstream"
    # Smaller but notable companies
    EMERGING = "emerging"


class CompanySubset:
    """
    Fluent interface for building company subsets with chainable operations.

    Example:
        # Get 50 random NYSE companies excluding financial sector
        companies = (CompanySubset()
                    .from_exchange('NYSE')
                    .exclude_tickers(['JPM', 'GS', 'C'])
                    .sample(50)
                    .get())

        # Get pharmaceutical companies with comprehensive metadata
        pharma = (CompanySubset(use_comprehensive=True)
                 .from_industry(sic_range=(2834, 2836))
                 .sample(100)
                 .get())
    """

    def __init__(self, companies: Optional[pd.DataFrame] = None, use_comprehensive: bool = False):
        """
        Initialize with optional starting dataset.

        Args:
            companies: Optional DataFrame to start with. If None, loads from get_all_companies()
            use_comprehensive: If True and companies is None, load comprehensive dataset
                             with rich metadata (SIC, state, entity type, etc.)
        """
        if companies is not None:
            self._companies = companies
        else:
            self._companies = get_all_companies(use_comprehensive=use_comprehensive)
        self._use_comprehensive = use_comprehensive

    def from_exchange(self, exchanges: Union[str, List[str]]) -> 'CompanySubset':
        """Filter companies by exchange(s)."""
        self._companies = get_companies_by_exchanges(exchanges)
        return self

    def from_popular(self, tier: Optional[PopularityTier] = None) -> 'CompanySubset':
        """Filter to popular companies."""
        self._companies = get_popular_companies(tier)
        return self

    def from_industry(
        self,
        sic: Optional[Union[int, List[int]]] = None,
        sic_range: Optional[tuple[int, int]] = None,
        sic_description_contains: Optional[str] = None
    ) -> 'CompanySubset':
        """
        Filter companies by industry (SIC code).

        Automatically enables comprehensive mode to access industry metadata.

        Args:
            sic: Single SIC code or list of SIC codes to match exactly
            sic_range: Tuple of (min_sic, max_sic) for range filtering
            sic_description_contains: String to search within SIC description

        Returns:
            CompanySubset with industry filter applied

        Example:
            >>> # Pharmaceutical companies
            >>> pharma = CompanySubset().from_industry(sic=2834)

            >>> # Biotech sector
            >>> biotech = CompanySubset().from_industry(sic_range=(2833, 2836))
        """
        self._companies = get_companies_by_industry(
            sic=sic,
            sic_range=sic_range,
            sic_description_contains=sic_description_contains
        )
        self._use_comprehensive = True
        return self

    def from_state(self, states: Union[str, List[str]]) -> 'CompanySubset':
        """
        Filter companies by state of incorporation.

        Automatically enables comprehensive mode to access state metadata.

        Args:
            states: Single state code or list of state codes (e.g., 'DE', 'CA')

        Returns:
            CompanySubset with state filter applied

        Example:
            >>> # Delaware corporations
            >>> de_corps = CompanySubset().from_state('DE')

            >>> # Delaware or Nevada corporations
            >>> de_nv = CompanySubset().from_state(['DE', 'NV'])
        """
        self._companies = get_companies_by_state(states)
        self._use_comprehensive = True
        return self

    def filter_by(self, condition: Callable[[pd.DataFrame], pd.DataFrame]) -> 'CompanySubset':
        """Apply custom filter function."""
        self._companies = condition(self._companies)
        return self

    def exclude_tickers(self, tickers: List[str]) -> 'CompanySubset':
        """Exclude specific tickers."""
        self._companies = exclude_companies(self._companies, tickers)
        return self

    def include_tickers(self, tickers: List[str]) -> 'CompanySubset':
        """Include only specific tickers."""
        self._companies = filter_companies(self._companies, ticker_list=tickers)
        return self

    def sample(self, n: int, random_state: Optional[int] = None) -> 'CompanySubset':
        """Take random sample of n companies."""
        self._companies = get_random_sample(self._companies, n, random_state)
        return self

    def top(self, n: int, by: str = 'name') -> 'CompanySubset':
        """Take top n companies by specified column."""
        self._companies = get_top_companies_by_metric(self._companies, n, by)
        return self

    def combine_with(self, other: 'CompanySubset') -> 'CompanySubset':
        """Combine with another subset (union)."""
        self._companies = combine_company_sets([self._companies, other.get()])
        return self

    def intersect_with(self, other: 'CompanySubset') -> 'CompanySubset':
        """Intersect with another subset."""
        self._companies = intersect_company_sets([self._companies, other.get()])
        return self

    def get(self) -> pd.DataFrame:
        """Get the final DataFrame."""
        return self._companies.copy()

    def __len__(self) -> int:
        """Return number of companies in subset."""
        return len(self._companies)

    def __repr__(self) -> str:
        """String representation showing count and sample."""
        count = len(self._companies)
        if count == 0:
            return "CompanySubset(empty)"

        sample_size = min(3, count)
        sample_tickers = self._companies['ticker'].head(sample_size).tolist()
        sample_str = ', '.join(sample_tickers)

        if count > sample_size:
            sample_str += f", ... +{count - sample_size} more"

        return f"CompanySubset({count} companies: {sample_str})"


def _get_comprehensive_companies() -> pd.DataFrame:
    """
    Build the extended company DataFrame from the company_dataset module.

    The table returned by ``get_company_dataset()`` is converted to pandas,
    the pipe-delimited ``tickers`` / ``exchanges`` fields are reduced to their
    first entry (stored as ``ticker`` / ``exchange``), and the columns are put
    in the standard order followed by the extended metadata columns.

    Returns:
        DataFrame with extended schema:
        ['cik', 'ticker', 'name', 'exchange', 'sic', 'sic_description',
         'state_of_incorporation', 'state_of_incorporation_description',
         'fiscal_year_end', 'entity_type', 'ein']

        On any failure the error is logged and an empty DataFrame with the
        same columns is returned.

    Note:
        - First call may take ~30 seconds to build the dataset
        - Subsequent calls use cached Parquet file (<100ms load time)
    """
    # Standard four columns first, then the extended metadata.
    extended_columns = [
        'cik', 'ticker', 'name', 'exchange',
        'sic', 'sic_description',
        'state_of_incorporation', 'state_of_incorporation_description',
        'fiscal_year_end', 'entity_type', 'ein'
    ]

    def first_entry(raw):
        """Return the leading item of a pipe-delimited value, or None if absent/empty."""
        if raw is None or pd.isna(raw):
            return None
        # str.split always yields at least one piece; an empty piece means "no value".
        leading = str(raw).split('|', 1)[0]
        return leading or None

    try:
        from edgar.reference.company_dataset import get_company_dataset

        # PyArrow Table -> pandas
        frame = get_company_dataset().to_pandas()

        frame['ticker'] = frame['tickers'].apply(first_entry)
        frame['exchange'] = frame['exchanges'].apply(first_entry)

        # The raw pipe-delimited columns are no longer needed.
        frame = frame.drop(columns=['tickers', 'exchanges'])

        return frame[extended_columns]

    except Exception as e:
        log.error(f"Error fetching comprehensive company data: {e}")
        # Empty frame that still carries the extended schema
        return pd.DataFrame(columns=extended_columns)


@lru_cache(maxsize=2)
def _load_all_companies(use_comprehensive: bool) -> pd.DataFrame:
    """
    Load and memoise one of the two company datasets.

    Always called with a single positional bool so that the cache holds at
    most one entry per dataset. Exceptions propagate to the caller and are
    therefore not cached by lru_cache.
    """
    if use_comprehensive:
        return _get_comprehensive_companies()

    df = get_company_ticker_name_exchange().copy()
    # Reorder columns to match our standard format
    return df[['cik', 'ticker', 'name', 'exchange']]


def get_all_companies(use_comprehensive: bool = False) -> pd.DataFrame:
    """
    Get all companies from SEC reference data in standardized format.

    Args:
        use_comprehensive: If True, load comprehensive dataset with ~562K companies
                          and rich metadata (SIC, state, entity type, etc.).
                          If False (default), load ticker-only dataset with ~13K companies.

    Returns:
        DataFrame with columns ['cik', 'ticker', 'name', 'exchange']

        If use_comprehensive=True, also includes:
        ['sic', 'sic_description', 'state_of_incorporation',
         'state_of_incorporation_description', 'fiscal_year_end',
         'entity_type', 'ein']

        If the ticker-only load fails, the error is logged and an empty
        DataFrame with the four standard columns is returned.

    Note:
        - Default (use_comprehensive=False) maintains backward compatibility
        - Comprehensive mode adds ~30 second build time on first call
        - Results are cached per dataset. The argument is normalised before
          the cache lookup, so ``get_all_companies()``,
          ``get_all_companies(False)`` and
          ``get_all_companies(use_comprehensive=False)`` share one entry
          instead of competing for the two cache slots.
        - A failed ticker-only load is not cached, so a later call retries.
          A failed comprehensive load still is, because
          _get_comprehensive_companies() returns an empty frame instead of
          raising.
        - The cached DataFrame object itself is returned; callers should not
          modify it in place.

    Example:
        >>> # Standard mode - fast, ticker-only data
        >>> companies = get_all_companies()
        >>> len(companies)  # ~13K companies

        >>> # Comprehensive mode - slower first call, rich metadata
        >>> all_companies = get_all_companies(use_comprehensive=True)
        >>> len(all_companies)  # ~562K companies
        >>> 'sic' in all_companies.columns  # True
    """
    try:
        return _load_all_companies(bool(use_comprehensive))
    except Exception as e:
        log.error(f"Error fetching company data: {e}")
        # Return empty DataFrame with correct structure
        return pd.DataFrame(columns=['cik', 'ticker', 'name', 'exchange'])


# Keep the lru_cache management hooks reachable on the public function, as
# they were when it was decorated directly.
get_all_companies.cache_clear = _load_all_companies.cache_clear
get_all_companies.cache_info = _load_all_companies.cache_info


def get_companies_by_exchanges(exchanges: Union[str, List[str]]) -> pd.DataFrame:
    """
    Get companies listed on specific exchange(s).

    Exchange names are matched case-insensitively, so 'NASDAQ', 'Nasdaq' and
    'nasdaq' all select the same rows.

    Args:
        exchanges: Single exchange string or list of exchanges
                  ('NYSE', 'Nasdaq', 'OTC', 'CBOE')

    Returns:
        DataFrame with companies from specified exchanges (index reset).
        On error the problem is logged and an empty DataFrame with the
        standard columns is returned.

    Example:
        >>> nyse_companies = get_companies_by_exchanges('NYSE')
        >>> major_exchanges = get_companies_by_exchanges(['NYSE', 'Nasdaq'])
    """
    if isinstance(exchanges, str):
        exchanges = [exchanges]

    try:
        # Compare upper-cased values on both sides so the caller's spelling
        # does not have to match the data's capitalisation.
        wanted = [str(exchange).upper() for exchange in exchanges]
        all_companies = get_all_companies()
        on_exchange = all_companies['exchange'].str.upper().isin(wanted)
        return all_companies[on_exchange].reset_index(drop=True)
    except Exception as e:
        log.error(f"Error filtering companies by exchanges {exchanges}: {e}")
        return pd.DataFrame(columns=['cik', 'ticker', 'name', 'exchange'])


def get_popular_companies(tier: Optional[PopularityTier] = None) -> pd.DataFrame:
    """
    Get popular companies based on tier selection.

    The popular-stocks list is joined with the default company list to pick
    up each company's exchange; companies without a match get 'Unknown'.

    Args:
        tier: Popularity tier (MEGA_CAP, POPULAR, MAINSTREAM, EMERGING)
              If None, returns all popular companies

    Returns:
        DataFrame with columns ['cik', 'ticker', 'name', 'exchange']:
        the first 10 rows for MEGA_CAP, 50 for POPULAR, 100 for MAINSTREAM,
        and every row for EMERGING or None. On error the problem is logged
        and an empty DataFrame with those columns is returned.

    Example:
        >>> mega_cap = get_popular_companies(PopularityTier.MEGA_CAP)
        >>> all_popular = get_popular_companies()
    """
    try:
        # Get popular stocks and merge with exchange data
        popular_df = popular_us_stocks().reset_index()  # CIK becomes a column
        popular_df = popular_df.rename(columns={'Cik': 'cik', 'Ticker': 'ticker', 'Company': 'name'})

        # The reference data may hold several rows for one CIK (one per
        # ticker). Keep a single exchange per CIK so the left merge cannot
        # multiply popular rows and distort the head(N) tier cuts below.
        exchange_by_cik = get_all_companies()[['cik', 'exchange']].drop_duplicates(subset='cik')

        # Merge to get exchange information
        result = popular_df.merge(exchange_by_cik, on='cik', how='left')

        # Fill missing exchanges with 'Unknown'
        result['exchange'] = result['exchange'].fillna('Unknown')

        # Apply tier filtering (row order is the order of the popular list)
        if tier == PopularityTier.MEGA_CAP:
            result = result.head(10)  # Top 10 by market cap (order in CSV)
        elif tier == PopularityTier.POPULAR:
            result = result.head(50)  # Top 50 popular
        elif tier == PopularityTier.MAINSTREAM:
            result = result.head(100)  # Top 100
        # EMERGING or None returns all

        return result[['cik', 'ticker', 'name', 'exchange']].reset_index(drop=True)

    except Exception as e:
        log.error(f"Error fetching popular companies: {e}")
        return pd.DataFrame(columns=['cik', 'ticker', 'name', 'exchange'])


def get_random_sample(
    companies: Optional[pd.DataFrame] = None,
    n: int = 100,
    random_state: Optional[int] = None
) -> pd.DataFrame:
    """
    Get random sample of companies.

    Args:
        companies: DataFrame to sample from (if None, uses all companies)
        n: Number of companies to sample. Values larger than the input are
           capped at its length; zero or negative values yield an empty result.
        random_state: Random seed for reproducibility

    Returns:
        DataFrame with the sampled companies (index reset). If sampling
        itself fails, the error is logged and the first rows of the input
        are returned instead.

    Example:
        >>> random_100 = get_random_sample(n=100, random_state=42)
        >>> nasdaq_sample = get_random_sample(get_companies_by_exchanges('Nasdaq'), n=50)
    """
    if companies is None:
        companies = get_all_companies()

    if len(companies) == 0:
        return companies.copy()

    # Clamp to [0, len(companies)]. Without the lower bound a negative n makes
    # sample() raise and the head(negative) fallback would return nearly the
    # whole frame.
    sample_size = max(0, min(n, len(companies)))

    try:
        return companies.sample(n=sample_size, random_state=random_state).reset_index(drop=True)
    except Exception as e:
        log.error(f"Error sampling companies: {e}")
        return companies.head(sample_size).reset_index(drop=True)


def get_stratified_sample(
    companies: Optional[pd.DataFrame] = None,
    n: int = 100,
    stratify_by: str = 'exchange',
    random_state: Optional[int] = None
) -> pd.DataFrame:
    """
    Get stratified sample of companies maintaining proportions by specified column.

    Each category of ``stratify_by`` receives ``int(n * share)`` slots (at
    least one); the last category in frequency order receives whatever is
    left. Rows whose ``stratify_by`` value is missing are not sampled,
    because value_counts() skips them by default.

    Args:
        companies: DataFrame to sample from (if None, uses all companies)
        n: Total number of companies to sample
        stratify_by: Column to stratify by (default: 'exchange')
        random_state: Random seed for reproducibility

    Returns:
        DataFrame with at most n rows. Falls back to a plain random sample
        when the input is empty, the column is absent, or stratification
        raises.

    Example:
        >>> # Sample maintaining exchange proportions
        >>> stratified = get_stratified_sample(n=200, stratify_by='exchange')
    """
    if companies is None:
        companies = get_all_companies()

    if len(companies) == 0 or stratify_by not in companies.columns:
        return get_random_sample(companies, n, random_state)

    if n <= 0:
        # Nothing to draw; keep the input's columns.
        return pd.DataFrame(columns=companies.columns)

    try:
        # Calculate proportions (most frequent category first)
        proportions = companies[stratify_by].value_counts(normalize=True)
        last_category = proportions.index[-1]

        samples = []
        remaining_n = n

        for category, prop in proportions.items():
            category_companies = companies[companies[stratify_by] == category]

            # Calculate sample size for this category
            if category == last_category:
                # The "at least 1" rule below can hand out more than n slots
                # in total; never pass a negative size on.
                category_n = max(0, remaining_n)
            else:
                category_n = max(1, int(n * prop))  # At least 1 company per category
                remaining_n -= category_n

            # Sample from this category
            if len(category_companies) > 0:
                category_sample = get_random_sample(
                    category_companies,
                    min(category_n, len(category_companies)),
                    random_state
                )
                samples.append(category_sample)

        # Combine all samples
        if samples:
            result = pd.concat(samples, ignore_index=True)
            # If we ended up with more than n, randomly select n
            if len(result) > n:
                result = get_random_sample(result, n, random_state)
            return result
        else:
            return pd.DataFrame(columns=companies.columns)

    except Exception as e:
        log.error(f"Error creating stratified sample: {e}")
        return get_random_sample(companies, n, random_state)


def get_top_companies_by_metric(
    companies: Optional[pd.DataFrame] = None,
    n: int = 100,
    metric: str = 'name',
    ascending: bool = True
) -> pd.DataFrame:
    """
    Return the first N companies after sorting by a column.

    Args:
        companies: DataFrame to select from (if None, uses all companies)
        n: Number of rows to keep
        metric: Column to sort by (default: 'name' for alphabetical)
        ascending: Sort order (True for ascending, False for descending)

    Returns:
        The first n rows of the sorted frame with a fresh index. When the
        frame is empty, the column is missing, or sorting fails, the first
        n rows of the unsorted input are returned instead.

    Example:
        >>> # Top 50 companies alphabetically by name
        >>> top_alpha = get_top_companies_by_metric(n=50, metric='name')
        >>> # Top 100 popular companies by ticker (reverse alphabetical)
        >>> top_tickers = get_top_companies_by_metric(
        ...     get_popular_companies(), n=100, metric='ticker', ascending=False)
    """
    source = get_all_companies() if companies is None else companies

    can_sort = len(source) != 0 and metric in source.columns
    if not can_sort:
        # Nothing to sort by: hand back the leading rows untouched.
        return source.head(n).copy()

    try:
        ordered = source.sort_values(by=metric, ascending=ascending)
        leading = ordered.head(n)
        return leading.reset_index(drop=True)
    except Exception as e:
        log.error(f"Error sorting companies by {metric}: {e}")
        return source.head(n).copy()


def filter_companies(
    companies: pd.DataFrame,
    ticker_list: Optional[List[str]] = None,
    name_contains: Optional[str] = None,
    cik_list: Optional[List[int]] = None,
    custom_filter: Optional[Callable[[pd.DataFrame], pd.DataFrame]] = None
) -> pd.DataFrame:
    """
    Keep only the companies that satisfy every supplied criterion.

    Criteria are applied in the order ticker, name, CIK, custom filter; a
    criterion left as None is skipped. The input frame is not modified.

    Args:
        companies: DataFrame to filter
        ticker_list: Tickers to keep (compared case-insensitively)
        name_contains: Pattern the company name must contain (case-insensitive;
                       passed to ``Series.str.contains``)
        cik_list: CIKs to keep
        custom_filter: Function that takes and returns a DataFrame

    Returns:
        Filtered DataFrame with a fresh index. If a step raises, the error is
        logged and the frame as filtered up to that point is returned.

    Example:
        >>> # Filter to specific tickers
        >>> faang = filter_companies(
        ...     companies, ticker_list=['AAPL', 'AMZN', 'NFLX', 'GOOGL', 'META'])
        >>> # Filter by name containing 'Inc'
        >>> inc_companies = filter_companies(companies, name_contains='Inc')
    """
    kept = companies.copy()

    try:
        if ticker_list is not None:
            wanted_tickers = [symbol.upper() for symbol in ticker_list]
            ticker_match = kept['ticker'].str.upper().isin(wanted_tickers)
            kept = kept[ticker_match]

        if name_contains is not None:
            name_match = kept['name'].str.contains(name_contains, case=False, na=False)
            kept = kept[name_match]

        if cik_list is not None:
            cik_match = kept['cik'].isin(cik_list)
            kept = kept[cik_match]

        if custom_filter is not None:
            kept = custom_filter(kept)

        return kept.reset_index(drop=True)

    except Exception as e:
        log.error(f"Error filtering companies: {e}")
        return kept


def exclude_companies(
    companies: pd.DataFrame,
    ticker_list: Optional[List[str]] = None,
    name_contains: Optional[str] = None,
    cik_list: Optional[List[int]] = None
) -> pd.DataFrame:
    """
    Remove the companies that match any supplied criterion.

    Criteria are applied in the order ticker, name, CIK; a criterion left as
    None is skipped. The input frame is not modified.

    Args:
        companies: DataFrame to filter
        ticker_list: Tickers to drop (compared case-insensitively)
        name_contains: Pattern; companies whose name contains it are dropped
                       (case-insensitive; passed to ``Series.str.contains``)
        cik_list: CIKs to drop

    Returns:
        DataFrame without the matching companies, with a fresh index. If a
        step raises, the error is logged and the frame as reduced up to that
        point is returned.

    Example:
        >>> # Exclude financial companies (simplified)
        >>> non_financial = exclude_companies(
        ...     companies, ticker_list=['JPM', 'GS', 'C', 'BAC'])
        >>> # Exclude companies with 'Corp' in name
        >>> non_corp = exclude_companies(companies, name_contains='Corp')
    """
    remaining = companies.copy()

    try:
        if ticker_list is not None:
            banned_tickers = [symbol.upper() for symbol in ticker_list]
            ticker_hit = remaining['ticker'].str.upper().isin(banned_tickers)
            remaining = remaining[~ticker_hit]

        if name_contains is not None:
            name_hit = remaining['name'].str.contains(name_contains, case=False, na=False)
            remaining = remaining[~name_hit]

        if cik_list is not None:
            cik_hit = remaining['cik'].isin(cik_list)
            remaining = remaining[~cik_hit]

        return remaining.reset_index(drop=True)

    except Exception as e:
        log.error(f"Error excluding companies: {e}")
        return remaining


def combine_company_sets(company_sets: List[pd.DataFrame]) -> pd.DataFrame:
    """
    Union several company DataFrames into one.

    Rows are concatenated in the order given and only the first row for each
    CIK is kept.

    Args:
        company_sets: List of company DataFrames to combine

    Returns:
        Combined DataFrame with one row per CIK and a fresh index. An empty
        input list yields an empty frame with the standard columns; if
        combining fails the error is logged and a copy of the first set is
        returned.

    Example:
        >>> nyse = get_companies_by_exchanges('NYSE')
        >>> popular = get_popular_companies()
        >>> combined = combine_company_sets([nyse, popular])
    """
    if not company_sets:
        return pd.DataFrame(columns=['cik', 'ticker', 'name', 'exchange'])

    try:
        stacked = pd.concat(company_sets, ignore_index=True)
        # CIK is the primary key: keep the first occurrence of each.
        unique_by_cik = stacked.drop_duplicates(subset=['cik'])
        return unique_by_cik.reset_index(drop=True)

    except Exception as e:
        log.error(f"Error combining company sets: {e}")
        # company_sets is known to be non-empty here.
        return company_sets[0].copy()


def intersect_company_sets(company_sets: List[pd.DataFrame]) -> pd.DataFrame:
    """
    Keep the companies (by CIK) that appear in every given DataFrame.

    The rows and columns of the result come from the first set.

    Args:
        company_sets: List of company DataFrames to intersect

    Returns:
        DataFrame containing only companies present in all sets, with a
        fresh index. An empty list yields an empty frame with the standard
        columns, and a single set is returned as a copy. If intersecting
        fails the error is logged and a copy of the first set is returned.

    Example:
        >>> nyse = get_companies_by_exchanges('NYSE')
        >>> popular = get_popular_companies()
        >>> nyse_popular = intersect_company_sets([nyse, popular])
    """
    if not company_sets:
        return pd.DataFrame(columns=['cik', 'ticker', 'name', 'exchange'])

    first, others = company_sets[0], company_sets[1:]
    if not others:
        return first.copy()

    try:
        survivors = first.copy()

        # Narrow the running result against each further set in turn.
        for other in others:
            shared_ciks = set(survivors['cik']) & set(other['cik'])
            keep = survivors['cik'].isin(shared_ciks)
            survivors = survivors[keep]

        return survivors.reset_index(drop=True)

    except Exception as e:
        log.error(f"Error intersecting company sets: {e}")
        # company_sets is known to be non-empty here.
        return company_sets[0].copy()


def get_companies_by_industry(
    sic: Optional[Union[int, List[int]]] = None,
    sic_range: Optional[tuple[int, int]] = None,
    sic_description_contains: Optional[str] = None
) -> pd.DataFrame:
    """
    Get companies by industry classification using SIC (Standard Industrial Classification) codes.

    Requires comprehensive company dataset. This function automatically uses use_comprehensive=True.
    All supplied criteria must match (they are combined with AND); with no
    criteria the whole comprehensive dataset is returned.

    Args:
        sic: Single SIC code or list-like of SIC codes to match exactly.
             Any scalar (including numpy integers) is treated as one code.
        sic_range: Tuple of (min_sic, max_sic) for range filtering (inclusive)
        sic_description_contains: String to search within SIC description (case-insensitive)

    Returns:
        DataFrame with companies matching the industry criteria, including comprehensive metadata.
        On error the problem is logged and an empty DataFrame with the same
        columns is returned.

    Example:
        >>> # Pharmaceutical companies (SIC 2834)
        >>> pharma = get_companies_by_industry(sic=2834)

        >>> # Biotech range (SIC 2833-2836)
        >>> biotech = get_companies_by_industry(sic_range=(2833, 2836))

        >>> # All companies with "software" in industry description
        >>> software = get_companies_by_industry(sic_description_contains='software')

        >>> # Multiple specific SIC codes
        >>> healthcare = get_companies_by_industry(sic=[2834, 2835, 2836])

    Note:
        SIC Code Ranges:
        - 0100-0999: Agriculture, Forestry, Fishing
        - 1000-1499: Mining
        - 1500-1799: Construction
        - 2000-3999: Manufacturing
        - 4000-4999: Transportation, Communications, Utilities
        - 5000-5199: Wholesale Trade
        - 5200-5999: Retail Trade
        - 6000-6799: Finance, Insurance, Real Estate
        - 7000-8999: Services
        - 9100-9729: Public Administration
    """
    # Auto-enable comprehensive mode for industry filtering
    companies = get_all_companies(use_comprehensive=True)

    try:
        # No up-front copy of the (large) dataset: boolean indexing and
        # reset_index() below already return new frames, so the shared
        # cached frame is never handed out or modified.
        result = companies

        # Filter by exact SIC code(s)
        if sic is not None:
            if not pd.api.types.is_list_like(sic):
                # isin() rejects scalars; wrap any single code.
                sic = [sic]
            result = result[result['sic'].isin(sic)]

        # Filter by SIC range
        if sic_range is not None:
            min_sic, max_sic = sic_range
            result = result[
                (result['sic'] >= min_sic) &
                (result['sic'] <= max_sic)
            ]

        # Filter by SIC description contains
        if sic_description_contains is not None:
            result = result[
                result['sic_description'].str.contains(
                    sic_description_contains,
                    case=False,
                    na=False
                )
            ]

        return result.reset_index(drop=True)

    except Exception as e:
        log.error(f"Error filtering companies by industry: {e}")
        return pd.DataFrame(columns=companies.columns)


def get_companies_by_state(
    states: Union[str, List[str]],
    include_description: bool = True
) -> pd.DataFrame:
    """
    Get companies by state of incorporation.

    Requires comprehensive company dataset. This function automatically uses use_comprehensive=True.
    State codes are compared case-insensitively.

    Args:
        states: Single state code or list of state codes (e.g., 'DE', 'CA', ['DE', 'NV'])
        include_description: If True (default), keeps the
            state_of_incorporation_description column in the output; if
            False, that column is dropped.

    Returns:
        DataFrame with companies incorporated in specified state(s). On error
        the problem is logged and an empty DataFrame with the same columns is
        returned.

    Example:
        >>> # Delaware corporations
        >>> de_corps = get_companies_by_state('DE')

        >>> # Delaware and Nevada corporations
        >>> de_nv = get_companies_by_state(['DE', 'NV'])

        >>> # California corporations
        >>> ca_corps = get_companies_by_state('CA')

    Note:
        Common states of incorporation:
        - DE: Delaware (most common for public companies)
        - NV: Nevada (popular for tax benefits)
        - CA: California
        - NY: New York
        - TX: Texas
    """
    if isinstance(states, str):
        states = [states]

    # Auto-enable comprehensive mode for state filtering
    companies = get_all_companies(use_comprehensive=True)

    try:
        # Normalize state codes to uppercase
        states_upper = [s.upper() for s in states]

        result = companies[
            companies['state_of_incorporation'].str.upper().isin(states_upper)
        ].reset_index(drop=True)

    except Exception as e:
        log.error(f"Error filtering companies by state {states}: {e}")
        result = pd.DataFrame(columns=companies.columns)

    if not include_description:
        # Honour the documented flag; errors='ignore' tolerates a frame that
        # lacks the column.
        result = result.drop(
            columns=['state_of_incorporation_description'],
            errors='ignore'
        )

    return result


# Convenience functions for common use cases

def get_faang_companies() -> pd.DataFrame:
    """Return the FAANG rows (META, AAPL, AMZN, NFLX, GOOGL) from the default company list."""
    faang_tickers = ['META', 'AAPL', 'AMZN', 'NFLX', 'GOOGL']
    universe = get_all_companies()
    return filter_companies(universe, ticker_list=faang_tickers)


def get_tech_giants() -> pd.DataFrame:
    """Return the rows of a fixed list of major technology tickers from the default company list."""
    universe = get_all_companies()
    return filter_companies(
        universe,
        ticker_list=[
            'AAPL', 'MSFT', 'GOOGL', 'AMZN', 'META', 'TSLA', 'NVDA',
            'NFLX', 'ADBE', 'CRM', 'ORCL', 'INTC', 'CSCO',
        ],
    )


def get_dow_jones_sample() -> pd.DataFrame:
    """
    Return the rows of a fixed list of Dow Jones Industrial Average tickers.

    The ticker list is hard-coded here and is looked up in the default
    company list; tickers not present there are simply absent from the result.
    """
    universe = get_all_companies()
    return filter_companies(
        universe,
        ticker_list=[
            'AAPL', 'MSFT', 'UNH', 'GS', 'HD', 'CAT', 'MCD', 'V', 'AXP', 'BA',
            'TRV', 'JPM', 'IBM', 'JNJ', 'WMT', 'CVX', 'NKE', 'MRK', 'KO', 'DIS',
            'MMM', 'DOW', 'CSCO', 'VZ', 'INTC', 'WBA', 'CRM', 'HON', 'AMGN', 'PG',
        ],
    )


# Industry-specific convenience functions (require comprehensive dataset)

def get_pharmaceutical_companies() -> pd.DataFrame:
    """
    Return companies classified under SIC 2834 (pharmaceutical preparations).

    Thin wrapper around get_companies_by_industry(), so the comprehensive
    dataset is used.
    """
    pharma_sic = 2834
    return get_companies_by_industry(sic=pharma_sic)


def get_biotechnology_companies() -> pd.DataFrame:
    """
    Return companies whose SIC code lies in 2833-2836 (inclusive).

    This range is used here as the biotech / related pharmaceutical bucket.
    Thin wrapper around get_companies_by_industry().
    """
    biotech_range = (2833, 2836)
    return get_companies_by_industry(sic_range=biotech_range)


def get_software_companies() -> pd.DataFrame:
    """
    Return companies whose SIC code lies in 7371-7379 (inclusive).

    This range is used here for software, computer programming and related
    services. Thin wrapper around get_companies_by_industry().
    """
    software_range = (7371, 7379)
    return get_companies_by_industry(sic_range=software_range)


def get_semiconductor_companies() -> pd.DataFrame:
    """
    Return companies classified under SIC 3674 (semiconductors and related devices).

    Thin wrapper around get_companies_by_industry().
    """
    semiconductor_sic = 3674
    return get_companies_by_industry(sic=semiconductor_sic)


def get_banking_companies() -> pd.DataFrame:
    """
    Return companies whose SIC code lies in 6020-6029 (inclusive).

    This range is used here for commercial banks (national and state).
    Thin wrapper around get_companies_by_industry().
    """
    banking_range = (6020, 6029)
    return get_companies_by_industry(sic_range=banking_range)


def get_investment_companies() -> pd.DataFrame:
    """
    Return companies whose SIC code lies in 6200-6299 (inclusive).

    This range is used here for securities brokers, dealers, investment
    advisors, and funds. Thin wrapper around get_companies_by_industry().
    """
    investment_range = (6200, 6299)
    return get_companies_by_industry(sic_range=investment_range)


def get_insurance_companies() -> pd.DataFrame:
    """
    Return companies whose SIC code lies in 6300-6399 (inclusive).

    This range is used here for life, health, property, and casualty
    insurers. Thin wrapper around get_companies_by_industry().
    """
    insurance_range = (6300, 6399)
    return get_companies_by_industry(sic_range=insurance_range)


def get_real_estate_companies() -> pd.DataFrame:
    """
    Return companies whose SIC code lies in 6500-6599 (inclusive).

    This range is used here for real estate operators and developers.
    Thin wrapper around get_companies_by_industry().

    NOTE(review): REITs appear to be classified under SIC 6798, which this
    range does not cover - confirm whether they are meant to be included.
    """
    real_estate_range = (6500, 6599)
    return get_companies_by_industry(sic_range=real_estate_range)


def get_oil_gas_companies() -> pd.DataFrame:
    """
    Return companies whose SIC code lies in 1300-1399 (inclusive).

    This range is used here for crude petroleum, natural gas, and oil/gas
    field services. Thin wrapper around get_companies_by_industry().
    """
    oil_gas_range = (1300, 1399)
    return get_companies_by_industry(sic_range=oil_gas_range)


def get_retail_companies() -> pd.DataFrame:
    """
    Return companies whose SIC code lies in 5200-5999 (inclusive).

    This is the retail trade range: general merchandise, apparel, food, and
    other retail stores. Thin wrapper around get_companies_by_industry().
    """
    retail_range = (5200, 5999)
    return get_companies_by_industry(sic_range=retail_range)