edgartools/venv/lib/python3.10/site-packages/edgar/xbrl/period_data_check.py

"""
Enhanced period selection with data availability checking.

This module provides functions to verify that selected periods have sufficient
data before displaying them to investors.
"""

from typing import Any, Dict, List, Optional, Set, Tuple


def count_facts_for_period(xbrl_instance, period_key: str, statement_type: Optional[str] = None) -> int:
    """
    Count the number of facts available for a specific period.

    Args:
        xbrl_instance: XBRL instance with facts. This function reads
            ``xbrl_instance._facts`` (mapping whose values expose ``context_ref``)
            and ``xbrl_instance.contexts`` (mapping of context id to an object
            with a ``model_dump()`` method returning a dict with a ``'period'`` entry).
        period_key: Period key to check, either ``'instant_<date>'``
            (e.g., 'instant_2024-09-28') or ``'duration_<start>_<end>'``.
        statement_type: Accepted for interface compatibility. It is currently
            NOT used to filter facts; every fact in the period is counted.

    Returns:
        Number of facts whose context period matches ``period_key``.
        Returns 0 when the key is neither an instant nor a well-formed duration key.
    """
    # Parse period key to get context criteria
    if period_key.startswith('instant_'):
        period_type = 'instant'
        period_date = period_key.replace('instant_', '')
        start_date = end_date = None
    elif 'duration_' in period_key:
        parts = period_key.split('_')
        if len(parts) < 3:
            return 0
        period_type = 'duration'
        period_date = None
        start_date, end_date = parts[1], parts[2]
    else:
        return 0

    def _context_in_period(context) -> bool:
        # A context may dump 'period' as None; treat that the same as "no period".
        period_data = context.model_dump().get('period') or {}
        if period_data.get('type') != period_type:
            return False
        if period_type == 'instant':
            return period_data.get('instant') == period_date
        return (period_data.get('startDate') == start_date
                and period_data.get('endDate') == end_date)

    # Many facts share one context, so decide once per context_ref instead of
    # calling model_dump() again for every fact.
    match_by_context_ref: Dict[str, bool] = {}
    fact_count = 0

    for fact in xbrl_instance._facts.values():
        context_ref = fact.context_ref
        in_period = match_by_context_ref.get(context_ref)
        if in_period is None:
            context = xbrl_instance.contexts.get(context_ref)
            # Facts whose context is missing (or falsy) never match.
            in_period = bool(context) and _context_in_period(context)
            match_by_context_ref[context_ref] = in_period
        if in_period:
            fact_count += 1

    return fact_count


def get_essential_concepts_for_statement(statement_type: str) -> Set[str]:
    """
    Return the concept names considered essential for ``statement_type``.

    Recognised statement types are 'BalanceSheet', 'IncomeStatement' and
    'CashFlowStatement'. Any other value yields an empty set. A new set is
    built on every call, so callers may mutate the result freely.
    """
    if statement_type == 'BalanceSheet':
        core_items = {
            'Assets', 'AssetsCurrent',
            'Liabilities', 'LiabilitiesCurrent',
            'StockholdersEquity',
            'StockholdersEquityIncludingPortionAttributableToNoncontrollingInterest',
        }
        common_items = {
            'CashAndCashEquivalentsAtCarryingValue', 'Cash',
            'AccountsReceivableNetCurrent', 'AccountsReceivable',
            'Inventory', 'InventoryNet',
            'PropertyPlantAndEquipmentNet',
            'AccountsPayableCurrent', 'AccountsPayable',
            'LongTermDebt', 'LongTermDebtNoncurrent',
        }
        return core_items | common_items

    if statement_type == 'IncomeStatement':
        core_items = {
            'Revenues', 'RevenueFromContractWithCustomerExcludingAssessedTax', 'SalesRevenueNet',
            'CostOfRevenue', 'CostOfGoodsAndServicesSold', 'CostOfGoodsSold',
            'GrossProfit',
            'OperatingExpenses', 'OperatingCostsAndExpenses',
            'OperatingIncomeLoss',
            'IncomeLossFromContinuingOperationsBeforeIncomeTaxesExtraordinaryItemsNoncontrollingInterest',
            'NetIncomeLoss', 'ProfitLoss',
        }
        common_items = {
            'ResearchAndDevelopmentExpense',
            'SellingGeneralAndAdministrativeExpense',
            'EarningsPerShareBasic', 'EarningsPerShareDiluted',
        }
        return core_items | common_items

    if statement_type == 'CashFlowStatement':
        core_items = {
            'NetCashProvidedByUsedInOperatingActivities',
            'NetCashProvidedByUsedInInvestingActivities',
            'NetCashProvidedByUsedInFinancingActivities',
            'CashAndCashEquivalentsPeriodIncreaseDecrease',
        }
        common_items = {
            'NetIncomeLoss',
            'DepreciationDepletionAndAmortization', 'DepreciationAndAmortization',
            'PaymentsToAcquirePropertyPlantAndEquipment',
            'PaymentsOfDividends', 'PaymentsOfDividendsCommonStock',
        }
        return core_items | common_items

    # Unknown statement type: nothing is considered essential.
    return set()


def check_period_data_quality(xbrl_instance, period_key: str, statement_type: str) -> Dict[str, Any]:
    """
    Check the data quality for a specific period.

    Args:
        xbrl_instance: XBRL instance. This function reads ``_facts`` (values expose
            ``context_ref``, ``element_id`` and optionally ``value``), ``contexts``
            (values expose ``model_dump()``) and ``element_catalog`` (values expose ``name``).
        period_key: ``'instant_<date>'`` or ``'<anything>_<start>_<end>'``; any key that
            does not start with ``'instant_'`` is treated as a duration key.
        statement_type: Statement type used to look up the essential concepts.

    Returns:
        Dictionary with quality metrics (all keys are always present):
        - fact_count: Total number of facts
        - meaningful_fact_count: Number of facts with meaningful (non-empty) values
        - essential_coverage: Fraction (0.0-1.0) of essential concepts found
        - has_sufficient_data: Boolean indicating if period should be displayed
        - has_meaningful_data: Boolean indicating if period has meaningful values (fixes Issue #408)
        - missing_essentials: List of missing essential concepts
        - found_essentials: List of essential concepts that were found
    """
    # Count total facts
    fact_count = count_facts_for_period(xbrl_instance, period_key, statement_type)

    # Get essential concepts
    essential_concepts = get_essential_concepts_for_statement(statement_type)

    # Parse period for context matching
    if period_key.startswith('instant_'):
        period_type = 'instant'
        period_date = period_key.replace('instant_', '')
        start_date = end_date = None
    else:
        period_type = 'duration'
        parts = period_key.split('_')
        if len(parts) < 3:
            # Unparsable key: report "no data" with the same keys as the normal result.
            return {
                'fact_count': fact_count,
                'meaningful_fact_count': 0,
                'essential_coverage': 0.0,
                'has_sufficient_data': False,
                'has_meaningful_data': False,
                'missing_essentials': list(essential_concepts),
                'found_essentials': [],
            }
        period_date = None
        start_date, end_date = parts[1], parts[2]

    def _context_in_period(context) -> bool:
        # A context may dump 'period' as None; treat that the same as "no period".
        period_data = context.model_dump().get('period') or {}
        if period_data.get('type') != period_type:
            return False
        if period_type == 'instant':
            return period_data.get('instant') == period_date
        return (period_data.get('startDate') == start_date
                and period_data.get('endDate') == end_date)

    # Import pandas once; if it is unavailable, fall back to "non-empty means meaningful".
    try:
        import pandas as pd
    except ImportError:
        pd = None

    found_essentials: Set[str] = set()
    pending_essentials = set(essential_concepts)
    meaningful_fact_count = 0
    # Many facts share one context, so decide once per context_ref.
    match_by_context_ref: Dict[str, bool] = {}

    # Single pass over the facts: essential-concept detection and the
    # meaningful-value count (fix for Issue #408) both need only in-period facts.
    for fact in xbrl_instance._facts.values():
        context_ref = fact.context_ref
        in_period = match_by_context_ref.get(context_ref)
        if in_period is None:
            context = xbrl_instance.contexts.get(context_ref)
            in_period = bool(context) and _context_in_period(context)
            match_by_context_ref[context_ref] = in_period
        if not in_period:
            continue

        if pending_essentials:
            element = xbrl_instance.element_catalog.get(fact.element_id)
            element_name = (getattr(element, 'name', None) or '') if element else ''
            if element_name:
                # NOTE(review): this is a substring match, so e.g. 'Assets' is also
                # satisfied by an element named '...AssetsCurrent' - confirm this is intended.
                matched = {concept for concept in pending_essentials if concept in element_name}
                found_essentials |= matched
                pending_essentials -= matched

        if _is_meaningful_fact_value(getattr(fact, 'value', None), pd):
            meaningful_fact_count += 1

    missing_essentials = set(essential_concepts) - found_essentials

    # Calculate coverage
    essential_coverage = len(found_essentials) / len(essential_concepts) if essential_concepts else 0.0

    # Determine if sufficient data
    # Require at least 50% essential coverage or 20+ facts
    has_sufficient_data = essential_coverage >= 0.5 or fact_count >= 20

    # Determine if has meaningful data (fixes Issue #408)
    # A period has meaningful data if it has at least some facts with non-empty values
    has_meaningful_data = meaningful_fact_count > 0

    return {
        'fact_count': fact_count,
        'meaningful_fact_count': meaningful_fact_count,
        'essential_coverage': essential_coverage,
        'has_sufficient_data': has_sufficient_data,
        'has_meaningful_data': has_meaningful_data,
        'missing_essentials': list(missing_essentials),
        'found_essentials': list(found_essentials)
    }


def _is_meaningful_fact_value(fact_value, pd_module) -> bool:
    """Return True when ``fact_value`` is a non-empty value that counts as data.

    ``None``, blank strings and the literals 'nan'/'none' (any case) never count.
    When pandas is available (``pd_module`` is not None) the value must also parse
    as a number via ``pd.to_numeric(..., errors='coerce')``; text that cannot be
    parsed is therefore not counted. Without pandas any non-empty value counts.
    """
    if fact_value is None:
        return False
    str_value = str(fact_value).strip()
    if not str_value or str_value.lower() in ('nan', 'none'):
        return False
    if pd_module is None:
        return True
    try:
        numeric_value = pd_module.to_numeric(str_value, errors='coerce')
        return bool(not pd_module.isna(numeric_value))
    except (TypeError, ValueError):
        # Best effort: if parsing itself fails, a non-empty value still counts.
        return True


def filter_periods_with_data(xbrl_instance, periods: List[Tuple[str, str]],
                            statement_type: str,
                            min_fact_count: int = 10) -> List[Tuple[str, str]]:
    """
    Keep only the periods whose data passes the quality checks.

    A period is kept when its quality report (from ``check_period_data_quality``)
    says it has sufficient data, has at least ``min_fact_count`` facts, and has
    meaningful data (Issue #408). Input order is preserved.

    Args:
        xbrl_instance: XBRL instance
        periods: List of (period_key, label) tuples
        statement_type: Type of statement
        min_fact_count: Minimum number of facts required

    Returns:
        New list of (period_key, label) tuples that passed every check
    """
    def _passes_quality_checks(candidate_key: str) -> bool:
        report = check_period_data_quality(xbrl_instance, candidate_key, statement_type)
        # Checks run in this order and stop at the first failure.
        if not report['has_sufficient_data']:
            return False
        if report['fact_count'] < min_fact_count:
            return False
        return bool(report['has_meaningful_data'])

    return [
        (candidate_key, candidate_label)
        for candidate_key, candidate_label in periods
        if _passes_quality_checks(candidate_key)
    ]


def determine_investor_preferred_periods(xbrl_instance, statement_type: str) -> List[Tuple[str, str]]:
    """
    Select display periods, dropping those without sufficient data.

    Candidate periods come from ``edgar.xbrl.period_selector.select_periods``.
    Per the original author's notes, the intended candidates are (verify
    against ``select_periods``, which is not defined in this module):

    For Annual Reports:
    1. Current fiscal year
    2. Prior fiscal year (YoY comparison)
    3. Two years ago (3-year trend)

    For Quarterly Reports:
    1. Current quarter
    2. Same quarter prior year (YoY)
    3. Current YTD
    4. Prior year YTD

    Candidates are filtered with a minimum of 10 facts. If fewer than two
    periods survive although at least two candidates existed, the filter is
    re-run with a minimum of 5 facts and that result is returned instead.
    """
    from edgar.xbrl.period_selector import select_periods

    # Unified period selection provides the candidates.
    candidate_periods = select_periods(xbrl_instance, statement_type)

    # First attempt: strict data-availability threshold.
    strict_selection = filter_periods_with_data(
        xbrl_instance, candidate_periods, statement_type, min_fact_count=10
    )

    # Good enough, or there were never two candidates to begin with.
    if len(strict_selection) >= 2 or len(candidate_periods) < 2:
        return strict_selection

    # Too many periods were dropped: retry with a lower threshold.
    return filter_periods_with_data(
        xbrl_instance, candidate_periods, statement_type, min_fact_count=5
    )