Files
edgartools/venv/lib/python3.10/site-packages/edgar/xbrl/period_data_check.py
2025-12-09 12:13:01 +01:00

313 lines
12 KiB
Python

"""
Enhanced period selection with data availability checking.
This module provides functions to verify that selected periods have sufficient
data before displaying them to investors.
"""
from typing import Any, Dict, List, Optional, Set, Tuple
def count_facts_for_period(xbrl_instance, period_key: str, statement_type: Optional[str] = None) -> int:
    """
    Count the number of facts available for a specific period.

    Args:
        xbrl_instance: XBRL instance with facts. This function reads
            ``xbrl_instance._facts`` (mapping of fact key to fact, each fact
            exposing ``context_ref``) and ``xbrl_instance.contexts`` (mapping
            of context id to an object exposing ``model_dump()``).
        period_key: Period key to check, either ``'instant_<date>'``
            (e.g., 'instant_2024-09-28') or ``'duration_<start>_<end>'``.
        statement_type: Accepted for interface compatibility. It is not
            currently used to filter facts.

    Returns:
        Number of facts whose context period matches ``period_key``. Returns 0
        when the key is neither an instant nor a duration key, or when a
        duration key has fewer than three ``_``-separated parts.
    """
    # Translate the period key into the field values a matching context's
    # period dict must carry.
    if period_key.startswith('instant_'):
        wanted = {'type': 'instant', 'instant': period_key.replace('instant_', '')}
    elif 'duration_' in period_key:
        parts = period_key.split('_')
        if len(parts) < 3:
            return 0
        wanted = {'type': 'duration', 'startDate': parts[1], 'endDate': parts[2]}
    else:
        return 0

    contexts = xbrl_instance.contexts
    # Many facts share one context, so decide once per context_ref whether it
    # matches instead of dumping the context model again for every fact.
    matches_by_ref: Dict[object, bool] = {}
    fact_count = 0
    for fact in xbrl_instance._facts.values():
        ref = fact.context_ref
        if ref not in matches_by_ref:
            context = contexts.get(ref)
            if context:
                # ``or {}`` guards against a period that is present but None.
                period_data = context.model_dump().get('period') or {}
                matches_by_ref[ref] = all(
                    period_data.get(field) == value for field, value in wanted.items()
                )
            else:
                # Facts pointing at an unknown context are never counted.
                matches_by_ref[ref] = False
        if matches_by_ref[ref]:
            fact_count += 1
    return fact_count
def get_essential_concepts_for_statement(statement_type: str) -> Set[str]:
    """
    Return the essential concept names expected for a statement type.

    Recognised statement types are 'BalanceSheet', 'IncomeStatement' and
    'CashFlowStatement'. Any other value yields an empty set. The sets are
    rebuilt on every call, so callers receive an object they may mutate.
    """
    # Balance sheet: core totals first, then commonly reported line items.
    balance_core = (
        'Assets', 'AssetsCurrent',
        'Liabilities', 'LiabilitiesCurrent',
        'StockholdersEquity',
        'StockholdersEquityIncludingPortionAttributableToNoncontrollingInterest',
    )
    balance_common = (
        'CashAndCashEquivalentsAtCarryingValue', 'Cash',
        'AccountsReceivableNetCurrent', 'AccountsReceivable',
        'Inventory', 'InventoryNet',
        'PropertyPlantAndEquipmentNet',
        'AccountsPayableCurrent', 'AccountsPayable',
        'LongTermDebt', 'LongTermDebtNoncurrent',
    )

    # Income statement: core income items, then commonly reported items.
    income_core = (
        'Revenues', 'RevenueFromContractWithCustomerExcludingAssessedTax', 'SalesRevenueNet',
        'CostOfRevenue', 'CostOfGoodsAndServicesSold', 'CostOfGoodsSold',
        'GrossProfit',
        'OperatingExpenses', 'OperatingCostsAndExpenses',
        'OperatingIncomeLoss',
        'IncomeLossFromContinuingOperationsBeforeIncomeTaxesExtraordinaryItemsNoncontrollingInterest',
        'NetIncomeLoss', 'ProfitLoss',
    )
    income_common = (
        'ResearchAndDevelopmentExpense',
        'SellingGeneralAndAdministrativeExpense',
        'EarningsPerShareBasic', 'EarningsPerShareDiluted',
    )

    # Cash flow statement: core cash flow totals, then commonly reported items.
    cash_flow_core = (
        'NetCashProvidedByUsedInOperatingActivities',
        'NetCashProvidedByUsedInInvestingActivities',
        'NetCashProvidedByUsedInFinancingActivities',
        'CashAndCashEquivalentsPeriodIncreaseDecrease',
    )
    cash_flow_common = (
        'NetIncomeLoss',
        'DepreciationDepletionAndAmortization', 'DepreciationAndAmortization',
        'PaymentsToAcquirePropertyPlantAndEquipment',
        'PaymentsOfDividends', 'PaymentsOfDividendsCommonStock',
    )

    concepts_by_statement = {
        'BalanceSheet': {*balance_core, *balance_common},
        'IncomeStatement': {*income_core, *income_common},
        'CashFlowStatement': {*cash_flow_core, *cash_flow_common},
    }
    if statement_type in concepts_by_statement:
        return concepts_by_statement[statement_type]
    return set()
def _is_meaningful_fact_value(fact_value, pd) -> bool:
    """Return True when ``fact_value`` counts as a meaningful (non-empty) value.

    ``pd`` is the pandas module, or None when pandas could not be imported.
    """
    if fact_value is None:
        return False
    text = str(fact_value).strip()
    if not text or text.lower() in ('nan', 'none'):
        return False
    if pd is None:
        # Without pandas, any non-empty value is treated as meaningful.
        return True
    try:
        # NOTE(review): with errors='coerce' non-numeric text becomes NaN and
        # is therefore not counted; confirm this is the intended rule for
        # textual facts.
        return not pd.isna(pd.to_numeric(text, errors='coerce'))
    except Exception:
        # Conversion failed unexpectedly; the value is non-empty, so keep it.
        return True


def check_period_data_quality(xbrl_instance, period_key: str, statement_type: str) -> Dict[str, Any]:
    """
    Check the data quality for a specific period.

    Args:
        xbrl_instance: XBRL instance. This function reads ``_facts``,
            ``contexts`` and ``element_catalog`` from it.
        period_key: ``'instant_<date>'`` or a ``_``-separated key whose second
            and third parts are the duration start and end dates.
        statement_type: Statement type used to look up essential concepts.

    Returns:
        Dictionary with quality metrics (the same keys on every return path):
        - fact_count: Total number of facts
        - meaningful_fact_count: Number of facts with meaningful (non-empty) values
        - essential_coverage: Fraction (0.0-1.0) of essential concepts found
        - has_sufficient_data: Boolean indicating if period should be displayed
        - has_meaningful_data: Boolean indicating if period has meaningful values (fixes Issue #408)
        - missing_essentials: Sorted list of missing essential concepts
        - found_essentials: Sorted list of essential concepts found
    """
    fact_count = count_facts_for_period(xbrl_instance, period_key, statement_type)
    essential_concepts = get_essential_concepts_for_statement(statement_type)

    # Translate the period key into the field values a matching context's
    # period dict must carry.
    if period_key.startswith('instant_'):
        wanted = {'type': 'instant', 'instant': period_key.replace('instant_', '')}
    else:
        parts = period_key.split('_')
        if len(parts) < 3:
            # Unparseable key: report "no data" with the full set of keys so
            # callers can index the result without special-casing this path.
            return {
                'fact_count': fact_count,
                'meaningful_fact_count': 0,
                'essential_coverage': 0.0,
                'has_sufficient_data': False,
                'has_meaningful_data': False,
                'missing_essentials': sorted(essential_concepts),
                'found_essentials': [],
            }
        wanted = {'type': 'duration', 'startDate': parts[1], 'endDate': parts[2]}

    # Import once per call rather than once per fact; pandas stays optional.
    try:
        import pandas as pd
    except ImportError:
        pd = None

    contexts = xbrl_instance.contexts
    track_elements = bool(essential_concepts)
    in_period_by_ref: Dict[object, bool] = {}
    element_names = set()  # names of elements that have a fact in this period
    meaningful_fact_count = 0

    # Single pass over the facts: decide once per context whether it belongs
    # to the period, then gather element names and meaningful values.
    for fact in xbrl_instance._facts.values():
        ref = fact.context_ref
        if ref not in in_period_by_ref:
            context = contexts.get(ref)
            if context:
                # ``or {}`` guards against a period that is present but None.
                period_data = context.model_dump().get('period') or {}
                in_period_by_ref[ref] = all(
                    period_data.get(field) == value for field, value in wanted.items()
                )
            else:
                in_period_by_ref[ref] = False
        if not in_period_by_ref[ref]:
            continue

        if track_elements:
            element = xbrl_instance.element_catalog.get(fact.element_id)
            if element:
                element_names.add(element.name)

        # Count meaningful facts (non-empty values) - Fix for Issue #408
        if _is_meaningful_fact_value(getattr(fact, 'value', None), pd):
            meaningful_fact_count += 1

    # An essential concept is found when it occurs as a substring of the name
    # of any element that has a fact in the period.
    found_essentials = {
        concept for concept in essential_concepts
        if any(concept in name for name in element_names)
    }
    missing_essentials = essential_concepts - found_essentials

    essential_coverage = len(found_essentials) / len(essential_concepts) if essential_concepts else 0.0
    # Require at least 50% essential coverage or 20+ facts
    has_sufficient_data = essential_coverage >= 0.5 or fact_count >= 20
    # A period has meaningful data if at least one fact has a non-empty value
    # (fixes Issue #408)
    has_meaningful_data = meaningful_fact_count > 0

    return {
        'fact_count': fact_count,
        'meaningful_fact_count': meaningful_fact_count,
        'essential_coverage': essential_coverage,
        'has_sufficient_data': has_sufficient_data,
        'has_meaningful_data': has_meaningful_data,
        'missing_essentials': sorted(missing_essentials),
        'found_essentials': sorted(found_essentials),
    }
def filter_periods_with_data(xbrl_instance, periods: List[Tuple[str, str]],
                             statement_type: str,
                             min_fact_count: int = 10) -> List[Tuple[str, str]]:
    """
    Keep only the periods whose data quality passes every check.

    Args:
        xbrl_instance: XBRL instance
        periods: List of (period_key, label) tuples
        statement_type: Type of statement
        min_fact_count: Minimum number of facts required

    Returns:
        The (period_key, label) tuples, in their original order, for which
        check_period_data_quality reports sufficient data, at least
        min_fact_count facts, and meaningful data (fixes Issue #408).
    """
    def _is_displayable(key: str) -> bool:
        report = check_period_data_quality(xbrl_instance, key, statement_type)
        # Checks run in this order and stop at the first failure, so later
        # keys are only read when the earlier checks pass.
        if not report['has_sufficient_data']:
            return False
        if not (report['fact_count'] >= min_fact_count):
            return False
        return bool(report['has_meaningful_data'])

    # Excluded periods are dropped silently.
    return [(key, label) for key, label in periods if _is_displayable(key)]
def determine_investor_preferred_periods(xbrl_instance, statement_type: str) -> List[Tuple[str, str]]:
    """
    Select display periods and drop those without enough data.

    Candidate periods come from ``select_periods`` in
    ``edgar.xbrl.period_selector``. They are filtered with a minimum of 10
    facts per period. If fewer than two periods survive although at least two
    candidates existed, the filter is re-run with a minimum of 5 facts and
    that result is returned instead.

    Returns:
        List of (period_key, label) tuples that passed the data checks.
    """
    from edgar.xbrl.period_selector import select_periods

    candidates = select_periods(xbrl_instance, statement_type)

    strict_result = filter_periods_with_data(
        xbrl_instance, candidates, statement_type, min_fact_count=10
    )

    # Relax the threshold only when the strict pass removed too many periods.
    lost_too_many = len(strict_result) < 2 and len(candidates) >= 2
    if not lost_too_many:
        return strict_result

    return filter_periods_with_data(
        xbrl_instance, candidates, statement_type, min_fact_count=5
    )