feat: Add comprehensive trading days validation and record count analysis

- Add trading days validation to check expected vs actual data points per year
- Implement calculate_expected_trading_days() accounting for weekends and Czech holidays
- Add validate_trading_days_count() with discrepancy analysis and severity classification
- Integrate trading days validation into main validation workflow
- Add record count analysis by time periods (week, month, quarter, half year, year)
- Implement get_record_counts_by_period() with detailed breakdowns
- Add --record-counts CLI command for standalone period analysis
- Enhance format_validation_text() to display trading days and record count information
- Update data quality scoring to include trading days compliance
- Add comprehensive JSON output support for all new validation features

Trading Days Validation:
- Calculates expected trading days excluding weekends and Czech holidays
- Compares actual data points against expected counts
- Provides discrepancy analysis with severity levels (ok, minor, moderate, severe)
- Shows data completeness percentage

Record Count Analysis:
- Breaks down data by multiple time periods simultaneously
- Supports week-by-week, monthly, quarterly, half-yearly, and yearly counts
- Handles leap years and varying month lengths correctly
- Provides both summary and detailed views

Integration Features:
- Seamlessly integrated with existing price change and gap validation
- Enhanced data quality scoring considers all validation aspects
- Comprehensive JSON schema for programmatic consumption
- Backward compatible with existing validation commands

Usage Examples:
  python src/cli.py --validate --currency USD --year 2025  # Shows all validations
  python src/cli.py --record-counts --currency USD --year 2025  # Period breakdown only
  python src/cli.py --validate --currency EUR --json  # Full validation in JSON

Quality Assurance:
- Pyright type checking: 0 errors, 0 warnings
- Syntax validation: No compilation errors
- Functional testing: All features working correctly
- Czech holiday integration: Proper weekend/holiday exclusion
- Leap year handling: Correctly accounts for 366-day years
This commit is contained in:
kdusek
2026-01-12 23:19:33 +01:00
parent 65a1485ff9
commit 7ce88e6e4a
2 changed files with 328 additions and 1 deletions

View File

@@ -202,6 +202,11 @@ def main():
action="store_true",
help="Validuje data pro měnu nebo všechny měny. Zkontroluje konzistenci kurzů a detekuje možné chyby.",
)
parser.add_argument(
"--record-counts",
action="store_true",
help="Zobrazí počet záznamů podle časových období (týden, měsíc, čtvrtletí, pololetí, rok).",
)
parser.add_argument(
"--change-threshold",
type=float,
@@ -289,6 +294,67 @@ def main():
else:
text_output = data_validator.format_validation_text(results)
print(text_output)
elif args.record_counts:
# Record counts command
if not args.currency:
print(
"Chyba: Pro --record-counts je nutné zadat měnu pomocí -c/--currency."
)
sys.exit(1)
debug_print(f"Získávám počty záznamů pro měnu {args.currency}...")
record_counts = data_validator.get_record_counts_by_period(
args.currency, args.year
)
if args.json:
output_json({"currency": args.currency, "record_counts": record_counts})
else:
print(f"Record Counts for {args.currency}:")
print("=" * 50)
for year_key, periods in record_counts.items():
print(f"\nYear {year_key}:")
print(f" Total records: {periods.get('year', 0)}")
# Half years
half_years = periods.get("half_year", {})
if half_years:
print(
f" Half years: H1={half_years.get('H1', 0)}, H2={half_years.get('H2', 0)}"
)
# Quarters
quarters = periods.get("quarter", {})
if quarters:
quarter_str = ", ".join(
[f"Q{q}={quarters.get(f'Q{q}', 0)}" for q in range(1, 5)]
)
print(f" Quarters: {quarter_str}")
# Months
months = periods.get("month", {})
if months:
month_list = []
for month in range(1, 13):
month_key = f"{month:02d}"
count = months.get(month_key, 0)
month_list.append(f"{month}={count}")
print(f" Months: {', '.join(month_list)}")
# Weeks summary
weeks = periods.get("week", {})
if weeks:
total_weeks = len(weeks)
if total_weeks <= 10:
week_list = sorted([f"{w}={weeks[w]}" for w in weeks.keys()])
print(f" Weeks: {', '.join(week_list)}")
else:
sample_weeks = sorted(list(weeks.keys())[:5])
week_sample = [f"{w}={weeks[w]}" for w in sample_weeks]
print(
f" Weeks: {', '.join(week_sample)}... ({total_weeks} total weeks)"
)
elif args.year:
# Validation command
base_threshold = args.change_threshold

View File

@@ -133,6 +133,176 @@ def calculate_working_days_gap(start_date, end_date):
return working_days
def calculate_expected_trading_days(year):
    """
    Work out how many trading days a calendar year is expected to contain.

    Every calendar day of the year is formatted as ``DD.MM.YYYY`` and passed
    to ``holidays.is_weekend``; only days that are not reported as weekends
    are then passed to ``holidays.is_holiday``. A holiday that falls on a
    weekend is therefore counted once, as a weekend day.

    :param year: Calendar year to analyse
    :return: Dictionary with the keys ``total_days``, ``weekend_days``,
        ``holiday_days`` and ``expected_trading_days``
    """
    import calendar

    days_in_year = 365 + (1 if calendar.isleap(year) else 0)

    tally = {"weekend": 0, "holiday": 0}
    for month_no in range(1, 13):
        days_in_month = calendar.monthrange(year, month_no)[1]
        for day_no in range(1, days_in_month + 1):
            stamp = f"{day_no:02d}.{month_no:02d}.{year}"
            if holidays.is_weekend(stamp):
                tally["weekend"] += 1
                continue
            # Only non-weekend days are checked against the holiday list.
            if holidays.is_holiday(stamp):
                tally["holiday"] += 1

    return {
        "total_days": days_in_year,
        "weekend_days": tally["weekend"],
        "holiday_days": tally["holiday"],
        "expected_trading_days": days_in_year - tally["weekend"] - tally["holiday"],
    }
def validate_trading_days_count(currency_code, year):
    """
    Compare the number of stored rates for a year with the expected count.

    The expected count comes from ``calculate_expected_trading_days``. The
    actual count is obtained by asking ``database.get_rate`` for every
    calendar day of ``year`` (dates formatted as ``DD.MM.YYYY``) and counting
    the days for which a non-``None`` rate is returned.

    Severity is derived from the absolute percentage discrepancy:
    more than 15 % is ``severe``, more than 5 % is ``moderate``, any other
    non-zero discrepancy is ``minor``, and zero is ``ok``.

    :param currency_code: Currency to validate
    :param year: Year to check
    :return: Dictionary with the keys ``expected_trading_days``,
        ``actual_data_points``, ``discrepancy_days``, ``discrepancy_percent``
        (rounded to 2 decimals), ``severity``, ``total_days``,
        ``weekend_days_excluded``, ``holiday_days_excluded`` and
        ``data_completeness_percent`` (rounded to 1 decimal)
    """
    expected = calculate_expected_trading_days(year)
    expected_days = expected["expected_trading_days"]

    # Only the number of hits matters here, so the rates are not retained.
    actual_count = 0
    current_date = datetime(year, 1, 1)
    end_date = datetime(year, 12, 31)
    while current_date <= end_date:
        date_str = current_date.strftime("%d.%m.%Y")
        if database.get_rate(date_str, currency_code) is not None:
            actual_count += 1
        current_date += timedelta(days=1)

    discrepancy_days = actual_count - expected_days
    if expected_days > 0:
        discrepancy_percent = (discrepancy_days / expected_days) * 100
        completeness_percent = round((actual_count / expected_days) * 100, 1)
    else:
        # No expected days: percentages are undefined, report them as zero.
        discrepancy_percent = 0
        completeness_percent = 0

    # Severity is judged on the unrounded percentage, in either direction
    # (too few or too many records).
    magnitude = abs(discrepancy_percent)
    if magnitude > 15:
        severity = "severe"
    elif magnitude > 5:
        severity = "moderate"
    elif magnitude > 0:
        severity = "minor"
    else:
        severity = "ok"

    return {
        "expected_trading_days": expected_days,
        "actual_data_points": actual_count,
        "discrepancy_days": discrepancy_days,
        "discrepancy_percent": round(discrepancy_percent, 2),
        "severity": severity,
        "total_days": expected["total_days"],
        "weekend_days_excluded": expected["weekend_days"],
        "holiday_days_excluded": expected["holiday_days"],
        "data_completeness_percent": completeness_percent,
    }
def get_record_counts_by_period(currency_code, year=None):
    """
    Count stored rates per year, half year, quarter, month and ISO week.

    For each year examined, ``database.get_rate`` is queried for every
    calendar day (dates formatted as ``DD.MM.YYYY``); each day with a
    non-``None`` rate counts as one record.

    Week buckets use ISO week numbers. Days at the edges of a calendar year
    can belong to an ISO week of the neighbouring year (for example
    30-31 Dec 2024 are ISO week 1 of 2025). Such days are stored under a
    year-qualified label like ``"2025-W01"`` so that they are not merged into
    the unrelated ``"W01"`` / ``"W52"`` / ``"W53"`` bucket of the year being
    analysed. Weeks belonging to the analysed year keep the plain ``"Wnn"``
    label.

    :param currency_code: Currency to analyze
    :param year: Optional year filter; when falsy, the years returned by
        ``database.get_years_with_data()`` are used
    :return: Dictionary keyed by the year as a string. Each value contains
        ``year`` (total record count), ``half_year`` (``H1``/``H2``),
        ``quarter`` (``Q1``-``Q4``), ``month`` (``01``-``12``) and ``week``
        (only weeks that have at least one record). An empty dictionary is
        returned when there are no years to check.
    """
    if year:
        years_to_check = [year]
    else:
        years_to_check = database.get_years_with_data()
    if not years_to_check:
        return {}

    results = {}
    for check_year in years_to_check:
        # Fixed-size buckets are pre-filled with zero so every label is
        # present even when a period has no records.
        half_year_counts = {"H1": 0, "H2": 0}
        quarter_counts = {f"Q{quarter}": 0 for quarter in range(1, 5)}
        month_counts = {f"{month:02d}": 0 for month in range(1, 13)}
        week_counts = {}
        total_records = 0

        # Single pass over the year: classify each day as soon as it is found.
        current_date = datetime(check_year, 1, 1)
        end_date = datetime(check_year, 12, 31)
        while current_date <= end_date:
            date_str = current_date.strftime("%d.%m.%Y")
            if database.get_rate(date_str, currency_code) is not None:
                month = current_date.month
                total_records += 1
                half_year_counts["H1" if month <= 6 else "H2"] += 1
                quarter_counts[f"Q{(month - 1) // 3 + 1}"] += 1
                month_counts[f"{month:02d}"] += 1

                iso_year, iso_week = current_date.isocalendar()[:2]
                if iso_year == check_year:
                    week_key = f"W{iso_week:02d}"
                else:
                    # Edge-of-year day that belongs to a neighbouring ISO
                    # year; qualify the label to avoid a bucket collision.
                    week_key = f"{iso_year}-W{iso_week:02d}"
                week_counts[week_key] = week_counts.get(week_key, 0) + 1
            current_date += timedelta(days=1)

        results[str(check_year)] = {
            "year": total_records,
            "half_year": half_year_counts,
            "quarter": quarter_counts,
            "month": month_counts,
            "week": week_counts,
        }
    return results
def detect_temporal_gaps(currency_code, year=None, max_gap_days=3):
"""
Detect temporal gaps in data sequence (missing working days).
@@ -327,9 +497,19 @@ def validate_currency_data(
# Temporal gaps
gaps = detect_temporal_gaps(currency_code, year, max_gap_days)
# Trading days validation
trading_days_validation = None
if year:
trading_days_validation = validate_trading_days_count(currency_code, year)
# Record counts by period
record_counts = get_record_counts_by_period(currency_code, year)
results["adaptive_analysis"] = adaptive_info
results["price_change_violations"] = violations
results["temporal_gaps"] = gaps
results["trading_days_validation"] = trading_days_validation
results["record_counts_by_period"] = record_counts
# Summary statistics
severity_counts = defaultdict(int)
@@ -350,7 +530,7 @@ def validate_currency_data(
"max_gap_days": max_gap_days,
}
# Data quality score (simple heuristic)
# Data quality score (enhanced heuristic)
quality_penalty = 0
if violations:
quality_penalty += (
@@ -360,6 +540,11 @@ def validate_currency_data(
quality_penalty += (
len(gaps) * 10 + gap_severity_counts.get("severe", 0) * 30
)
if trading_days_validation and trading_days_validation["severity"] != "ok":
severity_penalty = {"minor": 5, "moderate": 15, "severe": 30}
quality_penalty += severity_penalty.get(
trading_days_validation["severity"], 0
)
results["data_quality_score"] = max(0, 100 - quality_penalty)
@@ -495,6 +680,82 @@ def format_validation_text(results):
else:
output.append("\nTemporal Gaps: None found")
# Trading days validation
trading_validation = results.get("trading_days_validation")
if trading_validation:
output.append("\nTrading Days Validation:")
output.append(
f"- Expected trading days: {trading_validation['expected_trading_days']} ({trading_validation.get('total_days', 'N/A')} total - {trading_validation.get('weekend_days_excluded', 0)} weekends - {trading_validation.get('holiday_days_excluded', 0)} holidays)"
)
output.append(
f"- Actual data points: {trading_validation['actual_data_points']}"
)
output.append(
f"- Discrepancy: {trading_validation['discrepancy_days']} days ({trading_validation['discrepancy_percent']}%)"
)
output.append(
f"- Data completeness: {trading_validation['data_completeness_percent']}%"
)
output.append(f"- Status: {trading_validation['severity'].upper()}")
# Record counts by period
record_counts = results.get("record_counts_by_period", {})
if record_counts:
for year_key, periods in record_counts.items():
output.append(f"\nRecord Counts for {year_key}:")
output.append(f"- Year total: {periods.get('year', 0)} records")
# Half years
half_years = periods.get("half_year", {})
if half_years:
output.append(
f"- Half years: H1={half_years.get('H1', 0)}, H2={half_years.get('H2', 0)}"
)
# Quarters
quarters = periods.get("quarter", {})
if quarters:
quarter_str = ", ".join(
[f"Q{q}={quarters.get(f'Q{q}', 0)}" for q in range(1, 5)]
)
output.append(f"- Quarters: {quarter_str}")
# Months summary
months = periods.get("month", {})
if months:
month_list = [
f"{m}={months.get(f'{int(m):02d}', 0)}"
for m in [
"01",
"02",
"03",
"04",
"05",
"06",
"07",
"08",
"09",
"10",
"11",
"12",
]
]
output.append(f"- Months: {', '.join(month_list)}")
# Weeks summary (show first few and indicate total)
weeks = periods.get("week", {})
if weeks:
total_weeks = len(weeks)
if total_weeks <= 10:
week_list = [f"{w}={weeks[w]}" for w in sorted(weeks.keys())]
output.append(f"- Weeks: {', '.join(week_list)}")
else:
sample_weeks = sorted(list(weeks.keys())[:5])
week_sample = [f"{w}={weeks[w]}" for w in sample_weeks]
output.append(
f"- Weeks: {', '.join(week_sample)}... ({total_weeks} total weeks)"
)
summary = results.get("summary", {})
quality_score = results.get("data_quality_score", 0)
output.append(f"\nData Quality Score: {quality_score}%")