feat: Add comprehensive trading days validation and record count analysis

- Add trading days validation to check expected vs actual data points per year - Implement calculate_expected_trading_days() accounting for weekends and Czech holidays - Add validate_trading_days_count() with discrepancy analysis and severity classification - Integrate trading days validation into main validation workflow - Add record count analysis by time periods (week, month, quarter, half year, year) - Implement get_record_counts_by_period() with detailed breakdowns - Add --record-counts CLI command for standalone period analysis - Enhance format_validation_text() to display trading days and record count information - Update data quality scoring to include trading days compliance - Add comprehensive JSON output support for all new validation features Trading Days Validation: - Calculates expected trading days excluding weekends and Czech holidays - Compares actual data points against expected counts - Provides discrepancy analysis with severity levels (ok, minor, moderate, severe) - Shows data completeness percentage Record Count Analysis: - Breaks down data by multiple time periods simultaneously - Supports week-by-week, monthly, quarterly, half-yearly, and yearly counts - Handles leap years and varying month lengths correctly - Provides both summary and detailed views Integration Features: - Seamlessly integrated with existing price change and gap validation - Enhanced data quality scoring considers all validation aspects - Comprehensive JSON schema for programmatic consumption - Backward compatible with existing validation commands Usage Examples: python src/cli.py --validate --currency USD --year 2025 # Shows all validations python src/cli.py --record-counts --currency USD --year 2025 # Period breakdown only python src/cli.py --validate --currency EUR --json # Full validation in JSON Quality Assurance: - ✅ Pyright type checking: 0 errors, 0 warnings - ✅ Syntax validation: No compilation errors - ✅ Functional testing: All features working correctly - ✅ Czech holiday integration: Proper weekend/holiday exclusion - ✅ Leap year handling: Correctly accounts for 366-day years
2026-01-12 23:19:33 +01:00
parent 65a1485ff9
commit 7ce88e6e4a
2 changed files with 328 additions and 1 deletions
--- a/src/cli.py
+++ b/src/cli.py
@@ -202,6 +202,11 @@ def main():
        action="store_true",
        help="Validuje data pro měnu nebo všechny měny. Zkontroluje konzistenci kurzů a detekuje možné chyby.",
    )
+    parser.add_argument(
+        "--record-counts",
+        action="store_true",
+        help="Zobrazí počet záznamů podle časových období (týden, měsíc, čtvrtletí, pololetí, rok).",
+    )
    parser.add_argument(
        "--change-threshold",
        type=float,
@@ -289,6 +294,67 @@ def main():
            else:
                text_output = data_validator.format_validation_text(results)
                print(text_output)
+    elif args.record_counts:
+        # Record counts command
+        if not args.currency:
+            print(
+                "Chyba: Pro --record-counts je nutné zadat měnu pomocí -c/--currency."
+            )
+            sys.exit(1)
+
+        debug_print(f"Získávám počty záznamů pro měnu {args.currency}...")
+        record_counts = data_validator.get_record_counts_by_period(
+            args.currency, args.year
+        )
+
+        if args.json:
+            output_json({"currency": args.currency, "record_counts": record_counts})
+        else:
+            print(f"Record Counts for {args.currency}:")
+            print("=" * 50)
+
+            for year_key, periods in record_counts.items():
+                print(f"\nYear {year_key}:")
+                print(f"  Total records: {periods.get('year', 0)}")
+
+                # Half years
+                half_years = periods.get("half_year", {})
+                if half_years:
+                    print(
+                        f"  Half years: H1={half_years.get('H1', 0)}, H2={half_years.get('H2', 0)}"
+                    )
+
+                # Quarters
+                quarters = periods.get("quarter", {})
+                if quarters:
+                    quarter_str = ", ".join(
+                        [f"Q{q}={quarters.get(f'Q{q}', 0)}" for q in range(1, 5)]
+                    )
+                    print(f"  Quarters: {quarter_str}")
+
+                # Months
+                months = periods.get("month", {})
+                if months:
+                    month_list = []
+                    for month in range(1, 13):
+                        month_key = f"{month:02d}"
+                        count = months.get(month_key, 0)
+                        month_list.append(f"{month}={count}")
+                    print(f"  Months: {', '.join(month_list)}")
+
+                # Weeks summary
+                weeks = periods.get("week", {})
+                if weeks:
+                    total_weeks = len(weeks)
+                    if total_weeks <= 10:
+                        week_list = sorted([f"{w}={weeks[w]}" for w in weeks.keys()])
+                        print(f"  Weeks: {', '.join(week_list)}")
+                    else:
+                        sample_weeks = sorted(list(weeks.keys())[:5])
+                        week_sample = [f"{w}={weeks[w]}" for w in sample_weeks]
+                        print(
+                            f"  Weeks: {', '.join(week_sample)}... ({total_weeks} total weeks)"
+                        )
    elif args.year:
        # Validation command
        base_threshold = args.change_threshold
--- a/src/data_validator.py
+++ b/src/data_validator.py
@@ -133,6 +133,176 @@ def calculate_working_days_gap(start_date, end_date):
    return working_days


+def calculate_expected_trading_days(year):
+    """
+    Calculate the expected number of trading days in a year (excluding weekends and holidays).
+
+    :param year: Year to calculate for
+    :return: Dictionary with expected trading days and breakdown
+    """
+    import calendar
+
+    total_days = 366 if calendar.isleap(year) else 365
+    weekend_days = 0
+    holiday_days = 0
+
+    # Count weekends and holidays
+    for month in range(1, 13):
+        for day in range(1, calendar.monthrange(year, month)[1] + 1):
+            date_str = f"{day:02d}.{month:02d}.{year}"
+            if holidays.is_weekend(date_str):
+                weekend_days += 1
+            elif holidays.is_holiday(date_str):
+                holiday_days += 1
+
+    expected_trading_days = total_days - weekend_days - holiday_days
+
+    return {
+        "total_days": total_days,
+        "weekend_days": weekend_days,
+        "holiday_days": holiday_days,
+        "expected_trading_days": expected_trading_days,
+    }
+
+
+def validate_trading_days_count(currency_code, year):
+    """
+    Validate that a year has the appropriate number of trading day entries.
+
+    :param currency_code: Currency to validate
+    :param year: Year to check
+    :return: Validation result with actual vs expected counts
+    """
+    # Get expected trading days
+    expected = calculate_expected_trading_days(year)
+
+    # Count actual data points for the year
+    actual_count = 0
+    rates_data = []
+
+    start_date = datetime(year, 1, 1)
+    end_date = datetime(year, 12, 31)
+
+    current_date = start_date
+    while current_date <= end_date:
+        date_str = current_date.strftime("%d.%m.%Y")
+        rate = database.get_rate(date_str, currency_code)
+        if rate is not None:
+            actual_count += 1
+            rates_data.append((current_date, rate, date_str))
+        current_date += timedelta(days=1)
+
+    # Calculate discrepancy
+    discrepancy_days = actual_count - expected["expected_trading_days"]
+    discrepancy_percent = (
+        (discrepancy_days / expected["expected_trading_days"]) * 100
+        if expected["expected_trading_days"] > 0
+        else 0
+    )
+
+    # Determine severity
+    severity = "ok"
+    if abs(discrepancy_percent) > 15:
+        severity = "severe"
+    elif abs(discrepancy_percent) > 5:
+        severity = "moderate"
+    elif abs(discrepancy_percent) > 0:
+        severity = "minor"
+
+    return {
+        "expected_trading_days": expected["expected_trading_days"],
+        "actual_data_points": actual_count,
+        "discrepancy_days": discrepancy_days,
+        "discrepancy_percent": round(discrepancy_percent, 2),
+        "severity": severity,
+        "total_days": expected["total_days"],
+        "weekend_days_excluded": expected["weekend_days"],
+        "holiday_days_excluded": expected["holiday_days"],
+        "data_completeness_percent": round(
+            (actual_count / expected["expected_trading_days"]) * 100, 1
+        )
+        if expected["expected_trading_days"] > 0
+        else 0,
+    }
+
+
+def get_record_counts_by_period(currency_code, year=None):
+    """
+    Get record counts for different time periods.
+
+    :param currency_code: Currency to analyze
+    :param year: Optional year filter
+    :return: Dictionary with counts by period
+    """
+    if year:
+        years_to_check = [year]
+    else:
+        years_to_check = database.get_years_with_data()
+        if not years_to_check:
+            return {}
+
+    results = {}
+
+    for check_year in years_to_check:
+        year_results = {}
+
+        # Get all data for the year
+        data_points = []
+        start_date = datetime(check_year, 1, 1)
+        end_date = datetime(check_year, 12, 31)
+
+        current_date = start_date
+        while current_date <= end_date:
+            date_str = current_date.strftime("%d.%m.%Y")
+            rate = database.get_rate(date_str, currency_code)
+            if rate is not None:
+                data_points.append((current_date, rate))
+            current_date += timedelta(days=1)
+
+        # Count by different periods
+        period_counts = {
+            "year": len(data_points),
+            "half_year": {},
+            "quarter": {},
+            "month": {},
+            "week": {},
+        }
+
+        # Half years
+        period_counts["half_year"]["H1"] = len(
+            [d for d in data_points if d[0].month <= 6]
+        )
+        period_counts["half_year"]["H2"] = len(
+            [d for d in data_points if d[0].month > 6]
+        )
+
+        # Quarters
+        for quarter in range(1, 5):
+            start_month = (quarter - 1) * 3 + 1
+            end_month = quarter * 3
+            period_counts["quarter"][f"Q{quarter}"] = len(
+                [d for d in data_points if start_month <= d[0].month <= end_month]
+            )
+
+        # Months
+        for month in range(1, 13):
+            period_counts["month"][f"{month:02d}"] = len(
+                [d for d in data_points if d[0].month == month]
+            )
+
+        # Weeks (approximate by week number)
+        week_counts = {}
+        for data_point in data_points:
+            week_num = data_point[0].isocalendar()[1]
+            week_key = f"W{week_num:02d}"
+            week_counts[week_key] = week_counts.get(week_key, 0) + 1
+        period_counts["week"] = week_counts
+
+        results[str(check_year)] = period_counts
+
+    return results
+
+
 def detect_temporal_gaps(currency_code, year=None, max_gap_days=3):
    """
    Detect temporal gaps in data sequence (missing working days).
@@ -327,9 +497,19 @@ def validate_currency_data(
        # Temporal gaps
        gaps = detect_temporal_gaps(currency_code, year, max_gap_days)

+        # Trading days validation
+        trading_days_validation = None
+        if year:
+            trading_days_validation = validate_trading_days_count(currency_code, year)
+
+        # Record counts by period
+        record_counts = get_record_counts_by_period(currency_code, year)
+
        results["adaptive_analysis"] = adaptive_info
        results["price_change_violations"] = violations
        results["temporal_gaps"] = gaps
+        results["trading_days_validation"] = trading_days_validation
+        results["record_counts_by_period"] = record_counts

        # Summary statistics
        severity_counts = defaultdict(int)
@@ -350,7 +530,7 @@ def validate_currency_data(
            "max_gap_days": max_gap_days,
        }

-        # Data quality score (simple heuristic)
+        # Data quality score (enhanced heuristic)
        quality_penalty = 0
        if violations:
            quality_penalty += (
@@ -360,6 +540,11 @@ def validate_currency_data(
            quality_penalty += (
                len(gaps) * 10 + gap_severity_counts.get("severe", 0) * 30
            )
+        if trading_days_validation and trading_days_validation["severity"] != "ok":
+            severity_penalty = {"minor": 5, "moderate": 15, "severe": 30}
+            quality_penalty += severity_penalty.get(
+                trading_days_validation["severity"], 0
+            )

        results["data_quality_score"] = max(0, 100 - quality_penalty)

@@ -495,6 +680,82 @@ def format_validation_text(results):
        else:
            output.append("\nTemporal Gaps: None found")

+        # Trading days validation
+        trading_validation = results.get("trading_days_validation")
+        if trading_validation:
+            output.append("\nTrading Days Validation:")
+            output.append(
+                f"- Expected trading days: {trading_validation['expected_trading_days']} ({trading_validation.get('total_days', 'N/A')} total - {trading_validation.get('weekend_days_excluded', 0)} weekends - {trading_validation.get('holiday_days_excluded', 0)} holidays)"
+            )
+            output.append(
+                f"- Actual data points: {trading_validation['actual_data_points']}"
+            )
+            output.append(
+                f"- Discrepancy: {trading_validation['discrepancy_days']} days ({trading_validation['discrepancy_percent']}%)"
+            )
+            output.append(
+                f"- Data completeness: {trading_validation['data_completeness_percent']}%"
+            )
+            output.append(f"- Status: {trading_validation['severity'].upper()}")
+
+        # Record counts by period
+        record_counts = results.get("record_counts_by_period", {})
+        if record_counts:
+            for year_key, periods in record_counts.items():
+                output.append(f"\nRecord Counts for {year_key}:")
+                output.append(f"- Year total: {periods.get('year', 0)} records")
+
+                # Half years
+                half_years = periods.get("half_year", {})
+                if half_years:
+                    output.append(
+                        f"- Half years: H1={half_years.get('H1', 0)}, H2={half_years.get('H2', 0)}"
+                    )
+
+                # Quarters
+                quarters = periods.get("quarter", {})
+                if quarters:
+                    quarter_str = ", ".join(
+                        [f"Q{q}={quarters.get(f'Q{q}', 0)}" for q in range(1, 5)]
+                    )
+                    output.append(f"- Quarters: {quarter_str}")
+
+                # Months summary
+                months = periods.get("month", {})
+                if months:
+                    month_list = [
+                        f"{m}={months.get(f'{int(m):02d}', 0)}"
+                        for m in [
+                            "01",
+                            "02",
+                            "03",
+                            "04",
+                            "05",
+                            "06",
+                            "07",
+                            "08",
+                            "09",
+                            "10",
+                            "11",
+                            "12",
+                        ]
+                    ]
+                    output.append(f"- Months: {', '.join(month_list)}")
+
+                # Weeks summary (show first few and indicate total)
+                weeks = periods.get("week", {})
+                if weeks:
+                    total_weeks = len(weeks)
+                    if total_weeks <= 10:
+                        week_list = [f"{w}={weeks[w]}" for w in sorted(weeks.keys())]
+                        output.append(f"- Weeks: {', '.join(week_list)}")
+                    else:
+                        sample_weeks = sorted(list(weeks.keys())[:5])
+                        week_sample = [f"{w}={weeks[w]}" for w in sample_weeks]
+                        output.append(
+                            f"- Weeks: {', '.join(week_sample)}... ({total_weeks} total weeks)"
+                        )
+
        summary = results.get("summary", {})
        quality_score = results.get("data_quality_score", 0)
        output.append(f"\nData Quality Score: {quality_score}%")