Initial commit

2025-12-09 12:13:01 +01:00
commit 8e654ed209
13332 changed files with 2695056 additions and 0 deletions
--- a/venv/lib/python3.10/site-packages/edgar/xbrl/deduplication_strategy.py
+++ b/venv/lib/python3.10/site-packages/edgar/xbrl/deduplication_strategy.py
@@ -0,0 +1,232 @@
+"""
+Revenue Deduplication Strategy for Issue #438
+
+This module implements intelligent deduplication for revenue concepts
+that may have the same underlying value but different GAAP concept names.
+
+The strategy:
+1. Identify groups of items with the same value in the same period
+2. Apply hierarchical precedence rules to choose the most appropriate concept
+3. Filter out less specific concepts when duplicates exist
+
+Revenue Concept Hierarchy (most to least preferred):
+1. us-gaap:RevenueFromContractWithCustomerExcludingAssessedTax (most specific - ASC 606)
+2. us-gaap:Revenues (standard general concept)
+3. us-gaap:SalesRevenueNet (less common)
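+4. us-gaap:Revenue (generic)
+5. us-gaap:TotalRevenuesAndGains (broadest of the listed concepts)
+
+Revenue concepts that are not listed above receive a default precedence
+lower than every listed concept.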
+"""
+
+import logging
+from collections import defaultdict
+from typing import Any, Dict, List, Set
+
+# Module-level logger named after this module; RevenueDeduplicator uses it
+# to report which duplicate items were removed (debug) and how many (info).
+log = logging.getLogger(__name__)
+
+
+class RevenueDeduplicator:
+    """
+    Handles deduplication of revenue concepts in financial statements.
+
+    Statement items are plain dicts.  The keys read by this class are:
+
+    * ``concept``   - concept name, e.g. ``us-gaap:Revenues``
+    * ``all_names`` - optional list of alternative concept names
+    * ``label``     - human readable label
+    * ``values``    - mapping of period -> reported value
+
+    Revenue items that report the same non-zero value for the same period are
+    treated as duplicates and only the one with the highest precedence is
+    kept.  Items that are not recognised as revenue are never removed, even
+    when they happen to share a value with a revenue line.
+
+    NOTE: one shared (period, value) pair is enough to treat two revenue
+    items as duplicates; values in other periods are not compared.
+    """
+
+    # Revenue concept precedence (higher number = higher precedence)
+    REVENUE_CONCEPT_PRECEDENCE = {
+        'us-gaap:RevenueFromContractWithCustomerExcludingAssessedTax': 100,  # Most specific (ASC 606)
+        'us-gaap:Revenues': 90,  # Standard concept
+        'us-gaap:SalesRevenueNet': 80,  # Alternative concept
+        'us-gaap:Revenue': 70,  # Generic concept
+        'us-gaap:TotalRevenuesAndGains': 60,  # Broader concept
+    }
+
+    # Additional revenue-related concepts that might cause duplicates.
+    # They are matched as case-sensitive substrings of the concept names.
+    REVENUE_RELATED_CONCEPTS = {
+        'RevenueFromContractWithCustomerExcludingAssessedTax',
+        'Revenues',
+        'Revenue',
+        'SalesRevenueNet',
+        'TotalRevenuesAndGains',
+        'RevenueFromContractWithCustomer',
+        'TotalRevenues'
+    }
+
+    # A concept name or label containing any of these terms (compared in
+    # lower case) is never treated as revenue, e.g. "CostOfRevenue".
+    _EXCLUSION_TERMS = ('cost', 'expense', 'loss', 'depreciation', 'amortization')
+
+    # Lower-case label fragments that mark an item as revenue when none of
+    # its concept names matched REVENUE_RELATED_CONCEPTS.
+    _REVENUE_LABEL_TERMS = ('revenue', 'sales')
+
+    # Score given to revenue items that match nothing in the precedence table.
+    _DEFAULT_PRECEDENCE = 50
+
+    @classmethod
+    def deduplicate_statement_items(cls, statement_items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+        """
+        Remove duplicate revenue concepts from statement items.
+
+        Args:
+            statement_items: List of statement line items
+
+        Returns:
+            Filtered list with duplicates removed.  The input list is not
+            modified; an empty or ``None`` input is returned unchanged.
+        """
+        if not statement_items:
+            return statement_items
+
+        # Group items by period and value to find potential duplicates
+        period_value_groups = cls._group_by_period_value(statement_items)
+
+        # Indices (into statement_items) of the items to drop
+        items_to_remove: Set[int] = set()
+
+        for indexed_items in period_value_groups.values():
+            if len(indexed_items) < 2 or not cls._are_revenue_duplicates(indexed_items):
+                continue
+
+            # Only the revenue members of the group compete with each other.
+            # A non-revenue line that merely shares the value (for example
+            # gross profit equal to revenue) must survive.
+            revenue_items = [
+                (index, item)
+                for index, item in indexed_items
+                if cls._is_revenue_concept(item)
+            ]
+            items_to_remove.update(cls._select_duplicates_to_remove(revenue_items))
+
+        # Filter out the items marked for removal, preserving statement order
+        result = []
+        for i, item in enumerate(statement_items):
+            if i in items_to_remove:
+                log.debug("Removed duplicate revenue item: %s = %s", item.get('label', 'Unknown'), item.get('values', {}))
+            else:
+                result.append(item)
+
+        removed_count = len(statement_items) - len(result)
+        if removed_count > 0:
+            log.info("Revenue deduplication: removed %d duplicate items", removed_count)
+
+        return result
+
+    @classmethod
+    def _group_by_period_value(cls, statement_items: List[Dict[str, Any]]) -> Dict[tuple, List[tuple]]:
+        """
+        Group statement items by (period, value) pairs.
+
+        ``None`` and zero values are ignored so that empty cells never count
+        as duplicates of each other.
+
+        Returns:
+            Dict mapping (period, value) to list of (index, item) tuples
+        """
+        groups = defaultdict(list)
+
+        for i, item in enumerate(statement_items):
+            # 'values' may be missing or explicitly None
+            values = item.get('values') or {}
+            for period, value in values.items():
+                if value is not None and value != 0:
+                    groups[(period, value)].append((i, item))
+
+        return groups
+
+    @classmethod
+    def _are_revenue_duplicates(cls, indexed_items: List[tuple]) -> bool:
+        """
+        Check if a group of items are revenue duplicates.
+
+        Args:
+            indexed_items: List of (index, item) tuples
+
+        Returns:
+            True if more than one item in the group is a revenue concept
+        """
+        revenue_count = sum(1 for _, item in indexed_items if cls._is_revenue_concept(item))
+        return revenue_count > 1
+
+    @classmethod
+    def _candidate_names(cls, item: Dict[str, Any]) -> List[str]:
+        """
+        Return the non-empty string names of an item: its concept followed by
+        any entries of ``all_names``.  Missing or ``None`` fields are skipped.
+        """
+        raw_names = [item.get('concept'), *(item.get('all_names') or [])]
+        return [name for name in raw_names if isinstance(name, str) and name]
+
+    @classmethod
+    def _is_revenue_concept(cls, item: Dict[str, Any]) -> bool:
+        """
+        Check if an item represents a revenue concept.
+
+        An item is revenue when none of its names or its label contains an
+        exclusion term and either a name contains one of
+        REVENUE_RELATED_CONCEPTS or the label mentions revenue or sales.
+        """
+        names = cls._candidate_names(item)
+        label = str(item.get('label') or '').lower()
+
+        # First check for exclusions (costs, expenses, etc.)
+        for text in names + [label]:
+            lowered = text.lower()
+            if any(excl in lowered for excl in cls._EXCLUSION_TERMS):
+                return False
+
+        # Look for revenue-related terms in concept or names
+        for name in names:
+            if any(term in name for term in cls.REVENUE_RELATED_CONCEPTS):
+                return True
+
+        # Fall back to the label; exclusion terms were already ruled out above
+        return any(term in label for term in cls._REVENUE_LABEL_TERMS)
+
+    @classmethod
+    def _select_duplicates_to_remove(cls, indexed_items: List[tuple]) -> Set[int]:
+        """
+        Select which items to remove from a duplicate group.
+
+        The item with the highest precedence score is kept.  When scores tie,
+        the item with the higher index (later in the statement) is kept.
+
+        Args:
+            indexed_items: List of (index, item) tuples
+
+        Returns:
+            Set of indices to remove
+        """
+        if len(indexed_items) <= 1:
+            return set()
+
+        # Rank on (score, index) only, so the item dicts are never compared
+        keep_index, _ = max(
+            indexed_items,
+            key=lambda pair: (cls._get_precedence_score(pair[1]), pair[0]),
+        )
+
+        return {index for index, _ in indexed_items if index != keep_index}
+
+    @classmethod
+    def _get_precedence_score(cls, item: Dict[str, Any]) -> int:
+        """
+        Get the precedence score for a revenue concept.
+
+        Higher scores are preferred and will be kept.  Names are first looked
+        up verbatim in REVENUE_CONCEPT_PRECEDENCE.  If that fails, the local
+        part of each table entry (the text after ``:``) is searched for inside
+        the name, and the longest matching entry wins, so that
+        ``TotalRevenuesAndGains`` is not mistaken for ``Revenues``.
+        """
+        names = cls._candidate_names(item)
+
+        # Check for exact matches in precedence table
+        for name in names:
+            if name in cls.REVENUE_CONCEPT_PRECEDENCE:
+                return cls.REVENUE_CONCEPT_PRECEDENCE[name]
+
+        # Check for partial matches (handle namespace prefixes)
+        local_scores = [
+            (precedence_concept.split(':')[-1], score)
+            for precedence_concept, score in cls.REVENUE_CONCEPT_PRECEDENCE.items()
+        ]
+        for name in names:
+            matches = [(len(local), score) for local, score in local_scores if local in name]
+            if matches:
+                # The longest local name is the most specific match
+                return max(matches)[1]
+
+        # Default score for unrecognized revenue concepts
+        return cls._DEFAULT_PRECEDENCE
+
+    @classmethod
+    def get_deduplication_stats(cls, original_items: List[Dict[str, Any]],
+                              deduplicated_items: List[Dict[str, Any]]) -> Dict[str, Any]:
+        """
+        Generate statistics about the deduplication process.
+
+        Args:
+            original_items: Statement items before deduplication
+            deduplicated_items: Statement items after deduplication
+
+        Returns:
+            Dict with total and revenue-only item counts before and after,
+            the number removed, and a ``deduplication_performed`` flag.
+        """
+        original_count = len(original_items)
+        deduplicated_count = len(deduplicated_items)
+        removed_count = original_count - deduplicated_count
+
+        # Count revenue items
+        original_revenue_count = sum(1 for item in original_items if cls._is_revenue_concept(item))
+        deduplicated_revenue_count = sum(1 for item in deduplicated_items if cls._is_revenue_concept(item))
+
+        return {
+            'original_total_items': original_count,
+            'deduplicated_total_items': deduplicated_count,
+            'removed_items': removed_count,
+            'original_revenue_items': original_revenue_count,
+            'deduplicated_revenue_items': deduplicated_revenue_count,
+            'removed_revenue_items': original_revenue_count - deduplicated_revenue_count,
+            'deduplication_performed': removed_count > 0
+        }