Initial commit

This commit is contained in:
kdusek
2025-12-09 12:13:01 +01:00
commit 8e654ed209
13332 changed files with 2695056 additions and 0 deletions

View File

@@ -0,0 +1,232 @@
"""
Revenue Deduplication Strategy for Issue #438
This module implements intelligent deduplication for revenue concepts
that may have the same underlying value but different GAAP concept names.
The strategy:
1. Identify groups of items with the same value in the same period
2. Apply hierarchical precedence rules to choose the most appropriate concept
3. Filter out less specific concepts when duplicates exist
Revenue Concept Hierarchy (most to least preferred):
1. us-gaap:RevenueFromContractWithCustomerExcludingAssessedTax (most specific - ASC 606)
2. us-gaap:Revenues (standard general concept)
3. us-gaap:SalesRevenueNet (less common)
4. us-gaap:Revenue (generic concept)
5. us-gaap:TotalRevenuesAndGains (broader concept, lowest precedence)
"""
import logging
from collections import defaultdict
from typing import Any, Dict, List, Set
log = logging.getLogger(__name__)


class RevenueDeduplicator:
    """
    Handles deduplication of revenue concepts in financial statements.

    Items are plain dicts. The keys read by this class are ``'concept'``,
    ``'all_names'``, ``'label'`` and ``'values'`` (a mapping of period to
    value). Two revenue items are treated as duplicates when they carry the
    same non-zero value for the same period; the one with the highest
    precedence score is kept.

    Note: removal is per item, not per period. An item that loses in any
    single (period, value) group is dropped from the result entirely, even
    if its values in other periods differ from the item that was kept.
    """

    # Revenue concept precedence (higher number = higher precedence)
    REVENUE_CONCEPT_PRECEDENCE = {
        'us-gaap:RevenueFromContractWithCustomerExcludingAssessedTax': 100,  # Most specific (ASC 606)
        'us-gaap:Revenues': 90,  # Standard concept
        'us-gaap:SalesRevenueNet': 80,  # Alternative concept
        'us-gaap:Revenue': 70,  # Generic concept
        'us-gaap:TotalRevenuesAndGains': 60,  # Broader concept
    }

    # Additional revenue-related concepts that might cause duplicates
    REVENUE_RELATED_CONCEPTS = {
        'RevenueFromContractWithCustomerExcludingAssessedTax',
        'Revenues',
        'Revenue',
        'SalesRevenueNet',
        'TotalRevenuesAndGains',
        'RevenueFromContractWithCustomer',
        'TotalRevenues'
    }

    # Score given to revenue items whose concept is not in the precedence table
    DEFAULT_PRECEDENCE_SCORE = 50

    # Case-insensitive substrings that mark an item as NOT being revenue
    _EXCLUSION_TERMS = ('cost', 'expense', 'loss', 'depreciation', 'amortization')

    # Lower-case substrings of a label that mark an item as revenue
    _LABEL_REVENUE_TERMS = ('revenue', 'sales')

    @classmethod
    def deduplicate_statement_items(cls, statement_items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """
        Remove duplicate revenue concepts from statement items.

        Only items recognised by ``_is_revenue_concept`` are candidates for
        removal; non-revenue items that merely share a value with a revenue
        item in some period are always kept.

        Args:
            statement_items: List of statement line items

        Returns:
            Filtered list with duplicates removed. An empty/falsy input is
            returned unchanged; otherwise a new list is returned and the
            input list is not modified.
        """
        if not statement_items:
            return statement_items

        # Group items by period and value to find potential duplicates
        period_value_groups = cls._group_by_period_value(statement_items)

        # Identify items to remove
        items_to_remove: Set[int] = set()
        for indexed_items in period_value_groups.values():
            # Restrict the group to revenue items so that an unrelated line
            # with a coincidentally equal value is never scored or removed.
            revenue_items = [
                (index, item) for index, item in indexed_items
                if cls._is_revenue_concept(item)
            ]
            if len(revenue_items) > 1:
                # Keep only the highest precedence revenue item
                items_to_remove.update(cls._select_duplicates_to_remove(revenue_items))

        # Filter out the items marked for removal
        result = []
        for i, item in enumerate(statement_items):
            if i not in items_to_remove:
                result.append(item)
            else:
                log.debug("Removed duplicate revenue item: %s = %s", item.get('label', 'Unknown'), item.get('values', {}))

        removed_count = len(statement_items) - len(result)
        if removed_count > 0:
            log.info("Revenue deduplication: removed %d duplicate items", removed_count)
        return result

    @classmethod
    def _group_by_period_value(cls, statement_items: List[Dict[str, Any]]) -> Dict[tuple, List[tuple]]:
        """
        Group statement items by (period, value) pairs.

        Values that are ``None`` or equal to zero are ignored, so empty
        cells never form a duplicate group. A missing or ``None``
        ``'values'`` entry is treated as having no values.

        Returns:
            Dict mapping (period, value) to list of (index, item) tuples
        """
        groups = defaultdict(list)
        for i, item in enumerate(statement_items):
            values = item.get('values') or {}
            for period, value in values.items():
                if value is not None and value != 0:
                    groups[(period, value)].append((i, item))
        return groups

    @classmethod
    def _are_revenue_duplicates(cls, indexed_items: List[tuple]) -> bool:
        """
        Check if a group of items contains revenue duplicates.

        Retained for callers that rely on it; ``deduplicate_statement_items``
        filters each group down to its revenue items directly.

        Args:
            indexed_items: List of (index, item) tuples

        Returns:
            True if more than one item in the group is a revenue concept
        """
        revenue_count = sum(1 for _, item in indexed_items if cls._is_revenue_concept(item))
        # If we have multiple revenue concepts, they're potential duplicates
        return revenue_count > 1

    @staticmethod
    def _candidate_names(item: Dict[str, Any]) -> List[str]:
        """Return the non-empty string names of an item: 'concept' followed by 'all_names'."""
        raw_names = [item.get('concept')]
        all_names = item.get('all_names')
        if isinstance(all_names, str):
            # A bare string is one name, not an iterable of characters
            raw_names.append(all_names)
        elif all_names:
            raw_names.extend(all_names)
        return [name for name in raw_names if isinstance(name, str) and name]

    @staticmethod
    def _local_name(name: str) -> str:
        """Strip a namespace prefix written as 'prefix:Name' or 'prefix_Name'."""
        if ':' in name:
            return name.rsplit(':', 1)[-1]
        if '_' in name:
            return name.split('_', 1)[-1]
        return name

    @classmethod
    def _is_revenue_concept(cls, item: Dict[str, Any]) -> bool:
        """
        Check if an item represents a revenue concept.

        An item is rejected if its concept, any of its names, or its label
        contains an exclusion term (case-insensitive). Otherwise it is
        accepted if a concept/name contains one of
        ``REVENUE_RELATED_CONCEPTS`` (case-sensitive), or if its label
        contains 'revenue' or 'sales'. Missing or ``None`` fields are
        treated as empty.
        """
        names = cls._candidate_names(item)
        label = str(item.get('label') or '').lower()

        # First check for exclusions (costs, expenses, etc.)
        for text in names + [label]:
            lowered = text.lower()
            if any(excl in lowered for excl in cls._EXCLUSION_TERMS):
                return False

        # Look for revenue-related terms in concept or names
        for name in names:
            if any(term in name for term in cls.REVENUE_RELATED_CONCEPTS):
                return True

        # Also check label for revenue indicators (exclusions already ruled out above)
        return any(term in label for term in cls._LABEL_REVENUE_TERMS)

    @classmethod
    def _select_duplicates_to_remove(cls, indexed_items: List[tuple]) -> Set[int]:
        """
        Select which items to remove from a duplicate group.

        The item with the highest precedence score is kept. When scores
        tie, the item with the highest index (the later one in the
        statement) is kept.

        Args:
            indexed_items: List of (index, item) tuples

        Returns:
            Set of indices to remove
        """
        if len(indexed_items) <= 1:
            return set()

        # Rank on (score, index) only, so the item dicts are never compared
        keep_index, _ = max(
            indexed_items,
            key=lambda pair: (cls._get_precedence_score(pair[1]), pair[0]),
        )
        return {index for index, _ in indexed_items if index != keep_index}

    @classmethod
    def _get_precedence_score(cls, item: Dict[str, Any]) -> int:
        """
        Get the precedence score for a revenue concept.

        Higher scores are preferred and will be kept. Matching is tried in
        three passes over the item's concept and names:

        1. exact match against ``REVENUE_CONCEPT_PRECEDENCE`` keys;
        2. exact match on the local name with the namespace prefix removed
           (so 'us-gaap_Revenues' matches 'us-gaap:Revenues');
        3. substring match, longest table entry first, so that a name such
           as 'TotalRevenuesAndGains' is not captured by the shorter
           'Revenues'.

        Returns ``DEFAULT_PRECEDENCE_SCORE`` when nothing matches.
        """
        names = cls._candidate_names(item)

        # Check for exact matches in precedence table
        for name in names:
            if name in cls.REVENUE_CONCEPT_PRECEDENCE:
                return cls.REVENUE_CONCEPT_PRECEDENCE[name]

        # Precedence table keyed by local name (namespace prefix removed)
        local_scores = {
            qualified.split(':')[-1]: score
            for qualified, score in cls.REVENUE_CONCEPT_PRECEDENCE.items()
        }

        # Check for exact local-name matches (handle namespace prefixes)
        for name in names:
            local = cls._local_name(name)
            if local in local_scores:
                return local_scores[local]

        # Fall back to substring matches, most specific (longest) entry first
        by_specificity = sorted(local_scores.items(), key=lambda entry: len(entry[0]), reverse=True)
        for name in names:
            for local, score in by_specificity:
                if local in name:
                    return score

        # Default score for unrecognized revenue concepts
        return cls.DEFAULT_PRECEDENCE_SCORE

    @classmethod
    def get_deduplication_stats(cls, original_items: List[Dict[str, Any]],
                                deduplicated_items: List[Dict[str, Any]]) -> Dict[str, Any]:
        """
        Generate statistics about the deduplication process.

        Args:
            original_items: Items before deduplication
            deduplicated_items: Items after deduplication

        Returns:
            Dict with total and revenue-only item counts before and after,
            the number removed, and a 'deduplication_performed' flag.
        """
        original_count = len(original_items)
        deduplicated_count = len(deduplicated_items)
        removed_count = original_count - deduplicated_count

        # Count revenue items
        original_revenue_count = sum(1 for item in original_items if cls._is_revenue_concept(item))
        deduplicated_revenue_count = sum(1 for item in deduplicated_items if cls._is_revenue_concept(item))

        return {
            'original_total_items': original_count,
            'deduplicated_total_items': deduplicated_count,
            'removed_items': removed_count,
            'original_revenue_items': original_revenue_count,
            'deduplicated_revenue_items': deduplicated_revenue_count,
            'removed_revenue_items': original_revenue_count - deduplicated_revenue_count,
            'deduplication_performed': removed_count > 0
        }