Initial commit
This commit is contained in:
@@ -0,0 +1,44 @@
|
||||
# XBRL2 Standardization
|
||||
|
||||
This package provides functionality for standardizing XBRL concepts across different company filings.
|
||||
|
||||
## Overview
|
||||
|
||||
The standardization module maps company-specific XBRL concepts to standardized concept names,
|
||||
enabling consistent presentation of financial statements regardless of the filing entity.
|
||||
|
||||
This is particularly useful for:
|
||||
- Comparing financial data across different companies
|
||||
- Building standardized reports and visualizations
|
||||
- Creating consistent financial datasets for analysis
|
||||
|
||||
## Components
|
||||
|
||||
- `StandardConcept`: An enumeration of standard financial statement concepts
|
||||
- `MappingStore`: Storage for mappings between company-specific and standard concepts
|
||||
- `ConceptMapper`: Maps company-specific concepts to standard concepts using various techniques
|
||||
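- `standardize_statement`: Function to standardize a statement's labels
- `initialize_default_mappings`: Function that returns the default mapping store passed to `ConceptMapper` (see Usage below)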
|
||||
|
||||
## Usage
|
||||
|
||||
```python
|
||||
from edgar.xbrl.standardization import (
    StandardConcept,
    initialize_default_mappings,
    ConceptMapper,
    standardize_statement,
)
|
||||
|
||||
# Get the default mappings
|
||||
store = initialize_default_mappings()
|
||||
|
||||
# Create a mapper
|
||||
mapper = ConceptMapper(store)
|
||||
|
||||
# Standardize a statement
|
||||
standardized_data = standardize_statement(statement_data, mapper)
|
||||
```
|
||||
|
||||
## Concept Mappings
|
||||
|
||||
The standardized concept mappings are stored in the `concept_mappings.json` file included
|
||||
in this package. This file maps standard concept names to lists of company-specific concept IDs.
|
||||
|
||||
The file is automatically loaded when initializing the `MappingStore` and can be extended
|
||||
with new mappings as needed.
|
||||
@@ -0,0 +1,17 @@
|
||||
"""
|
||||
XBRL concept standardization package.
|
||||
|
||||
This package provides functionality to map company-specific XBRL concepts
|
||||
to standardized concept names, enabling consistent presentation of financial
|
||||
statements regardless of the filing entity.
|
||||
"""
|
||||
|
||||
from edgar.xbrl.standardization.core import ConceptMapper, MappingStore, StandardConcept, initialize_default_mappings, standardize_statement
|
||||
|
||||
# Names re-exported as the public API of the standardization package.
__all__ = [
    "StandardConcept",
    "MappingStore",
    "ConceptMapper",
    "standardize_statement",
    "initialize_default_mappings",
]
|
||||
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,21 @@
|
||||
{
|
||||
"concept_mappings": {
|
||||
"Sales and Service Revenue": [
|
||||
"brka_SalesAndServiceRevenue"
|
||||
]
|
||||
},
|
||||
"hierarchy_rules": {
|
||||
"Revenue": {
|
||||
"components": [
|
||||
"Sales and Service Revenue",
|
||||
"Operating Lease Revenue"
|
||||
],
|
||||
"description": "Total revenue comprises sales/service revenue and operating lease income for holding company"
|
||||
}
|
||||
},
|
||||
"business_context": {
|
||||
"entity_type": "holding_company",
|
||||
"industry": "diversified_conglomerate",
|
||||
"description": "Berkshire Hathaway operates diverse businesses including insurance, utilities, railroads, and manufacturing"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,64 @@
|
||||
{
|
||||
"entity_info": {
|
||||
"name": "Microsoft Corporation",
|
||||
"cik": "0000789019",
|
||||
"ticker": "MSFT",
|
||||
"description": "Microsoft-specific concept mappings for unique business terminology"
|
||||
},
|
||||
|
||||
"concept_mappings": {
|
||||
"_comment_msft_revenue": "Microsoft uses specific revenue categorization that differs from standard tech companies",
|
||||
|
||||
"Product Revenue": [
|
||||
"msft_ProductRevenue",
|
||||
"msft_WindowsCommercialRevenue",
|
||||
"msft_WindowsConsumerRevenue",
|
||||
"msft_OfficeCommercialRevenue"
|
||||
],
|
||||
|
||||
"Service Revenue": [
|
||||
"msft_ServiceRevenue",
|
||||
"msft_CloudServicesRevenue",
|
||||
"msft_ConsultingServicesRevenue"
|
||||
],
|
||||
|
||||
"Subscription Revenue": [
|
||||
"msft_Office365CommercialRevenue",
|
||||
"msft_Office365ConsumerRevenue",
|
||||
"msft_DynamicsRevenue"
|
||||
],
|
||||
|
||||
"Platform Revenue": [
|
||||
"msft_AzureRevenue",
|
||||
"msft_XboxContentAndServicesRevenue"
|
||||
],
|
||||
|
||||
"_comment_msft_expenses": "Microsoft has unique expense categorizations for sales and marketing vs G&A",
|
||||
|
||||
"Sales and Marketing Expense": [
|
||||
"msft_SalesAndMarketingExpense",
|
||||
"msft_AdvertisingAndPromotionExpense"
|
||||
],
|
||||
|
||||
"Technical Support Expense": [
|
||||
"msft_TechnicalSupportExpense",
|
||||
"msft_CustomerSupportExpense"
|
||||
]
|
||||
},
|
||||
|
||||
"hierarchy_rules": {
|
||||
"_comment": "Rules for handling Microsoft-specific hierarchical relationships",
|
||||
|
||||
"revenue_hierarchy": {
|
||||
"parent": "Revenue",
|
||||
"children": ["Product Revenue", "Service Revenue", "Subscription Revenue", "Platform Revenue"],
|
||||
"calculation_rule": "sum"
|
||||
},
|
||||
|
||||
"expense_hierarchy": {
|
||||
"parent": "Operating Expenses",
|
||||
"children": ["Sales and Marketing Expense", "Technical Support Expense"],
|
||||
"calculation_rule": "sum"
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,54 @@
|
||||
{
|
||||
"metadata": {
|
||||
"entity_identifier": "tsla",
|
||||
"company_name": "Tesla, Inc.",
|
||||
"cik": "1318605",
|
||||
"priority": "high",
|
||||
"created_date": "2024-06-25",
|
||||
"last_updated": "2024-06-25",
|
||||
"description": "Tesla-specific concept mappings to handle automotive, energy, and service revenue streams"
|
||||
},
|
||||
"concept_mappings": {
|
||||
"Automotive Revenue": [
|
||||
"tsla_AutomotiveRevenue",
|
||||
"tsla_AutomotiveSales",
|
||||
"tsla_VehicleRevenue"
|
||||
],
|
||||
"Automotive Leasing Revenue": [
|
||||
"tsla_AutomotiveLeasing",
|
||||
"tsla_AutomotiveLeasingRevenue",
|
||||
"tsla_VehicleLeasingRevenue"
|
||||
],
|
||||
"Energy Revenue": [
|
||||
"tsla_EnergyGenerationAndStorageRevenue",
|
||||
"tsla_EnergyRevenue",
|
||||
"tsla_SolarRevenue",
|
||||
"tsla_EnergyStorageRevenue"
|
||||
],
|
||||
"Service Revenue": [
|
||||
"tsla_ServicesAndOtherRevenue",
|
||||
"tsla_ServiceRevenue",
|
||||
"tsla_SuperchargerRevenue"
|
||||
]
|
||||
},
|
||||
"hierarchy_rules": {
|
||||
"Revenue": {
|
||||
"children": [
|
||||
"Automotive Revenue",
|
||||
"Energy Revenue",
|
||||
"Service Revenue"
|
||||
]
|
||||
},
|
||||
"Automotive Revenue": {
|
||||
"children": [
|
||||
"Automotive Leasing Revenue"
|
||||
]
|
||||
}
|
||||
},
|
||||
"business_context": {
|
||||
"primary_revenue_streams": ["automotive", "energy", "services"],
|
||||
"revenue_model": "product_and_service",
|
||||
"key_metrics": ["vehicle_deliveries", "energy_deployments"],
|
||||
"industry": "automotive_technology"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,353 @@
|
||||
{
|
||||
"_comment_revenue_hierarchy": "REVENUE HIERARCHY FIX: Separated total revenue from component revenue types to prevent duplicate labels. Contract and product revenue are components that should have distinct labels from total revenue.",
|
||||
"Revenue": [
|
||||
"us-gaap_Revenue",
|
||||
"us-gaap_Revenues",
|
||||
"us-gaap_SalesRevenueNet",
|
||||
"us-gaap_OperatingRevenue"
|
||||
],
|
||||
"Contract Revenue": [
|
||||
"us-gaap_RevenueFromContractWithCustomerExcludingAssessedTax",
|
||||
"us-gaap_RevenueFromContractWithCustomerIncludingAssessedTax"
|
||||
],
|
||||
"Product Revenue": [
|
||||
"us-gaap_SalesRevenueGoodsNet",
|
||||
"us-gaap_ProductSales"
|
||||
],
|
||||
"Operating Lease Revenue": [
|
||||
"us-gaap_OperatingLeaseLeaseIncome"
|
||||
],
|
||||
"_comment_cost_of_revenue_hierarchy": "COST OF REVENUE HIERARCHY FIX: Separated different cost types to prevent duplicate labels. Different business models (manufacturing, service, mixed) use different cost concepts that should have distinct labels for clarity.",
|
||||
"Cost of Revenue": [
|
||||
"us-gaap_CostOfRevenueAbstract"
|
||||
],
|
||||
"Total Cost of Revenue": [
|
||||
"us-gaap_CostOfRevenue"
|
||||
],
|
||||
"Cost of Goods Sold": [
|
||||
"us-gaap_CostOfGoodsSold"
|
||||
],
|
||||
"Cost of Goods and Services Sold": [
|
||||
"us-gaap_CostOfGoodsAndServicesSold"
|
||||
],
|
||||
"Cost of Sales": [
|
||||
"us-gaap_CostOfSales"
|
||||
],
|
||||
"Cost of Goods and Services Excluding Depreciation": [
|
||||
"us-gaap_CostOfGoodsAndServiceExcludingDepreciationDepletionAndAmortization"
|
||||
],
|
||||
"Direct Operating Costs": [
|
||||
"us-gaap_DirectOperatingCosts"
|
||||
],
|
||||
"Costs and Expenses": [
|
||||
"us-gaap_CostsAndExpenses"
|
||||
],
|
||||
"Gross Profit": [
|
||||
"us-gaap_GrossProfit"
|
||||
],
|
||||
"Operating Expenses": [
|
||||
"us-gaap_NoninterestExpense",
|
||||
"us-gaap_OperatingCostsAndExpenses",
|
||||
"us-gaap_OperatingExpenses"
|
||||
],
|
||||
"Research and Development Expense": [
|
||||
"us-gaap_ResearchAndDevelopmentCosts",
|
||||
"us-gaap_ResearchAndDevelopmentExpense"
|
||||
],
|
||||
"_comment_sga_hierarchy": "SG&A HIERARCHY FIX: Separated total SG&A from components to prevent duplicate labels. Previously all three concepts below mapped to 'Selling, General and Administrative Expense' causing confusion when companies report both total and components.",
|
||||
"Selling, General and Administrative Expense": [
|
||||
"us-gaap_SellingGeneralAndAdministrativeExpense"
|
||||
],
|
||||
"General and Administrative Expense": [
|
||||
"us-gaap_GeneralAndAdministrativeExpense",
|
||||
"us-gaap_AdministrativeExpense"
|
||||
],
|
||||
"Selling Expense": [
|
||||
"us-gaap_SellingAndMarketingExpense",
|
||||
"us-gaap_SellingExpense"
|
||||
],
|
||||
"Marketing Expense": [
|
||||
"us-gaap_MarketingExpense",
|
||||
"us-gaap_AdvertisingExpense"
|
||||
],
|
||||
"Operating Income": [
|
||||
"us-gaap_OperatingIncomeLoss",
|
||||
"us-gaap_OperatingIncome",
|
||||
"us-gaap_IncomeLossFromContinuingOperationsBeforeInterestAndTaxes"
|
||||
],
|
||||
"Nonoperating Income/Expense": [
|
||||
"orcl_NonoperatingIncomeExpenseIncludingEliminationOfNetIncomeLossAttributableToNoncontrollingInterests",
|
||||
"us-gaap_NonoperatingIncomeExpense"
|
||||
],
|
||||
"Interest Expense": [
|
||||
"us-gaap_InterestAndDebtExpense",
|
||||
"us-gaap_InterestExpense",
|
||||
"us-gaap_InterestIncomeExpenseNet"
|
||||
],
|
||||
"Interest Expense (operating)": [
|
||||
"us-gaap_InterestExpenseOperating"
|
||||
],
|
||||
"Interest Expense (non-operating)": [
|
||||
"us-gaap_InterestExpenseNonoperating"
|
||||
],
|
||||
"_comment_income_before_tax_hierarchy": "INCOME BEFORE TAX HIERARCHY FIX: Separated total income before tax from component types to prevent duplicate labels. Continuing operations and extraordinary items are components that should have distinct labels.",
|
||||
"Income Before Tax": [
|
||||
"us-gaap_IncomeLossBeforeIncomeTaxes"
|
||||
],
|
||||
"Income Before Tax from Continuing Operations": [
|
||||
"us-gaap_IncomeLossFromContinuingOperationsBeforeIncomeTaxes",
|
||||
"us-gaap_IncomeLossFromContinuingOperationsBeforeIncomeTaxesExtraordinaryItemsNoncontrollingInterest",
|
||||
"orcl_IncomeLossFromContinuingOperationsIncludingNoncontrollingInterestBeforeIncomeTaxesExtraordinaryItems"
|
||||
],
|
||||
"Income Tax Expense": [
|
||||
"us-gaap_IncomeTaxesPaidNet",
|
||||
"us-gaap_IncomeTaxExpenseBenefit"
|
||||
],
|
||||
"_comment_net_income_hierarchy": "NET INCOME HIERARCHY FIX: Separated total net income from component income types to prevent duplicate labels. Continuing operations income and profit/loss are components that should have distinct labels from total net income.",
|
||||
"Net Income": [
|
||||
"us-gaap_NetIncome",
|
||||
"us-gaap_NetIncomeLoss"
|
||||
],
|
||||
"Net Income from Continuing Operations": [
|
||||
"us-gaap_IncomeLossFromContinuingOperationsIncludingPortionAttributableToNoncontrollingInterest",
|
||||
"us-gaap_IncomeLossFromContinuingOperations"
|
||||
],
|
||||
"Profit or Loss": [
|
||||
"us-gaap_ProfitLoss"
|
||||
],
|
||||
"Net Income Attributable to Noncontrolling Interest": [
|
||||
"us-gaap_NetIncomeLossAttributableToNonredeemableNoncontrollingInterest",
|
||||
"us-gaap_NetIncomeLossAttributableToNoncontrollingInterest"
|
||||
],
|
||||
"Basic Net Income Available to Common Shareholders": [
|
||||
"us-gaap_NetIncomeLossAvailableToCommonStockholdersBasic"
|
||||
],
|
||||
"Diluted Net Income Available to Common Shareholders": [
|
||||
"us-gaap_NetIncomeLossAvailableToCommonStockholdersDiluted"
|
||||
],
|
||||
"Accumulated Other Comprehensive Income/Loss": [
|
||||
"us-gaap_AccumulatedOtherComprehensiveIncomeLossNetOfTax"
|
||||
],
|
||||
"Earnings Per Share": [
|
||||
"us-gaap_EarningsPerShareAbstract"
|
||||
],
|
||||
"Earnings Per Share (Basic)": [
|
||||
"us-gaap_EarningsPerShareBasic"
|
||||
],
|
||||
"Earnings Per Share (Diluted)": [
|
||||
"us-gaap_EarningsPerShareDiluted"
|
||||
],
|
||||
"Shares Outstanding": [
|
||||
"us-gaap_WeightedAverageNumberOfSharesOutstandingAbstract"
|
||||
],
|
||||
"Shares Outstanding (Basic)": [
|
||||
"us-gaap_WeightedAverageNumberOfSharesOutstandingBasic"
|
||||
],
|
||||
"Shares Outstanding (Diluted)": [
|
||||
"us-gaap_WeightedAverageNumberOfDilutedSharesOutstanding"
|
||||
],
|
||||
"Cash and Cash Equivalents": [
|
||||
"us-gaap_CashEquivalentsAtCarryingValue",
|
||||
"us-gaap_Cash",
|
||||
"us-gaap_CashAndCashEquivalentsAtCarryingValue",
|
||||
"us-gaap_CashCashEquivalentsAndShortTermInvestments"
|
||||
],
|
||||
"Accounts Receivable": [
|
||||
"us-gaap_AccountsReceivableNet",
|
||||
"us-gaap_ReceivablesNetCurrent",
|
||||
"us-gaap_AccountsReceivableNetCurrent",
|
||||
"us-gaap_AccountsReceivableGross"
|
||||
],
|
||||
"Inventory": [
|
||||
"us-gaap_InventoryGross",
|
||||
"us-gaap_InventoryFinishedGoods",
|
||||
"us-gaap_InventoryNet"
|
||||
],
|
||||
"Prepaid Expenses": [
|
||||
"us-gaap_PrepaidExpenseAndOtherAssetsCurrent",
|
||||
"us-gaap_PrepaidExpenseCurrent"
|
||||
],
|
||||
"Current Marketable Securities": [
|
||||
"us-gaap_AvailableForSaleSecuritiesDebtSecuritiesCurrent",
|
||||
"us-gaap_MarketableSecuritiesCurrent"
|
||||
],
|
||||
"Non Current Marketable Securities": [
|
||||
"us-gaap_MarketableSecuritiesNoncurrent"
|
||||
],
|
||||
"Total Current Assets": [
|
||||
"us-gaap_AssetsCurrent"
|
||||
],
|
||||
"Total Non Current Assets": [
|
||||
"us-gaap_AssetsNoncurrent"
|
||||
],
|
||||
"Property, Plant and Equipment": [
|
||||
"us-gaap_PropertyPlantAndEquipmentGross",
|
||||
"us-gaap_PropertyPlantAndEquipmentNet",
|
||||
"us-gaap_FixedAssets"
|
||||
],
|
||||
"Goodwill": [
|
||||
"us-gaap_Goodwill"
|
||||
],
|
||||
"Intangible Assets": [
|
||||
"us-gaap_IntangibleAssetsNetIncludingGoodwill",
|
||||
"us-gaap_IntangibleAssetsNetExcludingGoodwill",
|
||||
"us-gaap_FiniteLivedIntangibleAssetsNet"
|
||||
],
|
||||
"Total Assets": [
|
||||
"us-gaap_Assets",
|
||||
"us-gaap_AssetsTotal"
|
||||
],
|
||||
"Long-Term Investments": [
|
||||
"us-gaap_LongTermInvestments"
|
||||
],
|
||||
"Accounts Payable": [
|
||||
"us-gaap_AccountsPayableCurrent",
|
||||
"us-gaap_AccountsPayableTradeCurrent"
|
||||
],
|
||||
"Accrued Liabilities": [
|
||||
"us-gaap_OtherAccruedLiabilitiesCurrent",
|
||||
"us-gaap_AccruedLiabilitiesCurrent",
|
||||
"us-gaap_EmployeeRelatedLiabilitiesCurrent"
|
||||
],
|
||||
"Short Term Debt": [
|
||||
"us-gaap_DebtCurrent",
|
||||
"us-gaap_ShortTermBorrowings",
|
||||
"us-gaap_LongTermDebtCurrent"
|
||||
],
|
||||
"Total Current Liabilities": [
|
||||
"us-gaap_LiabilitiesCurrent"
|
||||
],
|
||||
"Total Non Current Liabilities": [
|
||||
"us-gaap_LiabilitiesNoncurrent"
|
||||
],
|
||||
"Long Term Debt": [
|
||||
"us-gaap_LongTermDebtAndCapitalLeaseObligations",
|
||||
"us-gaap_LongTermDebt",
|
||||
"us-gaap_LongTermBorrowings",
|
||||
"us-gaap_LongTermDebtNoncurrent"
|
||||
],
|
||||
"Notes Payable, Current": [
|
||||
"us-gaap_NotesPayableCurrent"
|
||||
],
|
||||
"Notes Payable, Non Current": [
|
||||
"us-gaap_LongTermNotesAndLoans"
|
||||
],
|
||||
"Deferred Revenue": [
|
||||
"us-gaap_DeferredRevenueNoncurrent",
|
||||
"us-gaap_DeferredRevenueCurrent",
|
||||
"us-gaap_DeferredRevenue"
|
||||
],
|
||||
"Total Liabilities": [
|
||||
"us-gaap_LiabilitiesTotal",
|
||||
"us-gaap_Liabilities"
|
||||
],
|
||||
"Common Stock Shares Outstanding": [
|
||||
"us-gaap_CommonStockSharesOutstanding"
|
||||
],
|
||||
"Common Stock Shares Issued": [
|
||||
"us-gaap_CommonStockSharesIssued"
|
||||
],
|
||||
"Common Stock": [
|
||||
"us-gaap_CommonStocksIncludingAdditionalPaidInCapital",
|
||||
"us-gaap_StockholdersEquityCommonStock",
|
||||
"us-gaap_CommonStockValue"
|
||||
],
|
||||
"Preferred Stock": [
|
||||
"us-gaap_PreferredStockValue"
|
||||
],
|
||||
"Treasury Stock Common Value": [
|
||||
"us-gaap_TreasuryStockCommonValue",
|
||||
"us-gaap_TreasuryStockValue"
|
||||
],
|
||||
"Retained Earnings": [
|
||||
"us-gaap_RetainedEarnings",
|
||||
"us-gaap_RetainedEarningsAccumulatedDeficit"
|
||||
],
|
||||
"Minority Interest": [
|
||||
"us-gaap_MinorityInterest",
|
||||
"us-gaap_NoncontrollingInterest"
|
||||
],
|
||||
"Total Stockholders' Equity": [
|
||||
"us-gaap_EquityAttributableToParent",
|
||||
"us-gaap_StockholdersEquity",
|
||||
"us-gaap_StockholdersEquityIncludingPortionAttributableToNoncontrollingInterest",
|
||||
"us-gaap_StockholdersEquityAttributableToParent"
|
||||
],
|
||||
"Total Liabilities and Stockholders' Equity": [
|
||||
"us-gaap_LiabilitiesAndStockholdersEquity"
|
||||
],
|
||||
"Net Cash from Operating Activities": [
|
||||
"us-gaap_NetCashProvidedByUsedInOperatingActivities",
|
||||
"us-gaap_NetCashProvidedByUsedInOperatingActivitiesContinuingOperations"
|
||||
],
|
||||
"Net Cash from Investing Activities": [
|
||||
"us-gaap_NetCashProvidedByUsedInInvestingActivities",
|
||||
"us-gaap_NetCashProvidedByUsedInInvestingActivitiesContinuingOperations"
|
||||
],
|
||||
"Net Cash from Financing Activities": [
|
||||
"us-gaap_NetCashProvidedByUsedInFinancingActivitiesContinuingOperations",
|
||||
"us-gaap_NetCashProvidedByUsedInFinancingActivities"
|
||||
],
|
||||
"Net Change in Cash": [
|
||||
"us-gaap_IncreaseDecreaseInCashAndCashEquivalents",
|
||||
"us-gaap_CashAndCashEquivalentsPeriodIncreaseDecrease",
|
||||
"us-gaap_CashCashEquivalentsRestrictedCashAndRestrictedCashEquivalentsPeriodIncreaseDecreaseIncludingExchangeRateEffect"
|
||||
],
|
||||
"Payments for Property, Plant and Equipment": [
|
||||
"us-gaap_PaymentsToAcquirePropertyPlantAndEquipment"
|
||||
],
|
||||
"Payments of Dividends": [
|
||||
"us-gaap_PaymentsOfDividends"
|
||||
],
|
||||
"Tax Withholding for Share-Based Compensation": [
|
||||
"us-gaap_PaymentsRelatedToTaxWithholdingForShareBasedCompensation"
|
||||
],
|
||||
"Payments to Acquire Businesses": [
|
||||
"us-gaap_PaymentsToAcquireBusinessesNetOfCashAcquired"
|
||||
],
|
||||
"Proceeds from Issuance of Common Stock": [
|
||||
"us-gaap_ProceedsFromIssuanceOfCommonStock"
|
||||
],
|
||||
"Proceeds from Issuance of Long-Term Debt": [
|
||||
"us-gaap_ProceedsFromIssuanceOfLongTermDebt"
|
||||
],
|
||||
"Proceeds from Maturities, Prepayments and Calls of Securities": [
|
||||
"us-gaap_ProceedsFromMaturitiesPrepaymentsAndCallsOfAvailableForSaleSecurities"
|
||||
],
|
||||
"Proceeds from Sale and Maturity of Other Investments": [
|
||||
"us-gaap_ProceedsFromSaleAndMaturityOfOtherInvestments"
|
||||
],
|
||||
"Proceeds from Sale of Debt Securities, Available-for-Sale": [
|
||||
"us-gaap_ProceedsFromSaleOfAvailableForSaleSecuritiesDebt"
|
||||
],
|
||||
"Proceeds from (Repayments of) Commercial Paper": [
|
||||
"us-gaap_ProceedsFromRepaymentsOfCommercialPaper"
|
||||
],
|
||||
"Other Assets": [
|
||||
"us-gaap_OtherAssets"
|
||||
],
|
||||
"Other Current Assets": [
|
||||
"us-gaap_OtherAssetsCurrent"
|
||||
],
|
||||
"Other Non Current Assets": [
|
||||
"us-gaap_OtherAssetsNoncurrent"
|
||||
],
|
||||
"Deferred Tax Assets": [
|
||||
"us-gaap_DeferredIncomeTaxAssetsNet"
|
||||
],
|
||||
"Other Liabilities": [
|
||||
"us-gaap_OtherLiabilities"
|
||||
|
||||
],
|
||||
"Other Current Liabilities": [
|
||||
"us-gaap_OtherLiabilitiesCurrent"
|
||||
],
|
||||
"Other Non Current Liabilities": [
|
||||
"us-gaap_OtherLiabilitiesNoncurrent"
|
||||
],
|
||||
"Depreciation and Amortization": [
|
||||
"us-gaap_AmortizationOfIntangibleAssets",
|
||||
"us-gaap_Depreciation",
|
||||
"us-gaap_DepreciationAndAmortization"
|
||||
]
|
||||
}
|
||||
@@ -0,0 +1,817 @@
|
||||
"""
|
||||
Module for standardizing XBRL concepts across different company filings.
|
||||
|
||||
This module provides functionality to map company-specific XBRL concepts
|
||||
to standardized concept names, enabling consistent presentation of financial
|
||||
statements regardless of the filing entity.
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
from difflib import SequenceMatcher
|
||||
from enum import Enum
|
||||
from typing import Any, Dict, List, Optional, Set, Tuple
|
||||
|
||||
import pandas as pd
|
||||
|
||||
|
||||
class StandardConcept(str, Enum):
    """
    Canonical names for the financial-statement line items used by this module.

    Each member's string value is the display label used for presentation.
    These labels should match keys in concept_mappings.json.
    """

    # --- Balance Sheet: assets ---
    CASH_AND_EQUIVALENTS = "Cash and Cash Equivalents"
    ACCOUNTS_RECEIVABLE = "Accounts Receivable"
    INVENTORY = "Inventory"
    PREPAID_EXPENSES = "Prepaid Expenses"
    TOTAL_CURRENT_ASSETS = "Total Current Assets"
    PROPERTY_PLANT_EQUIPMENT = "Property, Plant and Equipment"
    GOODWILL = "Goodwill"
    INTANGIBLE_ASSETS = "Intangible Assets"
    TOTAL_ASSETS = "Total Assets"

    # --- Balance Sheet: liabilities ---
    ACCOUNTS_PAYABLE = "Accounts Payable"
    ACCRUED_LIABILITIES = "Accrued Liabilities"
    SHORT_TERM_DEBT = "Short Term Debt"
    TOTAL_CURRENT_LIABILITIES = "Total Current Liabilities"
    LONG_TERM_DEBT = "Long Term Debt"
    DEFERRED_REVENUE = "Deferred Revenue"
    TOTAL_LIABILITIES = "Total Liabilities"

    # --- Balance Sheet: equity ---
    COMMON_STOCK = "Common Stock"
    RETAINED_EARNINGS = "Retained Earnings"
    TOTAL_EQUITY = "Total Stockholders' Equity"

    # --- Income Statement: revenue hierarchy ---
    REVENUE = "Revenue"
    CONTRACT_REVENUE = "Contract Revenue"
    PRODUCT_REVENUE = "Product Revenue"
    SERVICE_REVENUE = "Service Revenue"
    SUBSCRIPTION_REVENUE = "Subscription Revenue"
    LEASING_REVENUE = "Leasing Revenue"

    # --- Industry-specific revenue concepts ---
    AUTOMOTIVE_REVENUE = "Automotive Revenue"
    AUTOMOTIVE_LEASING_REVENUE = "Automotive Leasing Revenue"
    ENERGY_REVENUE = "Energy Revenue"
    SOFTWARE_REVENUE = "Software Revenue"
    HARDWARE_REVENUE = "Hardware Revenue"
    PLATFORM_REVENUE = "Platform Revenue"

    # --- Income Statement: expenses ---
    COST_OF_REVENUE = "Cost of Revenue"
    COST_OF_GOODS_SOLD = "Cost of Goods Sold"
    COST_OF_GOODS_AND_SERVICES_SOLD = "Cost of Goods and Services Sold"
    COST_OF_SALES = "Cost of Sales"
    COSTS_AND_EXPENSES = "Costs and Expenses"
    DIRECT_OPERATING_COSTS = "Direct Operating Costs"
    GROSS_PROFIT = "Gross Profit"
    OPERATING_EXPENSES = "Operating Expenses"
    RESEARCH_AND_DEVELOPMENT = "Research and Development Expense"

    # --- Enhanced expense hierarchy ---
    SELLING_GENERAL_ADMIN = "Selling, General and Administrative Expense"
    SELLING_EXPENSE = "Selling Expense"
    GENERAL_ADMIN_EXPENSE = "General and Administrative Expense"
    MARKETING_EXPENSE = "Marketing Expense"
    SALES_EXPENSE = "Sales Expense"

    # --- Other income-statement items ---
    OPERATING_INCOME = "Operating Income"
    INTEREST_EXPENSE = "Interest Expense"
    INCOME_BEFORE_TAX = "Income Before Tax"
    INCOME_BEFORE_TAX_CONTINUING_OPS = "Income Before Tax from Continuing Operations"
    INCOME_TAX_EXPENSE = "Income Tax Expense"
    NET_INCOME = "Net Income"
    NET_INCOME_CONTINUING_OPS = "Net Income from Continuing Operations"
    NET_INCOME_NONCONTROLLING = "Net Income Attributable to Noncontrolling Interest"
    PROFIT_OR_LOSS = "Profit or Loss"

    # --- Cash Flow Statement ---
    CASH_FROM_OPERATIONS = "Net Cash from Operating Activities"
    CASH_FROM_INVESTING = "Net Cash from Investing Activities"
    CASH_FROM_FINANCING = "Net Cash from Financing Activities"
    NET_CHANGE_IN_CASH = "Net Change in Cash"

    @classmethod
    def get_from_label(cls, label: str) -> Optional['StandardConcept']:
        """
        Find the member whose value equals the given label.

        Args:
            label: Display label to search for (compared with ``==``).

        Returns:
            The first member whose value matches, or None when no member matches.
        """
        return next((member for member in cls if member.value == label), None)

    @classmethod
    def get_all_values(cls) -> Set[str]:
        """
        Collect the label of every member.

        Returns:
            A set containing each member's string value.
        """
        labels = set()
        for member in cls:
            labels.add(member.value)
        return labels
|
||||
|
||||
|
||||
class MappingStore:
|
||||
"""
|
||||
Storage for mappings between company-specific concepts and standard concepts.
|
||||
|
||||
Attributes:
|
||||
source (str): Path to the JSON file storing the mappings
|
||||
mappings (Dict[str, Set[str]]): Dictionary mapping standard concepts to sets of company concepts
|
||||
company_mappings (Dict[str, Dict]): Company-specific mappings loaded from company_mappings/
|
||||
merged_mappings (Dict[str, List[Tuple]]): Merged mappings with priority scoring
|
||||
"""
|
||||
|
||||
def __init__(self, source: Optional[str] = None, validate_with_enum: bool = False, read_only: bool = False):
|
||||
"""
|
||||
Initialize the mapping store.
|
||||
|
||||
Args:
|
||||
source: Path to the JSON file storing the mappings. If None, uses default location.
|
||||
validate_with_enum: Whether to validate JSON keys against StandardConcept enum
|
||||
read_only: If True, never save changes back to the file (used in testing)
|
||||
"""
|
||||
self.read_only = read_only
|
||||
|
||||
|
||||
if source is None:
|
||||
# Try a few different ways to locate the file, handling both development
|
||||
# and installed package scenarios
|
||||
self.source = None
|
||||
|
||||
# Default to a file in the same directory as this module (development mode)
|
||||
module_dir = os.path.dirname(os.path.abspath(__file__))
|
||||
potential_path = os.path.join(module_dir, "concept_mappings.json")
|
||||
if os.path.exists(potential_path):
|
||||
self.source = potential_path
|
||||
|
||||
# If not found, try to load from package data (installed package)
|
||||
if self.source is None:
|
||||
try:
|
||||
import importlib.resources as pkg_resources
|
||||
try:
|
||||
# For Python 3.9+
|
||||
with pkg_resources.files('edgar.xbrl.standardization').joinpath('concept_mappings.json').open('r') as f:
|
||||
# Just read the file to see if it exists, we'll load it properly later
|
||||
f.read(1)
|
||||
self.source = potential_path # Use the same path as before
|
||||
except (ImportError, FileNotFoundError, AttributeError):
|
||||
# Fallback for older Python versions
|
||||
try:
|
||||
import pkg_resources as legacy_resources
|
||||
if legacy_resources.resource_exists('edgar.xbrl.standardization', 'concept_mappings.json'):
|
||||
self.source = potential_path # Use the same path as before
|
||||
except (ImportError, FileNotFoundError):
|
||||
pass
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
# If we still haven't found the file, use the default path anyway
|
||||
# (it will fail gracefully in _load_mappings)
|
||||
if self.source is None:
|
||||
self.source = potential_path
|
||||
else:
|
||||
self.source = source
|
||||
|
||||
self.mappings = self._load_mappings()
|
||||
|
||||
# Load company-specific mappings (always enabled)
|
||||
self.company_mappings = self._load_all_company_mappings()
|
||||
self.merged_mappings = self._create_merged_mappings()
|
||||
self.hierarchy_rules = self._load_hierarchy_rules()
|
||||
|
||||
# Validate the loaded mappings against StandardConcept enum
|
||||
if validate_with_enum:
|
||||
self.validate_against_enum()
|
||||
|
||||
def validate_against_enum(self) -> Tuple[bool, List[str]]:
    """
    Compare the keys of ``self.mappings`` with the StandardConcept labels.

    Keys that have no matching enum label are logged as a warning; enum labels
    that have no mapping are logged at info level (informational only, they do
    not affect the result).

    Returns:
        Tuple of (is_valid, missing_keys): ``is_valid`` is True when every
        mapping key is a StandardConcept label, and ``missing_keys`` lists the
        offending keys in sorted order so the result is deterministic.
    """
    import logging

    logger = logging.getLogger(__name__)

    standard_values = StandardConcept.get_all_values()
    json_keys = set(self.mappings.keys())

    # Sort the set differences once: the same ordered lists feed both the log
    # messages and the return value.
    missing_in_enum = sorted(json_keys - standard_values)
    missing_in_json = sorted(standard_values - json_keys)

    if missing_in_enum:
        logger.warning("Found %d keys in concept_mappings.json that don't exist in StandardConcept enum: %s", len(missing_in_enum), missing_in_enum)

    if missing_in_json:
        logger.info("Found %d StandardConcept values without mappings in concept_mappings.json: %s", len(missing_in_json), missing_in_json)

    return not missing_in_enum, missing_in_enum
|
||||
|
||||
def to_dataframe(self) -> pd.DataFrame:
|
||||
"""
|
||||
Convert mappings to a pandas DataFrame for analysis and visualization.
|
||||
|
||||
Returns:
|
||||
DataFrame with columns for standard_concept and company_concept
|
||||
"""
|
||||
try:
|
||||
import pandas as pd
|
||||
except ImportError:
|
||||
raise ImportError("pandas is required for to_dataframe() but is not installed") from None
|
||||
|
||||
rows = []
|
||||
for standard_concept, company_concepts in self.mappings.items():
|
||||
for company_concept in company_concepts:
|
||||
rows.append({
|
||||
'standard_concept': standard_concept,
|
||||
'company_concept': company_concept
|
||||
})
|
||||
|
||||
return pd.DataFrame(rows)
|
||||
|
||||
|
||||
def _load_all_company_mappings(self) -> Dict[str, Dict]:
|
||||
"""Load all company-specific mapping files from company_mappings/ directory."""
|
||||
mappings = {}
|
||||
company_dir = os.path.join(os.path.dirname(self.source or __file__), "company_mappings")
|
||||
|
||||
if os.path.exists(company_dir):
|
||||
for file in os.listdir(company_dir):
|
||||
if file.endswith("_mappings.json"):
|
||||
entity_id = file.replace("_mappings.json", "")
|
||||
try:
|
||||
with open(os.path.join(company_dir, file), 'r') as f:
|
||||
company_data = json.load(f)
|
||||
mappings[entity_id] = company_data
|
||||
except (FileNotFoundError, json.JSONDecodeError) as e:
|
||||
import logging
|
||||
logger = logging.getLogger(__name__)
|
||||
logger.warning("Failed to load %s: %s", file, e)
|
||||
|
||||
return mappings
|
||||
|
||||
def _create_merged_mappings(self) -> Dict[str, List[Tuple[str, str, int]]]:
|
||||
"""Create merged mappings with priority scoring.
|
||||
|
||||
Priority levels:
|
||||
1. Core mappings (lowest)
|
||||
2. Company mappings (higher)
|
||||
3. Company-specific matches (highest when company detected)
|
||||
|
||||
Returns:
|
||||
Dict mapping standard concepts to list of (company_concept, source, priority) tuples
|
||||
"""
|
||||
merged = {}
|
||||
|
||||
# Add core mappings (priority 1 - lowest)
|
||||
for std_concept, company_concepts in self.mappings.items():
|
||||
merged[std_concept] = []
|
||||
for concept in company_concepts:
|
||||
merged[std_concept].append((concept, "core", 1))
|
||||
|
||||
# Add company mappings (priority 2 - higher)
|
||||
for entity_id, company_data in self.company_mappings.items():
|
||||
concept_mappings = company_data.get("concept_mappings", {})
|
||||
priority_level = 2
|
||||
|
||||
for std_concept, company_concepts in concept_mappings.items():
|
||||
if std_concept not in merged:
|
||||
merged[std_concept] = []
|
||||
for concept in company_concepts:
|
||||
merged[std_concept].append((concept, entity_id, priority_level))
|
||||
|
||||
return merged
|
||||
|
||||
def _load_hierarchy_rules(self) -> Dict[str, Dict]:
    """Collect the ``hierarchy_rules`` sections of all loaded company mappings.

    Rules are merged in the iteration order of ``self.company_mappings``; when
    two companies define a rule under the same key, the later one wins.

    Returns:
        Single dict holding the merged rules (empty when no company defines any).
    """
    combined = {}
    for payload in self.company_mappings.values():
        combined.update(payload.get("hierarchy_rules", {}))
    return combined
|
||||
|
||||
def _detect_entity_from_concept(self, concept: str) -> Optional[str]:
    """Return the known company whose identifier prefixes *concept*, if any.

    The prefix is the text before the first underscore, lowercased. It is
    returned only when it is a key of ``self.company_mappings``; concepts
    without an underscore, or with an unknown prefix, yield ``None``.
    """
    head, separator, _rest = concept.partition('_')
    if not separator:
        return None

    candidate = head.lower()
    return candidate if candidate in self.company_mappings else None
|
||||
|
||||
def _load_mappings(self) -> Dict[str, Set[str]]:
    """
    Load the core concept mappings.

    ``self.source`` is read first. If it cannot be opened, the copy of
    ``concept_mappings.json`` shipped in ``edgar.xbrl.standardization`` is
    tried through ``importlib.resources`` and then through the legacy
    ``pkg_resources`` API. If every attempt fails, a warning is logged and
    an empty mapping is returned.

    Two JSON layouts are accepted, and may be mixed in one file:

    * flat:   ``{standard_concept: [company_concept, ...]}``
    * nested: ``{statement_type: {standard_concept: [company_concept, ...]}}``

    A standard concept that appears more than once (for example under two
    statement types) gets the union of all its company concepts.

    Returns:
        Dictionary mapping standard concepts to sets of company concepts
    """
    import logging

    data = None

    # First try direct file access
    try:
        with open(self.source, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except OSError:
        # File missing or unreadable: fall back to the packaged resource
        try:
            import importlib.resources as resources

            try:
                # importlib.resources.files() needs Python 3.9+
                packaged = resources.files('edgar.xbrl.standardization').joinpath('concept_mappings.json')
                with packaged.open('r') as f:
                    data = json.load(f)
            except (ImportError, FileNotFoundError, AttributeError):
                # Fallback to legacy pkg_resources
                import pkg_resources as legacy_resources

                resource_string = legacy_resources.resource_string('edgar.xbrl.standardization', 'concept_mappings.json')
                data = json.loads(resource_string)
        except Exception:
            # Best effort: every loading strategy failed, so report it instead
            # of failing silently and continue with no mappings
            logging.getLogger(__name__).warning("Could not load concept_mappings.json. Standardization will be limited.")

    # The initialize_default_mappings function will create a file if needed
    if not data:
        return {}

    flattened = {}
    for key, value in data.items():
        if isinstance(value, dict):
            # Nested entry: key is a statement type, value holds the real mappings
            for standard_concept, company_concepts in value.items():
                flattened.setdefault(standard_concept, set()).update(company_concepts)
        else:
            # Flat entry: key is already the standard concept
            flattened.setdefault(key, set()).update(value)

    return flattened
|
||||
|
||||
def _save_mappings(self) -> None:
    """Save mappings to the JSON file, unless in read_only mode.

    ``self.mappings`` is written to ``self.source`` as an object whose values
    are sorted lists, so saving the same mappings twice produces identical
    files. The parent directory is created when missing.
    """
    # Skip saving if in read_only mode
    if self.read_only:
        return

    # Ensure directory exists (an empty dirname means the current directory)
    directory = os.path.dirname(self.source)
    if directory:
        os.makedirs(directory, exist_ok=True)

    # Sets are not JSON serializable and have no stable order: emit sorted lists
    serializable_mappings = {k: sorted(v) for k, v in self.mappings.items()}

    with open(self.source, 'w') as f:
        json.dump(serializable_mappings, f, indent=2)
|
||||
|
||||
def add(self, company_concept: str, standard_concept: str) -> None:
    """
    Register a company concept under a standard concept and persist the change.

    Args:
        company_concept: The company-specific concept
        standard_concept: The standard concept it should map to
    """
    # Create the bucket on first use, then record the concept in it
    bucket = self.mappings.setdefault(standard_concept, set())
    bucket.add(company_concept)

    self._save_mappings()
|
||||
|
||||
def get_standard_concept(self, company_concept: str, context: Dict = None) -> Optional[str]:
    """
    Resolve a company concept to its standard concept, preferring higher-priority sources.

    The merged mappings are searched for entries whose concept equals
    ``company_concept``. Each match keeps its stored priority, except that a
    match whose source is the company detected from the concept's own prefix
    is raised to priority 4. The highest-priority match wins; among equal
    priorities the first one encountered wins. When the merged mappings are
    empty or contain no match, the core mappings are searched instead.

    Args:
        company_concept: The company-specific concept
        context: Optional context information (not used in current implementation)

    Returns:
        The standard concept or None if not found
    """
    if self.merged_mappings:
        # Company inferred from the concept's prefix (e.g. 'tsla_Revenue' -> 'tsla')
        owner = self._detect_entity_from_concept(company_concept)

        winner = None  # (standard concept, effective priority, source)
        for standard, entries in self.merged_mappings.items():
            for name, source, priority in entries:
                if name != company_concept:
                    continue

                # Highest priority for exact company match
                rank = 4 if owner and source == owner else priority

                # Strict '>' keeps the earliest of equally ranked matches
                if winner is None or rank > winner[1]:
                    winner = (standard, rank, source)

        if winner is not None:
            import logging
            logger = logging.getLogger(__name__)
            logger.debug("Mapping applied: %s -> %s (source: %s, priority: %s)", company_concept, winner[0], winner[2], winner[1])
            return winner[0]

    # Fallback to core mappings
    for standard, names in self.mappings.items():
        if company_concept in names:
            return standard

    return None
|
||||
|
||||
def get_company_concepts(self, standard_concept: str) -> Set[str]:
    """
    Look up the company concepts registered for a standard concept.

    Args:
        standard_concept: The standard concept

    Returns:
        The set stored in the core mappings for that concept, or a new empty
        set when the concept has no mappings
    """
    if standard_concept in self.mappings:
        return self.mappings[standard_concept]
    return set()
|
||||
|
||||
|
||||
class ConceptMapper:
    """
    Maps company-specific concepts to standard concepts using various techniques.

    Attributes:
        mapping_store (MappingStore): Storage for concept mappings
        pending_mappings (Dict): Low-confidence mappings pending review, keyed by
            standard concept; each value is a list of (concept, confidence, label)
        _cache (Dict): In-memory cache of mapped concepts, keyed by
            (company concept, statement type)
    """

    def __init__(self, mapping_store: "MappingStore"):
        """
        Initialize the concept mapper.

        Args:
            mapping_store: Storage for concept mappings
        """
        self.mapping_store = mapping_store
        self.pending_mappings = {}
        # Cache for faster lookups of previously mapped concepts
        self._cache = {}
        # Precompute lowercased standard concept values for faster comparison
        self._std_concept_values = [(concept, concept.value.lower()) for concept in StandardConcept]

        # Statement-specific keyword sets for faster contextual matching
        self._bs_keywords = {'assets', 'liabilities', 'equity', 'cash', 'debt', 'inventory', 'receivable', 'payable'}
        self._is_keywords = {'revenue', 'sales', 'income', 'expense', 'profit', 'loss', 'tax', 'earnings'}
        self._cf_keywords = {'cash', 'operating', 'investing', 'financing', 'activities'}

    def map_concept(self, company_concept: str, label: str, context: Dict[str, Any]) -> Optional[str]:
        """
        Map a company concept to a standard concept.

        Only the mapping store is consulted; label-based inference
        (``_infer_mapping``) is not applied here. Both hits and misses are
        cached per (concept, statement type).

        Args:
            company_concept: The company-specific concept
            label: The label for the concept (not used by the lookup)
            context: Additional context information; only ``statement_type``
                is read, for the cache key. ``None`` is treated as empty.

        Returns:
            The standard concept or None if no mapping found
        """
        cache_key = (company_concept, (context or {}).get('statement_type', ''))
        if cache_key in self._cache:
            return self._cache[cache_key]

        # Any falsy answer from the store is normalized to None before caching
        standard_concept = self.mapping_store.get_standard_concept(company_concept) or None
        self._cache[cache_key] = standard_concept
        return standard_concept

    def _infer_mapping(self, company_concept: str, label: str, context: Dict[str, Any]) -> Tuple[Optional[str], float]:
        """
        Infer a mapping between a company concept and a standard concept.

        The label is matched in stages: a few hard-coded phrases, an exact
        (case-insensitive) match against the standard concept values, then
        the best ``SequenceMatcher`` ratio, optionally boosted by
        statement-specific rules.

        Args:
            company_concept: The company-specific concept (not used by the inference)
            label: The label for the concept
            context: Additional context information; only ``statement_type`` is read

        Returns:
            Tuple of (standard_concept, confidence); ``(None, 0.0)`` when the
            best confidence stays below 0.5
        """
        label_lower = label.lower()

        # Quick matching for common concepts without full sequence matching
        if "total assets" in label_lower:
            return StandardConcept.TOTAL_ASSETS.value, 0.95
        elif "revenue" in label_lower and len(label_lower) < 30:  # Only match short labels to avoid false positives
            return StandardConcept.REVENUE.value, 0.9
        elif "net income" in label_lower and "parent" not in label_lower:
            return StandardConcept.NET_INCOME.value, 0.9

        # Direct match checking with precomputed lowercase values
        for std_concept, std_value_lower in self._std_concept_values:
            if std_value_lower == label_lower:
                return std_concept.value, 1.0  # Perfect match

        statement_type = context.get("statement_type", "")

        # Narrow the candidates to the statement's vocabulary, but only when
        # the label itself uses that vocabulary
        if statement_type == "BalanceSheet":
            keywords = self._bs_keywords
        elif statement_type == "IncomeStatement":
            keywords = self._is_keywords
        elif statement_type == "CashFlowStatement":
            keywords = self._cf_keywords
        else:
            keywords = None

        candidates = []
        if keywords and any(kw in label_lower for kw in keywords):
            candidates = [(c, v) for c, v in self._std_concept_values
                          if any(kw in v for kw in keywords)]
        if not candidates:
            # No usable filter: compare against every standard concept
            candidates = self._std_concept_values

        # Fall back to sequence matching for similarity
        best_match = None
        best_score = 0.0
        for std_concept, std_value_lower in candidates:
            similarity = SequenceMatcher(None, label_lower, std_value_lower).ratio()
            if similarity > best_score:
                best_score = similarity
                best_match = std_concept.value

        # Contextual rules: pick the concept the label's wording points at
        # (first matching rule only), then boost if the best match agrees
        expected = None
        if statement_type == "BalanceSheet":
            has_total = "total" in label_lower
            if "assets" in label_lower and has_total:
                expected = StandardConcept.TOTAL_ASSETS
            elif "liabilities" in label_lower and has_total:
                expected = StandardConcept.TOTAL_LIABILITIES
            elif "equity" in label_lower and (has_total or "stockholders" in label_lower):
                expected = StandardConcept.TOTAL_EQUITY
        elif statement_type == "IncomeStatement":
            if "revenue" in label_lower or "sales" in label_lower:
                expected = StandardConcept.REVENUE
            elif "net income" in label_lower:
                expected = StandardConcept.NET_INCOME

        if expected is not None and best_match == expected.value:
            best_score = min(1.0, best_score + 0.2)

        # Promote to 0.5 confidence if score close enough to help match
        # more items that are almost at threshold
        if 0.45 <= best_score < 0.5:
            best_score = 0.5

        # If confidence is too low, return None
        if best_score < 0.5:
            return None, 0.0

        return best_match, best_score

    def learn_mappings(self, filings: List[Dict[str, Any]]) -> None:
        """
        Learn mappings from a list of filings.

        Concepts already present in the store's core mappings are skipped.
        For the rest a mapping is inferred from the label: confidence >= 0.9
        is added to the store and cached, confidence >= 0.5 is recorded in
        ``pending_mappings`` for review. Entries without a concept or a
        label are ignored.

        Args:
            filings: List of dicts with XBRL data; ``concept`` and ``label``
                are read, plus the optional ``statement_type``,
                ``calculation_parent`` and ``position``
        """
        # Pre-compute the set of already mapped concepts
        mapped_concepts = set()
        for company_concepts in self.mapping_store.mappings.values():
            mapped_concepts.update(company_concepts)

        mappings_to_add = {}
        # Cache entries are keyed by the statement type of the filing that
        # produced the mapping, so they are collected alongside each inference
        cache_updates = {}

        for filing in filings:
            concept = filing.get("concept")
            label = filing.get("label")
            if not concept or not label or concept in mapped_concepts:
                continue

            statement_type = filing.get("statement_type", "")
            context = {
                "statement_type": statement_type,
                "calculation_parent": filing.get("calculation_parent", ""),
                "position": filing.get("position", "")
            }

            standard_concept, confidence = self._infer_mapping(concept, label, context)
            if not standard_concept:
                continue

            if confidence >= 0.9:
                mappings_to_add.setdefault(standard_concept, set()).add(concept)
                cache_updates[(concept, statement_type)] = standard_concept
            elif confidence >= 0.5:
                self.pending_mappings.setdefault(standard_concept, []).append((concept, confidence, label))

        for std_concept, concepts in mappings_to_add.items():
            for concept in concepts:
                self.mapping_store.add(concept, std_concept)

        self._cache.update(cache_updates)

    def save_pending_mappings(self, destination: str) -> None:
        """
        Save pending mappings to a file.

        Each pending tuple is written as an object with ``concept``,
        ``confidence`` and ``label`` keys, grouped by standard concept.

        Args:
            destination: Path to save the pending mappings
        """
        serializable_mappings = {
            std_concept: [
                {"concept": c, "confidence": conf, "label": lbl}
                for c, conf, lbl in mappings
            ]
            for std_concept, mappings in self.pending_mappings.items()
        }

        with open(destination, 'w') as f:
            json.dump(serializable_mappings, f, indent=2)
|
||||
|
||||
|
||||
def standardize_statement(statement_data: List[Dict[str, Any]], mapper: "ConceptMapper") -> List[Dict[str, Any]]:
    """
    Standardize labels in a statement using the concept mapper.

    Abstract items, dimension items and items without a concept or label are
    passed through untouched. Every other item is offered to
    ``mapper.map_concept``; when a standard label comes back, a shallow copy
    of the item is emitted with ``label`` replaced and the previous label kept
    under ``original_label``. Input items are never modified.

    Args:
        statement_data: List of statement line items
        mapper: ConceptMapper instance

    Returns:
        Statement data with standardized labels where possible. When no item
        is eligible for standardization the input list itself is returned.
    """
    # Items without their own statement type inherit the first item's
    statement_type = statement_data[0].get("statement_type", "") if statement_data else ""

    result = []
    any_eligible = False

    for item in statement_data:
        # Skip abstract elements and dimensions as they don't need standardization
        if item.get("is_abstract", False) or item.get("is_dimension", False):
            result.append(item)
            continue

        concept = item.get("concept", "")
        label = item.get("label", "")
        if not concept or not label:
            result.append(item)
            continue

        any_eligible = True
        context = {
            "statement_type": item.get("statement_type", "") or statement_type,
            "level": item.get("level", 0),
            "is_total": "total" in label.lower() or item.get("is_total", False)
        }

        standard_label = mapper.map_concept(concept, label, context)
        if not standard_label:
            # No mapping found, use original item
            result.append(item)
            continue

        # Create a shallow copy only when needed
        standardized_item = item.copy()
        standardized_item["label"] = standard_label
        standardized_item["original_label"] = label
        result.append(standardized_item)

    # Nothing was eligible: hand back the caller's list unchanged
    return result if any_eligible else statement_data
|
||||
|
||||
|
||||
def create_default_mappings_file(file_path: str) -> None:
    """
    Create the initial concept_mappings.json file with default mappings.

    This can be called during package installation or initialization.
    An existing file is never overwritten: the file is opened in
    exclusive-creation mode, so a file that appears between the existence
    check and the write is left untouched as well.

    Args:
        file_path: Path where to create the file
    """
    # The file already exists, don't overwrite it
    if os.path.exists(file_path):
        return

    # Ensure directory exists (an empty dirname means the current directory)
    directory = os.path.dirname(file_path)
    if directory:
        os.makedirs(directory, exist_ok=True)

    # Create a minimal set of mappings to get started
    minimal_mappings = {
        StandardConcept.REVENUE.value: [
            "us-gaap_Revenue",
            "us-gaap_SalesRevenueNet",
            "us-gaap_Revenues"
        ],
        StandardConcept.NET_INCOME.value: [
            "us-gaap_NetIncome",
            "us-gaap_NetIncomeLoss",
            "us-gaap_ProfitLoss"
        ],
        StandardConcept.TOTAL_ASSETS.value: [
            "us-gaap_Assets",
            "us-gaap_AssetsTotal"
        ]
    }

    try:
        # 'x' fails instead of truncating if the file was created in the meantime
        with open(file_path, 'x') as f:
            json.dump(minimal_mappings, f, indent=2)
    except FileExistsError:
        # Another writer got there first; keep its file
        return
|
||||
|
||||
# Initialize MappingStore - only loads from JSON
|
||||
def initialize_default_mappings(read_only: bool = False) -> MappingStore:
    """
    Build a MappingStore backed by the concept_mappings.json file.

    When the store is writable and its source file is missing, the file is
    created with the minimal default mappings. The store returned is the one
    constructed before that file is written.

    Args:
        read_only: If True, prevent writing changes back to the file (used in testing)

    Returns:
        MappingStore initialized with mappings from JSON file
    """
    store = MappingStore(read_only=read_only)

    # Read-only stores never trigger file creation (avoids test-initiated writes),
    # and an existing file needs nothing further
    if read_only or os.path.exists(store.source):
        return store

    create_default_mappings_file(store.source)
    return store
|
||||
Reference in New Issue
Block a user