Initial commit

kdusek
2025-12-09 12:13:01 +01:00
commit 8e654ed209
13332 changed files with 2695056 additions and 0 deletions

View File

@@ -0,0 +1,44 @@
# XBRL2 Standardization
This package provides functionality for standardizing XBRL concepts across different company filings.
## Overview
The standardization module maps company-specific XBRL concepts to standardized concept names,
enabling consistent presentation of financial statements regardless of the filing entity.
This is particularly useful for:
- Comparing financial data across different companies
- Building standardized reports and visualizations
- Creating consistent financial datasets for analysis
## Components
- `StandardConcept`: An enumeration of standard financial statement concepts
- `MappingStore`: Storage for mappings between company-specific and standard concepts
- `ConceptMapper`: Maps company-specific concepts to standard concepts using various techniques
- `standardize_statement`: Function to standardize a statement's labels
## Usage
```python
from edgar.xbrl.standardization import StandardConcept, initialize_default_mappings, ConceptMapper, standardize_statement
# Get the default mappings
store = initialize_default_mappings()
# Create a mapper
mapper = ConceptMapper(store)
# Standardize a statement
standardized_data = standardize_statement(statement_data, mapper)
```
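Mappings can also be extended at runtime. A minimal sketch (`acme_TotalRevenues` is a hypothetical company-specific concept ID used purely for illustration):
```python
# Load the default mappings without writing changes back to the packaged JSON file
store = initialize_default_mappings(read_only=True)

# Register a hypothetical company-specific concept under a standard label
store.add("acme_TotalRevenues", "Revenue")

# The concept now resolves to the standard label
print(store.get_standard_concept("acme_TotalRevenues"))  # -> "Revenue"
```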
## Concept Mappings
The standardized concept mappings are stored in the `concept_mappings.json` file included
in this package. This file maps standard concept names to lists of company-specific concept IDs.
The file is automatically loaded when initializing the `MappingStore` and can be extended
with new mappings as needed.
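The file uses a flat structure in which each standard label maps to the list of concept IDs it covers. An abridged excerpt:
```json
{
  "Revenue": [
    "us-gaap_Revenue",
    "us-gaap_Revenues",
    "us-gaap_SalesRevenueNet"
  ],
  "Net Income": [
    "us-gaap_NetIncome",
    "us-gaap_NetIncomeLoss"
  ]
}
```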

View File

@@ -0,0 +1,17 @@
"""
XBRL concept standardization package.
This package provides functionality to map company-specific XBRL concepts
to standardized concept names, enabling consistent presentation of financial
statements regardless of the filing entity.
"""
from edgar.xbrl.standardization.core import ConceptMapper, MappingStore, StandardConcept, initialize_default_mappings, standardize_statement
__all__ = [
'StandardConcept',
'MappingStore',
'ConceptMapper',
'standardize_statement',
'initialize_default_mappings'
]

View File

@@ -0,0 +1,21 @@
{
"concept_mappings": {
"Sales and Service Revenue": [
"brka_SalesAndServiceRevenue"
]
},
"hierarchy_rules": {
"Revenue": {
"components": [
"Sales and Service Revenue",
"Operating Lease Revenue"
],
"description": "Total revenue comprises sales/service revenue and operating lease income for holding company"
}
},
"business_context": {
"entity_type": "holding_company",
"industry": "diversified_conglomerate",
"description": "Berkshire Hathaway operates diverse businesses including insurance, utilities, railroads, and manufacturing"
}
}

View File

@@ -0,0 +1,64 @@
{
"entity_info": {
"name": "Microsoft Corporation",
"cik": "0000789019",
"ticker": "MSFT",
"description": "Microsoft-specific concept mappings for unique business terminology"
},
"concept_mappings": {
"_comment_msft_revenue": "Microsoft uses specific revenue categorization that differs from standard tech companies",
"Product Revenue": [
"msft_ProductRevenue",
"msft_WindowsCommercialRevenue",
"msft_WindowsConsumerRevenue",
"msft_OfficeCommercialRevenue"
],
"Service Revenue": [
"msft_ServiceRevenue",
"msft_CloudServicesRevenue",
"msft_ConsultingServicesRevenue"
],
"Subscription Revenue": [
"msft_Office365CommercialRevenue",
"msft_Office365ConsumerRevenue",
"msft_DynamicsRevenue"
],
"Platform Revenue": [
"msft_AzureRevenue",
"msft_XboxContentAndServicesRevenue"
],
"_comment_msft_expenses": "Microsoft has unique expense categorizations for sales and marketing vs G&A",
"Sales and Marketing Expense": [
"msft_SalesAndMarketingExpense",
"msft_AdvertisingAndPromotionExpense"
],
"Technical Support Expense": [
"msft_TechnicalSupportExpense",
"msft_CustomerSupportExpense"
]
},
"hierarchy_rules": {
"_comment": "Rules for handling Microsoft-specific hierarchical relationships",
"revenue_hierarchy": {
"parent": "Revenue",
"children": ["Product Revenue", "Service Revenue", "Subscription Revenue", "Platform Revenue"],
"calculation_rule": "sum"
},
"expense_hierarchy": {
"parent": "Operating Expenses",
"children": ["Sales and Marketing Expense", "Technical Support Expense"],
"calculation_rule": "sum"
}
}
}

View File

@@ -0,0 +1,54 @@
{
"metadata": {
"entity_identifier": "tsla",
"company_name": "Tesla, Inc.",
"cik": "1318605",
"priority": "high",
"created_date": "2024-06-25",
"last_updated": "2024-06-25",
"description": "Tesla-specific concept mappings to handle automotive, energy, and service revenue streams"
},
"concept_mappings": {
"Automotive Revenue": [
"tsla_AutomotiveRevenue",
"tsla_AutomotiveSales",
"tsla_VehicleRevenue"
],
"Automotive Leasing Revenue": [
"tsla_AutomotiveLeasing",
"tsla_AutomotiveLeasingRevenue",
"tsla_VehicleLeasingRevenue"
],
"Energy Revenue": [
"tsla_EnergyGenerationAndStorageRevenue",
"tsla_EnergyRevenue",
"tsla_SolarRevenue",
"tsla_EnergyStorageRevenue"
],
"Service Revenue": [
"tsla_ServicesAndOtherRevenue",
"tsla_ServiceRevenue",
"tsla_SuperchargerRevenue"
]
},
"hierarchy_rules": {
"Revenue": {
"children": [
"Automotive Revenue",
"Energy Revenue",
"Service Revenue"
]
},
"Automotive Revenue": {
"children": [
"Automotive Leasing Revenue"
]
}
},
"business_context": {
"primary_revenue_streams": ["automotive", "energy", "services"],
"revenue_model": "product_and_service",
"key_metrics": ["vehicle_deliveries", "energy_deployments"],
"industry": "automotive_technology"
}
}

View File

@@ -0,0 +1,353 @@
{
"_comment_revenue_hierarchy": "REVENUE HIERARCHY FIX: Separated total revenue from component revenue types to prevent duplicate labels. Contract and product revenue are components that should have distinct labels from total revenue.",
"Revenue": [
"us-gaap_Revenue",
"us-gaap_Revenues",
"us-gaap_SalesRevenueNet",
"us-gaap_OperatingRevenue"
],
"Contract Revenue": [
"us-gaap_RevenueFromContractWithCustomerExcludingAssessedTax",
"us-gaap_RevenueFromContractWithCustomerIncludingAssessedTax"
],
"Product Revenue": [
"us-gaap_SalesRevenueGoodsNet",
"us-gaap_ProductSales"
],
"Operating Lease Revenue": [
"us-gaap_OperatingLeaseLeaseIncome"
],
"_comment_cost_of_revenue_hierarchy": "COST OF REVENUE HIERARCHY FIX: Separated different cost types to prevent duplicate labels. Different business models (manufacturing, service, mixed) use different cost concepts that should have distinct labels for clarity.",
"Cost of Revenue": [
"us-gaap_CostOfRevenueAbstract"
],
"Total Cost of Revenue": [
"us-gaap_CostOfRevenue"
],
"Cost of Goods Sold": [
"us-gaap_CostOfGoodsSold"
],
"Cost of Goods and Services Sold": [
"us-gaap_CostOfGoodsAndServicesSold"
],
"Cost of Sales": [
"us-gaap_CostOfSales"
],
"Cost of Goods and Services Excluding Depreciation": [
"us-gaap_CostOfGoodsAndServiceExcludingDepreciationDepletionAndAmortization"
],
"Direct Operating Costs": [
"us-gaap_DirectOperatingCosts"
],
"Costs and Expenses": [
"us-gaap_CostsAndExpenses"
],
"Gross Profit": [
"us-gaap_GrossProfit"
],
"Operating Expenses": [
"us-gaap_NoninterestExpense",
"us-gaap_OperatingCostsAndExpenses",
"us-gaap_OperatingExpenses"
],
"Research and Development Expense": [
"us-gaap_ResearchAndDevelopmentCosts",
"us-gaap_ResearchAndDevelopmentExpense"
],
"_comment_sga_hierarchy": "SG&A HIERARCHY FIX: Separated total SG&A from components to prevent duplicate labels. Previously all three concepts below mapped to 'Selling, General and Administrative Expense' causing confusion when companies report both total and components.",
"Selling, General and Administrative Expense": [
"us-gaap_SellingGeneralAndAdministrativeExpense"
],
"General and Administrative Expense": [
"us-gaap_GeneralAndAdministrativeExpense",
"us-gaap_AdministrativeExpense"
],
"Selling Expense": [
"us-gaap_SellingAndMarketingExpense",
"us-gaap_SellingExpense"
],
"Marketing Expense": [
"us-gaap_MarketingExpense",
"us-gaap_AdvertisingExpense"
],
"Operating Income": [
"us-gaap_OperatingIncomeLoss",
"us-gaap_OperatingIncome",
"us-gaap_IncomeLossFromContinuingOperationsBeforeInterestAndTaxes"
],
"Nonoperating Income/Expense": [
"orcl_NonoperatingIncomeExpenseIncludingEliminationOfNetIncomeLossAttributableToNoncontrollingInterests",
"us-gaap_NonoperatingIncomeExpense"
],
"Interest Expense": [
"us-gaap_InterestAndDebtExpense",
"us-gaap_InterestExpense",
"us-gaap_InterestIncomeExpenseNet"
],
"Interest Expense (operating)": [
"us-gaap_InterestExpenseOperating"
],
"Interest Expense (non-operating)": [
"us-gaap_InterestExpenseNonoperating"
],
"_comment_income_before_tax_hierarchy": "INCOME BEFORE TAX HIERARCHY FIX: Separated total income before tax from component types to prevent duplicate labels. Continuing operations and extraordinary items are components that should have distinct labels.",
"Income Before Tax": [
"us-gaap_IncomeLossBeforeIncomeTaxes"
],
"Income Before Tax from Continuing Operations": [
"us-gaap_IncomeLossFromContinuingOperationsBeforeIncomeTaxes",
"us-gaap_IncomeLossFromContinuingOperationsBeforeIncomeTaxesExtraordinaryItemsNoncontrollingInterest",
"orcl_IncomeLossFromContinuingOperationsIncludingNoncontrollingInterestBeforeIncomeTaxesExtraordinaryItems"
],
"Income Tax Expense": [
"us-gaap_IncomeTaxesPaidNet",
"us-gaap_IncomeTaxExpenseBenefit"
],
"_comment_net_income_hierarchy": "NET INCOME HIERARCHY FIX: Separated total net income from component income types to prevent duplicate labels. Continuing operations income and profit/loss are components that should have distinct labels from total net income.",
"Net Income": [
"us-gaap_NetIncome",
"us-gaap_NetIncomeLoss"
],
"Net Income from Continuing Operations": [
"us-gaap_IncomeLossFromContinuingOperationsIncludingPortionAttributableToNoncontrollingInterest",
"us-gaap_IncomeLossFromContinuingOperations"
],
"Profit or Loss": [
"us-gaap_ProfitLoss"
],
"Net Income Attributable to Noncontrolling Interest": [
"us-gaap_NetIncomeLossAttributableToNonredeemableNoncontrollingInterest",
"us-gaap_NetIncomeLossAttributableToNoncontrollingInterest"
],
"Basic Net Income Available to Common Shareholders": [
"us-gaap_NetIncomeLossAvailableToCommonStockholdersBasic"
],
"Diluted Net Income Available to Common Shareholders": [
"us-gaap_NetIncomeLossAvailableToCommonStockholdersDiluted"
],
"Accumulated Other Comprehensive Income/Loss": [
"us-gaap_AccumulatedOtherComprehensiveIncomeLossNetOfTax"
],
"Earnings Per Share": [
"us-gaap_EarningsPerShareAbstract"
],
"Earnings Per Share (Basic)": [
"us-gaap_EarningsPerShareBasic"
],
"Earnings Per Share (Diluted)": [
"us-gaap_EarningsPerShareDiluted"
],
"Shares Outstanding": [
"us-gaap_WeightedAverageNumberOfSharesOutstandingAbstract"
],
"Shares Outstanding (Basic)": [
"us-gaap_WeightedAverageNumberOfSharesOutstandingBasic"
],
"Shares Outstanding (Diluted)": [
"us-gaap_WeightedAverageNumberOfDilutedSharesOutstanding"
],
"Cash and Cash Equivalents": [
"us-gaap_CashEquivalentsAtCarryingValue",
"us-gaap_Cash",
"us-gaap_CashAndCashEquivalentsAtCarryingValue",
"us-gaap_CashCashEquivalentsAndShortTermInvestments"
],
"Accounts Receivable": [
"us-gaap_AccountsReceivableNet",
"us-gaap_ReceivablesNetCurrent",
"us-gaap_AccountsReceivableNetCurrent",
"us-gaap_AccountsReceivableGross"
],
"Inventory": [
"us-gaap_InventoryGross",
"us-gaap_InventoryFinishedGoods",
"us-gaap_InventoryNet"
],
"Prepaid Expenses": [
"us-gaap_PrepaidExpenseAndOtherAssetsCurrent",
"us-gaap_PrepaidExpenseCurrent"
],
"Current Marketable Securities": [
"us-gaap_AvailableForSaleSecuritiesDebtSecuritiesCurrent",
"us-gaap_MarketableSecuritiesCurrent"
],
"Non Current Marketable Securities": [
"us-gaap_MarketableSecuritiesNoncurrent"
],
"Total Current Assets": [
"us-gaap_AssetsCurrent"
],
"Total Non Current Assets": [
"us-gaap_AssetsNoncurrent"
],
"Property, Plant and Equipment": [
"us-gaap_PropertyPlantAndEquipmentGross",
"us-gaap_PropertyPlantAndEquipmentNet",
"us-gaap_FixedAssets"
],
"Goodwill": [
"us-gaap_Goodwill"
],
"Intangible Assets": [
"us-gaap_IntangibleAssetsNetIncludingGoodwill",
"us-gaap_IntangibleAssetsNetExcludingGoodwill",
"us-gaap_FiniteLivedIntangibleAssetsNet"
],
"Total Assets": [
"us-gaap_Assets",
"us-gaap_AssetsTotal"
],
"Long-Term Investments": [
"us-gaap_LongTermInvestments"
],
"Accounts Payable": [
"us-gaap_AccountsPayableCurrent",
"us-gaap_AccountsPayableTradeCurrent"
],
"Accrued Liabilities": [
"us-gaap_OtherAccruedLiabilitiesCurrent",
"us-gaap_AccruedLiabilitiesCurrent",
"us-gaap_EmployeeRelatedLiabilitiesCurrent"
],
"Short Term Debt": [
"us-gaap_DebtCurrent",
"us-gaap_ShortTermBorrowings",
"us-gaap_LongTermDebtCurrent"
],
"Total Current Liabilities": [
"us-gaap_LiabilitiesCurrent"
],
"Total Non Current Liabilities": [
"us-gaap_LiabilitiesNoncurrent"
],
"Long Term Debt": [
"us-gaap_LongTermDebtAndCapitalLeaseObligations",
"us-gaap_LongTermDebt",
"us-gaap_LongTermBorrowings",
"us-gaap_LongTermDebtNoncurrent"
],
"Notes Payable, Current": [
"us-gaap_NotesPayableCurrent"
],
"Notes Payable, Non Current": [
"us-gaap_LongTermNotesAndLoans"
],
"Deferred Revenue": [
"us-gaap_DeferredRevenueNoncurrent",
"us-gaap_DeferredRevenueCurrent",
"us-gaap_DeferredRevenue"
],
"Total Liabilities": [
"us-gaap_LiabilitiesTotal",
"us-gaap_Liabilities"
],
"Common Stock Shares Outstanding": [
"us-gaap_CommonStockSharesOutstanding"
],
"Common Stock Shares Issued": [
"us-gaap_CommonStockSharesIssued"
],
"Common Stock": [
"us-gaap_CommonStocksIncludingAdditionalPaidInCapital",
"us-gaap_StockholdersEquityCommonStock",
"us-gaap_CommonStockValue"
],
"Preferred Stock": [
"us-gaap_PreferredStockValue"
],
"Treasury Stock Common Value": [
"us-gaap_TreasuryStockCommonValue",
"us-gaap_TreasuryStockValue"
],
"Retained Earnings": [
"us-gaap_RetainedEarnings",
"us-gaap_RetainedEarningsAccumulatedDeficit"
],
"Minority Interest": [
"us-gaap_MinorityInterest",
"us-gaap_NoncontrollingInterest"
],
"Total Stockholders' Equity": [
"us-gaap_EquityAttributableToParent",
"us-gaap_StockholdersEquity",
"us-gaap_StockholdersEquityIncludingPortionAttributableToNoncontrollingInterest",
"us-gaap_StockholdersEquityAttributableToParent"
],
"Total Liabilities and Stockholders' Equity": [
"us-gaap_LiabilitiesAndStockholdersEquity"
],
"Net Cash from Operating Activities": [
"us-gaap_NetCashProvidedByUsedInOperatingActivities",
"us-gaap_NetCashProvidedByUsedInOperatingActivitiesContinuingOperations"
],
"Net Cash from Investing Activities": [
"us-gaap_NetCashProvidedByUsedInInvestingActivities",
"us-gaap_NetCashProvidedByUsedInInvestingActivitiesContinuingOperations"
],
"Net Cash from Financing Activities": [
"us-gaap_NetCashProvidedByUsedInFinancingActivitiesContinuingOperations",
"us-gaap_NetCashProvidedByUsedInFinancingActivities"
],
"Net Change in Cash": [
"us-gaap_IncreaseDecreaseInCashAndCashEquivalents",
"us-gaap_CashAndCashEquivalentsPeriodIncreaseDecrease",
"us-gaap_CashCashEquivalentsRestrictedCashAndRestrictedCashEquivalentsPeriodIncreaseDecreaseIncludingExchangeRateEffect"
],
"Payments for Property, Plant and Equipment": [
"us-gaap_PaymentsToAcquirePropertyPlantAndEquipment"
],
"Payments of Dividends": [
"us-gaap_PaymentsOfDividends"
],
"Tax Withholding for Share-Based Compensation": [
"us-gaap_PaymentsRelatedToTaxWithholdingForShareBasedCompensation"
],
"Payments to Acquire Businesses": [
"us-gaap_PaymentsToAcquireBusinessesNetOfCashAcquired"
],
"Proceeds from Issuance of Common Stock": [
"us-gaap_ProceedsFromIssuanceOfCommonStock"
],
"Proceeds from Issuance of Long-Term Debt": [
"us-gaap_ProceedsFromIssuanceOfLongTermDebt"
],
"Proceeds from Maturities, Prepayments and Calls of Securities": [
"us-gaap_ProceedsFromMaturitiesPrepaymentsAndCallsOfAvailableForSaleSecurities"
],
"Proceeds from Sale and Maturity of Other Investments": [
"us-gaap_ProceedsFromSaleAndMaturityOfOtherInvestments"
],
"Proceeds from Sale of Debt Securities, ": [
"us-gaap_ProceedsFromSaleOfAvailableForSaleSecuritiesDebt"
],
"Proceeds from (Repayments of) Commercial Paper": [
"us-gaap_ProceedsFromRepaymentsOfCommercialPaper"
],
"Other Assets": [
"us-gaap_OtherAssets"
],
"Other Current Assets": [
"us-gaap_OtherAssetsCurrent"
],
"Other Non Current Assets": [
"us-gaap_OtherAssetsNoncurrent"
],
"Deferred Tax Assets": [
"us-gaap_DeferredIncomeTaxAssetsNet"
],
"Other Liabilities": [
"us-gaap_OtherLiabilities"
],
"Other Current Liabilities": [
"us-gaap_OtherLiabilitiesCurrent"
],
"Other Non Current Liabilities": [
"us-gaap_OtherLiabilitiesNoncurrent"
],
"Depreciation and Amortization": [
"us-gaap_AmortizationOfIntangibleAssets",
"us-gaap_Depreciation",
"us-gaap_DepreciationAndAmortization"
]
}

View File

@@ -0,0 +1,817 @@
"""
Module for standardizing XBRL concepts across different company filings.
This module provides functionality to map company-specific XBRL concepts
to standardized concept names, enabling consistent presentation of financial
statements regardless of the filing entity.
"""
import json
import os
from difflib import SequenceMatcher
from enum import Enum
from typing import Any, Dict, List, Optional, Set, Tuple
import pandas as pd
class StandardConcept(str, Enum):
"""
Standardized concept names for financial statements.
The enum value (string) is the display label used for presentation.
These labels should match keys in concept_mappings.json.
"""
# Balance Sheet - Assets
CASH_AND_EQUIVALENTS = "Cash and Cash Equivalents"
ACCOUNTS_RECEIVABLE = "Accounts Receivable"
INVENTORY = "Inventory"
PREPAID_EXPENSES = "Prepaid Expenses"
TOTAL_CURRENT_ASSETS = "Total Current Assets"
PROPERTY_PLANT_EQUIPMENT = "Property, Plant and Equipment"
GOODWILL = "Goodwill"
INTANGIBLE_ASSETS = "Intangible Assets"
TOTAL_ASSETS = "Total Assets"
# Balance Sheet - Liabilities
ACCOUNTS_PAYABLE = "Accounts Payable"
ACCRUED_LIABILITIES = "Accrued Liabilities"
SHORT_TERM_DEBT = "Short Term Debt"
TOTAL_CURRENT_LIABILITIES = "Total Current Liabilities"
LONG_TERM_DEBT = "Long Term Debt"
DEFERRED_REVENUE = "Deferred Revenue"
TOTAL_LIABILITIES = "Total Liabilities"
# Balance Sheet - Equity
COMMON_STOCK = "Common Stock"
RETAINED_EARNINGS = "Retained Earnings"
TOTAL_EQUITY = "Total Stockholders' Equity"
# Income Statement - Revenue Hierarchy
REVENUE = "Revenue"
CONTRACT_REVENUE = "Contract Revenue"
PRODUCT_REVENUE = "Product Revenue"
SERVICE_REVENUE = "Service Revenue"
SUBSCRIPTION_REVENUE = "Subscription Revenue"
LEASING_REVENUE = "Leasing Revenue"
# Industry-Specific Revenue Concepts
AUTOMOTIVE_REVENUE = "Automotive Revenue"
AUTOMOTIVE_LEASING_REVENUE = "Automotive Leasing Revenue"
ENERGY_REVENUE = "Energy Revenue"
SOFTWARE_REVENUE = "Software Revenue"
HARDWARE_REVENUE = "Hardware Revenue"
PLATFORM_REVENUE = "Platform Revenue"
# Income Statement - Expenses
COST_OF_REVENUE = "Cost of Revenue"
COST_OF_GOODS_SOLD = "Cost of Goods Sold"
COST_OF_GOODS_AND_SERVICES_SOLD = "Cost of Goods and Services Sold"
COST_OF_SALES = "Cost of Sales"
COSTS_AND_EXPENSES = "Costs and Expenses"
DIRECT_OPERATING_COSTS = "Direct Operating Costs"
GROSS_PROFIT = "Gross Profit"
OPERATING_EXPENSES = "Operating Expenses"
RESEARCH_AND_DEVELOPMENT = "Research and Development Expense"
# Enhanced Expense Hierarchy
SELLING_GENERAL_ADMIN = "Selling, General and Administrative Expense"
SELLING_EXPENSE = "Selling Expense"
GENERAL_ADMIN_EXPENSE = "General and Administrative Expense"
MARKETING_EXPENSE = "Marketing Expense"
SALES_EXPENSE = "Sales Expense"
# Other Income Statement
OPERATING_INCOME = "Operating Income"
INTEREST_EXPENSE = "Interest Expense"
INCOME_BEFORE_TAX = "Income Before Tax"
INCOME_BEFORE_TAX_CONTINUING_OPS = "Income Before Tax from Continuing Operations"
INCOME_TAX_EXPENSE = "Income Tax Expense"
NET_INCOME = "Net Income"
NET_INCOME_CONTINUING_OPS = "Net Income from Continuing Operations"
NET_INCOME_NONCONTROLLING = "Net Income Attributable to Noncontrolling Interest"
PROFIT_OR_LOSS = "Profit or Loss"
# Cash Flow Statement
CASH_FROM_OPERATIONS = "Net Cash from Operating Activities"
CASH_FROM_INVESTING = "Net Cash from Investing Activities"
CASH_FROM_FINANCING = "Net Cash from Financing Activities"
NET_CHANGE_IN_CASH = "Net Change in Cash"
@classmethod
def get_from_label(cls, label: str) -> Optional['StandardConcept']:
"""
Get a StandardConcept enum by its label value.
Args:
label: The label string to look up
Returns:
The corresponding StandardConcept or None if not found
"""
for concept in cls:
if concept.value == label:
return concept
return None
@classmethod
def get_all_values(cls) -> Set[str]:
"""
Get all label values defined in the enum.
Returns:
Set of all label strings
"""
return {concept.value for concept in cls}
class MappingStore:
"""
Storage for mappings between company-specific concepts and standard concepts.
Attributes:
source (str): Path to the JSON file storing the mappings
mappings (Dict[str, Set[str]]): Dictionary mapping standard concepts to sets of company concepts
company_mappings (Dict[str, Dict]): Company-specific mappings loaded from company_mappings/
merged_mappings (Dict[str, List[Tuple]]): Merged mappings with priority scoring
"""
def __init__(self, source: Optional[str] = None, validate_with_enum: bool = False, read_only: bool = False):
"""
Initialize the mapping store.
Args:
source: Path to the JSON file storing the mappings. If None, uses default location.
validate_with_enum: Whether to validate JSON keys against StandardConcept enum
read_only: If True, never save changes back to the file (used in testing)
"""
self.read_only = read_only
if source is None:
# Try a few different ways to locate the file, handling both development
# and installed package scenarios
self.source = None
# Default to a file in the same directory as this module (development mode)
module_dir = os.path.dirname(os.path.abspath(__file__))
potential_path = os.path.join(module_dir, "concept_mappings.json")
if os.path.exists(potential_path):
self.source = potential_path
# If not found, try to load from package data (installed package)
if self.source is None:
try:
import importlib.resources as pkg_resources
try:
# For Python 3.9+
with pkg_resources.files('edgar.xbrl.standardization').joinpath('concept_mappings.json').open('r') as f:
# Just read the file to see if it exists, we'll load it properly later
f.read(1)
self.source = potential_path # Use the same path as before
except (ImportError, FileNotFoundError, AttributeError):
# Fallback for older Python versions
try:
import pkg_resources as legacy_resources
if legacy_resources.resource_exists('edgar.xbrl.standardization', 'concept_mappings.json'):
self.source = potential_path # Use the same path as before
except (ImportError, FileNotFoundError):
pass
except ImportError:
pass
# If we still haven't found the file, use the default path anyway
# (it will fail gracefully in _load_mappings)
if self.source is None:
self.source = potential_path
else:
self.source = source
self.mappings = self._load_mappings()
# Load company-specific mappings (always enabled)
self.company_mappings = self._load_all_company_mappings()
self.merged_mappings = self._create_merged_mappings()
self.hierarchy_rules = self._load_hierarchy_rules()
# Validate the loaded mappings against StandardConcept enum
if validate_with_enum:
self.validate_against_enum()
def validate_against_enum(self) -> Tuple[bool, List[str]]:
"""
Validate that all keys in the mappings exist in StandardConcept enum.
Returns:
Tuple of (is_valid, list_of_missing_keys)
"""
standard_values = StandardConcept.get_all_values()
json_keys = set(self.mappings.keys())
# Find keys in JSON that aren't in enum
missing_in_enum = json_keys - standard_values
# Find enum values not in JSON (just for information)
missing_in_json = standard_values - json_keys
import logging
logger = logging.getLogger(__name__)
if missing_in_enum:
logger.warning("Found %d keys in concept_mappings.json that don't exist in StandardConcept enum: %s", len(missing_in_enum), sorted(missing_in_enum))
if missing_in_json:
logger.info("Found %d StandardConcept values without mappings in concept_mappings.json: %s", len(missing_in_json), sorted(missing_in_json))
return len(missing_in_enum) == 0, list(missing_in_enum)
def to_dataframe(self) -> pd.DataFrame:
"""
Convert mappings to a pandas DataFrame for analysis and visualization.
Returns:
DataFrame with columns for standard_concept and company_concept
"""
try:
import pandas as pd
except ImportError:
raise ImportError("pandas is required for to_dataframe() but is not installed") from None
rows = []
for standard_concept, company_concepts in self.mappings.items():
for company_concept in company_concepts:
rows.append({
'standard_concept': standard_concept,
'company_concept': company_concept
})
return pd.DataFrame(rows)
def _load_all_company_mappings(self) -> Dict[str, Dict]:
"""Load all company-specific mapping files from company_mappings/ directory."""
mappings = {}
company_dir = os.path.join(os.path.dirname(self.source or __file__), "company_mappings")
if os.path.exists(company_dir):
for file in os.listdir(company_dir):
if file.endswith("_mappings.json"):
entity_id = file.replace("_mappings.json", "")
try:
with open(os.path.join(company_dir, file), 'r') as f:
company_data = json.load(f)
mappings[entity_id] = company_data
except (FileNotFoundError, json.JSONDecodeError) as e:
import logging
logger = logging.getLogger(__name__)
logger.warning("Failed to load %s: %s", file, e)
return mappings
def _create_merged_mappings(self) -> Dict[str, List[Tuple[str, str, int]]]:
"""Create merged mappings with priority scoring.
Priority levels:
1. Core mappings (lowest)
2. Company mappings (higher)
3. Company-specific matches (highest when company detected)
Returns:
Dict mapping standard concepts to list of (company_concept, source, priority) tuples
"""
merged = {}
# Add core mappings (priority 1 - lowest)
for std_concept, company_concepts in self.mappings.items():
merged[std_concept] = []
for concept in company_concepts:
merged[std_concept].append((concept, "core", 1))
# Add company mappings (priority 2 - higher)
for entity_id, company_data in self.company_mappings.items():
concept_mappings = company_data.get("concept_mappings", {})
priority_level = 2
for std_concept, company_concepts in concept_mappings.items():
if std_concept not in merged:
merged[std_concept] = []
for concept in company_concepts:
merged[std_concept].append((concept, entity_id, priority_level))
return merged
def _load_hierarchy_rules(self) -> Dict[str, Dict]:
"""Load hierarchy rules from company mappings."""
all_rules = {}
# Add company hierarchy rules
for _entity_id, company_data in self.company_mappings.items():
hierarchy_rules = company_data.get("hierarchy_rules", {})
all_rules.update(hierarchy_rules)
return all_rules
def _detect_entity_from_concept(self, concept: str) -> Optional[str]:
"""Detect entity identifier from concept name prefix."""
if '_' in concept:
prefix = concept.split('_')[0].lower()
# Check if this prefix corresponds to a known company
if prefix in self.company_mappings:
return prefix
return None
def _load_mappings(self) -> Dict[str, Set[str]]:
"""
Load mappings from the JSON file.
Returns:
Dictionary mapping standard concepts to sets of company concepts
"""
data = None
# First try direct file access
try:
with open(self.source, 'r') as f:
data = json.load(f)
except (FileNotFoundError, IOError, PermissionError):
# If direct file access fails, try package resources
try:
try:
# Modern importlib.resources approach (Python 3.9+)
import importlib.resources as pkg_resources
try:
# For Python 3.9+
with pkg_resources.files('edgar.xbrl.standardization').joinpath('concept_mappings.json').open('r') as f:
data = json.load(f)
except (ImportError, FileNotFoundError, AttributeError):
# Fallback to legacy pkg_resources
import pkg_resources as legacy_resources
resource_string = legacy_resources.resource_string('edgar.xbrl.standardization', 'concept_mappings.json')
data = json.loads(resource_string)
except ImportError:
pass
except Exception:
# If all attempts fail, log a warning
import logging
logger = logging.getLogger(__name__)
logger.warning("Could not load concept_mappings.json. Standardization will be limited.")
# If we have data, process it based on its structure
if data:
# Check if the structure is flat or nested
if any(isinstance(value, dict) for value in data.values()):
# Nested structure by statement type
flattened = {}
for _statement_type, concepts in data.items():
for standard_concept, company_concepts in concepts.items():
flattened[standard_concept] = set(company_concepts)
return flattened
else:
# Flat structure
return {k: set(v) for k, v in data.items()}
# If all methods fail, return empty mappings
# The initialize_default_mappings function will create a file if needed
return {}
def _save_mappings(self) -> None:
"""Save mappings to the JSON file, unless in read_only mode."""
# Skip saving if in read_only mode
if self.read_only:
return
# Ensure directory exists
directory = os.path.dirname(self.source)
if directory and not os.path.exists(directory):
os.makedirs(directory, exist_ok=True)
# Convert sets to lists for JSON serialization
serializable_mappings = {k: list(v) for k, v in self.mappings.items()}
with open(self.source, 'w') as f:
json.dump(serializable_mappings, f, indent=2)
def add(self, company_concept: str, standard_concept: str) -> None:
"""
Add a mapping from a company concept to a standard concept.
Args:
company_concept: The company-specific concept
standard_concept: The standard concept
"""
if standard_concept not in self.mappings:
self.mappings[standard_concept] = set()
self.mappings[standard_concept].add(company_concept)
self._save_mappings()
def get_standard_concept(self, company_concept: str, context: Optional[Dict] = None) -> Optional[str]:
"""
Get the standard concept for a given company concept with priority-based resolution.
Args:
company_concept: The company-specific concept
context: Optional context information (not used in current implementation)
Returns:
The standard concept or None if not found
"""
# Use merged mappings with priority-based resolution
if self.merged_mappings:
# Detect company from concept prefix (e.g., 'tsla_AutomotiveRevenue' -> 'tsla')
detected_entity = self._detect_entity_from_concept(company_concept)
# Search through merged mappings with priority
candidates = []
for std_concept, mapping_list in self.merged_mappings.items():
for concept, source, priority in mapping_list:
if concept == company_concept:
# Boost priority if it matches detected entity
effective_priority = priority
if detected_entity and source == detected_entity:
effective_priority = 4 # Highest priority for exact company match
candidates.append((std_concept, effective_priority, source))
# Return highest priority match
if candidates:
best_match = max(candidates, key=lambda x: x[1])
import logging
logger = logging.getLogger(__name__)
logger.debug("Mapping applied: %s -> %s (source: %s, priority: %s)", company_concept, best_match[0], best_match[2], best_match[1])
return best_match[0]
# Fallback to core mappings
for standard_concept, company_concepts in self.mappings.items():
if company_concept in company_concepts:
return standard_concept
return None
def get_company_concepts(self, standard_concept: str) -> Set[str]:
"""
Get all company concepts mapped to a standard concept.
Args:
standard_concept: The standard concept
Returns:
Set of company concepts mapped to the standard concept
"""
return self.mappings.get(standard_concept, set())
class ConceptMapper:
"""
Maps company-specific concepts to standard concepts using various techniques.
Attributes:
mapping_store (MappingStore): Storage for concept mappings
pending_mappings (Dict): Low-confidence mappings pending review
_cache (Dict): In-memory cache of mapped concepts
"""
def __init__(self, mapping_store: MappingStore):
"""
Initialize the concept mapper.
Args:
mapping_store: Storage for concept mappings
"""
self.mapping_store = mapping_store
self.pending_mappings = {}
# Cache for faster lookups of previously mapped concepts
self._cache = {}
# Precompute lowercased standard concept values for faster comparison
self._std_concept_values = [(concept, concept.value.lower()) for concept in StandardConcept]
# Statement-specific keyword sets for faster contextual matching
self._bs_keywords = {'assets', 'liabilities', 'equity', 'cash', 'debt', 'inventory', 'receivable', 'payable'}
self._is_keywords = {'revenue', 'sales', 'income', 'expense', 'profit', 'loss', 'tax', 'earnings'}
self._cf_keywords = {'cash', 'operating', 'investing', 'financing', 'activities'}
def map_concept(self, company_concept: str, label: str, context: Dict[str, Any]) -> Optional[str]:
"""
Map a company concept to a standard concept.
Args:
company_concept: The company-specific concept
label: The label for the concept
context: Additional context information (statement type, calculation relationships, etc.)
Returns:
The standard concept or None if no mapping found
"""
# Use cache for faster lookups
cache_key = (company_concept, context.get('statement_type', ''))
if cache_key in self._cache:
return self._cache[cache_key]
# Check if we already have a mapping in the store
standard_concept = self.mapping_store.get_standard_concept(company_concept)
if standard_concept:
self._cache[cache_key] = standard_concept
return standard_concept
# Cache negative results too to avoid repeated inference
self._cache[cache_key] = None
return None
def _infer_mapping(self, company_concept: str, label: str, context: Dict[str, Any]) -> Tuple[Optional[str], float]:
"""
Infer a mapping between a company concept and a standard concept.
Args:
company_concept: The company-specific concept
label: The label for the concept
context: Additional context information
Returns:
Tuple of (standard_concept, confidence)
"""
# Fast path for common patterns
label_lower = label.lower()
# Quick matching for common concepts without full sequence matching
if "total assets" in label_lower:
return StandardConcept.TOTAL_ASSETS.value, 0.95
elif "revenue" in label_lower and len(label_lower) < 30: # Only match short labels to avoid false positives
return StandardConcept.REVENUE.value, 0.9
elif "net income" in label_lower and "parent" not in label_lower:
return StandardConcept.NET_INCOME.value, 0.9
# Faster direct match checking with precomputed lowercase values
for std_concept, std_value_lower in self._std_concept_values:
if std_value_lower == label_lower:
return std_concept.value, 1.0 # Perfect match
# Fall back to sequence matching for similarity
best_match = None
best_score = 0
# Only compute similarity if some relevant keywords are present to reduce workload
statement_type = context.get("statement_type", "")
# Statement type based filtering to reduce unnecessary comparisons
limited_concepts = []
if statement_type == "BalanceSheet":
if any(kw in label_lower for kw in self._bs_keywords):
# Filter to balance sheet concepts only
limited_concepts = [c for c, v in self._std_concept_values
if any(kw in v for kw in self._bs_keywords)]
elif statement_type == "IncomeStatement":
if any(kw in label_lower for kw in self._is_keywords):
# Filter to income statement concepts only
limited_concepts = [c for c, v in self._std_concept_values
if any(kw in v for kw in self._is_keywords)]
elif statement_type == "CashFlowStatement":
if any(kw in label_lower for kw in self._cf_keywords):
# Filter to cash flow concepts only
limited_concepts = [c for c, v in self._std_concept_values
if any(kw in v for kw in self._cf_keywords)]
# Use limited concepts if available, otherwise use all
concepts_to_check = limited_concepts if limited_concepts else [c for c, _ in self._std_concept_values]
# Calculate similarities for candidate concepts
for std_concept in concepts_to_check:
# Calculate similarity between labels
similarity = SequenceMatcher(None, label_lower, std_concept.value.lower()).ratio()
# Check if this is the best match so far
if similarity > best_score:
best_score = similarity
best_match = std_concept.value
# Apply specific contextual rules based on statement type
if statement_type == "BalanceSheet":
if "assets" in label_lower and "total" in label_lower:
if best_match == StandardConcept.TOTAL_ASSETS.value:
best_score = min(1.0, best_score + 0.2)
elif "liabilities" in label_lower and "total" in label_lower:
if best_match == StandardConcept.TOTAL_LIABILITIES.value:
best_score = min(1.0, best_score + 0.2)
elif "equity" in label_lower and ("total" in label_lower or "stockholders" in label_lower):
if best_match == StandardConcept.TOTAL_EQUITY.value:
best_score = min(1.0, best_score + 0.2)
elif statement_type == "IncomeStatement":
if any(term in label_lower for term in ["revenue", "sales"]):
if best_match == StandardConcept.REVENUE.value:
best_score = min(1.0, best_score + 0.2)
elif "net income" in label_lower:
if best_match == StandardConcept.NET_INCOME.value:
best_score = min(1.0, best_score + 0.2)
# Promote to 0.5 confidence if score close enough to help match
# more items that are almost at threshold
if 0.45 <= best_score < 0.5:
best_score = 0.5
# If confidence is too low, return None
if best_score < 0.5:
return None, 0.0
return best_match, best_score
def learn_mappings(self, filings: List[Dict[str, Any]]) -> None:
"""
Learn mappings from a list of filings.
Args:
filings: List of dicts with XBRL data
"""
# Pre-filter to only process unmapped concepts
mapped_concepts = set()
for _std_concept, company_concepts in self.mapping_store.mappings.items():
mapped_concepts.update(company_concepts)
# Process only unmapped filings
unmapped_filings = [f for f in filings if f.get("concept") not in mapped_concepts]
# Create a batch of mappings to add
mappings_to_add = {}
for filing in unmapped_filings:
concept = filing["concept"]
label = filing["label"]
context = {
"statement_type": filing.get("statement_type", ""),
"calculation_parent": filing.get("calculation_parent", ""),
"position": filing.get("position", "")
}
# Infer mapping and confidence
standard_concept, confidence = self._infer_mapping(concept, label, context)
# Handle based on confidence
if standard_concept and confidence >= 0.9:
if standard_concept not in mappings_to_add:
mappings_to_add[standard_concept] = set()
# Record the statement type with the concept so the cache update below uses the right key
mappings_to_add[standard_concept].add((concept, context["statement_type"]))
elif standard_concept and confidence >= 0.5:
if standard_concept not in self.pending_mappings:
self.pending_mappings[standard_concept] = []
self.pending_mappings[standard_concept].append((concept, confidence, label))
# Batch add all mappings at once
for std_concept, concepts in mappings_to_add.items():
for concept, stmt_type in concepts:
self.mapping_store.add(concept, std_concept)
# Update cache using the statement type recorded for this concept
cache_key = (concept, stmt_type)
self._cache[cache_key] = std_concept
def save_pending_mappings(self, destination: str) -> None:
"""
Save pending mappings to a file.
Args:
destination: Path to save the pending mappings
"""
# Convert to serializable format
serializable_mappings = {}
for std_concept, mappings in self.pending_mappings.items():
serializable_mappings[std_concept] = [
{"concept": c, "confidence": conf, "label": lbl}
for c, conf, lbl in mappings
]
with open(destination, 'w') as f:
json.dump(serializable_mappings, f, indent=2)
def standardize_statement(statement_data: List[Dict[str, Any]], mapper: ConceptMapper) -> List[Dict[str, Any]]:
"""
Standardize labels in a statement using the concept mapper.
Args:
statement_data: List of statement line items
mapper: ConceptMapper instance
Returns:
Statement data with standardized labels where possible
"""
# Pre-filter to identify which items need standardization
# This avoids unnecessary copying and processing
items_to_standardize = []
statement_type = statement_data[0].get("statement_type", "") if statement_data else ""
# First pass - identify which items need standardization and prepare context
for i, item in enumerate(statement_data):
# Skip abstract elements and dimensions as they don't need standardization
if item.get("is_abstract", False) or item.get("is_dimension", False):
continue
concept = item.get("concept", "")
if not concept:
continue
label = item.get("label", "")
if not label:
continue
# Build minimal context once, reuse for multiple calls
context = {
"statement_type": item.get("statement_type", "") or statement_type,
"level": item.get("level", 0),
"is_total": "total" in label.lower() or item.get("is_total", False)
}
items_to_standardize.append((i, concept, label, context))
# If no items need standardization, return early with unchanged data
if not items_to_standardize:
return statement_data
# Second pass - create result list with standardized items
result = []
# Track which indices need standardization for faster lookup
standardize_indices = {i for i, _, _, _ in items_to_standardize}
# Process all items
for i, item in enumerate(statement_data):
if i not in standardize_indices:
# Items that don't need standardization are used as-is
result.append(item)
continue
# Get the prepared data for this item
_, concept, label, context = next((x for x in items_to_standardize if x[0] == i), (None, None, None, None))
# Try to map the concept
standard_label = mapper.map_concept(concept, label, context)
# If we found a mapping, create a modified copy
if standard_label:
# Create a shallow copy only when needed
standardized_item = item.copy()
standardized_item["label"] = standard_label
standardized_item["original_label"] = label
result.append(standardized_item)
else:
# No mapping found, use original item
result.append(item)
return result
def create_default_mappings_file(file_path: str) -> None:
"""
Create the initial concept_mappings.json file with default mappings.
This can be called during package installation or initialization.
Args:
file_path: Path where to create the file
"""
# Ensure directory exists
directory = os.path.dirname(file_path)
if directory and not os.path.exists(directory):
os.makedirs(directory, exist_ok=True)
# The file already exists, don't overwrite it
if os.path.exists(file_path):
return
# Create a minimal set of mappings to get started
minimal_mappings = {
StandardConcept.REVENUE.value: [
"us-gaap_Revenue",
"us-gaap_SalesRevenueNet",
"us-gaap_Revenues"
],
StandardConcept.NET_INCOME.value: [
"us-gaap_NetIncome",
"us-gaap_NetIncomeLoss",
"us-gaap_ProfitLoss"
],
StandardConcept.TOTAL_ASSETS.value: [
"us-gaap_Assets",
"us-gaap_AssetsTotal"
]
}
# Write the file
with open(file_path, 'w') as f:
json.dump(minimal_mappings, f, indent=2)
# Initialize MappingStore - only loads from JSON
def initialize_default_mappings(read_only: bool = False) -> MappingStore:
"""
Initialize a MappingStore with mappings from the concept_mappings.json file.
Args:
read_only: If True, prevent writing changes back to the file (used in testing)
Returns:
MappingStore initialized with mappings from JSON file
"""
store = MappingStore(read_only=read_only)
# If JSON file doesn't exist, create it with minimal default mappings
# Only do this in non-read_only mode to avoid test-initiated file creation
if not read_only and not os.path.exists(store.source):
create_default_mappings_file(store.source)
return store
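
A minimal end-to-end sketch of how these pieces fit together, assuming the packaged `concept_mappings.json` is available; the single line item below is illustrative data rather than output from a real filing:
```python
from edgar.xbrl.standardization import initialize_default_mappings, ConceptMapper, standardize_statement

store = initialize_default_mappings(read_only=True)
mapper = ConceptMapper(store)

# One illustrative income statement row; real statement data carries more fields
statement_data = [
    {"concept": "us-gaap_NetIncomeLoss", "label": "Net income", "statement_type": "IncomeStatement"}
]

standardized = standardize_statement(statement_data, mapper)
print(standardized[0]["label"])           # "Net Income"
print(standardized[0]["original_label"])  # "Net income"
```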