Initial commit

kdusek
2025-12-09 12:13:01 +01:00
commit 8e654ed209
13332 changed files with 2695056 additions and 0 deletions

View File

@@ -0,0 +1,27 @@
"""
XBRL Parser Components.
This package provides specialized parser components for different aspects
of XBRL document processing. Each parser handles a specific responsibility
in the XBRL parsing workflow.
"""
from .base import BaseParser
from .calculation import CalculationParser
from .coordinator import XBRLParser
from .definition import DefinitionParser
from .instance import InstanceParser
from .labels import LabelsParser
from .presentation import PresentationParser
from .schema import SchemaParser
__all__ = [
'BaseParser',
'XBRLParser',
'SchemaParser',
'LabelsParser',
'PresentationParser',
'CalculationParser',
'DefinitionParser',
'InstanceParser',
]
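A minimal usage sketch (illustrative only; the import path is an assumption, since this commit view does not show file paths):

# Hypothetical import path - adjust to wherever this package actually lives
from edgar.xbrl.parsers import XBRLParser

parser = XBRLParser()
parser.parse_directory("filings/example-10k")  # directory of .xsd/.xml files (assumed layout)
print(f"Parsed {len(parser.facts)} facts across {len(parser.contexts)} contexts")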

View File

@@ -0,0 +1,148 @@
"""
Base parser functionality for XBRL parsing components.
This module provides common utilities and base functionality shared across
all XBRL parser components.
"""
from typing import Any, Dict, Optional, Tuple, Union
from lxml import etree as ET
from edgar.core import log
from edgar.xbrl.core import NAMESPACES
class BaseParser:
"""Base class for XBRL parser components with common functionality."""
def __init__(self):
"""Initialize base parser with common data structures."""
# Common namespaces and utilities available to all parsers
self.namespaces = NAMESPACES
    def _safe_parse_xml(self, content: Union[str, bytes]) -> ET.Element:
"""
Safely parse XML content with lxml, handling encoding declarations properly.
Args:
content: XML content as string or bytes
Returns:
parsed XML root element
"""
parser = ET.XMLParser(remove_blank_text=True, recover=True)
# Convert to bytes for safer parsing if needed
if isinstance(content, str):
content_bytes = content.encode('utf-8')
else:
content_bytes = content
# Parse with lxml
return ET.XML(content_bytes, parser)
def _parse_order_attribute(self, arc) -> float:
"""Parse order attribute from arc, checking both order and xlink:order."""
        # Check xlink:order first (some documents qualify it, though the
        # XBRL 2.1 spec defines order as an unprefixed attribute)
        order_value = arc.get('{http://www.w3.org/1999/xlink}order')
        if order_value is None:
            # Fall back to the standard unprefixed order attribute
            order_value = arc.get('order')
# Debug logging to understand what's in the XBRL document
if order_value is not None:
log.debug(f"Found order attribute: {order_value}")
else:
# Log all attributes to see what's actually there
all_attrs = dict(arc.attrib) if hasattr(arc, 'attrib') else {}
log.debug(f"No order attribute found. Available attributes: {all_attrs}")
try:
return float(order_value) if order_value is not None else 0.0
except (ValueError, TypeError):
return 0.0
def _extract_role_info(self, role_element) -> Dict[str, Any]:
"""
Extract role information from a role element.
Args:
role_element: XML element containing role definition
Returns:
Dictionary with role information
"""
role_info = {}
# Get role URI
role_uri = role_element.get('roleURI', '')
role_info['uri'] = role_uri
# Extract role definition/label
definition_elem = role_element.find('.//{http://www.xbrl.org/2003/linkbase}definition')
if definition_elem is not None:
role_info['definition'] = definition_elem.text or ''
else:
# Fallback: create definition from role URI
role_info['definition'] = role_uri.split('/')[-1].replace('_', ' ') if role_uri else ''
return role_info
    def _get_element_namespace_and_name(self, element_id: str) -> Tuple[str, str]:
"""
Extract namespace and local name from an element ID.
Args:
element_id: Element identifier (may include namespace prefix)
Returns:
Tuple of (namespace, local_name)
"""
if ':' in element_id:
prefix, local_name = element_id.split(':', 1)
# Map common prefixes to namespaces
namespace_map = {
'us-gaap': 'http://fasb.org/us-gaap/2024',
'dei': 'http://xbrl.sec.gov/dei/2024',
'invest': 'http://xbrl.sec.gov/invest/2013-01-31',
'country': 'http://xbrl.sec.gov/country/2023',
'currency': 'http://xbrl.sec.gov/currency/2023',
'exch': 'http://xbrl.sec.gov/exch/2023',
'naics': 'http://xbrl.sec.gov/naics/2023',
'sic': 'http://xbrl.sec.gov/sic/2023',
'stpr': 'http://xbrl.sec.gov/stpr/2023',
}
namespace = namespace_map.get(prefix, f'http://unknown.namespace/{prefix}')
return namespace, local_name
else:
return '', element_id
def _normalize_element_id(self, element_id: str) -> str:
"""
Normalize element ID to a consistent format.
Args:
element_id: Original element identifier
Returns:
Normalized element identifier
"""
if ':' in element_id:
prefix, name = element_id.split(':', 1)
return f"{prefix}_{name}"
return element_id
    def _log_parsing_progress(self, component: str, count: int, total: Optional[int] = None):
"""
Log parsing progress for debugging.
Args:
component: Name of component being parsed
count: Number of items processed
total: Total number of items (optional)
"""
if total:
log.debug(f"Parsed {count}/{total} {component}")
else:
log.debug(f"Parsed {count} {component}")

View File

@@ -0,0 +1,223 @@
"""
Calculation parser for XBRL documents.
This module handles parsing of XBRL calculation linkbases and building
calculation trees with weights for validation.
"""
from pathlib import Path
from typing import Any, Dict, List, Optional, Union
from edgar.xbrl.core import NAMESPACES, extract_element_id
from edgar.xbrl.models import CalculationNode, CalculationTree, ElementCatalog, Fact, XBRLProcessingError
from .base import BaseParser
class CalculationParser(BaseParser):
"""Parser for XBRL calculation linkbases."""
def __init__(self, calculation_roles: Dict[str, Dict[str, Any]],
calculation_trees: Dict[str, CalculationTree],
element_catalog: Dict[str, ElementCatalog],
facts: Dict[str, Fact]):
"""
Initialize calculation parser with data structure references.
Args:
calculation_roles: Reference to calculation roles dictionary
calculation_trees: Reference to calculation trees dictionary
element_catalog: Reference to element catalog dictionary
facts: Reference to facts dictionary
"""
super().__init__()
# Store references to data structures
self.calculation_roles = calculation_roles
self.calculation_trees = calculation_trees
self.element_catalog = element_catalog
self.facts = facts
def parse_calculation(self, file_path: Union[str, Path]) -> None:
"""Parse calculation linkbase file and build calculation trees."""
try:
content = Path(file_path).read_text()
self.parse_calculation_content(content)
except Exception as e:
raise XBRLProcessingError(f"Error parsing calculation file {file_path}: {str(e)}") from e
def parse_calculation_content(self, content: str) -> None:
"""Parse calculation linkbase content and build calculation trees."""
try:
# Use safe XML parsing method
root = self._safe_parse_xml(content)
# Extract calculation links
calculation_links = root.findall('.//{http://www.xbrl.org/2003/linkbase}calculationLink')
for link in calculation_links:
role = link.get('{http://www.w3.org/1999/xlink}role')
if not role:
continue
# Store role information
role_id = role.split('/')[-1] if '/' in role else role
role_def = role_id.replace('_', ' ')
self.calculation_roles[role] = {
'roleUri': role,
'definition': role_def,
'roleId': role_id
}
# Extract arcs
arcs = link.findall('.//{http://www.xbrl.org/2003/linkbase}calculationArc')
# Create relationships list
relationships = []
for arc in arcs:
from_ref = arc.get('{http://www.w3.org/1999/xlink}from')
to_ref = arc.get('{http://www.w3.org/1999/xlink}to')
order = self._parse_order_attribute(arc)
weight = float(arc.get('weight', '1.0'))
if not from_ref or not to_ref:
continue
# Find locators for from/to references
from_loc = link.find(f'.//*[@{{{NAMESPACES["xlink"]}}}label="{from_ref}"]')
to_loc = link.find(f'.//*[@{{{NAMESPACES["xlink"]}}}label="{to_ref}"]')
if from_loc is None or to_loc is None:
continue
from_href = from_loc.get('{http://www.w3.org/1999/xlink}href')
to_href = to_loc.get('{http://www.w3.org/1999/xlink}href')
if not from_href or not to_href:
continue
# Extract element IDs
from_element = extract_element_id(from_href)
to_element = extract_element_id(to_href)
# Add relationship
relationships.append({
'from_element': from_element,
'to_element': to_element,
'order': order,
'weight': weight
})
# Build calculation tree for this role
if relationships:
self._build_calculation_tree(role, relationships)
except Exception as e:
raise XBRLProcessingError(f"Error parsing calculation content: {str(e)}") from e
def _build_calculation_tree(self, role: str, relationships: List[Dict[str, Any]]) -> None:
"""
Build a calculation tree from relationships.
Args:
role: Extended link role URI
relationships: List of relationships (from_element, to_element, order, weight)
"""
# Group relationships by source element
from_map = {}
to_map = {}
for rel in relationships:
from_element = rel['from_element']
to_element = rel['to_element']
if from_element not in from_map:
from_map[from_element] = []
from_map[from_element].append(rel)
if to_element not in to_map:
to_map[to_element] = []
to_map[to_element].append(rel)
# Find root elements (appear as 'from' but not as 'to')
root_elements = set(from_map.keys()) - set(to_map.keys())
if not root_elements:
return # No root elements found
# Create calculation tree
tree = CalculationTree(
role_uri=role,
definition=self.calculation_roles[role]['definition'],
root_element_id=next(iter(root_elements)),
all_nodes={}
)
# Build tree recursively
for root_id in root_elements:
self._build_calculation_subtree(root_id, None, from_map, tree.all_nodes)
# Add tree to collection
self.calculation_trees[role] = tree
def _build_calculation_subtree(self, element_id: str, parent_id: Optional[str],
from_map: Dict[str, List[Dict[str, Any]]],
all_nodes: Dict[str, CalculationNode]) -> None:
"""
Recursively build a calculation subtree.
Args:
element_id: Current element ID
parent_id: Parent element ID
from_map: Map of relationships by source element
all_nodes: Dictionary to store all nodes
"""
# Create node
node = CalculationNode(
element_id=element_id,
parent=parent_id,
children=[]
)
# Add element information if available
elem_info = None
if element_id in self.element_catalog:
elem_info = self.element_catalog[element_id]
else:
            # Try the alternative element ID format (swap only the prefix separator)
            alt_element_id = element_id.replace(':', '_', 1) if ':' in element_id else element_id.replace('_', ':', 1)
if alt_element_id in self.element_catalog:
elem_info = self.element_catalog[alt_element_id]
if elem_info:
node.balance_type = elem_info.balance
node.period_type = elem_info.period_type
# Add to collection
all_nodes[element_id] = node
# Process children
if element_id in from_map:
# Sort children by order
children = sorted(from_map[element_id], key=lambda r: r['order'])
for rel in children:
child_id = rel['to_element']
# Add child to parent's children list
node.children.append(child_id)
# Set weight
weight = rel['weight']
# Recursively build child subtree
self._build_calculation_subtree(
child_id, element_id, from_map, all_nodes
)
# Update weight and order after child is built
if child_id in all_nodes:
all_nodes[child_id].weight = weight
all_nodes[child_id].order = rel['order']
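The weights and orders stored on each node are what a downstream validator would consume. A minimal sketch of such a check (illustrative; get_value is a hypothetical callback that resolves a node's numeric fact value for one context):

def check_calculation(tree, node_id, get_value):
    """Compare a parent's value to the weighted sum of its children."""
    node = tree.all_nodes[node_id]
    if not node.children:
        return None  # leaf node, nothing to verify
    total = sum(tree.all_nodes[child_id].weight * get_value(child_id)
                for child_id in node.children)
    return abs(get_value(node_id) - total) < 0.5  # tolerance for rounding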

View File

@@ -0,0 +1,382 @@
"""
Shared XBRL concept definitions for balance types and deprecated normalization lists.
This module contains balance type mappings for common US-GAAP concepts to support
the balance column in DataFrame exports without parsing full taxonomy schemas.
DEPRECATED: Static normalization concept lists (CONSISTENT_POSITIVE_CONCEPTS,
LEGITIMATE_NEGATIVE_CONCEPTS) are kept for historical reference but no longer used.
Testing confirmed that SEC XBRL instance data is already consistent across companies.
See Issue #463 analysis for details.
"""
# =============================================================================
# DEPRECATED CONCEPT LISTS (No longer used as of Issue #463)
# =============================================================================
# These lists were created to work around perceived inconsistencies in XBRL data.
# Testing revealed that raw SEC instance data is ALREADY consistent across companies.
#
# Historical context:
# - Issues #290, #334, #451 reported negative values for expenses
# - Root cause: EdgarTools was misusing calculation weights for display logic
# - These lists fixed symptoms but not the actual problem
# - Issue #463 removed calculation weight application during parsing
# - Result: Raw values preserved as-is (matching SEC CompanyFacts API)
#
# Kept for historical reference and potential future use cases.
# =============================================================================
CONSISTENT_POSITIVE_CONCEPTS = {
# Research and Development Expenses
'us-gaap_ResearchAndDevelopmentExpense',
'us_gaap_ResearchAndDevelopmentExpense',
'ResearchAndDevelopmentExpense',
# Selling, General & Administrative Expenses
'us-gaap_SellingGeneralAndAdministrativeExpense',
'us_gaap_SellingGeneralAndAdministrativeExpense',
'SellingGeneralAndAdministrativeExpense',
# General and Administrative Expenses (separate from SG&A)
'us-gaap_GeneralAndAdministrativeExpense',
'us_gaap_GeneralAndAdministrativeExpense',
'GeneralAndAdministrativeExpense',
# Selling Expenses
'us-gaap_SellingExpense',
'us_gaap_SellingExpense',
'SellingExpense',
# Marketing and Advertising Expenses
'us-gaap_SellingAndMarketingExpense',
'us_gaap_SellingAndMarketingExpense',
'SellingAndMarketingExpense',
'us-gaap_MarketingExpense',
'us_gaap_MarketingExpense',
'MarketingExpense',
'us-gaap_AdvertisingExpense',
'us_gaap_AdvertisingExpense',
'AdvertisingExpense',
# Share-based Compensation Expenses
'us-gaap_AllocatedShareBasedCompensationExpense',
'us_gaap_AllocatedShareBasedCompensationExpense',
'AllocatedShareBasedCompensationExpense',
'us-gaap_ShareBasedCompensationArrangementByShareBasedPaymentAwardExpenseRecognized',
'us_gaap_ShareBasedCompensationArrangementByShareBasedPaymentAwardExpenseRecognized',
'ShareBasedCompensationArrangementByShareBasedPaymentAwardExpenseRecognized',
# Operating Expenses (general)
'us-gaap_OperatingExpenses',
'us_gaap_OperatingExpenses',
'OperatingExpenses',
# Professional Services Expenses
'us-gaap_ProfessionalServiceFees',
'us_gaap_ProfessionalServiceFees',
'ProfessionalServiceFees',
# Compensation and Benefits
'us-gaap_LaborAndRelatedExpense',
'us_gaap_LaborAndRelatedExpense',
'LaborAndRelatedExpense',
'us-gaap_EmployeeBenefitsExpense',
'us_gaap_EmployeeBenefitsExpense',
'EmployeeBenefitsExpense',
# Cost of Revenue and Cost of Goods/Services Sold (Issue #290, #451)
'us-gaap_CostOfRevenue',
'us_gaap_CostOfRevenue',
'CostOfRevenue',
'us-gaap_CostOfGoodsAndServicesSold',
'us_gaap_CostOfGoodsAndServicesSold',
'CostOfGoodsAndServicesSold',
'us-gaap_CostOfGoodsSold',
'us_gaap_CostOfGoodsSold',
'CostOfGoodsSold',
'us-gaap_CostOfServices',
'us_gaap_CostOfServices',
'CostOfServices',
# Income Tax Expense (Issue #451)
'us-gaap_IncomeTaxExpenseBenefit',
'us_gaap_IncomeTaxExpenseBenefit',
'IncomeTaxExpenseBenefit',
'us-gaap_IncomeTaxRecoveryExpense',
'us_gaap_IncomeTaxRecoveryExpense',
'IncomeTaxRecoveryExpense',
# Cash Flow Statement - Financing Activities (cash outflows)
# These represent uses of cash that should always be positive
'us-gaap_PaymentsForRepurchaseOfCommonStock',
'us_gaap_PaymentsForRepurchaseOfCommonStock',
'PaymentsForRepurchaseOfCommonStock',
'us-gaap_PaymentsOfDividends',
'us_gaap_PaymentsOfDividends',
'PaymentsOfDividends',
'us-gaap_PaymentsOfDividendsCommonStock',
'us_gaap_PaymentsOfDividendsCommonStock',
'PaymentsOfDividendsCommonStock',
'us-gaap_PaymentsOfDividendsPreferredStockAndPreferenceStock',
'us_gaap_PaymentsOfDividendsPreferredStockAndPreferenceStock',
'PaymentsOfDividendsPreferredStockAndPreferenceStock'
}
# DEPRECATED: Concepts that can legitimately be negative
# This list is no longer used but kept for historical reference.
LEGITIMATE_NEGATIVE_CONCEPTS = {
# Interest expense/income that can be net negative
'us-gaap_InterestIncomeExpenseNet',
'us_gaap_InterestIncomeExpenseNet',
'InterestIncomeExpenseNet',
# Foreign exchange gains/losses
'us-gaap_ForeignCurrencyTransactionGainLossBeforeTax',
'us_gaap_ForeignCurrencyTransactionGainLossBeforeTax',
'ForeignCurrencyTransactionGainLossBeforeTax',
# Restructuring reversals/credits
'us-gaap_RestructuringChargesAndReversals',
'us_gaap_RestructuringChargesAndReversals',
'RestructuringChargesAndReversals'
}
# US-GAAP Balance Type Mappings (Issue #463)
#
# This mapping provides balance types for common US-GAAP concepts to support
# the balance column in DataFrame exports without requiring full taxonomy parsing.
#
# Balance types:
# - "debit": Assets, Expenses (increase with debits, decrease with credits)
# - "credit": Liabilities, Equity, Revenue (increase with credits, decrease with debits)
#
# TODO: Eventually replace with full US-GAAP taxonomy parser that follows schema imports
#
US_GAAP_BALANCE_TYPES = {
# ============================================================================
# ASSETS (Balance: debit)
# ============================================================================
# Current Assets
'us-gaap:Cash': 'debit',
'Cash': 'debit', # Short form
'us-gaap:CashAndCashEquivalentsAtCarryingValue': 'debit',
'CashAndCashEquivalentsAtCarryingValue': 'debit', # Short form
'us-gaap:CashEquivalentsAtCarryingValue': 'debit',
'us-gaap:RestrictedCashAndCashEquivalents': 'debit',
'us-gaap:MarketableSecurities': 'debit',
'us-gaap:AvailableForSaleSecuritiesDebtSecurities': 'debit',
'us-gaap:ShortTermInvestments': 'debit',
'us-gaap:AccountsReceivableNetCurrent': 'debit',
'us-gaap:AccountsReceivableGrossCurrent': 'debit',
'us-gaap:Inventory': 'debit',
'us-gaap:InventoryNet': 'debit',
'us-gaap:PrepaidExpenseAndOtherAssetsCurrent': 'debit',
'us-gaap:DeferredTaxAssetsNetCurrent': 'debit',
'us-gaap:OtherAssetsCurrent': 'debit',
'us-gaap:AssetsCurrent': 'debit',
# Non-Current Assets
'us-gaap:PropertyPlantAndEquipmentNet': 'debit',
'us-gaap:PropertyPlantAndEquipmentGross': 'debit',
'us-gaap:Land': 'debit',
'us-gaap:BuildingsAndImprovementsGross': 'debit',
'us-gaap:MachineryAndEquipmentGross': 'debit',
'us-gaap:Goodwill': 'debit',
'us-gaap:IntangibleAssetsNetExcludingGoodwill': 'debit',
'us-gaap:IntangibleAssetsGrossExcludingGoodwill': 'debit',
'us-gaap:LongTermInvestments': 'debit',
'us-gaap:DeferredTaxAssetsNetNoncurrent': 'debit',
'us-gaap:OtherAssetsNoncurrent': 'debit',
'us-gaap:AssetsNoncurrent': 'debit',
'us-gaap:Assets': 'debit',
'Assets': 'debit', # Short form
# ============================================================================
# LIABILITIES (Balance: credit)
# ============================================================================
# Current Liabilities
'us-gaap:AccountsPayableCurrent': 'credit',
'us-gaap:AccruedLiabilitiesCurrent': 'credit',
'us-gaap:DeferredRevenueCurrent': 'credit',
'us-gaap:ContractWithCustomerLiabilityCurrent': 'credit',
'us-gaap:ShortTermBorrowings': 'credit',
'us-gaap:LongTermDebtCurrent': 'credit',
'us-gaap:CommercialPaper': 'credit',
'us-gaap:AccruedIncomeTaxesCurrent': 'credit',
'us-gaap:DividendsPayableCurrent': 'credit',
'us-gaap:OtherLiabilitiesCurrent': 'credit',
'us-gaap:LiabilitiesCurrent': 'credit',
# Non-Current Liabilities
'us-gaap:LongTermDebtNoncurrent': 'credit',
'us-gaap:LongTermDebtAndCapitalLeaseObligations': 'credit',
'us-gaap:DeferredRevenueNoncurrent': 'credit',
'us-gaap:DeferredTaxLiabilitiesNoncurrent': 'credit',
'us-gaap:PensionAndOtherPostretirementDefinedBenefitPlansLiabilitiesNoncurrent': 'credit',
'us-gaap:OtherLiabilitiesNoncurrent': 'credit',
'us-gaap:LiabilitiesNoncurrent': 'credit',
'us-gaap:Liabilities': 'credit',
# ============================================================================
# EQUITY (Balance: credit)
# ============================================================================
'us-gaap:CommonStockValue': 'credit',
'us-gaap:CommonStockSharesIssued': 'credit',
'us-gaap:CommonStockSharesOutstanding': 'credit',
'us-gaap:PreferredStockValue': 'credit',
'us-gaap:AdditionalPaidInCapital': 'credit',
'us-gaap:AdditionalPaidInCapitalCommonStock': 'credit',
'us-gaap:RetainedEarningsAccumulatedDeficit': 'credit',
'us-gaap:TreasuryStockValue': 'debit', # Contra-equity (debit balance)
'us-gaap:AccumulatedOtherComprehensiveIncomeLossNetOfTax': 'credit',
'us-gaap:StockholdersEquity': 'credit',
'us-gaap:StockholdersEquityIncludingPortionAttributableToNoncontrollingInterest': 'credit',
'us-gaap:LiabilitiesAndStockholdersEquity': 'credit',
# ============================================================================
# REVENUE (Balance: credit)
# ============================================================================
'us-gaap:Revenues': 'credit',
'Revenues': 'credit', # Short form
'Revenue': 'credit', # Short form (singular)
'us-gaap:RevenueFromContractWithCustomerExcludingAssessedTax': 'credit',
'RevenueFromContractWithCustomerExcludingAssessedTax': 'credit', # Short form
'us-gaap:RevenueFromContractWithCustomerIncludingAssessedTax': 'credit',
'RevenueFromContractWithCustomerIncludingAssessedTax': 'credit', # Short form
'us-gaap:SalesRevenueNet': 'credit',
'us-gaap:SalesRevenueGoodsNet': 'credit',
'us-gaap:SalesRevenueServicesNet': 'credit',
'us-gaap:InterestAndDividendIncomeOperating': 'credit',
'us-gaap:InterestIncomeOther': 'credit',
'us-gaap:InvestmentIncomeInterest': 'credit',
'us-gaap:GainLossOnSaleOfPropertyPlantEquipment': 'credit',
'us-gaap:GainLossOnInvestments': 'credit',
'us-gaap:OtherNonoperatingIncomeExpense': 'credit',
# ============================================================================
# EXPENSES & COSTS (Balance: debit)
# ============================================================================
# Cost of Revenue
'us-gaap:CostOfRevenue': 'debit',
'us-gaap:CostOfGoodsAndServicesSold': 'debit',
'us-gaap:CostOfGoodsSold': 'debit',
'us-gaap:CostOfServices': 'debit',
# Operating Expenses
'us-gaap:ResearchAndDevelopmentExpense': 'debit',
'us-gaap:SellingGeneralAndAdministrativeExpense': 'debit',
'us-gaap:GeneralAndAdministrativeExpense': 'debit',
'us-gaap:SellingExpense': 'debit',
'us-gaap:SellingAndMarketingExpense': 'debit',
'us-gaap:MarketingExpense': 'debit',
'us-gaap:AdvertisingExpense': 'debit',
'us-gaap:DepreciationDepletionAndAmortization': 'debit',
'us-gaap:Depreciation': 'debit',
'us-gaap:AmortizationOfIntangibleAssets': 'debit',
'us-gaap:RestructuringCharges': 'debit',
'us-gaap:AssetImpairmentCharges': 'debit',
'us-gaap:ShareBasedCompensation': 'debit',
# Other Expenses
'us-gaap:InterestExpense': 'debit',
'us-gaap:InterestExpenseDebt': 'debit',
'us-gaap:IncomeTaxExpenseBenefit': 'debit',
'us-gaap:ProvisionForDoubtfulAccounts': 'debit',
# ============================================================================
# INCOME & TOTALS (Balance: credit)
# ============================================================================
'us-gaap:GrossProfit': 'credit',
'us-gaap:OperatingIncomeLoss': 'credit',
'us-gaap:IncomeLossFromContinuingOperationsBeforeIncomeTaxesExtraordinaryItemsNoncontrollingInterest': 'credit',
'us-gaap:IncomeLossFromContinuingOperations': 'credit',
'us-gaap:NetIncomeLoss': 'credit',
'us-gaap:NetIncomeLossAvailableToCommonStockholdersBasic': 'credit',
'us-gaap:NetIncomeLossAvailableToCommonStockholdersDiluted': 'credit',
'us-gaap:ComprehensiveIncomeNetOfTax': 'credit',
# ============================================================================
# CASH FLOW STATEMENT
# ============================================================================
# Operating Activities
'us-gaap:NetCashProvidedByUsedInOperatingActivities': 'debit',
'us-gaap:DepreciationAndAmortization': 'debit',
'us-gaap:ShareBasedCompensationArrangementByShareBasedPaymentAwardExpenseRecognized': 'debit',
'us-gaap:DeferredIncomeTaxExpenseBenefit': 'debit',
# Investing Activities
'us-gaap:NetCashProvidedByUsedInInvestingActivities': 'debit',
'us-gaap:PaymentsToAcquirePropertyPlantAndEquipment': 'credit', # Cash outflow
'us-gaap:PaymentsToAcquireBusinessesNetOfCashAcquired': 'credit', # Cash outflow
'us-gaap:PaymentsToAcquireMarketableSecurities': 'credit', # Cash outflow
'us-gaap:ProceedsFromSaleOfPropertyPlantAndEquipment': 'debit', # Cash inflow
'us-gaap:ProceedsFromSaleOfAvailableForSaleSecuritiesDebt': 'debit', # Cash inflow
# Financing Activities
'us-gaap:NetCashProvidedByUsedInFinancingActivities': 'debit',
'us-gaap:ProceedsFromIssuanceOfCommonStock': 'debit', # Cash inflow
'us-gaap:ProceedsFromIssuanceOfLongTermDebt': 'debit', # Cash inflow
'us-gaap:RepaymentsOfLongTermDebt': 'credit', # Cash outflow
'us-gaap:PaymentsOfDividends': 'credit', # Cash outflow
'us-gaap:PaymentsOfDividendsCommonStock': 'credit', # Cash outflow
'us-gaap:PaymentsForRepurchaseOfCommonStock': 'credit', # Cash outflow
}
def get_balance_type(concept: str) -> Optional[str]:
"""
Get the balance type for a concept.
Looks up the balance type from the static US-GAAP mapping, handling
both colon and underscore namespace separators.
Args:
concept: The concept name (e.g., 'us-gaap:Revenue' or 'us-gaap_Revenue' or 'us_gaap_Revenue')
Returns:
Balance type ('debit', 'credit', or None if not found)
Example:
>>> get_balance_type('us-gaap:Cash')
'debit'
>>> get_balance_type('us-gaap_Revenue')
'credit'
>>> get_balance_type('us_gaap_Revenue')
'credit'
        >>> get_balance_type('UnknownConcept') is None
        True
"""
# Try direct lookup first (standard form)
if concept in US_GAAP_BALANCE_TYPES:
return US_GAAP_BALANCE_TYPES[concept]
# Normalize to standard form: us-gaap:LocalName
# Handle common namespace prefix variations
normalized = concept
# Replace known namespace patterns
# us_gaap_Cash -> us-gaap:Cash
# us-gaap_Cash -> us-gaap:Cash
if 'us_gaap' in normalized:
normalized = normalized.replace('us_gaap_', 'us-gaap:')
normalized = normalized.replace('us_gaap:', 'us-gaap:')
elif 'us-gaap' in normalized:
normalized = normalized.replace('us-gaap_', 'us-gaap:')
# Try normalized form
if normalized in US_GAAP_BALANCE_TYPES:
return US_GAAP_BALANCE_TYPES[normalized]
# Try converting all underscores to colons (simple fallback)
concept_all_colons = concept.replace('_', ':')
if concept_all_colons in US_GAAP_BALANCE_TYPES:
return US_GAAP_BALANCE_TYPES[concept_all_colons]
return None
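A usage sketch (illustrative) showing the separator normalization in action:

for concept in ['us-gaap:Cash', 'us_gaap_Revenues', 'us-gaap_InterestExpense', 'MadeUpConcept']:
    print(concept, '->', get_balance_type(concept))
# us-gaap:Cash -> debit
# us_gaap_Revenues -> credit
# us-gaap_InterestExpense -> debit
# MadeUpConcept -> None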

View File

@@ -0,0 +1,291 @@
"""
XBRL Parser Coordinator.
This module provides the main XBRLParser class that coordinates parsing
workflow across all specialized parser components while maintaining
API compatibility with the original monolithic parser.
"""
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, Union
from edgar.core import log
from edgar.xbrl.models import (
Axis,
CalculationTree,
Context,
Domain,
ElementCatalog,
Fact,
PresentationTree,
Table,
XBRLProcessingError,
)
from .calculation import CalculationParser
from .definition import DefinitionParser
from .instance import InstanceParser
from .labels import LabelsParser
from .presentation import PresentationParser
from .schema import SchemaParser
class XBRLParser:
"""
Coordinated XBRL parser that delegates to specialized component parsers.
This class maintains full API compatibility with the original monolithic
XBRLParser while providing improved maintainability through component separation.
"""
def __init__(self):
"""Initialize the coordinated XBRL parser with all data structures."""
# Core data structures
self.element_catalog: Dict[str, ElementCatalog] = {}
self.contexts: Dict[str, Context] = {}
self.facts: Dict[str, Fact] = {}
self.units: Dict[str, Any] = {}
self.footnotes: Dict[str, Any] = {}
# Presentation structures
self.presentation_roles: Dict[str, Dict[str, Any]] = {}
self.presentation_trees: Dict[str, PresentationTree] = {}
# Calculation structures
self.calculation_roles: Dict[str, Dict[str, Any]] = {}
self.calculation_trees: Dict[str, CalculationTree] = {}
# Definition (dimensional) structures
self.definition_roles: Dict[str, Dict[str, Any]] = {}
self.tables: Dict[str, List[Table]] = {}
self.axes: Dict[str, Axis] = {}
self.domains: Dict[str, Domain] = {}
# Entity information
self.entity_info: Dict[str, Any] = {}
self.dei_facts: Dict[str, Fact] = {}
# Reporting periods
self.reporting_periods: List[Dict[str, Any]] = []
# Mapping of context IDs to period identifiers for easy lookup
self.context_period_map: Dict[str, str] = {}
# Initialize component parsers
self._init_parsers()
def _init_parsers(self):
"""Initialize all component parsers with shared data structures."""
# Create component parsers with references to shared data structures
self.schema_parser = SchemaParser(
element_catalog=self.element_catalog
)
self.labels_parser = LabelsParser(
element_catalog=self.element_catalog
)
self.presentation_parser = PresentationParser(
presentation_roles=self.presentation_roles,
presentation_trees=self.presentation_trees,
element_catalog=self.element_catalog
)
self.calculation_parser = CalculationParser(
calculation_roles=self.calculation_roles,
calculation_trees=self.calculation_trees,
element_catalog=self.element_catalog,
facts=self.facts
)
self.definition_parser = DefinitionParser(
definition_roles=self.definition_roles,
tables=self.tables,
axes=self.axes,
domains=self.domains,
element_catalog=self.element_catalog
)
self.instance_parser = InstanceParser(
contexts=self.contexts,
facts=self.facts,
units=self.units,
footnotes=self.footnotes,
calculation_trees=self.calculation_trees,
entity_info=self.entity_info,
reporting_periods=self.reporting_periods,
context_period_map=self.context_period_map
)
# Set up cross-references for embedded linkbase processing
self.schema_parser.set_linkbase_parsers(
labels_parser=self.labels_parser,
presentation_parser=self.presentation_parser,
calculation_parser=self.calculation_parser,
definition_parser=self.definition_parser
)
def _create_normalized_fact_key(self, element_id: str, context_ref: str, instance_id: Optional[int] = None) -> str:
"""
Create a normalized fact key using underscore format.
Args:
element_id: The element ID
context_ref: The context reference
instance_id: Optional instance ID for duplicate facts
Returns:
Normalized key in format: element_id_context_ref[_instance_id]
"""
return self.instance_parser._create_normalized_fact_key(element_id, context_ref, instance_id)
def get_facts_by_key(self, element_id: str, context_ref: str) -> List[Fact]:
"""Get all facts matching the given element ID and context reference.
This method handles both single facts and duplicate facts using the hybrid storage approach.
For single facts, it returns a list with one fact. For duplicates, it returns all instances.
Args:
element_id: The element ID to look up
context_ref: The context reference
Returns:
List of matching facts
"""
# Create base key for lookup
base_key = self._create_normalized_fact_key(element_id, context_ref)
# Check if single fact exists
if base_key in self.facts:
return [self.facts[base_key]]
# Check for duplicate facts (with instance IDs)
matching_facts = []
instance_id = 0
while True:
instance_key = self._create_normalized_fact_key(element_id, context_ref, instance_id)
if instance_key in self.facts:
matching_facts.append(self.facts[instance_key])
instance_id += 1
else:
break
return matching_facts
def get_fact(self, element_id: str, context_ref: str) -> Optional[Fact]:
"""Get a single fact by element ID and context reference.
Returns the first fact if multiple instances exist.
Args:
element_id: The element ID to look up
context_ref: The context reference
Returns:
The fact if found, None otherwise
"""
facts = self.get_facts_by_key(element_id, context_ref)
return facts[0] if facts else None
def parse_directory(self, directory_path: Union[str, Path]) -> None:
"""
Parse all XBRL files in a directory.
Args:
directory_path: Path to directory containing XBRL files
"""
try:
directory = Path(directory_path)
if not directory.is_dir():
raise XBRLProcessingError(f"Directory not found: {directory_path}")
log.debug(f"Parsing XBRL directory: {directory}")
# Parse schema files first to build element catalog
schema_files = list(directory.glob('*.xsd'))
for schema_file in schema_files:
log.debug(f"Parsing schema: {schema_file}")
self.schema_parser.parse_schema(schema_file)
# Parse linkbase files
linkbase_patterns = [
('*_lab.xml', self.labels_parser.parse_labels),
('*_pre.xml', self.presentation_parser.parse_presentation),
('*_cal.xml', self.calculation_parser.parse_calculation),
('*_def.xml', self.definition_parser.parse_definition),
]
for pattern, parser_method in linkbase_patterns:
linkbase_files = list(directory.glob(pattern))
for linkbase_file in linkbase_files:
log.debug(f"Parsing linkbase: {linkbase_file}")
parser_method(linkbase_file)
# Parse instance files last (they depend on schemas and linkbases)
instance_files = list(directory.glob('*.xml'))
# Filter out linkbase files
instance_files = [f for f in instance_files if not any(
f.name.endswith(suffix) for suffix in ['_lab.xml', '_pre.xml', '_cal.xml', '_def.xml']
)]
for instance_file in instance_files:
log.debug(f"Parsing instance: {instance_file}")
self.instance_parser.parse_instance(instance_file)
log.info(f"Successfully parsed XBRL directory with {len(self.facts)} facts")
except Exception as e:
raise XBRLProcessingError(f"Error parsing directory {directory_path}: {str(e)}") from e
# Delegate methods to component parsers for API compatibility
def parse_schema(self, file_path: Union[str, Path]) -> None:
"""Parse schema file and extract element information."""
return self.schema_parser.parse_schema(file_path)
def parse_schema_content(self, content: str) -> None:
"""Parse schema content and extract element information."""
return self.schema_parser.parse_schema_content(content)
def parse_labels(self, file_path: Union[str, Path]) -> None:
"""Parse label linkbase file and extract label information."""
return self.labels_parser.parse_labels(file_path)
def parse_labels_content(self, content: str) -> None:
"""Parse label linkbase content and extract label information."""
return self.labels_parser.parse_labels_content(content)
def parse_presentation(self, file_path: Union[str, Path]) -> None:
"""Parse presentation linkbase file and build presentation trees."""
return self.presentation_parser.parse_presentation(file_path)
def parse_presentation_content(self, content: str) -> None:
"""Parse presentation linkbase content and build presentation trees."""
return self.presentation_parser.parse_presentation_content(content)
def parse_calculation(self, file_path: Union[str, Path]) -> None:
"""Parse calculation linkbase file and build calculation trees."""
return self.calculation_parser.parse_calculation(file_path)
def parse_calculation_content(self, content: str) -> None:
"""Parse calculation linkbase content and build calculation trees."""
return self.calculation_parser.parse_calculation_content(content)
def parse_definition(self, file_path: Union[str, Path]) -> None:
"""Parse definition linkbase file and build dimensional structures."""
return self.definition_parser.parse_definition(file_path)
def parse_definition_content(self, content: str) -> None:
"""Parse definition linkbase content and build dimensional structures."""
return self.definition_parser.parse_definition_content(content)
def parse_instance(self, file_path: Union[str, Path]) -> None:
"""Parse instance document file and extract contexts, facts, and units."""
return self.instance_parser.parse_instance(file_path)
def parse_instance_content(self, content: str) -> None:
"""Parse instance document content and extract contexts, facts, and units."""
return self.instance_parser.parse_instance_content(content)
    def count_facts(self, content: str) -> Tuple[int, int]:
        """Count unique facts and total fact instances in the instance document."""
return self.instance_parser.count_facts(content)
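An end-to-end sketch of the delegating API (illustrative; the directory path and context ID are assumptions):

parser = XBRLParser()
parser.parse_directory("filings/example-10k")  # schemas, linkbases, then instances

# Both separator forms work, since keys are normalized to underscores internally
fact = parser.get_fact('us-gaap:Revenues', 'c-20')
if fact is not None:
    print(fact.value, fact.unit_ref)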

View File

@@ -0,0 +1,235 @@
"""
Definition parser for XBRL documents.
This module handles parsing of XBRL definition linkbases and building
dimensional structures like tables, axes, and domains.
"""
from pathlib import Path
from typing import Any, Dict, List, Union
from edgar.xbrl.core import NAMESPACES, STANDARD_LABEL, extract_element_id
from edgar.xbrl.models import Axis, Domain, ElementCatalog, Table, XBRLProcessingError
from .base import BaseParser
class DefinitionParser(BaseParser):
"""Parser for XBRL definition linkbases."""
def __init__(self, definition_roles: Dict[str, Dict[str, Any]],
tables: Dict[str, List[Table]],
axes: Dict[str, Axis],
domains: Dict[str, Domain],
element_catalog: Dict[str, ElementCatalog]):
"""
Initialize definition parser with data structure references.
Args:
definition_roles: Reference to definition roles dictionary
tables: Reference to tables dictionary
axes: Reference to axes dictionary
domains: Reference to domains dictionary
element_catalog: Reference to element catalog dictionary
"""
super().__init__()
# Store references to data structures
self.definition_roles = definition_roles
self.tables = tables
self.axes = axes
self.domains = domains
self.element_catalog = element_catalog
def parse_definition(self, file_path: Union[str, Path]) -> None:
"""Parse definition linkbase file and build dimensional structures."""
try:
content = Path(file_path).read_text()
self.parse_definition_content(content)
except Exception as e:
raise XBRLProcessingError(f"Error parsing definition file {file_path}: {str(e)}") from e
def parse_definition_content(self, content: str) -> None:
"""Parse definition linkbase content and build dimensional structures."""
try:
root = self._safe_parse_xml(content)
# Extract definition links
definition_links = root.findall('.//{http://www.xbrl.org/2003/linkbase}definitionLink')
for link in definition_links:
role = link.get('{http://www.w3.org/1999/xlink}role')
if not role:
continue
# Store role information
role_id = role.split('/')[-1] if '/' in role else role
role_def = role_id.replace('_', ' ')
self.definition_roles[role] = {
'roleUri': role,
'definition': role_def,
'roleId': role_id
}
# Extract arcs
arcs = link.findall('.//{http://www.xbrl.org/2003/linkbase}definitionArc')
# Create relationships list
relationships = []
for arc in arcs:
from_ref = arc.get('{http://www.w3.org/1999/xlink}from')
to_ref = arc.get('{http://www.w3.org/1999/xlink}to')
order = self._parse_order_attribute(arc)
# Get the arcrole - this is important for identifying dimensional relationships
arcrole = arc.get('{http://www.w3.org/1999/xlink}arcrole')
if not from_ref or not to_ref or not arcrole:
continue
# Find locators for from/to references
from_loc = link.find(f'.//*[@{{{NAMESPACES["xlink"]}}}label="{from_ref}"]')
to_loc = link.find(f'.//*[@{{{NAMESPACES["xlink"]}}}label="{to_ref}"]')
if from_loc is None or to_loc is None:
continue
from_href = from_loc.get('{http://www.w3.org/1999/xlink}href')
to_href = to_loc.get('{http://www.w3.org/1999/xlink}href')
if not from_href or not to_href:
continue
# Extract element IDs
from_element = extract_element_id(from_href)
to_element = extract_element_id(to_href)
# Add relationship with arcrole
relationships.append({
'from_element': from_element,
'to_element': to_element,
'order': order,
'arcrole': arcrole
})
# Process dimensional structures from relationships
self._process_dimensional_relationships(role, relationships)
except Exception as e:
raise XBRLProcessingError(f"Error parsing definition content: {str(e)}") from e
def _process_dimensional_relationships(self, role: str, relationships: List[Dict[str, Any]]) -> None:
"""
Process dimensional relationships to build tables, axes, and domains.
Args:
role: Extended link role URI
relationships: List of dimensional relationships
"""
# XBRL Dimensions arcrole URIs
HYPERCUBE_DIMENSION = "http://xbrl.org/int/dim/arcrole/hypercube-dimension"
DIMENSION_DOMAIN = "http://xbrl.org/int/dim/arcrole/dimension-domain"
DOMAIN_MEMBER = "http://xbrl.org/int/dim/arcrole/domain-member"
ALL = "http://xbrl.org/int/dim/arcrole/all"
# Group relationships by arcrole
grouped_rels = {}
for rel in relationships:
arcrole = rel['arcrole']
if arcrole not in grouped_rels:
grouped_rels[arcrole] = []
grouped_rels[arcrole].append(rel)
# Process hypercube-dimension relationships to identify tables and axes
hypercube_axes = {} # Map of hypercubes to their axes
if HYPERCUBE_DIMENSION in grouped_rels:
for rel in grouped_rels[HYPERCUBE_DIMENSION]:
table_id = rel['from_element']
axis_id = rel['to_element']
if table_id not in hypercube_axes:
hypercube_axes[table_id] = []
hypercube_axes[table_id].append(axis_id)
# Create or update axis
if axis_id not in self.axes:
self.axes[axis_id] = Axis(
element_id=axis_id,
label=self._get_element_label(axis_id)
)
# Process dimension-domain relationships to link axes to domains
if DIMENSION_DOMAIN in grouped_rels:
for rel in grouped_rels[DIMENSION_DOMAIN]:
axis_id = rel['from_element']
domain_id = rel['to_element']
# Link domain to axis
if axis_id in self.axes:
self.axes[axis_id].domain_id = domain_id
# Create or update domain
if domain_id not in self.domains:
self.domains[domain_id] = Domain(
element_id=domain_id,
label=self._get_element_label(domain_id)
)
# Process domain-member relationships to build domain hierarchies
if DOMAIN_MEMBER in grouped_rels:
# Group by parent (domain) element
domain_members = {}
for rel in grouped_rels[DOMAIN_MEMBER]:
domain_id = rel['from_element']
member_id = rel['to_element']
if domain_id not in domain_members:
domain_members[domain_id] = []
domain_members[domain_id].append(member_id)
# Also create the domain if it doesn't exist
if domain_id not in self.domains:
self.domains[domain_id] = Domain(
element_id=domain_id,
label=self._get_element_label(domain_id)
)
# Update domains with their members
for domain_id, members in domain_members.items():
if domain_id in self.domains:
self.domains[domain_id].members = members
# Process 'all' relationships to identify line items and build hypercubes (tables)
if ALL in grouped_rels:
tables_by_role = []
for rel in grouped_rels[ALL]:
                # The 'all' arc runs from the line-items (primary item)
                # element to the hypercube (table) element
                line_items_id = rel['from_element']
                table_id = rel['to_element']
# Only process if this table has axes defined
if table_id in hypercube_axes:
table = Table(
element_id=table_id,
label=self._get_element_label(table_id),
role_uri=role,
axes=hypercube_axes[table_id],
line_items=[line_items_id],
closed=False # Default
)
tables_by_role.append(table)
# Add tables to collection
if tables_by_role:
self.tables[role] = tables_by_role
def _get_element_label(self, element_id: str) -> str:
"""Get the label for an element, falling back to the element ID if not found."""
if element_id in self.element_catalog and self.element_catalog[element_id].labels:
# Use standard label if available
standard_label = self.element_catalog[element_id].labels.get(STANDARD_LABEL)
if standard_label:
return standard_label
return element_id # Fallback to element ID
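A minimal definition-linkbase fragment of the kind this parser consumes (illustrative; the element names and hrefs are made up, and it assumes extract_element_id('ex.xsd#ex_StatementTable') yields 'ex_StatementTable'):

SAMPLE_DEF = """<linkbase xmlns="http://www.xbrl.org/2003/linkbase"
                          xmlns:xlink="http://www.w3.org/1999/xlink">
  <definitionLink xlink:role="http://example.com/role/Disclosure">
    <loc xlink:label="tbl" xlink:href="ex.xsd#ex_StatementTable"/>
    <loc xlink:label="ax" xlink:href="ex.xsd#ex_ProductAxis"/>
    <loc xlink:label="dom" xlink:href="ex.xsd#ex_ProductDomain"/>
    <loc xlink:label="li" xlink:href="ex.xsd#ex_LineItems"/>
    <definitionArc xlink:from="tbl" xlink:to="ax"
        xlink:arcrole="http://xbrl.org/int/dim/arcrole/hypercube-dimension"/>
    <definitionArc xlink:from="ax" xlink:to="dom"
        xlink:arcrole="http://xbrl.org/int/dim/arcrole/dimension-domain"/>
    <definitionArc xlink:from="li" xlink:to="tbl"
        xlink:arcrole="http://xbrl.org/int/dim/arcrole/all"/>
  </definitionLink>
</linkbase>"""

dp = DefinitionParser({}, {}, {}, {}, {})
dp.parse_definition_content(SAMPLE_DEF)
# dp.axes now maps 'ex_ProductAxis' to an Axis whose domain_id is 'ex_ProductDomain',
# and dp.tables holds one Table with axes=['ex_ProductAxis'] under the role URI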

View File

@@ -0,0 +1,768 @@
"""
Instance parser for XBRL documents.
This module handles parsing of XBRL instance documents including facts, contexts,
units, footnotes, and entity information extraction.
"""
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, Union
from lxml import etree as ET
from edgar.core import log
from edgar.xbrl.core import NAMESPACES, classify_duration
from edgar.xbrl.models import Context, Fact, XBRLProcessingError
from .base import BaseParser
class InstanceParser(BaseParser):
"""Parser for XBRL instance documents."""
def __init__(self, contexts: Dict[str, Context], facts: Dict[str, Fact],
units: Dict[str, Any], footnotes: Dict[str, Any],
calculation_trees: Dict[str, Any], entity_info: Dict[str, Any],
reporting_periods: List[Dict[str, Any]], context_period_map: Dict[str, str]):
"""
Initialize instance parser with data structure references.
Args:
contexts: Reference to contexts dictionary
facts: Reference to facts dictionary
units: Reference to units dictionary
footnotes: Reference to footnotes dictionary
calculation_trees: Reference to calculation trees dictionary
entity_info: Reference to entity info dictionary
reporting_periods: Reference to reporting periods list
context_period_map: Reference to context period map
"""
super().__init__()
# Store references to data structures
self.contexts = contexts
self.facts = facts
self.units = units
self.footnotes = footnotes
self.calculation_trees = calculation_trees
self.entity_info = entity_info
self.reporting_periods = reporting_periods
self.context_period_map = context_period_map
# DEI facts extracted during entity info processing
self.dei_facts: Dict[str, Fact] = {}
    def _create_normalized_fact_key(self, element_id: str, context_ref: str, instance_id: Optional[int] = None) -> str:
"""
Create a normalized fact key using underscore format.
Args:
element_id: The element ID
context_ref: The context reference
instance_id: Optional instance ID for duplicate facts
Returns:
Normalized key in format: element_id_context_ref[_instance_id]
"""
normalized_element_id = element_id
if ':' in element_id:
prefix, name = element_id.split(':', 1)
normalized_element_id = f"{prefix}_{name}"
if instance_id is not None:
return f"{normalized_element_id}_{context_ref}_{instance_id}"
return f"{normalized_element_id}_{context_ref}"
def parse_instance(self, file_path: Union[str, Path]) -> None:
"""Parse instance document file and extract contexts, facts, and units."""
try:
content = Path(file_path).read_text()
self.parse_instance_content(content)
except Exception as e:
raise XBRLProcessingError(f"Error parsing instance file {file_path}: {str(e)}") from e
def parse_instance_content(self, content: str) -> None:
"""Parse instance document content and extract contexts, facts, and units."""
try:
# Use lxml's optimized parser with smart string handling and recovery mode
parser = ET.XMLParser(remove_blank_text=True, recover=True, huge_tree=True)
# Convert to bytes for faster parsing if not already
if isinstance(content, str):
content_bytes = content.encode('utf-8')
else:
content_bytes = content
# Parse content with optimized settings
root = ET.XML(content_bytes, parser)
# Extract data in optimal order (contexts first, then units, then facts)
# This ensures dependencies are resolved before they're needed
self._extract_contexts(root)
self._extract_units(root)
self._extract_facts(root)
self._extract_footnotes(root)
# Post-processing steps after all raw data is extracted
self._extract_entity_info()
self._build_reporting_periods()
except Exception as e:
raise XBRLProcessingError(f"Error parsing instance content: {str(e)}") from e
    def count_facts(self, content: str) -> Tuple[int, int]:
        """Count the number of facts in the instance document.
This function counts both unique facts and total fact instances in the XBRL document.
Returns:
tuple: (unique_facts_count, total_fact_instances)
"""
# Use lxml's optimized parser with smart string handling and recovery mode
parser = ET.XMLParser(remove_blank_text=True, recover=True, huge_tree=True)
# Convert to bytes for faster parsing if not already
if isinstance(content, str):
content_bytes = content.encode('utf-8')
else:
content_bytes = content
# Parse content with optimized settings
root = ET.XML(content_bytes, parser)
# Fast path to identify non-fact elements to skip
skip_tag_endings = {'}context', '}unit', '}schemaRef'}
# Track both total instances and unique facts
total_fact_instances = 0 # Total number of fact references in the document
unique_facts = set() # Set of unique element_id + context_ref combinations
create_key = self._create_normalized_fact_key
# Define counting function
def count_element(element):
"""Process a single element as a potential fact."""
nonlocal total_fact_instances
# Skip known non-fact elements
tag = element.tag
for ending in skip_tag_endings:
if tag.endswith(ending):
return
# Get context reference - key check to identify facts
context_ref = element.get('contextRef')
if context_ref is None:
return
# Extract element namespace and name - optimized split
if '}' in tag:
namespace, element_name = tag.split('}', 1)
namespace = namespace[1:] # Faster than strip('{')
else:
element_name = tag
namespace = None
            # Get namespace prefix - guard against a missing namespace
            prefix = None
            if namespace:
                for std_prefix, std_uri_base in NAMESPACES.items():
                    if namespace.startswith(std_uri_base):
                        prefix = std_prefix
                        break
                if not prefix:
                    # Try to extract a prefix-like token from the namespace URI
                    parts = namespace.split('/')
                    prefix = parts[-1] if parts else ''
            # Construct element ID
            element_id = f"{prefix}:{element_name}" if prefix else element_name
# Create a normalized key using underscore format for consistency
normalized_key = create_key(element_id, context_ref)
# Track unique facts
unique_facts.add(normalized_key)
# Increment total instances count
total_fact_instances += 1
# Optimize traversal using lxml's iterchildren and iterdescendants if available
if hasattr(root, 'iterchildren'):
# Use lxml's optimized traversal methods
for child in root.iterchildren():
count_element(child)
# Process nested elements with optimized iteration
for descendant in child.iterdescendants():
count_element(descendant)
else:
# Fallback for ElementTree
for child in root:
count_element(child)
for descendant in child.findall('.//*'):
count_element(descendant)
# Return tuple of counts (unique_facts_count, total_fact_instances)
return len(unique_facts), total_fact_instances
def _extract_contexts(self, root: ET.Element) -> None:
"""Extract contexts from instance document."""
try:
# Find all context elements
for context_elem in root.findall('.//{http://www.xbrl.org/2003/instance}context'):
context_id = context_elem.get('id')
if not context_id:
continue
# Create context object
context = Context(context_id=context_id)
# Extract entity information
entity_elem = context_elem.find('.//{http://www.xbrl.org/2003/instance}entity')
if entity_elem is not None:
# Get identifier
identifier_elem = entity_elem.find('.//{http://www.xbrl.org/2003/instance}identifier')
if identifier_elem is not None:
scheme = identifier_elem.get('scheme', '')
identifier = identifier_elem.text
context.entity = {
'scheme': scheme,
'identifier': identifier
}
# Get segment dimensions if present
segment_elem = entity_elem.find('.//{http://www.xbrl.org/2003/instance}segment')
if segment_elem is not None:
# Extract explicit dimensions
for dim_elem in segment_elem.findall('.//{http://xbrl.org/2006/xbrldi}explicitMember'):
dimension = dim_elem.get('dimension')
value = dim_elem.text
if dimension and value:
context.dimensions[dimension] = value
# Extract typed dimensions
for dim_elem in segment_elem.findall('.//{http://xbrl.org/2006/xbrldi}typedMember'):
dimension = dim_elem.get('dimension')
if dimension:
# The typed dimension value is the text content of the first child element
for child in dim_elem:
# Extract the text content, which contains the actual typed member value
if child.text and child.text.strip():
context.dimensions[dimension] = child.text.strip()
else:
# Fallback to tag if no text content
context.dimensions[dimension] = child.tag
break
# Extract period information
period_elem = context_elem.find('.//{http://www.xbrl.org/2003/instance}period')
if period_elem is not None:
# Check for instant period
instant_elem = period_elem.find('.//{http://www.xbrl.org/2003/instance}instant')
if instant_elem is not None and instant_elem.text:
context.period = {
'type': 'instant',
'instant': instant_elem.text
}
# Check for duration period
start_elem = period_elem.find('.//{http://www.xbrl.org/2003/instance}startDate')
end_elem = period_elem.find('.//{http://www.xbrl.org/2003/instance}endDate')
if start_elem is not None and end_elem is not None and start_elem.text and end_elem.text:
context.period = {
'type': 'duration',
'startDate': start_elem.text,
'endDate': end_elem.text
}
# Check for forever period
forever_elem = period_elem.find('.//{http://www.xbrl.org/2003/instance}forever')
if forever_elem is not None:
context.period = {
'type': 'forever'
}
# Add context to registry
self.contexts[context_id] = context
except Exception as e:
raise XBRLProcessingError(f"Error extracting contexts: {str(e)}") from e
def _extract_units(self, root: ET.Element) -> None:
"""Extract units from instance document."""
try:
# Find all unit elements
for unit_elem in root.findall('.//{http://www.xbrl.org/2003/instance}unit'):
unit_id = unit_elem.get('id')
if not unit_id:
continue
# Check for measure
measure_elem = unit_elem.find('.//{http://www.xbrl.org/2003/instance}measure')
if measure_elem is not None and measure_elem.text:
self.units[unit_id] = {
'type': 'simple',
'measure': measure_elem.text
}
continue
# Check for divide
divide_elem = unit_elem.find('.//{http://www.xbrl.org/2003/instance}divide')
if divide_elem is not None:
# Get numerator
numerator_elem = divide_elem.find('.//{http://www.xbrl.org/2003/instance}unitNumerator')
denominator_elem = divide_elem.find('.//{http://www.xbrl.org/2003/instance}unitDenominator')
if numerator_elem is not None and denominator_elem is not None:
# Get measures
numerator_measures = [elem.text for elem in numerator_elem.findall('.//{http://www.xbrl.org/2003/instance}measure') if elem.text]
denominator_measures = [elem.text for elem in denominator_elem.findall('.//{http://www.xbrl.org/2003/instance}measure') if elem.text]
self.units[unit_id] = {
'type': 'divide',
'numerator': numerator_measures,
'denominator': denominator_measures
}
except Exception as e:
raise XBRLProcessingError(f"Error extracting units: {str(e)}") from e
def _extract_facts(self, root: ET.Element) -> None:
"""Extract facts from instance document."""
try:
# Get direct access to nsmap if using lxml (much faster than regex extraction)
if hasattr(root, 'nsmap'):
# Leverage lxml's native nsmap functionality
prefix_map = {uri: prefix for prefix, uri in root.nsmap.items() if prefix is not None}
else:
# Fallback for ElementTree - precompile regex patterns for namespace extraction
xmlns_pattern = '{http://www.w3.org/2000/xmlns/}'
prefix_map = {}
# Extract namespace declarations from root
for attr_name, attr_value in root.attrib.items():
if attr_name.startswith(xmlns_pattern) or attr_name.startswith('xmlns:'):
# Extract the prefix more efficiently
if attr_name.startswith(xmlns_pattern):
prefix = attr_name[len(xmlns_pattern):]
else:
prefix = attr_name.split(':', 1)[1]
prefix_map[attr_value] = prefix
# Initialize counters and tracking
fact_count = 0
facts_dict = {}
base_keys = {}
# Fast path to identify non-fact elements to skip - compile as set for O(1) lookup
skip_tag_endings = {
'schemaRef',
'roleRef',
'arcroleRef',
'linkbaseRef',
'context',
'unit'
}
def process_element(element):
"""Process a single element as a potential fact."""
nonlocal fact_count
# Skip annotation nodes and other non element nodes
if not ET.iselement(element):
return
            # Skip comments and processing instructions: their .tag is a
            # callable rather than a string, so they can never be facts
            if callable(element.tag):
                return
tag = element.tag
for ending in skip_tag_endings:
if tag.endswith(ending):
return
# Get context reference - key check to identify facts
context_ref = element.get('contextRef')
if not context_ref:
return
# Get fact ID if present (for footnote linkage)
fact_id = element.get('id')
# Extract element namespace and name - optimized split
if '}' in tag:
namespace, element_name = tag.split('}', 1)
namespace = namespace[1:] # Faster than strip('{')
# Try to extract prefix from the namespace
prefix = prefix_map.get(namespace)
if not prefix:
parts = namespace.split('/')
prefix = parts[-1] if parts else ''
else:
element_name = tag
prefix = ''
# Construct element ID with optimized string concatenation
element_id = f"{prefix}:{element_name}" if prefix else element_name
# Get unit reference
unit_ref = element.get('unitRef')
# Get value - optimize string handling
value = element.text
if not value or not value.strip():
# Only check children if text is empty - use direct iteration for speed
for sub_elem in element:
sub_text = sub_elem.text
if sub_text and sub_text.strip():
value = sub_text
break
# Optimize string handling - inline conditional
value = value.strip() if value else ""
# Get decimals attribute - direct access
decimals = element.get('decimals')
# Optimize numeric conversion with faster try/except
numeric_value = None
if value:
try:
numeric_value = float(value)
except (ValueError, TypeError):
pass
# Create base key for duplicate detection
base_key = self._create_normalized_fact_key(element_id, context_ref)
# Handle duplicates
instance_id = None
if base_key in base_keys:
# This is a duplicate - convert existing fact to use instance_id if needed
if base_key in facts_dict:
existing_fact = facts_dict[base_key]
# Move existing fact to new key with instance_id=0
del facts_dict[base_key]
existing_fact.instance_id = 0
facts_dict[self._create_normalized_fact_key(element_id, context_ref, 0)] = existing_fact
# Add new fact with next instance_id
instance_id = len(base_keys[base_key])
base_keys[base_key].append(True)
else:
# First instance of this fact
base_keys[base_key] = [True]
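            # Hybrid storage, illustrated: a lone fact lives at its base key,
            # e.g. 'us-gaap_Revenues_c-1'; on the first duplicate the original
            # moves to 'us-gaap_Revenues_c-1_0' and the newcomer is stored
            # under 'us-gaap_Revenues_c-1_1'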
# Create fact object
fact = Fact(
element_id=element_id,
context_ref=context_ref,
value=value,
unit_ref=unit_ref,
decimals=decimals,
numeric_value=numeric_value,
instance_id=instance_id,
fact_id=fact_id
)
# Store fact with appropriate key
key = self._create_normalized_fact_key(element_id, context_ref, instance_id)
facts_dict[key] = fact
fact_count += 1
# Use lxml's optimized traversal methods when available
if hasattr(root, 'iterchildren'):
for child in root.iterchildren():
for child in root.iterchildren():
process_element(child)
# Process nested elements with optimized iteration
for descendant in child.iterdescendants():
process_element(descendant)
else:
# Fallback for ElementTree
for child in root:
process_element(child)
for descendant in child.findall('.//*'):
process_element(descendant)
# Update instance facts
self.facts.update(facts_dict)
log.debug(f"Extracted {fact_count} facts ({len(base_keys)} unique fact identifiers)")
except Exception as e:
raise XBRLProcessingError(f"Error extracting facts: {str(e)}") from e
def _extract_footnotes(self, root: ET.Element) -> None:
"""Extract footnotes from instance document.
Footnotes in XBRL are linked to facts via footnoteLink elements that contain:
1. footnote elements with the actual text content
2. footnoteArc elements that connect fact IDs to footnote IDs
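A minimal sketch of the expected markup (per the XBRL 2.1 linkbase
vocabulary; identifiers illustrative):
<link:footnoteLink>
<link:footnote xlink:label="fn1" xml:lang="en-US">Inventories are stated at cost.</link:footnote>
<link:footnoteArc xlink:from="fact1" xlink:to="fn1"/>
</link:footnoteLink>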
"""
try:
from edgar.xbrl.models import Footnote
# Find all footnoteLink elements
for footnote_link in root.findall('.//{http://www.xbrl.org/2003/linkbase}footnoteLink'):
# First, extract all footnote definitions
for footnote_elem in footnote_link.findall('{http://www.xbrl.org/2003/linkbase}footnote'):
# Try both 'id' and 'xlink:label' attributes
footnote_id = footnote_elem.get('id') or footnote_elem.get('{http://www.w3.org/1999/xlink}label')
if not footnote_id:
continue
# Get footnote attributes
lang = footnote_elem.get('{http://www.w3.org/XML/1998/namespace}lang', 'en-US')
role = footnote_elem.get('{http://www.w3.org/1999/xlink}role')
# Extract text content, handling XHTML formatting
footnote_text = ""
# Check for XHTML content
xhtml_divs = footnote_elem.findall('.//{http://www.w3.org/1999/xhtml}div')
if xhtml_divs:
# Join the text of each XHTML div, separating divs with a space
footnote_text = " ".join("".join(div.itertext()).strip() for div in xhtml_divs)
else:
# Fall back to direct text content
footnote_text = "".join(footnote_elem.itertext()).strip()
# Create Footnote object
footnote = Footnote(
footnote_id=footnote_id,
text=footnote_text,
lang=lang,
role=role,
related_fact_ids=[]
)
self.footnotes[footnote_id] = footnote
# Second, process footnoteArc elements to link facts to footnotes
for arc_elem in footnote_link.findall('{http://www.xbrl.org/2003/linkbase}footnoteArc'):
fact_id = arc_elem.get('{http://www.w3.org/1999/xlink}from')
footnote_id = arc_elem.get('{http://www.w3.org/1999/xlink}to')
if fact_id and footnote_id:
# Add fact ID to footnote's related facts
if footnote_id in self.footnotes:
self.footnotes[footnote_id].related_fact_ids.append(fact_id)
else:
log.warning(f"Footnote arc references undefined footnote: {footnote_id}")
# Also update the fact's footnotes list if we can find it
# This requires finding the fact by its fact_id
for fact in self.facts.values():
if fact.fact_id == fact_id:
if footnote_id not in fact.footnotes:
fact.footnotes.append(footnote_id)
break
log.debug(f"Extracted {len(self.footnotes)} footnotes")
except Exception as e:
# Log the error but don't fail - footnotes are optional
log.warning(f"Error extracting footnotes: {str(e)}")
def _extract_entity_info(self) -> None:
"""Extract entity information from contexts and DEI facts."""
try:
# Extract CIK/identifier from first context
identifier = None
if self.contexts:
first = next(iter(self.contexts.values()))
ident = first.entity.get('identifier')
if ident and ident.isdigit():
identifier = ident.lstrip('0')
# Collect all DEI facts into a dict: concept -> Fact
self.dei_facts: Dict[str, Fact] = {}
for fact in self.facts.values():
eid = fact.element_id
if eid.startswith('dei:'):
concept = eid.split(':', 1)[1]
elif eid.startswith('dei_'):
concept = eid.split('_', 1)[1]
else:
continue
self.dei_facts[concept] = fact
# Helper: get the first available DEI fact value
def get_dei(*names):
for n in names:
f = self.dei_facts.get(n)
if f:
return f.value
return None
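# Usage: get_dei('DocumentFiscalYearFocus', 'FiscalYearFocus') returns the
# value of the first listed DEI concept present in the filing, or None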
# Build entity_info preserving existing keys
self.entity_info.update({
'entity_name': get_dei('EntityRegistrantName'),
'ticker': get_dei('TradingSymbol'),
'identifier': identifier,
'document_type': get_dei('DocumentType'),
'reporting_end_date': None,
'document_period_end_date': get_dei('DocumentPeriodEndDate'),
'fiscal_year': get_dei('DocumentFiscalYearFocus', 'FiscalYearFocus', 'FiscalYear'),
'fiscal_period': get_dei('DocumentFiscalPeriodFocus', 'FiscalPeriodFocus'),
'fiscal_year_end_month': None,
'fiscal_year_end_day': None,
'annual_report': False,
'quarterly_report': False,
'amendment': False,
})
# Determine reporting_end_date from contexts
for ctx in self.contexts.values():
period = getattr(ctx, 'period', {})
if period.get('type') == 'instant':
ds = period.get('instant')
if ds:
try:
dt_obj = datetime.strptime(ds, '%Y-%m-%d').date()
curr = self.entity_info['reporting_end_date']
if curr is None or dt_obj > curr:
self.entity_info['reporting_end_date'] = dt_obj
except Exception:
pass
# Parse fiscal year end date into month/day
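# DEI reports this as an xbrli gMonthDay value, e.g. '--12-31' for a
# December 31 fiscal year end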
fye = get_dei('CurrentFiscalYearEndDate','FiscalYearEnd')
if fye:
try:
s = fye
if s.startswith('--'):
s = s[2:]
if '-' in s:
m, d = s.split('-', 1)
if m.isdigit() and d.isdigit():
self.entity_info['fiscal_year_end_month'] = int(m)
self.entity_info['fiscal_year_end_day'] = int(d)
except Exception:
pass
# Flags based on document_type
dt_val = self.entity_info['document_type'] or ''
self.entity_info['annual_report'] = (dt_val == '10-K')
self.entity_info['quarterly_report'] = (dt_val == '10-Q')
self.entity_info['amendment'] = ('/A' in dt_val)
log.debug(f"Entity info: {self.entity_info}")
except Exception as e:
log.warning(f"Warning: Error extracting entity info: {str(e)}")
def _build_reporting_periods(self) -> None:
"""Build reporting periods from contexts."""
try:
# Clear existing periods
self.reporting_periods.clear()
self.context_period_map.clear()
# Collect unique periods from contexts
instant_periods = {}
duration_periods = {}
for context_id, context in self.contexts.items():
# Check the period attribute directly rather than serializing the model
period = getattr(context, 'period', None)
if period and 'type' in period:
period_type = period.get('type')
if period_type == 'instant':
date_str = context.period.get('instant')
if date_str:
if date_str not in instant_periods:
instant_periods[date_str] = []
# Add context ID to this period
instant_periods[date_str].append(context_id)
# Map context to period key
period_key = f"instant_{date_str}"
self.context_period_map[context_id] = period_key
elif period_type == 'duration':
start_date = context.period.get('startDate')
end_date = context.period.get('endDate')
if start_date and end_date:
duration_key = f"{start_date}_{end_date}"
if duration_key not in duration_periods:
duration_periods[duration_key] = []
# Add context ID to this period
duration_periods[duration_key].append(context_id)
# Map context to period key
period_key = f"duration_{start_date}_{end_date}"
self.context_period_map[context_id] = period_key
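# e.g. a context covering 2023-01-01 through 2023-12-31 maps to the key
# 'duration_2023-01-01_2023-12-31' (dates illustrative)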
# Process instant periods
for date_str, context_ids in instant_periods.items():
try:
date_obj = datetime.strptime(date_str, '%Y-%m-%d').date()
formatted_date = date_obj.strftime('%B %d, %Y')
period = {
'type': 'instant',
'date': date_str,
'date_obj': date_obj,
'label': formatted_date,
'context_ids': context_ids,
'key': f"instant_{date_str}"
}
self.reporting_periods.append(period)
except (ValueError, TypeError):
# Skip invalid dates
continue
# Process duration periods
for period_key, context_ids in duration_periods.items():
start_date, end_date = period_key.split('_')
try:
start_obj = datetime.strptime(start_date, '%Y-%m-%d').date()
end_obj = datetime.strptime(end_date, '%Y-%m-%d').date()
formatted_start = start_obj.strftime('%B %d, %Y')
formatted_end = end_obj.strftime('%B %d, %Y')
# Calculate duration in days
days = (end_obj - start_obj).days
# Determine period type based on duration
period_description = classify_duration(days)
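# classify_duration buckets the day count into a human-readable label
# (e.g. a span of roughly 90 days reads as quarterly, roughly 365 as annual)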
period = {
'type': 'duration',
'start_date': start_date,
'end_date': end_date,
'start_obj': start_obj,
'end_obj': end_obj,
'days': days,
'period_type': period_description,
'label': f"{period_description}: {formatted_start} to {formatted_end}",
'context_ids': context_ids,
'key': f"duration_{start_date}_{end_date}"
}
self.reporting_periods.append(period)
except (ValueError, TypeError):
# Skip invalid dates
continue
# Sort periods by date (most recent first)
self.reporting_periods.sort(key=lambda p: p['date_obj'] if p['type'] == 'instant' else p['end_obj'], reverse=True)
# Debug logging to verify periods were extracted
if self.reporting_periods:
log.debug(f"Found {len(self.reporting_periods)} reporting periods.")
log.debug(f"First period: {self.reporting_periods[0]['label']}")
else:
log.debug("No reporting periods found!")
# Debug context period map
log.debug(f"Context period map has {len(self.context_period_map)} entries.")
except Exception as e:
# Log the error but don't fail
log.warning(f"Error building reporting periods: {str(e)}")
self.reporting_periods.clear()

View File

@@ -0,0 +1,149 @@
"""
Labels parser for XBRL documents.
This module handles parsing of XBRL label linkbases and extracting
element labels for display purposes.
"""
from pathlib import Path
from typing import Dict, Union
from lxml import etree as ET
from edgar.xbrl.core import STANDARD_LABEL, extract_element_id
from edgar.xbrl.models import ElementCatalog, XBRLProcessingError
from .base import BaseParser
class LabelsParser(BaseParser):
"""Parser for XBRL label linkbases."""
def __init__(self, element_catalog: Dict[str, ElementCatalog]):
"""
Initialize labels parser with data structure references.
Args:
element_catalog: Reference to element catalog dictionary
"""
super().__init__()
# Store references to data structures
self.element_catalog = element_catalog
def parse_labels(self, file_path: Union[str, Path]) -> None:
"""Parse label linkbase file and extract label information."""
try:
content = Path(file_path).read_text()
self.parse_labels_content(content)
except Exception as e:
raise XBRLProcessingError(f"Error parsing label file {file_path}: {str(e)}") from e
def parse_labels_content(self, content: str) -> None:
"""Parse label linkbase content and extract label information."""
try:
# Optimize: Register namespaces for faster XPath lookups
nsmap = {
'link': 'http://www.xbrl.org/2003/linkbase',
'xlink': 'http://www.w3.org/1999/xlink',
'xml': 'http://www.w3.org/XML/1998/namespace'
}
# Optimize: Use lxml parser with smart string handling
parser = ET.XMLParser(remove_blank_text=True, recover=True)
root = ET.XML(content.encode('utf-8'), parser)
# Optimize: Use specific XPath expressions with namespaces for faster lookups
# This is much faster than using findall with '//' in element tree
label_arcs = root.xpath('//link:labelArc', namespaces=nsmap)
labels = root.xpath('//link:label', namespaces=nsmap)
# Optimize: Pre-allocate dictionary with expected size
label_lookup = {}
# Optimize: Cache attribute lookups
xlink_label = '{http://www.w3.org/1999/xlink}label'
xlink_role = '{http://www.w3.org/1999/xlink}role'
xml_lang = '{http://www.w3.org/XML/1998/namespace}lang'
default_role = 'http://www.xbrl.org/2003/role/label'
# Optimize: Process labels in a single pass with direct attribute access
for label in labels:
label_id = label.get(xlink_label)
if not label_id:
continue
# Get text first - skip labels with no text content
text = label.text
if not text:
continue
# Get attributes - direct lookup is faster than method calls
role = label.get(xlink_role, default_role)
lang = label.get(xml_lang, 'en-US')
# Create nested dictionaries only when needed
if label_id not in label_lookup:
label_lookup[label_id] = {}
if lang not in label_lookup[label_id]:
label_lookup[label_id][lang] = {}
label_lookup[label_id][lang][role] = text
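# Resulting shape, with illustrative identifiers:
# {'lab_Revenues': {'en-US': {'http://www.xbrl.org/2003/role/label': 'Revenues'}}}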
# Optimize: Cache attribute lookups for arcs
xlink_from = '{http://www.w3.org/1999/xlink}from'
xlink_to = '{http://www.w3.org/1999/xlink}to'
xlink_href = '{http://www.w3.org/1999/xlink}href'
# Optimize: Create a lookup table for locators by label for faster access
loc_by_label = {}
for loc in root.xpath('//link:loc', namespaces=nsmap):
loc_label = loc.get(xlink_label)
if loc_label:
loc_by_label[loc_label] = loc.get(xlink_href)
# Connect labels to elements using arcs - with optimized lookups
for arc in label_arcs:
from_ref = arc.get(xlink_from)
to_ref = arc.get(xlink_to)
if not from_ref or not to_ref or to_ref not in label_lookup:
continue
# Use cached locator lookup instead of expensive XPath
href = loc_by_label.get(from_ref)
if not href:
continue
# Extract element ID from href
element_id = extract_element_id(href)
# Attach labels for this element - only en-US labels are handled here
if 'en-US' in label_lookup[to_ref]:
element_labels = label_lookup[to_ref]['en-US']
# Optimize: Update catalog with minimal overhead
catalog_entry = self.element_catalog.get(element_id)
if catalog_entry:
catalog_entry.labels.update(element_labels)
else:
# Create placeholder in catalog
self.element_catalog[element_id] = ElementCatalog(
name=element_id,
data_type="",
period_type="duration",
labels=element_labels
)
except Exception as e:
raise XBRLProcessingError(f"Error parsing label content: {str(e)}") from e
def get_element_label(self, element_id: str) -> str:
"""Get the label for an element, falling back to the element ID if not found."""
if element_id in self.element_catalog and self.element_catalog[element_id].labels:
# Use standard label if available
standard_label = self.element_catalog[element_id].labels.get(STANDARD_LABEL)
if standard_label:
return standard_label
return element_id # Fallback to element ID

View File

@@ -0,0 +1,249 @@
"""
Presentation parser for XBRL documents.
This module handles parsing of XBRL presentation linkbases and building
presentation trees for financial statement structure.
"""
from pathlib import Path
from typing import Any, Dict, List, Optional, Union
from lxml import etree as ET
from edgar.xbrl.core import extract_element_id
from edgar.xbrl.models import ElementCatalog, PresentationNode, PresentationTree, XBRLProcessingError
from .base import BaseParser
class PresentationParser(BaseParser):
"""Parser for XBRL presentation linkbases."""
def __init__(self, presentation_roles: Dict[str, Dict[str, Any]],
presentation_trees: Dict[str, PresentationTree],
element_catalog: Dict[str, ElementCatalog]):
"""
Initialize presentation parser with data structure references.
Args:
presentation_roles: Reference to presentation roles dictionary
presentation_trees: Reference to presentation trees dictionary
element_catalog: Reference to element catalog dictionary
"""
super().__init__()
# Store references to data structures
self.presentation_roles = presentation_roles
self.presentation_trees = presentation_trees
self.element_catalog = element_catalog
def parse_presentation(self, file_path: Union[str, Path]) -> None:
"""Parse presentation linkbase file and build presentation trees."""
try:
content = Path(file_path).read_text()
self.parse_presentation_content(content)
except Exception as e:
raise XBRLProcessingError(f"Error parsing presentation file {file_path}: {str(e)}") from e
def parse_presentation_content(self, content: str) -> None:
"""Parse presentation linkbase content and build presentation trees."""
try:
# Optimize: Register namespaces for faster XPath lookups
nsmap = {
'link': 'http://www.xbrl.org/2003/linkbase',
'xlink': 'http://www.w3.org/1999/xlink'
}
# Optimize: Use lxml parser with smart string handling
parser = ET.XMLParser(remove_blank_text=True, recover=True)
root = ET.XML(content.encode('utf-8'), parser)
# Optimize: Use XPath with namespaces for faster extraction
presentation_links = root.xpath('//link:presentationLink', namespaces=nsmap)
# Optimize: Cache attribute paths
xlink_role = '{http://www.w3.org/1999/xlink}role'
xlink_from = '{http://www.w3.org/1999/xlink}from'
xlink_to = '{http://www.w3.org/1999/xlink}to'
xlink_label = '{http://www.w3.org/1999/xlink}label'
xlink_href = '{http://www.w3.org/1999/xlink}href'
for link in presentation_links:
role = link.get(xlink_role)
if not role:
continue
# Store role information
role_id = role.split('/')[-1] if '/' in role else role
role_def = role_id.replace('_', ' ')
self.presentation_roles[role] = {
'roleUri': role,
'definition': role_def,
'roleId': role_id
}
# Optimize: Pre-build locator map to avoid repeated XPath lookups
loc_map = {}
for loc in link.xpath('.//link:loc', namespaces=nsmap):
label = loc.get(xlink_label)
if label:
loc_map[label] = loc.get(xlink_href)
# Optimize: Extract arcs using direct xpath with context
arcs = link.xpath('.//link:presentationArc', namespaces=nsmap)
# Collect relationships for this presentation link
relationships = []
relationships_append = relationships.append # Local function reference for speed
# Process arcs with optimized locator lookups
for arc in arcs:
from_ref = arc.get(xlink_from)
to_ref = arc.get(xlink_to)
if not from_ref or not to_ref:
continue
# Optimize: Use cached locator references instead of expensive XPath lookups
from_href = loc_map.get(from_ref)
to_href = loc_map.get(to_ref)
if not from_href or not to_href:
continue
# Parse order attribute correctly
order = self._parse_order_attribute(arc)
preferred_label = arc.get('preferredLabel')
# Extract element IDs from hrefs
from_element = extract_element_id(from_href)
to_element = extract_element_id(to_href)
# Add relationship using local function reference
relationships_append({
'from_element': from_element,
'to_element': to_element,
'order': order,
'preferred_label': preferred_label
})
# Build presentation tree for this role if we have relationships
if relationships:
self._build_presentation_tree(role, relationships)
except Exception as e:
raise XBRLProcessingError(f"Error parsing presentation content: {str(e)}") from e
def _build_presentation_tree(self, role: str, relationships: List[Dict[str, Any]]) -> None:
"""
Build a presentation tree from relationships.
Args:
role: Extended link role URI
relationships: List of relationships (from_element, to_element, order, preferred_label)
"""
# Group relationships by source element
from_map = {}
to_map = {}
for rel in relationships:
from_element = rel['from_element']
to_element = rel['to_element']
if from_element not in from_map:
from_map[from_element] = []
from_map[from_element].append(rel)
if to_element not in to_map:
to_map[to_element] = []
to_map[to_element].append(rel)
# Find root elements (appear as 'from' but not as 'to')
root_elements = set(from_map.keys()) - set(to_map.keys())
if not root_elements:
return # No root elements found
# Create presentation tree
tree = PresentationTree(
role_uri=role,
definition=self.presentation_roles[role]['definition'],
root_element_id=next(iter(root_elements)),
all_nodes={}
)
# Build tree recursively
for root_id in root_elements:
self._build_presentation_subtree(root_id, None, 0, from_map, tree.all_nodes)
# Add tree to collection
self.presentation_trees[role] = tree
def _build_presentation_subtree(self, element_id: str, parent_id: Optional[str], depth: int,
from_map: Dict[str, List[Dict[str, Any]]],
all_nodes: Dict[str, PresentationNode]) -> None:
"""
Recursively build a presentation subtree.
Args:
element_id: Current element ID
parent_id: Parent element ID
depth: Current depth in tree
from_map: Map of relationships by source element
all_nodes: Dictionary to store all nodes
"""
# Create node
node = PresentationNode(
element_id=element_id,
parent=parent_id,
children=[],
depth=depth
)
# Add element information if available
if element_id in self.element_catalog:
elem_info = self.element_catalog[element_id]
node.element_name = elem_info.name
node.standard_label = elem_info.labels.get('http://www.xbrl.org/2003/role/label', elem_info.name)
# Use enhanced abstract detection (Issue #450 fix)
# The element catalog may not have correct abstract info for standard taxonomy concepts
from edgar.xbrl.abstract_detection import is_abstract_concept
node.is_abstract = is_abstract_concept(
concept_name=elem_info.name,
schema_abstract=elem_info.abstract,
has_children=False, # Will be updated after children are processed
has_values=False # Will be determined later when facts are loaded
)
node.labels = elem_info.labels
# Add to collection
all_nodes[element_id] = node
# Process children
if element_id in from_map:
# Sort children by order
children = sorted(from_map[element_id], key=lambda r: r['order'])
for rel in children:
child_id = rel['to_element']
# Add child to parent's children list
node.children.append(child_id)
# Set preferred label
preferred_label = rel['preferred_label']
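# Standard preferredLabel roles such as .../role/periodStartLabel,
# .../role/periodEndLabel and .../role/totalLabel tell renderers which
# label variant to display at this position in the tree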
# Recursively build child subtree
self._build_presentation_subtree(
child_id, element_id, depth + 1, from_map, all_nodes
)
# Update preferred label and order after child is built
if child_id in all_nodes:
if preferred_label:
all_nodes[child_id].preferred_label = preferred_label
all_nodes[child_id].order = rel['order']

View File

@@ -0,0 +1,210 @@
"""
Schema parser for XBRL documents.
This module handles parsing of XBRL taxonomy schemas and element catalog
creation with element definitions and properties.
"""
from pathlib import Path
from typing import Dict, Union
from lxml import etree as ET
from edgar.core import log
from edgar.xbrl.models import ElementCatalog, XBRLProcessingError
from .base import BaseParser
class SchemaParser(BaseParser):
"""Parser for XBRL taxonomy schemas."""
def __init__(self, element_catalog: Dict[str, ElementCatalog]):
"""
Initialize schema parser with data structure references.
Args:
element_catalog: Reference to element catalog dictionary
"""
super().__init__()
# Store references to data structures
self.element_catalog = element_catalog
# Will be set by coordinator when needed
self.parse_labels_content = None
self.parse_presentation_content = None
self.parse_calculation_content = None
self.parse_definition_content = None
def set_linkbase_parsers(self, labels_parser, presentation_parser, calculation_parser, definition_parser):
"""
Set references to other parsers for embedded linkbase processing.
Args:
labels_parser: LabelsParser instance
presentation_parser: PresentationParser instance
calculation_parser: CalculationParser instance
definition_parser: DefinitionParser instance
"""
self.parse_labels_content = labels_parser.parse_labels_content
self.parse_presentation_content = presentation_parser.parse_presentation_content
self.parse_calculation_content = calculation_parser.parse_calculation_content
self.parse_definition_content = definition_parser.parse_definition_content
def parse_schema(self, file_path: Union[str, Path]) -> None:
"""Parse schema file and extract element information."""
try:
content = Path(file_path).read_text()
self.parse_schema_content(content)
except Exception as e:
raise XBRLProcessingError(f"Error parsing schema file {file_path}: {str(e)}") from e
def parse_schema_content(self, content: str) -> None:
"""Parse schema content and extract element information."""
try:
# Use the safe XML parsing helper
root = self._safe_parse_xml(content)
# Extract element declarations
for element in root.findall('.//{http://www.w3.org/2001/XMLSchema}element'):
element_id = element.get('id') or element.get('name')
if not element_id:
continue
# Extract element properties
data_type = element.get('type', '')
# Check for balance and period type
# First check as attributes on the element (modern XBRL style)
balance_type = element.get('{http://www.xbrl.org/2003/instance}balance')
period_type = element.get('{http://www.xbrl.org/2003/instance}periodType')
abstract = element.get('abstract', 'false').lower() == 'true'
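# A modern-style declaration looks roughly like (illustrative):
# <xsd:element name="Revenues" type="xbrli:monetaryItemType"
#     xbrli:periodType="duration" xbrli:balance="credit" abstract="false"/>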
# If not found as attributes, look in nested annotations (legacy style)
if not balance_type or not period_type:
annotation = element.find('.//{http://www.w3.org/2001/XMLSchema}annotation')
if annotation is not None:
for appinfo in annotation.findall('.//{http://www.w3.org/2001/XMLSchema}appinfo'):
if not balance_type:
balance_element = appinfo.find('.//{http://www.xbrl.org/2003/instance}balance')
if balance_element is not None:
balance_type = balance_element.text
if not period_type:
period_element = appinfo.find('.//{http://www.xbrl.org/2003/instance}periodType')
if period_element is not None:
period_type = period_element.text
# Create element catalog entry
self.element_catalog[element_id] = ElementCatalog(
name=element_id,
data_type=data_type,
period_type=period_type or "duration", # Default to duration
balance=balance_type,
abstract=abstract,
labels={}
)
# Extract embedded linkbases if present
embedded_linkbases = self._extract_embedded_linkbases(content)
# If embedded linkbases were found, parse them
if embedded_linkbases and 'linkbases' in embedded_linkbases:
if 'label' in embedded_linkbases['linkbases'] and self.parse_labels_content:
label_content = embedded_linkbases['linkbases']['label']
self.parse_labels_content(label_content)
if 'presentation' in embedded_linkbases['linkbases'] and self.parse_presentation_content:
presentation_content = embedded_linkbases['linkbases']['presentation']
self.parse_presentation_content(presentation_content)
if 'calculation' in embedded_linkbases['linkbases'] and self.parse_calculation_content:
calculation_content = embedded_linkbases['linkbases']['calculation']
self.parse_calculation_content(calculation_content)
if 'definition' in embedded_linkbases['linkbases'] and self.parse_definition_content:
definition_content = embedded_linkbases['linkbases']['definition']
self.parse_definition_content(definition_content)
except Exception as e:
raise XBRLProcessingError(f"Error parsing schema content: {str(e)}") from e
def _extract_embedded_linkbases(self, schema_content: str) -> Dict[str, Dict[str, str]]:
"""
Extract embedded linkbases and role types from the schema file.
Args:
schema_content: XML content of the schema file
Returns:
Dictionary containing embedded linkbases and role type information
"""
embedded_data = {
'linkbases': {},
'role_types': {}
}
try:
# Use the safe XML parsing helper
root = self._safe_parse_xml(schema_content)
# Create namespace map for use with XPath
nsmap = {
'xsd': 'http://www.w3.org/2001/XMLSchema',
'link': 'http://www.xbrl.org/2003/linkbase'
}
# Find all appinfo elements using optimized XPath
for appinfo in root.xpath('.//xsd:appinfo', namespaces=nsmap):
# Extract role types
for role_type in appinfo.xpath('./link:roleType', namespaces=nsmap):
role_uri = role_type.get('roleURI')
role_id = role_type.get('id')
# Use optimized XPath to find definition
definition = role_type.find('./link:definition', nsmap)
definition_text = definition.text if definition is not None else ""
# Use optimized XPath to find usedOn elements
used_on = [elem.text for elem in role_type.xpath('./link:usedOn', namespaces=nsmap) if elem.text]
if role_uri:
embedded_data['role_types'][role_uri] = {
'id': role_id,
'definition': definition_text,
'used_on': used_on
}
# Find the linkbase element with optimized XPath
linkbase = appinfo.find('./link:linkbase', nsmap)
if linkbase is not None:
# Extract the entire linkbase element as a string - with proper encoding
linkbase_string = ET.tostring(linkbase, encoding='unicode', method='xml')
# Extract each type of linkbase with optimized XPath
for linkbase_type in ['presentation', 'label', 'calculation', 'definition']:
# Use direct child XPath for better performance
xpath_expr = f'./link:{linkbase_type}Link'
linkbase_elements = linkbase.xpath(xpath_expr, namespaces=nsmap)
if linkbase_elements:
# Convert all linkbase elements of this type to strings
linkbase_strings = [
ET.tostring(elem, encoding='unicode', method='xml')
for elem in linkbase_elements
]
# Join multiple linkbase elements efficiently
linkbase_header = linkbase_string.split('>', 1)[0] + '>'
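# The opening <link:linkbase ...> tag carries the namespace declarations
# that the extracted child links rely on, so it is reused as the wrapper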
embedded_data['linkbases'][linkbase_type] = (
f"{linkbase_header}\n" +
'\n'.join(linkbase_strings) +
"\n</link:linkbase>"
)
return embedded_data
except Exception as e:
# Log the error but don't fail - just return empty embedded data
log.warning(f"Warning: Error extracting embedded linkbases: {str(e)}")
return embedded_data