Initial commit

2025-12-09 12:13:01 +01:00
commit 8e654ed209
13332 changed files with 2695056 additions and 0 deletions
--- a/venv/lib/python3.10/site-packages/edgar/documents/strategies/xbrl_extraction.py
+++ b/venv/lib/python3.10/site-packages/edgar/documents/strategies/xbrl_extraction.py
@@ -0,0 +1,345 @@
+"""
+XBRL extraction strategy for inline XBRL documents.
+"""
+
+from decimal import Decimal
+from typing import Dict, Any, Optional
+
+from lxml.html import HtmlElement
+
+from edgar.documents.types import XBRLFact
+
+
+class XBRLExtractor:
+    """
+    Extracts XBRL facts from inline XBRL (iXBRL) documents.
+
+    Handles:
+    - ix:nonFraction, ix:nonNumeric, ix:fraction and ix:footnote elements
+    - Context and unit resolution
+    - Continuation handling
+    - Transformation rules
+
+    Context and unit definitions are collected once, lazily, from the
+    document containing the first ix: element passed to
+    ``extract_context``; they are cached on the instance afterwards.
+    """
+
+    # XBRL namespaces
+    NAMESPACES = {
+        'ix': 'http://www.xbrl.org/2013/inlineXBRL',
+        'xbrli': 'http://www.xbrl.org/2003/instance',
+        'xbrldi': 'http://xbrl.org/2006/xbrldi',
+        'xsi': 'http://www.w3.org/2001/XMLSchema-instance'
+    }
+
+    # Common transformation formats, keyed by the exact ``format``
+    # attribute value. Each callable receives the (stripped) displayed text.
+    TRANSFORMATIONS = {
+        'ixt:numdotdecimal': lambda x: x.replace(',', ''),
+        # Swap '.' and ',' using '_' as a temporary placeholder.
+        'ixt:numcommadecimal': lambda x: x.replace('.', '_').replace(',', '.').replace('_', ','),
+        'ixt:zerodash': lambda x: '0' if x == '-' else x,
+        'ixt:datedoteu': lambda x: x.replace('.', '-'),
+        'ixt:datedotus': lambda x: x.replace('.', '/'),
+    }
+
+    def __init__(self):
+        """Initialize XBRL extractor with empty lookup tables."""
+        # context id -> parsed context data (entity, period, dimensions)
+        self.contexts: Dict[str, Dict[str, Any]] = {}
+        # unit id -> normalized unit label (e.g. 'USD', 'USD/shares')
+        self.units: Dict[str, str] = {}
+        # continuation id -> continuation metadata dict
+        self.continuations: Dict[str, Dict[str, Any]] = {}
+        self._initialized = False
+
+    def extract_context(self, element: HtmlElement) -> Optional[Dict[str, Any]]:
+        """
+        Extract XBRL context from element.
+
+        Args:
+            element: HTML element that might contain XBRL
+
+        Returns:
+            XBRL metadata dict for a recognised ix: element, or None when
+            the element is not an ix: element or its type is not handled.
+        """
+        # Check if element is an ix: tag
+        if not self._is_xbrl_element(element):
+            return None
+
+        # Contexts/units are document-wide; load them on first use only.
+        if not self._initialized:
+            self._initialize_context(element)
+
+        # Dispatch on the lowercased local tag name.
+        handlers = {
+            'nonfraction': self._extract_nonfraction,
+            'nonnumeric': self._extract_nonnumeric,
+            'continuation': self._extract_continuation,
+            'footnote': self._extract_footnote,
+            'fraction': self._extract_fraction,
+        }
+        handler = handlers.get(self._get_local_name(element.tag))
+        if handler is None:
+            return None
+        return handler(element)
+
+    def extract_fact(self, element: HtmlElement) -> Optional[XBRLFact]:
+        """
+        Extract XBRL fact from element.
+
+        Returns:
+            An XBRLFact built from the element's metadata and transformed
+            value, or None when ``extract_context`` yields nothing (None or
+            an empty dict).
+        """
+        context = self.extract_context(element)
+        if not context:
+            return None
+
+        # Get fact value
+        value = self._get_fact_value(element)
+
+        # Create fact
+        fact = XBRLFact(
+            concept=context.get('name', ''),
+            value=value,
+            context_ref=context.get('contextRef'),
+            unit_ref=context.get('unitRef'),
+            decimals=context.get('decimals'),
+            scale=context.get('scale'),
+            format=context.get('format'),
+            sign=context.get('sign')
+        )
+
+        # Resolve references against the cached document-level tables.
+        if fact.context_ref and fact.context_ref in self.contexts:
+            fact.context = self.contexts[fact.context_ref]
+
+        if fact.unit_ref and fact.unit_ref in self.units:
+            fact.unit = self.units[fact.unit_ref]
+
+        return fact
+
+    @staticmethod
+    def _get_attr(element: HtmlElement, name: str) -> Optional[str]:
+        """
+        Read attribute ``name``, falling back to its lowercased spelling.
+
+        The same camelCase/lowercase pair is tried for every ix: element
+        type so that all extractors agree on how attributes are looked up.
+        """
+        return element.get(name) or element.get(name.lower())
+
+    def _is_xbrl_element(self, element: HtmlElement) -> bool:
+        """Check if element is an XBRL element (ix namespace or 'ix:' prefix)."""
+        tag = element.tag
+        # Non-string tags (e.g. comments) are never ix: elements.
+        if not isinstance(tag, str):
+            return False
+
+        # Handle both namespaced ('{uri}local') and prefixed ('ix:local')
+        # tags; the prefix comparison is case-insensitive.
+        return (
+            tag.startswith('{' + self.NAMESPACES['ix'] + '}') or
+            tag.lower().startswith('ix:')
+        )
+
+    def _get_local_name(self, tag: str) -> str:
+        """Get the lowercased local name from a qualified tag."""
+        if '}' in tag:
+            return tag.split('}')[1].lower()
+        elif ':' in tag:
+            return tag.split(':')[1].lower()
+        return tag.lower()
+
+    def _initialize_context(self, element: HtmlElement):
+        """Initialize context and unit information from the element's document."""
+        # Find root element
+        root = element.getroottree().getroot()
+
+        # Extract contexts
+        self._extract_contexts(root)
+
+        # Extract units
+        self._extract_units(root)
+
+        self._initialized = True
+
+    def _extract_contexts(self, root: HtmlElement):
+        """
+        Extract all context definitions into ``self.contexts``.
+
+        Each entry may carry: 'id', 'entity', 'scheme', 'period_type' with
+        either 'instant' or 'start_date'/'end_date', and 'dimensions'.
+        """
+        # NOTE(review): lookups here are namespace-qualified; confirm they
+        # match documents whose tags were parsed as literal 'xbrli:...' names.
+        for context in root.xpath('//xbrli:context', namespaces=self.NAMESPACES):
+            context_id = context.get('id')
+            if not context_id:
+                continue
+
+            context_data = {
+                'id': context_id
+            }
+
+            # Extract entity
+            entity = context.find('.//xbrli:entity', namespaces=self.NAMESPACES)
+            if entity is not None:
+                identifier = entity.find('.//xbrli:identifier', namespaces=self.NAMESPACES)
+                if identifier is not None:
+                    context_data['entity'] = identifier.text
+                    context_data['scheme'] = identifier.get('scheme')
+
+            # Extract period: an instant takes precedence over a duration.
+            period = context.find('.//xbrli:period', namespaces=self.NAMESPACES)
+            if period is not None:
+                instant = period.find('.//xbrli:instant', namespaces=self.NAMESPACES)
+                if instant is not None:
+                    context_data['instant'] = instant.text
+                    context_data['period_type'] = 'instant'
+                else:
+                    start = period.find('.//xbrli:startDate', namespaces=self.NAMESPACES)
+                    end = period.find('.//xbrli:endDate', namespaces=self.NAMESPACES)
+                    if start is not None and end is not None:
+                        context_data['start_date'] = start.text
+                        context_data['end_date'] = end.text
+                        context_data['period_type'] = 'duration'
+
+            # Extract dimensions (explicit members only)
+            segment = context.find('.//xbrli:segment', namespaces=self.NAMESPACES)
+            if segment is not None:
+                dimensions = {
+                    member.get('dimension'): member.text
+                    for member in segment.findall('.//xbrldi:explicitMember', namespaces=self.NAMESPACES)
+                    if member.get('dimension')
+                }
+                if dimensions:
+                    context_data['dimensions'] = dimensions
+
+            self.contexts[context_id] = context_data
+
+    def _extract_units(self, root: HtmlElement):
+        """Extract all unit definitions into ``self.units``."""
+        # Look for xbrli:unit elements
+        for unit in root.xpath('//xbrli:unit', namespaces=self.NAMESPACES):
+            unit_id = unit.get('id')
+            if not unit_id:
+                continue
+
+            label = self._parse_unit(unit)
+            if label is not None:
+                self.units[unit_id] = label
+
+    def _parse_unit(self, unit: HtmlElement) -> Optional[str]:
+        """
+        Resolve one xbrli:unit element to a normalized label.
+
+        Returns:
+            'numerator/denominator' for a divide unit, the normalized
+            measure for a simple unit, or None when the unit has neither a
+            complete divide nor a measure.
+        """
+        # The divide form must be checked first: a descendant search for
+        # xbrli:measure would otherwise match the numerator's measure and
+        # report a ratio unit (e.g. USD/shares) as its numerator alone.
+        divide = unit.find('.//xbrli:divide', namespaces=self.NAMESPACES)
+        if divide is not None:
+            numerator = divide.find('.//xbrli:unitNumerator/xbrli:measure', namespaces=self.NAMESPACES)
+            denominator = divide.find('.//xbrli:unitDenominator/xbrli:measure', namespaces=self.NAMESPACES)
+            if numerator is None or denominator is None:
+                # Incomplete divide: no meaningful label can be built.
+                return None
+            num_unit = self._normalize_unit(numerator.text)
+            den_unit = self._normalize_unit(denominator.text)
+            return f"{num_unit}/{den_unit}"
+
+        # Simple measure
+        measure = unit.find('.//xbrli:measure', namespaces=self.NAMESPACES)
+        if measure is not None:
+            return self._normalize_unit(measure.text)
+
+        return None
+
+    def _normalize_unit(self, unit_text: Optional[str]) -> str:
+        """
+        Normalize unit text.
+
+        Strips surrounding whitespace and any namespace prefix, then maps a
+        few well-known units (case-insensitively) to a canonical label.
+        Returns '' for empty or missing text.
+        """
+        if not unit_text:
+            return ''
+
+        unit_text = unit_text.strip()
+
+        # Remove namespace prefix
+        if ':' in unit_text:
+            unit_text = unit_text.split(':')[-1]
+
+        # Common normalizations
+        unit_map = {
+            'usd': 'USD',
+            'shares': 'shares',
+            'pure': 'pure',
+            'percent': '%'
+        }
+
+        return unit_map.get(unit_text.lower(), unit_text)
+
+    def _extract_nonfraction(self, element: HtmlElement) -> Dict[str, Any]:
+        """Extract ix:nonFraction metadata; attributes that are absent are omitted."""
+        metadata = {
+            'type': 'nonFraction',
+            'name': element.get('name'),
+            'contextRef': self._get_attr(element, 'contextRef'),
+            'unitRef': self._get_attr(element, 'unitRef'),
+            'decimals': element.get('decimals'),
+            'scale': element.get('scale'),
+            'format': element.get('format'),
+            'sign': element.get('sign')
+        }
+
+        # Clean None values
+        return {k: v for k, v in metadata.items() if v is not None}
+
+    def _extract_nonnumeric(self, element: HtmlElement) -> Dict[str, Any]:
+        """Extract ix:nonNumeric metadata; attributes that are absent are omitted."""
+        metadata = {
+            'type': 'nonNumeric',
+            'name': element.get('name'),
+            'contextRef': self._get_attr(element, 'contextRef'),
+            'format': element.get('format')
+        }
+
+        # Clean None values
+        return {k: v for k, v in metadata.items() if v is not None}
+
+    def _extract_continuation(self, element: HtmlElement) -> Dict[str, Any]:
+        """
+        Extract ix:continuation element.
+
+        Returns:
+            The metadata already registered under the ``continuedAt`` id
+            when there is one, otherwise a new continuation record (which
+            is also registered under this element's id). An empty dict is
+            returned when either ``id`` or ``continuedAt`` is missing.
+        """
+        cont_id = element.get('id')
+        continued_at = self._get_attr(element, 'continuedAt')
+
+        if not (cont_id and continued_at):
+            return {}
+
+        # Map continuation to original
+        if continued_at in self.continuations:
+            original = self.continuations[continued_at]
+            self.continuations[cont_id] = original
+            return original
+
+        # Store for later resolution
+        metadata = {
+            'type': 'continuation',
+            'id': cont_id,
+            'continuedAt': continued_at
+        }
+        self.continuations[cont_id] = metadata
+        return metadata
+
+    def _extract_footnote(self, element: HtmlElement) -> Dict[str, Any]:
+        """Extract ix:footnote metadata; missing attributes are kept as None."""
+        return {
+            'type': 'footnote',
+            'footnoteRole': self._get_attr(element, 'footnoteRole'),
+            'footnoteID': self._get_attr(element, 'footnoteID')
+        }
+
+    def _extract_fraction(self, element: HtmlElement) -> Dict[str, Any]:
+        """
+        Extract ix:fraction metadata, including the numerator and
+        denominator text when those child elements are present.
+        Attributes that are absent are omitted.
+        """
+        metadata = {
+            'type': 'fraction',
+            'name': element.get('name'),
+            'contextRef': self._get_attr(element, 'contextRef'),
+            'unitRef': self._get_attr(element, 'unitRef')
+        }
+
+        # Extract numerator and denominator
+        numerator = element.find('.//ix:numerator', namespaces=self.NAMESPACES)
+        denominator = element.find('.//ix:denominator', namespaces=self.NAMESPACES)
+
+        if numerator is not None:
+            metadata['numerator'] = numerator.text
+        if denominator is not None:
+            metadata['denominator'] = denominator.text
+
+        return {k: v for k, v in metadata.items() if v is not None}
+
+    def _get_fact_value(self, element: HtmlElement) -> str:
+        """
+        Get fact value from element with transformations.
+
+        Steps, in order: strip the displayed text, apply the ``format``
+        transformation if it is a known one, apply ``scale`` as a power of
+        ten, then prefix '-' when ``sign`` is '-'.
+
+        Scaling is done with exact decimal arithmetic and the result is
+        rendered in plain positional notation (e.g. '1.1' at scale 6 gives
+        '1100000'). Values that are not numeric, or scales that are not
+        integers, leave the value unscaled.
+        """
+        # Only the element's own leading text is read; text inside nested
+        # child markup is not included. Stripping up front lets exact-match
+        # transforms (e.g. zerodash) and the sign prefix see the bare value.
+        value = (element.text or '').strip()
+
+        # Apply format transformation if specified
+        format_attr = element.get('format')
+        if format_attr and format_attr in self.TRANSFORMATIONS:
+            value = self.TRANSFORMATIONS[format_attr](value)
+
+        # Apply scale if specified
+        scale = element.get('scale')
+        if scale:
+            try:
+                scale_factor = int(scale)
+                number = Decimal(value.replace(',', ''))
+                # NaN/Infinity cannot be scaled meaningfully; leave as-is.
+                if number.is_finite():
+                    value = format(number.scaleb(scale_factor), 'f')
+            except (ValueError, TypeError, ArithmeticError):
+                # Best effort: non-numeric text or an unusable scale keeps
+                # the unscaled value (decimal errors are ArithmeticError).
+                pass
+
+        # Apply sign if specified
+        if element.get('sign') == '-' and value and not value.startswith('-'):
+            value = '-' + value
+
+        return value.strip()