# edgartools/venv/lib/python3.10/site-packages/edgar/documents/strategies/xbrl_extraction.py

"""
XBRL extraction strategy for inline XBRL documents.
"""

from decimal import Decimal
from typing import Dict, Any, Optional

from lxml.html import HtmlElement

from edgar.documents.types import XBRLFact


class XBRLExtractor:
    """
    Extracts XBRL facts from inline XBRL (iXBRL) documents.

    Handles:
    - ix:nonFraction, ix:nonNumeric and ix:fraction facts
    - Context and unit resolution
    - Continuation handling
    - Transformation rules

    Elements and attributes are matched leniently: tags are compared by
    local name (case-insensitively) and camelCase attributes fall back to
    their lower-cased spelling. This lets the same code work on trees with
    Clark-notation tags (``{uri}context``) and on trees where the prefix is
    part of the tag name (``xbrli:context``).
    NOTE(review): the latter is presumably what an HTML parser produces for
    iXBRL filings - confirm against the parser used by callers.

    Context and unit tables are loaded once, from the document of the first
    XBRL element passed to ``extract_context``; they are never reset, so an
    instance should not be shared across documents.
    """

    # XBRL namespaces
    NAMESPACES = {
        'ix': 'http://www.xbrl.org/2013/inlineXBRL',
        'xbrli': 'http://www.xbrl.org/2003/instance',
        'xbrldi': 'http://xbrl.org/2006/xbrldi',
        'xsi': 'http://www.w3.org/2001/XMLSchema-instance'
    }

    # Common transformation formats. Each callable receives the fact text
    # (already stripped of surrounding whitespace) and returns the
    # normalized text.
    TRANSFORMATIONS = {
        # "1,234.56" -> "1234.56"
        'ixt:numdotdecimal': lambda x: x.replace(',', ''),
        # "1.234,56" -> "1234.56" (drop grouping dots, comma becomes the
        # decimal point) so both numeric formats yield the same shape
        'ixt:numcommadecimal': lambda x: x.replace('.', '').replace(',', '.'),
        'ixt:zerodash': lambda x: '0' if x == '-' else x,
        'ixt:datedoteu': lambda x: x.replace('.', '-'),
        'ixt:datedotus': lambda x: x.replace('.', '/'),
    }

    def __init__(self):
        """Initialize XBRL extractor with empty lookup tables."""
        # context id -> parsed context (entity, period, dimensions)
        self.contexts: Dict[str, Dict[str, Any]] = {}
        # unit id -> normalized unit label (e.g. "USD", "USD/shares")
        self.units: Dict[str, str] = {}
        # continuation id -> continuation metadata
        self.continuations: Dict[str, Dict[str, Any]] = {}
        # Set once the context/unit tables have been loaded
        self._initialized = False

    def extract_context(self, element: HtmlElement) -> Optional[Dict[str, Any]]:
        """
        Extract XBRL metadata from an element.

        Args:
            element: HTML element that might contain XBRL

        Returns:
            Metadata dict for supported ix: elements (nonFraction,
            nonNumeric, continuation, footnote, fraction); None for
            non-XBRL elements and unsupported ix: elements.
        """
        if not self._is_xbrl_element(element):
            return None

        # Context and unit tables are loaded lazily on the first XBRL element
        if not self._initialized:
            self._initialize_context(element)

        handlers = {
            'nonfraction': self._extract_nonfraction,
            'nonnumeric': self._extract_nonnumeric,
            'continuation': self._extract_continuation,
            'footnote': self._extract_footnote,
            'fraction': self._extract_fraction,
        }
        handler = handlers.get(self._get_local_name(element.tag))
        if handler is None:
            return None
        return handler(element)

    def extract_fact(self, element: HtmlElement) -> Optional[XBRLFact]:
        """
        Extract XBRL fact from element.

        Args:
            element: HTML element that might contain XBRL

        Returns:
            An XBRLFact with its context and unit resolved when the
            referenced ids are known, or None when the element yields no
            metadata. Note that any non-empty metadata produces a fact, so
            footnote and continuation elements yield facts with an empty
            concept.
        """
        context = self.extract_context(element)
        if not context:
            return None

        fact = XBRLFact(
            concept=context.get('name', ''),
            value=self._get_fact_value(element),
            context_ref=context.get('contextRef'),
            unit_ref=context.get('unitRef'),
            decimals=context.get('decimals'),
            scale=context.get('scale'),
            format=context.get('format'),
            sign=context.get('sign')
        )

        # Resolve references against the tables built at initialization
        if fact.context_ref and fact.context_ref in self.contexts:
            fact.context = self.contexts[fact.context_ref]

        if fact.unit_ref and fact.unit_ref in self.units:
            fact.unit = self.units[fact.unit_ref]

        return fact

    def _is_xbrl_element(self, element: HtmlElement) -> bool:
        """Check if element is an ix: element (namespaced or literal prefix)."""
        tag = element.tag
        # Comments and processing instructions expose a non-string tag
        if not isinstance(tag, str):
            return False

        return (
            tag.startswith('{' + self.NAMESPACES['ix'] + '}') or
            tag.lower().startswith('ix:')
        )

    def _get_local_name(self, tag: str) -> str:
        """Get the lower-cased local name from a ``{uri}name`` or ``prefix:name`` tag."""
        if '}' in tag:
            return tag.split('}')[1].lower()
        elif ':' in tag:
            return tag.split(':')[1].lower()
        return tag.lower()

    @staticmethod
    def _get_attr(element: HtmlElement, name: str) -> Optional[str]:
        """Read attribute ``name``, falling back to its lower-cased spelling."""
        return element.get(name) or element.get(name.lower())

    def _iter_descendants(self, parent: HtmlElement, local_name: str):
        """
        Yield descendants of ``parent`` (in document order, excluding
        ``parent`` itself) whose local tag name equals ``local_name``,
        compared case-insensitively and ignoring namespace or prefix.
        """
        wanted = local_name.lower()
        for node in parent.iter():
            if node is parent:
                continue
            tag = node.tag
            # Skip comments / processing instructions (non-string tags)
            if isinstance(tag, str) and self._get_local_name(tag) == wanted:
                yield node

    def _find_descendant(self, parent: HtmlElement, local_name: str) -> Optional[HtmlElement]:
        """Return the first descendant matching ``local_name``, or None."""
        return next(self._iter_descendants(parent, local_name), None)

    def _initialize_context(self, element: HtmlElement):
        """Load context and unit definitions from the element's document."""
        root = element.getroottree().getroot()

        self._extract_contexts(root)
        self._extract_units(root)

        self._initialized = True

    def _extract_contexts(self, root: HtmlElement):
        """
        Extract all context definitions under ``root`` into ``self.contexts``.

        Each entry holds the context id plus, when present, the entity
        identifier and scheme, the period (instant or start/end dates) and
        the explicit dimension members of the segment.
        """
        for context in self._iter_descendants(root, 'context'):
            context_id = context.get('id')
            if not context_id:
                continue

            context_data: Dict[str, Any] = {'id': context_id}

            # Entity
            entity = self._find_descendant(context, 'entity')
            if entity is not None:
                identifier = self._find_descendant(entity, 'identifier')
                if identifier is not None:
                    context_data['entity'] = identifier.text
                    context_data['scheme'] = identifier.get('scheme')

            # Period: an instant takes precedence over a start/end pair
            period = self._find_descendant(context, 'period')
            if period is not None:
                instant = self._find_descendant(period, 'instant')
                if instant is not None:
                    context_data['instant'] = instant.text
                    context_data['period_type'] = 'instant'
                else:
                    start = self._find_descendant(period, 'startDate')
                    end = self._find_descendant(period, 'endDate')
                    if start is not None and end is not None:
                        context_data['start_date'] = start.text
                        context_data['end_date'] = end.text
                        context_data['period_type'] = 'duration'

            # Dimensions (explicit members only)
            segment = self._find_descendant(context, 'segment')
            if segment is not None:
                dimensions = {}
                for member in self._iter_descendants(segment, 'explicitMember'):
                    dim = member.get('dimension')
                    if dim:
                        dimensions[dim] = member.text
                if dimensions:
                    context_data['dimensions'] = dimensions

            self.contexts[context_id] = context_data

    def _extract_units(self, root: HtmlElement):
        """
        Extract all unit definitions under ``root`` into ``self.units``.

        Ratio units (``divide``) are stored as ``"numerator/denominator"``;
        simple units are stored as their first measure.
        """
        for unit in self._iter_descendants(root, 'unit'):
            unit_id = unit.get('id')
            if not unit_id:
                continue

            # Ratio units must be checked first: a descendant search for
            # "measure" would otherwise match the numerator's measure and
            # report e.g. USD-per-share as plain USD.
            divide = self._find_descendant(unit, 'divide')
            if divide is not None:
                num_parent = self._find_descendant(divide, 'unitNumerator')
                den_parent = self._find_descendant(divide, 'unitDenominator')
                numerator = (
                    self._find_descendant(num_parent, 'measure')
                    if num_parent is not None else None
                )
                denominator = (
                    self._find_descendant(den_parent, 'measure')
                    if den_parent is not None else None
                )

                if numerator is not None and denominator is not None:
                    num_unit = self._normalize_unit(numerator.text)
                    den_unit = self._normalize_unit(denominator.text)
                    self.units[unit_id] = f"{num_unit}/{den_unit}"
                    continue
                # Incomplete divide: fall through and use whatever measure exists

            measure = self._find_descendant(unit, 'measure')
            if measure is not None:
                self.units[unit_id] = self._normalize_unit(measure.text)

    def _normalize_unit(self, unit_text: Optional[str]) -> str:
        """
        Normalize a measure such as ``iso4217:USD`` to a short label.

        Returns '' for missing or blank text; otherwise the part after the
        last ':' with a few well-known names mapped to a canonical form.
        """
        if not unit_text:
            return ''

        # Measure text may carry surrounding whitespace from pretty-printing
        unit_text = unit_text.strip()

        # Remove namespace prefix
        if ':' in unit_text:
            unit_text = unit_text.split(':')[-1]

        # Common normalizations
        unit_map = {
            'usd': 'USD',
            'shares': 'shares',
            'pure': 'pure',
            'percent': '%'
        }

        return unit_map.get(unit_text.lower(), unit_text)

    def _extract_nonfraction(self, element: HtmlElement) -> Dict[str, Any]:
        """Extract ix:nonFraction metadata, omitting attributes that are absent."""
        metadata = {
            'type': 'nonFraction',
            'name': element.get('name'),
            'contextRef': self._get_attr(element, 'contextRef'),
            'unitRef': self._get_attr(element, 'unitRef'),
            'decimals': element.get('decimals'),
            'scale': element.get('scale'),
            'format': element.get('format'),
            'sign': element.get('sign')
        }

        # Clean None values
        return {k: v for k, v in metadata.items() if v is not None}

    def _extract_nonnumeric(self, element: HtmlElement) -> Dict[str, Any]:
        """Extract ix:nonNumeric metadata, omitting attributes that are absent."""
        metadata = {
            'type': 'nonNumeric',
            'name': element.get('name'),
            'contextRef': self._get_attr(element, 'contextRef'),
            'format': element.get('format')
        }

        # Clean None values
        return {k: v for k, v in metadata.items() if v is not None}

    def _extract_continuation(self, element: HtmlElement) -> Dict[str, Any]:
        """
        Extract ix:continuation metadata and record it in ``self.continuations``.

        Returns an empty dict when the element lacks an ``id`` or a
        ``continuedAt`` attribute. If the ``continuedAt`` target is already
        registered, this element's id is aliased to that entry and the
        entry is returned; otherwise a new entry is stored and returned.
        """
        cont_id = element.get('id')
        continued_at = self._get_attr(element, 'continuedAt')

        if not (cont_id and continued_at):
            return {}

        if continued_at in self.continuations:
            # Map continuation to the already-known entry
            original = self.continuations[continued_at]
            self.continuations[cont_id] = original
            return original

        # Store for later resolution
        metadata = {
            'type': 'continuation',
            'id': cont_id,
            'continuedAt': continued_at
        }
        self.continuations[cont_id] = metadata
        return metadata

    def _extract_footnote(self, element: HtmlElement) -> Dict[str, Any]:
        """
        Extract ix:footnote metadata.

        Unlike the fact extractors, absent attributes are kept as None
        values rather than removed.
        """
        return {
            'type': 'footnote',
            'footnoteRole': self._get_attr(element, 'footnoteRole'),
            'footnoteID': self._get_attr(element, 'footnoteID')
        }

    def _extract_fraction(self, element: HtmlElement) -> Dict[str, Any]:
        """
        Extract ix:fraction metadata, including the text of its numerator
        and denominator children when present. Absent values are omitted.
        """
        metadata = {
            'type': 'fraction',
            'name': element.get('name'),
            'contextRef': self._get_attr(element, 'contextRef'),
            'unitRef': self._get_attr(element, 'unitRef')
        }

        numerator = self._find_descendant(element, 'numerator')
        denominator = self._find_descendant(element, 'denominator')

        if numerator is not None:
            metadata['numerator'] = numerator.text
        if denominator is not None:
            metadata['denominator'] = denominator.text

        return {k: v for k, v in metadata.items() if v is not None}

    def _get_fact_value(self, element: HtmlElement) -> str:
        """
        Get the fact value from an element with transformations applied.

        The text of the element and of any nested markup is joined and
        stripped, then the ``format`` transformation, the ``scale`` power of
        ten and the ``sign`` attribute are applied in that order. Values
        that cannot be parsed as numbers are left unscaled.

        NOTE(review): text inside nested ix:exclude elements is not
        filtered out.
        """
        # Facts are frequently wrapped in inline markup (<span>, <b>, ...),
        # so element.text alone can be empty. Strip first so that the
        # transformations and the sign check see the bare value.
        value = ''.join(element.itertext()).strip()

        # Apply format transformation if specified
        format_attr = element.get('format')
        transform = self.TRANSFORMATIONS.get(format_attr) if format_attr else None
        if transform is not None:
            value = transform(value)

        # Apply scale if specified
        scale = element.get('scale')
        if scale:
            try:
                shift = int(scale)
                number = Decimal(value.replace(',', ''))
                # Shift the decimal point exactly (no binary rounding noise
                # such as 7.000000000000001), then render through float to
                # keep the established "1234000.0" output shape.
                value = str(float(number.scaleb(shift)))
            except (ValueError, TypeError, ArithmeticError):
                # Not a number (or scale not an integer): keep text as is.
                # decimal.InvalidOperation is an ArithmeticError.
                pass

        # Apply sign if specified
        if element.get('sign') == '-' and value and not value.startswith('-'):
            value = '-' + value

        return value