# edgartools/venv/lib/python3.10/site-packages/edgar/documents/strategies/table_processing.py

"""
Advanced table processing strategy.
"""

import re
from functools import lru_cache
from typing import List, Optional

from lxml.html import HtmlElement

from edgar.documents.config import ParserConfig
from edgar.documents.strategies.style_parser import StyleParser
from edgar.documents.table_nodes import TableNode, Cell, Row
from edgar.documents.types import TableType


class TableProcessor:
    """
    Advanced table processing with type detection and structure analysis.
    """

    # Literal entity strings that may survive in extracted text, mapped to the
    # plain-text replacement substituted for them. _extract_text applies these
    # with str.replace, one entry at a time, in the insertion order shown here.
    ENTITY_REPLACEMENTS = {
        '&horbar;': '-----',
        '&mdash;': '-----',
        '&ndash;': '---',
        '&minus;': '-',
        '&hyphen;': '-',
        '&dash;': '-',
        '&nbsp;': ' ',
        '&amp;': '&',
        '&lt;': '<',
        '&gt;': '>',
        '&quot;': '"',
        '&apos;': "'",
        '&#8202;': ' ',
        '&#8203;': '',
        '&#x2014;': '-----',
        '&#x2013;': '---',
        '&#x2212;': '-',
    }

    # Financial keywords for table type detection.
    # _detect_table_type tests each keyword as a lowercase substring of the
    # combined caption/header/leading-row text; two or more distinct hits
    # classify the table as financial.
    FINANCIAL_KEYWORDS = {
        'revenue', 'income', 'expense', 'asset', 'liability',
        'cash', 'equity', 'profit', 'loss', 'margin',
        'earnings', 'cost', 'sales', 'operating', 'net',
        'gross', 'total', 'balance', 'statement', 'consolidated',
        'provision', 'tax', 'taxes', 'compensation', 'stock',
        'share', 'shares', 'rsu', 'option', 'grant', 'vest'
    }

    # Metrics keywords.
    # Matched the same way (substring test) by _detect_table_type; a single hit
    # is enough to classify a non-financial table as a metrics table.
    # Note that 'total' appears in both keyword sets.
    METRICS_KEYWORDS = {
        'ratio', 'percentage', 'percent', '%', 'rate',
        'growth', 'change', 'increase', 'decrease',
        'average', 'median', 'total', 'count', 'number'
    }

    def __init__(self, config: ParserConfig):
        """
        Create a table processor.

        Args:
            config: Parser configuration; its ``detect_table_types`` and
                ``extract_table_relationships`` flags are consulted by
                ``process``.
        """
        # One StyleParser instance is reused for every table and cell style.
        self.style_parser = StyleParser()
        self.config = config

    def process(self, element: HtmlElement) -> TableNode:
        """
        Convert an HTML table element into a populated TableNode.

        The node receives the parsed inline style, the ``id`` and ``class``
        attributes as metadata (when present), the caption and ``summary``
        (when present), and the header/body/footer rows. Type detection and
        relationship extraction run only when enabled in the configuration.

        Args:
            element: HTML table element

        Returns:
            Processed TableNode
        """
        node = TableNode(style=self.style_parser.parse(element.get('style', '')))

        # Rendering code reads the configuration back from the node.
        node._config = self.config

        # Attribute-derived metadata is recorded only when non-empty.
        identifier = element.get('id')
        if identifier:
            node.set_metadata('id', identifier)

        class_names = element.get('class', '').split()
        if class_names:
            node.set_metadata('classes', class_names)

        caption_element = element.find('.//caption')
        if caption_element is not None:
            node.caption = self._extract_text(caption_element)

        summary_text = element.get('summary')
        if summary_text:
            node.summary = summary_text

        # Rows: headers, body and footer.
        self._process_table_structure(element, node)

        # Optional analysis passes, driven by configuration.
        if self.config.detect_table_types:
            node.table_type = self._detect_table_type(node)
        if self.config.extract_table_relationships:
            self._extract_relationships(node)

        return node

    def _process_table_structure(self, element: HtmlElement, table: TableNode):
        """
        Populate ``table.headers``, ``table.rows`` and ``table.footer``.

        Rows inside the first ``<thead>`` are always headers and rows inside
        the first ``<tfoot>`` are always footer rows. Every other ``<tr>``
        under ``element`` is visited in document order, whichever ``<tbody>``
        (if any) contains it, so tables with several ``<tbody>`` sections keep
        all their rows and ``<tfoot>`` rows are never also counted as data.

        Until the first real data row is seen, each row is tested with
        ``_is_header_row`` (plus a continuation check for unit/year rows that
        follow an existing header) so multi-row headers are captured.

        Args:
            element: HTML table element
            table: TableNode to fill in place
        """
        thead = element.find('.//thead')
        tfoot = element.find('.//tfoot')
        thead_rows = thead.findall('.//tr') if thead is not None else []
        tfoot_rows = tfoot.findall('.//tr') if tfoot is not None else []

        # Rows inside <thead> are headers by definition.
        for tr in thead_rows:
            cells = self._process_row(tr, is_header=True)
            if cells:
                table.headers.append(cells)

        # These rows are handled by the dedicated thead/tfoot passes and must
        # not be picked up again by the body scan below.
        section_rows = set(thead_rows).union(tfoot_rows)

        headers_found = bool(table.headers)
        data_rows_started = False

        for tr in element.findall('.//tr'):
            if tr in section_rows:
                continue

            # Header detection stays active until real data has been seen, so
            # that multi-row headers are collected in full.
            is_header_row = False
            if not data_rows_started:
                is_header_row = self._is_header_row(tr)
                if headers_found and not is_header_row:
                    is_header_row = self._continues_header(
                        tr.text_content().strip(), table.headers
                    )

            cells = self._process_row(tr, is_header=is_header_row)
            if not cells:
                continue

            if is_header_row:
                table.headers.append(cells)
                headers_found = True
                continue

            table.rows.append(Row(cells=cells, is_header=False))

            # Empty rows and header-like descriptive rows (units, periods,
            # years) must not end header detection; anything else does.
            if any(cell.text().strip() for cell in cells):
                row_text = ' '.join(cell.text().strip() for cell in cells).strip()
                if not self._is_header_related_text(row_text):
                    data_rows_started = True

        # Rows inside <tfoot> become footer rows.
        for tr in tfoot_rows:
            cells = self._process_row(tr, is_header=False)
            if cells:
                table.footer.append(Row(cells=cells, is_header=False))

    @staticmethod
    def _continues_header(row_text: str, headers) -> bool:
        """Return True if a row rejected by _is_header_row still extends the current header block."""
        # A units line such as "(in millions)" belongs to the header.
        if any(unit in row_text for unit in ('(in millions)', '(in thousands)', '(in billions)')):
            return True
        if not headers:
            return False
        # A row of years directly below a "Year(s) Ended" header is part of it.
        last_header_text = ' '.join(cell.text() for cell in headers[-1]).lower()
        if 'year ended' in last_header_text or 'years ended' in last_header_text:
            return bool(re.search(r'\b(19\d{2}|20\d{2})\b', row_text))
        return False

    @staticmethod
    def _is_header_related_text(row_text: str) -> bool:
        """Return True if non-empty row text looks like header decoration rather than data."""
        row_text_lower = row_text.lower()
        return (
            # Unit descriptions
            '(in millions)' in row_text_lower or
            '(in thousands)' in row_text_lower or
            '(in billions)' in row_text_lower or
            'except per share' in row_text_lower or
            # Financial period descriptions
            'year ended' in row_text_lower or
            'months ended' in row_text_lower or
            # Mostly just spacing/formatting
            len(row_text.strip()) < 5 or
            # Contains years (might be a misclassified header)
            bool(re.search(r'\b(19\d{2}|20\d{2})\b', row_text))
        )

    def _process_row(self, tr: HtmlElement, is_header: bool) -> List[Cell]:
        """Process table row into cells."""
        cells = []

        # Process both td and th elements
        for cell_elem in tr.findall('.//td') + tr.findall('.//th'):
            cell = self._process_cell(cell_elem, is_header or cell_elem.tag == 'th')
            if cell:
                cells.append(cell)

        return cells

    def _process_cell(self, elem: HtmlElement, is_header: bool) -> Optional[Cell]:
        """
        Build a Cell from a ``td``/``th`` element.

        Args:
            elem: Cell element
            is_header: Whether the cell should be flagged as a header cell

        Returns:
            Cell carrying the extracted text, span sizes and alignment. An
            inline ``text-align`` style takes precedence over the ``align``
            attribute.
        """
        # Span attributes in real-world filings are not always clean integers
        # (e.g. "", "2px", "100%"); a malformed value must not abort parsing
        # of the whole table.
        colspan = self._parse_span(elem.get('colspan', '1'))
        rowspan = self._parse_span(elem.get('rowspan', '1'))
        align = elem.get('align')

        # Inline style overrides the legacy align attribute.
        style = self.style_parser.parse(elem.get('style', ''))
        if style.text_align:
            align = style.text_align

        content = self._extract_cell_content(elem)

        return Cell(
            content=content,
            colspan=colspan,
            rowspan=rowspan,
            is_header=is_header,
            align=align
        )

    @staticmethod
    def _parse_span(value: Optional[str]) -> int:
        """Parse a colspan/rowspan attribute leniently, returning 1 when it has no leading integer."""
        # Mirrors HTML's lenient integer parsing: optional whitespace, optional
        # '+', then leading digits; trailing garbage is ignored.
        match = re.match(r'\s*\+?(\d+)', value or '')
        return int(match.group(1)) if match else 1

    def _extract_cell_content(self, elem: HtmlElement) -> str:
        """Extract and clean cell content."""
        # Check for nested structure
        divs = elem.findall('.//div')
        if divs and len(divs) > 1:
            # Multiple divs - likely multi-line content
            lines = []
            for div in divs:
                text = self._extract_text(div)
                if text:
                    lines.append(text)
            return '\n'.join(lines)

        # Handle line breaks
        for br in elem.findall('.//br'):
            br.tail = '\n' + (br.tail or '')

        # Extract text
        text = self._extract_text(elem)

        return text

    def _extract_text(self, elem: HtmlElement) -> str:
        """Extract and clean text from element."""
        # Use itertext() to get all text fragments
        # This preserves spaces better than text_content()
        text_parts = []
        for text in elem.itertext():
            if text:
                text_parts.append(text)

        # Join parts, ensuring we don't lose spaces
        # If a part doesn't end with whitespace and the next doesn't start with whitespace,
        # we need to add a space between them
        if not text_parts:
            return ''

        result = []
        for i, part in enumerate(text_parts):
            if i == 0:
                result.append(part)
            else:
                prev_part = text_parts[i-1]
                # Check if we need to add a space between parts
                # Don't add space if previous ends with space or current starts with space
                if prev_part and part:
                    if not prev_part[-1].isspace() and not part[0].isspace():
                        # Check for punctuation that shouldn't have space before it
                        if part[0] not in ',.;:!?%)]':
                            result.append(' ')
                result.append(part)

        text = ''.join(result)

        # Replace entities
        for entity, replacement in self.ENTITY_REPLACEMENTS.items():
            text = text.replace(entity, replacement)

        # Clean whitespace
        text = text.strip()

        # Normalize internal whitespace but preserve line breaks
        lines = text.split('\n')
        cleaned_lines = []
        for line in lines:
            # Collapse multiple spaces to single space
            line = ' '.join(line.split())
            cleaned_lines.append(line)

        return '\n'.join(cleaned_lines)

    @staticmethod
    @lru_cache(maxsize=1)
    def _get_period_header_pattern():
        """
        Compile comprehensive regex for financial period headers.
        Adapted from old parser's proven patterns.

        Returns:
            Compiled regex pattern matching financial period headers
        """
        # Base components
        periods = r'(?:three|six|nine|twelve|[1-4]|first|second|third|fourth)'
        timeframes = r'(?:month|quarter|year|week)'
        ended_variants = r'(?:ended|ending|end|period)'
        as_of_variants = r'(?:as\s+of|at|as\s+at)'

        # Date pattern
        months = r'(?:january|february|march|april|may|june|july|august|september|october|november|december|jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)'
        day = r'\d{1,2}'
        year = r'(?:19|20)\d{2}'
        date = f'{months}\\s*\\.?\\s*{day}\\s*,?\\s*{year}'

        # Combined patterns
        patterns = [
            # Standard period headers
            f'{periods}\\s+{timeframes}\\s+{ended_variants}(?:\\s+{date})?',
            f'(?:fiscal\\s+)?{timeframes}\\s+{ended_variants}',
            f'{timeframes}\\s+{ended_variants}(?:\\s+{date})?',

            # Balance sheet date headers
            f'{as_of_variants}\\s+{date}',

            # Multiple date sequences
            f'{date}(?:\\s*(?:and|,)\\s*{date})*',

            # Single dates
            f'(?:{ended_variants}\\s+)?{date}'
        ]

        pattern = '|'.join(f'(?:{p})' for p in patterns)
        return re.compile(pattern, re.IGNORECASE)

    def _is_header_row(self, tr: HtmlElement) -> bool:
        """
        Detect if row is likely a header row in SEC filings.

        The checks below run in order and the first one that reaches a
        decision wins, so their sequence matters:

        1. Any ``th`` descendant -> header.
        2. No ``td`` descendants -> not a header.
        3. A "Month D, YYYY - Month D, YYYY" date range together with currency,
           decimal or thousands-separated numbers -> data row.
        4. Two or more year tokens in the row text that are not all the same
           year, and no 'total' in the first 20 characters -> header.
        5. Two or more cells that contain only a year, or one such cell plus a
           cell containing 'june 30' / 'december 31', and no 'total' in the
           first 20 characters -> header.
        6. The period-header regex matches and no data-like number is present
           -> header.
        7. A units note such as "(in millions)" -> header.
        8. Rows mentioning 'fiscal': currency or thousands-separated numbers,
           or text that is exactly "fiscal YYYY" -> data row; 'fiscal year'
           with 'ended'/'ending' -> header.
        9. Any period keyword and no data-like number -> header.
        10. Any column-descriptor keyword without 'total' in the first 30
            characters: longer than 150 characters or containing data-like
            numbers -> data row, otherwise header.
        11. Every cell is bold -> header.
        12. At least three text cells and more than twice as many text cells
            as numeric cells: a data-label phrase in the first 50 characters
            -> data row; otherwise header unless the row starts with 'total'.
        13. Otherwise -> not a header.

        Keyword checks are plain substring tests on the lowercased row text.

        Args:
            tr: Row element to classify

        Returns:
            True if the row looks like a header row, False otherwise
        """
        # Check if contains th elements (most reliable indicator)
        if tr.find('.//th') is not None:
            return True

        cells = tr.findall('.//td')
        if not cells:
            return False

        # Get row text for analysis
        row_text = tr.text_content()
        row_text_lower = row_text.lower()

        # Check for date ranges with financial data (Oracle Table 6 pattern)
        # Date ranges like "March 1, 2024—March 31, 2024" should be data rows, not headers
        date_range_pattern = r'(?:january|february|march|april|may|june|july|august|september|october|november|december)\s+\d{1,2},\s*\d{4}\s*[—–-]\s*(?:january|february|march|april|may|june|july|august|september|october|november|december)\s+\d{1,2},\s*\d{4}'
        has_date_range = bool(re.search(date_range_pattern, row_text_lower))

        # Check for financial data indicators
        has_currency = bool(re.search(r'\$[\s]*[\d,\.]+', row_text))
        has_decimals = bool(re.search(r'\b\d+\.\d+\b', row_text))
        has_large_numbers = bool(re.search(r'\b\d{1,3}(,\d{3})+\b', row_text))

        # If row has date range + financial data, it's definitely a data row
        if has_date_range and (has_currency or has_decimals or has_large_numbers):
            return False

        # Check for year patterns (very common in financial headers)
        year_pattern = r'\b(19\d{2}|20\d{2})\b'
        years_found = re.findall(year_pattern, row_text)
        if len(years_found) >= 2:  # Multiple years suggest header row
            # IMPORTANT: Check for date ranges and same-year repetition
            # Date ranges like "March 1, 2024—March 31, 2024" contain the same year twice
            # but are data rows, not multi-year comparison headers

            # If all years are the same (date range pattern)
            if len(set(years_found)) == 1:
                # Same year repeated - likely a date range like "Jan 1, 2024 - Mar 31, 2024"
                # Not a multi-year comparison header
                pass  # Don't return True
            # Multiple different years suggest multi-year comparison header
            elif 'total' not in row_text_lower[:20]:  # Check first 20 chars
                return True

        # Enhanced year detection - check individual cells for year patterns
        # This handles cases where years are in separate cells
        year_cells = 0
        date_phrases = 0
        for cell in cells:
            cell_text = cell.text_content().strip()
            if cell_text:
                # Check for individual years
                if re.match(r'^\s*(19\d{2}|20\d{2})\s*$', cell_text):
                    year_cells += 1
                # Check for date phrases like "June 30, 2025"
                # NOTE(review): only these two period-end dates are recognised here
                elif 'june 30' in cell_text.lower() or 'december 31' in cell_text.lower():
                    date_phrases += 1

        # If we have multiple year cells or year + date phrases, likely a header
        if year_cells >= 2 or (year_cells >= 1 and date_phrases >= 1):
            if 'total' not in row_text_lower[:20]:
                return True

        # Check for comprehensive financial period patterns (from old parser)
        period_pattern = self._get_period_header_pattern()
        if period_pattern.search(row_text_lower):
            # Additional validation: ensure it's not a data row with period text
            # Check for absence of strong data indicators
            # (currency amounts, thousands-separated numbers, arithmetic, parenthesised numbers)
            data_pattern = r'(?:\$\s*\d|\d+(?:,\d{3})+|\d+\s*[+\-*/]\s*\d+|\(\s*\d+(?:,\d{3})*\s*\))'
            if not re.search(data_pattern, row_text):
                return True

        # Check for units notation (in millions, thousands, billions)
        units_pattern = r'\(in\s+(?:millions|thousands|billions)\)'
        if re.search(units_pattern, row_text_lower):
            return True

        # Check for period indicators (quarters, months)
        # But be careful with "fiscal" - it could be data like "Fiscal 2025"
        period_keywords = ['quarter', 'q1', 'q2', 'q3', 'q4', 'month',
                          'january', 'february', 'march', 'april', 'may', 'june',
                          'july', 'august', 'september', 'october', 'november', 'december',
                          'ended', 'three months', 'six months', 'nine months']

        # Special handling for "fiscal" - only treat as header if it's part of a phrase like "fiscal year ended"
        if 'fiscal' in row_text_lower:
            # Check if row has numeric values (suggests it's data, not header)
            # Look for patterns like "Fiscal 2025 $10,612"
            has_currency_values = bool(re.search(r'\$[\s]*[\d,]+', row_text))
            has_large_numbers = bool(re.search(r'\b\d{1,3}(,\d{3})+\b', row_text))

            # If it has currency or large numbers, it's likely data
            if has_currency_values or has_large_numbers:
                return False

            # Check if it's just "Fiscal YYYY" which is likely data, not a header
            fiscal_year_only = re.match(r'^\s*fiscal\s+\d{4}\s*$', row_text_lower.strip())
            if fiscal_year_only:
                return False  # This is data, not a header

            # Check for header-like phrases with fiscal
            if 'fiscal year' in row_text_lower and ('ended' in row_text_lower or 'ending' in row_text_lower):
                return True

        # Substring test: a keyword also matches inside longer words
        if any(keyword in row_text_lower for keyword in period_keywords):
            # Validate it's not a data row with period keywords
            # Check for strong data indicators
            data_pattern = r'(?:\$\s*\d|\d+(?:,\d{3})+|\d+\.\d+|[(]\s*\d+(?:,\d{3})*\s*[)])'
            if not re.search(data_pattern, row_text):
                return True

        # Check for column descriptors (but NOT total)
        # These are words commonly found in headers but not data rows
        header_keywords = ['description', 'item', 'category', 'type', 'classification',
                          'change', 'percent', 'increase', 'decrease', 'variance']
        if any(keyword in row_text_lower for keyword in header_keywords):
            # Make sure it's not a total row
            if 'total' not in row_text_lower[:30]:
                # Additional validation: long narrative text is not a header
                # Headers are typically concise (< 150 chars)
                if len(row_text) > 150:
                    return False
                # Check for data indicators (would indicate data row, not header)
                data_pattern = r'(?:\$\s*\d|\d+(?:,\d{3})+|\d+\.\d+|[(]\s*\d+(?:,\d{3})*\s*[)])'
                if re.search(data_pattern, row_text):
                    return False
                return True

        # Check if all cells are bold (common header formatting)
        # A cell counts as bold via an inline font-weight style or a <b>/<strong> descendant
        bold_count = 0
        for cell in cells:
            style = cell.get('style', '')
            if 'font-weight' in style and 'bold' in style:
                bold_count += 1
            elif cell.find('.//b') is not None or cell.find('.//strong') is not None:
                bold_count += 1

        # Only consider it a header if ALL cells are bold (not just some)
        if bold_count == len(cells) and bold_count > 0:
            return True

        # Check content type ratio - headers usually have more text than numbers
        # Count cells with primarily text vs primarily numbers
        # (empty cells are counted in neither group)
        text_cells = 0
        number_cells = 0
        for cell in cells:
            cell_text = cell.text_content().strip()
            if cell_text:
                # Remove common symbols for analysis
                clean_text = cell_text.replace('$', '').replace('%', '').replace(',', '').replace('(', '').replace(')', '')
                if clean_text.replace('.', '').replace('-', '').strip().isdigit():
                    number_cells += 1
                else:
                    text_cells += 1

        # Be very careful about treating text-heavy rows as headers
        # Many data rows start with text labels (e.g., "Impact of...", "Effect of...")
        # Only consider it a header if it has mostly text AND doesn't look like a data label
        if text_cells > number_cells * 2 and text_cells >= 3:
            # Check for common data row patterns
            data_row_indicators = [
                'impact of', 'effect of', 'adjustment', 'provision for', 'benefit',
                'expense', 'income from', 'loss on', 'gain on', 'charge', 'credit',
                'earnings', 'computed', 'state taxes', 'research', 'excess tax'
            ]

            # If any of these appears within the first 50 characters, it's likely a data row, not a header
            for indicator in data_row_indicators:
                if row_text_lower.startswith(indicator) or indicator in row_text_lower[:50]:
                    return False

            # Also not a header if it starts with "total"
            if not row_text_lower.startswith('total'):
                return True

        return False

    def _detect_table_type(self, table: TableNode) -> TableType:
        """
        Classify a table from its caption, headers and first three body rows.

        Checks run in a fixed order and the first match wins: financial
        (two or more financial keywords), metrics (any metrics keyword or
        more than 30% numeric body cells, only when the table has body
        cells), table of contents, exhibit index, reference, and finally
        general.
        """
        # Lowercased text sample used for all keyword checks.
        fragments = []
        if table.caption:
            fragments.append(table.caption.lower())
        fragments.extend(
            cell.text().lower() for header_row in table.headers for cell in header_row
        )
        fragments.extend(
            cell.text().lower() for row in table.rows[:3] for cell in row.cells
        )
        sample = ' '.join(fragments)

        def keyword_hits(keywords):
            # Number of distinct keywords occurring as substrings of the sample.
            return sum(1 for keyword in keywords if keyword in sample)

        # Financial: at least two keyword hits.
        if keyword_hits(self.FINANCIAL_KEYWORDS) >= 2:
            return TableType.FINANCIAL

        # Metrics: one keyword hit, or a sufficiently numeric body.
        metrics_hits = keyword_hits(self.METRICS_KEYWORDS)
        body_cells = [cell for row in table.rows for cell in row.cells]
        numeric_total = sum(1 for cell in body_cells if cell.is_numeric)
        cell_total = len(body_cells)
        if cell_total > 0 and (metrics_hits >= 1 or numeric_total / cell_total > 0.3):
            return TableType.METRICS

        # Table of contents: mentions contents/index and has short numbers
        # that could be page numbers.
        if 'content' in sample or 'index' in sample:
            if any(re.search(r'\b\d{1,3}\b', cell.text()) for cell in body_cells):
                return TableType.TABLE_OF_CONTENTS

        # Exhibit index.
        if 'exhibit' in sample:
            return TableType.EXHIBIT_INDEX

        # Reference material (citations, definitions, etc.).
        reference_words = ['reference', 'definition', 'glossary', 'citation']
        if any(word in sample for word in reference_words):
            return TableType.REFERENCE

        return TableType.GENERAL

    def _extract_relationships(self, table: TableNode):
        """Extract relationships within table data."""
        # This would implement relationship extraction
        # For now, just set a flag that relationships were processed
        table.set_metadata('relationships_extracted', True)

        # Example relationships to extract:
        # - Parent-child relationships (indented rows)
        # - Total rows that sum other rows
        # - Cross-references between cells
        # - Time series relationships

        # Detect total rows
        total_rows = []
        for i, row in enumerate(table.rows):
            if row.is_total_row:
                total_rows.append(i)

        if total_rows:
            table.set_metadata('total_rows', total_rows)

        # Detect indentation patterns (parent-child)
        indentation_levels = []
        for row in table.rows:
            if row.cells:
                first_cell_text = row.cells[0].text()
                # Count leading spaces
                indent = len(first_cell_text) - len(first_cell_text.lstrip())
                indentation_levels.append(indent)

        if any(level > 0 for level in indentation_levels):
            table.set_metadata('has_hierarchy', True)
            table.set_metadata('indentation_levels', indentation_levels)