edgartools/venv/lib/python3.10/site-packages/edgar/documents/table_nodes.py

"""
Table-related nodes for the document tree.
"""

import html
import math
from dataclasses import dataclass, field
from typing import List, Optional, Dict, Any, Union

import pandas as pd
from rich import box
from rich.table import Table as RichTable

from edgar.richtools import rich_to_text
from edgar.documents.nodes import Node
from edgar.documents.types import NodeType, TableType
from edgar.documents.cache_mixin import CacheableMixin
from edgar.documents.table_utils import process_table_matrix


@dataclass
class Cell:
    """
    Table cell representation.

    Attributes:
        content: Cell payload, either plain text or a document ``Node``.
        colspan: Number of columns the cell spans (emitted in HTML when > 1).
        rowspan: Number of rows the cell spans (emitted in HTML when > 1).
        is_header: When true the cell is rendered as ``<th>`` instead of ``<td>``.
        align: Optional value for the HTML ``align`` attribute.
    """
    content: Union[str, Node]
    colspan: int = 1
    rowspan: int = 1
    is_header: bool = False
    align: Optional[str] = None

    # Placeholder strings that count as numeric and evaluate to 0.0.
    # Deliberately left un-annotated so @dataclass does not treat it as a field.
    _PLACEHOLDERS = frozenset({'—', '–', '-', '--', 'N/A', 'n/a', 'NM', 'nm'})

    def text(self) -> str:
        """Extract text from cell ('' when content is neither ``str`` nor ``Node``)."""
        if isinstance(self.content, str):
            return self.content
        elif isinstance(self.content, Node):
            return self.content.text()
        return ''

    def html(self) -> str:
        """
        Generate cell HTML.

        The cell text and the ``align`` value are HTML-escaped so that
        characters such as ``&``, ``<`` and ``"`` cannot break the markup.
        """
        tag = 'th' if self.is_header else 'td'

        attrs = []
        if self.colspan > 1:
            attrs.append(f'colspan="{self.colspan}"')
        if self.rowspan > 1:
            attrs.append(f'rowspan="{self.rowspan}"')
        if self.align:
            attrs.append(f'align="{html.escape(str(self.align), quote=True)}"')

        attr_str = ' ' + ' '.join(attrs) if attrs else ''
        body = html.escape(self.text(), quote=False)
        return f'<{tag}{attr_str}>{body}</{tag}>'

    @staticmethod
    def _parse_number(text: str) -> Optional[float]:
        """
        Parse formatted numeric text into a float.

        Thousands separators, ``$`` and ``%`` are removed and an opening
        parenthesis is read as a minus sign. Returns ``None`` when the text
        is not a number or parses to a non-finite value.
        """
        cleaned = text.replace(',', '').replace('$', '').replace('%', '')
        cleaned = cleaned.replace('(', '-').replace(')', '')
        try:
            value = float(cleaned)
        except ValueError:
            return None
        # float() happily parses 'nan', 'inf' and overflowing literals such as
        # '1e999'; none of those are usable table values.
        return value if math.isfinite(value) else None

    @property
    def is_numeric(self) -> bool:
        """Check if cell contains numeric data (placeholders like '—' count as numeric)."""
        text = self.text().strip()
        if not text:
            return False
        if text in self._PLACEHOLDERS:
            return True
        return self._parse_number(text) is not None

    @property
    def numeric_value(self) -> Optional[float]:
        """
        Get numeric value if cell is numeric.

        Returns:
            0.0 for placeholder text, the parsed float for numeric text,
            otherwise ``None``.
        """
        text = self.text().strip()
        if not text:
            return None
        if text in self._PLACEHOLDERS:
            return 0.0
        return self._parse_number(text)


@dataclass
class Row:
    """
    Table row representation.

    Attributes:
        cells: Cells of the row, in column order.
        is_header: Flag marking the row as a header row.
    """
    cells: List[Cell]
    is_header: bool = False

    def text(self) -> str:
        """Return the cell texts joined with ' | '."""
        parts = [cell.text() for cell in self.cells]
        return ' | '.join(parts)

    def html(self) -> str:
        """Return the row as a ``<tr>`` element wrapping each cell's HTML."""
        rendered = [cell.html() for cell in self.cells]
        return '<tr>' + ''.join(rendered) + '</tr>'

    @property
    def is_numeric_row(self) -> bool:
        """True when strictly more than half of the cells are numeric."""
        numeric_cells = [cell for cell in self.cells if cell.is_numeric]
        return 2 * len(numeric_cells) > len(self.cells)

    @property
    def is_total_row(self) -> bool:
        """
        Heuristically detect a total row.

        Only the first cell is inspected: its lower-cased, stripped text must
        either equal a total keyword or start with a keyword followed by a
        space (e.g. "Total revenue").
        """
        if not self.cells:
            return False

        label = self.cells[0].text().lower().strip()
        keywords = ('total', 'sum', 'subtotal', 'grand total', 'net total')
        keyword_prefixes = tuple(f'{keyword} ' for keyword in keywords)

        return label in keywords or label.startswith(keyword_prefixes)


@dataclass
class TableNode(Node, CacheableMixin):
    """
    Table node with structured data.

    Supports complex table structures with multi-level headers,
    merged cells, and semantic understanding.
    """
    type: NodeType = field(default=NodeType.TABLE, init=False)
    headers: List[List[Cell]] = field(default_factory=list)
    rows: List[Row] = field(default_factory=list)
    footer: List[Row] = field(default_factory=list)
    table_type: TableType = TableType.GENERAL

    # Table metadata
    caption: Optional[str] = None
    summary: Optional[str] = None

    @property
    def semantic_type(self) -> TableType:
        """Get semantic type of table (read alias that returns ``self.table_type``)."""
        return self.table_type

    @semantic_type.setter
    def semantic_type(self, value: TableType) -> None:
        """Set semantic type of table (write alias that assigns ``self.table_type``)."""
        self.table_type = value

    def text(self) -> str:
        """
        Return the table as plain text.

        The text is produced by a generator callback handed to
        ``_get_cached_text`` (presumably supplied by ``CacheableMixin`` and
        responsible for caching - confirm there). The callback uses the fast
        renderer when ``self._config.fast_table_rendering`` is truthy and the
        Rich renderer (at width 195) otherwise.
        """
        def _build_text():
            settings = getattr(self, '_config', None)
            if settings and getattr(settings, 'fast_table_rendering', False):
                return self._fast_text_rendering()
            # Default path: render through Rich and flatten to text
            return rich_to_text(self.render(width=195))

        return self._get_cached_text(_build_text)

    def _fast_text_rendering(self) -> str:
        """
        Render this table to text through ``FastTableRenderer``.

        The renderer is configured with ``TableStyle.simple()``, described in
        the original notes as a clean, borderless look matching Rich's
        ``box.SIMPLE`` (no outer borders or column separators, a single rule
        under the header, space-separated columns) and intended for
        performance-critical use.

        Returns:
            Text produced by ``FastTableRenderer.render_table_node``.
        """
        # Imported lazily, as in the rest of this class's rendering helpers
        from edgar.documents.renderers.fast_table import FastTableRenderer, TableStyle

        simple_style = TableStyle.simple()
        return FastTableRenderer(simple_style).render_table_node(self)


    def _fix_header_misclassification(self):
        """
        Compatibility hook that intentionally does nothing.

        Rows are NOT reordered here because that would change the structure
        of the filing; rendering is expected to cope with misclassified
        headers on its own.
        """
        # Intentional no-op: see docstring.
        return None

    def render(self, width: Optional[int] = None) -> RichTable:
        """
        Render table using rich.table.Table for beautiful console output.

        Side effect: when the table has several header rows of different
        lengths, the shorter rows in ``self.headers`` are padded in place
        with empty cells.

        Numeric data cells are re-formatted with thousands separators and
        parentheses for negatives. The number of fractional digits present in
        the source text is preserved (e.g. ``1.25`` stays ``1.25``), four-digit
        years are left without separators, placeholder cells and cells whose
        text starts with ``$`` are left untouched.

        Args:
            width: Optional target width. It is never forced onto the table;
                values above 50 only select the width-aware column sizing
                strategy, anything else uses purely content-based widths.

        Returns:
            Rich Table object for console rendering
        """
        from edgar.documents.utils.table_matrix import TableMatrix

        placeholders = ('—', '–', '-', '--', 'N/A', 'n/a', 'NM', 'nm')

        def format_cell_text(cell) -> str:
            """Return the display text for one data cell."""
            text = cell.text()
            if not cell.is_numeric or text.startswith('$'):
                return text

            stripped = text.strip()
            if stripped in placeholders:
                # Keep original text for em dashes and similar placeholders
                return text

            try:
                num_val = cell.numeric_value
                if num_val is None:
                    return text

                bare = stripped.replace('%', '')
                is_likely_year = (
                    1900 <= num_val <= 2100
                    and num_val == int(num_val)
                    and len(bare) == 4
                    and bare.isdigit()
                )
                if is_likely_year:
                    # Don't format years with thousands separator
                    formatted = str(int(num_val))
                else:
                    # Keep as many fractional digits as the source text shows
                    _, dot, fraction = stripped.rstrip('%) ').rpartition('.')
                    decimals = len(fraction) if dot and fraction.isdigit() else 0
                    magnitude = f"{abs(num_val):,.{decimals}f}"
                    formatted = f"({magnitude})" if num_val < 0 else magnitude
            except (ValueError, OverflowError):
                # Best effort: fall back to the raw text if conversion fails
                return text

            # Re-add percentage symbol if it was there
            return f"{formatted}%" if text.endswith('%') else formatted

        # Fix header misclassification issues before rendering
        self._fix_header_misclassification()

        # Normalize header row lengths to prevent alignment issues.
        # When header rows have different cell counts (e.g., 14 vs 17 cells),
        # the rendering can misalign columns. Pad shorter rows with empty cells.
        if self.headers and len(self.headers) > 1:
            max_header_cols = max(len(h) for h in self.headers)
            for header_row in self.headers:
                padding_needed = max_header_cols - len(header_row)
                if padding_needed > 0:
                    header_row.extend(Cell(content='') for _ in range(padding_needed))

        # Build matrix to handle colspan/rowspan WITHOUT merging currencies
        # Old parser keeps $ as separate cells to maintain alignment
        matrix = TableMatrix()
        clean_matrix = process_table_matrix(matrix, self.headers, self.rows)

        # Use minimal padding when we have symbol columns
        has_symbols = self._has_symbol_columns(clean_matrix)
        padding_config = (0, 0) if has_symbols else (0, 1)

        # Don't force table to full width - let it be compact based on content
        table = RichTable(
            title=self.caption if self.caption else None,
            box=box.SIMPLE,
            border_style="blue",
            header_style="bold cyan",
            padding=padding_config,  # Minimal padding for tables with symbols
            collapse_padding=True,
            width=None,  # Let Rich auto-size based on column widths
            show_header=bool(self.headers),
            show_footer=bool(self.footer)
        )

        # Detect column alignments
        column_alignments = self._detect_column_alignments(clean_matrix)

        # Use smart widths if a reasonable target width is specified,
        # otherwise use content-based widths for a naturally compact table
        if width and width > 50:
            calculated_widths = self._calculate_smart_widths(clean_matrix, table_width=width)
        else:
            calculated_widths = self._calculate_optimal_content_widths(clean_matrix)

        # Work out one title per column
        if self.headers:
            # Expand each header row once instead of once per column
            header_rows = (
                [clean_matrix.get_expanded_row(row_idx) for row_idx in range(len(self.headers))]
                if clean_matrix.col_count else []
            )

            # Merge all header rows into single headers with newlines (like old parser)
            column_titles = []
            for col_idx in range(clean_matrix.col_count):
                header_parts = []
                for expanded_row in header_rows:
                    if col_idx < len(expanded_row):
                        cell = expanded_row[col_idx]
                        if cell:
                            text = cell.text().strip()
                            # Skip empty cells and lone $ symbols (like old parser)
                            if text and text != '$':
                                header_parts.append(text)
                column_titles.append('\n'.join(header_parts))
            overflow = "fold"  # Wrap text instead of truncating
        else:
            # No headers, create generic columns
            column_titles = [f"Col{col_idx+1}" for col_idx in range(clean_matrix.col_count)]
            overflow = "ellipsis"

        for col_idx, title in enumerate(column_titles):
            # A column is only declared when a width was calculated for it
            if col_idx >= len(calculated_widths):
                continue
            alignment = column_alignments[col_idx] if col_idx < len(column_alignments) else "left"
            table.add_column(
                header=title,
                justify=alignment,
                vertical="middle",
                width=calculated_widths[col_idx],
                overflow=overflow
            )

        # Add data rows (everything after the header rows in the matrix)
        start_row = len(self.headers) if self.headers else 0
        for row_idx in range(start_row, clean_matrix.row_count):
            expanded_row = clean_matrix.get_expanded_row(row_idx)
            row_data = [
                format_cell_text(cell) if cell is not None else ""
                for cell in expanded_row
            ]
            # Add row without special styling
            table.add_row(*row_data)

        # Add footer rows if present
        if self.footer:
            for row in self.footer:
                footer_data = [cell.text() for cell in row.cells]
                table.add_row(*footer_data, style="dim italic")

        return table

    def _has_symbol_columns(self, matrix) -> bool:
        """
        Report whether any data column looks like a pure symbol column.

        Header rows are ignored. A column qualifies when it has at least one
        non-blank data cell and none of its non-blank cells is longer than two
        characters; note that this means any text of two characters or fewer
        is treated like a symbol such as ``$`` or ``%``.

        Args:
            matrix: Table matrix exposing ``col_count``, ``row_count`` and
                ``get_cell(row, col)``.

        Returns:
            True as soon as one such column is found, otherwise False.
        """
        symbols = ['$', '%', '€', '£', '¥', '—', '-', '–', '(', ')']
        first_data_row = len(self.headers) if self.headers else 0

        for col in range(matrix.col_count):
            saw_text = False
            only_symbols = True

            for row in range(first_data_row, matrix.row_count):
                cell = matrix.get_cell(row, col)
                if not cell:
                    continue
                value = cell.text().strip()
                if not value:
                    continue

                saw_text = True
                # Anything longer than two characters that is not a known
                # symbol disqualifies the column
                if len(value) > 2 and value not in symbols:
                    only_symbols = False
                    break

            if saw_text and only_symbols:
                return True

        return False

    def _calculate_newline_safe_width(self, text: str, base_width: int) -> int:
        """
        Widen a column so multi-line text keeps its existing line breaks.

        Text without a newline (or empty text) leaves ``base_width``
        unchanged. Otherwise the result is the larger of ``base_width`` and
        the longest line plus a 4-character buffer, intended to stop Rich
        from re-wrapping merged headers.

        Args:
            text: Text content (may contain \n)
            base_width: Width already derived from content measurement

        Returns:
            Width that is at least ``base_width``
        """
        if not text or '\n' not in text:
            return base_width

        longest_line = max(map(len, text.split('\n')))

        # Buffer: 2 for column padding (1 char each side) plus a
        # 2-character safety margin for Rich's internal calculations
        padding_allowance = 2
        safety_margin = 2

        return max(longest_line + padding_allowance + safety_margin, base_width)

    def _calculate_optimal_content_widths(self, matrix) -> List[int]:
        """
        Derive compact, content-driven column widths.

        For every column the widest single line of any non-spanned, non-blank
        cell is measured (separately for header-like rows and data rows) and
        a small amount of padding is added depending on how wide the content
        is and whether the column holds text or numbers. Columns containing
        multi-line text are finally passed through
        ``_calculate_newline_safe_width``.

        Args:
            matrix: TableMatrix with the table data

        Returns:
            One width per matrix column
        """
        first_data_row = len(self.headers) if self.headers else 0
        # The first three rows are always measured as header rows, which
        # handles tables with multi-row headers like Table 52
        header_cutoff = max(first_data_row, 3)
        column_widths = []

        for col in range(matrix.col_count):
            widest = 1           # widest line anywhere in the column
            widest_header = 1    # widest line within header-like rows
            widest_data = 1      # widest line within the remaining rows
            longest_multiline = ""  # longest cell text that contains a newline

            for row in range(matrix.row_count):
                cell = matrix.get_cell(row, col)
                if cell is None:
                    continue
                content = cell.text().strip()
                if not content:
                    continue

                # Cells that are merely covered by a span are measured at
                # their origin cell, not here
                grid = matrix.matrix
                slot = grid[row][col] if row < len(grid) and col < len(grid[row]) else None
                if slot and slot.is_spanned:
                    continue

                if '\n' in content and len(content) > len(longest_multiline):
                    longest_multiline = content

                for line_length in map(len, content.split('\n')):
                    widest = max(widest, line_length)
                    if row < header_cutoff:
                        widest_header = max(widest_header, line_length)
                    else:
                        widest_data = max(widest_data, line_length)

            if widest <= 1:
                # Empty or single char (like symbols)
                width = widest
            elif widest <= 10:
                # Short content: padding grows with the header length so that
                # headers like "Accrued" or "Expected" get breathing room
                if widest_header >= 7:
                    extra = 3
                elif widest_header > 5:
                    extra = 2
                else:
                    extra = 1
                width = widest + extra
            elif widest <= 15:
                # Medium content
                width = widest + 2
            else:
                # Long content: decide between text and numeric column by
                # sampling up to 5 non-blank data cells
                holds_text = False
                sampled = 0
                for row in range(first_data_row, matrix.row_count):
                    if sampled >= 5:
                        break
                    probe = matrix.get_cell(row, col)
                    if not probe or not probe.text().strip():
                        continue
                    sampled += 1
                    if not probe.is_numeric:
                        holds_text = True
                        break

                if holds_text:
                    if widest > 80:
                        # Very long text - wrap at 70 chars
                        width = 70
                    elif widest > 50:
                        # Long text - generous but capped
                        width = min(widest + 3, 65)
                    else:
                        # Medium text
                        width = widest + 3
                elif widest_header > widest_data * 2:
                    # Header much longer than the numbers below it: give it
                    # room but allow some wrapping
                    width = min(widest_header + 1, 25)
                else:
                    # Header and data are similar or data is longer
                    width = min(widest + 2, 35)

            # Prevent Rich from re-wrapping merged multi-line headers
            if longest_multiline:
                width = self._calculate_newline_safe_width(longest_multiline, width)

            column_widths.append(width)

        return column_widths

    def _calculate_smart_widths(self, matrix, table_width: Optional[int] = None) -> List[int]:
        """
        Calculate smart column widths for complex tables.

        The natural (content) width of every column is measured first. If the
        lightly padded table fits into ``table_width`` those compact widths are
        returned. Otherwise each column is classified as empty, symbol,
        numeric or text, given a reduced starting width, and the widths are
        then shrunk (text columns first, widest first) or, when there is
        spare room, numeric columns are grown back towards their natural
        width.

        Args:
            matrix: TableMatrix with the table data
            table_width: Optional target table width (used as maximum, not
                target). Defaults to 120 when omitted.

        Returns:
            List of column widths, one per matrix column
        """
        if table_width is None:
            table_width = 120  # Default reasonable width

        header_row_count = len(self.headers) if self.headers else 0

        # Natural width of each column: widest line of any non-spanned cell
        content_widths = []
        for col_idx in range(matrix.col_count):
            max_width = 1  # Minimum width

            for row_idx in range(matrix.row_count):
                matrix_cell = (
                    matrix.matrix[row_idx][col_idx]
                    if row_idx < len(matrix.matrix) and col_idx < len(matrix.matrix[row_idx])
                    else None
                )
                # Skip spanned cells (they're covered by the origin cell)
                if matrix_cell and matrix_cell.is_spanned:
                    continue

                cell = matrix.get_cell(row_idx, col_idx)
                if cell is None:
                    continue
                text = cell.text().strip()
                if text:
                    # For multi-line text, the widest line counts
                    max_width = max(max_width, max(len(line) for line in text.split('\n')))

            content_widths.append(max_width)

        # For compact tables, just use natural widths with some padding.
        # Don't try to expand to fill the entire width.
        compact_widths = []
        for width in content_widths:
            if width <= 2:  # Symbol columns
                compact_widths.append(width)
            elif width <= 10:  # Short numeric columns
                compact_widths.append(width + 1)
            else:  # Text columns
                compact_widths.append(min(width + 2, 40))  # Cap at 40 for very long text

        # Check if compact table fits within maximum width
        padding_per_col = 2  # Rich adds padding
        total_padding = padding_per_col * len(compact_widths)
        separators = len(compact_widths) - 1  # Column separators
        total_width = sum(compact_widths) + total_padding + separators + 4  # 4 for table borders

        if total_width <= table_width:
            return compact_widths

        # It doesn't fit, so compress intelligently
        available_width = table_width - total_padding - separators - 4

        # Expand the header rows once; they are consulted for every column
        header_rows = [matrix.get_expanded_row(row_idx) for row_idx in range(header_row_count)]

        symbol_texts = ('%', '$', '—', '-', '(', ')')
        # Characters ignored when testing whether a value "looks numeric"
        numeric_noise = str.maketrans('', '', ',.()$%- ')

        # First pass: identify column types and starting widths
        final_widths = []
        col_types = []

        for col_idx, natural_width in enumerate(content_widths):
            # A natural width of 1 means every measured cell is blank or a
            # single character, so the column is treated as empty
            if natural_width == 1:
                col_types.append('empty')
                final_widths.append(1)  # Minimal width for empty columns but not zero
                continue

            # Width of the merged header as render() will display it
            header_parts = []
            for expanded_row in header_rows:
                if col_idx < len(expanded_row):
                    cell = expanded_row[col_idx]
                    if cell:
                        text = cell.text().strip()
                        # Skip empty cells and lone $ symbols (like render method does)
                        if text and text != '$':
                            header_parts.append(text)

            header_width = 1
            if header_parts:
                merged_header = '\n'.join(header_parts)
                header_width = max(header_width, max(len(line) for line in merged_header.split('\n')))
                # Multi-line headers get the newline-safe buffer so Rich
                # does not re-wrap them
                if '\n' in merged_header:
                    header_width = self._calculate_newline_safe_width(merged_header, header_width)

            # Classify the column from its first five data rows
            is_symbol = True
            is_numeric = True
            sample_values = []

            for row_idx in range(header_row_count, min(matrix.row_count, header_row_count + 5)):
                cell = matrix.get_cell(row_idx, col_idx)
                if cell and cell.text().strip():
                    text = cell.text().strip()
                    sample_values.append(text)
                    if text not in symbol_texts:
                        is_symbol = False
                    if not text.translate(numeric_noise).isdigit():
                        is_numeric = False

            if is_symbol:
                col_types.append('symbol')
                # Symbol columns still need space for a meaningful header
                if header_width > 2:
                    final_widths.append(max(header_width, 7))  # At least 7 chars for headers
                else:
                    final_widths.append(1)  # True symbol column with no header
            elif is_numeric or any('$' in v for v in sample_values):
                col_types.append('numeric')
                # At least as wide as the header and at least 10 chars,
                # but never wider than the content itself
                min_numeric_width = max(10, header_width)
                final_widths.append(min(natural_width, max(min_numeric_width, natural_width // 2)))
            else:
                col_types.append('text')
                if col_idx == 0:
                    # Leading description column: generous, capped at 35
                    # to leave room for data columns
                    min_text_width = max(25, header_width)
                    final_widths.append(min(natural_width, max(min_text_width, min(35, natural_width))))
                else:
                    # Other text columns get moderate space, at least the header width
                    min_other_width = max(12, header_width)
                    final_widths.append(min(natural_width, max(min_other_width, natural_width // 2)))

        # Second pass: shrink further or hand back spare room
        current_total = sum(final_widths)
        if current_total > available_width:
            reduction_needed = current_total - available_width

            # Reduce text columns before numeric ones, widest first.
            # With reverse=True the key must be True for text columns so
            # that they sort to the front.
            width_indices = sorted(range(len(final_widths)),
                                   key=lambda i: (col_types[i] == 'text', final_widths[i]),
                                   reverse=True)

            for idx in width_indices:
                if col_types[idx] not in ['symbol', 'empty'] and final_widths[idx] > 8:
                    # Maintain a minimum readable width:
                    # first text column 20, other text columns 10, numeric 8
                    if col_types[idx] == 'text' and idx == 0:
                        min_width = 20
                    elif col_types[idx] == 'text':
                        min_width = 10
                    else:
                        min_width = 8

                    reduction = min(final_widths[idx] - min_width, reduction_needed)
                    if reduction > 0:
                        final_widths[idx] -= reduction
                        reduction_needed -= reduction
                        if reduction_needed <= 0:
                            break
        elif current_total < available_width - 10:
            # Extra space: give it to numeric columns that sit below their
            # natural width, since their values might otherwise wrap
            extra_space = available_width - current_total

            for col_idx, (final_w, natural_w) in enumerate(zip(final_widths, content_widths)):
                if col_types[col_idx] == 'numeric' and final_w < natural_w:
                    space_to_add = min(natural_w - final_w, extra_space // 2)
                    final_widths[col_idx] += space_to_add
                    extra_space -= space_to_add
                    if extra_space <= 0:
                        break

        return final_widths

    def _calculate_optimal_widths(self, matrix) -> List[int]:
        """
        Calculate optimal column widths based on content.

        Every non-blank cell (header rows included) contributes to a column's
        maximum text length. Only data rows (rows at or after
        ``len(self.headers)``) decide whether a column is a "symbol" column.

        Args:
            matrix: Object exposing ``col_count``, ``row_count`` and
                ``get_cell(row_idx, col_idx)``; cells may be ``None``.

        Returns:
            List of optimal widths for each column
        """
        # Lone currency/percent/dash markers that never disqualify a symbol column
        symbol_tokens = {'$', '%', '€', '£', '¥', '—', '-', '–'}

        # Loop-invariant: number of leading rows that are headers
        header_row_count = len(self.headers) if self.headers else 0

        widths = []
        for col_idx in range(matrix.col_count):
            max_width = 0
            is_symbol_column = True  # Assume symbol column until a data cell proves otherwise

            for row_idx in range(matrix.row_count):
                cell = matrix.get_cell(row_idx, col_idx)
                if cell is None:
                    continue
                text = cell.text().strip()
                if not text:
                    continue

                max_width = max(max_width, len(text))

                # A data cell longer than 2 chars with alphanumeric content
                # means the column carries real values, not just symbols
                if (
                    row_idx >= header_row_count
                    and len(text) > 2
                    and text not in symbol_tokens
                    and any(c.isalnum() for c in text)
                ):
                    is_symbol_column = False

            if max_width == 0 or is_symbol_column:
                # Empty column, or only symbols (%, $, etc.): minimal width
                # regardless of header
                widths.append(1)
            elif max_width <= 3:
                # Very short content (like "2", "(3)", "—"): just enough space.
                # Every value in the column is at most 3 chars here, so a
                # single padding rule covers all cases.
                widths.append(max_width + 1)
            else:
                # Regular content - use actual width needed, capped at 30
                # to prevent table explosion
                widths.append(min(max_width, 30))

        return widths

    def _detect_column_alignments(self, matrix) -> List[str]:
        """Choose "left" or "right" alignment for every matrix column.

        Only data rows (those after the header rows) with non-blank text are
        considered. A column is right-aligned when more than 60% of those
        cells report ``is_numeric``; otherwise it is left-aligned.
        """
        first_data_row = len(self.headers) if self.headers else 0
        result = []

        for column in range(matrix.col_count):
            filled = 0
            numeric = 0

            for r in range(first_data_row, matrix.row_count):
                cell = matrix.get_cell(r, column)
                # Missing or blank cells do not vote
                if cell is None or not cell.text().strip():
                    continue
                filled += 1
                if cell.is_numeric:
                    numeric += 1

            mostly_numeric = filled > 0 and numeric / filled > 0.6
            result.append("right" if mostly_numeric else "left")

        return result

    def html(self) -> str:
        """Render the table as HTML, one element per line.

        Emits an optional ``<caption>``, an optional ``<thead>`` built from the
        header cells, a ``<tbody>`` (always present) and an optional
        ``<tfoot>``. Caption text and cell/row markup are inserted as-is.
        """
        lines = ['<table>']

        if self.caption:
            lines.append(f'<caption>{self.caption}</caption>')

        if self.headers:
            lines.append('<thead>')
            lines.extend(
                '<tr>' + ''.join(cell.html() for cell in header_row) + '</tr>'
                for header_row in self.headers
            )
            lines.append('</thead>')

        # The body wrapper is emitted even when there are no rows
        lines.append('<tbody>')
        lines.extend(row.html() for row in self.rows)
        lines.append('</tbody>')

        if self.footer:
            lines += ['<tfoot>', *(row.html() for row in self.footer), '</tfoot>']

        lines.append('</table>')
        return '\n'.join(lines)

    def to_dataframe(self) -> pd.DataFrame:
        """Convert table to pandas DataFrame with proper colspan/rowspan handling.

        Column labels come from the header rows: one header row yields flat
        labels, several yield a ``MultiIndex``; without headers the columns are
        numbered from 0. In the first header row, positions with no cell
        repeat the previous header text; in later header rows they are ''.

        Data cells whose text starts with a currency symbol are kept as text,
        cells reporting ``is_numeric`` contribute ``numeric_value``, all other
        cells contribute their text and missing cells become ``None``. Rows
        with no non-blank value are dropped, and every row is padded or
        truncated to the number of columns.

        Returns:
            The DataFrame; when ``has_row_headers`` is true its first column
            is moved into the index. An empty DataFrame carrying the column
            labels is returned when there are no data rows.
        """
        from edgar.documents.utils.table_matrix import TableMatrix

        # Build matrix to handle colspan/rowspan WITHOUT merging currencies
        # Old parser keeps $ as separate cells to maintain alignment
        matrix = TableMatrix()
        clean_matrix = process_table_matrix(matrix, self.headers, self.rows)

        # Extract headers with proper alignment
        if self.headers:
            header_arrays = []

            for row_idx in range(len(self.headers)):
                header_texts = []
                prev_text = ''

                for cell in clean_matrix.get_expanded_row(row_idx):
                    if cell is not None:
                        prev_text = cell.text().strip()
                        header_texts.append(prev_text)
                    elif row_idx == 0 and prev_text:
                        # Spanned position in the first row: repeat the spanning header
                        header_texts.append(prev_text)
                    else:
                        # Lower header levels stay empty under a spanned parent
                        header_texts.append('')

                header_arrays.append(header_texts)

            if len(header_arrays) > 1:
                # Multi-level headers - pad to equal length, then build a MultiIndex
                max_len = max(len(arr) for arr in header_arrays)
                for arr in header_arrays:
                    arr.extend([''] * (max_len - len(arr)))

                df_columns = pd.MultiIndex.from_arrays(header_arrays)
            else:
                # Single level headers
                df_columns = header_arrays[0]
        else:
            # No headers, use numeric columns
            df_columns = list(range(clean_matrix.col_count))

        # Extract data rows with proper alignment
        currency_prefixes = ('$', '€', '£', '¥')
        start_row = len(self.headers) if self.headers else 0
        data = []

        for row_idx in range(start_row, clean_matrix.row_count):
            row_data = []

            for cell in clean_matrix.get_expanded_row(row_idx):
                if cell is None:
                    row_data.append(None)  # Empty cell
                    continue

                text = cell.text()
                if text.startswith(currency_prefixes):
                    # Merged currency value: keep the full text with its symbol
                    row_data.append(text)
                elif cell.is_numeric:
                    row_data.append(cell.numeric_value)
                else:
                    row_data.append(text)

            # Only add non-empty rows
            if any(v is not None and str(v).strip() for v in row_data):
                data.append(row_data)

        if not data:
            # Return empty DataFrame with columns
            return pd.DataFrame(columns=df_columns)

        # Ensure data width matches column width (pad with None, drop overflow)
        col_count = len(df_columns)
        data = [(row + [None] * (col_count - len(row)))[:col_count] for row in data]

        df = pd.DataFrame(data, columns=df_columns)

        # Set row index if first column is labels
        # NOTE(review): selecting by label may misbehave when the first column
        # label is duplicated (e.g. several '' headers) - confirm.
        if self.has_row_headers and len(df.columns) > 0:
            df = df.set_index(df.columns[0])

        return df

    def to_csv(self) -> str:
        """Export table as CSV.

        ``to_dataframe`` moves the first column into the index when the table
        has row headers. That labelled index is written as the first CSV
        column so the row labels are not lost; an unnamed default index is
        omitted.

        Returns:
            The CSV text produced by ``DataFrame.to_csv``.
        """
        df = self.to_dataframe()
        # A named index holds real row labels; an unnamed one is just positions
        include_index = any(name is not None for name in df.index.names)
        return df.to_csv(index=include_index)

    def to_dict(self) -> Dict[str, Any]:
        """Return a plain-dict view of the table.

        Keys: ``type`` (table type name), ``caption``, and ``headers``,
        ``data`` and ``footer`` as lists of rows of cell text.
        """
        summary: Dict[str, Any] = {
            'type': self.table_type.name,
            'caption': self.caption,
        }
        summary['headers'] = [
            [cell.text() for cell in header_row] for header_row in self.headers
        ]
        summary['data'] = [
            [cell.text() for cell in body_row.cells] for body_row in self.rows
        ]
        summary['footer'] = [
            [cell.text() for cell in footer_row.cells] for footer_row in self.footer
        ]
        return summary

    def find_column(self, header_text: str) -> Optional[int]:
        """Return the index of the first header cell containing ``header_text``.

        The match is a case-insensitive substring test against the cells of
        the first header row only. Returns ``None`` when the table has no
        headers or nothing matches.
        """
        if not self.headers:
            return None

        top_row = self.headers[0]
        return next(
            (
                position
                for position, cell in enumerate(top_row)
                if header_text.lower() in cell.text().lower()
            ),
            None,
        )

    def extract_column(self, column_index: int) -> List[str]:
        """Collect the text of the cell at ``column_index`` from each body row.

        Rows that have no cell at that position are skipped, so the result
        may be shorter than the number of rows.
        """
        return [
            row.cells[column_index].text()
            for row in self.rows
            if column_index < len(row.cells)
        ]

    def find_row_by_first_cell(self, text: str) -> Optional[Row]:
        """Return the first body row whose first cell contains ``text``.

        The comparison is a case-insensitive substring test. Rows without
        cells are ignored; ``None`` is returned when no row matches.
        """
        matching_rows = (
            row
            for row in self.rows
            if row.cells and text.lower() in row.cells[0].text().lower()
        )
        return next(matching_rows, None)

    def get_numeric_columns(self) -> Dict[str, List[float]]:
        """Collect the numeric columns, keyed by first-header-row text.

        For each cell of the first header row, the body cells at the same
        position are scanned. Numeric cells contribute ``numeric_value``;
        blank cells and non-numeric cells in total rows contribute ``None``.
        Any other non-numeric cell disqualifies the column. A surviving column
        is kept only when more than half of its collected values are not
        ``None``. Returns an empty dict when the table has no headers.
        """
        numeric_by_header = {}

        if not self.headers:
            return numeric_by_header

        for position, header_cell in enumerate(self.headers[0]):
            column_name = header_cell.text()
            collected = []

            for row in self.rows:
                # Rows too short to reach this column are ignored
                if position >= len(row.cells):
                    continue

                cell = row.cells[position]
                if cell.is_numeric:
                    collected.append(cell.numeric_value)
                elif row.is_total_row or not cell.text().strip():
                    # Tolerated gap: total-row label or blank cell
                    collected.append(None)
                else:
                    # Real text in a data row: not a numeric column
                    break
            else:
                # Reached only when no disqualifying cell was found
                present = sum(1 for value in collected if value is not None)
                if collected and present > len(collected) * 0.5:
                    numeric_by_header[column_name] = collected

        return numeric_by_header

    def find_totals(self) -> Dict[str, float]:
        """Map each total row's label to its first numeric value.

        The label is the text of the row's first cell ("Total" when the row
        has no cells). The value is taken from the first numeric cell after
        the label cell; total rows without one are omitted. A later row with
        the same label replaces the earlier entry.
        """
        found = {}

        for row in self.rows:
            if not row.is_total_row:
                continue

            label = row.cells[0].text() if row.cells else "Total"

            # First numeric cell to the right of the label, if any
            first_numeric = next(
                (cell for cell in row.cells[1:] if cell.is_numeric),
                None,
            )
            if first_numeric is not None:
                found[label] = first_numeric.numeric_value

        return found

    @property
    def is_financial_table(self) -> bool:
        """Whether the table looks financial.

        True when the table type is ``TableType.FINANCIAL``, or when the
        lower-cased text of all header cells contains any of a fixed set of
        financial keywords (substring match).
        """
        if self.table_type == TableType.FINANCIAL:
            return True

        keywords = (
            'revenue', 'income', 'expense', 'asset', 'liability',
            'cash', 'equity', 'profit', 'loss', 'margin',
        )

        # Flatten every header cell into one searchable lower-case string
        lowered_cells = []
        for header_row in self.headers:
            for cell in header_row:
                lowered_cells.append(cell.text().lower())
        combined = ' '.join(lowered_cells)

        for keyword in keywords:
            if keyword in combined:
                return True
        return False

    @property
    def row_count(self) -> int:
        """Number of header rows plus body rows (footer rows are not counted)."""
        header_rows = len(self.headers)
        body_rows = len(self.rows)
        return header_rows + body_rows

    @property
    def col_count(self) -> int:
        """Number of cells in the first header row, else in the first body row.

        Returns 0 when neither a non-empty first header row nor a non-empty
        first body row exists.
        """
        first_header = self.headers[0] if self.headers else None
        if first_header:
            return len(first_header)

        # Fall back to the first body row when headers give no answer
        first_row_cells = self.rows[0].cells if self.rows else None
        return len(first_row_cells) if first_row_cells else 0

    @property
    def has_header(self) -> bool:
        """True when the table has at least one header row."""
        return True if self.headers else False

    @property
    def has_row_headers(self) -> bool:
        """Whether the first column looks like row labels.

        False for a table without body rows. Otherwise True when the number
        of rows whose first cell is numeric is below 20% of the body row
        count (rows without cells count toward the total only).
        """
        if not self.rows:
            return False

        numeric_firsts = sum(
            1 for row in self.rows if row.cells and row.cells[0].is_numeric
        )
        return numeric_firsts < len(self.rows) * 0.2

    @property
    def numeric_columns(self) -> List[int]:
        """Indices of columns in which more than half of the cells are numeric.

        Only body rows that actually have a cell at the column position are
        counted; columns with no such cells are never selected.
        """
        selected = []

        for column in range(self.col_count):
            # One flag per row that reaches this column
            flags = [
                row.cells[column].is_numeric
                for row in self.rows
                if column < len(row.cells)
            ]
            numeric_hits = sum(1 for flag in flags if flag)

            if flags and numeric_hits / len(flags) > 0.5:
                selected.append(column)

        return selected


    def summarize_for_llm(self, max_tokens: int = 500) -> str:
        """Create concise table summary for LLM processing.

        The summary lists the table type, its size, the caption (if any), up
        to five column headers from the first header row, up to three totals
        from ``find_totals`` and up to three names from
        ``get_numeric_columns``.

        Args:
            max_tokens: Accepted for interface compatibility; the summary is
                not truncated to this budget.

        Returns:
            The summary lines joined with newlines.
        """
        parts = [f"Table Type: {self.table_type.name}"]

        # Use the shared column count so header-less tables still report a
        # size; fall back to 'unknown' only when nothing can be counted
        column_count = self.col_count or 'unknown'
        parts.append(f"Size: {len(self.rows)} rows × {column_count} columns")

        if self.caption:
            parts.append(f"Caption: {self.caption}")

        # Add column headers
        if self.headers:
            headers = [cell.text() for cell in self.headers[0]]
            parts.append(f"Columns: {', '.join(headers[:5])}")
            if len(headers) > 5:
                parts.append(f"  ... and {len(headers) - 5} more columns")

        # Add sample data or totals
        totals = self.find_totals()
        if totals:
            parts.append("Key totals:")
            for label, value in list(totals.items())[:3]:
                # A total may carry no value; number formatting would raise
                # TypeError on None, so render it as "n/a" instead
                formatted = "n/a" if value is None else f"{value:,.0f}"
                parts.append(f"  {label}: {formatted}")

        # Add numeric column summary
        numeric_cols = self.get_numeric_columns()
        if numeric_cols:
            parts.append("Numeric columns found:")
            for col_name in list(numeric_cols)[:3]:
                parts.append(f"  - {col_name}")

        return '\n'.join(parts)