Files
2025-12-09 12:13:01 +01:00

1192 lines
50 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
Table-related nodes for the document tree.
"""
from dataclasses import dataclass, field
from typing import List, Optional, Dict, Any, Union
import pandas as pd
from rich import box
from rich.table import Table as RichTable
from edgar.richtools import rich_to_text
from edgar.documents.nodes import Node
from edgar.documents.types import NodeType, TableType
from edgar.documents.cache_mixin import CacheableMixin
from edgar.documents.table_utils import process_table_matrix
@dataclass
class Cell:
    """Table cell representation.

    Attributes:
        content: Cell payload; either plain text or a ``Node`` whose
            ``text()`` supplies the text.
        colspan: Number of columns the cell spans (emitted in HTML when > 1).
        rowspan: Number of rows the cell spans (emitted in HTML when > 1).
        is_header: When True the cell is rendered as ``<th>`` instead of ``<td>``.
        align: Optional value for the HTML ``align`` attribute.
    """
    content: Union[str, Node]
    colspan: int = 1
    rowspan: int = 1
    is_header: bool = False
    align: Optional[str] = None

    # Texts treated as numeric placeholders (null/zero). Deliberately left
    # un-annotated so the dataclass machinery does not turn it into a field.
    # The em dash (U+2014) and en dash (U+2013) are written as escapes so they
    # cannot be lost or confused with the ASCII hyphen.
    _NUMERIC_PLACEHOLDERS = frozenset(
        {'\u2014', '\u2013', '-', '--', 'N/A', 'n/a', 'NM', 'nm'}
    )

    def text(self) -> str:
        """Extract text from cell.

        Returns:
            The string content, the node's ``text()``, or ``''`` for any
            other content type.
        """
        if isinstance(self.content, str):
            return self.content
        elif isinstance(self.content, Node):
            return self.content.text()
        return ''

    def html(self) -> str:
        """Generate cell HTML (``<th>``/``<td>`` with span/align attributes).

        The cell text is inserted as-is; no escaping is applied.
        """
        tag = 'th' if self.is_header else 'td'
        attrs = []
        if self.colspan > 1:
            attrs.append(f'colspan="{self.colspan}"')
        if self.rowspan > 1:
            attrs.append(f'rowspan="{self.rowspan}"')
        if self.align:
            attrs.append(f'align="{self.align}"')
        attr_str = ' ' + ' '.join(attrs) if attrs else ''
        return f'<{tag}{attr_str}>{self.text()}</{tag}>'

    def _parse_number(self) -> Optional[float]:
        """Parse the cell text as a number; single source of truth for numeric checks.

        Returns:
            ``0.0`` for placeholder texts, the parsed float for numeric text
            (after dropping ``,``, ``$``, ``%`` and mapping ``(x)`` to ``-x``),
            or ``None`` when the text is empty or not numeric.
        """
        text = self.text().strip()
        if not text:
            return None
        if text in self._NUMERIC_PLACEHOLDERS:
            return 0.0
        # Remove common formatting; accounting-style parentheses mean negative.
        clean_text = text.replace(',', '').replace('$', '').replace('%', '')
        clean_text = clean_text.replace('(', '-').replace(')', '')
        try:
            return float(clean_text)
        except ValueError:
            return None

    @property
    def is_numeric(self) -> bool:
        """Check if cell contains numeric data (placeholders count as numeric)."""
        return self._parse_number() is not None

    @property
    def numeric_value(self) -> Optional[float]:
        """Get numeric value if cell is numeric, else ``None``.

        Placeholder texts (dashes, ``N/A``, ``NM``) yield ``0.0``.
        """
        return self._parse_number()
@dataclass
class Row:
    """One table row: an ordered list of cells plus a header flag."""
    cells: List[Cell]
    is_header: bool = False

    def text(self) -> str:
        """Return the cell texts joined with ``' | '``."""
        cell_texts = [cell.text() for cell in self.cells]
        return ' | '.join(cell_texts)

    def html(self) -> str:
        """Return the row as a ``<tr>`` element wrapping each cell's HTML."""
        inner = ''.join([cell.html() for cell in self.cells])
        return f'<tr>{inner}</tr>'

    @property
    def is_numeric_row(self) -> bool:
        """True when strictly more than half of the cells are numeric."""
        numeric_cells = [cell for cell in self.cells if cell.is_numeric]
        return 2 * len(numeric_cells) > len(self.cells)

    @property
    def is_total_row(self) -> bool:
        """True when the first cell looks like a total label.

        Only the first cell is inspected (lower-cased, then stripped). It
        matches when it equals a total keyword or starts with a keyword
        followed by a space, e.g. ``"Total revenue"``.
        """
        if not self.cells:
            return False
        label = self.cells[0].text().lower().strip()
        keywords = ('total', 'sum', 'subtotal', 'grand total', 'net total')
        # The 'total' keyword already covers labels such as "total expenses".
        return any(
            label == keyword or label.startswith(keyword + ' ')
            for keyword in keywords
        )
@dataclass
class TableNode(Node, CacheableMixin):
    """
    Table node with structured data.

    Supports complex table structures with multi-level headers,
    merged cells, and semantic understanding.
    """
    # Node kind; fixed to NodeType.TABLE and excluded from the generated __init__.
    type: NodeType = field(default=NodeType.TABLE, init=False)
    # Header rows, each a list of cells (several entries = multi-level header).
    headers: List[List[Cell]] = field(default_factory=list)
    # Body rows.
    rows: List[Row] = field(default_factory=list)
    # Footer rows.
    footer: List[Row] = field(default_factory=list)
    # Semantic classification of the table; defaults to GENERAL.
    table_type: TableType = TableType.GENERAL
    # Table metadata
    caption: Optional[str] = None
    summary: Optional[str] = None

    @property
    def semantic_type(self) -> TableType:
        """Get semantic type of table (alias for table_type)."""
        return self.table_type

    @semantic_type.setter
    def semantic_type(self, value: TableType):
        """Set semantic type of table (writes through to table_type)."""
        self.table_type = value
def text(self) -> str:
    """Return the table as plain text, memoised through ``_get_cached_text``.

    When the node carries a truthy ``_config`` whose ``fast_table_rendering``
    flag is truthy, the fast renderer is used; otherwise the table is rendered
    with Rich at width 195 and converted to text.
    """
    def _produce() -> str:
        settings = getattr(self, '_config', None)
        if settings and getattr(settings, 'fast_table_rendering', False):
            return self._fast_text_rendering()
        # Default path: Rich renderer.
        return rich_to_text(self.render(width=195))

    return self._get_cached_text(_produce)
def _fast_text_rendering(self) -> str:
    """
    Render the table to text with FastTableRenderer using the simple() style.

    Per the original notes, the simple style mirrors Rich's box.SIMPLE look
    (no outer borders or column separators, one rule under the header,
    space-separated columns) and is meant for performance-critical callers.
    """
    # Imported lazily, as in the rest of this class.
    from edgar.documents.renderers.fast_table import FastTableRenderer, TableStyle

    simple_style = TableStyle.simple()
    return FastTableRenderer(simple_style).render_table_node(self)
def _fix_header_misclassification(self):
"""
Note: We do NOT reorder rows as this would change the structure of the filing.
This method is kept for compatibility but does minimal processing.
"""
# We don't want to reorder rows as it changes the filing structure
# The rendering should handle misclassified headers appropriately
pass
def render(self, width: Optional[int] = None) -> RichTable:
    """
    Render table using rich.table.Table for console output.

    Args:
        width: Optional target width. Values above 50 select the
            width-constrained column sizing (``_calculate_smart_widths``);
            otherwise columns are sized purely from their content.

    Returns:
        Rich Table object for console rendering.

    Note:
        Header rows of unequal length are padded in place on
        ``self.headers`` with empty cells before the matrix is built.
    """
    from edgar.documents.utils.table_matrix import TableMatrix

    # Kept for compatibility; currently a no-op.
    self._fix_header_misclassification()

    # Normalize header row lengths: rows with different cell counts would
    # otherwise misalign columns. Shorter rows are padded with empty cells.
    if self.headers and len(self.headers) > 1:
        max_header_cols = max(len(h) for h in self.headers)
        for header_row in self.headers:
            missing = max_header_cols - len(header_row)
            if missing > 0:
                header_row.extend(Cell(content='') for _ in range(missing))

    # Build matrix to handle colspan/rowspan WITHOUT merging currencies
    # ($ stays in its own cell to maintain alignment).
    clean_matrix = process_table_matrix(TableMatrix(), self.headers, self.rows)

    # Use minimal padding when the table has symbol-only columns.
    has_symbols = self._has_symbol_columns(clean_matrix)
    padding_config = (0, 0) if has_symbols else (0, 1)

    table = RichTable(
        title=self.caption if self.caption else None,
        box=box.SIMPLE,
        border_style="blue",
        header_style="bold cyan",
        padding=padding_config,
        collapse_padding=True,
        width=None,  # Let Rich auto-size based on column widths
        show_header=bool(self.headers),
        show_footer=bool(self.footer),
    )

    column_alignments = self._detect_column_alignments(clean_matrix)

    # Smart widths only for reasonable target widths; otherwise compact,
    # content-based widths.
    if width and width > 50:
        calculated_widths = self._calculate_smart_widths(clean_matrix, table_width=width)
    else:
        calculated_widths = self._calculate_optimal_content_widths(clean_matrix)

    header_row_count = len(self.headers) if self.headers else 0
    col_count = clean_matrix.col_count
    # Expand each header row once instead of once per column.
    expanded_headers = (
        [clean_matrix.get_expanded_row(row_idx) for row_idx in range(header_row_count)]
        if col_count else []
    )

    for col_idx in range(col_count):
        if self.headers:
            # Merge all header rows of this column into one multi-line header.
            header_parts = []
            for expanded_row in expanded_headers:
                if col_idx < len(expanded_row):
                    cell = expanded_row[col_idx]
                    if cell:
                        text = cell.text().strip()
                        # Skip empty cells and lone $ symbols
                        if text and text != '$':
                            header_parts.append(text)
            header_text = '\n'.join(header_parts)
            overflow = "fold"  # Wrap text instead of truncating
        else:
            # No headers: generic column names
            header_text = f"Col{col_idx+1}"
            overflow = "ellipsis"

        alignment = column_alignments[col_idx] if col_idx < len(column_alignments) else "left"
        # None lets Rich auto-size a column that has no calculated width
        # (previously the name could be unbound or carry a stale value).
        col_width = calculated_widths[col_idx] if col_idx < len(calculated_widths) else None
        table.add_column(
            header=header_text,
            justify=alignment,
            vertical="middle",
            width=col_width,
            overflow=overflow,
        )

    # Data rows (everything after the header rows in the matrix)
    for row_idx in range(header_row_count, clean_matrix.row_count):
        row_data = [
            self._format_display_text(cell) if cell is not None else ""
            for cell in clean_matrix.get_expanded_row(row_idx)
        ]
        table.add_row(*row_data)

    # Footer rows are appended as dim, italic body rows.
    if self.footer:
        for row in self.footer:
            footer_data = [cell.text() for cell in row.cells]
            table.add_row(*footer_data, style="dim italic")

    return table

def _format_display_text(self, cell) -> str:
    """Return the text shown for one body cell in ``render``.

    Non-numeric cells, cells whose text starts with ``$`` and placeholder
    cells are returned unchanged. Other numeric cells are reformatted with
    thousands separators and no decimals; negatives use parentheses; plain
    4-digit values in 1900-2100 are treated as years and left ungrouped; a
    trailing ``%`` is re-attached.
    """
    text = cell.text()
    if not cell.is_numeric or text.startswith('$'):
        return text
    # Em dash (U+2014) / en dash (U+2013) and similar placeholders are kept as written.
    if text.strip() in ('\u2014', '\u2013', '-', '--', 'N/A', 'n/a', 'NM', 'nm'):
        return text

    num_val = cell.numeric_value
    if num_val is None:
        return text

    is_percentage = text.endswith('%')
    try:
        digits = text.strip().replace('%', '')
        is_likely_year = (
            1900 <= num_val <= 2100
            and num_val == int(num_val)
            and len(digits) == 4
            and digits.isdigit()
        )
        if is_likely_year:
            formatted = str(int(num_val))
        elif num_val < 0:
            formatted = f"({abs(num_val):,.0f})"
        else:
            formatted = f"{num_val:,.0f}"
    except (ValueError, OverflowError):
        # Formatting is best-effort: fall back to the original text.
        return text
    return f"{formatted}%" if is_percentage else formatted
def _has_symbol_columns(self, matrix) -> bool:
"""Check if table has columns that contain only symbols like $ or %."""
header_row_count = len(self.headers) if self.headers else 0
for col_idx in range(matrix.col_count):
is_symbol_col = True
has_content = False
for row_idx in range(header_row_count, matrix.row_count):
cell = matrix.get_cell(row_idx, col_idx)
if cell and cell.text().strip():
has_content = True
text = cell.text().strip()
# Check if it's not just a symbol
if text not in ['$', '%', '', '£', '¥', '', '-', '', '(', ')'] and len(text) > 2:
is_symbol_col = False
break
if has_content and is_symbol_col:
return True
return False
def _calculate_newline_safe_width(self, text: str, base_width: int) -> int:
"""
Calculate width that guarantees Rich won't re-wrap multi-line text.
If text contains newlines, ensures column width is sufficient for
the longest line plus buffer for Rich's padding and borders.
This preserves semantic line breaks in merged headers (similar to
old parser's newline preservation approach).
Args:
text: Text content (may contain \n)
base_width: Base width from content measurement
Returns:
Safe width that prevents re-wrapping
"""
if not text or '\n' not in text:
return base_width
# Multi-line content detected - ensure Rich won't re-wrap
lines = text.split('\n')
max_line_len = max(len(line) for line in lines)
# Add buffer for Rich's internal processing:
# +2 for column padding (1 char each side)
# +2 for safety margin (Rich's internal calculations)
buffer = 4
return max(max_line_len + buffer, base_width)
def _calculate_optimal_content_widths(self, matrix) -> List[int]:
"""
Calculate optimal column widths based on actual content.
Creates compact tables that fit their content naturally.
Args:
matrix: TableMatrix with the table data
Returns:
List of optimal column widths
"""
widths = []
header_row_count = len(self.headers) if self.headers else 0
for col_idx in range(matrix.col_count):
max_width = 1 # Minimum width
header_max_width = 1 # Track header width separately
data_max_width = 1 # Track data width separately
has_multiline = False # Track if column has multi-line content
multiline_text = "" # Store representative multi-line text
# Check all cells in column
for row_idx in range(matrix.row_count):
# Get the matrix cell to check if it's spanned
matrix_cell = matrix.matrix[row_idx][col_idx] if row_idx < len(matrix.matrix) and col_idx < len(matrix.matrix[row_idx]) else None
cell = matrix.get_cell(row_idx, col_idx)
if cell is not None:
text = cell.text().strip()
if text:
# Check if this is a spanned cell (part of colspan)
# If it's spanned and not the origin column, don't count its full width
is_spanned = matrix_cell and matrix_cell.is_spanned
if is_spanned:
# For spanned cells, don't use the text width
# These are covered by the origin cell
continue
# Track if this cell has multi-line content
if '\n' in text:
has_multiline = True
# Store the multi-line text for width calculation
if not multiline_text or len(text) > len(multiline_text):
multiline_text = text
# For multi-line text (headers), get the max line width
lines = text.split('\n')
for line in lines:
line_len = len(line)
max_width = max(max_width, line_len)
# Consider all rows up to row 3 as potential headers for width calculation
# This handles tables with multi-row headers like Table 52
if row_idx < max(header_row_count, 3):
header_max_width = max(header_max_width, line_len)
else:
data_max_width = max(data_max_width, line_len)
# Add appropriate padding based on content type
col_width = max_width # Start with measured max width
if max_width <= 1:
# Empty or single char (like symbols)
col_width = max_width
elif max_width <= 10:
# Short to medium content (numbers, percentages, short headers)
# Give headers adequate room for readability
if header_max_width >= 7:
# Headers like "Accrued", "Expected" need breathing room
col_width = max_width + 3
elif header_max_width > 5:
col_width = max_width + 2
else:
col_width = max_width + 1
elif max_width <= 15:
# Medium content
col_width = max_width + 2
else:
# Long content (text descriptions or long headers)
# Check if this is primarily a text column (not numeric)
is_text_column = False
cells_checked = 0
# Check more rows and skip empty ones
for row_idx in range(header_row_count, matrix.row_count):
if cells_checked >= 5: # Check up to 5 non-empty cells
break
test_cell = matrix.get_cell(row_idx, col_idx)
if test_cell and test_cell.text().strip():
cells_checked += 1
# If it's not numeric, it's a text column
if not test_cell.is_numeric:
is_text_column = True
break
if is_text_column:
# Allow more width for text columns
# For very long text, allow wrapping at a reasonable width
if max_width > 80:
# Very long text - wrap at 70 chars
col_width = 70
elif max_width > 50:
# Long text - give it generous space
col_width = min(max_width + 3, 65)
else:
# Medium text
col_width = max_width + 3
else:
# Numeric columns - need to balance header and data widths
# If header is much longer than data, give it reasonable space
# but not excessive
if header_max_width > data_max_width * 2:
# Header is much longer than data
# Give enough space for header but allow some wrapping
col_width = min(header_max_width + 1, 25)
else:
# Header and data are similar or data is longer
col_width = min(max_width + 2, 35)
# Apply newline-safe width if column contains multi-line content
# This prevents Rich from re-wrapping merged headers
if has_multiline and multiline_text:
col_width = self._calculate_newline_safe_width(multiline_text, col_width)
widths.append(col_width)
return widths
def _calculate_smart_widths(self, matrix, table_width: Optional[int] = None) -> List[int]:
"""
Calculate smart column widths for complex tables.
Args:
matrix: TableMatrix with the table data
table_width: Optional target table width (used as maximum, not target)
Returns:
List of column widths
"""
if table_width is None:
table_width = 120 # Default reasonable width
# Start with content-based widths
content_widths = []
header_row_count = len(self.headers) if self.headers else 0
for col_idx in range(matrix.col_count):
max_width = 1 # Minimum width
# Check all cells in column including multi-line headers
for row_idx in range(matrix.row_count):
# Get the matrix cell to check if it's spanned
matrix_cell = matrix.matrix[row_idx][col_idx] if row_idx < len(matrix.matrix) and col_idx < len(matrix.matrix[row_idx]) else None
# Skip spanned cells (they're covered by the origin cell)
is_spanned = matrix_cell and matrix_cell.is_spanned
if is_spanned:
continue
cell = matrix.get_cell(row_idx, col_idx)
if cell is not None:
text = cell.text().strip()
if text:
# For multi-line text, get the max line width
lines = text.split('\n')
for line in lines:
max_width = max(max_width, len(line))
content_widths.append(max_width)
# For compact tables, just use natural widths with some padding
# Don't try to expand to fill the entire width
compact_widths = []
for width in content_widths:
# Add a bit of padding for readability but keep it compact
if width <= 2: # Symbol columns
compact_widths.append(width)
elif width <= 10: # Short numeric columns
compact_widths.append(width + 1)
else: # Text columns
compact_widths.append(min(width + 2, 40)) # Cap at 40 for very long text
# Check if compact table fits within maximum width
padding_per_col = 2 # Rich adds padding
total_padding = padding_per_col * len(compact_widths)
separators = len(compact_widths) - 1 # Column separators
total_width = sum(compact_widths) + total_padding + separators + 4 # 4 for table borders
if total_width <= table_width:
# Compact table fits, use it
return compact_widths
# If it doesn't fit, we need to compress intelligently
available_width = table_width - total_padding - separators - 4
# Need to compress - use smarter strategy
final_widths = []
# First pass: identify column types and minimum widths
col_types = []
is_first_text_col = True # Track first text column (usually description/label)
for col_idx, natural_width in enumerate(content_widths):
# Check if column is empty or just whitespace
is_empty = natural_width == 1 or all(
not matrix.get_cell(row_idx, col_idx) or not matrix.get_cell(row_idx, col_idx).text().strip()
for row_idx in range(matrix.row_count)
)
if is_empty:
col_types.append('empty')
final_widths.append(1) # Minimal width for empty columns but not zero
continue
# Get the header content width for this column by looking at the merged header
# This is how headers will actually be displayed (same logic as in render method)
header_parts = []
for row_idx in range(header_row_count):
expanded_row = matrix.get_expanded_row(row_idx)
if col_idx < len(expanded_row):
cell = expanded_row[col_idx]
if cell:
text = cell.text().strip()
# Skip empty cells and lone $ symbols (like render method does)
if text and text != '$':
header_parts.append(text)
# Calculate width needed for merged header
header_width = 1
merged_header = ""
if header_parts:
merged_header = '\n'.join(header_parts)
lines = merged_header.split('\n')
for line in lines:
header_width = max(header_width, len(line))
# Apply newline-safe buffer if header is multi-line
# This prevents Rich from re-wrapping merged headers
if '\n' in merged_header:
header_width = self._calculate_newline_safe_width(merged_header, header_width)
# Check if this is a symbol column (%, $, etc)
is_symbol = True
is_numeric = True
sample_values = []
for row_idx in range(header_row_count, min(matrix.row_count, header_row_count + 5)):
cell = matrix.get_cell(row_idx, col_idx)
if cell and cell.text().strip():
text = cell.text().strip()
sample_values.append(text)
if text not in ['%', '$', '', '-', '(', ')']:
is_symbol = False
if not (text.replace(',', '').replace('.', '').replace('(', '').replace(')', '').replace('$', '').replace('%', '').replace('-', '').replace(' ', '').isdigit()):
is_numeric = False
if is_symbol:
col_types.append('symbol')
# Symbol columns still need space for their headers
# Use header width if there's a meaningful header, otherwise minimal
if header_width > 2: # Has a real header, not just a symbol
final_widths.append(max(header_width, 7)) # At least 7 chars for headers
else:
final_widths.append(1) # True symbol column with no header
elif is_numeric or any('$' in v for v in sample_values):
col_types.append('numeric')
# Financial numbers need reasonable space to avoid wrapping
# Must be at least as wide as the header, but use at least 10 chars
min_numeric_width = max(10, header_width)
final_widths.append(min(natural_width, max(min_numeric_width, natural_width // 2)))
else:
col_types.append('text')
# First text column (usually row labels) gets more space
if is_first_text_col and col_idx == 0:
# Give generous space to the description column
# But cap at 35 to leave room for data columns
min_text_width = max(25, header_width)
final_widths.append(min(natural_width, max(min_text_width, min(35, natural_width))))
is_first_text_col = False
else:
# Other text columns get moderate space, but at least as wide as header
min_other_width = max(12, header_width)
final_widths.append(min(natural_width, max(min_other_width, natural_width // 2)))
# Second pass: redistribute remaining space if we're still over
current_total = sum(final_widths)
if current_total > available_width:
# Need to compress more
reduction_needed = current_total - available_width
# Sort columns by width (largest first) for reduction
# But prioritize reducing text columns before numeric columns
width_indices = sorted(range(len(final_widths)),
key=lambda i: (col_types[i] != 'text', final_widths[i]),
reverse=True)
for idx in width_indices:
if col_types[idx] not in ['symbol', 'empty'] and final_widths[idx] > 8:
# Reduce this column but maintain minimum readable width
# First text column (descriptions): minimum 20 chars
# Other text columns: minimum 10 chars
# Numeric columns: minimum 8 chars
if col_types[idx] == 'text' and idx == 0:
min_width = 20
elif col_types[idx] == 'text':
min_width = 10
else:
min_width = 8
reduction = min(final_widths[idx] - min_width, reduction_needed)
if reduction > 0:
final_widths[idx] -= reduction
reduction_needed -= reduction
if reduction_needed <= 0:
break
elif current_total < available_width - 10:
# We have extra space - distribute it to columns that need it most
extra_space = available_width - current_total
# Priority: Give extra space to columns that are below their natural width
# Focus on numeric columns that might have wrapped values
for col_idx, (final_w, natural_w) in enumerate(zip(final_widths, content_widths)):
if col_types[col_idx] == 'numeric' and final_w < natural_w:
# Give back some space to numeric columns
space_to_add = min(natural_w - final_w, extra_space // 2)
final_widths[col_idx] += space_to_add
extra_space -= space_to_add
if extra_space <= 0:
break
return final_widths
def _calculate_optimal_widths(self, matrix) -> List[int]:
"""
Calculate optimal column widths based on content.
Returns:
List of optimal widths for each column
"""
widths = []
for col_idx in range(matrix.col_count):
max_width = 0
is_symbol_column = True # Assume it's a symbol column until proven otherwise
all_values = []
# Check all cells in column to find max width needed
# Skip header rows when determining if it's a symbol column
header_row_count = len(self.headers) if self.headers else 0
for row_idx in range(matrix.row_count):
cell = matrix.get_cell(row_idx, col_idx)
if cell is not None:
text = cell.text().strip()
if text:
all_values.append(text)
max_width = max(max_width, len(text))
# Only check data rows (not headers) for symbol detection
if row_idx >= header_row_count:
# Check if this is NOT a symbol
if text not in ['$', '%', '', '£', '¥', '', '-', '', '']:
# If it's not a symbol and has alphanumeric content, it's not a symbol column
if any(c.isalnum() for c in text) and len(text) > 2:
is_symbol_column = False
# Determine width based on column type
if max_width == 0:
# Empty column
widths.append(1)
elif is_symbol_column:
# Column contains only symbols (%, $, etc.)
# Use minimal width regardless of header
widths.append(1) # Even tighter for symbols
elif max_width <= 3:
# Very short content (like "2", "(3)", "—")
# Check if it's mostly numbers or symbols in data rows
data_values = [v for v in all_values[header_row_count:] if v]
if all(len(v) <= 3 for v in data_values):
# All data values are 3 chars or less
widths.append(max_width + 1) # Just enough space
else:
widths.append(max_width + 2)
else:
# Regular content - use actual width needed
# But cap very long columns to prevent table explosion
widths.append(min(max_width, 30))
return widths
def _detect_column_alignments(self, matrix) -> List[str]:
"""Detect whether columns should be left or right aligned."""
alignments = []
for col_idx in range(matrix.col_count):
numeric_count = 0
total_count = 0
# Check data rows (skip headers)
start_row = len(self.headers) if self.headers else 0
for row_idx in range(start_row, matrix.row_count):
cell = matrix.get_cell(row_idx, col_idx)
if cell is not None and cell.text().strip():
total_count += 1
if cell.is_numeric:
numeric_count += 1
# If more than 60% numeric, right-align
if total_count > 0 and numeric_count / total_count > 0.6:
alignments.append("right")
else:
alignments.append("left")
return alignments
def html(self) -> str:
    """Return the table as HTML, one element per line.

    Emits an optional ``<caption>``, an optional ``<thead>``, a ``<tbody>``
    that is always present, and an optional ``<tfoot>``. Caption and cell
    text are inserted as-is.
    """
    out = ['<table>']

    if self.caption:
        out.append(f'<caption>{self.caption}</caption>')

    if self.headers:
        out.append('<thead>')
        for header_cells in self.headers:
            out.append('<tr>' + ''.join(cell.html() for cell in header_cells) + '</tr>')
        out.append('</thead>')

    out.append('<tbody>')
    out.extend(row.html() for row in self.rows)
    out.append('</tbody>')

    if self.footer:
        out.append('<tfoot>')
        out.extend(row.html() for row in self.footer)
        out.append('</tfoot>')

    out.append('</table>')
    return '\n'.join(out)
def to_dataframe(self) -> pd.DataFrame:
    """Convert table to pandas DataFrame with colspan/rowspan handling.

    Headers become the columns (a ``MultiIndex`` when there is more than one
    header row); without headers the columns are numbered from 0. Body cells
    are converted to floats when numeric, except texts starting with a
    currency symbol, which stay as text. Rows with no content are dropped,
    and rows are padded or truncated to the column count. When
    ``has_row_headers`` is true the first column becomes the index.

    Returns:
        The DataFrame; an empty one carrying only the columns when the table
        has no non-empty data rows.
    """
    from edgar.documents.utils.table_matrix import TableMatrix

    # Build matrix to handle colspan/rowspan WITHOUT merging currencies
    # ($ stays in its own cell to maintain alignment).
    clean_matrix = process_table_matrix(TableMatrix(), self.headers, self.rows)

    # Leading symbols marking a merged currency value ($, euro, pound, yen).
    # The euro sign is written as an escape so it cannot be lost in transit.
    currency_prefixes = ('$', '\u20ac', '£', '¥')

    if self.headers:
        header_arrays = []
        for row_idx in range(len(self.headers)):
            header_texts = []
            prev_text = ''
            for cell in clean_matrix.get_expanded_row(row_idx):
                if cell is not None:
                    prev_text = cell.text().strip()
                    header_texts.append(prev_text)
                elif row_idx == 0 and prev_text:
                    # Spanned position in the first row: repeat the spanning header
                    header_texts.append(prev_text)
                else:
                    # Lower header levels stay empty under a spanning parent
                    header_texts.append('')
            header_arrays.append(header_texts)

        if len(header_arrays) > 1:
            # Multi-level headers: pad every level to the same length
            max_len = max(len(arr) for arr in header_arrays)
            for arr in header_arrays:
                arr.extend([''] * (max_len - len(arr)))
            df_columns = pd.MultiIndex.from_arrays(header_arrays)
        else:
            df_columns = header_arrays[0] if header_arrays else []
    else:
        # No headers, use numeric columns
        df_columns = list(range(clean_matrix.col_count))

    # Extract data rows
    data = []
    start_row = len(self.headers) if self.headers else 0
    for row_idx in range(start_row, clean_matrix.row_count):
        row_data = []
        for cell in clean_matrix.get_expanded_row(row_idx):
            if cell is None:
                row_data.append(None)  # Empty cell
                continue
            text = cell.text()
            if cell.is_numeric and not text.startswith(currency_prefixes):
                row_data.append(cell.numeric_value)
            else:
                # Plain text, or a currency-prefixed value kept verbatim
                row_data.append(text)
        # Only keep rows with at least one non-blank value
        if any(value is not None and str(value).strip() for value in row_data):
            data.append(row_data)

    if not data:
        # Return empty DataFrame with columns
        return pd.DataFrame(columns=df_columns)

    # Ensure data width matches column width
    col_count = len(df_columns)
    data = [(row + [None] * col_count)[:col_count] for row in data]
    df = pd.DataFrame(data, columns=df_columns)

    # Set row index if first column is labels
    if self.has_row_headers and len(df.columns) > 0:
        df = df.set_index(df.columns[0])
    return df
def to_csv(self) -> str:
    """Export table as CSV.

    ``to_dataframe`` may move the first (label) column into the index. In
    that case the index is written too, so the row labels are not dropped
    from the output. A default, unnamed index is still omitted.
    """
    df = self.to_dataframe()
    # A named index only exists when set_index() promoted the label column.
    include_index = df.index.name is not None
    return df.to_csv(index=include_index)
def to_dict(self) -> Dict[str, Any]:
    """Return a plain-dict view of the table.

    Keys: ``type`` (name of the table type), ``caption``, and ``headers`` /
    ``data`` / ``footer`` as lists of rows of cell texts.
    """
    def cell_texts(cells):
        return [cell.text() for cell in cells]

    type_name = self.table_type.name
    return {
        'type': type_name,
        'caption': self.caption,
        'headers': [cell_texts(header_row) for header_row in self.headers],
        'data': [cell_texts(row.cells) for row in self.rows],
        'footer': [cell_texts(row.cells) for row in self.footer],
    }
def find_column(self, header_text: str) -> Optional[int]:
    """Return the index of the first top-row header containing ``header_text``.

    Matching is a case-insensitive substring test against the first header
    row only. Returns ``None`` when there are no headers or nothing matches.
    """
    if not self.headers:
        return None
    hits = (
        position
        for position, cell in enumerate(self.headers[0])
        if header_text.lower() in cell.text().lower()
    )
    return next(hits, None)
def extract_column(self, column_index: int) -> List[str]:
    """Return the text of cell ``column_index`` for each body row.

    Rows too short to contain that index are skipped.
    """
    return [
        row.cells[column_index].text()
        for row in self.rows
        if column_index < len(row.cells)
    ]
def find_row_by_first_cell(self, text: str) -> Optional[Row]:
    """Return the first body row whose first cell contains ``text``.

    The test is a case-insensitive substring match; rows without cells are
    ignored. Returns ``None`` when no row matches.
    """
    matching = (
        row for row in self.rows
        if row.cells and text.lower() in row.cells[0].text().lower()
    )
    return next(matching, None)
def get_numeric_columns(self) -> Dict[str, List[float]]:
    """Return numeric columns keyed by their first-header-row text.

    A column is rejected as soon as a non-numeric, non-blank cell appears in
    a row that is not a total row. Non-numeric cells that are blank or sit in
    a total row contribute ``None``. A surviving column is included only when
    more than half of its collected values are not ``None``.
    """
    columns = {}
    if not self.headers:
        return columns

    def collect(col_idx):
        # Returns the column values, or None when the column is disqualified.
        collected = []
        for row in self.rows:
            if col_idx >= len(row.cells):
                continue
            cell = row.cells[col_idx]
            if cell.is_numeric:
                collected.append(cell.numeric_value)
            elif not row.is_total_row and cell.text().strip():
                return None
            else:
                collected.append(None)
        return collected

    for col_idx, header_cell in enumerate(self.headers[0]):
        header = header_cell.text()
        values = collect(col_idx)
        if not values:
            continue
        populated = sum(1 for value in values if value is not None)
        if populated > len(values) * 0.5:
            columns[header] = values
    return columns
def find_totals(self) -> Dict[str, float]:
    """Map each total row's label to its first numeric value.

    The label is the text of the row's first cell (``"Total"`` if the row has
    no cells); the value is taken from the first numeric cell after the label
    cell. Total rows without a numeric cell are omitted.
    """
    found = {}
    for row in self.rows:
        if not row.is_total_row:
            continue
        label = row.cells[0].text() if row.cells else "Total"
        # Skip the label cell when looking for the value.
        first_numeric = next((cell for cell in row.cells[1:] if cell.is_numeric), None)
        if first_numeric is not None:
            found[label] = first_numeric.numeric_value
    return found
@property
def is_financial_table(self) -> bool:
    """True when the table is typed FINANCIAL or its headers look financial.

    The header check lower-cases all header cell texts, joins them with
    spaces and looks for any financial keyword as a substring.
    """
    if self.table_type == TableType.FINANCIAL:
        return True

    financial_keywords = (
        'revenue', 'income', 'expense', 'asset', 'liability',
        'cash', 'equity', 'profit', 'loss', 'margin',
    )
    lowered = [cell.text().lower() for header_row in self.headers for cell in header_row]
    header_blob = ' '.join(lowered)
    for keyword in financial_keywords:
        if keyword in header_blob:
            return True
    return False
@property
def row_count(self) -> int:
    """Number of header rows plus body rows (footer rows are not counted)."""
    header_rows = len(self.headers)
    body_rows = len(self.rows)
    return header_rows + body_rows
@property
def col_count(self) -> int:
    """Number of cells in the first header row, else in the first body row, else 0."""
    first_header = self.headers[0] if self.headers else None
    if first_header:
        return len(first_header)
    first_row_cells = self.rows[0].cells if self.rows else None
    if first_row_cells:
        return len(first_row_cells)
    return 0
@property
def has_header(self) -> bool:
    """True when the table has at least one header row."""
    return True if self.headers else False
@property
def has_row_headers(self) -> bool:
    """True when the first column looks like row labels.

    That is the case when fewer than 20% of the body rows start with a
    numeric cell. A table without body rows has no row headers.
    """
    if not self.rows:
        return False
    numeric_labels = sum(
        1 for row in self.rows
        if row.cells and row.cells[0].is_numeric
    )
    return numeric_labels < len(self.rows) * 0.2
@property
def numeric_columns(self) -> List[int]:
    """Indices of columns in which more than 50% of the body cells are numeric.

    Only rows long enough to contain the column are counted.
    """
    result = []
    for col_idx in range(self.col_count):
        column_cells = [
            row.cells[col_idx] for row in self.rows
            if col_idx < len(row.cells)
        ]
        numeric = sum(1 for cell in column_cells if cell.is_numeric)
        if column_cells and numeric / len(column_cells) > 0.5:
            result.append(col_idx)
    return result
def summarize_for_llm(self, max_tokens: int = 500) -> str:
    """Build a short multi-line text summary of the table.

    The summary lists the table type, size, caption (if any), up to five
    column names from the first header row, up to three totals from
    ``find_totals`` and up to three names from ``get_numeric_columns``.

    Args:
        max_tokens: Accepted for interface compatibility; the current
            implementation does not use it to limit the output.

    Returns:
        The summary lines joined with newlines.
    """
    column_count = len(self.headers[0]) if self.headers else 'unknown'
    lines = [
        f"Table Type: {self.table_type.name}",
        f"Size: {len(self.rows)} rows × {column_count} columns",
    ]

    if self.caption:
        lines.append(f"Caption: {self.caption}")

    if self.headers:
        names = [cell.text() for cell in self.headers[0]]
        lines.append(f"Columns: {', '.join(names[:5])}")
        hidden = len(names) - 5
        if hidden > 0:
            lines.append(f" ... and {hidden} more columns")

    totals = self.find_totals()
    if totals:
        lines.append("Key totals:")
        lines.extend(
            f" {label}: {value:,.0f}"
            for label, value in list(totals.items())[:3]
        )

    numeric_cols = self.get_numeric_columns()
    if numeric_cols:
        lines.append("Numeric columns found:")
        lines.extend(f" - {col_name}" for col_name in list(numeric_cols)[:3])

    return '\n'.join(lines)