# File: edgartools/venv/lib/python3.10/site-packages/edgar/documents/utils/table_matrix.py
# Snapshot: 2025-12-09 12:13:01 +01:00 (858 lines, 39 KiB, Python)
#
# NOTE: This file contains Unicode characters that might be confused with
# other characters. If that is intentional, this warning can safely be ignored.
"""
Table matrix builder for handling complex colspan/rowspan structures.
"""
from dataclasses import dataclass
from typing import List, Optional
from edgar.documents.table_nodes import Cell, Row
@dataclass
class MatrixCell:
    """Cell in the matrix with reference to original cell."""
    # Source cell covering this slot; None marks an empty slot.
    original_cell: Optional["Cell"] = None
    # True if this slot is part of a colspan/rowspan rather than the origin slot.
    is_spanned: bool = False
    # Row index at which the source cell starts (-1 when unset).
    row_origin: int = -1
    # Column index at which the source cell starts (-1 when unset).
    col_origin: int = -1


class TableMatrix:
    """
    Build a 2D matrix representation of table with proper handling of merged cells.

    This class converts a table with colspan/rowspan into a regular 2D grid
    where each merged cell occupies multiple positions in the matrix.

    Attributes:
        matrix: Row-major grid of MatrixCell slots.
        row_count: Number of rows in the grid.
        col_count: Number of columns in the grid.
        header_row_count: Number of leading rows that came from header rows.
    """

    # Lower-case month abbreviations; a cell containing one is treated as a
    # date, not a number, by the special numeric placement rule.
    _MONTH_ABBREVIATIONS = ('jan', 'feb', 'mar', 'apr', 'may', 'jun',
                            'jul', 'aug', 'sep', 'oct', 'nov', 'dec')
    # Capitalised month abbreviations. Every full English month name contains
    # its abbreviation, so a substring test on these covers both spellings.
    _MONTH_SHORT_NAMES = ('Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                          'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec')
    # Dash-style placeholders that do not count as meaningful cell content.
    # NOTE(review): the em dash / en dash entries are assumed. The two
    # literals next to '-' were blank in the copy under review (the file is
    # flagged as containing confusable Unicode); confirm against the
    # canonical file.
    _DASH_PLACEHOLDERS = ('-', '\u2014', '\u2013')
    # Placeholders that still indicate a financial column.
    _FINANCIAL_PLACEHOLDERS = _DASH_PLACEHOLDERS + ('*',)

    def __init__(self):
        """Initialize empty matrix."""
        self.matrix: List[List[MatrixCell]] = []
        self.row_count = 0
        self.col_count = 0
        self.header_row_count = 0  # Track number of header rows

    # ------------------------------------------------------------------
    # Construction
    # ------------------------------------------------------------------

    def build_from_rows(self, header_rows: List[List["Cell"]], data_rows: List["Row"]) -> 'TableMatrix':
        """
        Build matrix from header rows and data rows.

        Any grid left over from a previous build on this instance is discarded.

        Args:
            header_rows: List of header rows (each row is a list of Cells)
            data_rows: List of Row objects (their ``cells`` attribute is read)

        Returns:
            Self for chaining
        """
        self.header_row_count = len(header_rows)

        # Start from a clean slate so a rebuild never sees stale slots.
        self.matrix = []
        self.row_count = 0
        self.col_count = 0

        all_rows = list(header_rows)
        all_rows.extend(row.cells for row in data_rows)
        if not all_rows:
            return self

        self.row_count = len(all_rows)
        # First pass: determine the column count.
        self._calculate_dimensions(all_rows)
        self.matrix = [[MatrixCell() for _ in range(self.col_count)]
                       for _ in range(self.row_count)]
        # Second pass: place cells in the grid.
        self._place_cells(all_rows)
        return self

    def _is_special_numeric(self, cell: "Cell", row_idx: int) -> bool:
        """
        Decide whether a cell gets the right-aligned colspan=2 placement.

        True only for a colspan=2 cell below the first two rows whose text
        contains a comma, is more than 50% digits, does not start with '$'
        and contains no month abbreviation (e.g. "167,045", which should line
        up with "$167,045" in a neighbouring row).
        """
        if cell.colspan != 2 or row_idx <= 1:
            return False
        cell_text = cell.text().strip()
        if not cell_text or ',' not in cell_text or cell_text.startswith('$'):
            return False
        digit_ratio = sum(ch.isdigit() for ch in cell_text) / len(cell_text)
        if digit_ratio <= 0.5:
            return False
        lowered = cell_text.lower()
        return not any(month in lowered for month in self._MONTH_ABBREVIATIONS)

    def _calculate_dimensions(self, rows: List[List["Cell"]]):
        """
        Compute ``self.col_count`` from the rows, honouring colspan and rowspan.

        Slots reserved by a rowspan from an earlier row are tracked locally,
        mirroring what ``_place_cells`` will do, so the result does not depend
        on whatever ``self.matrix`` currently holds.
        """
        total_rows = len(rows)
        reserved = set()  # (row, col) slots claimed by a rowspan from above
        max_cols = 0
        for row_idx, row in enumerate(rows):
            col_pos = 0
            for cell in row:
                # Skip slots already taken by a rowspan from a previous row.
                while (row_idx, col_pos) in reserved:
                    col_pos += 1
                col_end = col_pos + cell.colspan
                max_cols = max(max_cols, col_end)

                # Rowspans are clipped to the table, as in _place_cells.
                last_row = min(row_idx + cell.rowspan, total_rows)
                if last_row - row_idx > 1:
                    special = self._is_special_numeric(cell, row_idx)
                    for below in range(row_idx + 1, last_row):
                        if special:
                            # Special placement leaves the first slot empty
                            # and puts the content in the second one.
                            reserved.discard((below, col_pos))
                            reserved.add((below, col_pos + 1))
                        else:
                            reserved.update((below, col) for col in range(col_pos, col_end))
                col_pos = col_end
        self.col_count = max_cols

    def _is_occupied(self, row: int, col: int) -> bool:
        """
        Check if a position is occupied by a cell from a previous row (rowspan).

        Reads the already-populated ``self.matrix``; it reports False for
        rows that have not been placed yet.
        """
        if row == 0:
            return False
        for prev_row in range(row):
            if prev_row < len(self.matrix) and col < len(self.matrix[prev_row]):
                cell = self.matrix[prev_row][col]
                if cell.original_cell and cell.row_origin == prev_row:
                    # Does this cell's rowspan reach the requested row?
                    if prev_row + cell.original_cell.rowspan > row:
                        return True
        return False

    def _place_cells(self, rows: List[List["Cell"]]):
        """Place cells in the matrix handling colspan and rowspan."""
        for row_idx, row in enumerate(rows):
            col_pos = 0
            for cell in row:
                # Find next available column position.
                while (col_pos < self.col_count
                       and self.matrix[row_idx][col_pos].original_cell is not None):
                    col_pos += 1

                # Grow the grid whenever the cell would run past the last
                # column, not only when it starts past it; otherwise a cell
                # shifted right by a rowspan above would be truncated.
                self._expand_columns(col_pos + cell.colspan)

                if self._is_special_numeric(cell, row_idx):
                    self._place_special_numeric(cell, row_idx, col_pos)
                else:
                    self._place_regular(cell, row_idx, col_pos)
                col_pos += cell.colspan

    def _place_special_numeric(self, cell: "Cell", row_idx: int, col_pos: int):
        """Place a colspan=2 numeric cell: empty first slot, content in the second."""
        last_row = min(row_idx + cell.rowspan, self.row_count)
        for target_row in range(row_idx, last_row):
            if col_pos < self.col_count:
                self.matrix[target_row][col_pos] = MatrixCell()
            if col_pos + 1 < self.col_count:
                self.matrix[target_row][col_pos + 1] = MatrixCell(
                    original_cell=cell,
                    is_spanned=False,
                    row_origin=row_idx,
                    col_origin=col_pos + 1,
                )

    def _place_regular(self, cell: "Cell", row_idx: int, col_pos: int):
        """Fill every slot covered by the cell; only the top-left one is the origin."""
        for r in range(cell.rowspan):
            if row_idx + r >= self.row_count:
                break
            for c in range(cell.colspan):
                if col_pos + c >= self.col_count:
                    break
                self.matrix[row_idx + r][col_pos + c] = MatrixCell(
                    original_cell=cell,
                    is_spanned=(r > 0 or c > 0),
                    row_origin=row_idx,
                    col_origin=col_pos,
                )

    def _expand_columns(self, new_col_count: int):
        """Expand matrix to accommodate more columns (no-op if already wide enough)."""
        if new_col_count <= self.col_count:
            return
        missing = new_col_count - self.col_count
        for row in self.matrix:
            row.extend(MatrixCell() for _ in range(missing))
        self.col_count = new_col_count

    # ------------------------------------------------------------------
    # Inspection
    # ------------------------------------------------------------------

    def _origin_text(self, row_idx: int, col_idx: int) -> str:
        """Return the stripped text of the origin cell at a slot, or '' for empty/spanned slots."""
        slot = self.matrix[row_idx][col_idx]
        if slot.original_cell and not slot.is_spanned:
            return slot.original_cell.text().strip()
        return ''

    def _column_has_text(self, col_idx: int, rows) -> bool:
        """Return True if any origin cell of the column has text within the given rows."""
        return any(self._origin_text(row_idx, col_idx) for row_idx in rows)

    def get_actual_columns(self) -> int:
        """Get the actual number of data columns (excluding empty/spacing columns)."""
        all_rows = range(self.row_count)
        # str.strip() already removes spaces and non-breaking spaces, so a
        # non-empty stripped text is real content.
        return sum(1 for col_idx in range(self.col_count)
                   if self._column_has_text(col_idx, all_rows))

    def get_column_widths(self) -> List[float]:
        """
        Estimate column widths based on content.

        Returns:
            For each column, the length of its longest origin-cell text;
            0 for a column with no content (likely a spacing column).
        """
        return [
            max((len(self._origin_text(row_idx, col_idx))
                 for row_idx in range(self.row_count)), default=0)
            for col_idx in range(self.col_count)
        ]

    def get_cell(self, row_idx: int, col_idx: int) -> Optional["Cell"]:
        """
        Get a cell at specific position in the matrix.

        Args:
            row_idx: Row index
            col_idx: Column index

        Returns:
            The source cell covering the position, a new empty Cell for an
            empty position, or None if the position is out of bounds
        """
        if not (0 <= row_idx < self.row_count and 0 <= col_idx < self.col_count):
            return None
        matrix_cell = self.matrix[row_idx][col_idx]
        if matrix_cell.original_cell:
            return matrix_cell.original_cell
        # Empty positions are reported as an empty cell rather than None.
        return Cell("")

    def get_expanded_row(self, row_idx: int) -> List[Optional["Cell"]]:
        """
        Get a row with cells expanded to match column count.

        For cells with colspan > 1, the cell appears in the first position
        and None in subsequent positions. Empty positions are None as well.
        Returns an empty list when ``row_idx`` is past the last row.
        """
        if row_idx >= self.row_count:
            return []
        expanded = []
        for col_idx in range(self.col_count):
            slot = self.matrix[row_idx][col_idx]
            is_origin = slot.original_cell and not slot.is_spanned
            expanded.append(slot.original_cell if is_origin else None)
        return expanded

    def get_data_columns(self) -> List[int]:
        """
        Get indices of columns that contain actual data (not spacing).

        Leading and trailing empty columns are dropped, and every run of
        consecutive empty columns is reduced to its first column, which is
        kept as a spacer.

        Returns:
            List of column indices to keep, in ascending order
        """
        all_rows = range(self.row_count)
        empty_cols = {col_idx for col_idx in range(self.col_count)
                      if not self._column_has_text(col_idx, all_rows)}

        cols_to_remove = set()
        # Leading empty columns.
        for col in range(self.col_count):
            if col not in empty_cols:
                break
            cols_to_remove.add(col)
        # Trailing empty columns.
        for col in reversed(range(self.col_count)):
            if col not in empty_cols:
                break
            cols_to_remove.add(col)

        # Runs of empty columns: keep the first as a spacer, drop the rest.
        i = 0
        while i < self.col_count - 1:
            if i in empty_cols and (i + 1) in empty_cols:
                run_end = i
                while run_end < self.col_count and run_end in empty_cols:
                    run_end += 1
                cols_to_remove.update(range(i + 1, run_end))
                i = run_end
            else:
                i += 1

        return [col for col in range(self.col_count) if col not in cols_to_remove]

    # ------------------------------------------------------------------
    # Spacing-column filtering
    # ------------------------------------------------------------------

    def filter_spacing_columns(self) -> 'TableMatrix':
        """
        Create a new matrix with spacing columns removed.

        Also handles colspan-generated duplicate columns and misalignment:
        a column holding only the tail of a split value ("$" | "1,000",
        "(5" | ")", "12" | "%") is merged into the column on its left.

        NOTE(review): a consolidated column is always dropped, but its
        content is merged only if the target column is kept; if the target
        is not kept, that content does not appear in the result.

        Returns:
            New TableMatrix with only data columns, or ``self`` when the
            matrix is empty or no column qualifies
        """
        # Nothing to filter; also avoids indexing column 0 of a zero-width grid.
        if self.row_count == 0 or self.col_count == 0:
            return self

        primary_header_cols = self._find_primary_header_columns()
        header_content_columns = self._find_header_content_columns()
        data_start_row = max(1, self._count_leading_header_rows())
        data_rows = range(data_start_row, self.row_count)

        data_cols = {col_idx for col_idx in range(self.col_count)
                     if self._column_has_text(col_idx, data_rows)}

        # Data columns that sit outside the primary header columns are
        # candidates for being merged into their left neighbour.
        consolidation_map = self._build_consolidation_map(
            data_cols - primary_header_cols, header_content_columns, data_start_row)

        cols_to_keep = set(primary_header_cols) | header_content_columns

        # Data columns one or two positions right of a primary header column
        # (headers often span several columns while the data sits in one).
        for header_col in primary_header_cols:
            for offset in (1, 2):
                data_col = header_col + offset
                if data_col in data_cols and data_col not in cols_to_keep:
                    if self._has_meaningful_text(data_col, data_start_row, 5):
                        cols_to_keep.add(data_col)

        # Data columns with consistent important content (dates, text, ...).
        for col_idx in data_cols:
            if col_idx not in cols_to_keep and self._has_significant_content(col_idx, data_start_row):
                cols_to_keep.add(col_idx)

        # Few primary headers but many data columns: headers probably live in
        # the data rows (e.g. years), so keep anything that looks financial.
        if len(primary_header_cols) <= 2 and len(data_cols) > 4:
            for col_idx in data_cols:
                if self._has_financial_content(col_idx, data_start_row):
                    cols_to_keep.add(col_idx)

        # Column 0 is kept when it carries row labels.
        if any(self._looks_like_label(self._origin_text(row_idx, 0)) for row_idx in data_rows):
            cols_to_keep.add(0)

        # Consolidated columns disappear; their content moves to the target.
        cols_to_keep -= set(consolidation_map)
        if not cols_to_keep:
            return self
        return self._project_columns(sorted(cols_to_keep), consolidation_map)

    def _find_primary_header_columns(self) -> set:
        """
        Find columns where a colspan > 1 cell with text starts in the first three rows.

        Falls back to every column with text in those rows when no such cell exists.
        """
        primary_header_cols = set()
        all_header_cols = set()
        for row_idx in range(min(3, self.row_count)):
            for col_idx in range(self.col_count):
                if not self._origin_text(row_idx, col_idx):
                    continue
                all_header_cols.add(col_idx)
                if self.matrix[row_idx][col_idx].original_cell.colspan > 1:
                    primary_header_cols.add(col_idx)
        return primary_header_cols or all_header_cols

    def _find_header_content_columns(self) -> set:
        """
        Find columns that hold or are spanned by non-empty header text.

        These columns must never be removed or consolidated as "spacing".
        """
        header_content_columns = set()
        header_rows = range(min(self.header_row_count, self.row_count))
        for col_idx in range(self.col_count):
            for row_idx in header_rows:
                slot = self.matrix[row_idx][col_idx]
                if not slot.original_cell:
                    continue
                if not slot.original_cell.text().strip():
                    continue
                header_content_columns.add(col_idx)
                if slot.is_spanned:
                    # Continuation of a header's span; keep scanning the
                    # remaining header rows of this column.
                    continue
                # Origin header cell: protect every column it spans too.
                for span_offset in range(1, slot.original_cell.colspan):
                    span_col = col_idx + span_offset
                    if span_col < self.col_count:
                        header_content_columns.add(span_col)
                break  # Found content, no need to check other header rows
        return header_content_columns

    def _count_leading_header_rows(self) -> int:
        """Count rows (at most three) before the first row that looks like numeric data."""
        header_rows = 0
        for row_idx in range(min(3, self.row_count)):
            if any(self._looks_like_numeric_data(self._origin_text(row_idx, col_idx))
                   for col_idx in range(self.col_count)):
                break
            header_rows += 1
        return header_rows

    @staticmethod
    def _looks_like_numeric_data(text: str) -> bool:
        """True for a lone '$' or for text containing both a comma and a digit."""
        if text == '$':
            return True
        return bool(text) and ',' in text and any(ch.isdigit() for ch in text)

    @staticmethod
    def _looks_like_label(text: str) -> bool:
        """True for text that reads as a row label: longer than one char, not a number, not '$...'."""
        return bool(text) and not text.isdigit() and not text.startswith('$') and len(text) > 1

    def _sample_texts(self, col_idx: int, data_start_row: int, sample_size: int) -> List[str]:
        """Collect non-empty origin-cell texts from the first ``sample_size`` data rows of a column."""
        last_row = min(data_start_row + sample_size, self.row_count)
        texts = (self._origin_text(row_idx, col_idx) for row_idx in range(data_start_row, last_row))
        return [text for text in texts if text]

    def _has_meaningful_text(self, col_idx: int, data_start_row: int, sample_size: int) -> bool:
        """True if the sampled rows contain text other than a dash placeholder."""
        return any(text not in self._DASH_PLACEHOLDERS
                   for text in self._sample_texts(col_idx, data_start_row, sample_size))

    def _has_significant_content(self, col_idx: int, data_start_row: int) -> bool:
        """True if at least three of the first ten data rows are meaningful and one looks important."""
        meaningful = [text for text in self._sample_texts(col_idx, data_start_row, 10)
                      if text not in self._DASH_PLACEHOLDERS]
        return len(meaningful) >= 3 and any(self._looks_important(text) for text in meaningful)

    def _looks_important(self, text: str) -> bool:
        """True for non-trivial text, text naming a month, or text containing '20' (likely a year)."""
        if len(text) > 3 and not text.replace(',', '').replace('.', '').isdigit():
            return True
        if any(month in text for month in self._MONTH_SHORT_NAMES):
            return True
        return '20' in text

    def _has_financial_content(self, col_idx: int, data_start_row: int) -> bool:
        """True if any of the first five data rows of the column looks like a financial value."""
        return any(self._looks_financial(text, col_idx)
                   for text in self._sample_texts(col_idx, data_start_row, 5))

    def _looks_financial(self, text: str, col_idx: int) -> bool:
        """True for negatives in parentheses, a closing ')', currency, percentages, plain numbers or placeholders."""
        if text.startswith('(') and any(ch.isdigit() for ch in text):
            return True
        if text == ')' and col_idx > 0:
            return True
        if '$' in text or '%' in text:
            return True
        if text.replace(',', '').replace('.', '').isdigit():
            return True
        return text in self._FINANCIAL_PLACEHOLDERS

    def _detect_consolidation(self, prev_col: int, data_col: int, sample_rows) -> Optional[str]:
        """
        Classify how ``data_col`` relates to the column on its left.

        Returns 'currency' ("$" | digits), 'parentheses' ("(..." | ")"),
        'percentage' (digits | "%") for the first sampled row that matches,
        or None when no row does. Spanned slots are sampled as well.
        """
        for row_idx in sample_rows:
            prev_slot = self.matrix[row_idx][prev_col]
            curr_slot = self.matrix[row_idx][data_col]
            if not (prev_slot.original_cell and curr_slot.original_cell):
                continue
            prev_text = prev_slot.original_cell.text().strip()
            curr_text = curr_slot.original_cell.text().strip()
            if not prev_text or not curr_text:
                continue
            if prev_text == '$' and curr_text[0].isdigit():
                return 'currency'
            if prev_text.startswith('(') and curr_text == ')':
                return 'parentheses'
            if curr_text == '%' and prev_text[-1].isdigit():
                return 'percentage'
        return None

    def _build_consolidation_map(self, candidate_cols: set, header_content_columns: set,
                                 data_start_row: int) -> dict:
        """
        Map each column to be merged onto the column directly left of it.

        Args:
            candidate_cols: Data columns that are not primary header columns
            header_content_columns: Columns protected because they carry header text
            data_start_row: First row considered data

        Returns:
            Dict of ``{column to drop: column that receives its content}``
        """
        sample_rows = range(data_start_row, min(data_start_row + 10, self.row_count))

        # First pass: every adjacent pair that shows a split-value pattern.
        potential = {}
        for data_col in sorted(candidate_cols):
            prev_col = data_col - 1
            if prev_col < 0:
                continue
            kind = self._detect_consolidation(prev_col, data_col, sample_rows)
            if kind:
                potential[data_col] = (prev_col, kind)

        # Second pass: a column that receives a closing parenthesis must
        # survive, so it may not itself be merged away for another reason.
        paren_targets = {target for target, kind in potential.values() if kind == 'parentheses'}

        consolidation_map = {}
        for data_col, (target_col, kind) in potential.items():
            if data_col in paren_targets and kind != 'parentheses':
                continue
            # Never merge columns that carry header content.
            if data_col in header_content_columns or target_col in header_content_columns:
                continue
            consolidation_map[data_col] = target_col
        return consolidation_map

    @staticmethod
    def _merge_cells(target_cell: "Cell", data_cell: "Cell") -> "Cell":
        """
        Combine a target cell with the cell consolidated into it.

        Returns a new Cell for "$" + value, "(value" + ")" and value + "%";
        the new cell takes span and header flags from the target. When the
        texts do not form one of these patterns the data cell is returned
        unchanged and replaces the target.
        """
        existing_text = target_cell.text().strip()
        new_text = data_cell.text().strip()
        if existing_text == '$' and new_text:
            merged_text, align_source = f"${new_text}", data_cell
        elif new_text == ')' and existing_text.startswith('('):
            merged_text, align_source = f"{existing_text})", data_cell
        elif new_text == '%' and existing_text:
            merged_text, align_source = f"{existing_text}%", target_cell
        else:
            return data_cell
        return Cell(
            content=merged_text,
            colspan=target_cell.colspan,
            rowspan=target_cell.rowspan,
            is_header=target_cell.is_header,
            align=getattr(align_source, 'align', None),
        )

    def _project_columns(self, cols_to_keep: List[int], consolidation_map: dict) -> 'TableMatrix':
        """
        Build the filtered matrix from the kept columns and apply consolidations.

        Args:
            cols_to_keep: Sorted indices of the columns to copy
            consolidation_map: ``{dropped column: target column}``; content is
                merged only when the target column is among ``cols_to_keep``
        """
        projected = TableMatrix()
        projected.row_count = self.row_count
        projected.col_count = len(cols_to_keep)
        projected.header_row_count = self.header_row_count  # Preserve header row count
        old_to_new = {old_col: new_idx for new_idx, old_col in enumerate(cols_to_keep)}

        for row_idx in range(self.row_count):
            source_row = self.matrix[row_idx]
            new_row = [MatrixCell() for _ in range(projected.col_count)]
            # (row_origin, col_origin) -> new column where that cell was first placed.
            placed_origins = {}

            # Copy kept columns; a cell seen again in the same row is a span continuation.
            for old_col, new_col in old_to_new.items():
                slot = source_row[old_col]
                if not slot.original_cell:
                    continue
                origin_key = (slot.row_origin, slot.col_origin)
                first_col = placed_origins.setdefault(origin_key, new_col)
                new_row[new_col] = MatrixCell(
                    original_cell=slot.original_cell,
                    is_spanned=first_col != new_col,
                    row_origin=slot.row_origin,
                    col_origin=first_col,
                )

            # Merge the content of dropped columns into their targets.
            for data_col, target_col in consolidation_map.items():
                new_col = old_to_new.get(target_col)
                if new_col is None:
                    continue
                data_slot = source_row[data_col] if data_col < len(source_row) else None
                if data_slot is None or not data_slot.original_cell or data_slot.is_spanned:
                    continue
                data_cell = data_slot.original_cell
                if not data_cell.text().strip():
                    continue
                # Merge against the target's ORIGINAL content, not the copy above.
                target_cell = source_row[target_col].original_cell
                if target_cell and target_cell.text().strip():
                    result_cell = self._merge_cells(target_cell, data_cell)
                else:
                    result_cell = data_cell  # Target is empty: just move the data.
                new_row[new_col] = MatrixCell(
                    original_cell=result_cell,
                    is_spanned=False,
                    row_origin=row_idx,
                    col_origin=new_col,
                )

            projected.matrix.append(new_row)
        return projected

    # ------------------------------------------------------------------
    # Export / debugging
    # ------------------------------------------------------------------

    def to_cell_grid(self) -> List[List[Optional["Cell"]]]:
        """
        Convert matrix to a simple 2D grid of cells.

        Returns:
            2D list where each position contains either the origin Cell or
            None (for empty and spanned positions)
        """
        return [self.get_expanded_row(row_idx) for row_idx in range(self.row_count)]

    def debug_print(self):
        """Print matrix structure for debugging (spanned slots are shown in brackets)."""
        print(f"Matrix: {self.row_count}×{self.col_count}")
        for row_idx in range(self.row_count):
            row_str = []
            for col_idx in range(self.col_count):
                cell = self.matrix[row_idx][col_idx]
                if cell.original_cell:
                    text = cell.original_cell.text()[:10]
                    if cell.is_spanned:
                        row_str.append(f"[{text}...]")
                    else:
                        row_str.append(f"{text}...")
                else:
                    row_str.append("___")
            print(f"Row {row_idx}: {' | '.join(row_str)}")
class ColumnAnalyzer:
    """Classify the columns of a TableMatrix as data columns or spacing columns."""

    def __init__(self, matrix: TableMatrix):
        """Store the table matrix whose columns will be analysed."""
        self.matrix = matrix

    def identify_spacing_columns(self) -> List[int]:
        """
        Find the columns that exist only for spacing.

        Returns:
            Ascending list of indices of columns that hold no content
        """
        widths = self.matrix.get_column_widths()
        total_width = sum(widths)
        return [
            col_idx
            for col_idx in range(self.matrix.col_count)
            if self._is_spacing_column(col_idx, widths, total_width)
        ]

    def _is_spacing_column(self, col_idx: int, widths: List[float], total_width: float) -> bool:
        """
        Tell whether a column is a spacing column.

        A column qualifies only when none of its origin (non-spanned) cells
        has any text after stripping whitespace. ``widths`` and
        ``total_width`` are accepted but not consulted.
        """
        grid = self.matrix.matrix
        for row_idx in range(self.matrix.row_count):
            slot = grid[row_idx][col_idx]
            if not slot.original_cell or slot.is_spanned:
                continue
            if slot.original_cell.text().strip():
                # Any text at all disqualifies the column.
                return False
        return True

    def get_clean_column_indices(self) -> List[int]:
        """
        List the columns that are not spacing columns.

        Returns:
            Ascending list of indices of columns that contain content
        """
        spacing = set(self.identify_spacing_columns())
        clean = []
        for col_idx in range(self.matrix.col_count):
            if col_idx in spacing:
                continue
            clean.append(col_idx)
        return clean