Files
2025-12-09 12:13:01 +01:00

63 lines
2.5 KiB
Python

"""
Table processing utilities for document parsing.
This module consolidates the standard table matrix processing pipeline used
across table rendering implementations (TableNode.render(), TableNode.to_dataframe(),
and FastTableRenderer.render_table_node()).
"""
from typing import TYPE_CHECKING
if TYPE_CHECKING:
from edgar.documents.utils.table_matrix import TableMatrix, ColumnAnalyzer
from edgar.documents.utils.currency_merger import CurrencyColumnMerger
def process_table_matrix(matrix: "TableMatrix", headers, rows) -> "TableMatrix":
    """
    Run the shared build -> filter -> merge pipeline on a table matrix.

    The pipeline has three stages, executed in this order:

    1. ``matrix.build_from_rows(headers, rows)`` populates ``matrix``
       (expanding colspan/rowspan).
    2. ``matrix.filter_spacing_columns()`` drops spacing columns, i.e. columns
       that contain only whitespace/empty cells.
    3. ``CurrencyColumnMerger`` detects currency-symbol columns that sit next
       to value columns and merges every pair it found.

    Args:
        matrix: TableMatrix instance to populate via ``build_from_rows``.
        headers: List of header rows (each row is a list of Cell objects).
        rows: List of data rows (each row is a list of Cell objects).

    Returns:
        The matrix produced by ``filter_spacing_columns()``; if the merger
        reports any ``merge_pairs``, the result of ``apply_merges()`` instead.

    Example:
        >>> matrix = TableMatrix()
        >>> clean_matrix = process_table_matrix(matrix, headers, rows)
        >>> # clean_matrix: spans expanded, spacing removed, currencies merged

    Note:
        This replaces the identical sequence that was previously repeated in
        TableNode.render(), TableNode.to_dataframe() and
        FastTableRenderer.render_table_node().
    """
    # Deferred to call time to avoid circular imports.
    from edgar.documents.utils.table_matrix import ColumnAnalyzer
    from edgar.documents.utils.currency_merger import CurrencyColumnMerger

    # Stage 1: populate the matrix from the raw rows (expands colspan/rowspan).
    matrix.build_from_rows(headers, rows)

    # NOTE(review): this instance is never read afterwards. It is constructed
    # only because the pre-consolidation code did so; confirm the constructor
    # has no side effects before deleting the call.
    _unused_analyzer = ColumnAnalyzer(matrix)

    # Stage 2: drop columns that hold only whitespace/empty cells.
    without_spacing = matrix.filter_spacing_columns()

    # Stage 3: merge currency symbol columns ($) into their adjacent numbers.
    merger = CurrencyColumnMerger(without_spacing)
    merger.detect_currency_pairs()
    if not merger.merge_pairs:
        # Nothing to merge: the spacing-filtered matrix is the final result.
        return without_spacing
    return merger.apply_merges()