637 lines
26 KiB
Python
637 lines
26 KiB
Python
"""
Advanced table processing strategy.
"""
|
|
|
|
import re
|
|
from functools import lru_cache
|
|
from typing import List, Optional
|
|
|
|
from lxml.html import HtmlElement
|
|
|
|
from edgar.documents.config import ParserConfig
|
|
from edgar.documents.strategies.style_parser import StyleParser
|
|
from edgar.documents.table_nodes import TableNode, Cell, Row
|
|
from edgar.documents.types import TableType
|
|
|
|
|
|
class TableProcessor:
    """
    Advanced table processing with type detection and structure analysis.

    Converts an HTML table element into a TableNode: rows are split into
    header, body and footer rows, every cell's text is cleaned, and, when the
    parser configuration asks for it, the table type is classified and simple
    row relationships are recorded as metadata.
    """
|
|
|
|
# HTML entities that need replacement.
#
# Keys are the *escaped* spellings that can survive parsing as literal text
# (for example when the source markup was double-escaped); values are their
# plain-text stand-ins. The keys must stay escaped: written as the decoded
# characters, the '&amp;' / '&lt;' / '&gt;' / '&quot;' entries become no-op
# identity mappings, the named and numeric spellings of each dash collapse
# into duplicate keys, and the apostrophe key becomes an unterminated string.
#
# _extract_text applies the replacements in insertion order, so keep the
# order stable when adding entries.
#
# NOTE(review): the numeric spellings (&#8202;, &#8203;, &#x2014;, &#x2013;,
# &#x2212;) were reconstructed from the characters they decode to - confirm
# against version control.
ENTITY_REPLACEMENTS = {
    # Dashes and minus signs
    '&horbar;': '-----',
    '&mdash;': '-----',
    '&ndash;': '---',
    '&minus;': '-',
    '&hyphen;': '-',
    '&dash;': '-',
    # Spacing and markup characters
    '&nbsp;': ' ',
    '&amp;': '&',
    '&lt;': '<',
    '&gt;': '>',
    '&quot;': '"',
    '&apos;': "'",
    # Numeric character references
    '&#8202;': ' ',
    '&#8203;': '',
    '&#x2014;': '-----',
    '&#x2013;': '---',
    '&#x2212;': '-',
}
|
|
|
|
# Words whose presence in a table's caption, headers or first rows points to a
# financial table (matched as lowercase substrings by _detect_table_type).
FINANCIAL_KEYWORDS = {
    # Statement names and framing
    'balance', 'statement', 'consolidated', 'total', 'net', 'gross',
    # Income statement
    'revenue', 'sales', 'income', 'earnings', 'profit', 'loss', 'margin',
    'expense', 'cost', 'operating', 'provision', 'tax', 'taxes',
    # Balance sheet
    'asset', 'liability', 'cash', 'equity',
    # Equity compensation
    'compensation', 'stock', 'share', 'shares', 'rsu', 'option', 'grant',
    'vest',
}
|
|
|
|
# Words whose presence points to a metrics table (matched as lowercase
# substrings by _detect_table_type).
METRICS_KEYWORDS = {
    # Proportions
    'ratio', 'percentage', 'percent', '%', 'rate',
    # Movement over time
    'growth', 'change', 'increase', 'decrease',
    # Aggregates
    'average', 'median', 'total', 'count', 'number',
}
|
|
|
|
def __init__(self, config: ParserConfig):
    """
    Create a processor bound to a parser configuration.

    Args:
        config: Parser settings, stored on the instance as ``config``
    """
    self.style_parser = StyleParser()
    self.config = config
|
|
|
|
def process(self, element: HtmlElement) -> TableNode:
    """
    Build a TableNode from an HTML table element.

    Args:
        element: HTML table element

    Returns:
        TableNode carrying the table's style, id/class metadata, caption,
        summary and rows, plus the detected table type and relationship
        metadata when the configuration enables them
    """
    node = TableNode(style=self.style_parser.parse(element.get('style', '')))

    # Rendering decisions are made against the parser configuration
    node._config = self.config

    # Record the id and the class list only when the attribute has a value
    attribute_metadata = (
        ('id', element.get('id')),
        ('classes', element.get('class', '').split()),
    )
    for key, value in attribute_metadata:
        if value:
            node.set_metadata(key, value)

    caption = element.find('.//caption')
    if caption is not None:
        node.caption = self._extract_text(caption)

    summary_text = element.get('summary')
    if summary_text:
        node.summary = summary_text

    # Fill headers, rows and footer
    self._process_table_structure(element, node)

    # Optional analysis passes, switched on by the configuration
    settings = self.config
    if settings.detect_table_types:
        node.table_type = self._detect_table_type(node)
    if settings.extract_table_relationships:
        self._extract_relationships(node)

    return node
|
|
|
|
def _process_table_structure(self, element: HtmlElement, table: TableNode):
    """
    Process table structure (thead, tbody, tfoot).

    Rows inside ``<thead>`` always become header rows and rows inside
    ``<tfoot>`` always become footer rows. Every other row is run through
    header detection until the first real data row has been seen; from then
    on all rows are stored as data rows.

    Args:
        element: HTML table element being processed
        table: TableNode whose ``headers``, ``rows`` and ``footer`` lists are
            appended to in place
    """
    unit_markers = ('(in millions)', '(in thousands)', '(in billions)')
    year_pattern = r'\b(19\d{2}|20\d{2})\b'

    thead = element.find('.//thead')
    tfoot = element.find('.//tfoot')

    # Process thead
    if thead is not None:
        for tr in thead.findall('.//tr'):
            cells = self._process_row(tr, is_header=True)
            if cells:
                table.headers.append(cells)

    # Process tbody (or direct rows when the markup has no tbody)
    tbody = element.find('.//tbody')
    rows_container = tbody if tbody is not None else element

    headers_found = bool(table.headers)
    data_rows_started = False

    for tr in rows_container.findall('.//tr'):
        parent = tr.getparent()
        # Skip rows already handled in thead
        if thead is not None and parent == thead:
            continue
        # Skip rows owned by tfoot: without a tbody the search starts at the
        # table itself, so footer rows would otherwise be stored here and then
        # a second time by the footer pass below.
        if tfoot is not None and parent == tfoot:
            continue

        # Header detection only runs until real data rows have started, which
        # is what lets multi-row headers accumulate.
        is_header_row = False
        if not data_rows_started:
            is_header_row = self._is_header_row(tr)

            # Additional check for multi-row headers in financial tables:
            # once a header exists, a units row or a row of years under a
            # "Year(s) Ended" header is treated as part of the header.
            if headers_found and not is_header_row:
                row_text = tr.text_content().strip()
                if any(marker in row_text for marker in unit_markers):
                    is_header_row = True
                elif len(table.headers) > 0:
                    last_header_text = ' '.join(
                        cell.text() for cell in table.headers[-1]
                    ).lower()
                    if 'year ended' in last_header_text or 'years ended' in last_header_text:
                        if re.search(year_pattern, row_text):
                            is_header_row = True

        cells = self._process_row(tr, is_header=is_header_row)
        if not cells:
            continue

        if is_header_row:
            table.headers.append(cells)
            headers_found = True
            continue

        table.rows.append(Row(cells=cells, is_header=False))

        # Once data has started there is nothing left to decide for this row
        if data_rows_started:
            continue

        # Empty rows at the beginning must not stop header detection
        cell_texts = [cell.text().strip() for cell in cells]
        if not any(cell_texts):
            continue

        row_text = ' '.join(cell_texts).strip()
        row_text_lower = row_text.lower()

        # Rows that still look header-related do not count as the start of
        # the data section, so a later row can still be detected as a header.
        is_header_related = (
            # Unit descriptions
            any(marker in row_text_lower for marker in unit_markers) or
            'except per share' in row_text_lower or
            # Financial period descriptions
            'year ended' in row_text_lower or
            'months ended' in row_text_lower or
            # Mostly just spacing/formatting
            len(row_text) < 5 or
            # Contains years (might be a misclassified header)
            bool(re.search(year_pattern, row_text))
        )
        if not is_header_related:
            data_rows_started = True

    # Process tfoot
    if tfoot is not None:
        for tr in tfoot.findall('.//tr'):
            cells = self._process_row(tr, is_header=False)
            if cells:
                table.footer.append(Row(cells=cells, is_header=False))
|
|
|
|
def _process_row(self, tr: HtmlElement, is_header: bool) -> List[Cell]:
|
|
"""Process table row into cells."""
|
|
cells = []
|
|
|
|
# Process both td and th elements
|
|
for cell_elem in tr.findall('.//td') + tr.findall('.//th'):
|
|
cell = self._process_cell(cell_elem, is_header or cell_elem.tag == 'th')
|
|
if cell:
|
|
cells.append(cell)
|
|
|
|
return cells
|
|
|
|
def _process_cell(self, elem: HtmlElement, is_header: bool) -> Optional[Cell]:
    """
    Process table cell.

    Malformed ``colspan``/``rowspan`` attributes (empty, ``"2px"``, ``"abc"``)
    do not raise: the leading digits are used when there are any, otherwise
    the span falls back to 1. Values that already parse as integers are
    passed through unchanged.

    Args:
        elem: ``<td>`` or ``<th>`` element
        is_header: Whether the cell is flagged as a header cell

    Returns:
        Cell with the element's cleaned content, span sizes and alignment
    """
    def span_of(attribute: str) -> int:
        raw = elem.get(attribute, '1')
        try:
            return int(raw)
        except ValueError:
            # Real-world markup carries values such as "" or "2px"; one bad
            # attribute should not abort processing of the whole table.
            leading_digits = re.match(r'\s*(\d+)', raw)
            return int(leading_digits.group(1)) if leading_digits else 1

    colspan = span_of('colspan')
    rowspan = span_of('rowspan')

    # An inline text-align style takes precedence over the align attribute
    align = elem.get('align')
    style = self.style_parser.parse(elem.get('style', ''))
    if style.text_align:
        align = style.text_align

    return Cell(
        content=self._extract_cell_content(elem),
        colspan=colspan,
        rowspan=rowspan,
        is_header=is_header,
        align=align
    )
|
|
|
|
def _extract_cell_content(self, elem: HtmlElement) -> str:
|
|
"""Extract and clean cell content."""
|
|
# Check for nested structure
|
|
divs = elem.findall('.//div')
|
|
if divs and len(divs) > 1:
|
|
# Multiple divs - likely multi-line content
|
|
lines = []
|
|
for div in divs:
|
|
text = self._extract_text(div)
|
|
if text:
|
|
lines.append(text)
|
|
return '\n'.join(lines)
|
|
|
|
# Handle line breaks
|
|
for br in elem.findall('.//br'):
|
|
br.tail = '\n' + (br.tail or '')
|
|
|
|
# Extract text
|
|
text = self._extract_text(elem)
|
|
|
|
return text
|
|
|
|
def _extract_text(self, elem: HtmlElement) -> str:
|
|
"""Extract and clean text from element."""
|
|
# Use itertext() to get all text fragments
|
|
# This preserves spaces better than text_content()
|
|
text_parts = []
|
|
for text in elem.itertext():
|
|
if text:
|
|
text_parts.append(text)
|
|
|
|
# Join parts, ensuring we don't lose spaces
|
|
# If a part doesn't end with whitespace and the next doesn't start with whitespace,
|
|
# we need to add a space between them
|
|
if not text_parts:
|
|
return ''
|
|
|
|
result = []
|
|
for i, part in enumerate(text_parts):
|
|
if i == 0:
|
|
result.append(part)
|
|
else:
|
|
prev_part = text_parts[i-1]
|
|
# Check if we need to add a space between parts
|
|
# Don't add space if previous ends with space or current starts with space
|
|
if prev_part and part:
|
|
if not prev_part[-1].isspace() and not part[0].isspace():
|
|
# Check for punctuation that shouldn't have space before it
|
|
if part[0] not in ',.;:!?%)]':
|
|
result.append(' ')
|
|
result.append(part)
|
|
|
|
text = ''.join(result)
|
|
|
|
# Replace entities
|
|
for entity, replacement in self.ENTITY_REPLACEMENTS.items():
|
|
text = text.replace(entity, replacement)
|
|
|
|
# Clean whitespace
|
|
text = text.strip()
|
|
|
|
# Normalize internal whitespace but preserve line breaks
|
|
lines = text.split('\n')
|
|
cleaned_lines = []
|
|
for line in lines:
|
|
# Collapse multiple spaces to single space
|
|
line = ' '.join(line.split())
|
|
cleaned_lines.append(line)
|
|
|
|
return '\n'.join(cleaned_lines)
|
|
|
|
@staticmethod
|
|
@lru_cache(maxsize=1)
|
|
def _get_period_header_pattern():
|
|
"""
|
|
Compile comprehensive regex for financial period headers.
|
|
Adapted from old parser's proven patterns.
|
|
|
|
Returns:
|
|
Compiled regex pattern matching financial period headers
|
|
"""
|
|
# Base components
|
|
periods = r'(?:three|six|nine|twelve|[1-4]|first|second|third|fourth)'
|
|
timeframes = r'(?:month|quarter|year|week)'
|
|
ended_variants = r'(?:ended|ending|end|period)'
|
|
as_of_variants = r'(?:as\s+of|at|as\s+at)'
|
|
|
|
# Date pattern
|
|
months = r'(?:january|february|march|april|may|june|july|august|september|october|november|december|jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)'
|
|
day = r'\d{1,2}'
|
|
year = r'(?:19|20)\d{2}'
|
|
date = f'{months}\\s*\\.?\\s*{day}\\s*,?\\s*{year}'
|
|
|
|
# Combined patterns
|
|
patterns = [
|
|
# Standard period headers
|
|
f'{periods}\\s+{timeframes}\\s+{ended_variants}(?:\\s+{date})?',
|
|
f'(?:fiscal\\s+)?{timeframes}\\s+{ended_variants}',
|
|
f'{timeframes}\\s+{ended_variants}(?:\\s+{date})?',
|
|
|
|
# Balance sheet date headers
|
|
f'{as_of_variants}\\s+{date}',
|
|
|
|
# Multiple date sequences
|
|
f'{date}(?:\\s*(?:and|,)\\s*{date})*',
|
|
|
|
# Single dates
|
|
f'(?:{ended_variants}\\s+)?{date}'
|
|
]
|
|
|
|
pattern = '|'.join(f'(?:{p})' for p in patterns)
|
|
return re.compile(pattern, re.IGNORECASE)
|
|
|
|
def _is_header_row(self, tr: HtmlElement) -> bool:
|
|
"""Detect if row is likely a header row in SEC filings."""
|
|
# Check if contains th elements (most reliable indicator)
|
|
if tr.find('.//th') is not None:
|
|
return True
|
|
|
|
cells = tr.findall('.//td')
|
|
if not cells:
|
|
return False
|
|
|
|
# Get row text for analysis
|
|
row_text = tr.text_content()
|
|
row_text_lower = row_text.lower()
|
|
|
|
# Check for date ranges with financial data (Oracle Table 6 pattern)
|
|
# Date ranges like "March 1, 2024—March 31, 2024" should be data rows, not headers
|
|
date_range_pattern = r'(?:january|february|march|april|may|june|july|august|september|october|november|december)\s+\d{1,2},\s*\d{4}\s*[—–-]\s*(?:january|february|march|april|may|june|july|august|september|october|november|december)\s+\d{1,2},\s*\d{4}'
|
|
has_date_range = bool(re.search(date_range_pattern, row_text_lower))
|
|
|
|
# Check for financial data indicators
|
|
has_currency = bool(re.search(r'\$[\s]*[\d,\.]+', row_text))
|
|
has_decimals = bool(re.search(r'\b\d+\.\d+\b', row_text))
|
|
has_large_numbers = bool(re.search(r'\b\d{1,3}(,\d{3})+\b', row_text))
|
|
|
|
# If row has date range + financial data, it's definitely a data row
|
|
if has_date_range and (has_currency or has_decimals or has_large_numbers):
|
|
return False
|
|
|
|
# Check for year patterns (very common in financial headers)
|
|
year_pattern = r'\b(19\d{2}|20\d{2})\b'
|
|
years_found = re.findall(year_pattern, row_text)
|
|
if len(years_found) >= 2: # Multiple years suggest header row
|
|
# IMPORTANT: Check for date ranges and same-year repetition
|
|
# Date ranges like "March 1, 2024—March 31, 2024" contain the same year twice
|
|
# but are data rows, not multi-year comparison headers
|
|
|
|
# If all years are the same (date range pattern)
|
|
if len(set(years_found)) == 1:
|
|
# Same year repeated - likely a date range like "Jan 1, 2024 - Mar 31, 2024"
|
|
# Not a multi-year comparison header
|
|
pass # Don't return True
|
|
# Multiple different years suggest multi-year comparison header
|
|
elif 'total' not in row_text_lower[:20]: # Check first 20 chars
|
|
return True
|
|
|
|
# Enhanced year detection - check individual cells for year patterns
|
|
# This handles cases where years are in separate cells
|
|
year_cells = 0
|
|
date_phrases = 0
|
|
for cell in cells:
|
|
cell_text = cell.text_content().strip()
|
|
if cell_text:
|
|
# Check for individual years
|
|
if re.match(r'^\s*(19\d{2}|20\d{2})\s*$', cell_text):
|
|
year_cells += 1
|
|
# Check for date phrases like "June 30, 2025"
|
|
elif 'june 30' in cell_text.lower() or 'december 31' in cell_text.lower():
|
|
date_phrases += 1
|
|
|
|
# If we have multiple year cells or year + date phrases, likely a header
|
|
if year_cells >= 2 or (year_cells >= 1 and date_phrases >= 1):
|
|
if 'total' not in row_text_lower[:20]:
|
|
return True
|
|
|
|
# Check for comprehensive financial period patterns (from old parser)
|
|
period_pattern = self._get_period_header_pattern()
|
|
if period_pattern.search(row_text_lower):
|
|
# Additional validation: ensure it's not a data row with period text
|
|
# Check for absence of strong data indicators
|
|
data_pattern = r'(?:\$\s*\d|\d+(?:,\d{3})+|\d+\s*[+\-*/]\s*\d+|\(\s*\d+(?:,\d{3})*\s*\))'
|
|
if not re.search(data_pattern, row_text):
|
|
return True
|
|
|
|
# Check for units notation (in millions, thousands, billions)
|
|
units_pattern = r'\(in\s+(?:millions|thousands|billions)\)'
|
|
if re.search(units_pattern, row_text_lower):
|
|
return True
|
|
|
|
# Check for period indicators (quarters, months)
|
|
# But be careful with "fiscal" - it could be data like "Fiscal 2025"
|
|
period_keywords = ['quarter', 'q1', 'q2', 'q3', 'q4', 'month',
|
|
'january', 'february', 'march', 'april', 'may', 'june',
|
|
'july', 'august', 'september', 'october', 'november', 'december',
|
|
'ended', 'three months', 'six months', 'nine months']
|
|
|
|
# Special handling for "fiscal" - only treat as header if it's part of a phrase like "fiscal year ended"
|
|
if 'fiscal' in row_text_lower:
|
|
# Check if row has numeric values (suggests it's data, not header)
|
|
# Look for patterns like "Fiscal 2025 $10,612"
|
|
has_currency_values = bool(re.search(r'\$[\s]*[\d,]+', row_text))
|
|
has_large_numbers = bool(re.search(r'\b\d{1,3}(,\d{3})+\b', row_text))
|
|
|
|
# If it has currency or large numbers, it's likely data
|
|
if has_currency_values or has_large_numbers:
|
|
return False
|
|
|
|
# Check if it's just "Fiscal YYYY" which is likely data, not a header
|
|
fiscal_year_only = re.match(r'^\s*fiscal\s+\d{4}\s*$', row_text_lower.strip())
|
|
if fiscal_year_only:
|
|
return False # This is data, not a header
|
|
|
|
# Check for header-like phrases with fiscal
|
|
if 'fiscal year' in row_text_lower and ('ended' in row_text_lower or 'ending' in row_text_lower):
|
|
return True
|
|
|
|
if any(keyword in row_text_lower for keyword in period_keywords):
|
|
# Validate it's not a data row with period keywords
|
|
# Check for strong data indicators
|
|
data_pattern = r'(?:\$\s*\d|\d+(?:,\d{3})+|\d+\.\d+|[(]\s*\d+(?:,\d{3})*\s*[)])'
|
|
if not re.search(data_pattern, row_text):
|
|
return True
|
|
|
|
# Check for column descriptors (but NOT total)
|
|
# These are words commonly found in headers but not data rows
|
|
header_keywords = ['description', 'item', 'category', 'type', 'classification',
|
|
'change', 'percent', 'increase', 'decrease', 'variance']
|
|
if any(keyword in row_text_lower for keyword in header_keywords):
|
|
# Make sure it's not a total row
|
|
if 'total' not in row_text_lower[:30]:
|
|
# Additional validation: long narrative text is not a header
|
|
# Headers are typically concise (< 150 chars)
|
|
if len(row_text) > 150:
|
|
return False
|
|
# Check for data indicators (would indicate data row, not header)
|
|
data_pattern = r'(?:\$\s*\d|\d+(?:,\d{3})+|\d+\.\d+|[(]\s*\d+(?:,\d{3})*\s*[)])'
|
|
if re.search(data_pattern, row_text):
|
|
return False
|
|
return True
|
|
|
|
# Check if all cells are bold (common header formatting)
|
|
bold_count = 0
|
|
for cell in cells:
|
|
style = cell.get('style', '')
|
|
if 'font-weight' in style and 'bold' in style:
|
|
bold_count += 1
|
|
elif cell.find('.//b') is not None or cell.find('.//strong') is not None:
|
|
bold_count += 1
|
|
|
|
# Only consider it a header if ALL cells are bold (not just some)
|
|
if bold_count == len(cells) and bold_count > 0:
|
|
return True
|
|
|
|
# Check content type ratio - headers usually have more text than numbers
|
|
# Count cells with primarily text vs primarily numbers
|
|
text_cells = 0
|
|
number_cells = 0
|
|
for cell in cells:
|
|
cell_text = cell.text_content().strip()
|
|
if cell_text:
|
|
# Remove common symbols for analysis
|
|
clean_text = cell_text.replace('$', '').replace('%', '').replace(',', '').replace('(', '').replace(')', '')
|
|
if clean_text.replace('.', '').replace('-', '').strip().isdigit():
|
|
number_cells += 1
|
|
else:
|
|
text_cells += 1
|
|
|
|
# Be very careful about treating text-heavy rows as headers
|
|
# Many data rows start with text labels (e.g., "Impact of...", "Effect of...")
|
|
# Only consider it a header if it has mostly text AND doesn't look like a data label
|
|
if text_cells > number_cells * 2 and text_cells >= 3:
|
|
# Check for common data row patterns
|
|
data_row_indicators = [
|
|
'impact of', 'effect of', 'adjustment', 'provision for', 'benefit',
|
|
'expense', 'income from', 'loss on', 'gain on', 'charge', 'credit',
|
|
'earnings', 'computed', 'state taxes', 'research', 'excess tax'
|
|
]
|
|
|
|
# If it starts with any of these, it's likely a data row, not a header
|
|
for indicator in data_row_indicators:
|
|
if row_text_lower.startswith(indicator) or indicator in row_text_lower[:50]:
|
|
return False
|
|
|
|
# Also not a header if it starts with "total"
|
|
if not row_text_lower.startswith('total'):
|
|
return True
|
|
|
|
return False
|
|
|
|
def _detect_table_type(self, table: TableNode) -> TableType:
    """
    Classify the table from its caption, headers and first three rows.

    The checks run in a fixed order and the first match wins: financial,
    metrics, table of contents, exhibit index, reference, then general.
    """
    # Gather the lowercase text the keyword checks look at
    fragments = []
    if table.caption:
        fragments.append(table.caption.lower())
    fragments.extend(
        cell.text().lower() for header_row in table.headers for cell in header_row
    )
    fragments.extend(
        cell.text().lower() for row in table.rows[:3] for cell in row.cells
    )
    combined_text = ' '.join(fragments)

    def keyword_hits(keywords):
        # Number of keywords occurring anywhere in the gathered text
        return sum(1 for keyword in keywords if keyword in combined_text)

    # Financial table: two keyword hits are enough
    if keyword_hits(self.FINANCIAL_KEYWORDS) >= 2:
        return TableType.FINANCIAL

    # Metrics table: any metrics keyword, or a mostly numeric body
    metrics_count = keyword_hits(self.METRICS_KEYWORDS)
    body_cells = [cell for row in table.rows for cell in row.cells]
    if body_cells:
        numeric_cells = sum(1 for cell in body_cells if cell.is_numeric)
        numeric_ratio = numeric_cells / len(body_cells)
        if metrics_count >= 1 or numeric_ratio > 0.3:
            return TableType.METRICS

    # Table of contents: a contents/index word plus something like a page number
    if 'content' in combined_text or 'index' in combined_text:
        for row in table.rows:
            for cell in row.cells:
                if re.search(r'\b\d{1,3}\b', cell.text()):
                    return TableType.TABLE_OF_CONTENTS

    # Exhibit index
    if 'exhibit' in combined_text:
        return TableType.EXHIBIT_INDEX

    # Reference table (citations, definitions, etc.)
    reference_words = ('reference', 'definition', 'glossary', 'citation')
    if any(word in combined_text for word in reference_words):
        return TableType.REFERENCE

    return TableType.GENERAL
|
|
|
|
def _extract_relationships(self, table: TableNode):
|
|
"""Extract relationships within table data."""
|
|
# This would implement relationship extraction
|
|
# For now, just set a flag that relationships were processed
|
|
table.set_metadata('relationships_extracted', True)
|
|
|
|
# Example relationships to extract:
|
|
# - Parent-child relationships (indented rows)
|
|
# - Total rows that sum other rows
|
|
# - Cross-references between cells
|
|
# - Time series relationships
|
|
|
|
# Detect total rows
|
|
total_rows = []
|
|
for i, row in enumerate(table.rows):
|
|
if row.is_total_row:
|
|
total_rows.append(i)
|
|
|
|
if total_rows:
|
|
table.set_metadata('total_rows', total_rows)
|
|
|
|
# Detect indentation patterns (parent-child)
|
|
indentation_levels = []
|
|
for row in table.rows:
|
|
if row.cells:
|
|
first_cell_text = row.cells[0].text()
|
|
# Count leading spaces
|
|
indent = len(first_cell_text) - len(first_cell_text.lstrip())
|
|
indentation_levels.append(indent)
|
|
|
|
if any(level > 0 for level in indentation_levels):
|
|
table.set_metadata('has_hierarchy', True)
|
|
table.set_metadata('indentation_levels', indentation_levels) |