"""
Module for converting HTML tables from filing reports to pandas DataFrames.
This provides an alternative to XBRL parsing by extracting data directly from
company-formatted HTML tables.
"""
import logging
import re
from dataclasses import dataclass
from typing import Optional, Union

import pandas as pd

from edgar.files.html import Document, TableNode
from edgar.files.tables import ProcessedTable
@dataclass
class TableMetadata:
    """Header-derived facts about a table.

    Every field defaults to ``None``, which stands for "not detected".

    Attributes:
        currency: Currency marker found in the header text.
        units: Name of the reporting unit found in the header text.
        scaling_factor: Multiplier that goes with ``units``.
        period_type: Either ``'instant'`` or ``'duration'``.
    """

    # Currency symbol, code or name
    currency: Optional[str] = None

    # Reporting unit name
    units: Optional[str] = None

    # Integer multiplier associated with the reporting unit
    scaling_factor: Optional[int] = None

    # 'instant' or 'duration'
    period_type: Optional[str] = None
class FinancialTableExtractor:
    """Extract financial tables from HTML reports as pandas DataFrames.

    The class is used purely as a namespace: every method is a classmethod or
    staticmethod and no instance state is kept. The public entry point is
    :meth:`extract_table_to_dataframe`.
    """

    # Currency markers looked for in header text. The ISO codes are anchored on
    # word boundaries because the pattern is case-insensitive: without them a
    # header such as "(Unaudited)" matches "aud" and is reported as a currency.
    CURRENCY_PATTERN = re.compile(
        r'\$|\b(?:USD|EUR|GBP|JPY|CNY|CAD|AUD|CHF)\b|'
        r'£|€|¥|₹|'  # Currency symbols
        r'\bDollars?\b|\bPounds?\b|\bEuros?\b|\bYen\b',
        re.IGNORECASE
    )

    # Reporting-unit markers ("in thousands", "$000", "mln", ...).
    #  * Word boundaries stop abbreviations from matching inside other words
    #    ("mil" in "Family", "bn" in "subnet").
    #  * "000,000" is tried before "000" so the longer form can match at all.
    #  * The look-behind keeps the zero groups of an ordinary number such as
    #    "1,000" from being read as a unit marker.
    UNITS_PATTERN = re.compile(
        r'\b(?:in\s+)?'
        r'(?:thousands?|millions?|billions?|mln|mil|bn'
        r'|(?<![\d,.])000,000s?|(?<![\d,.])000s?)\b',
        re.IGNORECASE
    )

    SCALING_PATTERN = re.compile(r'(\d+(?:,\d{3})*)\s*=\s*\$?1')

    # Date / period formats recognised in column headers
    PERIOD_PATTERN = re.compile(
        r'(\d{1,2}[\s/\-]\w{3,}[\s/\-]\d{2,4}|'  # 31-Dec-2024, 31/December/24
        r'\w{3,}\.?\s+\d{1,2},?\s+\d{4}|'  # December 31, 2024
        r'\d{4}[\s/\-]\d{1,2}[\s/\-]\d{1,2}|'  # 2024-12-31
        r'\d{1,2}[\s/\-]\d{1,2}[\s/\-]\d{2,4}|'  # 12/31/2024, 31-12-24
        r'Q[1-4]\s*\d{2,4}|'  # Q1 2024, Q12024
        r'\d{1}Q\s*\d{2,4}|'  # 1Q 2024, 1Q24
        r'FY\s*\d{2,4}|'  # FY 2024, FY24
        r'Fiscal\s+\d{4}|'  # Fiscal 2024
        r'Year\s+Ended)',  # Year Ended
        re.IGNORECASE
    )

    # (substrings, units label, scaling factor), checked in order. Millions is
    # checked first because "000,000" also contains the thousands marker "000".
    _UNIT_SCALES = (
        (('000,000', 'million', 'mln', 'mil'), 'millions', 1000000),
        (('thousand', '000'), 'thousands', 1000),
        (('billion', 'bn'), 'billions', 1000000000),
    )

    # Cell contents that stand for "no value"
    _EMPTY_MARKERS = frozenset({
        '—', '-', '–', '‒', '\u2015', '\u2212',
        'N/A', 'n/a', 'NA', 'nm', 'NM', '*', '**',
    })

    # Substrings of the first header that mark a label/value ("vertical") table
    _LABEL_HEADER_HINTS = (
        'entity', 'line item', 'information', 'abstract', 'cover page',
        'detail', 'description', 'item',
    )

    # Substrings of first-column labels typical for cover page / entity tables
    _ENTITY_ROW_HINTS = (
        'entity', 'document', 'registrant', 'address',
        'file number', 'incorporation', 'fiscal', 'telephone',
        'securities', 'trading', 'exchange', 'ticker',
    )

    @classmethod
    def extract_table_to_dataframe(cls, table_node: TableNode) -> pd.DataFrame:
        """
        Convert a TableNode to a pandas DataFrame with appropriate data types.

        Args:
            table_node: The TableNode containing financial data

        Returns:
            pd.DataFrame with financial data, periods as columns, line items as
            index. An empty DataFrame is returned when the node has no
            processed table or when the conversion fails.
        """
        try:
            processed_table = table_node._processed
            if not processed_table:
                return pd.DataFrame()

            metadata = cls._extract_metadata(table_node, processed_table)
            df = cls._build_dataframe(processed_table, metadata)
            return cls._apply_transformations(df, metadata)
        except Exception:
            # Deliberately best-effort so one bad table does not stop the
            # caller, but record the failure instead of hiding it.
            logging.getLogger(__name__).warning(
                "Failed to convert table to DataFrame", exc_info=True
            )
            return pd.DataFrame()

    @staticmethod
    def _classify_units(unit_text: str) -> Optional[tuple]:
        """Map a UNITS_PATTERN match to ``(units, scaling_factor)``.

        Args:
            unit_text: Text matched by ``UNITS_PATTERN``.

        Returns:
            A ``(units, scaling_factor)`` tuple, or ``None`` when the text
            contains none of the known markers.
        """
        lowered = unit_text.lower()
        for hints, label, factor in FinancialTableExtractor._UNIT_SCALES:
            if any(hint in lowered for hint in hints):
                return label, factor
        return None

    @classmethod
    def _extract_metadata(cls, table_node: TableNode, processed_table: ProcessedTable) -> TableMetadata:
        """Extract currency, units and period type from the table headers.

        Args:
            table_node: The source node (currently not consulted).
            processed_table: Processed table whose ``headers`` are inspected.

        Returns:
            TableMetadata; fields that could not be detected stay ``None``.
        """
        metadata = TableMetadata()

        # Ignore non-string placeholders so join() and the regex searches
        # cannot raise on them.
        headers = [h for h in (processed_table.headers or []) if isinstance(h, str)]
        if not headers:
            return metadata

        header_text = ' '.join(headers)

        currency_match = cls.CURRENCY_PATTERN.search(header_text)
        if currency_match:
            metadata.currency = currency_match.group(0)

        units_match = cls.UNITS_PATTERN.search(header_text)
        if units_match:
            classified = cls._classify_units(units_match.group(0))
            if classified:
                metadata.units, metadata.scaling_factor = classified

        # Headers containing "ended" are taken to describe duration periods
        period_headers = [h for h in headers if cls.PERIOD_PATTERN.search(h)]
        if period_headers:
            if any('ended' in h.lower() for h in period_headers):
                metadata.period_type = 'duration'
            else:
                metadata.period_type = 'instant'

        return metadata

    @classmethod
    def _is_vertical_table(cls, headers, data_rows) -> bool:
        """Tell whether a table is a label/value ("vertical") table.

        A table counts as vertical when it has at least two headers and either
        the first header looks like a label heading, or at least 30% of the
        sampled first-column labels (first 10 rows) look like entity/document
        field names.

        Args:
            headers: Header strings (blank cells as ``''``).
            data_rows: The table's data rows.

        Returns:
            True for a vertical table, False otherwise.
        """
        if len(headers) < 2:
            return False

        first_header = headers[0].lower() if headers[0] else ''
        if any(hint in first_header for hint in cls._LABEL_HEADER_HINTS):
            return True

        if len(data_rows) <= 2:
            return False

        labels = [
            row[0].lower()
            for row in data_rows[:10]
            if len(row) > 0 and isinstance(row[0], str)
        ]
        if not labels:
            # Nothing sampled: "0 matches >= 0" must not count as a hit
            return False

        matches = sum(
            any(hint in label for hint in cls._ENTITY_ROW_HINTS)
            for label in labels
        )
        return matches >= len(labels) * 0.3

    @classmethod
    def _build_dataframe(cls, processed_table: ProcessedTable, metadata: TableMetadata) -> pd.DataFrame:
        """Build the raw (unparsed) DataFrame from a processed table.

        The first column supplies the index (line items). For vertical tables
        every other column is data; for standard tables only columns whose
        header matches ``PERIOD_PATTERN`` are kept. Rows with an empty or
        non-string label are dropped; cells missing from short rows are
        filled with ``''``.

        Args:
            processed_table: Table providing ``headers`` and ``data_rows``.
            metadata: Metadata for the table (currently not consulted).

        Returns:
            DataFrame of raw cell values, or an empty DataFrame when there is
            nothing to extract.
        """
        data_rows = processed_table.data_rows
        if not data_rows:
            return pd.DataFrame()

        # Normalise blank / non-string headers to '' so they can be searched
        headers = [h if isinstance(h, str) else '' for h in (processed_table.headers or [])]

        line_item_col = 0  # First column holds the line items
        if cls._is_vertical_table(headers, data_rows):
            period_cols = list(range(1, len(headers)))
        else:
            # The label column is never a data column, even if its header
            # happens to contain a date (e.g. a spanning "Year Ended ..." cell).
            period_cols = [
                i for i, header in enumerate(headers)
                if i != line_item_col and cls.PERIOD_PATTERN.search(header)
            ]

        data = []
        index = []
        for row in data_rows:
            if len(row) <= line_item_col:
                continue
            label_cell = row[line_item_col]
            line_item = label_cell.strip() if isinstance(label_cell, str) else ''
            if not line_item:
                continue
            index.append(line_item)
            data.append([row[col] if col < len(row) else '' for col in period_cols])

        if not data:
            return pd.DataFrame()

        # Column names must be unique: duplicate labels would make df[col]
        # return a frame and silently disable value parsing later on.
        seen = set()
        column_names = []
        for position, col_idx in enumerate(period_cols):
            col_name = headers[col_idx].strip()
            if col_name in seen:
                col_name = f"{col_name}_{position}"
                while col_name in seen:
                    col_name = f"{col_name}_{position}"
            seen.add(col_name)
            column_names.append(col_name)

        return pd.DataFrame(data, index=index, columns=column_names)

    @classmethod
    def _apply_transformations(cls, df: pd.DataFrame, metadata: TableMetadata) -> pd.DataFrame:
        """Parse cell values, apply the scaling factor and attach metadata.

        Args:
            df: Raw DataFrame from ``_build_dataframe`` (modified in place).
            metadata: Supplies ``scaling_factor`` and the values stored in
                ``df.attrs``.

        Returns:
            The same DataFrame with parsed (and, if applicable, scaled) values
            and ``attrs`` keys ``currency``, ``units``, ``scaling_factor`` and
            ``period_type``. An empty frame is returned untouched.
        """
        if df.empty:
            return df

        for col in df.columns:
            df[col] = df[col].apply(cls._parse_financial_value)

        factor = metadata.scaling_factor
        if factor:
            def scale_cell(cell):
                # bool is an int subclass but is never a financial amount
                if isinstance(cell, (int, float)) and not isinstance(cell, bool):
                    return cell * factor
                return cell

            numeric_cols = set(df.select_dtypes(include='number').columns)
            for col in df.columns:
                if col in numeric_cols:
                    df[col] = df[col] * factor
                else:
                    # Mixed text/number column: scale the numbers so the
                    # frame stays consistent with attrs['scaling_factor'].
                    df[col] = df[col].map(scale_cell)

        df.attrs['currency'] = metadata.currency
        df.attrs['units'] = metadata.units
        df.attrs['scaling_factor'] = metadata.scaling_factor
        df.attrs['period_type'] = metadata.period_type

        return df

    @staticmethod
    def _parse_financial_value(value: str) -> Union[float, str]:
        """Parse a financial value string to float or keep it as a string.

        Handles currency symbols, thousands separators, percentages,
        parenthesised and trailing-minus negatives, and the Unicode minus
        sign. If the cleaned text is still not a number but contains digits,
        the first run of digits is extracted.

        Args:
            value: Raw cell content. Non-string values are returned unchanged.

        Returns:
            ``0.0`` for empty cells and placeholder markers (dashes, "N/A",
            ...), a float for anything numeric, otherwise the original string.
        """
        if not isinstance(value, str):
            return value

        empty_markers = FinancialTableExtractor._EMPTY_MARKERS

        clean_value = value.strip()
        if not clean_value or clean_value in empty_markers:
            return 0.0

        # Remove currency symbols, thousands separators and whitespace
        clean_value = re.sub(r'[£€¥₹$,\s]', '', clean_value)
        # float() does not understand the Unicode minus sign
        clean_value = clean_value.replace('\u2212', '-')

        # A placeholder may only become visible once the currency symbol is
        # gone, e.g. "$ —" or a lone "$"
        if not clean_value and value.strip() != ',':
            # only symbols/whitespace were present
            if re.fullmatch(r'[£€¥₹$\s]+', value.strip()):
                return 0.0
        if clean_value in empty_markers:
            return 0.0

        # Strip the percent sign before sign handling so "(12.5)%" is negative
        clean_value = clean_value.replace('%', '')

        if clean_value.startswith('(') and clean_value.endswith(')'):
            clean_value = '-' + clean_value[1:-1]
        elif clean_value.endswith('-'):  # Some companies put the sign at the end
            clean_value = '-' + clean_value[:-1]

        try:
            return float(clean_value)
        except ValueError:
            # Not a clean number: fall back to the first numeric run, if any
            numeric_match = re.search(r'-?\d+\.?\d*', clean_value)
            if numeric_match:
                try:
                    return float(numeric_match.group(0))
                except ValueError:
                    pass

        # Return original if not numeric
        return value
def extract_statement_dataframe(report_content: str) -> pd.DataFrame:
    """
    Convenience function to extract a DataFrame from report HTML content.

    Tables with at least three rows that look numeric are tried in document
    order and the first non-empty result wins. If none qualifies, the first
    table of the document is converted as a last resort.

    Args:
        report_content: HTML content from a report

    Returns:
        pd.DataFrame containing the financial data; empty when the content is
        empty, cannot be parsed, or contains no usable table.
    """
    if not report_content:
        return pd.DataFrame()

    document = Document.parse(report_content)
    # NOTE(review): Document.parse is assumed to be able to return None for
    # unparseable input; the guard is harmless if it never does.
    tables = document.tables if document is not None else None
    if not tables:
        return pd.DataFrame()

    for table_node in tables:
        # Skip tables that are too small (likely headers or metadata)
        if table_node.row_count < 3:
            continue

        if _table_has_financial_data(table_node):
            df = FinancialTableExtractor.extract_table_to_dataframe(table_node)
            if not df.empty:
                return df

    # No suitable table found: try the first table anyway
    return FinancialTableExtractor.extract_table_to_dataframe(tables[0])
def _table_has_financial_data(table_node: TableNode) -> bool:
    """Heuristically decide whether a table holds numeric (financial) data.

    Looks at every cell in the first 10 rows of ``table_node.content`` and
    counts those whose string content contains a number-like pattern.

    Args:
        table_node: Table whose rows expose ``cells`` with a ``content`` value.

    Returns:
        True when more than 20% of the inspected cells contain a number,
        False otherwise (including for tables without content or cells).
    """
    rows = table_node.content
    if not rows:
        return False

    number_like = re.compile(r'\$?\s*\d+[,.]?\d*')
    sampled_cells = [cell for row in rows[:10] for cell in row.cells]
    if not sampled_cells:
        return False

    # Non-string cells still count towards the total, never towards the hits
    hits = sum(
        1
        for cell in sampled_cells
        if isinstance(cell.content, str) and number_like.search(cell.content)
    )
    return hits / len(sampled_cells) > 0.2