Files
edgartools/venv/lib/python3.10/site-packages/edgar/sgml/table_to_dataframe.py
2025-12-09 12:13:01 +01:00

350 lines
13 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
Module for converting HTML tables from filing reports to pandas DataFrames.
This provides an alternative to XBRL parsing by extracting data directly from
company-formatted HTML tables.
"""
import re
from dataclasses import dataclass
from typing import Optional, Union
import pandas as pd
from edgar.files.html import Document, TableNode
from edgar.files.tables import ProcessedTable
@dataclass
class TableMetadata:
    """Metadata extracted from table headers and content"""
    # Matched currency code/symbol/name from the headers, e.g. '$' or 'USD'
    currency: Optional[str] = None
    # Human-readable scale stated in the headers: 'thousands'/'millions'/'billions'
    units: Optional[str] = None
    # Multiplier implied by `units` (1000, 1000000, 1000000000)
    scaling_factor: Optional[int] = None
    period_type: Optional[str] = None  # 'instant' or 'duration'
class FinancialTableExtractor:
    """Extract financial tables from HTML reports as pandas DataFrames.

    Works directly on company-formatted HTML tables (via TableNode /
    ProcessedTable) as an alternative to XBRL parsing.
    """

    # --- Patterns used to recognize financial metadata in header text ---

    # Currency codes, symbols, and spelled-out currency names
    CURRENCY_PATTERN = re.compile(
        r'\$|USD|EUR|GBP|JPY|CNY|CAD|AUD|CHF|'
        r'£|€|¥|₹|'  # Currency symbols
        r'\bDollars?\b|\bPounds?\b|\bEuros?\b|\bYen\b',
        re.IGNORECASE
    )

    # Scale designators such as "in thousands" / "millions" / "bn"
    UNITS_PATTERN = re.compile(
        r'(?:in\s+)?(?:thousands?|millions?|billions?|000s?|000,000s?|mln|mil|bn)',
        re.IGNORECASE
    )

    # Explicit scaling notes of the form "1,000 = $1"
    SCALING_PATTERN = re.compile(r'(\d+(?:,\d{3})*)\s*=\s*\$?1')

    # Flexible date/period patterns covering formats commonly seen in filings
    PERIOD_PATTERN = re.compile(
        r'(\d{1,2}[\s/\-]\w{3,}[\s/\-]\d{2,4}|'  # 31-Dec-2024, 31/December/24
        r'\w{3,}\.?\s+\d{1,2},?\s+\d{4}|'  # December 31, 2024
        r'\d{4}[\s/\-]\d{1,2}[\s/\-]\d{1,2}|'  # 2024-12-31
        r'\d{1,2}[\s/\-]\d{1,2}[\s/\-]\d{2,4}|'  # 12/31/2024, 31-12-24
        r'Q[1-4]\s*\d{2,4}|'  # Q1 2024, Q12024
        r'\d{1}Q\s*\d{2,4}|'  # 1Q 2024, 1Q24
        r'FY\s*\d{2,4}|'  # FY 2024, FY24
        r'Fiscal\s+\d{4}|'  # Fiscal 2024
        r'Year\s+Ended)',  # Year Ended
        re.IGNORECASE
    )

    @classmethod
    def extract_table_to_dataframe(cls, table_node: "TableNode") -> pd.DataFrame:
        """
        Convert a TableNode to a pandas DataFrame with appropriate data types.

        Args:
            table_node: The TableNode containing financial data

        Returns:
            pd.DataFrame with financial data, periods as columns, line items
            as index; empty DataFrame when the table cannot be processed.
        """
        try:
            # Processed table may be missing if upstream parsing failed
            processed_table = table_node._processed
            if not processed_table:
                return pd.DataFrame()
            # Extract currency/units/period-type metadata from headers
            metadata = cls._extract_metadata(table_node, processed_table)
            # Build the raw (string-valued) DataFrame
            df = cls._build_dataframe(processed_table, metadata)
            # Parse numbers and apply scaling
            df = cls._apply_transformations(df, metadata)
            return df
        except Exception:
            # Deliberate best-effort: swallow errors and return an empty
            # DataFrame so processing of other tables can continue.
            return pd.DataFrame()

    @classmethod
    def _extract_metadata(cls, table_node: "TableNode", processed_table: "ProcessedTable") -> "TableMetadata":
        """Extract metadata from table headers.

        Note: table_node is currently unused; it is kept in the signature so
        the interface stays stable for callers.
        """
        metadata = TableMetadata()

        # Check headers for currency and units
        if processed_table.headers:
            header_text = ' '.join(processed_table.headers)

            # Currency: first matching code/symbol/name wins
            currency_match = cls.CURRENCY_PATTERN.search(header_text)
            if currency_match:
                metadata.currency = currency_match.group(0)

            # Units/scaling: map the matched unit word to a multiplier.
            # Order matters: '000,000' must be tested before the thousands
            # marker '000,' (a substring of it), so check larger scales first.
            units_match = cls.UNITS_PATTERN.search(header_text)
            if units_match:
                unit_text = units_match.group(0).lower()
                if any(x in unit_text for x in ['billion', 'bn']):
                    metadata.scaling_factor = 1000000000
                    metadata.units = 'billions'
                elif any(x in unit_text for x in ['million', 'mln', 'mil', '000,000']):
                    metadata.scaling_factor = 1000000
                    metadata.units = 'millions'
                elif any(x in unit_text for x in ['thousand', '000s', '000,']):
                    metadata.scaling_factor = 1000
                    metadata.units = 'thousands'

        # Classify periods: headers containing "ended" imply durations
        # ("Year Ended December 31, ..."), otherwise treat as instants.
        if processed_table.headers:
            period_headers = [h for h in processed_table.headers if cls.PERIOD_PATTERN.search(h)]
            if period_headers:
                if any('ended' in h.lower() for h in period_headers):
                    metadata.period_type = 'duration'
                else:
                    metadata.period_type = 'instant'
        return metadata

    @classmethod
    def _build_dataframe(cls, processed_table: "ProcessedTable", metadata: "TableMetadata") -> pd.DataFrame:
        """Build the initial (string-valued) DataFrame from a processed table.

        The first column is used as the index (line items); the data columns
        are either every remaining column (vertical tables such as Cover
        Pages) or only the columns whose header looks like a period.

        Note: metadata is currently unused; kept for interface stability.
        """
        if not processed_table.data_rows:
            return pd.DataFrame()

        headers = processed_table.headers or []
        line_item_col = 0  # first column holds the line-item labels

        # Detect "vertical" tables (like Cover Page) where the first column
        # is labels and all other columns are data.
        is_vertical_table = False
        if len(headers) >= 2:
            # Heuristic 1: first header looks like a label column
            first_header_lower = headers[0].lower() if headers[0] else ''
            first_is_label = any(pattern in first_header_lower for pattern in
                                 ['entity', 'line item', 'information', 'abstract', 'cover page',
                                  'detail', 'description', 'item'])

            # Heuristic 2: first-column values look like entity/document fields
            looks_like_entity_info = False
            if processed_table.data_rows and len(processed_table.data_rows) > 2:
                first_col_values = []
                for row in processed_table.data_rows[:10]:  # sample first 10 rows
                    if len(row) > 0 and isinstance(row[0], str):
                        first_col_values.append(row[0].lower())

                entity_patterns = ['entity', 'document', 'registrant', 'address',
                                   'file number', 'incorporation', 'fiscal', 'telephone',
                                   'securities', 'trading', 'exchange', 'ticker']
                pattern_matches = sum(
                    any(pattern in val for pattern in entity_patterns)
                    for val in first_col_values
                )
                # >=30% entity-like labels => probably vertical. Guard against
                # an empty sample, where 0 >= 0 would be trivially true.
                looks_like_entity_info = (
                    bool(first_col_values)
                    and pattern_matches >= len(first_col_values) * 0.3
                )

            is_vertical_table = first_is_label or looks_like_entity_info

        if is_vertical_table:
            # Every column after the label column is data (range starts at 1,
            # so the line-item column can never be included)
            period_cols = [i for i in range(1, len(headers))]
        else:
            # Standard table: keep only columns whose header looks like a period
            period_cols = [i for i, header in enumerate(headers)
                           if cls.PERIOD_PATTERN.search(header)]

        # Extract rows, skipping blank line items
        data = []
        index = []
        for row in processed_table.data_rows:
            if len(row) > line_item_col:
                line_item = row[line_item_col].strip()
                if line_item and not line_item.isspace():
                    index.append(line_item)
                    # Pad short rows with '' so every row has one value per column
                    data.append([row[col_idx] if col_idx < len(row) else ''
                                 for col_idx in period_cols])

        if not data:
            return pd.DataFrame()

        # Build column names, de-duplicating repeated header text
        column_names = []
        for i, col_idx in enumerate(period_cols):
            if col_idx < len(headers):
                col_name = headers[col_idx].strip()
                if col_name in column_names:
                    col_name = f"{col_name}_{i}"
                column_names.append(col_name)
            else:
                column_names.append(f'Col_{i}')

        return pd.DataFrame(data, index=index, columns=column_names)

    @classmethod
    def _apply_transformations(cls, df: pd.DataFrame, metadata: "TableMetadata") -> pd.DataFrame:
        """Apply data type conversions and scaling; record metadata on the frame."""
        if df.empty:
            return df

        # Parse every cell; fully-numeric columns become float dtype,
        # mixed columns stay object dtype
        for col in df.columns:
            df[col] = df[col].apply(cls._parse_financial_value)

        # Scale only the columns that actually became numeric
        if metadata.scaling_factor:
            numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
            df[numeric_cols] = df[numeric_cols] * metadata.scaling_factor

        # Attach table-level metadata as DataFrame attributes
        df.attrs['currency'] = metadata.currency
        df.attrs['units'] = metadata.units
        df.attrs['scaling_factor'] = metadata.scaling_factor
        df.attrs['period_type'] = metadata.period_type
        return df

    @staticmethod
    def _parse_financial_value(value: str) -> Union[float, str]:
        """Parse a financial value string to float, or keep it as a string.

        Placeholder markers parse to 0.0; parenthesized and trailing-minus
        values become negatives; currency symbols, thousands separators and
        percent signs are stripped. Non-string inputs pass through unchanged.
        """
        if not isinstance(value, str):
            return value

        clean_value = value.strip()

        # Placeholder markers filers use for "no value". Includes hyphen,
        # en dash (–), em dash (—) and minus sign (−) — the dash variants
        # were previously garbled into duplicate empty strings.
        empty_markers = ['', '-', '\u2013', '\u2014', '\u2212',
                         'N/A', 'n/a', 'NA', 'nm', 'NM', '*', '**']
        if clean_value in empty_markers:
            return 0.0

        # Strip currency symbols, thousands separators and whitespace;
        # keep sign and decimal point
        clean_value = re.sub(r'[£€¥₹$,\s]', '', clean_value)

        # Parenthesized values are negative: (123) -> -123
        if clean_value.startswith('(') and clean_value.endswith(')'):
            clean_value = '-' + clean_value[1:-1]
        elif clean_value.endswith('-'):  # some companies put the sign at the end
            clean_value = '-' + clean_value[:-1]

        # Percentages keep their number: '12.5%' -> 12.5
        clean_value = clean_value.replace('%', '')

        try:
            return float(clean_value)
        except ValueError:
            # If it contains any digits, extract the first numeric run,
            # e.g. 'approx1.5x' -> 1.5
            if re.search(r'\d', clean_value):
                numeric_match = re.search(r'-?\d+\.?\d*', clean_value)
                if numeric_match:
                    try:
                        return float(numeric_match.group(0))
                    except ValueError:
                        pass
        # Not numeric at all: return the original string unchanged
        return value
def extract_statement_dataframe(report_content: str) -> pd.DataFrame:
    """
    Convenience function to extract a DataFrame from report HTML content.

    Args:
        report_content: HTML content from a report

    Returns:
        pd.DataFrame containing the financial data (empty when the document
        has no tables or none could be converted)
    """
    document = Document.parse(report_content)
    tables = document.tables
    if not tables:
        return pd.DataFrame()

    # Prefer tables that are big enough (>= 3 rows) and look numeric;
    # small tables are likely headers or metadata.
    candidates = (t for t in tables
                  if t.row_count >= 3 and _table_has_financial_data(t))
    for candidate in candidates:
        frame = FinancialTableExtractor.extract_table_to_dataframe(candidate)
        if not frame.empty:
            return frame

    # No candidate produced data: fall back to the first table anyway
    return FinancialTableExtractor.extract_table_to_dataframe(tables[0])
def _table_has_financial_data(table_node: TableNode) -> bool:
"""Check if a table contains financial data by looking for numeric patterns"""
if not table_node.content:
return False
# Check first few rows for numeric data
numeric_count = 0
total_cells = 0
for _i, row in enumerate(table_node.content[:10]): # Check first 10 rows
for cell in row.cells:
total_cells += 1
if isinstance(cell.content, str):
# Check for financial number patterns
if re.search(r'\$?\s*\d+[,.]?\d*', cell.content):
numeric_count += 1
# If more than 20% of cells have numbers, likely a financial table
return total_cells > 0 and (numeric_count / total_cells) > 0.2