Initial commit
This commit is contained in:
@@ -0,0 +1,349 @@
|
||||
"""
|
||||
Module for converting HTML tables from filing reports to pandas DataFrames.
|
||||
This provides an alternative to XBRL parsing by extracting data directly from
|
||||
company-formatted HTML tables.
|
||||
"""
|
||||
|
||||
import logging
import re
from dataclasses import dataclass
from typing import Optional, Union

import pandas as pd

from edgar.files.html import Document, TableNode
from edgar.files.tables import ProcessedTable
|
||||
|
||||
|
||||
@dataclass
class TableMetadata:
    """Metadata extracted from table headers and content.

    Filled in by scanning a table's header text; any field that could not be
    detected is left as None.
    """

    # Currency token found in the headers (e.g. '$', 'USD', 'Euros'); None if absent
    currency: Optional[str] = None
    # Human-readable magnitude label: 'thousands', 'millions', or 'billions'
    units: Optional[str] = None
    # Multiplier implied by the units (1000, 1000000, 1000000000); None = no scaling
    scaling_factor: Optional[int] = None
    period_type: Optional[str] = None  # 'instant' or 'duration'
|
||||
|
||||
|
||||
class FinancialTableExtractor:
    """Extract financial tables from HTML reports as pandas DataFrames.

    Stateless namespace of class/static methods implementing the pipeline:

        extract_table_to_dataframe -> _extract_metadata
                                   -> _build_dataframe
                                   -> _apply_transformations

    Detected metadata (currency, units, scaling factor, period type) is
    attached to the resulting DataFrame via ``df.attrs``.
    """

    # --- Patterns for recognising financial data in header/cell text ---

    # Currency codes, symbols, and spelled-out currency names
    CURRENCY_PATTERN = re.compile(
        r'\$|USD|EUR|GBP|JPY|CNY|CAD|AUD|CHF|'
        r'£|€|¥|₹|'  # Currency symbols
        r'\bDollars?\b|\bPounds?\b|\bEuros?\b|\bYen\b',
        re.IGNORECASE
    )
    # Magnitude qualifiers such as "in thousands" / "millions" / "bn"
    UNITS_PATTERN = re.compile(
        r'(?:in\s+)?(?:thousands?|millions?|billions?|000s?|000,000s?|mln|mil|bn)',
        re.IGNORECASE
    )
    # Explicit scaling statements like "1,000 = $1"
    SCALING_PATTERN = re.compile(r'(\d+(?:,\d{3})*)\s*=\s*\$?1')
    # Date/period formats commonly used as column headers
    PERIOD_PATTERN = re.compile(
        r'(\d{1,2}[\s/\-]\w{3,}[\s/\-]\d{2,4}|'  # 31-Dec-2024, 31/December/24
        r'\w{3,}\.?\s+\d{1,2},?\s+\d{4}|'        # December 31, 2024
        r'\d{4}[\s/\-]\d{1,2}[\s/\-]\d{1,2}|'    # 2024-12-31
        r'\d{1,2}[\s/\-]\d{1,2}[\s/\-]\d{2,4}|'  # 12/31/2024, 31-12-24
        r'Q[1-4]\s*\d{2,4}|'                     # Q1 2024, Q12024
        r'\d{1}Q\s*\d{2,4}|'                     # 1Q 2024, 1Q24
        r'FY\s*\d{2,4}|'                         # FY 2024, FY24
        r'Fiscal\s+\d{4}|'                       # Fiscal 2024
        r'Year\s+Ended)',                        # Year Ended
        re.IGNORECASE
    )

    # Cell values treated as "no data" and parsed as 0.0: dashes of various
    # widths, not-available / not-meaningful markers, footnote asterisks.
    # (Entries kept exactly as originally listed; some dash codepoints render
    # identically but may differ.)
    EMPTY_MARKERS = frozenset(
        ['—', '-', '–', '—', '‒', 'N/A', 'n/a', 'NA', 'nm', 'NM', '*', '**']
    )

    @classmethod
    def extract_table_to_dataframe(cls, table_node: "TableNode") -> pd.DataFrame:
        """
        Convert a TableNode to a pandas DataFrame with appropriate data types.

        Args:
            table_node: The TableNode containing financial data

        Returns:
            pd.DataFrame with financial data, periods as columns, line items
            as index.  An empty DataFrame is returned when the table has no
            processed representation or when extraction fails.
        """
        try:
            # The pre-processed table (headers + data rows) built by the HTML
            # parser; without it there is nothing to extract.
            processed_table = table_node._processed
            if not processed_table:
                return pd.DataFrame()

            # Extract metadata from headers, build the raw DataFrame, then
            # convert/scale values and attach the metadata.
            metadata = cls._extract_metadata(table_node, processed_table)
            df = cls._build_dataframe(processed_table, metadata)
            df = cls._apply_transformations(df, metadata)
            return df

        except Exception:
            # Best-effort extraction: log the failure but return an empty
            # DataFrame so callers can continue with the remaining tables.
            logging.getLogger(__name__).debug(
                "Failed to extract table to DataFrame", exc_info=True
            )
            return pd.DataFrame()

    @classmethod
    def _extract_metadata(cls, table_node: "TableNode",
                          processed_table: "ProcessedTable") -> "TableMetadata":
        """Extract metadata from table headers.

        Args:
            table_node: source table (currently unused; kept for interface
                stability)
            processed_table: processed table whose headers are scanned

        Returns:
            TableMetadata with currency / units / scaling factor / period type
            filled in where they could be detected.
        """
        metadata = TableMetadata()

        if processed_table.headers:
            header_text = ' '.join(processed_table.headers)

            # Currency: first symbol/code/name found anywhere in the headers
            currency_match = cls.CURRENCY_PATTERN.search(header_text)
            if currency_match:
                metadata.currency = currency_match.group(0)

            # Units / scaling factor: branches are checked in this order
            # (thousands, then millions, then billions)
            units_match = cls.UNITS_PATTERN.search(header_text)
            if units_match:
                unit_text = units_match.group(0).lower()
                if any(x in unit_text for x in ['thousand', '000s', '000,']):
                    metadata.scaling_factor = 1000
                    metadata.units = 'thousands'
                elif any(x in unit_text for x in ['million', 'mln', 'mil', '000,000']):
                    metadata.scaling_factor = 1000000
                    metadata.units = 'millions'
                elif any(x in unit_text for x in ['billion', 'bn']):
                    metadata.scaling_factor = 1000000000
                    metadata.units = 'billions'

            # Period type: headers containing "ended" indicate duration
            # periods (e.g. "Year Ended ..."); otherwise instant dates
            period_headers = [h for h in processed_table.headers
                              if cls.PERIOD_PATTERN.search(h)]
            if period_headers:
                if any('ended' in h.lower() for h in period_headers):
                    metadata.period_type = 'duration'
                else:
                    metadata.period_type = 'instant'

        return metadata

    @classmethod
    def _build_dataframe(cls, processed_table: "ProcessedTable",
                         metadata: "TableMetadata") -> pd.DataFrame:
        """Build the initial (string-valued) DataFrame from a processed table.

        Handles two layouts:
          * "vertical" tables (e.g. Cover Page) where the first column holds
            field labels and every other column holds data, and
          * standard financial tables, where data columns are recognised by
            period-like headers.

        The first column always supplies the index (line items).  ``metadata``
        is currently unused here but kept for interface stability.
        """
        if not processed_table.data_rows:
            return pd.DataFrame()

        headers = processed_table.headers or []
        line_item_col = 0  # first column is assumed to hold line-item labels

        # --- Detect a "vertical" table: first column is labels, all other
        # columns are data ---
        is_vertical_table = False
        if len(headers) >= 2:
            # Heuristic 1: the first header itself reads like a label column
            first_header_lower = headers[0].lower() if headers[0] else ''
            first_is_label = any(pattern in first_header_lower for pattern in
                                 ['entity', 'line item', 'information', 'abstract',
                                  'cover page', 'detail', 'description', 'item'])

            # Heuristic 2: the first column's values look like entity/document
            # field names (cover-page style tables)
            looks_like_entity_info = False
            if processed_table.data_rows and len(processed_table.data_rows) > 2:
                first_col_values = [row[0].lower()
                                    for row in processed_table.data_rows[:10]
                                    if len(row) > 0 and isinstance(row[0], str)]

                entity_patterns = ['entity', 'document', 'registrant', 'address',
                                   'file number', 'incorporation', 'fiscal',
                                   'telephone', 'securities', 'trading',
                                   'exchange', 'ticker']

                # Count rows whose label matches any entity-like pattern
                pattern_matches = sum(
                    any(pattern in val for pattern in entity_patterns)
                    for val in first_col_values
                )

                # 30% or more entity-like labels => probably a vertical table
                looks_like_entity_info = (
                    pattern_matches >= len(first_col_values) * 0.3
                )

            is_vertical_table = first_is_label or looks_like_entity_info

        if is_vertical_table:
            # First column is the index; every other column is data
            period_cols = list(range(1, len(headers)))
        else:
            # Standard table: data columns are those with period-like headers
            period_cols = [i for i, header in enumerate(headers)
                           if cls.PERIOD_PATTERN.search(header)]

        # --- Extract data rows ---
        data = []
        index = []
        for row in processed_table.data_rows:
            if len(row) <= line_item_col:
                continue  # row too short to even hold a label
            line_item = row[line_item_col].strip()
            if not line_item:
                continue  # skip rows with blank labels
            index.append(line_item)
            # Pad missing trailing cells with '' so every row has equal width
            data.append([row[col_idx] if col_idx < len(row) else ''
                         for col_idx in period_cols])

        if not data:
            return pd.DataFrame()

        # --- Build unique column names from the headers ---
        column_names = []
        for i, col_idx in enumerate(period_cols):
            if col_idx < len(headers):
                col_name = headers[col_idx].strip()
                if col_name in column_names:
                    # De-duplicate repeated headers by suffixing the position
                    col_name = f"{col_name}_{i}"
                column_names.append(col_name)
            else:
                column_names.append(f'Col_{i}')

        return pd.DataFrame(data, index=index, columns=column_names)

    @classmethod
    def _apply_transformations(cls, df: pd.DataFrame,
                               metadata: "TableMetadata") -> pd.DataFrame:
        """Convert cell strings to numbers, apply scaling, attach metadata.

        Mutates and returns ``df``.  Cells that cannot be parsed keep their
        original string value; columns that became numeric are multiplied by
        the scaling factor (e.g. x1000 for "in thousands") when one was
        detected.
        """
        if df.empty:
            return df

        # Parse each cell: "$1,234" -> 1234.0, "(56)" -> -56.0, "—" -> 0.0
        for col in df.columns:
            df[col] = df[col].apply(cls._parse_financial_value)

        # Scale only the columns that actually became numeric
        if metadata.scaling_factor:
            numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
            df[numeric_cols] = df[numeric_cols] * metadata.scaling_factor

        # Expose extraction metadata to callers without changing the schema
        df.attrs['currency'] = metadata.currency
        df.attrs['units'] = metadata.units
        df.attrs['scaling_factor'] = metadata.scaling_factor
        df.attrs['period_type'] = metadata.period_type

        return df

    @staticmethod
    def _parse_financial_value(value: str) -> Union[float, str]:
        """Parse a financial value string to float, or keep it as a string.

        Handles currency symbols, thousands separators, parenthesised and
        trailing-minus negatives, and percentages.  "Empty" markers (dashes,
        N/A, footnote asterisks) parse as 0.0; values with no usable digits
        are returned unchanged.
        """
        if not isinstance(value, str):
            return value  # already numeric (or None); pass through untouched

        clean_value = value.strip()

        # Empty/placeholder markers (checked before symbol stripping)
        if clean_value in FinancialTableExtractor.EMPTY_MARKERS or not clean_value:
            return 0.0

        # Strip currency symbols, thousands separators, and whitespace;
        # keep minus signs and decimal points
        clean_value = re.sub(r'[£€¥₹$,\s]', '', clean_value)

        # Negative formats: accounting parentheses "(123)" or, as some
        # companies report, a trailing minus "123-"
        if clean_value.startswith('(') and clean_value.endswith(')'):
            clean_value = '-' + clean_value[1:-1]
        elif clean_value.endswith('-'):
            clean_value = '-' + clean_value[:-1]

        # Percentages: keep the number, drop the sign ("12%" -> 12.0)
        clean_value = clean_value.replace('%', '')

        try:
            return float(clean_value)
        except ValueError:
            # Fall back to extracting the first numeric run, e.g. "12.3x" -> 12.3
            if re.search(r'\d', clean_value):
                numeric_match = re.search(r'-?\d+\.?\d*', clean_value)
                if numeric_match:
                    try:
                        return float(numeric_match.group(0))
                    except ValueError:
                        pass

        # Not numeric at all: return the original string untouched
        return value
|
||||
|
||||
|
||||
def extract_statement_dataframe(report_content: str) -> pd.DataFrame:
    """
    Convenience function to extract a DataFrame from report HTML content.

    Args:
        report_content: HTML content from a report

    Returns:
        pd.DataFrame containing the financial data; empty when the report
        contains no tables or nothing could be extracted.
    """
    # Parse HTML document
    document = Document.parse(report_content)

    if not document.tables:
        return pd.DataFrame()

    # Try each table and return the first one that yields financial data
    for table_node in document.tables:
        # Skip tables that are too small (likely headers or metadata)
        if table_node.row_count < 3:
            continue

        # Only attempt extraction on tables that contain numeric data
        if _table_has_financial_data(table_node):
            df = FinancialTableExtractor.extract_table_to_dataframe(table_node)
            if not df.empty:
                return df

    # No suitable table found: fall back to the first table.  document.tables
    # is known to be non-empty here (guarded above), so this is the last
    # resort rather than a reachable empty-return path.
    return FinancialTableExtractor.extract_table_to_dataframe(document.tables[0])
|
||||
|
||||
|
||||
def _table_has_financial_data(table_node: TableNode) -> bool:
|
||||
"""Check if a table contains financial data by looking for numeric patterns"""
|
||||
if not table_node.content:
|
||||
return False
|
||||
|
||||
# Check first few rows for numeric data
|
||||
numeric_count = 0
|
||||
total_cells = 0
|
||||
|
||||
for _i, row in enumerate(table_node.content[:10]): # Check first 10 rows
|
||||
for cell in row.cells:
|
||||
total_cells += 1
|
||||
if isinstance(cell.content, str):
|
||||
# Check for financial number patterns
|
||||
if re.search(r'\$?\s*\d+[,.]?\d*', cell.content):
|
||||
numeric_count += 1
|
||||
|
||||
# If more than 20% of cells have numbers, likely a financial table
|
||||
return total_cells > 0 and (numeric_count / total_cells) > 0.2
|
||||
Reference in New Issue
Block a user