"""
Module for converting HTML tables from filing reports to pandas DataFrames.

This provides an alternative to XBRL parsing by extracting data directly from
company-formatted HTML tables.
"""

import logging
import re
from dataclasses import dataclass
from typing import Optional, Union

import pandas as pd

from edgar.files.html import Document, TableNode
from edgar.files.tables import ProcessedTable

logger = logging.getLogger(__name__)

# Unit token prefixes -> (units label, scaling factor). Millions must be tested
# before thousands because '000,000' also starts with '000'.
_UNIT_SCALES = (
    (('000,000', 'million', 'mln', 'mil'), 'millions', 1_000_000),
    (('000', 'thousand'), 'thousands', 1_000),
    (('billion', 'bn'), 'billions', 1_000_000_000),
)

# Cell values that stand for "no amount"; they are parsed as 0.0.
_EMPTY_MARKERS = frozenset(
    ['—', '-', '–', '—', '‒', 'N/A', 'n/a', 'NA', 'nm', 'NM', '*', '**']
)

# Any run of digits, optionally preceded by '$' and containing one ',' or '.'.
_NUMERIC_CELL_PATTERN = re.compile(r'\$?\s*\d+[,.]?\d*')


@dataclass
class TableMetadata:
    """Metadata extracted from table headers."""

    currency: Optional[str] = None  # matched currency text, e.g. '$' or 'USD'
    units: Optional[str] = None  # 'thousands', 'millions' or 'billions'
    scaling_factor: Optional[int] = None  # multiplier that goes with ``units``
    period_type: Optional[str] = None  # 'instant' or 'duration'


class FinancialTableExtractor:
    """Extract financial tables from HTML reports as pandas DataFrames."""

    # Currency symbols, ISO codes and currency words. The ISO codes are fenced
    # with letter lookarounds so that, with IGNORECASE, words such as
    # "Unaudited" (aud) or "decade" (cad) are not reported as currencies.
    CURRENCY_PATTERN = re.compile(
        r'\$|'
        r'(?<![A-Za-z])(?:USD|EUR|GBP|JPY|CNY|CAD|AUD|CHF)(?![A-Za-z])|'
        r'£|€|¥|₹|'  # Currency symbols
        r'\bDollars?\b|\bPounds?\b|\bEuros?\b|\bYen\b',
        re.IGNORECASE
    )

    # Unit words/abbreviations. The unit token must stand alone: it may not be
    # glued to a preceding word character, comma or period (so "Family",
    # "2000s" and "1,000" do not match), and '000,000' is tried before '000'
    # because regex alternation takes the first alternative that matches.
    UNITS_PATTERN = re.compile(
        r'(?:in\s+)?(?<![\w,.])'
        r'(?P<unit>thousands?|millions?|billions?|000,000s?|000s?|mln|mil|bn)'
        r'(?!\w)',
        re.IGNORECASE
    )

    SCALING_PATTERN = re.compile(r'(\d+(?:,\d{3})*)\s*=\s*\$?1')

    # More flexible date patterns to handle various formats
    PERIOD_PATTERN = re.compile(
        r'(\d{1,2}[\s/\-]\w{3,}[\s/\-]\d{2,4}|'  # 31-Dec-2024, 31/December/24
        r'\w{3,}\.?\s+\d{1,2},?\s+\d{4}|'  # December 31, 2024
        r'\d{4}[\s/\-]\d{1,2}[\s/\-]\d{1,2}|'  # 2024-12-31
        r'\d{1,2}[\s/\-]\d{1,2}[\s/\-]\d{2,4}|'  # 12/31/2024, 31-12-24
        r'Q[1-4]\s*\d{2,4}|'  # Q1 2024, Q12024
        r'\d{1}Q\s*\d{2,4}|'  # 1Q 2024, 1Q24
        r'FY\s*\d{2,4}|'  # FY 2024, FY24
        r'Fiscal\s+\d{4}|'  # Fiscal 2024
        r'Year\s+Ended)',  # Year Ended
        re.IGNORECASE
    )

    # Header fragments that mark the first column as a label column.
    _LABEL_HEADER_HINTS = ('entity', 'line item', 'information', 'abstract',
                           'cover page', 'detail', 'description', 'item')

    # First-column fragments typical of cover page / entity information rows.
    _ENTITY_ROW_HINTS = ('entity', 'document', 'registrant', 'address',
                         'file number', 'incorporation', 'fiscal', 'telephone',
                         'securities', 'trading', 'exchange', 'ticker')

    @classmethod
    def extract_table_to_dataframe(cls, table_node: TableNode) -> pd.DataFrame:
        """
        Convert a TableNode to a pandas DataFrame with appropriate data types.

        Args:
            table_node: The TableNode containing financial data

        Returns:
            pd.DataFrame with line items as index and the detected data
            columns as columns. An empty DataFrame is returned when the node
            has no processed table or when extraction fails.
        """
        try:
            processed_table = table_node._processed
            if not processed_table:
                return pd.DataFrame()

            metadata = cls._extract_metadata(table_node, processed_table)
            df = cls._build_dataframe(processed_table, metadata)
            return cls._apply_transformations(df, metadata)
        except Exception:
            # Best effort: one malformed table must not abort processing of
            # the remaining tables, but keep the traceback available.
            logger.debug("Failed to convert table to DataFrame", exc_info=True)
            return pd.DataFrame()

    @classmethod
    def _extract_metadata(cls, table_node: TableNode,
                          processed_table: ProcessedTable) -> TableMetadata:
        """
        Derive currency, units/scaling and period type from the table headers.

        Args:
            table_node: The source node (currently not inspected)
            processed_table: Processed table whose ``headers`` are scanned

        Returns:
            TableMetadata; fields stay None when nothing was detected
        """
        metadata = TableMetadata()
        headers = processed_table.headers
        if not headers:
            return metadata

        header_text = ' '.join(headers)

        currency_match = cls.CURRENCY_PATTERN.search(header_text)
        if currency_match:
            metadata.currency = currency_match.group(0)

        units_match = cls.UNITS_PATTERN.search(header_text)
        if units_match:
            unit = units_match.group('unit').lower()
            for prefixes, label, factor in _UNIT_SCALES:
                if unit.startswith(prefixes):
                    metadata.units = label
                    metadata.scaling_factor = factor
                    break

        # Period headers containing "ended" are treated as durations,
        # any other period header as a point in time.
        period_headers = [h for h in headers if cls.PERIOD_PATTERN.search(h)]
        if period_headers:
            if any('ended' in h.lower() for h in period_headers):
                metadata.period_type = 'duration'
            else:
                metadata.period_type = 'instant'

        return metadata

    @classmethod
    def _is_vertical_table(cls, headers: list, data_rows: list) -> bool:
        """Return True for label/value tables such as a cover page."""
        if len(headers) < 2:
            return False

        first_header = headers[0].lower() if headers[0] else ''
        if any(hint in first_header for hint in cls._LABEL_HEADER_HINTS):
            return True

        if len(data_rows) <= 2:
            return False

        # Sample the first column of the leading rows for entity-like labels.
        labels = [row[0].lower() for row in data_rows[:10]
                  if len(row) > 0 and isinstance(row[0], str)]
        if not labels:
            # Nothing to judge by; without this guard 0 >= 0 would be True.
            return False

        hits = sum(
            any(hint in label for hint in cls._ENTITY_ROW_HINTS)
            for label in labels
        )
        # At least 30% entity-like labels -> probably a vertical table.
        return hits >= len(labels) * 0.3

    @classmethod
    def _build_dataframe(cls, processed_table: ProcessedTable,
                         metadata: TableMetadata) -> pd.DataFrame:
        """
        Build the raw (string-valued) DataFrame from a processed table.

        For vertical tables every column after the first is a data column.
        For standard tables only columns whose header matches PERIOD_PATTERN
        are kept. Column 0 supplies the row labels in both cases; rows with
        an empty label are dropped.

        Args:
            processed_table: Table providing ``headers`` and ``data_rows``
            metadata: Extracted metadata (not used while building)

        Returns:
            DataFrame of raw cell values, or an empty DataFrame when the table
            has no data rows or no labelled rows
        """
        data_rows = processed_table.data_rows
        if not data_rows:
            return pd.DataFrame()

        headers = processed_table.headers or []
        line_item_col = 0

        if cls._is_vertical_table(headers, data_rows):
            period_cols = [i for i in range(1, len(headers))
                           if i != line_item_col]
        else:
            period_cols = [i for i, header in enumerate(headers)
                           if cls.PERIOD_PATTERN.search(header)]

        data = []
        index = []
        for row in data_rows:
            if len(row) <= line_item_col:
                continue
            label = row[line_item_col]
            if not isinstance(label, str):
                # Tolerate stray non-string cells instead of failing the table.
                label = '' if label is None else str(label)
            label = label.strip()
            if not label:
                continue
            index.append(label)
            data.append([row[col_idx] if col_idx < len(row) else ''
                         for col_idx in period_cols])

        if not data:
            return pd.DataFrame()

        column_names = []
        used = set()
        for i, col_idx in enumerate(period_cols):
            if col_idx < len(headers):
                name = headers[col_idx].strip()
                if name in used:
                    name = f"{name}_{i}"
            else:
                name = f'Col_{i}'
            # The positional suffix can itself clash with an existing header;
            # keep suffixing until the name is unique.
            base, attempt = name, 1
            while name in used:
                name = f"{base}_{attempt}"
                attempt += 1
            used.add(name)
            column_names.append(name)

        return pd.DataFrame(data, index=index, columns=column_names)

    @classmethod
    def _apply_transformations(cls, df: pd.DataFrame,
                               metadata: TableMetadata) -> pd.DataFrame:
        """
        Parse cell values, apply the scaling factor and attach metadata.

        Every cell is passed through ``_parse_financial_value``. When a scaling
        factor is set, all numeric cells are multiplied by it, including those
        in columns that also hold text, so that one frame never mixes scaled
        and unscaled amounts. The frame is modified in place and returned.

        Args:
            df: Raw DataFrame produced by ``_build_dataframe``
            metadata: Metadata providing the scaling factor and attrs values

        Returns:
            The same DataFrame with parsed values and ``attrs`` populated
            (an empty frame is returned untouched)
        """
        if df.empty:
            return df

        factor = metadata.scaling_factor

        def scale_cell(cell):
            # bool is an int subclass; never treat it as an amount.
            if isinstance(cell, (int, float)) and not isinstance(cell, bool):
                return cell * factor
            return cell

        for col in df.columns:
            parsed = df[col].apply(cls._parse_financial_value)
            if factor:
                if parsed.dtype.kind in 'if':
                    parsed = parsed * factor
                else:
                    parsed = parsed.map(scale_cell)
            df[col] = parsed

        df.attrs['currency'] = metadata.currency
        df.attrs['units'] = metadata.units
        df.attrs['scaling_factor'] = metadata.scaling_factor
        df.attrs['period_type'] = metadata.period_type

        return df

    @staticmethod
    def _parse_financial_value(value: str) -> Union[float, str]:
        """
        Parse a financial value string to float, or keep it as a string.

        Rules, in order:
          * non-string input is returned unchanged;
          * empty cells and dash / N/A style markers become 0.0;
          * text containing a word of two or more letters (dates, labels,
            "nan", "inf") is returned unchanged;
          * digits joined by '-', '–' or '/' (dates, file numbers, phone
            numbers, ranges) are returned unchanged;
          * currency symbols, commas, whitespace and '%' are removed;
            "(x)" and "x-" are read as negative;
          * if the remainder is still not a plain number, the first number in
            it is used (e.g. an amount followed by a footnote marker).

        Args:
            value: Raw cell content

        Returns:
            The parsed float, or the original value when it is not numeric
        """
        if not isinstance(value, str):
            return value

        # float() does not understand the Unicode minus sign (U+2212).
        text = value.strip().replace('\u2212', '-')

        if not text or text in _EMPTY_MARKERS:
            return 0.0

        # Descriptive text is not an amount, even if it contains digits.
        if re.search(r'[^\W\d_]{2,}', text):
            return value

        # Strip '%' together with the other symbols so "(12.5)%" is still
        # recognised as a parenthesised negative below.
        cleaned = re.sub(r'[£€¥₹$,%\s]', '', text)

        # Dates, identifiers and ranges: two digit groups joined by - or /.
        if re.search(r'\d[-/–]\d', cleaned):
            return value

        if cleaned.startswith('(') and cleaned.endswith(')'):
            cleaned = '-' + cleaned[1:-1]
        elif cleaned.endswith('-'):
            # Some companies put the negative sign at the end
            cleaned = '-' + cleaned[:-1]

        try:
            return float(cleaned)
        except ValueError:
            pass

        # Fall back to the leading number, e.g. "1,234 (1)" with a footnote.
        numeric_match = re.search(r'-?\d+\.?\d*', cleaned)
        if numeric_match:
            try:
                return float(numeric_match.group(0))
            except ValueError:
                pass

        # Return original if not numeric
        return value


def extract_statement_dataframe(report_content: str) -> pd.DataFrame:
    """
    Convenience function to extract a DataFrame from report HTML content.

    The first table with at least three rows that looks numeric and converts
    to a non-empty DataFrame wins. If no table qualifies, the first table of
    the document is converted regardless.

    Args:
        report_content: HTML content from a report

    Returns:
        pd.DataFrame containing the financial data (empty if there are no
        tables or nothing could be extracted)
    """
    document = Document.parse(report_content)

    # Defensive: also covers a parse call that yields no document object.
    if document is None or not document.tables:
        return pd.DataFrame()

    tables = document.tables
    for table_node in tables:
        # Skip tables that are too small (likely headers or metadata)
        if table_node.row_count < 3:
            continue

        if _table_has_financial_data(table_node):
            df = FinancialTableExtractor.extract_table_to_dataframe(table_node)
            if not df.empty:
                return df

    # No suitable table found: try the first table anyway
    return FinancialTableExtractor.extract_table_to_dataframe(tables[0])


def _table_has_financial_data(table_node: TableNode) -> bool:
    """
    Check whether a table looks numeric.

    Looks at the cells of the first 10 rows of ``table_node.content`` and
    counts string cells that contain a digit run.

    Args:
        table_node: Table whose rows expose ``cells`` with a ``content`` value

    Returns:
        True when more than 20% of the inspected cells contain numbers
    """
    if not table_node.content:
        return False

    numeric_count = 0
    total_cells = 0

    for row in table_node.content[:10]:
        for cell in row.cells:
            total_cells += 1
            if (isinstance(cell.content, str)
                    and _NUMERIC_CELL_PATTERN.search(cell.content)):
                numeric_count += 1

    return total_cells > 0 and (numeric_count / total_cells) > 0.2