Initial commit
This commit is contained in:
@@ -0,0 +1,349 @@
|
||||
"""
|
||||
Module for converting HTML tables from filing reports to pandas DataFrames.
|
||||
This provides an alternative to XBRL parsing by extracting data directly from
|
||||
company-formatted HTML tables.
|
||||
"""
|
||||
|
||||
import logging
import re
from dataclasses import dataclass
from typing import Optional, Union

import pandas as pd

from edgar.files.html import Document, TableNode
from edgar.files.tables import ProcessedTable
|
||||
|
||||
|
||||
@dataclass
class TableMetadata:
    """Metadata extracted from table headers and content.

    Filled in by scanning a table's header text; any field that could not be
    detected is left as None.
    """

    # Currency token found in the headers (e.g. '$', 'USD', 'Euros'); None if absent
    currency: Optional[str] = None
    # Human-readable magnitude label: 'thousands', 'millions', or 'billions'
    units: Optional[str] = None
    # Multiplier implied by the units (1000, 1000000, 1000000000); None = no scaling
    scaling_factor: Optional[int] = None
    period_type: Optional[str] = None  # 'instant' or 'duration'
|
||||
|
||||
|
||||
class FinancialTableExtractor:
    """Extract financial tables from HTML reports as pandas DataFrames.

    Stateless namespace of class/static methods implementing the pipeline:

        extract_table_to_dataframe -> _extract_metadata
                                   -> _build_dataframe
                                   -> _apply_transformations

    Detected metadata (currency, units, scaling factor, period type) is
    attached to the resulting DataFrame via ``df.attrs``.
    """

    # --- Patterns for recognising financial data in header/cell text ---

    # Currency codes, symbols, and spelled-out currency names
    CURRENCY_PATTERN = re.compile(
        r'\$|USD|EUR|GBP|JPY|CNY|CAD|AUD|CHF|'
        r'£|€|¥|₹|'  # Currency symbols
        r'\bDollars?\b|\bPounds?\b|\bEuros?\b|\bYen\b',
        re.IGNORECASE
    )
    # Magnitude qualifiers such as "in thousands" / "millions" / "bn"
    UNITS_PATTERN = re.compile(
        r'(?:in\s+)?(?:thousands?|millions?|billions?|000s?|000,000s?|mln|mil|bn)',
        re.IGNORECASE
    )
    # Explicit scaling statements like "1,000 = $1"
    SCALING_PATTERN = re.compile(r'(\d+(?:,\d{3})*)\s*=\s*\$?1')
    # Date/period formats commonly used as column headers
    PERIOD_PATTERN = re.compile(
        r'(\d{1,2}[\s/\-]\w{3,}[\s/\-]\d{2,4}|'  # 31-Dec-2024, 31/December/24
        r'\w{3,}\.?\s+\d{1,2},?\s+\d{4}|'        # December 31, 2024
        r'\d{4}[\s/\-]\d{1,2}[\s/\-]\d{1,2}|'    # 2024-12-31
        r'\d{1,2}[\s/\-]\d{1,2}[\s/\-]\d{2,4}|'  # 12/31/2024, 31-12-24
        r'Q[1-4]\s*\d{2,4}|'                     # Q1 2024, Q12024
        r'\d{1}Q\s*\d{2,4}|'                     # 1Q 2024, 1Q24
        r'FY\s*\d{2,4}|'                         # FY 2024, FY24
        r'Fiscal\s+\d{4}|'                       # Fiscal 2024
        r'Year\s+Ended)',                        # Year Ended
        re.IGNORECASE
    )

    # Cell values treated as "no data" and parsed as 0.0: dashes of various
    # widths, not-available / not-meaningful markers, footnote asterisks.
    # (Entries kept exactly as originally listed; some dash codepoints render
    # identically but may differ.)
    EMPTY_MARKERS = frozenset(
        ['—', '-', '–', '—', '‒', 'N/A', 'n/a', 'NA', 'nm', 'NM', '*', '**']
    )

    @classmethod
    def extract_table_to_dataframe(cls, table_node: "TableNode") -> pd.DataFrame:
        """
        Convert a TableNode to a pandas DataFrame with appropriate data types.

        Args:
            table_node: The TableNode containing financial data

        Returns:
            pd.DataFrame with financial data, periods as columns, line items
            as index.  An empty DataFrame is returned when the table has no
            processed representation or when extraction fails.
        """
        try:
            # The pre-processed table (headers + data rows) built by the HTML
            # parser; without it there is nothing to extract.
            processed_table = table_node._processed
            if not processed_table:
                return pd.DataFrame()

            # Extract metadata from headers, build the raw DataFrame, then
            # convert/scale values and attach the metadata.
            metadata = cls._extract_metadata(table_node, processed_table)
            df = cls._build_dataframe(processed_table, metadata)
            df = cls._apply_transformations(df, metadata)
            return df

        except Exception:
            # Best-effort extraction: log the failure but return an empty
            # DataFrame so callers can continue with the remaining tables.
            logging.getLogger(__name__).debug(
                "Failed to extract table to DataFrame", exc_info=True
            )
            return pd.DataFrame()

    @classmethod
    def _extract_metadata(cls, table_node: "TableNode",
                          processed_table: "ProcessedTable") -> "TableMetadata":
        """Extract metadata from table headers.

        Args:
            table_node: source table (currently unused; kept for interface
                stability)
            processed_table: processed table whose headers are scanned

        Returns:
            TableMetadata with currency / units / scaling factor / period type
            filled in where they could be detected.
        """
        metadata = TableMetadata()

        if processed_table.headers:
            header_text = ' '.join(processed_table.headers)

            # Currency: first symbol/code/name found anywhere in the headers
            currency_match = cls.CURRENCY_PATTERN.search(header_text)
            if currency_match:
                metadata.currency = currency_match.group(0)

            # Units / scaling factor: branches are checked in this order
            # (thousands, then millions, then billions)
            units_match = cls.UNITS_PATTERN.search(header_text)
            if units_match:
                unit_text = units_match.group(0).lower()
                if any(x in unit_text for x in ['thousand', '000s', '000,']):
                    metadata.scaling_factor = 1000
                    metadata.units = 'thousands'
                elif any(x in unit_text for x in ['million', 'mln', 'mil', '000,000']):
                    metadata.scaling_factor = 1000000
                    metadata.units = 'millions'
                elif any(x in unit_text for x in ['billion', 'bn']):
                    metadata.scaling_factor = 1000000000
                    metadata.units = 'billions'

            # Period type: headers containing "ended" indicate duration
            # periods (e.g. "Year Ended ..."); otherwise instant dates
            period_headers = [h for h in processed_table.headers
                              if cls.PERIOD_PATTERN.search(h)]
            if period_headers:
                if any('ended' in h.lower() for h in period_headers):
                    metadata.period_type = 'duration'
                else:
                    metadata.period_type = 'instant'

        return metadata

    @classmethod
    def _build_dataframe(cls, processed_table: "ProcessedTable",
                         metadata: "TableMetadata") -> pd.DataFrame:
        """Build the initial (string-valued) DataFrame from a processed table.

        Handles two layouts:
          * "vertical" tables (e.g. Cover Page) where the first column holds
            field labels and every other column holds data, and
          * standard financial tables, where data columns are recognised by
            period-like headers.

        The first column always supplies the index (line items).  ``metadata``
        is currently unused here but kept for interface stability.
        """
        if not processed_table.data_rows:
            return pd.DataFrame()

        headers = processed_table.headers or []
        line_item_col = 0  # first column is assumed to hold line-item labels

        # --- Detect a "vertical" table: first column is labels, all other
        # columns are data ---
        is_vertical_table = False
        if len(headers) >= 2:
            # Heuristic 1: the first header itself reads like a label column
            first_header_lower = headers[0].lower() if headers[0] else ''
            first_is_label = any(pattern in first_header_lower for pattern in
                                 ['entity', 'line item', 'information', 'abstract',
                                  'cover page', 'detail', 'description', 'item'])

            # Heuristic 2: the first column's values look like entity/document
            # field names (cover-page style tables)
            looks_like_entity_info = False
            if processed_table.data_rows and len(processed_table.data_rows) > 2:
                first_col_values = [row[0].lower()
                                    for row in processed_table.data_rows[:10]
                                    if len(row) > 0 and isinstance(row[0], str)]

                entity_patterns = ['entity', 'document', 'registrant', 'address',
                                   'file number', 'incorporation', 'fiscal',
                                   'telephone', 'securities', 'trading',
                                   'exchange', 'ticker']

                # Count rows whose label matches any entity-like pattern
                pattern_matches = sum(
                    any(pattern in val for pattern in entity_patterns)
                    for val in first_col_values
                )

                # 30% or more entity-like labels => probably a vertical table
                looks_like_entity_info = (
                    pattern_matches >= len(first_col_values) * 0.3
                )

            is_vertical_table = first_is_label or looks_like_entity_info

        if is_vertical_table:
            # First column is the index; every other column is data
            period_cols = list(range(1, len(headers)))
        else:
            # Standard table: data columns are those with period-like headers
            period_cols = [i for i, header in enumerate(headers)
                           if cls.PERIOD_PATTERN.search(header)]

        # --- Extract data rows ---
        data = []
        index = []
        for row in processed_table.data_rows:
            if len(row) <= line_item_col:
                continue  # row too short to even hold a label
            line_item = row[line_item_col].strip()
            if not line_item:
                continue  # skip rows with blank labels
            index.append(line_item)
            # Pad missing trailing cells with '' so every row has equal width
            data.append([row[col_idx] if col_idx < len(row) else ''
                         for col_idx in period_cols])

        if not data:
            return pd.DataFrame()

        # --- Build unique column names from the headers ---
        column_names = []
        for i, col_idx in enumerate(period_cols):
            if col_idx < len(headers):
                col_name = headers[col_idx].strip()
                if col_name in column_names:
                    # De-duplicate repeated headers by suffixing the position
                    col_name = f"{col_name}_{i}"
                column_names.append(col_name)
            else:
                column_names.append(f'Col_{i}')

        return pd.DataFrame(data, index=index, columns=column_names)

    @classmethod
    def _apply_transformations(cls, df: pd.DataFrame,
                               metadata: "TableMetadata") -> pd.DataFrame:
        """Convert cell strings to numbers, apply scaling, attach metadata.

        Mutates and returns ``df``.  Cells that cannot be parsed keep their
        original string value; columns that became numeric are multiplied by
        the scaling factor (e.g. x1000 for "in thousands") when one was
        detected.
        """
        if df.empty:
            return df

        # Parse each cell: "$1,234" -> 1234.0, "(56)" -> -56.0, "—" -> 0.0
        for col in df.columns:
            df[col] = df[col].apply(cls._parse_financial_value)

        # Scale only the columns that actually became numeric
        if metadata.scaling_factor:
            numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
            df[numeric_cols] = df[numeric_cols] * metadata.scaling_factor

        # Expose extraction metadata to callers without changing the schema
        df.attrs['currency'] = metadata.currency
        df.attrs['units'] = metadata.units
        df.attrs['scaling_factor'] = metadata.scaling_factor
        df.attrs['period_type'] = metadata.period_type

        return df

    @staticmethod
    def _parse_financial_value(value: str) -> Union[float, str]:
        """Parse a financial value string to float, or keep it as a string.

        Handles currency symbols, thousands separators, parenthesised and
        trailing-minus negatives, and percentages.  "Empty" markers (dashes,
        N/A, footnote asterisks) parse as 0.0; values with no usable digits
        are returned unchanged.
        """
        if not isinstance(value, str):
            return value  # already numeric (or None); pass through untouched

        clean_value = value.strip()

        # Empty/placeholder markers (checked before symbol stripping)
        if clean_value in FinancialTableExtractor.EMPTY_MARKERS or not clean_value:
            return 0.0

        # Strip currency symbols, thousands separators, and whitespace;
        # keep minus signs and decimal points
        clean_value = re.sub(r'[£€¥₹$,\s]', '', clean_value)

        # Negative formats: accounting parentheses "(123)" or, as some
        # companies report, a trailing minus "123-"
        if clean_value.startswith('(') and clean_value.endswith(')'):
            clean_value = '-' + clean_value[1:-1]
        elif clean_value.endswith('-'):
            clean_value = '-' + clean_value[:-1]

        # Percentages: keep the number, drop the sign ("12%" -> 12.0)
        clean_value = clean_value.replace('%', '')

        try:
            return float(clean_value)
        except ValueError:
            # Fall back to extracting the first numeric run, e.g. "12.3x" -> 12.3
            if re.search(r'\d', clean_value):
                numeric_match = re.search(r'-?\d+\.?\d*', clean_value)
                if numeric_match:
                    try:
                        return float(numeric_match.group(0))
                    except ValueError:
                        pass

        # Not numeric at all: return the original string untouched
        return value
|
||||
|
||||
|
||||
def extract_statement_dataframe(report_content: str) -> pd.DataFrame:
    """
    Convenience function to extract a DataFrame from report HTML content.

    Args:
        report_content: HTML content from a report

    Returns:
        pd.DataFrame containing the financial data; empty when the report
        contains no tables or nothing could be extracted.
    """
    # Parse HTML document
    document = Document.parse(report_content)

    if not document.tables:
        return pd.DataFrame()

    # Try each table and return the first one that yields financial data
    for table_node in document.tables:
        # Skip tables that are too small (likely headers or metadata)
        if table_node.row_count < 3:
            continue

        # Only attempt extraction on tables that contain numeric data
        if _table_has_financial_data(table_node):
            df = FinancialTableExtractor.extract_table_to_dataframe(table_node)
            if not df.empty:
                return df

    # No suitable table found: fall back to the first table.  document.tables
    # is known to be non-empty here (guarded above), so this is the last
    # resort rather than a reachable empty-return path.
    return FinancialTableExtractor.extract_table_to_dataframe(document.tables[0])
|
||||
|
||||
|
||||
def _table_has_financial_data(table_node: TableNode) -> bool:
|
||||
"""Check if a table contains financial data by looking for numeric patterns"""
|
||||
if not table_node.content:
|
||||
return False
|
||||
|
||||
# Check first few rows for numeric data
|
||||
numeric_count = 0
|
||||
total_cells = 0
|
||||
|
||||
for _i, row in enumerate(table_node.content[:10]): # Check first 10 rows
|
||||
for cell in row.cells:
|
||||
total_cells += 1
|
||||
if isinstance(cell.content, str):
|
||||
# Check for financial number patterns
|
||||
if re.search(r'\$?\s*\d+[,.]?\d*', cell.content):
|
||||
numeric_count += 1
|
||||
|
||||
# If more than 20% of cells have numbers, likely a financial table
|
||||
return total_cells > 0 and (numeric_count / total_cells) > 0.2
|
||||
Reference in New Issue
Block a user