Initial commit

2025-12-09 12:13:01 +01:00
commit 8e654ed209
13332 changed files with 2695056 additions and 0 deletions
--- a/venv/lib/python3.10/site-packages/edgar/files/tables.py
+++ b/venv/lib/python3.10/site-packages/edgar/files/tables.py
@@ -0,0 +1,657 @@
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Optional, Union
+
+if TYPE_CHECKING:
+    from edgar.files.html import BaseNode
+import re
+from functools import lru_cache
+
+from edgar.richtools import rich_to_text
+
+
+@dataclass
+class ProcessedTable:
+    """Represents a processed table ready for rendering"""
+    headers: Optional[list[str]]
+    data_rows: list[list[str]]
+    column_alignments: list[str]  # "left" or "right" for each column
+
+
+# Looks for actual numeric data values, currency, or calculations
+data_indicators = [
+    r'\$\s*\d',  # Currency with numbers
+    r'\d+(?:,\d{3})+',  # Numbers with thousands separators
+    r'\d+\s*[+\-*/]\s*\d+',  # Basic calculations
+    r'\(\s*\d+(?:,\d{3})*\s*\)',  # Parenthesized numbers
+]
+
+data_pattern = '|'.join(data_indicators)
+
+
+def is_number(s: str) -> bool:
+    """
+    Check if a string represents a number in common financial formats.
+
+    Handles:
+    - Regular numbers (123, -123, 123.45)
+    - Currency ($123, $123.45)
+    - Parenthetical negatives ((123), (123.45))
+    - Thousands separators (1,234, 1,234.56)
+    - Mixed formats ($1,234.56)
+    - Various whitespace
+    - En/Em dashes for negatives
+    - Multiple decimal formats (123.45, 123,45)
+
+    Args:
+        s: String to check
+
+    Returns:
+        bool: True if string represents a valid number
+    """
+    if not s or s.isspace():
+        return False
+
+    # Convert unicode minus/dash characters to regular minus
+    s = s.replace('−', '-').replace('–', '-').replace('—', '-')
+
+    # Handle parenthetical negatives
+    s = s.strip()
+    if s.startswith('(') and s.endswith(')'):
+        s = '-' + s[1:-1]
+
+    # Remove currency symbols and whitespace
+    s = s.replace('$', '').replace(' ', '')
+
+    # Handle European number format (convert 123,45 to 123.45)
+    if ',' in s and '.' not in s and len(s.split(',')[1]) == 2:
+        s = s.replace(',', '.')
+    else:
+        # Remove thousands separators
+        s = s.replace(',', '')
+
+    try:
+        float(s)
+        return True
+    except ValueError:
+        return False
+
+class TableProcessor:
+    @staticmethod
+    def process_table(node) -> Optional[ProcessedTable]:
+        """Process table node into a format ready for rendering"""
+        if not isinstance(node.content, list) or not node.content:
+            return None
+
+        def process_cell_content(content: Union[str, 'BaseNode']) -> str:
+            """Process cell content to handle HTML breaks and cleanup"""
+            if isinstance(content, str):
+                content = content.replace('<br/>', '\n').replace('<br>', '\n')
+                lines = [line.strip() for line in content.split('\n')]
+                return '\n'.join(line for line in lines if line)
+            else:
+                # Recursively process nested nodes
+                processed_table = content.render(500)
+                return rich_to_text(processed_table)
+
+        # Process all rows into virtual columns
+        virtual_rows = []
+        max_cols = max(sum(cell.colspan for cell in row.cells) for row in node.content)
+
+        # Convert all rows to virtual columns first
+        for row in node.content:
+            virtual_row = [""] * max_cols
+            current_col = 0
+
+            for cell in row.cells:
+                content = process_cell_content(cell.content)
+
+                if '\n' not in content and cell.is_currency and content.replace(',', '').replace('.', '').isdigit():
+                    content = f"${float(content.replace(',', '')):,.2f}"
+
+                if cell.colspan > 1:
+                    virtual_row[current_col + 1] = content
+                else:
+                    virtual_row[current_col] = content
+
+                current_col += cell.colspan
+
+            virtual_rows.append(virtual_row)
+
+        # Analyze and remove empty columns
+        empty_cols = []
+        for col in range(max_cols):
+            if all(row[col].strip() == "" for row in virtual_rows):
+                empty_cols.append(col)
+
+        # Process empty columns
+        cols_to_remove = TableProcessor._get_columns_to_remove(empty_cols, max_cols)
+
+        # Create optimized rows, filtering out empty ones
+        optimized_rows = []
+        for virtual_row in virtual_rows:
+            has_content = any(col.strip() for col in virtual_row)
+            if not has_content:
+                continue
+            optimized_row = [col for idx, col in enumerate(virtual_row) if idx not in cols_to_remove]
+            optimized_rows.append(optimized_row)
+
+        if not optimized_rows:
+            return None
+
+        # Detect headers
+        header_rows, data_start_idx = TableProcessor._analyze_table_structure(optimized_rows)
+
+        # Detect and fix misalignment in all rows
+        fixed_rows = TableProcessor._detect_and_fix_misalignment(optimized_rows, data_start_idx)
+
+        # Use the fixed header portion for processing headers
+        headers = None
+        if header_rows:
+            fixed_headers = fixed_rows[:data_start_idx]  # Take header portion from fixed rows
+            headers = TableProcessor._merge_header_rows(fixed_headers)
+
+        # Determine column alignments
+        col_count = len(optimized_rows[0])
+        alignments = TableProcessor._determine_column_alignments(
+            optimized_rows, data_start_idx, col_count)
+
+        # Format data rows
+        formatted_rows = TableProcessor._format_data_rows(
+            optimized_rows[data_start_idx:])
+
+        return ProcessedTable(
+            headers=headers,
+            data_rows=formatted_rows,
+            column_alignments=alignments
+        )
+
+    @staticmethod
+    def _is_date_header(text: str) -> bool:
+        """Detect if text looks like a date header (year, quarter, month)"""
+        text = text.lower().strip()
+
+        # Year patterns
+        if text.isdigit() and len(text) == 4:
+            return True
+
+        # Quarter patterns
+        quarter_patterns = ['q1', 'q2', 'q3', 'q4', 'first quarter', 'second quarter',
+                            'third quarter', 'fourth quarter']
+        if any(pattern in text for pattern in quarter_patterns):
+            return True
+
+        # Month patterns
+        months = ['january', 'february', 'march', 'april', 'may', 'june',
+                  'july', 'august', 'september', 'october', 'november', 'december',
+                  'jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec']
+        return any(month in text for month in months)
+
+
+    @staticmethod
+    def _analyze_row(row: list) -> dict:
+        """Analyze characteristics of a row"""
+        return {
+            'empty_first': not bool(row[0].strip()),
+            'date_headers': sum(1 for cell in row if TableProcessor._is_date_header(cell)),
+            'financial_values': sum(1 for i, cell in enumerate(row)
+                                    if TableProcessor._is_financial_value(cell, row, i)),
+            'financial_metrics': sum(1 for cell in row if TableProcessor._is_financial_metric(cell)),
+            'empty_cells': sum(1 for cell in row if not cell.strip()),
+            'dollar_signs': sum(1 for cell in row if cell.strip() == '$'),
+            'total_cells': len(row)
+        }
+
+
+    @staticmethod
+    @lru_cache(maxsize=None)
+    def _get_period_header_pattern() -> re.Pattern:
+        """Create regex pattern for common financial period headers"""
+        # Base components
+        periods = r'(?:three|six|nine|twelve|[1-4]|first|second|third|fourth)'
+        timeframes = r'(?:month|quarter|year|week)'
+        ended_variants = r'(?:ended|ending|end|period)'
+        as_of_variants = r'(?:as\s+of|at|as\s+at)'
+
+        # Enhanced date pattern
+        months = r'(?:january|february|march|april|may|june|july|august|september|october|november|december|jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)'
+        day = r'\d{1,2}'
+        year = r'(?:19|20)\d{2}'
+        date = fr'{months}\s*\.?\s*{day}\s*,?\s*{year}'
+
+        # Combine into patterns
+        patterns = [
+            # Standard period headers
+            fr'{periods}\s+{timeframes}\s+{ended_variants}(?:\s+{date})?',
+            fr'(?:fiscal\s+)?{timeframes}\s+{ended_variants}',
+            fr'{timeframes}\s+{ended_variants}(?:\s+{date})?',
+
+            # Balance sheet date headers
+            fr'{as_of_variants}\s+{date}',
+
+            # Multiple dates in sequence (common in headers)
+            fr'{date}(?:\s*(?:and|,)\s*{date})*',
+
+            # Single date with optional period specification
+            fr'(?:{ended_variants}\s+)?{date}'
+        ]
+
+        # Combine all patterns
+        combined_pattern = '|'.join(f'(?:{p})' for p in patterns)
+        return re.compile(combined_pattern, re.IGNORECASE)
+
+    @staticmethod
+    def _contains_data(self, text):
+        # Check if the string contains data indicators
+        return bool(re.search(data_pattern, text))
+
+    @staticmethod
+    def _analyze_table_structure(rows: list) -> tuple[list, int]:
+        """
+        Analyze table structure to determine headers and data rows.
+        Returns (header_rows, data_start_index)
+        """
+        if not rows:
+            return [], 0
+
+        row_analyses = [TableProcessor._analyze_row(row) for row in rows[:4]]
+        period_pattern = TableProcessor._get_period_header_pattern()
+
+        # Pattern 1: Look for period headers
+        for i, row in enumerate(rows[:3]):  # Check first 4 rows
+            header_text = ' '.join(cell.strip() for cell in row).lower()
+            has_period_header = period_pattern.search(header_text)
+            contains_data = bool(re.search(data_pattern, header_text))
+            if has_period_header and not contains_data:
+                # Found a period header, check if next row has years or is part of header
+                if i + 1 < len(rows):
+                    next_row = rows[i + 1]
+                    next_text = ' '.join(cell.strip() for cell in next_row)
+                    # Check if next row has years or quarter references
+                    if (any(str(year) in next_text for year in range(2010, 2030)) or
+                            any(q in next_text.lower() for q in ['q1', 'q2', 'q3', 'q4'])):
+                        return rows[:i + 2], i + 2
+                return rows[:i + 1], i + 1
+
+        # Pattern 2: $ symbols in their own columns
+        for i, analysis in enumerate(row_analyses):
+            if analysis['dollar_signs'] > 0:
+                # If we see $ symbols, previous row might be header
+                if i > 0:
+                    return rows[:i], i
+                return [], i
+
+        # Pattern 3: Look for transition from text to numbers with $ alignment
+        for i in range(len(rows) - 1):
+            curr_analysis = TableProcessor._analyze_row(rows[i])
+            next_analysis = TableProcessor._analyze_row(rows[i + 1])
+
+            if (curr_analysis['financial_values'] == 0 and
+                    next_analysis['financial_values'] > 0 and
+                    next_analysis['dollar_signs'] > 0):
+                return rows[:i + 1], i + 1
+
+        # Default to no headers if no clear pattern found
+        return [], 0
+
+    # In TableProcessor class
+    @staticmethod
+    def _detect_and_fix_misalignment(virtual_rows: list[list[str]], data_start_idx: int) -> list[list[str]]:
+        """
+        Detect and fix misalignment between date headers and numeric data columns.
+        Returns corrected virtual rows.
+        """
+        if not virtual_rows or data_start_idx >= len(virtual_rows):
+            return virtual_rows
+
+        # Get header row (assumes dates are in the last header row)
+        header_idx = data_start_idx - 1
+        if header_idx < 0:
+            return virtual_rows
+
+        header_row = virtual_rows[data_start_idx - 1]
+
+        # Find date columns in header
+        date_columns = []
+        for i, cell in enumerate(header_row):
+            if TableProcessor._is_date_header(cell):
+                date_columns.append(i)
+
+        if not date_columns:
+            return virtual_rows  # No date headers found
+
+        # Find numeric columns in first few data rows
+        numeric_columns = set()
+        for row in virtual_rows[data_start_idx:data_start_idx + 3]:  # Check first 3 data rows
+            for i, cell in enumerate(row):
+                if TableProcessor._is_financial_value(cell, row, i):
+                    numeric_columns.add(i)
+
+        # Detect misalignment
+        if date_columns and numeric_columns:
+            # Check if dates are shifted right compared to numeric columns
+            dates_shifted = all(
+                (i + 1) in numeric_columns
+                for i in date_columns
+            )
+            if dates_shifted:
+                # Fix alignment by shifting only the row containing dates
+                fixed_rows = virtual_rows.copy()
+                # Find and fix only the row containing the dates
+                for row_idx, row in enumerate(virtual_rows):
+                    if row_idx < data_start_idx:  # Only check header rows
+                        # Check if this row contains the dates by counting date headers
+                        date_count = sum(1 for cell in row if TableProcessor._is_date_header(cell))
+                        if date_count >= 2:  # If multiple dates found, this is our target row
+                            new_row = [""] * len(row)  # Start with empty row
+                            for i in range(len(row) - 1):
+                                new_row[i + 1] = row[i]  # Copy each value one position right
+                            fixed_rows[row_idx] = new_row
+                            break  # Only fix one row
+                return fixed_rows
+
+        return virtual_rows
+
+    @staticmethod
+    def _get_columns_to_remove(empty_cols: list[int], max_cols: int) -> set[int]:
+        cols_to_remove = set()
+
+        # Handle leading empty columns
+        for col in range(max_cols):
+            if col in empty_cols:
+                cols_to_remove.add(col)
+            else:
+                break
+
+        # Handle trailing empty columns
+        for col in reversed(range(max_cols)):
+            if col in empty_cols:
+                cols_to_remove.add(col)
+            else:
+                break
+
+        # Handle consecutive empty columns in the middle
+        i = 0
+        while i < max_cols - 1:
+            if i in empty_cols and (i + 1) in empty_cols:
+                consecutive_empty = 0
+                j = i
+                while j < max_cols and j in empty_cols:
+                    consecutive_empty += 1
+                    j += 1
+                cols_to_remove.update(range(i + 1, i + consecutive_empty))
+                i = j
+            else:
+                i += 1
+
+        return cols_to_remove
+
+    @staticmethod
+    def _merge_header_rows(header_rows: list[list[str]]) -> list[str]:
+        """Merge multiple header rows into one"""
+        if not header_rows:
+            return []
+
+        merged = []
+        for col_idx in range(len(header_rows[0])):
+            parts = []
+            for row in header_rows:
+                text = row[col_idx].strip()
+                if text and text != '$':  # Skip empty cells and lone $ symbols
+                    parts.append(text)
+            merged.append('\n'.join(parts))
+        return merged
+
+    @staticmethod
+    def _determine_column_alignments(rows: list[list[str]],
+                                     data_start_idx: int,
+                                     col_count: int) -> list[str]:
+        """Determine alignment for each column"""
+        alignments = []
+        for col_idx in range(col_count):
+            # First column always left-aligned
+            if col_idx == 0:
+                alignments.append("left")
+                continue
+
+            # Check if column contains numbers
+            is_numeric = False
+            for row in rows[data_start_idx:]:
+                cell = row[col_idx].strip()
+                if cell and cell != '$':
+                    if TableProcessor._is_financial_value(cell, row, col_idx):
+                        is_numeric = True
+                        break
+            alignments.append("right" if is_numeric else "left")
+
+        return alignments
+
+    @staticmethod
+    def _is_financial_value(text: str, row: list, col_idx: int) -> bool:
+        """
+        Check if text represents a financial value, considering layout context
+        Takes the full row and column index to check for adjacent $ symbols
+        """
+        text = text.strip()
+
+        # If it's a $ symbol by itself, not a financial value
+        if text == '$':
+            return False
+
+        # Check if it's a number
+        is_numeric = is_number(text)
+
+        if not is_numeric:
+            return False
+
+        # Look for $ in adjacent columns (considering empty columns in between)
+        # Look left for $
+        left_idx = col_idx - 1
+        while left_idx >= 0:
+            left_cell = row[left_idx].strip()
+            if left_cell == '$':
+                return True
+            elif left_cell:  # If we hit any non-empty cell that's not $, stop looking
+                break
+            left_idx -= 1
+
+        return is_numeric  # If we found a number but no $, still treat as financial value
+
+    @staticmethod
+    def _is_financial_metric(text: str) -> bool:
+        """Check if text represents a common financial metric"""
+        text = text.lower().strip()
+        metrics = [
+            'revenue', 'sales', 'income', 'earnings', 'profit', 'loss',
+            'assets', 'liabilities', 'equity', 'cash', 'expenses',
+            'cost', 'margin', 'ebitda', 'eps', 'shares', 'tax',
+            'operating', 'net', 'gross', 'total', 'capital',
+            'depreciation', 'amortization', 'interest', 'debt'
+        ]
+        return any(metric in text for metric in metrics)
+
+    @staticmethod
+    def _format_data_rows(rows: list[list[str]]) -> list[list[str]]:
+        """Format data rows for display"""
+        formatted_rows = []
+        for row in rows:
+            formatted_row = []
+            for col_idx, cell in enumerate(row):
+                content = cell.strip()
+                if col_idx > 0:  # Don't format first column
+                    # Handle parenthesized numbers
+                    if content.startswith('(') and content.endswith(')'):
+                        content = f"-{content[1:-1]}"
+                formatted_row.append(content)
+            formatted_rows.append(formatted_row)
+        return formatted_rows
+
+
+class ColumnOptimizer:
+    """Optimizes column widths for table rendering"""
+
+    def __init__(self, total_width: int = 100, min_data_col_width: int = 15,
+                 max_left_col_ratio: float = 0.5, target_left_col_ratio: float = 0.4):
+        self.total_width = total_width
+        self.min_data_col_width = min_data_col_width
+        self.max_left_col_ratio = max_left_col_ratio  # Maximum portion of total width for left column
+        self.target_left_col_ratio = target_left_col_ratio  # Target portion for left column
+
+    def _measure_content_width(self, content: str) -> int:
+        """Measure the display width of content, handling multiline text"""
+        if not content:
+            return 0
+        lines = content.split('\n')
+        return max(len(line) for line in lines)
+
+    def _wrap_text(self, text: str, max_width: int) -> str:
+        """
+        Wrap text to specified width, preserving existing line breaks and word boundaries.
+        If text already contains line breaks, preserve the original formatting.
+        """
+        if not text or len(text) <= max_width:
+            return text
+
+        # If text already contains line breaks, preserve them
+        if '\n' in text:
+            return text
+
+        # Special handling for financial statement line items
+        if ',' in text and ':' in text:
+            # Split into main description and details
+            parts = text.split(':', 1)
+            if len(parts) == 2:
+                desc, details = parts
+                wrapped_desc = self._wrap_text(desc.strip(), max_width)
+                wrapped_details = self._wrap_text(details.strip(), max_width)
+                return f"{wrapped_desc}:\n{wrapped_details}"
+
+        words = text.split()
+        lines = []
+        current_line = []
+        current_length = 0
+
+        for word in words:
+            word_length = len(word)
+
+            # Handle very long words
+            if word_length > max_width:
+                # If we have a current line, add it first
+                if current_line:
+                    lines.append(' '.join(current_line))
+                    current_line = []
+                    current_length = 0
+
+                # Split long word across lines
+                while word_length > max_width:
+                    lines.append(word[:max_width - 1] + '-')
+                    word = word[max_width - 1:]
+                    word_length = len(word)
+                if word:
+                    current_line = [word]
+                    current_length = word_length
+                continue
+
+            if current_length + word_length + (1 if current_line else 0) <= max_width:
+                current_line.append(word)
+                current_length += word_length + (1 if current_length else 0)
+            else:
+                if current_line:
+                    lines.append(' '.join(current_line))
+                current_line = [word]
+                current_length = word_length
+
+        if current_line:
+            lines.append(' '.join(current_line))
+
+        return '\n'.join(lines)
+
+    def optimize_columns(self, table: ProcessedTable) -> tuple[list[int], ProcessedTable]:
+        """
+        Optimize column widths and wrap text as needed.
+        Returns (column_widths, modified_table)
+        """
+        col_count = len(table.data_rows[0]) if table.data_rows else 0
+        if not col_count:
+            return [], table
+
+        # Calculate maximum left column width based on total width
+        max_left_col_width = int(self.total_width * self.max_left_col_ratio)
+        target_left_col_width = int(self.total_width * self.target_left_col_ratio)
+
+        # Initialize widths array
+        widths = [0] * col_count
+
+        # First pass: calculate minimum required widths for data columns
+        for col in range(1, col_count):
+            col_content_width = self.min_data_col_width
+            if table.headers:
+                col_content_width = max(col_content_width,
+                                        self._measure_content_width(table.headers[col]))
+
+            # Check numeric data width
+            for row in table.data_rows:
+                if col < len(row):
+                    col_content_width = max(col_content_width,
+                                            self._measure_content_width(row[col]))
+
+            widths[col] = col_content_width
+
+        # Calculate available space for left column
+        data_cols_width = sum(widths[1:])
+        available_left_width = self.total_width - data_cols_width
+
+        # Determine left column width
+        left_col_max_content = 0
+        if table.headers and table.headers[0]:
+            left_col_max_content = self._measure_content_width(table.headers[0])
+        for row in table.data_rows:
+            if row:
+                left_col_max_content = max(left_col_max_content,
+                                           self._measure_content_width(row[0]))
+
+        # Set left column width based on constraints
+        if left_col_max_content <= target_left_col_width:
+            widths[0] = left_col_max_content
+        else:
+            widths[0] = min(max_left_col_width,
+                            max(target_left_col_width, available_left_width))
+
+        # If we still exceed total width, redistribute data column space
+        total_width = sum(widths)
+        if total_width > self.total_width:
+            excess = total_width - self.total_width
+            data_cols = len(widths) - 1
+            reduction_per_col = excess // data_cols
+
+            # Reduce data columns while ensuring minimum width
+            for i in range(1, len(widths)):
+                if widths[i] - reduction_per_col >= self.min_data_col_width:
+                    widths[i] -= reduction_per_col
+
+        # Apply width constraints and wrap text
+        modified_table = self._apply_column_constraints(table, widths)
+
+        return widths, modified_table
+
+    def _apply_column_constraints(self, table: ProcessedTable, widths: list[int]) -> ProcessedTable:
+        """Apply width constraints to table content, wrapping text as needed"""
+        # Wrap headers if present
+        wrapped_headers = None
+        if table.headers:
+            wrapped_headers = [
+                self._wrap_text(header, widths[i])
+                for i, header in enumerate(table.headers)
+            ]
+
+        # Wrap data in first column only
+        wrapped_rows = []
+        for row in table.data_rows:
+            wrapped_row = list(row)  # Make a copy
+            wrapped_row[0] = self._wrap_text(row[0], widths[0])
+            wrapped_rows.append(wrapped_row)
+
+        return ProcessedTable(
+            headers=wrapped_headers,
+            data_rows=wrapped_rows,
+            column_alignments=table.column_alignments
+        )