Initial commit
This commit is contained in:
@@ -0,0 +1,13 @@
|
||||
"""
|
||||
Document renderers for various output formats.
|
||||
"""
|
||||
|
||||
from edgar.documents.renderers.markdown import MarkdownRenderer
|
||||
from edgar.documents.renderers.text import TextRenderer
|
||||
from edgar.documents.renderers.fast_table import FastTableRenderer
|
||||
|
||||
# Names re-exported as the public API of this package.
__all__ = [
    "MarkdownRenderer",
    "TextRenderer",
    "FastTableRenderer",
]
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,669 @@
|
||||
"""
|
||||
Fast table renderer for edgar.documents - optimized for performance.
|
||||
|
||||
This module provides a high-performance alternative to Rich table rendering
|
||||
while maintaining professional output quality and readability.
|
||||
|
||||
Performance target: ~32x faster than Rich rendering (0.2ms vs 6.5ms per table)
|
||||
"""
|
||||
|
||||
from dataclasses import dataclass
|
||||
from typing import List, Dict, Optional, Union, Tuple
|
||||
from enum import Enum
|
||||
|
||||
|
||||
class Alignment(Enum):
    """Horizontal alignment choices available for a table column."""

    # Each member's value is its lowercase name.
    LEFT = "left"
    RIGHT = "right"
    CENTER = "center"
|
||||
|
||||
|
||||
@dataclass
class ColumnConfig:
    """Layout settings for one table column."""

    # How cell text is justified inside the column.
    alignment: Alignment = Alignment.LEFT
    # Lower bound on the column width.
    min_width: int = 8
    # Upper bound on the column width; None leaves it unset.
    max_width: Optional[int] = None
    # Padding amount for the column.
    padding: int = 1
|
||||
|
||||
|
||||
@dataclass
class TableStyle:
    """Characters and width limits used when drawing a text table."""

    # Character placed between and around cells; empty means borderless.
    border_char: str = "|"
    # Character repeated to draw the rule under the header; empty means no rule.
    header_separator: str = "-"
    # Corner character for the style.
    corner_char: str = "+"
    # Spaces added on each side of a cell's content.
    padding: int = 1
    # Smallest and largest width a column may take.
    min_col_width: int = 8
    max_col_width: int = 50

    @classmethod
    def pipe_table(cls) -> 'TableStyle':
        """Return the Markdown-compatible pipe table preset."""
        preset = dict(
            border_char="|",
            header_separator="-",
            corner_char="|",
            padding=1,
            min_col_width=8,
            max_col_width=50,
        )
        return cls(**preset)

    @classmethod
    def minimal(cls) -> 'TableStyle':
        """Return the preset with no borders and no header rule, spacing only."""
        preset = dict(
            border_char="",
            header_separator="",
            corner_char="",
            padding=2,
            min_col_width=6,
            max_col_width=40,
        )
        return cls(**preset)

    @classmethod
    def simple(cls) -> 'TableStyle':
        """
        Return the preset modelled on Rich's box.SIMPLE.

        The preset has no border or corner characters, draws the header
        rule with the Unicode horizontal-line character, pads cells with
        two spaces, and allows columns between 6 and 60 characters wide.
        """
        preset = dict(
            border_char="",
            header_separator="─",
            corner_char="",
            padding=2,
            min_col_width=6,
            max_col_width=60,
        )
        return cls(**preset)
|
||||
|
||||
|
||||
class FastTableRenderer:
|
||||
"""
|
||||
High-performance table renderer optimized for speed.
|
||||
|
||||
Features:
|
||||
- 30x+ faster than Rich table rendering
|
||||
- Professional, readable output
|
||||
- Configurable alignment and styling
|
||||
- Handles complex SEC filing table structures
|
||||
- Markdown-compatible output
|
||||
- Memory efficient
|
||||
"""
|
||||
|
||||
def __init__(self, style: Optional[TableStyle] = None):
|
||||
"""Initialize renderer with optional style configuration."""
|
||||
self.style = style or TableStyle.pipe_table()
|
||||
|
||||
# Pre-compile format strings for performance
|
||||
self._format_cache = {}
|
||||
|
||||
def render_table_node(self, table_node) -> str:
    """
    Render a table node as text, prefixed by its caption when it has one.

    The node's headers and rows are loaded into a TableMatrix, and every
    matrix row is read back through ``get_expanded_row`` so that spanning
    cells occupy each position they cover (per the original comments).

    Args:
        table_node: Object exposing ``headers``, ``rows`` and optionally
            ``caption``.

    Returns:
        The formatted table string.
    """
    from edgar.documents.utils.table_matrix import TableMatrix

    grid = TableMatrix()
    grid.build_from_rows(table_node.headers, table_node.rows)

    def texts_at(index):
        # Empty matrix slots become empty strings.
        return [cell.text().strip() if cell else '' for cell in grid.get_expanded_row(index)]

    # Header rows come first in the matrix; data rows follow them.
    header_count = len(table_node.headers) if table_node.headers else 0
    headers = [texts_at(index) for index in range(header_count)]
    rows = [texts_at(index) for index in range(header_count, grid.row_count)]

    table_text = self.render_table_data(headers, rows)

    caption = getattr(table_node, 'caption', None)
    if caption:
        return f"{caption}\n{table_text}"
    return table_text
|
||||
|
||||
def render_table_data(self, headers: List[List[str]], rows: List[List[str]]) -> str:
    """
    Render header rows and data rows as one formatted table.

    Steps: keep only the meaningful columns, merge related neighbouring
    columns, then measure widths, detect alignments and build the text.

    Args:
        headers: Header rows (several rows for multi-row headers).
        rows: Data rows.

    Returns:
        The formatted table, or "" when there is nothing to render.
    """
    if not (headers or rows):
        return ""

    combined = (headers + rows) if headers else rows
    if not combined:
        return ""

    widest = max(map(len, combined))
    if widest == 0:
        return ""

    keep = self._identify_meaningful_columns(combined, widest)
    if not keep:
        return ""

    header_part = [self._filter_row_to_columns(r, keep) for r in headers] if headers else []
    data_part = [self._filter_row_to_columns(r, keep) for r in rows]

    # Merge headers and data together so both stay column-aligned; the
    # first filtered row serves as the reference row for the merge.
    stacked = header_part + data_part
    if stacked:
        _, merged = self._merge_related_columns(stacked[0], stacked)
        boundary = len(header_part)
        header_part, data_part = merged[:boundary], merged[boundary:]

    # Widths and alignments are measured on the filtered, merged rows.
    final_rows = header_part + data_part
    final_cols = max(map(len, final_rows), default=0)

    col_widths = self._calculate_column_widths(final_rows, final_cols)
    alignments = self._detect_alignments(final_rows, final_cols)

    return self._build_table(header_part, data_part, col_widths, alignments)
|
||||
|
||||
def _combine_headers(self, headers: List[List[str]]) -> List[str]:
|
||||
"""
|
||||
Combine multi-row headers intelligently.
|
||||
|
||||
For SEC tables, this prioritizes specific dates/periods over generic labels.
|
||||
"""
|
||||
if not headers:
|
||||
return []
|
||||
|
||||
if len(headers) == 1:
|
||||
return headers[0]
|
||||
|
||||
# Determine max columns across all header rows
|
||||
max_cols = max(len(row) for row in headers) if headers else 0
|
||||
combined = [""] * max_cols
|
||||
|
||||
for col in range(max_cols):
|
||||
# Collect all values for this column
|
||||
values = []
|
||||
for header_row in headers:
|
||||
if col < len(header_row) and header_row[col].strip():
|
||||
values.append(header_row[col].strip())
|
||||
|
||||
if values:
|
||||
# Prioritize date-like values over generic terms
|
||||
date_values = [v for v in values if self._looks_like_date(v)]
|
||||
if date_values:
|
||||
combined[col] = date_values[0]
|
||||
elif len(values) == 1:
|
||||
combined[col] = values[0]
|
||||
else:
|
||||
# Skip generic terms like "Year Ended" if we have something more specific
|
||||
specific_values = [v for v in values
|
||||
if v.lower() not in {'year ended', 'years ended', 'period ended'}]
|
||||
combined[col] = specific_values[0] if specific_values else values[0]
|
||||
|
||||
return combined
|
||||
|
||||
def _looks_like_date(self, text: str) -> bool:
|
||||
"""Quick date detection for header processing."""
|
||||
if not text or len(text) < 4:
|
||||
return False
|
||||
|
||||
text_lower = text.lower().replace('\n', ' ').strip()
|
||||
|
||||
# Common date indicators
|
||||
date_indicators = [
|
||||
'january', 'february', 'march', 'april', 'may', 'june',
|
||||
'july', 'august', 'september', 'october', 'november', 'december',
|
||||
'20', '19', # Year prefixes
|
||||
]
|
||||
|
||||
return any(indicator in text_lower for indicator in date_indicators) and \
|
||||
any(c.isdigit() for c in text)
|
||||
|
||||
def _identify_meaningful_columns(self, all_rows: List[List[str]], max_cols: int) -> List[int]:
|
||||
"""
|
||||
Identify columns that contain meaningful content (not just spacing).
|
||||
|
||||
Returns:
|
||||
List of column indices that have meaningful content
|
||||
"""
|
||||
column_scores = []
|
||||
|
||||
for col_idx in range(max_cols):
|
||||
content_score = 0
|
||||
total_rows = 0
|
||||
|
||||
# Score each column based on content quality
|
||||
for row in all_rows:
|
||||
if col_idx < len(row):
|
||||
total_rows += 1
|
||||
cell_content = str(row[col_idx]).strip()
|
||||
|
||||
if cell_content:
|
||||
# Higher score for longer, more substantial content
|
||||
if len(cell_content) >= 3: # Substantial content
|
||||
content_score += 3
|
||||
elif len(cell_content) == 2 and cell_content.isalnum():
|
||||
content_score += 2
|
||||
elif len(cell_content) == 1 and (cell_content.isalnum() or cell_content == '$'):
|
||||
content_score += 1
|
||||
# Skip single spaces, dashes, or other likely spacing characters
|
||||
|
||||
# Calculate average score per row for this column
|
||||
avg_score = content_score / max(total_rows, 1)
|
||||
column_scores.append((col_idx, avg_score, content_score))
|
||||
|
||||
# Sort by score descending
|
||||
column_scores.sort(key=lambda x: x[1], reverse=True)
|
||||
|
||||
# Take columns with meaningful content (score >= 0.5 or among top columns)
|
||||
meaningful_columns = []
|
||||
for col_idx, avg_score, total_score in column_scores:
|
||||
# Include if it has good average score or significant total content
|
||||
if avg_score >= 0.5 or total_score >= 5:
|
||||
meaningful_columns.append(col_idx)
|
||||
# Limit to reasonable number of columns for readability
|
||||
if len(meaningful_columns) >= 8:
|
||||
break
|
||||
|
||||
# Sort by original column order
|
||||
meaningful_columns.sort()
|
||||
|
||||
return meaningful_columns
|
||||
|
||||
def _filter_row_to_columns(self, row: List[str], column_indices: List[int]) -> List[str]:
|
||||
"""
|
||||
Filter a row to only include the specified column indices.
|
||||
|
||||
Args:
|
||||
row: Original row data
|
||||
column_indices: List of column indices to keep
|
||||
|
||||
Returns:
|
||||
Filtered row with only the specified columns
|
||||
"""
|
||||
if not row:
|
||||
return []
|
||||
|
||||
filtered_row = []
|
||||
for col_idx in column_indices:
|
||||
if col_idx < len(row):
|
||||
filtered_row.append(row[col_idx])
|
||||
else:
|
||||
filtered_row.append("") # Missing column
|
||||
|
||||
return filtered_row
|
||||
|
||||
def _merge_related_columns(self, headers: List[str], rows: List[List[str]]) -> tuple:
|
||||
"""
|
||||
Merge related columns (e.g., currency symbols with their amounts).
|
||||
|
||||
Returns:
|
||||
Tuple of (merged_headers, merged_rows)
|
||||
"""
|
||||
if not rows or not any(rows):
|
||||
return headers, rows
|
||||
|
||||
# Find columns that should be merged
|
||||
merge_pairs = []
|
||||
max_cols = max(len(row) for row in [headers] + rows if row) if rows else len(headers) if headers else 0
|
||||
|
||||
for col_idx in range(max_cols - 1):
|
||||
# Check if this column and the next should be merged
|
||||
should_merge = self._should_merge_columns(headers, rows, col_idx, col_idx + 1)
|
||||
if should_merge:
|
||||
merge_pairs.append((col_idx, col_idx + 1))
|
||||
|
||||
# Apply merges (from right to left to avoid index shifting)
|
||||
merged_headers = headers[:] if headers else []
|
||||
merged_rows = [row[:] for row in rows]
|
||||
|
||||
for left_idx, right_idx in reversed(merge_pairs):
|
||||
# Merge headers
|
||||
if merged_headers and left_idx < len(merged_headers) and right_idx < len(merged_headers):
|
||||
left_header = merged_headers[left_idx].strip()
|
||||
right_header = merged_headers[right_idx].strip()
|
||||
merged_header = f"{left_header} {right_header}".strip()
|
||||
merged_headers[left_idx] = merged_header
|
||||
merged_headers.pop(right_idx)
|
||||
|
||||
# Merge rows
|
||||
for row in merged_rows:
|
||||
if left_idx < len(row) and right_idx < len(row):
|
||||
left_cell = str(row[left_idx]).strip()
|
||||
right_cell = str(row[right_idx]).strip()
|
||||
|
||||
# Smart merging based on content
|
||||
if left_cell == '$' and right_cell:
|
||||
merged_cell = f"${right_cell}"
|
||||
elif left_cell and right_cell:
|
||||
merged_cell = f"{left_cell} {right_cell}"
|
||||
else:
|
||||
merged_cell = left_cell or right_cell
|
||||
|
||||
row[left_idx] = merged_cell
|
||||
if right_idx < len(row):
|
||||
row.pop(right_idx)
|
||||
|
||||
return merged_headers, merged_rows
|
||||
|
||||
def _should_merge_columns(self, headers: List[str], rows: List[List[str]], left_idx: int, right_idx: int) -> bool:
|
||||
"""
|
||||
Determine if two adjacent columns should be merged.
|
||||
|
||||
Returns:
|
||||
True if columns should be merged
|
||||
"""
|
||||
# Check if left column is mostly currency symbols
|
||||
currency_count = 0
|
||||
total_count = 0
|
||||
|
||||
for row in rows:
|
||||
if left_idx < len(row) and right_idx < len(row):
|
||||
total_count += 1
|
||||
left_cell = str(row[left_idx]).strip()
|
||||
right_cell = str(row[right_idx]).strip()
|
||||
|
||||
# If left is '$' and right is a number, they should be merged
|
||||
if left_cell == '$' and right_cell and (right_cell.replace(',', '').replace('.', '').isdigit()):
|
||||
currency_count += 1
|
||||
|
||||
# If most rows have currency symbol + number pattern, merge them
|
||||
if total_count > 0 and currency_count / total_count >= 0.5:
|
||||
return True
|
||||
|
||||
# Check for other merge patterns (e.g., empty left column with content right column)
|
||||
empty_left_count = 0
|
||||
for row in rows:
|
||||
if left_idx < len(row) and right_idx < len(row):
|
||||
left_cell = str(row[left_idx]).strip()
|
||||
right_cell = str(row[right_idx]).strip()
|
||||
|
||||
if not left_cell and right_cell:
|
||||
empty_left_count += 1
|
||||
|
||||
# If left column is mostly empty, consider merging
|
||||
if total_count > 0 and empty_left_count / total_count >= 0.7:
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
def _calculate_column_widths(self, all_rows: List[List[str]], max_cols: int) -> List[int]:
|
||||
"""Calculate optimal column widths based on content."""
|
||||
col_widths = [self.style.min_col_width] * max_cols
|
||||
|
||||
# Find the maximum content width for each column
|
||||
for row in all_rows:
|
||||
for col_idx in range(min(len(row), max_cols)):
|
||||
content = str(row[col_idx]) if row[col_idx] else ""
|
||||
# Handle multi-line content
|
||||
max_line_width = max((len(line) for line in content.split('\n')), default=0)
|
||||
content_width = max_line_width + (self.style.padding * 2)
|
||||
|
||||
# Apply limits
|
||||
content_width = min(content_width, self.style.max_col_width)
|
||||
col_widths[col_idx] = max(col_widths[col_idx], content_width)
|
||||
|
||||
return col_widths
|
||||
|
||||
def _detect_alignments(self, all_rows: List[List[str]], max_cols: int) -> List[Alignment]:
    """
    Choose an alignment for each column from its content.

    When more than one row is supplied, the first row is treated as a
    header and skipped. A column is right-aligned when at least 70% of
    its non-blank cells look numeric; otherwise it stays left-aligned.
    """
    body_rows = all_rows[1:] if len(all_rows) > 1 else all_rows

    chosen = []
    for col_idx in range(max_cols):
        values = [
            row[col_idx].strip()
            for row in body_rows
            if col_idx < len(row) and row[col_idx].strip()
        ]
        numeric_hits = sum(1 for value in values if self._looks_numeric(value))

        if values and numeric_hits / len(values) >= 0.7:
            chosen.append(Alignment.RIGHT)
        else:
            chosen.append(Alignment.LEFT)

    return chosen
|
||||
|
||||
def _looks_numeric(self, text: str) -> bool:
|
||||
"""Check if text content looks numeric."""
|
||||
if not text:
|
||||
return False
|
||||
|
||||
# Remove common formatting characters
|
||||
clean_text = text.replace(',', '').replace('$', '').replace('%', '').replace('(', '').replace(')', '').strip()
|
||||
|
||||
# Handle negative numbers in parentheses
|
||||
if text.strip().startswith('(') and text.strip().endswith(')'):
|
||||
clean_text = text.strip()[1:-1].replace(',', '').replace('$', '').strip()
|
||||
|
||||
# Check if remaining text is numeric
|
||||
try:
|
||||
float(clean_text)
|
||||
return True
|
||||
except ValueError:
|
||||
return False
|
||||
|
||||
def _build_table(self, headers: List[List[str]], rows: List[List[str]],
|
||||
col_widths: List[int], alignments: List[Alignment]) -> str:
|
||||
"""
|
||||
Build the final table string.
|
||||
|
||||
Args:
|
||||
headers: List of header rows (can be multiple rows for multi-row headers)
|
||||
rows: List of data rows
|
||||
col_widths: Column widths
|
||||
alignments: Column alignments
|
||||
"""
|
||||
lines = []
|
||||
|
||||
# Header rows (can be multiple)
|
||||
if headers:
|
||||
for header_row in headers:
|
||||
# Only add header rows with meaningful content
|
||||
if any(cell.strip() for cell in header_row):
|
||||
# Handle multi-line cells in header rows
|
||||
formatted_lines = self._format_multiline_row(header_row, col_widths, alignments)
|
||||
lines.extend(formatted_lines)
|
||||
|
||||
# Header separator (after all header rows)
|
||||
if self.style.header_separator:
|
||||
sep_line = self._create_separator_line(col_widths)
|
||||
lines.append(sep_line)
|
||||
|
||||
# Data rows
|
||||
for row in rows:
|
||||
# Only add rows with meaningful content
|
||||
if any(cell.strip() for cell in row):
|
||||
row_line = self._format_row(row, col_widths, alignments)
|
||||
lines.append(row_line)
|
||||
|
||||
return '\n'.join(lines)
|
||||
|
||||
def _format_row(self, row: List[str], col_widths: List[int],
                alignments: List[Alignment]) -> str:
    """
    Format a single row with proper alignment and padding.

    One cell is produced per entry of ``col_widths``; positions missing
    from ``row`` render as blank cells and positions missing from
    ``alignments`` are left-aligned. Only the first line of a multi-line
    cell is used. Text longer than the usable width is shortened and ends
    in "..."; when fewer than three characters are usable the text is cut
    with no ellipsis, so the cell never becomes wider than its column.

    Args:
        row: Cell texts for this row.
        col_widths: Total width of each column, padding included.
        alignments: Alignment of each column.

    Returns:
        The formatted line, wrapped in and separated by the style's
        border character, or joined by single spaces when borderless.
    """
    padding = self.style.padding
    pad = ' ' * padding
    border = self.style.border_char
    cells = []

    for col_idx, width in enumerate(col_widths):
        content = str(row[col_idx]) if col_idx < len(row) else ""
        content = content.split('\n', 1)[0].strip()

        # Width left for text once padding on both sides is reserved.
        available_width = max(width - padding * 2, 0)

        if len(content) > available_width:
            if available_width >= 3:
                content = content[:available_width - 3] + "..."
            else:
                # No room for an ellipsis; a negative slice here would
                # produce a cell longer than the column.
                content = content[:available_width]

        alignment = alignments[col_idx] if col_idx < len(alignments) else Alignment.LEFT

        if alignment == Alignment.RIGHT:
            aligned_content = content.rjust(available_width)
        elif alignment == Alignment.CENTER:
            aligned_content = content.center(available_width)
        else:
            aligned_content = content.ljust(available_width)

        cells.append(pad + aligned_content + pad)

    if border:
        return border + border.join(cells) + border
    return ' '.join(cells)
|
||||
|
||||
def _format_multiline_row(self, row: List[str], col_widths: List[int],
|
||||
alignments: List[Alignment]) -> List[str]:
|
||||
"""
|
||||
Format a row that may contain multi-line cells (cells with \n characters).
|
||||
|
||||
Returns a list of formatted lines, one for each line of text in the cells.
|
||||
"""
|
||||
# Split each cell by newlines
|
||||
cell_lines = []
|
||||
max_lines = 1
|
||||
|
||||
for col_idx, content in enumerate(row):
|
||||
lines = content.split('\n') if content else ['']
|
||||
cell_lines.append(lines)
|
||||
max_lines = max(max_lines, len(lines))
|
||||
|
||||
# Build output lines
|
||||
output_lines = []
|
||||
for line_idx in range(max_lines):
|
||||
# Build row for this line
|
||||
current_row = []
|
||||
for col_idx in range(len(row)):
|
||||
# Get the line for this cell, or empty string if this cell has fewer lines
|
||||
if line_idx < len(cell_lines[col_idx]):
|
||||
current_row.append(cell_lines[col_idx][line_idx])
|
||||
else:
|
||||
current_row.append('')
|
||||
|
||||
# Format this line
|
||||
formatted_line = self._format_row(current_row, col_widths, alignments)
|
||||
output_lines.append(formatted_line)
|
||||
|
||||
return output_lines
|
||||
|
||||
def _create_separator_line(self, col_widths: List[int]) -> str:
|
||||
"""
|
||||
Create header separator line.
|
||||
|
||||
For bordered styles: |-------|-------|
|
||||
For borderless styles: ─────────────── (full width horizontal line)
|
||||
"""
|
||||
sep_char = self.style.header_separator
|
||||
border = self.style.border_char
|
||||
|
||||
if not sep_char:
|
||||
# No separator at all (minimal style)
|
||||
return ""
|
||||
|
||||
if border:
|
||||
# Bordered style: create separator matching column widths
|
||||
separators = []
|
||||
for width in col_widths:
|
||||
separators.append(sep_char * width)
|
||||
return border + border.join(separators) + border
|
||||
else:
|
||||
# Borderless style (simple): single horizontal line across full width
|
||||
# Calculate total width: sum of column widths + gaps between columns
|
||||
total_width = sum(col_widths) + (len(col_widths) - 1) * 2 # 2-space gaps
|
||||
|
||||
# Add leading space for indentation (matching row indentation)
|
||||
return " " + sep_char * total_width
|
||||
|
||||
|
||||
# Factory functions for easy usage
|
||||
def create_fast_renderer(style: str = "pipe") -> FastTableRenderer:
    """
    Create a FastTableRenderer with predefined style.

    Args:
        style: Style name ("pipe", "minimal" or "simple"). Any other
            value falls back to the pipe style.

    Returns:
        Configured FastTableRenderer instance
    """
    if style == "minimal":
        return FastTableRenderer(TableStyle.minimal())
    if style == "simple":
        # Previously fell through to the pipe style although the preset exists.
        return FastTableRenderer(TableStyle.simple())
    return FastTableRenderer(TableStyle.pipe_table())
|
||||
|
||||
|
||||
def render_table_fast(table_node, style: str = "pipe") -> str:
    """
    Render a table node in one call.

    Builds a renderer for the named style via ``create_fast_renderer``
    and renders ``table_node`` with it.

    Args:
        table_node: TableNode instance
        style: Style name passed to ``create_fast_renderer``

    Returns:
        Formatted table string
    """
    return create_fast_renderer(style).render_table_node(table_node)
|
||||
@@ -0,0 +1,613 @@
|
||||
"""
|
||||
Markdown renderer for parsed documents.
|
||||
"""
|
||||
|
||||
from typing import List, Optional, Dict, Set
|
||||
|
||||
from edgar.documents.document import Document
|
||||
from edgar.documents.nodes import Node, TextNode, HeadingNode, ParagraphNode, ListNode, ListItemNode
|
||||
from edgar.documents.table_nodes import TableNode
|
||||
|
||||
|
||||
class MarkdownRenderer:
|
||||
"""
|
||||
Renders parsed documents to Markdown format.
|
||||
|
||||
Features:
|
||||
- Preserves document structure
|
||||
- Handles tables with proper formatting
|
||||
- Supports nested lists
|
||||
- Includes metadata annotations
|
||||
- Configurable output options
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
include_metadata: bool = False,
|
||||
include_toc: bool = False,
|
||||
max_heading_level: int = 6,
|
||||
table_format: str = 'pipe',
|
||||
wrap_width: Optional[int] = None):
|
||||
"""
|
||||
Initialize markdown renderer.
|
||||
|
||||
Args:
|
||||
include_metadata: Include metadata annotations
|
||||
include_toc: Generate table of contents
|
||||
max_heading_level: Maximum heading level to render
|
||||
table_format: Table format ('pipe', 'grid', 'simple')
|
||||
wrap_width: Wrap text at specified width
|
||||
"""
|
||||
self.include_metadata = include_metadata
|
||||
self.include_toc = include_toc
|
||||
self.max_heading_level = max_heading_level
|
||||
self.table_format = table_format
|
||||
self.wrap_width = wrap_width
|
||||
|
||||
# Track state during rendering
|
||||
self._toc_entries: List[tuple] = []
|
||||
self._rendered_ids: Set[str] = set()
|
||||
self._list_depth = 0
|
||||
self._in_table = False
|
||||
|
||||
def render(self, document: Document) -> str:
    """
    Render document to Markdown.

    The output is, in order: the metadata header (when enabled), the
    table of contents (when enabled and at least one heading was
    collected), and the rendered document body.

    The table of contents can only be generated after the body has been
    rendered, so a slot is reserved for it up front and filled in
    afterwards. If no headings were collected the slot is removed rather
    than left behind as a placeholder comment, and body text is never
    searched for the placeholder.

    Args:
        document: Document to render

    Returns:
        Markdown formatted text
    """
    self._reset_state()

    parts = []

    if self.include_metadata:
        parts.append(self._render_metadata(document))
        parts.append("")

    # Reserve the TOC slot together with the blank line that follows it.
    toc_slot = None
    if self.include_toc:
        toc_slot = len(parts)
        parts.append("<!-- TOC -->")
        parts.append("")

    parts.append(self._render_node(document.root))

    if toc_slot is not None:
        if self._toc_entries:
            parts[toc_slot] = self._generate_toc()
        else:
            del parts[toc_slot:toc_slot + 2]

    return "\n".join(parts).strip()
|
||||
|
||||
def render_node(self, node: Node) -> str:
    """
    Render a single node to Markdown.

    Renderer state is reset first, so each call starts from a clean
    state.

    Args:
        node: Node to render

    Returns:
        Markdown formatted text
    """
    self._reset_state()
    rendered = self._render_node(node)
    return rendered
|
||||
|
||||
def _reset_state(self):
|
||||
"""Reset renderer state."""
|
||||
self._toc_entries = []
|
||||
self._rendered_ids = set()
|
||||
self._list_depth = 0
|
||||
self._in_table = False
|
||||
|
||||
def _render_node(self, node: Node) -> str:
    """
    Render a node by dispatching on its type.

    A node whose id has already been rendered yields "" so that shared
    nodes appear only once. Node types without a dedicated handler are
    rendered through their children.
    """
    node_id = node.id
    if node_id in self._rendered_ids:
        return ""
    self._rendered_ids.add(node_id)

    # Checked in order; the first matching type wins.
    handlers = (
        (HeadingNode, self._render_heading),
        (ParagraphNode, self._render_paragraph),
        (TextNode, self._render_text),
        (TableNode, self._render_table),
        (ListNode, self._render_list),
        (ListItemNode, self._render_list_item),
    )
    for node_type, handler in handlers:
        if isinstance(node, node_type):
            return handler(node)

    return self._render_children(node)
|
||||
|
||||
def _render_heading(self, node: HeadingNode) -> str:
|
||||
"""Render heading node."""
|
||||
# Limit heading level
|
||||
level = min(node.level, self.max_heading_level)
|
||||
|
||||
# Get heading text
|
||||
text = node.text().strip()
|
||||
if not text:
|
||||
return ""
|
||||
|
||||
# Add to TOC
|
||||
if self.include_toc:
|
||||
self._toc_entries.append((level, text, node.id))
|
||||
|
||||
# Create markdown heading
|
||||
markdown = "#" * level + " " + text
|
||||
|
||||
# Add metadata if requested
|
||||
if self.include_metadata and node.metadata:
|
||||
metadata = self._format_metadata(node.metadata)
|
||||
if metadata:
|
||||
markdown += f" <!-- {metadata} -->"
|
||||
|
||||
# Add children content
|
||||
children_content = self._render_children(node)
|
||||
if children_content:
|
||||
markdown += "\n\n" + children_content
|
||||
|
||||
return markdown
|
||||
|
||||
def _render_paragraph(self, node: ParagraphNode) -> str:
|
||||
"""Render paragraph node."""
|
||||
# Get paragraph content
|
||||
content = self._render_children(node).strip()
|
||||
if not content:
|
||||
return ""
|
||||
|
||||
# Wrap if requested
|
||||
if self.wrap_width:
|
||||
content = self._wrap_text(content, self.wrap_width)
|
||||
|
||||
# Add metadata if requested
|
||||
if self.include_metadata and node.metadata:
|
||||
metadata = self._format_metadata(node.metadata)
|
||||
if metadata:
|
||||
content = f"<!-- {metadata} -->\n{content}"
|
||||
|
||||
return content
|
||||
|
||||
def _render_text(self, node: TextNode) -> str:
|
||||
"""Render text node."""
|
||||
text = node.text()
|
||||
|
||||
# Escape markdown special characters
|
||||
text = self._escape_markdown(text)
|
||||
|
||||
# Apply text formatting based on style
|
||||
if node.style:
|
||||
if node.style.font_weight in ['bold', '700', '800', '900']:
|
||||
text = f"**{text}**"
|
||||
elif node.style.font_style == 'italic':
|
||||
text = f"*{text}*"
|
||||
elif node.style.text_decoration == 'underline':
|
||||
text = f"<u>{text}</u>"
|
||||
|
||||
return text
|
||||
|
||||
def _render_table(self, node: TableNode) -> str:
|
||||
"""Render table node."""
|
||||
self._in_table = True
|
||||
|
||||
parts = []
|
||||
|
||||
# Add caption if present
|
||||
if node.caption:
|
||||
parts.append(f"**Table: {node.caption}**")
|
||||
parts.append("")
|
||||
|
||||
# Render based on format
|
||||
if self.table_format == 'pipe':
|
||||
table_md = self._render_table_pipe(node)
|
||||
elif self.table_format == 'grid':
|
||||
table_md = self._render_table_grid(node)
|
||||
else: # simple
|
||||
table_md = self._render_table_simple(node)
|
||||
|
||||
parts.append(table_md)
|
||||
|
||||
# Add metadata if requested
|
||||
if self.include_metadata and node.metadata:
|
||||
metadata = self._format_metadata(node.metadata)
|
||||
if metadata:
|
||||
parts.append(f"<!-- Table metadata: {metadata} -->")
|
||||
|
||||
self._in_table = False
|
||||
|
||||
return "\n".join(parts)
|
||||
|
||||
def _render_table_pipe(self, node: TableNode) -> str:
|
||||
"""Render table in pipe format with proper column spanning support."""
|
||||
# Handle complex SEC filing tables with column spanning
|
||||
expanded_headers, expanded_data_rows = self._expand_table_structure(node)
|
||||
|
||||
# Identify and filter to meaningful columns
|
||||
content_columns = self._identify_content_columns(expanded_headers, expanded_data_rows)
|
||||
|
||||
if not content_columns:
|
||||
return ""
|
||||
|
||||
rows = []
|
||||
|
||||
# Render headers with intelligent multi-row combination
|
||||
if expanded_headers:
|
||||
combined_headers = self._combine_multi_row_headers(expanded_headers)
|
||||
filtered_headers = [combined_headers[i] if i < len(combined_headers) else "" for i in content_columns]
|
||||
|
||||
row_md = "| " + " | ".join(filtered_headers) + " |"
|
||||
rows.append(row_md)
|
||||
|
||||
# Add separator
|
||||
separator = "| " + " | ".join(["---"] * len(filtered_headers)) + " |"
|
||||
rows.append(separator)
|
||||
|
||||
# Render data rows
|
||||
for expanded_row in expanded_data_rows:
|
||||
filtered_row = [expanded_row[i] if i < len(expanded_row) else "" for i in content_columns]
|
||||
|
||||
# Only add rows with meaningful content
|
||||
if any(cell.strip() for cell in filtered_row):
|
||||
row_md = "| " + " | ".join(filtered_row) + " |"
|
||||
rows.append(row_md)
|
||||
|
||||
return "\n".join(rows)
|
||||
|
||||
def _render_table_grid(self, node: TableNode) -> str:
|
||||
"""Render table in grid format."""
|
||||
# Simplified grid format
|
||||
all_rows = []
|
||||
|
||||
# Add headers
|
||||
if node.headers:
|
||||
for header_row in node.headers:
|
||||
cells = [cell.text() for cell in header_row]
|
||||
all_rows.append(" | ".join(cells))
|
||||
|
||||
# Add data rows
|
||||
for row in node.rows:
|
||||
cells = [cell.text() for cell in row.cells]
|
||||
all_rows.append(" | ".join(cells))
|
||||
|
||||
if all_rows:
|
||||
# Add borders
|
||||
max_width = max(len(row) for row in all_rows)
|
||||
border = "+" + "-" * (max_width + 2) + "+"
|
||||
result = [border]
|
||||
for row in all_rows:
|
||||
result.append(f"| {row:<{max_width}} |")
|
||||
result.append(border)
|
||||
return "\n".join(result)
|
||||
|
||||
return ""
|
||||
|
||||
def _render_table_simple(self, node: TableNode) -> str:
|
||||
"""Render table in simple format."""
|
||||
rows = []
|
||||
|
||||
# Add headers
|
||||
if node.headers:
|
||||
for header_row in node.headers:
|
||||
cells = [cell.text() for cell in header_row]
|
||||
rows.append(" ".join(cells))
|
||||
|
||||
# Add separator if we have headers
|
||||
if node.headers and node.rows:
|
||||
rows.append("")
|
||||
|
||||
# Add data rows
|
||||
for row in node.rows:
|
||||
cells = [cell.text() for cell in row.cells]
|
||||
rows.append(" ".join(cells))
|
||||
|
||||
return "\n".join(rows)
|
||||
|
||||
def _render_list(self, node: ListNode) -> str:
|
||||
"""Render list node."""
|
||||
self._list_depth += 1
|
||||
|
||||
items = []
|
||||
for child in node.children:
|
||||
if isinstance(child, ListItemNode):
|
||||
item_md = self._render_list_item(child)
|
||||
if item_md:
|
||||
items.append(item_md)
|
||||
|
||||
self._list_depth -= 1
|
||||
|
||||
return "\n".join(items)
|
||||
|
||||
def _render_list_item(self, node: ListItemNode) -> str:
|
||||
"""Render list item node."""
|
||||
# Determine bullet/number
|
||||
if node.parent and hasattr(node.parent, 'ordered') and node.parent.ordered:
|
||||
# Ordered list
|
||||
index = node.parent.children.index(node) + 1
|
||||
marker = f"{index}."
|
||||
else:
|
||||
# Unordered list
|
||||
markers = ['*', '-', '+']
|
||||
marker = markers[(self._list_depth - 1) % len(markers)]
|
||||
|
||||
# Indentation
|
||||
indent = " " * (self._list_depth - 1)
|
||||
|
||||
# Get content
|
||||
content = self._render_children(node).strip()
|
||||
|
||||
# Format item
|
||||
if '\n' in content:
|
||||
# Multi-line content
|
||||
lines = content.split('\n')
|
||||
result = indent + marker + " " + lines[0]
|
||||
for line in lines[1:]:
|
||||
result += "\n" + indent + " " + line
|
||||
return result
|
||||
else:
|
||||
# Single line
|
||||
return indent + marker + " " + content
|
||||
|
||||
def _render_children(self, node: Node) -> str:
|
||||
"""Render all children of a node."""
|
||||
parts = []
|
||||
|
||||
for child in node.children:
|
||||
child_md = self._render_node(child)
|
||||
if child_md:
|
||||
parts.append(child_md)
|
||||
|
||||
# Join with appropriate separator
|
||||
if self._in_table:
|
||||
return " ".join(parts)
|
||||
elif any(isinstance(child, (HeadingNode, ParagraphNode, TableNode, ListNode))
|
||||
for child in node.children):
|
||||
return "\n\n".join(parts)
|
||||
else:
|
||||
return " ".join(parts)
|
||||
|
||||
def _render_metadata(self, document: Document) -> str:
|
||||
"""Render document metadata."""
|
||||
lines = ["---"]
|
||||
|
||||
if document.metadata.company:
|
||||
lines.append(f"company: {document.metadata.company}")
|
||||
if document.metadata.form:
|
||||
lines.append(f"form: {document.metadata.form}")
|
||||
if document.metadata.filing_date:
|
||||
lines.append(f"filing_date: {document.metadata.filing_date}")
|
||||
if document.metadata.cik:
|
||||
lines.append(f"cik: {document.metadata.cik}")
|
||||
if document.metadata.accession_number:
|
||||
lines.append(f"accession_number: {document.metadata.accession_number}")
|
||||
|
||||
lines.append("---")
|
||||
|
||||
return "\n".join(lines)
|
||||
|
||||
def _generate_toc(self) -> str:
|
||||
"""Generate table of contents."""
|
||||
lines = ["## Table of Contents", ""]
|
||||
|
||||
for level, text, node_id in self._toc_entries:
|
||||
# Create anchor link
|
||||
anchor = self._create_anchor(text)
|
||||
|
||||
# Indentation based on level
|
||||
indent = " " * (level - 1)
|
||||
|
||||
# Add TOC entry
|
||||
lines.append(f"{indent}- [{text}](#{anchor})")
|
||||
|
||||
return "\n".join(lines)
|
||||
|
||||
def _create_anchor(self, text: str) -> str:
|
||||
"""Create anchor from heading text."""
|
||||
# Convert to lowercase and replace spaces with hyphens
|
||||
anchor = text.lower()
|
||||
anchor = anchor.replace(' ', '-')
|
||||
|
||||
# Remove special characters
|
||||
import re
|
||||
anchor = re.sub(r'[^a-z0-9\-]', '', anchor)
|
||||
|
||||
# Remove multiple hyphens
|
||||
anchor = re.sub(r'-+', '-', anchor)
|
||||
|
||||
return anchor.strip('-')
|
||||
|
||||
def _format_metadata(self, metadata: Dict) -> str:
|
||||
"""Format metadata for display."""
|
||||
parts = []
|
||||
|
||||
for key, value in metadata.items():
|
||||
if key == 'semantic_type':
|
||||
parts.append(f"type:{value}")
|
||||
elif key == 'section':
|
||||
parts.append(f"section:{value}")
|
||||
elif key == 'ix_tag':
|
||||
parts.append(f"xbrl:{value}")
|
||||
else:
|
||||
parts.append(f"{key}:{value}")
|
||||
|
||||
return " ".join(parts)
|
||||
|
||||
def _escape_markdown(self, text: str) -> str:
|
||||
"""Escape markdown special characters."""
|
||||
# Don't escape in tables
|
||||
if self._in_table:
|
||||
return text
|
||||
|
||||
# Escape special characters
|
||||
for char in ['\\', '`', '*', '_', '{', '}', '[', ']', '(', ')', '#', '+', '-', '.', '!']:
|
||||
text = text.replace(char, '\\' + char)
|
||||
|
||||
return text
|
||||
|
||||
def _wrap_text(self, text: str, width: int) -> str:
|
||||
"""Wrap text at specified width."""
|
||||
import textwrap
|
||||
return textwrap.fill(text, width=width, break_long_words=False)
|
||||
|
||||
def _expand_table_structure(self, node: TableNode) -> tuple:
|
||||
"""
|
||||
Expand table structure to handle column spanning properly.
|
||||
Returns (expanded_headers, expanded_data_rows).
|
||||
"""
|
||||
# Calculate the logical column count from colspan
|
||||
max_columns = 0
|
||||
|
||||
# Check all rows for maximum column span
|
||||
all_rows = []
|
||||
if node.headers:
|
||||
for header_row in node.headers:
|
||||
all_rows.append(header_row)
|
||||
for row in node.rows:
|
||||
all_rows.append(row.cells)
|
||||
|
||||
for row in all_rows:
|
||||
column_count = sum(cell.colspan for cell in row)
|
||||
max_columns = max(max_columns, column_count)
|
||||
|
||||
# Expand headers
|
||||
expanded_headers = []
|
||||
if node.headers:
|
||||
for header_row in node.headers:
|
||||
expanded = self._expand_row_to_columns(header_row, max_columns)
|
||||
expanded_headers.append(expanded)
|
||||
|
||||
# Expand data rows
|
||||
expanded_data_rows = []
|
||||
for row in node.rows:
|
||||
expanded = self._expand_row_to_columns(row.cells, max_columns)
|
||||
expanded_data_rows.append(expanded)
|
||||
|
||||
return expanded_headers, expanded_data_rows
|
||||
|
||||
def _expand_row_to_columns(self, cells: List, target_columns: int) -> List[str]:
|
||||
"""Expand a row with colspan cells to match the target column count."""
|
||||
expanded = []
|
||||
current_column = 0
|
||||
|
||||
for cell in cells:
|
||||
cell_text = cell.text().strip()
|
||||
|
||||
# Add the cell content
|
||||
expanded.append(cell_text)
|
||||
current_column += 1
|
||||
|
||||
# Add empty cells for remaining colspan
|
||||
for _ in range(cell.colspan - 1):
|
||||
if current_column < target_columns:
|
||||
expanded.append("")
|
||||
current_column += 1
|
||||
|
||||
# Pad to target column count if needed
|
||||
while len(expanded) < target_columns:
|
||||
expanded.append("")
|
||||
|
||||
return expanded[:target_columns]
|
||||
|
||||
def _identify_content_columns(self, expanded_headers: List[List[str]],
|
||||
expanded_data_rows: List[List[str]]) -> List[int]:
|
||||
"""Identify which columns actually contain meaningful content."""
|
||||
if not expanded_headers and not expanded_data_rows:
|
||||
return []
|
||||
|
||||
# Get the column count
|
||||
max_cols = 0
|
||||
if expanded_headers:
|
||||
max_cols = max(max_cols, max(len(row) for row in expanded_headers))
|
||||
if expanded_data_rows:
|
||||
max_cols = max(max_cols, max(len(row) for row in expanded_data_rows))
|
||||
|
||||
content_columns = []
|
||||
|
||||
for col in range(max_cols):
|
||||
has_content = False
|
||||
|
||||
# Check headers
|
||||
for header_row in expanded_headers:
|
||||
if col < len(header_row) and header_row[col].strip():
|
||||
has_content = True
|
||||
break
|
||||
|
||||
# Check data rows
|
||||
if not has_content:
|
||||
for data_row in expanded_data_rows:
|
||||
if col < len(data_row) and data_row[col].strip():
|
||||
has_content = True
|
||||
break
|
||||
|
||||
if has_content:
|
||||
content_columns.append(col)
|
||||
|
||||
return content_columns
|
||||
|
||||
def _combine_multi_row_headers(self, header_rows: List[List[str]]) -> List[str]:
|
||||
"""
|
||||
Combine multi-row headers intelligently for SEC filing tables.
|
||||
Prioritizes specific dates/periods over generic labels.
|
||||
"""
|
||||
if not header_rows:
|
||||
return []
|
||||
|
||||
num_columns = len(header_rows[0])
|
||||
combined = [""] * num_columns
|
||||
|
||||
for col in range(num_columns):
|
||||
# Collect all values for this column across header rows
|
||||
column_values = []
|
||||
for row in header_rows:
|
||||
if col < len(row) and row[col].strip():
|
||||
column_values.append(row[col].strip())
|
||||
|
||||
if column_values:
|
||||
# Prioritize date-like values over generic labels
|
||||
date_values = [v for v in column_values if self._looks_like_date(v)]
|
||||
if date_values:
|
||||
# Clean up line breaks in dates
|
||||
combined[col] = date_values[0].replace('\n', ' ')
|
||||
elif len(column_values) == 1:
|
||||
combined[col] = column_values[0].replace('\n', ' ')
|
||||
else:
|
||||
# Skip generic terms like "Year Ended" if we have something more specific
|
||||
specific_values = [v for v in column_values
|
||||
if v.lower() not in ['year ended', 'years ended']]
|
||||
if specific_values:
|
||||
combined[col] = specific_values[0].replace('\n', ' ')
|
||||
else:
|
||||
combined[col] = column_values[0].replace('\n', ' ')
|
||||
|
||||
return combined
|
||||
|
||||
def _looks_like_date(self, text: str) -> bool:
|
||||
"""Check if text looks like a date."""
|
||||
import re
|
||||
|
||||
# Common date patterns in SEC filings
|
||||
date_patterns = [
|
||||
r'\b(January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},?\s*\d{4}',
|
||||
r'\d{1,2}/\d{1,2}/\d{4}',
|
||||
r'\d{4}-\d{2}-\d{2}',
|
||||
r'^\d{4}$', # Just a year
|
||||
]
|
||||
|
||||
text_clean = text.replace('\n', ' ').strip()
|
||||
for pattern in date_patterns:
|
||||
if re.search(pattern, text_clean, re.IGNORECASE):
|
||||
return True
|
||||
|
||||
return False
|
||||
@@ -0,0 +1,51 @@
|
||||
"""
|
||||
Plain text renderer for parsed documents.
|
||||
"""
|
||||
|
||||
from typing import Optional
|
||||
from edgar.documents.document import Document
|
||||
from edgar.documents.extractors.text_extractor import TextExtractor
|
||||
|
||||
|
||||
class TextRenderer:
    """
    Renders parsed documents to plain text.

    All work is delegated to a TextExtractor configured at construction
    time; the class exists so that plain-text output is available through
    the same ``render(document)`` call as the other renderers.
    """

    def __init__(self,
                 clean: bool = True,
                 include_tables: bool = True,
                 max_length: Optional[int] = None,
                 preserve_structure: bool = False):
        """
        Initialize text renderer.

        Args:
            clean: Clean and normalize text
            include_tables: Include table content
            max_length: Maximum text length
            preserve_structure: Preserve document structure
        """
        # Metadata and links are always switched off for this renderer.
        extractor_options = dict(
            clean=clean,
            include_tables=include_tables,
            include_metadata=False,
            include_links=False,
            max_length=max_length,
            preserve_structure=preserve_structure,
        )
        self.extractor = TextExtractor(**extractor_options)

    def render(self, document: Document) -> str:
        """
        Render document to plain text.

        Args:
            document: Document to render

        Returns:
            Plain text
        """
        plain_text = self.extractor.extract(document)
        return plain_text
|
||||
Reference in New Issue
Block a user