Initial commit

This commit is contained in:
kdusek
2025-12-09 12:13:01 +01:00
commit 8e654ed209
13332 changed files with 2695056 additions and 0 deletions

View File

@@ -0,0 +1,13 @@
"""
Document renderers for various output formats.
"""
from edgar.documents.renderers.markdown import MarkdownRenderer
from edgar.documents.renderers.text import TextRenderer
from edgar.documents.renderers.fast_table import FastTableRenderer
# Names re-exported by this package; each one is imported from its
# implementing module just above.
__all__ = [
'MarkdownRenderer',
'TextRenderer',
'FastTableRenderer'
]

View File

@@ -0,0 +1,669 @@
"""
Fast table renderer for edgar.documents - optimized for performance.
This module provides a high-performance alternative to Rich table rendering
while maintaining professional output quality and readability.
Performance target: ~32x faster than Rich rendering (0.2ms vs 6.5ms per table)
"""
from dataclasses import dataclass
from typing import List, Dict, Optional, Union, Tuple
from enum import Enum
class Alignment(Enum):
    """Column alignment options.

    Selects how a cell's text is justified inside its column when a row
    is formatted.
    """
    LEFT = "left"      # text is left-justified; padding goes on the right
    RIGHT = "right"    # text is right-justified; padding goes on the left
    CENTER = "center"  # text is centered within the column
@dataclass
class ColumnConfig:
    """Configuration for a table column.

    NOTE(review): nothing in the visible part of this module reads
    ColumnConfig; confirm whether external callers depend on it.
    """
    alignment: Alignment = Alignment.LEFT  # default justification for the column
    min_width: int = 8               # presumably the smallest width in characters -- confirm
    max_width: Optional[int] = None  # presumably None means "no upper limit" -- confirm
    padding: int = 1                 # presumably spaces on each side of the content -- confirm
@dataclass
class TableStyle:
    """Table styling configuration.

    Attributes:
        border_char: String placed before, between and after cells. An
            empty string produces a borderless table.
        header_separator: Character repeated to draw the rule under the
            header rows. An empty string disables the rule.
        corner_char: Corner character of the style (not read by the
            renderer code visible in this module).
        padding: Number of spaces added on each side of a cell's text.
        min_col_width: Smallest total column width, padding included.
        max_col_width: Largest total column width, padding included.
    """
    border_char: str = "|"
    header_separator: str = "-"
    corner_char: str = "+"
    padding: int = 1
    min_col_width: int = 8
    max_col_width: int = 50

    @classmethod
    def pipe_table(cls) -> 'TableStyle':
        """Markdown-compatible pipe table style."""
        return cls(
            border_char="|",
            header_separator="-",
            corner_char="|",
            padding=1,
            min_col_width=8,
            max_col_width=50,
        )

    @classmethod
    def minimal(cls) -> 'TableStyle':
        """Minimal table style: spacing only, no borders and no header rule."""
        return cls(
            border_char="",
            header_separator="",
            corner_char="",
            padding=2,
            min_col_width=6,
            max_col_width=40,
        )

    @classmethod
    def simple(cls) -> 'TableStyle':
        """
        Simple table style modelled on Rich's box.SIMPLE.

        Features:
        - No outer border
        - No column separators
        - Single horizontal line under the header
        - Space-separated columns with generous padding
        """
        return cls(
            border_char="",             # no pipes/borders
            # U+2500 BOX DRAWINGS LIGHT HORIZONTAL. This must be non-empty:
            # an empty separator suppresses the header rule entirely and
            # would make this style behave like minimal().
            header_separator="\u2500",
            corner_char="",             # no corners
            padding=2,                  # wider than pipe_table's 1
            min_col_width=6,            # narrower than pipe_table's 8
            max_col_width=60,           # wider than pipe_table's 50
        )
class FastTableRenderer:
    """
    Plain-text table renderer driven by a TableStyle.

    The module docstring positions this class as a faster alternative to
    Rich table rendering. Its methods:
    - expand a TableNode (colspan/rowspan) into a rectangular grid
    - drop spacing-only columns and merge related ones (e.g. "$" + amount)
    - size and align columns from their content
    - emit bordered (pipe) or borderless text
    """

    def __init__(self, style: Optional[TableStyle] = None):
        """Create a renderer.

        Args:
            style: Styling to use; a falsy value selects the pipe-table style.
        """
        if not style:
            style = TableStyle.pipe_table()
        self.style = style
        # Starts empty; not read by the methods visible in this module.
        self._format_cache = {}
def render_table_node(self, table_node) -> str:
    """
    Render a TableNode as text, expanding colspan/rowspan first.

    Args:
        table_node: TableNode instance from edgar.documents

    Returns:
        The formatted table, preceded by the caption line when the node
        has a non-empty caption.
    """
    from edgar.documents.utils.table_matrix import TableMatrix

    header_source = table_node.headers
    header_count = len(header_source) if header_source else 0

    # The matrix spreads every cell over its full colspan/rowspan.
    grid = TableMatrix()
    grid.build_from_rows(header_source, table_node.rows)

    def row_texts(index):
        # Missing cells (None) are rendered as empty strings.
        return [cell.text().strip() if cell else '' for cell in grid.get_expanded_row(index)]

    headers = [row_texts(index) for index in range(header_count)]
    rows = [row_texts(index) for index in range(header_count, grid.row_count)]

    rendered = self.render_table_data(headers, rows)

    caption = getattr(table_node, 'caption', None)
    if caption:
        return f"{caption}\n{rendered}"
    return rendered
def render_table_data(self, headers: List[List[str]], rows: List[List[str]]) -> str:
    """
    Render already-extracted header and data rows.

    Args:
        headers: List of header rows (several for multi-row headers)
        rows: List of data rows

    Returns:
        Formatted table string, or "" when there is nothing to show
    """
    if not headers and not rows:
        return ""

    combined = headers + rows if headers else rows
    if not combined:
        return ""

    widest = max(len(row) for row in combined) if combined else 0
    if widest == 0:
        return ""

    # Keep only the columns that carry real content.
    keep = self._identify_meaningful_columns(combined, widest)
    if not keep:
        return ""

    kept_headers = []
    if headers:
        kept_headers = [self._filter_row_to_columns(row, keep) for row in headers]
    kept_rows = [self._filter_row_to_columns(row, keep) for row in rows]

    # Merge related columns (e.g. "$" with its amount) over headers and
    # data together, using the first row as the reference header.
    everything = kept_headers + kept_rows
    if everything:
        _, merged = self._merge_related_columns(everything[0], everything)
        if kept_headers:
            header_count = len(kept_headers)
            kept_headers = merged[:header_count]
            kept_rows = merged[header_count:]
        else:
            kept_rows = merged

    final_rows = kept_headers + kept_rows if kept_headers else kept_rows
    final_cols = max(len(row) for row in final_rows) if final_rows else 0

    widths = self._calculate_column_widths(final_rows, final_cols)
    aligns = self._detect_alignments(final_rows, final_cols)
    return self._build_table(kept_headers, kept_rows, widths, aligns)
def _combine_headers(self, headers: List[List[str]]) -> List[str]:
"""
Combine multi-row headers intelligently.
For SEC tables, this prioritizes specific dates/periods over generic labels.
"""
if not headers:
return []
if len(headers) == 1:
return headers[0]
# Determine max columns across all header rows
max_cols = max(len(row) for row in headers) if headers else 0
combined = [""] * max_cols
for col in range(max_cols):
# Collect all values for this column
values = []
for header_row in headers:
if col < len(header_row) and header_row[col].strip():
values.append(header_row[col].strip())
if values:
# Prioritize date-like values over generic terms
date_values = [v for v in values if self._looks_like_date(v)]
if date_values:
combined[col] = date_values[0]
elif len(values) == 1:
combined[col] = values[0]
else:
# Skip generic terms like "Year Ended" if we have something more specific
specific_values = [v for v in values
if v.lower() not in {'year ended', 'years ended', 'period ended'}]
combined[col] = specific_values[0] if specific_values else values[0]
return combined
def _looks_like_date(self, text: str) -> bool:
"""Quick date detection for header processing."""
if not text or len(text) < 4:
return False
text_lower = text.lower().replace('\n', ' ').strip()
# Common date indicators
date_indicators = [
'january', 'february', 'march', 'april', 'may', 'june',
'july', 'august', 'september', 'october', 'november', 'december',
'20', '19', # Year prefixes
]
return any(indicator in text_lower for indicator in date_indicators) and \
any(c.isdigit() for c in text)
def _identify_meaningful_columns(self, all_rows: List[List[str]], max_cols: int) -> List[int]:
"""
Identify columns that contain meaningful content (not just spacing).
Returns:
List of column indices that have meaningful content
"""
column_scores = []
for col_idx in range(max_cols):
content_score = 0
total_rows = 0
# Score each column based on content quality
for row in all_rows:
if col_idx < len(row):
total_rows += 1
cell_content = str(row[col_idx]).strip()
if cell_content:
# Higher score for longer, more substantial content
if len(cell_content) >= 3: # Substantial content
content_score += 3
elif len(cell_content) == 2 and cell_content.isalnum():
content_score += 2
elif len(cell_content) == 1 and (cell_content.isalnum() or cell_content == '$'):
content_score += 1
# Skip single spaces, dashes, or other likely spacing characters
# Calculate average score per row for this column
avg_score = content_score / max(total_rows, 1)
column_scores.append((col_idx, avg_score, content_score))
# Sort by score descending
column_scores.sort(key=lambda x: x[1], reverse=True)
# Take columns with meaningful content (score >= 0.5 or among top columns)
meaningful_columns = []
for col_idx, avg_score, total_score in column_scores:
# Include if it has good average score or significant total content
if avg_score >= 0.5 or total_score >= 5:
meaningful_columns.append(col_idx)
# Limit to reasonable number of columns for readability
if len(meaningful_columns) >= 8:
break
# Sort by original column order
meaningful_columns.sort()
return meaningful_columns
def _filter_row_to_columns(self, row: List[str], column_indices: List[int]) -> List[str]:
"""
Filter a row to only include the specified column indices.
Args:
row: Original row data
column_indices: List of column indices to keep
Returns:
Filtered row with only the specified columns
"""
if not row:
return []
filtered_row = []
for col_idx in column_indices:
if col_idx < len(row):
filtered_row.append(row[col_idx])
else:
filtered_row.append("") # Missing column
return filtered_row
def _merge_related_columns(self, headers: List[str], rows: List[List[str]]) -> tuple:
"""
Merge related columns (e.g., currency symbols with their amounts).
Returns:
Tuple of (merged_headers, merged_rows)
"""
if not rows or not any(rows):
return headers, rows
# Find columns that should be merged
merge_pairs = []
max_cols = max(len(row) for row in [headers] + rows if row) if rows else len(headers) if headers else 0
for col_idx in range(max_cols - 1):
# Check if this column and the next should be merged
should_merge = self._should_merge_columns(headers, rows, col_idx, col_idx + 1)
if should_merge:
merge_pairs.append((col_idx, col_idx + 1))
# Apply merges (from right to left to avoid index shifting)
merged_headers = headers[:] if headers else []
merged_rows = [row[:] for row in rows]
for left_idx, right_idx in reversed(merge_pairs):
# Merge headers
if merged_headers and left_idx < len(merged_headers) and right_idx < len(merged_headers):
left_header = merged_headers[left_idx].strip()
right_header = merged_headers[right_idx].strip()
merged_header = f"{left_header} {right_header}".strip()
merged_headers[left_idx] = merged_header
merged_headers.pop(right_idx)
# Merge rows
for row in merged_rows:
if left_idx < len(row) and right_idx < len(row):
left_cell = str(row[left_idx]).strip()
right_cell = str(row[right_idx]).strip()
# Smart merging based on content
if left_cell == '$' and right_cell:
merged_cell = f"${right_cell}"
elif left_cell and right_cell:
merged_cell = f"{left_cell} {right_cell}"
else:
merged_cell = left_cell or right_cell
row[left_idx] = merged_cell
if right_idx < len(row):
row.pop(right_idx)
return merged_headers, merged_rows
def _should_merge_columns(self, headers: List[str], rows: List[List[str]], left_idx: int, right_idx: int) -> bool:
"""
Determine if two adjacent columns should be merged.
Returns:
True if columns should be merged
"""
# Check if left column is mostly currency symbols
currency_count = 0
total_count = 0
for row in rows:
if left_idx < len(row) and right_idx < len(row):
total_count += 1
left_cell = str(row[left_idx]).strip()
right_cell = str(row[right_idx]).strip()
# If left is '$' and right is a number, they should be merged
if left_cell == '$' and right_cell and (right_cell.replace(',', '').replace('.', '').isdigit()):
currency_count += 1
# If most rows have currency symbol + number pattern, merge them
if total_count > 0 and currency_count / total_count >= 0.5:
return True
# Check for other merge patterns (e.g., empty left column with content right column)
empty_left_count = 0
for row in rows:
if left_idx < len(row) and right_idx < len(row):
left_cell = str(row[left_idx]).strip()
right_cell = str(row[right_idx]).strip()
if not left_cell and right_cell:
empty_left_count += 1
# If left column is mostly empty, consider merging
if total_count > 0 and empty_left_count / total_count >= 0.7:
return True
return False
def _calculate_column_widths(self, all_rows: List[List[str]], max_cols: int) -> List[int]:
"""Calculate optimal column widths based on content."""
col_widths = [self.style.min_col_width] * max_cols
# Find the maximum content width for each column
for row in all_rows:
for col_idx in range(min(len(row), max_cols)):
content = str(row[col_idx]) if row[col_idx] else ""
# Handle multi-line content
max_line_width = max((len(line) for line in content.split('\n')), default=0)
content_width = max_line_width + (self.style.padding * 2)
# Apply limits
content_width = min(content_width, self.style.max_col_width)
col_widths[col_idx] = max(col_widths[col_idx], content_width)
return col_widths
def _detect_alignments(self, all_rows: List[List[str]], max_cols: int) -> List[Alignment]:
"""Detect appropriate alignment for each column based on content."""
alignments = [Alignment.LEFT] * max_cols
for col_idx in range(max_cols):
# Analyze column content (skip header row if present)
data_rows = all_rows[1:] if len(all_rows) > 1 else all_rows
numeric_count = 0
total_count = 0
for row in data_rows:
if col_idx < len(row) and row[col_idx].strip():
total_count += 1
content = row[col_idx].strip()
# Check if content looks numeric (currency, percentages, numbers)
if self._looks_numeric(content):
numeric_count += 1
# If most values in column are numeric, right-align
if total_count > 0 and numeric_count / total_count >= 0.7:
alignments[col_idx] = Alignment.RIGHT
return alignments
def _looks_numeric(self, text: str) -> bool:
"""Check if text content looks numeric."""
if not text:
return False
# Remove common formatting characters
clean_text = text.replace(',', '').replace('$', '').replace('%', '').replace('(', '').replace(')', '').strip()
# Handle negative numbers in parentheses
if text.strip().startswith('(') and text.strip().endswith(')'):
clean_text = text.strip()[1:-1].replace(',', '').replace('$', '').strip()
# Check if remaining text is numeric
try:
float(clean_text)
return True
except ValueError:
return False
def _build_table(self, headers: List[List[str]], rows: List[List[str]],
col_widths: List[int], alignments: List[Alignment]) -> str:
"""
Build the final table string.
Args:
headers: List of header rows (can be multiple rows for multi-row headers)
rows: List of data rows
col_widths: Column widths
alignments: Column alignments
"""
lines = []
# Header rows (can be multiple)
if headers:
for header_row in headers:
# Only add header rows with meaningful content
if any(cell.strip() for cell in header_row):
# Handle multi-line cells in header rows
formatted_lines = self._format_multiline_row(header_row, col_widths, alignments)
lines.extend(formatted_lines)
# Header separator (after all header rows)
if self.style.header_separator:
sep_line = self._create_separator_line(col_widths)
lines.append(sep_line)
# Data rows
for row in rows:
# Only add rows with meaningful content
if any(cell.strip() for cell in row):
row_line = self._format_row(row, col_widths, alignments)
lines.append(row_line)
return '\n'.join(lines)
def _format_row(self, row: List[str], col_widths: List[int],
alignments: List[Alignment]) -> str:
"""Format a single row with proper alignment and padding."""
cells = []
border = self.style.border_char
for col_idx, width in enumerate(col_widths):
# Get cell content
content = str(row[col_idx]) if col_idx < len(row) else ""
# Handle multi-line content (take first line only for table)
if '\n' in content:
content = content.split('\n')[0]
content = content.strip()
# Calculate available width for content
available_width = width - (self.style.padding * 2)
# Truncate if too long
if len(content) > available_width:
content = content[:available_width-3] + "..."
# Apply alignment
alignment = alignments[col_idx] if col_idx < len(alignments) else Alignment.LEFT
if alignment == Alignment.RIGHT:
aligned_content = content.rjust(available_width)
elif alignment == Alignment.CENTER:
aligned_content = content.center(available_width)
else: # LEFT
aligned_content = content.ljust(available_width)
# Add padding
padded_cell = ' ' * self.style.padding + aligned_content + ' ' * self.style.padding
cells.append(padded_cell)
# Join with borders
if border:
return border + border.join(cells) + border
else:
return ' '.join(cells)
def _format_multiline_row(self, row: List[str], col_widths: List[int],
alignments: List[Alignment]) -> List[str]:
"""
Format a row that may contain multi-line cells (cells with \n characters).
Returns a list of formatted lines, one for each line of text in the cells.
"""
# Split each cell by newlines
cell_lines = []
max_lines = 1
for col_idx, content in enumerate(row):
lines = content.split('\n') if content else ['']
cell_lines.append(lines)
max_lines = max(max_lines, len(lines))
# Build output lines
output_lines = []
for line_idx in range(max_lines):
# Build row for this line
current_row = []
for col_idx in range(len(row)):
# Get the line for this cell, or empty string if this cell has fewer lines
if line_idx < len(cell_lines[col_idx]):
current_row.append(cell_lines[col_idx][line_idx])
else:
current_row.append('')
# Format this line
formatted_line = self._format_row(current_row, col_widths, alignments)
output_lines.append(formatted_line)
return output_lines
def _create_separator_line(self, col_widths: List[int]) -> str:
"""
Create header separator line.
For bordered styles: |-------|-------|
For borderless styles: ─────────────── (full width horizontal line)
"""
sep_char = self.style.header_separator
border = self.style.border_char
if not sep_char:
# No separator at all (minimal style)
return ""
if border:
# Bordered style: create separator matching column widths
separators = []
for width in col_widths:
separators.append(sep_char * width)
return border + border.join(separators) + border
else:
# Borderless style (simple): single horizontal line across full width
# Calculate total width: sum of column widths + gaps between columns
total_width = sum(col_widths) + (len(col_widths) - 1) * 2 # 2-space gaps
# Add leading space for indentation (matching row indentation)
return " " + sep_char * total_width
# Factory functions for easy usage
def create_fast_renderer(style: str = "pipe") -> FastTableRenderer:
    """
    Create a FastTableRenderer with a predefined style.

    Args:
        style: Style name: "pipe" (default), "minimal" or "simple".
            Any other value falls back to the pipe style.

    Returns:
        Configured FastTableRenderer instance
    """
    if style == "minimal":
        table_style = TableStyle.minimal()
    elif style == "simple":
        # Previously unreachable through this factory: "simple" silently
        # produced a pipe table.
        table_style = TableStyle.simple()
    else:  # "pipe" and unknown names
        table_style = TableStyle.pipe_table()
    return FastTableRenderer(table_style)
def render_table_fast(table_node, style: str = "pipe") -> str:
    """
    One-call helper: build a renderer for the named style and render a table.

    Args:
        table_node: TableNode instance
        style: Style name, passed through to create_fast_renderer

    Returns:
        Formatted table string
    """
    return create_fast_renderer(style).render_table_node(table_node)

View File

@@ -0,0 +1,613 @@
"""
Markdown renderer for parsed documents.
"""
from typing import List, Optional, Dict, Set
from edgar.documents.document import Document
from edgar.documents.nodes import Node, TextNode, HeadingNode, ParagraphNode, ListNode, ListItemNode
from edgar.documents.table_nodes import TableNode
class MarkdownRenderer:
    """
    Renders parsed documents to Markdown.

    Walks the document's node tree and emits headings, paragraphs, text,
    tables and (nested) lists. It can optionally add a metadata header,
    per-node metadata comments and a table of contents.
    """

    def __init__(self,
                 include_metadata: bool = False,
                 include_toc: bool = False,
                 max_heading_level: int = 6,
                 table_format: str = 'pipe',
                 wrap_width: Optional[int] = None):
        """
        Store the rendering options and start with a clean render state.

        Args:
            include_metadata: Include metadata annotations
            include_toc: Generate table of contents
            max_heading_level: Maximum heading level to render
            table_format: Table format ('pipe', 'grid', 'simple')
            wrap_width: Wrap paragraph text at this width (None/0 disables)
        """
        # Options
        self.wrap_width = wrap_width
        self.table_format = table_format
        self.max_heading_level = max_heading_level
        self.include_toc = include_toc
        self.include_metadata = include_metadata

        # Per-render state
        self._in_table = False
        self._list_depth = 0
        self._rendered_ids: Set[str] = set()
        self._toc_entries: List[tuple] = []
def render(self, document: Document) -> str:
    """
    Render a whole document to Markdown.

    Args:
        document: Document to render

    Returns:
        Markdown text, stripped of leading and trailing whitespace
    """
    self._reset_state()
    toc_placeholder = "<!-- TOC -->"

    sections = []
    if self.include_metadata:
        sections += [self._render_metadata(document), ""]
    if self.include_toc:
        # The headings are only known after the content is rendered, so a
        # marker is emitted now and swapped for the real TOC afterwards.
        sections += [toc_placeholder, ""]
    sections.append(self._render_node(document.root))

    markdown = "\n".join(sections)
    if self.include_toc and self._toc_entries:
        markdown = markdown.replace(toc_placeholder, self._generate_toc())
    return markdown.strip()
def render_node(self, node: Node) -> str:
    """
    Render one node (and its subtree) to Markdown, starting from a clean state.

    Args:
        node: Node to render

    Returns:
        Markdown formatted text
    """
    self._reset_state()
    rendered = self._render_node(node)
    return rendered
def _reset_state(self):
"""Reset renderer state."""
self._toc_entries = []
self._rendered_ids = set()
self._list_depth = 0
self._in_table = False
def _render_node(self, node: Node) -> str:
    """Render a node by type; a node id that was already rendered yields ""."""
    node_id = node.id
    if node_id in self._rendered_ids:
        return ""
    self._rendered_ids.add(node_id)

    # Checked in this order; the first matching type wins.
    handlers = (
        (HeadingNode, self._render_heading),
        (ParagraphNode, self._render_paragraph),
        (TextNode, self._render_text),
        (TableNode, self._render_table),
        (ListNode, self._render_list),
        (ListItemNode, self._render_list_item),
    )
    for node_type, handler in handlers:
        if isinstance(node, node_type):
            return handler(node)

    # Any other node type: just render its children.
    return self._render_children(node)
def _render_heading(self, node: HeadingNode) -> str:
    """Render a heading line followed by the heading's child content.

    The level is capped at max_heading_level, and a heading with blank
    text renders as "". When a TOC is requested, the heading is recorded
    for it.
    """
    level = min(node.level, self.max_heading_level)

    title = node.text().strip()
    if not title:
        return ""

    if self.include_toc:
        self._toc_entries.append((level, title, node.id))

    pieces = ["#" * level + " " + title]

    if self.include_metadata and node.metadata:
        note = self._format_metadata(node.metadata)
        if note:
            pieces.append(f" <!-- {note} -->")

    body = self._render_children(node)
    if body:
        pieces.append("\n\n" + body)

    return "".join(pieces)
def _render_paragraph(self, node: ParagraphNode) -> str:
    """Render a paragraph from its children.

    The text is wrapped when wrap_width is set, and a metadata comment is
    placed on the line above when metadata output is enabled. A blank
    paragraph renders as "".
    """
    body = self._render_children(node).strip()
    if not body:
        return ""

    if self.wrap_width:
        body = self._wrap_text(body, self.wrap_width)

    if not (self.include_metadata and node.metadata):
        return body

    note = self._format_metadata(node.metadata)
    return f"<!-- {note} -->\n{body}" if note else body
def _render_text(self, node: TextNode) -> str:
    """Render a text node: escape it, then apply at most one emphasis.

    Bold takes precedence over italic, which takes precedence over
    underline.
    """
    escaped = self._escape_markdown(node.text())

    style = node.style
    if not style:
        return escaped

    if style.font_weight in ['bold', '700', '800', '900']:
        return f"**{escaped}**"
    if style.font_style == 'italic':
        return f"*{escaped}*"
    if style.text_decoration == 'underline':
        return f"<u>{escaped}</u>"
    return escaped
def _render_table(self, node: TableNode) -> str:
    """Render a table with optional caption line and trailing metadata comment.

    The table body is produced by the pipe, grid or simple renderer
    according to table_format; any unrecognised format uses simple.
    """
    self._in_table = True

    pieces = [f"**Table: {node.caption}**", ""] if node.caption else []

    chosen_format = self.table_format
    if chosen_format == 'pipe':
        body = self._render_table_pipe(node)
    elif chosen_format == 'grid':
        body = self._render_table_grid(node)
    else:  # simple
        body = self._render_table_simple(node)
    pieces.append(body)

    if self.include_metadata and node.metadata:
        note = self._format_metadata(node.metadata)
        if note:
            pieces.append(f"<!-- Table metadata: {note} -->")

    self._in_table = False
    return "\n".join(pieces)
def _render_table_pipe(self, node: TableNode) -> str:
    """Render a table in pipe format with column-spanning support.

    Cells are expanded to the table's logical column count, and columns
    that are empty in every row are dropped. Multi-row headers are
    combined into a single header row, and data rows with no content are
    skipped.

    Cell text is made safe for a pipe table: line breaks inside a cell
    are folded into single spaces, and literal "|" characters are
    backslash-escaped so they are not read as column boundaries.

    Args:
        node: Table to render.

    Returns:
        The Markdown table, or "" when no column has content.
    """
    def as_cell(text: str) -> str:
        # Every row must stay on one physical line.
        folded = " ".join(part.strip() for part in text.splitlines() if part.strip())
        return folded.replace("|", "\\|")

    expanded_headers, expanded_data_rows = self._expand_table_structure(node)

    content_columns = self._identify_content_columns(expanded_headers, expanded_data_rows)
    if not content_columns:
        return ""

    rows = []

    if expanded_headers:
        combined_headers = self._combine_multi_row_headers(expanded_headers)
        filtered_headers = [
            as_cell(combined_headers[i]) if i < len(combined_headers) else ""
            for i in content_columns
        ]
        rows.append("| " + " | ".join(filtered_headers) + " |")
        rows.append("| " + " | ".join(["---"] * len(filtered_headers)) + " |")

    for expanded_row in expanded_data_rows:
        filtered_row = [
            as_cell(expanded_row[i]) if i < len(expanded_row) else ""
            for i in content_columns
        ]
        # Skip rows that carry no content at all.
        if any(cell.strip() for cell in filtered_row):
            rows.append("| " + " | ".join(filtered_row) + " |")

    return "\n".join(rows)
def _render_table_grid(self, node: TableNode) -> str:
    """Render a table in a simplified grid format.

    Each row becomes one " | "-joined line, left-justified to the longest
    line and framed by "| ... |". One "+---+" border sits above and below
    the whole block. A table with no rows renders as "".
    """
    joined_rows = []
    if node.headers:
        joined_rows.extend(
            " | ".join(cell.text() for cell in header_row)
            for header_row in node.headers
        )
    joined_rows.extend(
        " | ".join(cell.text() for cell in row.cells)
        for row in node.rows
    )

    if not joined_rows:
        return ""

    widest = max(map(len, joined_rows))
    rule = "+" + "-" * (widest + 2) + "+"
    framed = [f"| {text:<{widest}} |" for text in joined_rows]
    return "\n".join([rule] + framed + [rule])
def _render_table_simple(self, node: TableNode) -> str:
    """Render a table as space-joined lines.

    Header rows come first. When both headers and data rows exist, one
    empty line separates them.
    """
    def joined(cells) -> str:
        return " ".join(cell.text() for cell in cells)

    output = []
    if node.headers:
        output.extend(joined(header_row) for header_row in node.headers)

    if node.headers and node.rows:
        output.append("")

    output.extend(joined(row.cells) for row in node.rows)
    return "\n".join(output)
def _render_list(self, node: ListNode) -> str:
    """Render a list, one entry per ListItemNode child that produces output.

    The nesting depth is incremented while the items are rendered, so
    nested lists pick up deeper indentation and a different bullet.
    """
    self._list_depth += 1
    rendered = [
        self._render_list_item(child)
        for child in node.children
        if isinstance(child, ListItemNode)
    ]
    self._list_depth -= 1
    return "\n".join(entry for entry in rendered if entry)
def _render_list_item(self, node: ListItemNode) -> str:
    """Render one list item with its marker and depth-based indentation.

    Items of an ordered parent are numbered by their position among the
    parent's children. Other items use a bullet that cycles through
    "*", "-", "+" with the nesting depth. Continuation lines of
    multi-line content are indented one extra space.
    """
    parent = node.parent
    if parent and hasattr(parent, 'ordered') and parent.ordered:
        marker = f"{parent.children.index(node) + 1}."
    else:
        bullets = ['*', '-', '+']
        marker = bullets[(self._list_depth - 1) % len(bullets)]

    indent = " " * (self._list_depth - 1)

    content = self._render_children(node).strip()

    # Splitting single-line content yields just the first line, so one
    # code path covers both cases.
    first_line, *continuation = content.split('\n')
    pieces = [indent + marker + " " + first_line]
    pieces.extend(indent + " " + line for line in continuation)
    return "\n".join(pieces)
def _render_children(self, node: Node) -> str:
    """Render every child and join the non-empty results.

    Outside a table, children are separated by a blank line when any
    child is a heading, paragraph, table or list. In every other case
    they are joined with a single space.
    """
    rendered = [text for text in map(self._render_node, node.children) if text]

    block_level = not self._in_table and any(
        isinstance(child, (HeadingNode, ParagraphNode, TableNode, ListNode))
        for child in node.children
    )
    separator = "\n\n" if block_level else " "
    return separator.join(rendered)
def _render_metadata(self, document: Document) -> str:
    """Render the document metadata as a "---"-delimited header block.

    Only fields with a truthy value are written, one "name: value" line
    each.
    """
    field_names = ("company", "form", "filing_date", "cik", "accession_number")

    block = ["---"]
    for name in field_names:
        value = getattr(document.metadata, name)
        if value:
            block.append(f"{name}: {value}")
    block.append("---")
    return "\n".join(block)
def _generate_toc(self) -> str:
"""Generate table of contents."""
lines = ["## Table of Contents", ""]
for level, text, node_id in self._toc_entries:
# Create anchor link
anchor = self._create_anchor(text)
# Indentation based on level
indent = " " * (level - 1)
# Add TOC entry
lines.append(f"{indent}- [{text}](#{anchor})")
return "\n".join(lines)
def _create_anchor(self, text: str) -> str:
"""Create anchor from heading text."""
# Convert to lowercase and replace spaces with hyphens
anchor = text.lower()
anchor = anchor.replace(' ', '-')
# Remove special characters
import re
anchor = re.sub(r'[^a-z0-9\-]', '', anchor)
# Remove multiple hyphens
anchor = re.sub(r'-+', '-', anchor)
return anchor.strip('-')
def _format_metadata(self, metadata: Dict) -> str:
"""Format metadata for display."""
parts = []
for key, value in metadata.items():
if key == 'semantic_type':
parts.append(f"type:{value}")
elif key == 'section':
parts.append(f"section:{value}")
elif key == 'ix_tag':
parts.append(f"xbrl:{value}")
else:
parts.append(f"{key}:{value}")
return " ".join(parts)
def _escape_markdown(self, text: str) -> str:
"""Escape markdown special characters."""
# Don't escape in tables
if self._in_table:
return text
# Escape special characters
for char in ['\\', '`', '*', '_', '{', '}', '[', ']', '(', ')', '#', '+', '-', '.', '!']:
text = text.replace(char, '\\' + char)
return text
def _wrap_text(self, text: str, width: int) -> str:
"""Wrap text at specified width."""
import textwrap
return textwrap.fill(text, width=width, break_long_words=False)
def _expand_table_structure(self, node: TableNode) -> tuple:
    """
    Expand every row to the table's logical column count.

    The column count is the largest sum of cell colspans over all header
    and data rows.

    Returns (expanded_headers, expanded_data_rows).
    """
    header_rows = node.headers if node.headers else []
    data_rows = [row.cells for row in node.rows]

    span_totals = [
        sum(cell.colspan for cell in cells)
        for cells in [*header_rows, *data_rows]
    ]
    column_count = max([0, *span_totals])

    expanded_headers = [
        self._expand_row_to_columns(cells, column_count) for cells in header_rows
    ]
    expanded_data_rows = [
        self._expand_row_to_columns(cells, column_count) for cells in data_rows
    ]
    return expanded_headers, expanded_data_rows
def _expand_row_to_columns(self, cells: List, target_columns: int) -> List[str]:
"""Expand a row with colspan cells to match the target column count."""
expanded = []
current_column = 0
for cell in cells:
cell_text = cell.text().strip()
# Add the cell content
expanded.append(cell_text)
current_column += 1
# Add empty cells for remaining colspan
for _ in range(cell.colspan - 1):
if current_column < target_columns:
expanded.append("")
current_column += 1
# Pad to target column count if needed
while len(expanded) < target_columns:
expanded.append("")
return expanded[:target_columns]
def _identify_content_columns(self, expanded_headers: List[List[str]],
expanded_data_rows: List[List[str]]) -> List[int]:
"""Identify which columns actually contain meaningful content."""
if not expanded_headers and not expanded_data_rows:
return []
# Get the column count
max_cols = 0
if expanded_headers:
max_cols = max(max_cols, max(len(row) for row in expanded_headers))
if expanded_data_rows:
max_cols = max(max_cols, max(len(row) for row in expanded_data_rows))
content_columns = []
for col in range(max_cols):
has_content = False
# Check headers
for header_row in expanded_headers:
if col < len(header_row) and header_row[col].strip():
has_content = True
break
# Check data rows
if not has_content:
for data_row in expanded_data_rows:
if col < len(data_row) and data_row[col].strip():
has_content = True
break
if has_content:
content_columns.append(col)
return content_columns
def _combine_multi_row_headers(self, header_rows: List[List[str]]) -> List[str]:
"""
Combine multi-row headers intelligently for SEC filing tables.
Prioritizes specific dates/periods over generic labels.
"""
if not header_rows:
return []
num_columns = len(header_rows[0])
combined = [""] * num_columns
for col in range(num_columns):
# Collect all values for this column across header rows
column_values = []
for row in header_rows:
if col < len(row) and row[col].strip():
column_values.append(row[col].strip())
if column_values:
# Prioritize date-like values over generic labels
date_values = [v for v in column_values if self._looks_like_date(v)]
if date_values:
# Clean up line breaks in dates
combined[col] = date_values[0].replace('\n', ' ')
elif len(column_values) == 1:
combined[col] = column_values[0].replace('\n', ' ')
else:
# Skip generic terms like "Year Ended" if we have something more specific
specific_values = [v for v in column_values
if v.lower() not in ['year ended', 'years ended']]
if specific_values:
combined[col] = specific_values[0].replace('\n', ' ')
else:
combined[col] = column_values[0].replace('\n', ' ')
return combined
def _looks_like_date(self, text: str) -> bool:
"""Check if text looks like a date."""
import re
# Common date patterns in SEC filings
date_patterns = [
r'\b(January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},?\s*\d{4}',
r'\d{1,2}/\d{1,2}/\d{4}',
r'\d{4}-\d{2}-\d{2}',
r'^\d{4}$', # Just a year
]
text_clean = text.replace('\n', ' ').strip()
for pattern in date_patterns:
if re.search(pattern, text_clean, re.IGNORECASE):
return True
return False

View File

@@ -0,0 +1,51 @@
"""
Plain text renderer for parsed documents.
"""
from typing import Optional
from edgar.documents.document import Document
from edgar.documents.extractors.text_extractor import TextExtractor
class TextRenderer:
    """
    Plain-text renderer for parsed documents.

    A thin wrapper that delegates to a ``TextExtractor`` so that plain
    text output is available through the same ``render`` interface as
    the other renderers.
    """

    def __init__(self,
                 clean: bool = True,
                 include_tables: bool = True,
                 max_length: Optional[int] = None,
                 preserve_structure: bool = False):
        """
        Create the renderer and its underlying extractor.

        The extractor is always configured with ``include_metadata``
        and ``include_links`` set to False; the remaining options are
        forwarded unchanged.

        Args:
            clean: Clean and normalize text
            include_tables: Include table content
            max_length: Maximum text length
            preserve_structure: Preserve document structure
        """
        extractor_options = {
            # Caller-controlled options
            'clean': clean,
            'include_tables': include_tables,
            'max_length': max_length,
            'preserve_structure': preserve_structure,
            # Fixed for this renderer
            'include_metadata': False,
            'include_links': False,
        }
        self.extractor = TextExtractor(**extractor_options)

    def render(self, document: Document) -> str:
        """
        Render a document as plain text.

        Args:
            document: Document to render

        Returns:
            The result of the extractor's ``extract`` call for the
            document.
        """
        plain_text = self.extractor.extract(document)
        return plain_text