Files
2025-12-09 12:13:01 +01:00

613 lines
21 KiB
Python

"""
Markdown renderer for parsed documents.
"""
from typing import List, Optional, Dict, Set
from edgar.documents.document import Document
from edgar.documents.nodes import Node, TextNode, HeadingNode, ParagraphNode, ListNode, ListItemNode
from edgar.documents.table_nodes import TableNode
class MarkdownRenderer:
"""
Renders parsed documents to Markdown format.
Features:
- Preserves document structure
- Handles tables with proper formatting
- Supports nested lists
- Includes metadata annotations
- Configurable output options
"""
def __init__(self,
include_metadata: bool = False,
include_toc: bool = False,
max_heading_level: int = 6,
table_format: str = 'pipe',
wrap_width: Optional[int] = None):
"""
Initialize markdown renderer.
Args:
include_metadata: Include metadata annotations
include_toc: Generate table of contents
max_heading_level: Maximum heading level to render
table_format: Table format ('pipe', 'grid', 'simple')
wrap_width: Wrap text at specified width
"""
self.include_metadata = include_metadata
self.include_toc = include_toc
self.max_heading_level = max_heading_level
self.table_format = table_format
self.wrap_width = wrap_width
# Track state during rendering
self._toc_entries: List[tuple] = []
self._rendered_ids: Set[str] = set()
self._list_depth = 0
self._in_table = False
def render(self, document: Document) -> str:
    """
    Render document to Markdown.

    The body is rendered first so that heading rendering can collect the
    table-of-contents entries; the optional metadata header and table of
    contents are then placed in front of the body.

    Args:
        document: Document to render

    Returns:
        Markdown formatted text with leading/trailing whitespace stripped
    """
    self._reset_state()

    # Render the body up front: _render_heading fills self._toc_entries,
    # which decides whether a table of contents is emitted at all.
    content = self._render_node(document.root)

    parts = []

    # Add metadata header if requested
    if self.include_metadata:
        parts.append(self._render_metadata(document))
        parts.append("")

    # Insert the TOC directly rather than substituting a placeholder string.
    # A placeholder would be left in the output when no heading was found,
    # and a text substitution could also hit the same marker inside content.
    if self.include_toc and self._toc_entries:
        parts.append(self._generate_toc())
        parts.append("")

    parts.append(content)
    return "\n".join(parts).strip()
def render_node(self, node: Node) -> str:
    """
    Render a single node (and whatever it contains) to Markdown.

    Args:
        node: Node to render

    Returns:
        Markdown formatted text
    """
    # Start from clean bookkeeping so earlier renders cannot suppress this node
    self._reset_state()
    rendered = self._render_node(node)
    return rendered
def _reset_state(self):
"""Reset renderer state."""
self._toc_entries = []
self._rendered_ids = set()
self._list_depth = 0
self._in_table = False
def _render_node(self, node: Node) -> str:
    """
    Render one node by dispatching to the handler for its type.

    A node whose id was already seen during this render yields an empty
    string, so a node reachable through several paths is emitted once.
    """
    if node.id in self._rendered_ids:
        return ""
    self._rendered_ids.add(node.id)

    # Checked in order; the first matching type wins
    handlers = (
        (HeadingNode, self._render_heading),
        (ParagraphNode, self._render_paragraph),
        (TextNode, self._render_text),
        (TableNode, self._render_table),
        (ListNode, self._render_list),
        (ListItemNode, self._render_list_item),
    )
    for node_type, handler in handlers:
        if isinstance(node, node_type):
            return handler(node)

    # Any other node type contributes only what its children render
    return self._render_children(node)
def _render_heading(self, node: HeadingNode) -> str:
"""Render heading node."""
# Limit heading level
level = min(node.level, self.max_heading_level)
# Get heading text
text = node.text().strip()
if not text:
return ""
# Add to TOC
if self.include_toc:
self._toc_entries.append((level, text, node.id))
# Create markdown heading
markdown = "#" * level + " " + text
# Add metadata if requested
if self.include_metadata and node.metadata:
metadata = self._format_metadata(node.metadata)
if metadata:
markdown += f" <!-- {metadata} -->"
# Add children content
children_content = self._render_children(node)
if children_content:
markdown += "\n\n" + children_content
return markdown
def _render_paragraph(self, node: ParagraphNode) -> str:
"""Render paragraph node."""
# Get paragraph content
content = self._render_children(node).strip()
if not content:
return ""
# Wrap if requested
if self.wrap_width:
content = self._wrap_text(content, self.wrap_width)
# Add metadata if requested
if self.include_metadata and node.metadata:
metadata = self._format_metadata(node.metadata)
if metadata:
content = f"<!-- {metadata} -->\n{content}"
return content
def _render_text(self, node: TextNode) -> str:
"""Render text node."""
text = node.text()
# Escape markdown special characters
text = self._escape_markdown(text)
# Apply text formatting based on style
if node.style:
if node.style.font_weight in ['bold', '700', '800', '900']:
text = f"**{text}**"
elif node.style.font_style == 'italic':
text = f"*{text}*"
elif node.style.text_decoration == 'underline':
text = f"<u>{text}</u>"
return text
def _render_table(self, node: TableNode) -> str:
"""Render table node."""
self._in_table = True
parts = []
# Add caption if present
if node.caption:
parts.append(f"**Table: {node.caption}**")
parts.append("")
# Render based on format
if self.table_format == 'pipe':
table_md = self._render_table_pipe(node)
elif self.table_format == 'grid':
table_md = self._render_table_grid(node)
else: # simple
table_md = self._render_table_simple(node)
parts.append(table_md)
# Add metadata if requested
if self.include_metadata and node.metadata:
metadata = self._format_metadata(node.metadata)
if metadata:
parts.append(f"<!-- Table metadata: {metadata} -->")
self._in_table = False
return "\n".join(parts)
def _render_table_pipe(self, node: TableNode) -> str:
"""Render table in pipe format with proper column spanning support."""
# Handle complex SEC filing tables with column spanning
expanded_headers, expanded_data_rows = self._expand_table_structure(node)
# Identify and filter to meaningful columns
content_columns = self._identify_content_columns(expanded_headers, expanded_data_rows)
if not content_columns:
return ""
rows = []
# Render headers with intelligent multi-row combination
if expanded_headers:
combined_headers = self._combine_multi_row_headers(expanded_headers)
filtered_headers = [combined_headers[i] if i < len(combined_headers) else "" for i in content_columns]
row_md = "| " + " | ".join(filtered_headers) + " |"
rows.append(row_md)
# Add separator
separator = "| " + " | ".join(["---"] * len(filtered_headers)) + " |"
rows.append(separator)
# Render data rows
for expanded_row in expanded_data_rows:
filtered_row = [expanded_row[i] if i < len(expanded_row) else "" for i in content_columns]
# Only add rows with meaningful content
if any(cell.strip() for cell in filtered_row):
row_md = "| " + " | ".join(filtered_row) + " |"
rows.append(row_md)
return "\n".join(rows)
def _render_table_grid(self, node: TableNode) -> str:
"""Render table in grid format."""
# Simplified grid format
all_rows = []
# Add headers
if node.headers:
for header_row in node.headers:
cells = [cell.text() for cell in header_row]
all_rows.append(" | ".join(cells))
# Add data rows
for row in node.rows:
cells = [cell.text() for cell in row.cells]
all_rows.append(" | ".join(cells))
if all_rows:
# Add borders
max_width = max(len(row) for row in all_rows)
border = "+" + "-" * (max_width + 2) + "+"
result = [border]
for row in all_rows:
result.append(f"| {row:<{max_width}} |")
result.append(border)
return "\n".join(result)
return ""
def _render_table_simple(self, node: TableNode) -> str:
"""Render table in simple format."""
rows = []
# Add headers
if node.headers:
for header_row in node.headers:
cells = [cell.text() for cell in header_row]
rows.append(" ".join(cells))
# Add separator if we have headers
if node.headers and node.rows:
rows.append("")
# Add data rows
for row in node.rows:
cells = [cell.text() for cell in row.cells]
rows.append(" ".join(cells))
return "\n".join(rows)
def _render_list(self, node: ListNode) -> str:
"""Render list node."""
self._list_depth += 1
items = []
for child in node.children:
if isinstance(child, ListItemNode):
item_md = self._render_list_item(child)
if item_md:
items.append(item_md)
self._list_depth -= 1
return "\n".join(items)
def _render_list_item(self, node: ListItemNode) -> str:
"""Render list item node."""
# Determine bullet/number
if node.parent and hasattr(node.parent, 'ordered') and node.parent.ordered:
# Ordered list
index = node.parent.children.index(node) + 1
marker = f"{index}."
else:
# Unordered list
markers = ['*', '-', '+']
marker = markers[(self._list_depth - 1) % len(markers)]
# Indentation
indent = " " * (self._list_depth - 1)
# Get content
content = self._render_children(node).strip()
# Format item
if '\n' in content:
# Multi-line content
lines = content.split('\n')
result = indent + marker + " " + lines[0]
for line in lines[1:]:
result += "\n" + indent + " " + line
return result
else:
# Single line
return indent + marker + " " + content
def _render_children(self, node: Node) -> str:
"""Render all children of a node."""
parts = []
for child in node.children:
child_md = self._render_node(child)
if child_md:
parts.append(child_md)
# Join with appropriate separator
if self._in_table:
return " ".join(parts)
elif any(isinstance(child, (HeadingNode, ParagraphNode, TableNode, ListNode))
for child in node.children):
return "\n\n".join(parts)
else:
return " ".join(parts)
def _render_metadata(self, document: Document) -> str:
"""Render document metadata."""
lines = ["---"]
if document.metadata.company:
lines.append(f"company: {document.metadata.company}")
if document.metadata.form:
lines.append(f"form: {document.metadata.form}")
if document.metadata.filing_date:
lines.append(f"filing_date: {document.metadata.filing_date}")
if document.metadata.cik:
lines.append(f"cik: {document.metadata.cik}")
if document.metadata.accession_number:
lines.append(f"accession_number: {document.metadata.accession_number}")
lines.append("---")
return "\n".join(lines)
def _generate_toc(self) -> str:
"""Generate table of contents."""
lines = ["## Table of Contents", ""]
for level, text, node_id in self._toc_entries:
# Create anchor link
anchor = self._create_anchor(text)
# Indentation based on level
indent = " " * (level - 1)
# Add TOC entry
lines.append(f"{indent}- [{text}](#{anchor})")
return "\n".join(lines)
def _create_anchor(self, text: str) -> str:
"""Create anchor from heading text."""
# Convert to lowercase and replace spaces with hyphens
anchor = text.lower()
anchor = anchor.replace(' ', '-')
# Remove special characters
import re
anchor = re.sub(r'[^a-z0-9\-]', '', anchor)
# Remove multiple hyphens
anchor = re.sub(r'-+', '-', anchor)
return anchor.strip('-')
def _format_metadata(self, metadata: Dict) -> str:
"""Format metadata for display."""
parts = []
for key, value in metadata.items():
if key == 'semantic_type':
parts.append(f"type:{value}")
elif key == 'section':
parts.append(f"section:{value}")
elif key == 'ix_tag':
parts.append(f"xbrl:{value}")
else:
parts.append(f"{key}:{value}")
return " ".join(parts)
def _escape_markdown(self, text: str) -> str:
"""Escape markdown special characters."""
# Don't escape in tables
if self._in_table:
return text
# Escape special characters
for char in ['\\', '`', '*', '_', '{', '}', '[', ']', '(', ')', '#', '+', '-', '.', '!']:
text = text.replace(char, '\\' + char)
return text
def _wrap_text(self, text: str, width: int) -> str:
"""Wrap text at specified width."""
import textwrap
return textwrap.fill(text, width=width, break_long_words=False)
def _expand_table_structure(self, node: TableNode) -> tuple:
"""
Expand table structure to handle column spanning properly.
Returns (expanded_headers, expanded_data_rows).
"""
# Calculate the logical column count from colspan
max_columns = 0
# Check all rows for maximum column span
all_rows = []
if node.headers:
for header_row in node.headers:
all_rows.append(header_row)
for row in node.rows:
all_rows.append(row.cells)
for row in all_rows:
column_count = sum(cell.colspan for cell in row)
max_columns = max(max_columns, column_count)
# Expand headers
expanded_headers = []
if node.headers:
for header_row in node.headers:
expanded = self._expand_row_to_columns(header_row, max_columns)
expanded_headers.append(expanded)
# Expand data rows
expanded_data_rows = []
for row in node.rows:
expanded = self._expand_row_to_columns(row.cells, max_columns)
expanded_data_rows.append(expanded)
return expanded_headers, expanded_data_rows
def _expand_row_to_columns(self, cells: List, target_columns: int) -> List[str]:
"""Expand a row with colspan cells to match the target column count."""
expanded = []
current_column = 0
for cell in cells:
cell_text = cell.text().strip()
# Add the cell content
expanded.append(cell_text)
current_column += 1
# Add empty cells for remaining colspan
for _ in range(cell.colspan - 1):
if current_column < target_columns:
expanded.append("")
current_column += 1
# Pad to target column count if needed
while len(expanded) < target_columns:
expanded.append("")
return expanded[:target_columns]
def _identify_content_columns(self, expanded_headers: List[List[str]],
expanded_data_rows: List[List[str]]) -> List[int]:
"""Identify which columns actually contain meaningful content."""
if not expanded_headers and not expanded_data_rows:
return []
# Get the column count
max_cols = 0
if expanded_headers:
max_cols = max(max_cols, max(len(row) for row in expanded_headers))
if expanded_data_rows:
max_cols = max(max_cols, max(len(row) for row in expanded_data_rows))
content_columns = []
for col in range(max_cols):
has_content = False
# Check headers
for header_row in expanded_headers:
if col < len(header_row) and header_row[col].strip():
has_content = True
break
# Check data rows
if not has_content:
for data_row in expanded_data_rows:
if col < len(data_row) and data_row[col].strip():
has_content = True
break
if has_content:
content_columns.append(col)
return content_columns
def _combine_multi_row_headers(self, header_rows: List[List[str]]) -> List[str]:
"""
Combine multi-row headers intelligently for SEC filing tables.
Prioritizes specific dates/periods over generic labels.
"""
if not header_rows:
return []
num_columns = len(header_rows[0])
combined = [""] * num_columns
for col in range(num_columns):
# Collect all values for this column across header rows
column_values = []
for row in header_rows:
if col < len(row) and row[col].strip():
column_values.append(row[col].strip())
if column_values:
# Prioritize date-like values over generic labels
date_values = [v for v in column_values if self._looks_like_date(v)]
if date_values:
# Clean up line breaks in dates
combined[col] = date_values[0].replace('\n', ' ')
elif len(column_values) == 1:
combined[col] = column_values[0].replace('\n', ' ')
else:
# Skip generic terms like "Year Ended" if we have something more specific
specific_values = [v for v in column_values
if v.lower() not in ['year ended', 'years ended']]
if specific_values:
combined[col] = specific_values[0].replace('\n', ' ')
else:
combined[col] = column_values[0].replace('\n', ' ')
return combined
def _looks_like_date(self, text: str) -> bool:
"""Check if text looks like a date."""
import re
# Common date patterns in SEC filings
date_patterns = [
r'\b(January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},?\s*\d{4}',
r'\d{1,2}/\d{1,2}/\d{4}',
r'\d{4}-\d{2}-\d{2}',
r'^\d{4}$', # Just a year
]
text_clean = text.replace('\n', ' ').strip()
for pattern in date_patterns:
if re.search(pattern, text_clean, re.IGNORECASE):
return True
return False