import re
from typing import Dict, List, Optional, Tuple

from edgar.files.html import BaseNode, Document
from edgar.files.tables import ProcessedTable, TableProcessor

__all__ = ['to_markdown', 'MarkdownRenderer']

# Three or more consecutive newlines (collapsed to one blank line).
_EXCESS_NEWLINES = re.compile(r'\n{3,}')

# A heading line: 1-6 '#' characters, a space, and at least one alphanumeric
# character. The '^' anchor (MULTILINE) restricts the match to the start of a
# line so that a mid-line "# " -- e.g. "Issue # 5" or a table cell such as
# "| # of shares |" -- is never torn out of its line and promoted to a heading.
_HEADING_LINE = re.compile(
    r'\n*^(#{1,6} [^\n]*[A-Za-z0-9][^\n]*)\n*', re.MULTILINE)

# Two or more newlines that are immediately followed by visible content.
_PARAGRAPH_GAP = re.compile(r'\n{2,}(?=\S)')


class MarkdownRenderer:
    """Render the nodes of a parsed ``Document`` as a Markdown string."""

    def __init__(self, document: Document, start_page_number: int = 0):
        """Store the document and the offset added to page-break numbers.

        Args:
            document: Parsed document whose ``nodes`` are rendered.
            start_page_number: Value added to each page-break node's
                ``page_number`` when the break marker is rendered.
        """
        self.document = document
        self.start_page_number = start_page_number
        self.toc_entries: List[Tuple[int, str, str]] = []  # level, text, anchor
        self.reference_links: Dict[str, str] = {}
        self.current_section = ""

    def render(self) -> str:
        """Render the complete document.

        Each node is dispatched on ``node.type``; node types other than
        ``text_block``, ``table``, ``heading`` and ``page_break`` are skipped.

        Returns:
            The rendered parts joined by blank lines and passed through
            ``_clean_spacing``.
        """
        rendered_parts = []
        for node in self.document.nodes:
            rendered = ""
            if node.type == 'text_block':
                rendered = self._render_text_block(node)
            elif node.type == 'table':
                processed_table = TableProcessor.process_table(node)
                rendered = self._render_table(processed_table) if processed_table else ""
            elif node.type == 'heading':
                rendered = self._render_heading(node)
            elif node.type == 'page_break':
                rendered = self._render_page_break(node)

            if rendered:
                # Trailing whitespace is dropped so the joiner alone controls
                # the gap between parts.
                rendered_parts.append(rendered.rstrip())

        # filter(None, ...) drops parts that became empty after rstrip().
        return self._clean_spacing('\n\n'.join(filter(None, rendered_parts)))

    def _clean_spacing(self, text: str) -> str:
        """Normalise blank lines and non-breaking spaces in rendered Markdown.

        Args:
            text: Markdown text to clean.

        Returns:
            The text with runs of blank lines collapsed, each line-start
            heading preceded by a blank line, non-breaking spaces replaced
            by ordinary spaces, and leading/trailing whitespace removed.
        """
        # Replace 3 or more newlines with 2 newlines.
        text = _EXCESS_NEWLINES.sub('\n\n', text)

        # Treat each heading line as a unit: blank line before, newline after.
        # Only headings that begin a line are touched (see _HEADING_LINE).
        text = _HEADING_LINE.sub(r'\n\n\1\n', text)

        # Re-collapse any gap the heading substitution widened.
        text = _PARAGRAPH_GAP.sub('\n\n', text)

        # Non-breaking spaces become plain spaces.
        text = text.replace("\xa0", " ")

        return text.strip()

    def _render_header(self) -> str:
        """Render SEC filing header metadata found in the first few nodes.

        Scans the first five nodes; for each ``text_block`` whose content
        mentions "registration no." or "filed pursuant to"
        (case-insensitive) a bold-labelled line is emitted.

        Returns:
            The collected lines joined by newlines, or ``""`` if none matched.
        """
        header_parts = []

        for node in self.document.nodes[:5]:  # Check first few nodes
            if node.type != 'text_block':
                continue
            text = node.content
            lowered = text.lower()
            if 'registration no.' in lowered:
                # Whatever follows the final '.' is reported as the number.
                header_parts.append(
                    f"**Registration No.:** {text.split('.')[-1].strip()}")
            if 'filed pursuant to' in lowered:
                header_parts.append(f"**Filing Type:** {text.strip()}")

        return "\n".join(header_parts) if header_parts else ""

    def _render_heading(self, node: BaseNode) -> str:
        """Render a heading node with metadata support.

        Args:
            node: Node whose ``type`` is ``'heading'``.

        Returns:
            ``node.level`` '#' characters followed by the heading text.

        Raises:
            ValueError: If ``node.type`` is not ``'heading'``.
        """
        if node.type != 'heading':
            raise ValueError(f"Expected heading node, got {node.type}")

        prefix = '#' * node.level
        text = node.content

        # Check metadata for any special rendering instructions.
        if node.get_metadata('render_style') == 'centered':
            # NOTE(review): as received, this literal was split across
            # physical lines (prefix, newline, text, newline), which is not
            # valid Python. The characters are preserved here using escapes;
            # centring markup may have been lost upstream -- confirm the
            # intended output.
            return f"{prefix}\n{text}\n"

        return f"{prefix} {text}"

    def _render_text_block(self, node: BaseNode) -> str:
        """Render a text block (formerly paragraph) with metadata support.

        Args:
            node: Node whose ``type`` is ``'text_block'``.

        Returns:
            The node content with bold/centre styling applied and, when the
            ``is_note`` or ``is_quote`` metadata flag is set, a blockquote
            prefix.

        Raises:
            ValueError: If ``node.type`` is not ``'text_block'``.
        """
        if node.type != 'text_block':
            raise ValueError(f"Expected text_block node, got {node.type}")

        text = node.content

        # Apply styling.
        if node.style:
            if node.style.font_weight == 'bold':
                text = f"**{text}**"
            if node.style.text_align == 'center':
                # NOTE(review): as received, this literal was split across
                # physical lines (newline, text, newline); preserved with
                # escapes. Centring markup may have been lost upstream --
                # confirm the intended output.
                text = f"\n{text}\n"

        # Check metadata for special handling; a note takes precedence.
        if node.get_metadata('is_note', False):
            text = f"> Note: {text}"
        elif node.get_metadata('is_quote', False):
            text = f"> {text}"

        return text

    def _render_table(self, processed: ProcessedTable) -> str:
        """Render a processed table as a Markdown pipe table.

        Args:
            processed: Table providing ``headers``, ``data_rows`` and
                ``column_alignments``.

        Returns:
            The table lines joined by newlines, or ``""`` when there are no
            data rows.
        """
        if not processed.data_rows:
            return ""

        # Column count is taken from the first data row.
        col_widths = []
        for col_idx in range(len(processed.data_rows[0])):
            # Consider headers in the width calculation.
            col_content = []
            if processed.headers:
                col_content.append(processed.headers[col_idx])
            col_content.extend(row[col_idx] for row in processed.data_rows)

            # Width is the longest single line among non-empty cells.
            widths = [
                len(line)
                for cell in col_content
                if cell.strip()
                for line in cell.split('\n')
            ]

            # Default to a minimum width of 3 if the column is empty.
            col_widths.append(max(widths) if widths else 3)

        lines = []

        # Add headers if present.
        if processed.headers:
            header_cells = [
                self._format_markdown_cell(
                    header, col_widths[col_idx],
                    processed.column_alignments[col_idx])
                for col_idx, header in enumerate(processed.headers)
            ]
            lines.append('|' + '|'.join(header_cells) + '|')

        # Add the separator line; the colon marks the alignment side and any
        # alignment other than "left" is treated as right.
        separators = []
        for idx, width in enumerate(col_widths):
            if processed.column_alignments[idx] == "left":
                separators.append(':' + '-' * (width + 1))
            else:
                separators.append('-' * (width + 1) + ':')
        lines.append('|' + '|'.join(separators) + '|')

        # Add data rows.
        for row in processed.data_rows:
            row_cells = [
                self._format_markdown_cell(
                    cell, col_widths[col_idx],
                    processed.column_alignments[col_idx])
                for col_idx, cell in enumerate(row)
            ]
            lines.append('|' + '|'.join(row_cells) + '|')

        return '\n'.join(lines)

    def _format_markdown_cell(self, content: str, width: int, alignment: str) -> str:
        """Pad cell content to ``width`` with one space on each side.

        Args:
            content: Cell text; may contain newlines, in which case each
                line is padded separately and the lines are re-joined.
            width: Target width of the text, excluding the side padding.
            alignment: ``"left"`` left-justifies; any other value
                right-justifies.

        Returns:
            The padded cell text, or ``width + 2`` spaces for a blank cell.
        """
        if not content.strip():
            return ' ' * (width + 2)  # Blank cell: width plus side padding

        if alignment == "left":
            formatted_lines = [f" {line:<{width}} " for line in content.split('\n')]
        else:  # right
            formatted_lines = [f" {line:>{width}} " for line in content.split('\n')]

        return '\n'.join(formatted_lines)

    def _render_page_break(self, node: BaseNode) -> str:
        """Render a page break as a ``{N}`` marker followed by a dashed rule.

        Args:
            node: Page-break node providing ``page_number``.

        Returns:
            The marker line, where N is ``node.page_number`` plus
            ``self.start_page_number``.
        """
        adjusted_page_number = node.page_number + self.start_page_number
        return f"{{{adjusted_page_number}}}------------------------------------------------"


def to_markdown(html_content: str, include_page_breaks: bool = False,
                start_page_number: int = 0) -> Optional[str]:
    """Convert HTML content to markdown with optional page breaks

    Args:
        html_content: HTML string to convert
        include_page_breaks: Whether to include page break markers
        start_page_number: Starting page number for page break markers (default: 0)

    Returns:
        Markdown string or None if parsing failed
    """
    document = Document.parse(html_content, include_page_breaks=include_page_breaks)
    if document:
        return document.to_markdown(start_page_number=start_page_number)
    return None