import re import textwrap from abc import ABC, abstractmethod from dataclasses import dataclass from functools import cached_property from typing import Any, Dict, List, Literal, Optional, Union from bs4 import NavigableString, Tag from rich import box from rich.align import Align from rich.console import Console, Group, RenderResult from rich.panel import Panel from rich.table import Table from rich.text import Text from edgar.core import log from edgar.files.html_documents import DocumentData, HtmlDocument from edgar.files.styles import StyleInfo, Width, get_heading_level, parse_style from edgar.files.tables import ColumnOptimizer, ProcessedTable, TableProcessor from edgar.richtools import repr_rich __all__ = ['SECHTMLParser', 'Document', 'DocumentNode'] class BaseNode(ABC): """Abstract base class for all document nodes with metadata support""" def __init__(self): self.metadata: Dict[str, Any] = {} @abstractmethod def render(self, console_width: int) -> RenderResult: """Render the node for display""" pass @property @abstractmethod def type(self) -> str: """Return the type of the node""" pass def add_metadata(self, key: str, value: Any) -> None: """Add or update metadata""" self.metadata[key] = value def get_metadata(self, key: str, default: Any = None) -> Any: """Get metadata value with optional default""" return self.metadata.get(key, default) def remove_metadata(self, key: str) -> None: """Remove metadata if it exists""" self.metadata.pop(key, None) @dataclass class HeadingNode(BaseNode): content: str style: StyleInfo level: int = 1 def __post_init__(self): super().__init__() @property def type(self) -> str: return 'heading' def render(self, console_width: int) -> RenderResult: """Render heading with enhanced styling based on level""" # Enhanced style configurations based on heading level styles = { 1: { "text_style": "bold cyan", "box": box.DOUBLE, "border_style": "cyan", "padding": (1, 2), "title": "§" if self.content else None # Section symbol for level 1 }, 2: { "text_style": "bold blue", "box": box.ROUNDED, "border_style": "blue", "padding": (1, 2), "title": "•" if self.content else None # Bullet for level 2 }, 3: { "text_style": "bold blue", "box": box.SIMPLE_HEAVY, "border_style": "white", "padding": (0, 2), "title": "" if self.content else None # Arrow for level 3 }, 4: { "text_style": "bold underline", "box": box.MINIMAL, "border_style": "grey62", "padding": (0, 1), "title": "" if self.content else None # Dash for level 4 } } # Get style configuration for current heading level, defaulting to level 4 style_config = styles.get(self.level, styles[4]) # Create base text with style text = Text(self.content.strip(), style=style_config["text_style"]) # Apply text alignment based on style if self.style and self.style.text_align == 'center': text = Align.center(text) # Create panel with enhanced styling return Panel( text, box=style_config["box"], border_style=style_config["border_style"], padding=style_config["padding"], expand=True, title=style_config["title"], title_align="left" ) @dataclass class TextBlockNode(BaseNode): content: str style: StyleInfo def __post_init__(self): super().__init__() @property def type(self) -> str: return 'text_block' def render(self, console_width: int) -> RenderResult: if not self.content: return Text("") width = console_width if self.style and self.style.width: width = min(self.style.width.to_chars(console_width), console_width) # Wrap text with improved handling def wrap_line(line: str) -> List[str]: if not line.strip(): return [''] if len(line) <= width: return [line] wrapped = textwrap.wrap( line, width=width, break_long_words=True, break_on_hyphens=True, expand_tabs=True ) # Handle orphaned words processed = [] i = 0 while i < len(wrapped): current_line = wrapped[i] if i < len(wrapped) - 1: next_line = wrapped[i + 1] if len(next_line) < width * 0.2 or ' ' not in next_line.strip(): combined = current_line + ' ' + next_line if len(combined) <= width: processed.append(combined) i += 2 continue processed.append(current_line) i += 1 return processed lines = self.content.splitlines(keepends=False) rendered_lines = [] for line in lines: wrapped_lines = wrap_line(line.rstrip('\n')) rendered_lines.extend(wrapped_lines) if line.endswith('\n'): rendered_lines.append('') final_text = '\n'.join(rendered_lines) result = Text(final_text) if self.style: if self.style.text_align: align_map = { 'center': 'center', 'right': 'right', 'justify': 'full', 'left': 'left' } result.justify = align_map.get(self.style.text_align, 'left') if self.style.font_weight in ('bold', '700', '800', '900'): result.stylize("bold") return result @dataclass class TableCell: content: Union[str, BaseNode] colspan: int = 1 rowspan: int = 1 align: str = 'left' is_currency: bool = False @dataclass class TableRow: cells: List[TableCell] is_header: bool = False @property def virtual_columns(self): return sum(cell.colspan for cell in self.cells) @dataclass class TableNode(BaseNode): content: List[TableRow] style: StyleInfo _processed_table: Optional[ProcessedTable] = None def __post_init__(self): super().__init__() @property def type(self) -> str: return 'table' @property def row_count(self) -> int: """Quick count of rows without processing""" return len(self.content) @property def approximate_column_count(self) -> int: """Quick approximate of columns using max cells in any row""" if not self.content: return 0 return max(row.virtual_columns for row in self.content) @cached_property def _processed(self) -> Optional[ProcessedTable]: """Cached access to processed table""" if self._processed_table is None: self._processed_table = TableProcessor.process_table(self) return self._processed_table @property def processed_row_count(self) -> int: """Accurate row count after processing""" if not self._processed: return self.row_count return len(self._processed.data_rows) + (len(self._processed.headers or []) > 0) @property def processed_column_count(self) -> int: """Accurate column count after processing""" if not self._processed: return self.approximate_column_count if self._processed.headers: return len(self._processed.headers) elif self._processed.data_rows: return len(self._processed.data_rows[0]) return 0 def reset_processing(self) -> None: """Clear cached processed table""" self._processed_table = None # Clear cached properties try: del self._processed except AttributeError: pass def render(self, console_width: int) -> RenderResult: from edgar.files.tables import TableProcessor processed_table = TableProcessor.process_table(self) if not processed_table: return None # Optimize the table column_optimizer:ColumnOptimizer = ColumnOptimizer() widths, processed_table = column_optimizer.optimize_columns(processed_table) table = Table( box=box.SIMPLE, border_style="blue", padding=(0, 1), show_header=bool(processed_table.headers), row_styles=["", "gray54"], collapse_padding=True, width=None ) # Add columns for col_idx, alignment in enumerate(processed_table.column_alignments): table.add_column( header=processed_table.headers[col_idx] if processed_table.headers else None, justify=alignment, vertical="middle" ) # Add data rows for row in processed_table.data_rows: table.add_row(*row) return table @dataclass class PageBreakNode(BaseNode): """Represents a page break in the document""" page_number: int def __post_init__(self): super().__init__() @property def type(self) -> str: return 'page_break' def render(self, console_width: int) -> RenderResult: """Render page break with page number""" return Text(f"--- Page {self.page_number} ---", style="dim") def create_node( type_: str, content: Union[str, List[TableRow]], style: StyleInfo, level: int = 1, metadata: Optional[Dict[str, Any]] = None, page_number: Optional[int] = None ) -> BaseNode: """Create a node with optional metadata""" metadata = metadata or {} if type_ == 'heading': node = HeadingNode(content=content, style=style, level=level) elif type_ == 'text_block': node = TextBlockNode(content=content, style=style) elif type_ == 'table': node = TableNode(content=content, style=style) elif type_ == 'page_break': node = PageBreakNode(page_number=page_number) else: raise ValueError(f"Unknown node type: {type_}") # Apply metadata after creation if metadata: node.metadata.update(metadata) return node # 1. Add type literals and type guards NodeType = Literal['heading', 'text_block', 'table', 'page_break'] ContentType = Union[str, Dict[str, Any], List[TableRow]] def is_table_content(content: ContentType) -> bool: return isinstance(content, list) and all(isinstance(x, TableRow) for x in content) def is_text_content(content: ContentType) -> bool: return isinstance(content, str) def is_dict_content(content: ContentType) -> bool: return isinstance(content, dict) class IXTagTracker: """Tracks IX tag context throughout HTML parsing""" def __init__(self): # Maps continuation IDs to their original ix tag info self.continuation_map: Dict[str, Dict[str, str]] = {} # Current stack of ix tags self.tag_stack: List[Dict[str, str]] = [] def enter_tag(self, element: Tag) -> None: """Process entering an ix: tag, handling both regular tags and continuations""" if not element.name.startswith('ix:'): return if element.name == 'ix:continuation': # For continuation tags, look up the original tag's metadata continued_at = element.get('continuedAt') tag_id = element.get('id') if continued_at and tag_id: self.continuation_map[tag_id] = self.continuation_map.get(continued_at, {}) else: # For regular ix tags, store their metadata tag_info = { 'name': element.get('name', ''), 'contextRef': element.get('contextRef', ''), 'id': element.get('id', '') } # Store any additional attributes for key, value in element.attrs.items(): if key not in {'name', 'contextRef', 'id'}: tag_info[key] = value # Add to continuation map if this tag has an ID if tag_info['id']: self.continuation_map[tag_info['id']] = tag_info self.tag_stack.append(tag_info) def exit_tag(self, element: Tag) -> None: """Record exiting an ix: tag""" if element.name.startswith('ix:') and element.name != 'ix:continuation': if self.tag_stack: self.tag_stack.pop() def get_current_context(self, element: Tag) -> Dict[str, Any]: """Get the current ix tag context, handling both regular tags and continuations""" # First check if we're in a continuation if element.name == 'ix:continuation': tag_id = element.get('id') if tag_id in self.continuation_map: original_tag = self.continuation_map[tag_id] return { 'ix_tag': original_tag.get('name'), 'ix_context': original_tag.get('contextRef'), 'ix_original_id': original_tag.get('id'), 'ix_continuation_id': tag_id, **{f'ix_{k}': v for k, v in original_tag.items() if k not in {'name', 'contextRef', 'id'}} } return {} # Otherwise use current tag stack if not self.tag_stack: return {} current = self.tag_stack[-1] metadata = { 'ix_tag': current.get('name'), 'ix_context': current.get('contextRef'), 'ix_id': current.get('id') } # Add any additional attributes for key, value in current.items(): if key not in {'name', 'contextRef', 'id'}: metadata[f'ix_{key}'] = value return metadata @dataclass class DocumentNode: type: Literal['heading', 'text_block', 'table'] # Changed from 'paragraph' to 'text_block' content: Union[str, Dict[str, Any], List[TableRow]] style: StyleInfo level: int = 0 def _validate_content(self) -> None: """Validate content matches the node type""" if self.type == 'table' and not is_table_content(self.content): raise ValueError(f"Table node must have List[TableRow] content, got {type(self.content)}") elif self.type in ('heading', 'text_block') and not is_text_content(self.content): raise ValueError(f"{self.type} node must have string content, got {type(self.content)}") @property def text(self) -> str: """Helper method for accessing text content""" if not is_text_content(self.content): raise ValueError(f"Cannot get text from {self.type} node") return self.content @property def rows(self) -> List[TableRow]: """Helper method for accessing table rows""" if not is_table_content(self.content): raise ValueError(f"Cannot get rows from {self.type} node") return self.content @dataclass class Document: """Document class that works with the new node hierarchy""" nodes: List[BaseNode] def __len__(self): return len(self.nodes) def __getitem__(self, index): return self.nodes[index] def empty(self) -> bool: return len(self.nodes) == 0 @staticmethod def _get_width() -> int: """Get the width of the console that this document is being rendered into""" return Console().width @property def tables(self) -> List[BaseNode]: """Get all table nodes in the document""" return [node for node in self.nodes if node.type == 'table'] @property def headings(self) -> List[BaseNode]: """Get all heading nodes in the document""" return [node for node in self.nodes if node.type == 'heading'] @classmethod def parse(cls, html: str, include_page_breaks: bool = False) -> Optional['Document']: root = HtmlDocument.get_root(html) if root: parser = SECHTMLParser(root, include_page_breaks=include_page_breaks) return parser.parse() def to_markdown(self, start_page_number: int = 0) -> str: from edgar.files.markdown import MarkdownRenderer return MarkdownRenderer(self, start_page_number=start_page_number).render() def __rich__(self) -> RenderResult: """Rich console protocol for rendering document""" console = Console() console_width = console.width renderable_elements = [] for node in self.nodes: element = node.render(console_width) if element: renderable_elements.append(element) return Group(*renderable_elements) def __repr__(self): return repr_rich(self) @dataclass class StyledText: """Represents a piece of text with its associated style""" content: str style: StyleInfo is_paragraph: bool = False # Track if this came from a

tag class SECHTMLParser: def __init__(self, root: Tag, extract_data: bool = True, include_page_breaks: bool = False): self.data:DocumentData = HtmlDocument.extract_data(root) if extract_data else None self.root:Tag = root self.base_font_size = 10.0 # Default base font size in pt self.style_stack: List[StyleInfo] = [] self.ix_tracker = IXTagTracker() # Add IX tag tracker self.include_page_breaks = include_page_breaks self.current_page = -1 # Start at -1 so first page div becomes page 0 def parse(self) -> Optional[Document]: body = self.root.find('body') if not body: log.warning("No body tag found in HTML") return None # If page breaks are enabled, detect them first if self.include_page_breaks: self._mark_page_breaks(body) nodes = self._parse_element(body) # If page breaks are enabled, ensure proper page numbering if self.include_page_breaks and nodes: # Find the first page break node first_page_break_idx = None for i, node in enumerate(nodes): if node.type == 'page_break': first_page_break_idx = i break if first_page_break_idx is None: # No page breaks found, this shouldn't happen if include_page_breaks is True # but add a document start page break just in case initial_page_break = create_node( type_='page_break', content=None, style=StyleInfo(), page_number=0, metadata={'source_element': 'document_start'} ) nodes.insert(0, initial_page_break) elif first_page_break_idx > 0: # There's content before the first page break, add document start page break initial_page_break = create_node( type_='page_break', content=None, style=StyleInfo(), page_number=0, metadata={'source_element': 'document_start'} ) nodes.insert(0, initial_page_break) # Re-number subsequent page breaks for i in range(1, len(nodes)): if nodes[i].type == 'page_break': nodes[i].page_number = i // 2 # Rough estimate, will be fixed in next loop # Final pass: renumber all page breaks sequentially page_counter = 0 for node in nodes: if node.type == 'page_break': node.page_number = page_counter page_counter += 1 return Document(nodes=nodes) def _mark_page_breaks(self, element: Tag) -> None: """Mark page break elements for detection during parsing""" from .page_breaks import PageBreakDetector PageBreakDetector.mark_page_breaks(element) def _mark_page_divs(self, element: Tag) -> None: """Mark div elements with page-like dimensions as page breaks""" # This is now handled by PageBreakDetector.mark_page_breaks() # Keeping this method for backward compatibility pass def _is_page_like_div(self, style: str) -> bool: """Check if a div has page-like dimensions based on its style""" from .page_breaks import PageBreakDetector return PageBreakDetector._is_page_like_div(style) def _process_div_content(self, element: Tag) -> Optional[Union[BaseNode, List[BaseNode]]]: """Process the content inside a page div without treating the div itself as a page break""" # Parse current element's style current_style = parse_style(element.get('style', '')) # Merge with parent style if there is one if self.style_stack: current_style = current_style.merge(self.style_stack[-1]) # Track entering ix tags and get metadata self.ix_tracker.enter_tag(element) ix_metadata = self.ix_tracker.get_current_context(element) try: # Push current style to stack before processing children self.style_stack.append(current_style) try: # Check if this div contains tables if element.get('has_table'): # Structure-preserving mode for divs with tables result = self._process_structured_content(element, current_style) else: # Content-combining mode for divs without tables result = self._process_inline_content(element, current_style) # Apply ix metadata if available if result and ix_metadata: if isinstance(result, list): for node in result: if hasattr(node, 'metadata'): node.metadata.update(ix_metadata) else: if hasattr(result, 'metadata'): result.metadata.update(ix_metadata) return result finally: # Always pop the style from stack when done self.style_stack.pop() finally: # Track exiting ix tags self.ix_tracker.exit_tag(element) def _parse_element(self, element: Tag) -> List[BaseNode]: nodes = [] for child in element.children: if not isinstance(child, Tag): continue node = self._process_element(child) if node: nodes.extend(node if isinstance(node, list) else [node]) return self._merge_adjacent_nodes(nodes) def _parse_width(self, value: str) -> Optional[Width]: """Parse CSS width value into Width object""" if not value: return None # Handle percentage values if value.endswith('%'): try: return Width(float(value[:-1]), '%') except ValueError: return None # Extract number and unit match = re.match(r'(-?\d*\.?\d+)([a-z]*)', value) if not match: return None number, unit = match.groups() try: number = float(number) except ValueError: return None # Map CSS units to our unit types unit_map = { 'in': 'in', 'pt': 'pt', 'px': 'px', 'cm': 'cm', 'mm': 'mm', '': 'px' # default to pixels if no unit specified } unit = unit_map.get(unit) if not unit: return None return Width(number, unit) def _parse_unit(self, value: str) -> Optional[float]: """Parse CSS unit values into integer character width""" if not value: return None # Handle percentage values if value.endswith('%'): try: return float(value[:-1]) / 100.0 except ValueError: return None # Extract number and unit match = re.match(r'(-?\d*\.?\d+)([a-z]*)', value) if not match: return None number, unit = match.groups() try: number = float(number) except ValueError: return None # Convert different units to characters # Assuming typical terminal character widths: # - 80 chars ≈ 6.5 inches # - 1 inch ≈ 12.3 chars chars_per_unit = { 'in': 12.3, # 1 inch ≈ 12.3 chars 'pt': 12.3/72, # 1 pt = 1/72 inch 'px': 12.3/96, # 1 px = 1/96 inch 'cm': 4.84, # 1 cm ≈ 4.84 chars 'mm': 0.484, # 1 mm ≈ 0.484 chars 'em': 1.6, # 1 em ≈ 1.6 chars (assuming typical font) 'rem': 1.6, # Same as em } multiplier = chars_per_unit.get(unit, 1.0) return int(number * multiplier) def _clean_text(self, text: str) -> str: """Clean and normalize text content while preserving meaningful whitespace""" # Replace HTML entities entities = { ' ': ' ', '&': '&', '<': '<', '>': '>', '"': '"', ''': "'", ' ': ' ', # hair space '​': '', # zero-width space '\xa0': ' ', # non-breaking space '\u200b': '', # zero-width space '\u200c': '', # zero-width non-joiner '\u200d': '', # zero-width joiner '\u2028': ' ', # line separator '\u2029': ' ', # paragraph separator '\ufeff': '' # byte order mark } for entity, replacement in entities.items(): text = text.replace(entity, replacement) # Replace multiple consecutive spaces with a single space text = ' '.join(text.split()) # Normalize whitespace while preserving single newlines lines = text.splitlines() lines = [' '.join(line.split()) for line in lines] text = '\n'.join(lines) # Clean up any remaining multiple spaces around newlines text = re.sub(r'\s*\n\s*', '\n', text) # Remove any remaining consecutive spaces text = re.sub(r' +', ' ', text) return text.strip() def _handle_page_break_element(self, element: Tag) -> Optional[Union[BaseNode, List[BaseNode]]]: """Handle elements marked as page breaks""" # For the first page break, don't increment if we're at -1 (start) if self.current_page == -1: self.current_page = 0 else: self.current_page += 1 page_break_node = create_node( type_='page_break', content=None, style=StyleInfo(), page_number=self.current_page, metadata={'source_element': element.name} ) # Check if this is a container page break or content-bearing page break if element.name == 'div' and self._is_page_like_div(element.get('style', '')): # This is a page div - return page break AND process content inside nodes = [page_break_node] content_nodes = self._process_div_content(element) if content_nodes: if isinstance(content_nodes, list): nodes.extend(content_nodes) else: nodes.append(content_nodes) return nodes elif element.name in ['p', 'div'] and element.get_text(strip=True): # This is a content-bearing element with page break style - return page break AND content nodes = [page_break_node] if element.name == 'p': current_style = parse_style(element.get('style', '')) if self.style_stack: current_style = current_style.merge(self.style_stack[-1]) content_node = self._process_paragraph(element, current_style) if content_node: nodes.append(content_node) elif element.name == 'div': content_nodes = self._process_div_content(element) if content_nodes: if isinstance(content_nodes, list): nodes.extend(content_nodes) else: nodes.append(content_nodes) return nodes else: # This is a marker-only page break (hr, empty elements) - return just the page break return page_break_node def _apply_metadata_to_nodes(self, nodes: Union[BaseNode, List[BaseNode]], metadata: Dict[str, Any]) -> None: """Apply metadata to a node or list of nodes""" if not metadata: return if isinstance(nodes, list): for node in nodes: if hasattr(node, 'metadata'): node.metadata.update(metadata) else: if hasattr(nodes, 'metadata'): nodes.metadata.update(metadata) def _process_element_with_page_breaks(self, element: Tag) -> Optional[Union[BaseNode, List[BaseNode]]]: """Process an element that contains page break descendants""" nodes = [] for child in element.children: if isinstance(child, Tag): # Check if this child is a page break or contains page breaks if child.get('_is_page_break') == 'true': page_break_result = self._handle_page_break_element(child) if page_break_result: if isinstance(page_break_result, list): nodes.extend(page_break_result) else: nodes.append(page_break_result) else: # Process child normally child_result = self._process_element(child) if child_result: if isinstance(child_result, list): nodes.extend(child_result) else: nodes.append(child_result) return nodes[0] if len(nodes) == 1 else nodes if nodes else None def _dispatch_element_processing(self, element: Tag, current_style: StyleInfo, ix_metadata: Dict[str, Any]) -> Optional[Union[BaseNode, List[BaseNode]]]: """Dispatch element processing based on element type""" # Handle ix: tags by processing their content sequentially if element.name.startswith('ix:'): return self._process_ix_element(element, current_style, ix_metadata) # Process table elements directly if element.name == 'table': table_node = self._process_table(element) self._apply_metadata_to_nodes(table_node, ix_metadata) return table_node elif element.name == 'p': return self._process_paragraph_element(element, current_style, ix_metadata) elif element.name == 'div': return self._process_div_element(element, current_style, ix_metadata) # For other elements, process children return self._process_generic_element(element, ix_metadata) def _process_ix_element(self, element: Tag, current_style: StyleInfo, ix_metadata: Dict[str, Any]) -> Optional[Union[BaseNode, List[BaseNode]]]: """Process ix: tagged elements""" nodes = [] children = list(element.children) # Convert to list to avoid iterator modification for child in children: if isinstance(child, Tag): if child.name == 'table': table_node = self._process_table(child) if table_node: self._apply_metadata_to_nodes(table_node, ix_metadata) nodes.append(table_node) elif child.name == 'p': para_node = self._process_paragraph(child, current_style) if para_node: self._apply_metadata_to_nodes(para_node, ix_metadata) nodes.append(para_node) elif child.name == 'div': div_style = parse_style(child.get('style', '')).merge(current_style) div_result = self._process_structured_content(child, div_style) if div_result: self._apply_metadata_to_nodes(div_result, ix_metadata) if isinstance(div_result, list): nodes.extend(div_result) else: nodes.append(div_result) else: child_result = self._process_element(child) if child_result: self._apply_metadata_to_nodes(child_result, ix_metadata) if isinstance(child_result, list): nodes.extend(child_result) else: nodes.append(child_result) return nodes[0] if len(nodes) == 1 else nodes if nodes else None def _process_paragraph_element(self, element: Tag, current_style: StyleInfo, ix_metadata: Dict[str, Any]) -> Optional[BaseNode]: """Process paragraph elements, checking for headings first""" para_text = element.get_text(strip=True) if para_text: heading_level = get_heading_level(element, current_style, para_text) if heading_level is not None: node = create_node( type_='heading', content=para_text, style=current_style, level=heading_level ) self._apply_metadata_to_nodes(node, ix_metadata) return node para_node = self._process_paragraph(element, current_style) self._apply_metadata_to_nodes(para_node, ix_metadata) return para_node def _process_div_element(self, element: Tag, current_style: StyleInfo, ix_metadata: Dict[str, Any]) -> Optional[Union[BaseNode, List[BaseNode]]]: """Process div elements based on whether they contain tables""" if element.get('has_table'): # Structure-preserving mode for divs with tables block_result = self._process_structured_content(element, current_style) else: # Content-combining mode for divs without tables block_result = self._process_inline_content(element, current_style) self._apply_metadata_to_nodes(block_result, ix_metadata) return block_result def _process_generic_element(self, element: Tag, ix_metadata: Dict[str, Any]) -> Optional[Union[BaseNode, List[BaseNode]]]: """Process generic elements by processing their children""" nodes = [] for child in element.children: if isinstance(child, Tag): child_result = self._process_element(child) if child_result: self._apply_metadata_to_nodes(child_result, ix_metadata) if isinstance(child_result, list): nodes.extend(child_result) else: nodes.append(child_result) return nodes[0] if len(nodes) == 1 else nodes if nodes else None def _process_element(self, element: Tag) -> Optional[Union[BaseNode, List[BaseNode]]]: """Process an element into one or more nodes with inherited styles and ix metadata""" # Handle page break elements first if self.include_page_breaks and element.get('_is_page_break') == 'true': return self._handle_page_break_element(element) # Also check if this element contains page break descendants if self.include_page_breaks and element.select('[_is_page_break="true"]'): # This element contains page breaks, process them individually return self._process_element_with_page_breaks(element) # Phase 1: Mark all ancestors of tables tables = element.find_all('table', recursive=True) for table in tables: parent = table.parent while parent: parent['has_table'] = True parent = parent.parent # Parse current element's style current_style = parse_style(element.get('style', '')) # Merge with parent style if there is one if self.style_stack: current_style = current_style.merge(self.style_stack[-1]) # Track entering ix tags and get metadata self.ix_tracker.enter_tag(element) ix_metadata = self.ix_tracker.get_current_context(element) try: # Push current style to stack before processing children self.style_stack.append(current_style) try: # First check if this element could be a heading text = element.get_text(strip=True) if text: # Only check for headings if there's text content heading_level = get_heading_level(element, current_style, text) if heading_level is not None: node = create_node( type_='heading', content=text, style=current_style, level=heading_level ) if ix_metadata: node.metadata.update(ix_metadata) return node # Dispatch to appropriate element handler return self._dispatch_element_processing(element, current_style, ix_metadata) finally: # Always pop the style from stack when done self.style_stack.pop() finally: # Always track exiting ix tags self.ix_tracker.exit_tag(element) def _process_structured_content(self, element: Tag, style: StyleInfo) -> Optional[Union[BaseNode, List[BaseNode]]]: """Process content in structure-preserving mode (for elements containing tables)""" nodes = [] text_parts = [] def flush_text(): if text_parts: text = ' '.join(text_parts).strip() if text: nodes.append(create_node( type_='text_block', content=text, style=style )) text_parts.clear() for child in element.children: if isinstance(child, NavigableString): text = str(child).strip() if text: text_parts.append(text) elif isinstance(child, Tag): if child.name == 'table': flush_text() table_node = self._process_table(child) if table_node: nodes.append(table_node) elif child.get('has_table'): # This child contains a table somewhere, process structurally flush_text() child_result = self._process_element(child) if child_result: if isinstance(child_result, list): nodes.extend(child_result) else: nodes.append(child_result) else: # Non-table-containing element, can process for text text = self._get_text_with_spacing(child).strip() if text: text_parts.append(text) flush_text() return nodes[0] if len(nodes) == 1 else nodes if nodes else None def _process_inline_content(self, element: Tag, style: StyleInfo) -> Optional[Union[BaseNode, List[BaseNode]]]: """Process content in content-combining mode (for elements without tables)""" # First check if the entire element is a heading text = element.get_text(strip=True) if text: heading_level = get_heading_level(element, style, text) if heading_level is not None: return create_node( type_='heading', content=text, style=style, level=heading_level ) nodes = [] text_parts: List[str] = [] # Explicitly type as strings def flush_text(): if text_parts: text = ' '.join(text_parts).strip() if text: # Check if combined text forms a heading heading_level = get_heading_level(element, style, text) if heading_level is not None: nodes.append(create_node( type_='heading', content=text, style=style, level=heading_level )) else: nodes.append(create_node( type_='text_block', content=text, style=style )) text_parts.clear() # Process children while handling special cases for child in element.children: if isinstance(child, NavigableString): text = str(child).strip() if text and text != '\u200B': # Skip zero-width spaces text_parts.append(text) elif isinstance(child, Tag): if child.name == 'br': text_parts.append('\n') elif not self._is_block_element(child): # Get the child's style combined with parent style child_style = parse_style(child.get('style', '')).merge(style) text = self._get_text_with_spacing(child).strip() if text: # Check if this individual child is a heading heading_level = get_heading_level(child, child_style, text) if heading_level is not None: # Flush any existing text first flush_text() nodes.append(create_node( type_='heading', content=text, style=child_style, level=heading_level )) else: # Store just the text, but use child_style when creating the node text_parts.append(text) # Update the style for the current text block to use the child's style style = child_style else: # For block elements, flush current text and process the element flush_text() child_result = self._process_element(child) if child_result: if isinstance(child_result, list): nodes.extend(child_result) else: nodes.append(child_result) # Flush any remaining text flush_text() return nodes[0] if len(nodes) == 1 else nodes if nodes else None def _normalize_text_parts(self, parts: List[str]) -> str: """Normalize text parts while preserving intentional line breaks""" # Remove empty parts and normalize spaces normalized_parts = [] for i, part in enumerate(parts): if part == '\n': # Keep newlines but ensure no extra spaces around them normalized_parts.append('\n') else: # For text content, strip and add only if non-empty stripped = part.strip() if stripped: # Don't add space if previous part was a newline or this is the first part if normalized_parts and normalized_parts[-1] != '\n' and i > 0: normalized_parts.append(' ') normalized_parts.append(stripped) # Join all parts and remove any extra whitespace around newlines text = ''.join(normalized_parts) # Clean up any potential multiple newlines or spaces #text = re.sub(r'\s*\n\s*', '\n', text) text = re.sub(r' +', ' ', text) return text.strip() def _process_table(self, element: Tag) -> Optional[BaseNode]: """Process table element into a TableNode with precise line break handling""" if not element: return None def replace_html_entities(text: str) -> str: """Replace HTML entities with markdown-safe alternatives""" # Map of HTML entities to their markdown-safe replacements entity_replacements = { '―': '-----', # Horizontal bar '—': '-----', # Em dash '–': '---', # En dash '−': '-', # Minus sign '‐': '-', # Hyphen '‐': '-', # Generic dash # Add other common entities that might need replacement ' ': ' ', # Non-breaking space '&': '&', # Ampersand '<': '<', # Less than '>': '>', # Greater than '"': '"', # Quote ''': "'", # Apostrophe ' ': ' ', # Hair space '​': '', # Zero-width space '—': '-----', # Another way to encode mdash '–': '---', # Another way to encode ndash '−': '-', # Another way to encode minus } # Also handle numeric entities that might represent dashes # Unicode values for various dashes dash_codepoints = { '8208': '-', # hyphen '8209': '-', # non-breaking hyphen '8210': '-', # figure dash '8211': '---', # en dash '8212': '-----', # em dash '8213': '-----', # horizontal bar '8722': '-', # minus sign } result = text # Replace named entities for entity, replacement in entity_replacements.items(): result = result.replace(entity, replacement) # Replace numeric entities (both decimal and hex) for dashes for code, replacement in dash_codepoints.items(): # Replace decimal format result = result.replace(f'&#{code};', replacement) # Replace hexadecimal format result = result.replace(f'&#x{hex(int(code))[2:]};', replacement) return result def extract_cell_text(cell: Tag) -> str: """Extract text from cell with careful line break handling""" # First check for div children divs = cell.find_all('div', recursive=False) if divs: # Get text from each div and handle entities div_texts = [replace_html_entities(div.get_text(strip=True)) for div in divs] return '\n'.join(div_texts) # Handle
tags by replacing them with newlines for br in cell.find_all('br'): br.replace_with('\n') # Get text and handle entities text = cell.get_text(strip=False) text = replace_html_entities(text) return text.strip() def process_cell(cell: Tag) -> List[TableCell]: """Process cell preserving exact colspan and positioning values correctly""" try: colspan = int(cell.get('colspan', '1')) except ValueError: colspan = 1 style = parse_style(cell.get('style', '')) text = extract_cell_text(cell) # If this is a right-aligned cell with colspan > 1 (like percentage values) if style.text_align == 'right' and colspan > 1: # Create empty cells for all but last column of colspan cells = [ TableCell(content='', colspan=1, align='right', is_currency=False) for _ in range(colspan - 1) ] # Add actual value in last column cells.append(TableCell( content=text, colspan=1, align='right', is_currency=False )) return cells # For single cells return [TableCell( content=text, colspan=colspan, align=style.text_align or 'left', is_currency=text.startswith('$') )] def process_row(row: Tag) -> TableRow: """Process row preserving cell structure""" cells = [] # Find direct child cells only to avoid nested table conflicts for td in row.find_all(['td', 'th'], recursive=False): # Check if cell contains a nested table nested_table = td.find('table') if nested_table: # Create a TableNode from the nested table using _process_table table_node = self._process_table(nested_table) if table_node: # Store the table node in the cell content cells.extend([TableCell( content=table_node, # We'll need to handle this special content later colspan=int(td.get('colspan', '1')), align=td.get('align', 'left') )]) else: cells.extend(process_cell(td)) return TableRow(cells=cells, is_header=row.find_parent('thead') is not None) # Process all rows (including those nested in tbody, thead, tfoot) rows = [] # First, try to find direct child tr elements direct_trs = element.find_all('tr', recursive=False) if direct_trs: # If we found direct tr elements, use them for tr in direct_trs: row = process_row(tr) if row.cells: rows.append(row) else: # If no direct tr elements, look in tbody, thead, tfoot children for section in element.find_all(['tbody', 'thead', 'tfoot'], recursive=False): for tr in section.find_all('tr', recursive=False): row = process_row(tr) if row.cells: rows.append(row) if rows: # Create metadata from table attributes metadata = { 'id': element.get('id', ''), 'class': element.get('class', []), 'data_attrs': { k: v for k, v in element.attrs.items() if k.startswith('data-') } } return create_node( 'table', rows, parse_style(element.get('style', '')), metadata=metadata ) return None def _process_paragraph(self, element: Tag, style: StyleInfo) -> Optional[BaseNode]: """Process a paragraph element with inherited styles""" text_parts = [] last_was_text = False for child in element.children: if isinstance(child, NavigableString): text = str(child) if text.strip(): text_parts.append(text) last_was_text = True elif text.isspace() and last_was_text: text_parts.append(' ') elif isinstance(child, Tag): if child.name == 'br': text_parts.append('\n') last_was_text = False elif child.name in ['span', 'font', 'strong', 'em', 'b', 'i', 'a']: text = self._get_text_with_spacing(child) if text.strip(): text_parts.append(text.strip()) last_was_text = True if not text_parts: return None # Join all parts and normalize whitespace while preserving intentional breaks text = ''.join(text_parts) # Split into lines, normalize each line's whitespace, then rejoin lines = [' '.join(line.split()) for line in text.split('\n')] text = '\n'.join(line for line in lines if line) if text.strip(): return create_node( type_='text_block', content=text, style=style ) return None def _normalize_text(self, pieces: List[StyledText], is_paragraph: bool) -> str: """Normalize text differently for paragraphs vs general text blocks""" if is_paragraph: # For actual paragraphs, collapse all whitespace text = ' '.join(piece.content for piece in pieces) return ' '.join(text.split()) else: # For general text blocks, preserve line breaks lines = [] current_line = [] for piece in pieces: if piece.content == '\n': # Flush current line if current_line: lines.append(' '.join(''.join(current_line).split())) current_line = [] lines.append('') # Add empty line for break else: current_line.append(piece.content) # Flush any remaining content if current_line: lines.append(' '.join(''.join(current_line).split())) # Remove any extra empty lines but preserve single line breaks text = '\n'.join(lines) return re.sub(r'\n{3,}', '\n\n', text) def _is_block_element(self, element: Tag) -> bool: """Determine if an element is block-level""" # Check explicit display style first style = parse_style(element.get('style', '')) if style.display: return style.display != 'inline' # Default block elements block_elements = { 'div', 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'ul', 'ol', 'li', 'blockquote', 'pre', 'hr', 'table', 'form', 'fieldset', 'address' } return element.name in block_elements and 'float:left' not in element.get('style', '') def _collect_styled_text(self, element: Tag, style: StyleInfo) -> List[StyledText]: """Collect text with style information from inline elements""" pieces = [] for child in element.children: if isinstance(child, NavigableString): text = str(child) if text.strip(): pieces.append(StyledText(text, style)) elif isinstance(child, Tag): if child.name == 'br': pieces.append(StyledText('\n', style)) elif child.name != 'table': # Skip tables in inline collection child_style = self._get_combined_style(child, style) pieces.extend(self._collect_styled_text(child, child_style)) return pieces def _get_combined_style(self, element: Tag, parent_style: StyleInfo) -> StyleInfo: """Combine element's style with parent style, including HTML attributes""" style = parse_style(element.get('style', '')) # Handle specific HTML tags and their attributes if element.name == 'font': if size := element.get('size'): try: size_num = float(size.replace('pt', '')) style.font_size = size_num except ValueError: pass elif element.name in {'b', 'strong'}: style.font_weight = 'bold' elif element.name in {'i', 'em'}: style.font_style = 'italic' return style.merge(parent_style) def _convert_pieces_to_nodes(self, pieces: List[StyledText]) -> List[DocumentNode]: """Convert collected text pieces into document nodes""" nodes = [] current_paragraph: List[StyledText] = [] def flush_paragraph(): if not current_paragraph: return # Combine text and determine final style text = self._normalize_paragraph_text(current_paragraph) # Use the most specific (last) style for the paragraph final_style = current_paragraph[-1].style if text.strip(): nodes.append(DocumentNode( type='text_block', content=text, style=final_style )) current_paragraph.clear() for piece in pieces: if piece.is_block: flush_paragraph() else: current_paragraph.append(piece) # Flush any remaining content flush_paragraph() return nodes def _normalize_paragraph_text(self, pieces: List[StyledText]) -> str: """Normalize text within a paragraph while preserving intentional breaks""" # Join all pieces and split into lines text = ''.join(piece.content for piece in pieces) lines = text.splitlines() # Normalize each line individually normalized_lines = [] for line in lines: # Collapse whitespace within each line normalized = ' '.join(line.split()) if normalized: normalized_lines.append(normalized) # Join lines with single newlines return '\n'.join(normalized_lines) def _is_inline(self, element: Tag) -> bool: """Determine if an element should be treated as inline""" style = parse_style(element.get('style', '')) if style.display == 'inline': return True # Standard inline elements inline_elements = { 'span', 'font', 'b', 'strong', 'i', 'em', 'a', 'sub', 'sup', 'u', 'small', 'mark' } return element.name in inline_elements def _is_empty_text(self, text: str) -> bool: """Check if text is effectively empty""" return not bool(text.strip()) def _get_text_with_spacing(self, element: Tag) -> str: """Extract text while preserving meaningful whitespace""" if element.name == 'table': return '' texts = [] last_was_text = False for child in element.children: if isinstance(child, NavigableString): text = self._clean_text(str(child)) if text.strip(): texts.append(text.strip()) last_was_text = True elif text.isspace() and last_was_text: texts.append(' ') elif child.name == 'br': texts.append('\n') last_was_text = False elif child.name == 'table': continue else: child_text = self._get_text_with_spacing(child) if child_text.strip(): # Only add space if needed if texts and last_was_text and not texts[-1].endswith(' ') and not child_text.startswith(' '): texts.append(' ') texts.append(child_text.strip()) last_was_text = True return ''.join(texts) def _merge_adjacent_nodes(self, nodes: List[BaseNode]) -> List[BaseNode]: """Merge adjacent nodes while preserving styling from both nodes""" if not nodes: return [] def merge_styles(style1: StyleInfo, style2: StyleInfo) -> StyleInfo: """Merge two styles intelligently""" # Start with a new style object merged = StyleInfo() # For each style attribute, take the non-None value # If both have values, use the more specific one merged.display = style2.display or style1.display merged.margin_top = style1.margin_top # Keep first node's top margin merged.margin_bottom = style2.margin_bottom # Keep second node's bottom margin # For font properties, prefer the second node's style if it's different # This preserves intentional style changes in the second block if style2.font_size and style2.font_size != style1.font_size: merged.font_size = style2.font_size else: merged.font_size = style1.font_size if style2.font_weight and style2.font_weight != style1.font_weight: merged.font_weight = style2.font_weight else: merged.font_weight = style1.font_weight # For alignment, if they differ, don't merge if style1.text_align != style2.text_align: return None merged.text_align = style1.text_align # Improved width handling if style1.width and style2.width: # If units differ, prefer the larger width's unit if style1.width.unit != style2.width.unit: # Convert both to pixels for comparison # This is a simplified conversion - you might want to use the existing # Width.to_chars method for more accurate conversion w1_px = _to_pixels(style1.width) w2_px = _to_pixels(style2.width) # If one width is significantly smaller (like a bullet point) # use the larger width if w1_px < w2_px * 0.3: # First node is much smaller merged.width = style2.width elif w2_px < w1_px * 0.3: # Second node is much smaller merged.width = style1.width else: # Widths are comparable, use the second node's width merged.width = style2.width else: # Same units, apply the same logic if style1.width.value < style2.width.value * 0.3: merged.width = style2.width elif style2.width.value < style1.width.value * 0.3: merged.width = style1.width else: merged.width = style2.width else: # If only one has width, use that merged.width = style2.width or style1.width merged.text_decoration = style2.text_decoration or style1.text_decoration merged.line_height = style2.line_height or style1.line_height return merged def _to_pixels(width: Width) -> float: """Convert width to pixels for comparison""" # Conversion factors (approximate) conversions = { 'px': 1, 'pt': 1.333, # 1pt ≈ 1.333px 'in': 96, # 1in = 96px 'cm': 37.795, # 1cm ≈ 37.795px 'mm': 3.7795, # 1mm ≈ 3.7795px '%': 1 # Handle percentages separately } return width.value * conversions.get(width.unit, 1) def can_merge_nodes(node1: BaseNode, node2: BaseNode) -> bool: """Determine if two nodes can be safely merged""" if node1.type != 'text_block' or node2.type != 'text_block': return False # Don't merge if either has special metadata if node1.metadata or node2.metadata: return False # Try to merge styles merged_style = merge_styles(node1.style, node2.style) if merged_style is None: return False return True merged = [] current = None for node in nodes: if not current: current = node continue if can_merge_nodes(current, node): merged_style = merge_styles(current.style, node.style) # Create new merged text block with the combined style merged_content = f"{current.content}\n\n{node.content}" current = create_node( 'text_block', merged_content, merged_style ) else: merged.append(current) current = node if current: merged.append(current) return merged def _similar_styles(self, style1: StyleInfo, style2: StyleInfo) -> bool: # Compare relevant style attributes to determine if they're similar return ( style1.font_size == style2.font_size and style1.font_weight == style2.font_weight and style1.text_align == style2.text_align )