Initial commit

This commit is contained in:
kdusek
2025-12-09 12:13:01 +01:00
commit 8e654ed209
13332 changed files with 2695056 additions and 0 deletions

View File

@@ -0,0 +1,456 @@
"""
Node hierarchy for the document tree.
"""
import uuid
from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from typing import List, Optional, Dict, Any, Callable, Iterator
from edgar.documents.types import NodeType, SemanticType, Style
from edgar.documents.cache_mixin import CacheableMixin
@dataclass
class Node(ABC):
    """
    Base node class for document tree.

    All nodes in the document inherit from this class and implement
    the abstract methods for text and HTML generation.
    """

    # Identity
    id: str = field(default_factory=lambda: str(uuid.uuid4()))
    type: NodeType = NodeType.DOCUMENT

    # Hierarchy (excluded from repr: parent and children reference each other)
    parent: Optional['Node'] = field(default=None, repr=False)
    children: List['Node'] = field(default_factory=list, repr=False)

    # Content
    content: Any = None
    metadata: Dict[str, Any] = field(default_factory=dict)
    style: Style = field(default_factory=Style)

    # Semantic info
    semantic_type: Optional[SemanticType] = None
    semantic_role: Optional[str] = None

    def add_child(self, child: 'Node') -> None:
        """Append ``child`` to this node's children and set its parent.

        The child is not removed from the children list of any previous
        parent; callers that move a node must detach it themselves.
        """
        child.parent = self
        self.children.append(child)

    def remove_child(self, child: 'Node') -> None:
        """Remove ``child`` from this node's children and clear its parent.

        Does nothing when ``child`` is not among the children.
        """
        if child in self.children:
            self.children.remove(child)
            child.parent = None

    def insert_child(self, index: int, child: 'Node') -> None:
        """Insert ``child`` at ``index`` (``list.insert`` semantics) and set its parent."""
        child.parent = self
        self.children.insert(index, child)

    @abstractmethod
    def text(self) -> str:
        """Extract text content from node and its children."""
        pass

    @abstractmethod
    def html(self) -> str:
        """Generate HTML representation of node."""
        pass

    def find(self, predicate: Callable[['Node'], bool]) -> List['Node']:
        """Return every node in this subtree for which ``predicate`` is true.

        The node itself is tested first, then each child's subtree in order.
        """
        results = []
        if predicate(self):
            results.append(self)
        for child in self.children:
            results.extend(child.find(predicate))
        return results

    def find_first(self, predicate: Callable[['Node'], bool]) -> Optional['Node']:
        """Return the first node (same order as ``find``) matching ``predicate``.

        Returns ``None`` when no node in the subtree matches.
        """
        if predicate(self):
            return self
        for child in self.children:
            result = child.find_first(predicate)
            # Compare against None rather than relying on truthiness, so a
            # matching node whose class defines __bool__/__len__ is not skipped.
            if result is not None:
                return result
        return None

    def xpath(self, expression: str) -> List['Node']:
        """
        Simple XPath-like node selection.

        Supports:
        - //node_type - all nodes of that type in this subtree (self included)
        - /node_type  - direct children of that type

        The type is compared case-insensitively against ``node.type.name``.
        Attribute predicates such as ``[@attr=value]`` are not implemented:
        the whole remainder of the expression is compared with the type
        name. Any other expression returns an empty list.
        """
        # Simple implementation - can be extended
        if expression.startswith('//'):
            node_type = expression[2:].lower()
            return self.find(lambda n: n.type.name.lower() == node_type)
        elif expression.startswith('/'):
            node_type = expression[1:].lower()
            return [c for c in self.children if c.type.name.lower() == node_type]
        return []

    def walk(self) -> Iterator['Node']:
        """Yield this node, then every descendant, depth-first in child order."""
        yield self
        for child in self.children:
            yield from child.walk()

    @property
    def depth(self) -> int:
        """Number of ancestors above this node (0 for a node without parent)."""
        depth = 0
        current = self.parent
        # Explicit None check: a falsy ancestor must not end the walk early.
        while current is not None:
            depth += 1
            current = current.parent
        return depth

    @property
    def path(self) -> str:
        """Type names from the root down to this node, joined by ``/``."""
        parts = []
        current = self
        # Collected leaf-to-root, reversed below.
        while current is not None:
            parts.append(current.type.name)
            current = current.parent
        return '/'.join(reversed(parts))

    def get_metadata(self, key: str, default: Any = None) -> Any:
        """Return the metadata value for ``key``, or ``default`` if absent."""
        return self.metadata.get(key, default)

    def set_metadata(self, key: str, value: Any) -> None:
        """Store ``value`` under ``key`` in the metadata dict."""
        self.metadata[key] = value

    def has_metadata(self, key: str) -> bool:
        """Return True when ``key`` is present in the metadata dict."""
        return key in self.metadata
@dataclass
class DocumentNode(Node, CacheableMixin):
    """Root node of a document tree."""

    type: NodeType = field(default=NodeType.DOCUMENT, init=False)

    def text(self) -> str:
        """Return the non-empty child texts separated by blank lines.

        The string is produced by a callable handed to ``_get_cached_text``
        (not defined in this class; presumably supplied by CacheableMixin).
        """
        def _generate_text():
            child_texts = (child.text() for child in self.children)
            return '\n\n'.join(chunk for chunk in child_texts if chunk)

        return self._get_cached_text(_generate_text)

    def html(self) -> str:
        """Wrap the children's HTML, one per line, in a full HTML page."""
        rendered_children = [child.html() for child in self.children]
        body_content = '\n'.join(rendered_children)
        return f"""<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title>Document</title>
</head>
<body>
{body_content}
</body>
</html>"""
@dataclass
class TextNode(Node):
    """Leaf node holding plain text."""

    type: NodeType = field(default=NodeType.TEXT, init=False)
    content: str = ""

    def text(self) -> str:
        """Return the stored text unchanged."""
        return self.content

    def html(self) -> str:
        """Return the text with ``&``, ``<`` and ``>`` turned into entities."""
        # One translation pass gives the same result as replacing '&' first
        # and then '<' and '>': each source character is rewritten once.
        entity_table = str.maketrans({'&': '&amp;', '<': '&lt;', '>': '&gt;'})
        return self.content.translate(entity_table)
@dataclass
class ParagraphNode(Node, CacheableMixin):
    """Paragraph node."""

    type: NodeType = field(default=NodeType.PARAGRAPH, init=False)

    def text(self) -> str:
        """Extract paragraph text with intelligent spacing and caching.

        Children with empty text are skipped. The text of the child at
        index 0 is taken as-is. For every later child a single space is
        inserted before its text when the first applicable rule holds:

        1. the preceding child has truthy ``has_tail_whitespace`` metadata;
        2. the text starts with a space (leading whitespace is then stripped);
        3. the text collected so far ends, ignoring trailing whitespace,
           with one of ``. ! ? : ;``;
        4. the text starts with a letter, the previous part does not end
           with a space, and the child's ``original_tag`` metadata names an
           inline element (span, a, em, strong, i, b).

        Otherwise the text is concatenated directly to the previous part.
        The result is returned through ``_get_cached_text``.
        """
        # endswith() with a tuple is False for an empty string, whereas the
        # earlier substring test ('' in '.!?:;') was True and added a stray
        # space after a whitespace-only part.
        sentence_enders = ('.', '!', '?', ':', ';')
        inline_tags = ['span', 'a', 'em', 'strong', 'i', 'b']

        def _generate_text():
            parts = []
            for i, child in enumerate(self.children):
                text = child.text()
                if not text:
                    continue

                # For the first child, just add the text
                if i == 0:
                    parts.append(text)
                    continue

                # For subsequent children, decide whether a separating space is needed
                prev_child = self.children[i - 1]
                should_add_space = False

                # Add space if previous child had tail whitespace
                if hasattr(prev_child, 'get_metadata') and prev_child.get_metadata('has_tail_whitespace'):
                    should_add_space = True
                # Add space if current text starts with space (preserve intended spacing)
                elif text.startswith(' '):
                    should_add_space = True
                    # Remove the leading whitespace since the separator is added below
                    text = text.lstrip()
                # Add space if previous text ends with punctuation (sentence boundaries)
                elif parts and parts[-1].rstrip().endswith(sentence_enders):
                    should_add_space = True
                # Add space between adjacent inline elements if the current text starts with a letter.
                # This handles cases where whitespace was stripped but spacing is semantically important
                elif (text and text[0].isalpha() and
                      parts and parts[-1] and not parts[-1].endswith(' ') and
                      hasattr(child, 'get_metadata') and child.get_metadata('original_tag') in inline_tags):
                    should_add_space = True

                if should_add_space:
                    parts.append(' ' + text)
                elif parts:
                    # Concatenate directly without space
                    parts[-1] += text
                else:
                    parts.append(text)
            return ''.join(parts)

        return self._get_cached_text(_generate_text)

    def html(self) -> str:
        """Generate paragraph HTML: children's HTML concatenated inside ``<p>``."""
        content = ''.join(child.html() for child in self.children)
        style_attr = self._generate_style_attr()
        return f'<p{style_attr}>{content}</p>'

    def _generate_style_attr(self) -> str:
        """Generate style attribute from style object.

        Emits ``text-align``, ``margin-top`` and ``margin-bottom`` for the
        values that are truthy; returns an empty string when none apply.
        """
        if not self.style:
            return ''
        styles = []
        if self.style.text_align:
            styles.append(f'text-align: {self.style.text_align}')
        if self.style.margin_top:
            styles.append(f'margin-top: {self.style.margin_top}px')
        if self.style.margin_bottom:
            styles.append(f'margin-bottom: {self.style.margin_bottom}px')
        if styles:
            return f' style="{"; ".join(styles)}"'
        return ''
@dataclass
class HeadingNode(Node):
    """Heading node with level."""

    type: NodeType = field(default=NodeType.HEADING, init=False)
    level: int = 1

    def text(self) -> str:
        """Extract heading text.

        Returns ``content`` when it is a string; otherwise the non-empty
        texts of the children joined by single spaces.
        """
        if isinstance(self.content, str):
            return self.content
        parts = []
        for child in self.children:
            text = child.text()
            if text:
                parts.append(text)
        return ' '.join(parts)

    def html(self) -> str:
        """Generate heading HTML (``<h1>``..``<h6>``) with escaped text."""
        # Imported locally so the block needs no module-level import change.
        from html import escape

        level = max(1, min(6, self.level))  # Ensure level is 1-6
        # text() yields plain text, so '&', '<' and '>' must be escaped
        # before being placed inside markup (same set TextNode.html escapes).
        content = escape(self.text(), quote=False)
        style_attr = self._generate_style_attr()
        return f'<h{level}{style_attr}>{content}</h{level}>'

    def _generate_style_attr(self) -> str:
        """Generate style attribute.

        Emits ``text-align`` and ``color`` when set; returns an empty
        string when there is no style object or nothing to emit.
        """
        if not self.style:
            return ''
        styles = []
        if self.style.text_align:
            styles.append(f'text-align: {self.style.text_align}')
        if self.style.color:
            styles.append(f'color: {self.style.color}')
        if styles:
            return f' style="{"; ".join(styles)}"'
        return ''
@dataclass
class ContainerNode(Node, CacheableMixin):
    """Generic block-level container such as a div or section."""

    type: NodeType = field(default=NodeType.CONTAINER, init=False)
    tag_name: str = 'div'

    def text(self) -> str:
        """Return the non-empty child texts, one per line.

        The string is built by a callable passed to ``_get_cached_text``.
        """
        def _generate_text():
            child_texts = (child.text() for child in self.children)
            return '\n'.join(filter(None, child_texts))

        return self._get_cached_text(_generate_text)

    def html(self) -> str:
        """Render the children, one per line, inside this node's tag.

        A ``class`` attribute carrying ``semantic_role`` is added when the
        role is set.
        """
        inner = '\n'.join([child.html() for child in self.children])
        style_attr = self._generate_style_attr()
        class_attr = ''
        if self.semantic_role:
            class_attr = f' class="{self.semantic_role}"'
        return f'<{self.tag_name}{style_attr}{class_attr}>{inner}</{self.tag_name}>'

    def _generate_style_attr(self) -> str:
        """Build a ``style`` attribute from the margin and padding values.

        Only truthy values are emitted; returns an empty string when the
        style object is falsy or none of the values are set.
        """
        if not self.style:
            return ''
        candidates = (
            ('margin-top', self.style.margin_top),
            ('margin-bottom', self.style.margin_bottom),
            ('padding-left', self.style.padding_left),
        )
        declarations = [f'{css_name}: {value}px' for css_name, value in candidates if value]
        if not declarations:
            return ''
        joined = '; '.join(declarations)
        return f' style="{joined}"'
@dataclass
class SectionNode(ContainerNode):
    """Container representing a named section of the document."""

    type: NodeType = field(default=NodeType.SECTION, init=False)
    section_name: Optional[str] = None
    tag_name: str = field(default='section', init=False)

    def __post_init__(self):
        """Copy a non-empty ``section_name`` into the node metadata."""
        if not self.section_name:
            return
        self.set_metadata('section_name', self.section_name)
@dataclass
class ListNode(Node):
    """Ordered or unordered list."""

    type: NodeType = field(default=NodeType.LIST, init=False)
    ordered: bool = False

    def text(self) -> str:
        """Return one line per child that has text.

        For ordered lists each line is prefixed with the child's 1-based
        position among all children, so children without text still
        consume a number.
        """
        lines = []
        for position, item in enumerate(self.children, start=1):
            item_text = item.text()
            if not item_text:
                continue
            marker = f"{position}. " if self.ordered else ""
            lines.append(f"{marker}{item_text}")
        return '\n'.join(lines)

    def html(self) -> str:
        """Render the children, one per line, inside ``<ol>`` or ``<ul>``."""
        tag = 'ul'
        if self.ordered:
            tag = 'ol'
        rendered_items = [child.html() for child in self.children]
        items = '\n'.join(rendered_items)
        return f'<{tag}>\n{items}\n</{tag}>'
@dataclass
class ListItemNode(Node):
    """Single entry of a list."""

    type: NodeType = field(default=NodeType.LIST_ITEM, init=False)

    def text(self) -> str:
        """Return the non-empty child texts joined by single spaces."""
        child_texts = (child.text() for child in self.children)
        return ' '.join(piece for piece in child_texts if piece)

    def html(self) -> str:
        """Return the children's HTML concatenated inside ``<li>``."""
        inner = ''.join([child.html() for child in self.children])
        return f'<li>{inner}</li>'
@dataclass
class LinkNode(Node):
    """Hyperlink node."""

    type: NodeType = field(default=NodeType.LINK, init=False)
    href: Optional[str] = None
    title: Optional[str] = None

    def text(self) -> str:
        """Extract link text.

        Returns ``content`` when it is a string; otherwise the non-empty
        texts of the children joined by single spaces.
        """
        if isinstance(self.content, str):
            return self.content
        parts = []
        for child in self.children:
            text = child.text()
            if text:
                parts.append(text)
        return ' '.join(parts)

    def html(self) -> str:
        """Generate link HTML with escaped text and attribute values.

        ``href`` and ``title`` are emitted only when truthy.
        """
        # Imported locally so the block needs no module-level import change.
        from html import escape

        # The link text is plain text; escape the characters that would
        # otherwise be read as markup.
        content = escape(self.text(), quote=False)
        # Attribute values additionally need quotes escaped so a value
        # cannot terminate the attribute early.
        href_attr = f' href="{escape(str(self.href), quote=True)}"' if self.href else ''
        title_attr = f' title="{escape(str(self.title), quote=True)}"' if self.title else ''
        return f'<a{href_attr}{title_attr}>{content}</a>'
@dataclass
class ImageNode(Node):
    """Image node."""

    type: NodeType = field(default=NodeType.IMAGE, init=False)
    src: Optional[str] = None
    alt: Optional[str] = None
    width: Optional[int] = None
    height: Optional[int] = None

    def text(self) -> str:
        """Extract image alt text (empty string when no alt text is set)."""
        return self.alt or ''

    def html(self) -> str:
        """Generate image HTML with escaped attribute values.

        Each of ``src``, ``alt``, ``width`` and ``height`` is emitted, in
        that order, only when its value is truthy.
        """
        # Imported locally so the block needs no module-level import change.
        from html import escape

        def _attr(name: str, value: object) -> str:
            # Falsy values (None, '', 0) omit the attribute entirely.
            if not value:
                return ''
            # Escape quotes too, so a value cannot break out of the attribute.
            return f' {name}="{escape(str(value), quote=True)}"'

        src_attr = _attr('src', self.src)
        alt_attr = _attr('alt', self.alt)
        width_attr = _attr('width', self.width)
        height_attr = _attr('height', self.height)
        return f'<img{src_attr}{alt_attr}{width_attr}{height_attr}>'