Initial commit
This commit is contained in:
456
venv/lib/python3.10/site-packages/edgar/documents/nodes.py
Normal file
456
venv/lib/python3.10/site-packages/edgar/documents/nodes.py
Normal file
@@ -0,0 +1,456 @@
|
||||
"""
|
||||
Node hierarchy for the document tree.
|
||||
"""
|
||||
|
||||
import uuid
|
||||
from abc import ABC, abstractmethod
|
||||
from dataclasses import dataclass, field
|
||||
from typing import List, Optional, Dict, Any, Callable, Iterator
|
||||
|
||||
from edgar.documents.types import NodeType, SemanticType, Style
|
||||
from edgar.documents.cache_mixin import CacheableMixin
|
||||
|
||||
|
||||
@dataclass
|
||||
class Node(ABC):
|
||||
"""
|
||||
Base node class for document tree.
|
||||
|
||||
All nodes in the document inherit from this class and implement
|
||||
the abstract methods for text and HTML generation.
|
||||
"""
|
||||
|
||||
# Identity
|
||||
id: str = field(default_factory=lambda: str(uuid.uuid4()))
|
||||
type: NodeType = NodeType.DOCUMENT
|
||||
|
||||
# Hierarchy
|
||||
parent: Optional['Node'] = field(default=None, repr=False)
|
||||
children: List['Node'] = field(default_factory=list, repr=False)
|
||||
|
||||
# Content
|
||||
content: Any = None
|
||||
metadata: Dict[str, Any] = field(default_factory=dict)
|
||||
style: Style = field(default_factory=Style)
|
||||
|
||||
# Semantic info
|
||||
semantic_type: Optional[SemanticType] = None
|
||||
semantic_role: Optional[str] = None
|
||||
|
||||
def add_child(self, child: 'Node') -> None:
|
||||
"""Add child node, maintaining parent reference."""
|
||||
child.parent = self
|
||||
self.children.append(child)
|
||||
|
||||
def remove_child(self, child: 'Node') -> None:
|
||||
"""Remove child node."""
|
||||
if child in self.children:
|
||||
self.children.remove(child)
|
||||
child.parent = None
|
||||
|
||||
def insert_child(self, index: int, child: 'Node') -> None:
|
||||
"""Insert child at specific index."""
|
||||
child.parent = self
|
||||
self.children.insert(index, child)
|
||||
|
||||
@abstractmethod
|
||||
def text(self) -> str:
|
||||
"""Extract text content from node and its children."""
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def html(self) -> str:
|
||||
"""Generate HTML representation of node."""
|
||||
pass
|
||||
|
||||
def find(self, predicate: Callable[['Node'], bool]) -> List['Node']:
|
||||
"""Find all nodes matching predicate."""
|
||||
results = []
|
||||
if predicate(self):
|
||||
results.append(self)
|
||||
for child in self.children:
|
||||
results.extend(child.find(predicate))
|
||||
return results
|
||||
|
||||
def find_first(self, predicate: Callable[['Node'], bool]) -> Optional['Node']:
|
||||
"""Find first node matching predicate."""
|
||||
if predicate(self):
|
||||
return self
|
||||
for child in self.children:
|
||||
result = child.find_first(predicate)
|
||||
if result:
|
||||
return result
|
||||
return None
|
||||
|
||||
def xpath(self, expression: str) -> List['Node']:
|
||||
"""
|
||||
Simple XPath-like node selection.
|
||||
|
||||
Supports:
|
||||
- //node_type - Find all nodes of type
|
||||
- /node_type - Direct children of type
|
||||
- [@attr=value] - Attribute matching
|
||||
"""
|
||||
# Simple implementation - can be extended
|
||||
if expression.startswith('//'):
|
||||
node_type = expression[2:].lower()
|
||||
return self.find(lambda n: n.type.name.lower() == node_type)
|
||||
elif expression.startswith('/'):
|
||||
node_type = expression[1:].lower()
|
||||
return [c for c in self.children if c.type.name.lower() == node_type]
|
||||
return []
|
||||
|
||||
def walk(self) -> Iterator['Node']:
|
||||
"""Walk the tree depth-first."""
|
||||
yield self
|
||||
for child in self.children:
|
||||
yield from child.walk()
|
||||
|
||||
@property
|
||||
def depth(self) -> int:
|
||||
"""Get depth of node in tree."""
|
||||
depth = 0
|
||||
current = self.parent
|
||||
while current:
|
||||
depth += 1
|
||||
current = current.parent
|
||||
return depth
|
||||
|
||||
@property
|
||||
def path(self) -> str:
|
||||
"""Get path from root to this node."""
|
||||
parts = []
|
||||
current = self
|
||||
while current:
|
||||
parts.append(current.type.name)
|
||||
current = current.parent
|
||||
return '/'.join(reversed(parts))
|
||||
|
||||
def get_metadata(self, key: str, default: Any = None) -> Any:
|
||||
"""Get metadata value with default."""
|
||||
return self.metadata.get(key, default)
|
||||
|
||||
def set_metadata(self, key: str, value: Any) -> None:
|
||||
"""Set metadata value."""
|
||||
self.metadata[key] = value
|
||||
|
||||
def has_metadata(self, key: str) -> bool:
|
||||
"""Check if metadata key exists."""
|
||||
return key in self.metadata
|
||||
|
||||
|
||||
@dataclass
|
||||
class DocumentNode(Node, CacheableMixin):
|
||||
"""Root document node."""
|
||||
type: NodeType = field(default=NodeType.DOCUMENT, init=False)
|
||||
|
||||
def text(self) -> str:
|
||||
"""Extract all text from document with caching."""
|
||||
def _generate_text():
|
||||
parts = []
|
||||
for child in self.children:
|
||||
text = child.text()
|
||||
if text:
|
||||
parts.append(text)
|
||||
return '\n\n'.join(parts)
|
||||
|
||||
return self._get_cached_text(_generate_text)
|
||||
|
||||
def html(self) -> str:
|
||||
"""Generate complete HTML document."""
|
||||
body_content = '\n'.join(child.html() for child in self.children)
|
||||
return f"""<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<meta charset="utf-8">
|
||||
<title>Document</title>
|
||||
</head>
|
||||
<body>
|
||||
{body_content}
|
||||
</body>
|
||||
</html>"""
|
||||
|
||||
|
||||
@dataclass
|
||||
class TextNode(Node):
|
||||
"""Plain text content node."""
|
||||
type: NodeType = field(default=NodeType.TEXT, init=False)
|
||||
content: str = ""
|
||||
|
||||
def text(self) -> str:
|
||||
"""Return text content."""
|
||||
return self.content
|
||||
|
||||
def html(self) -> str:
|
||||
"""Generate HTML for text."""
|
||||
# Escape HTML entities
|
||||
text = self.content
|
||||
text = text.replace('&', '&')
|
||||
text = text.replace('<', '<')
|
||||
text = text.replace('>', '>')
|
||||
return text
|
||||
|
||||
|
||||
@dataclass
|
||||
class ParagraphNode(Node, CacheableMixin):
|
||||
"""Paragraph node."""
|
||||
type: NodeType = field(default=NodeType.PARAGRAPH, init=False)
|
||||
|
||||
def text(self) -> str:
|
||||
"""Extract paragraph text with intelligent spacing and caching."""
|
||||
def _generate_text():
|
||||
parts = []
|
||||
for i, child in enumerate(self.children):
|
||||
text = child.text()
|
||||
if text:
|
||||
# For the first child, just add the text
|
||||
if i == 0:
|
||||
parts.append(text)
|
||||
else:
|
||||
# For subsequent children, check if previous child had tail whitespace
|
||||
prev_child = self.children[i - 1]
|
||||
should_add_space = False
|
||||
|
||||
# Add space if previous child had tail whitespace
|
||||
if hasattr(prev_child, 'get_metadata') and prev_child.get_metadata('has_tail_whitespace'):
|
||||
should_add_space = True
|
||||
|
||||
# Add space if current text starts with space (preserve intended spacing)
|
||||
elif text.startswith(' '):
|
||||
should_add_space = True
|
||||
# Remove the leading space from text since we're adding it as separation
|
||||
text = text.lstrip()
|
||||
|
||||
# Add space if previous text ends with punctuation (sentence boundaries)
|
||||
elif parts and parts[-1].rstrip()[-1:] in '.!?:;':
|
||||
should_add_space = True
|
||||
|
||||
# Add space between adjacent inline elements if the current text starts with a letter/digit
|
||||
# This handles cases where whitespace was stripped but spacing is semantically important
|
||||
elif (text and text[0].isalpha() and
|
||||
parts and parts[-1] and not parts[-1].endswith(' ') and
|
||||
hasattr(child, 'get_metadata') and child.get_metadata('original_tag') in ['span', 'a', 'em', 'strong', 'i', 'b']):
|
||||
should_add_space = True
|
||||
|
||||
if should_add_space:
|
||||
parts.append(' ' + text)
|
||||
else:
|
||||
# Concatenate directly without space
|
||||
if parts:
|
||||
parts[-1] += text
|
||||
else:
|
||||
parts.append(text)
|
||||
|
||||
return ''.join(parts)
|
||||
|
||||
return self._get_cached_text(_generate_text)
|
||||
|
||||
def html(self) -> str:
|
||||
"""Generate paragraph HTML."""
|
||||
content = ''.join(child.html() for child in self.children)
|
||||
style_attr = self._generate_style_attr()
|
||||
return f'<p{style_attr}>{content}</p>'
|
||||
|
||||
def _generate_style_attr(self) -> str:
|
||||
"""Generate style attribute from style object."""
|
||||
if not self.style:
|
||||
return ''
|
||||
|
||||
styles = []
|
||||
if self.style.text_align:
|
||||
styles.append(f'text-align: {self.style.text_align}')
|
||||
if self.style.margin_top:
|
||||
styles.append(f'margin-top: {self.style.margin_top}px')
|
||||
if self.style.margin_bottom:
|
||||
styles.append(f'margin-bottom: {self.style.margin_bottom}px')
|
||||
|
||||
if styles:
|
||||
return f' style="{"; ".join(styles)}"'
|
||||
return ''
|
||||
|
||||
|
||||
@dataclass
|
||||
class HeadingNode(Node):
|
||||
"""Heading node with level."""
|
||||
type: NodeType = field(default=NodeType.HEADING, init=False)
|
||||
level: int = 1
|
||||
|
||||
def text(self) -> str:
|
||||
"""Extract heading text."""
|
||||
if isinstance(self.content, str):
|
||||
return self.content
|
||||
|
||||
parts = []
|
||||
for child in self.children:
|
||||
text = child.text()
|
||||
if text:
|
||||
parts.append(text)
|
||||
return ' '.join(parts)
|
||||
|
||||
def html(self) -> str:
|
||||
"""Generate heading HTML."""
|
||||
level = max(1, min(6, self.level)) # Ensure level is 1-6
|
||||
content = self.text()
|
||||
style_attr = self._generate_style_attr()
|
||||
return f'<h{level}{style_attr}>{content}</h{level}>'
|
||||
|
||||
def _generate_style_attr(self) -> str:
|
||||
"""Generate style attribute."""
|
||||
styles = []
|
||||
if self.style.text_align:
|
||||
styles.append(f'text-align: {self.style.text_align}')
|
||||
if self.style.color:
|
||||
styles.append(f'color: {self.style.color}')
|
||||
if styles:
|
||||
return f' style="{"; ".join(styles)}"'
|
||||
return ''
|
||||
|
||||
|
||||
@dataclass
|
||||
class ContainerNode(Node, CacheableMixin):
|
||||
"""Generic container node (div, section, etc.)."""
|
||||
type: NodeType = field(default=NodeType.CONTAINER, init=False)
|
||||
tag_name: str = 'div'
|
||||
|
||||
def text(self) -> str:
|
||||
"""Extract text from container with caching."""
|
||||
def _generate_text():
|
||||
parts = []
|
||||
for child in self.children:
|
||||
text = child.text()
|
||||
if text:
|
||||
parts.append(text)
|
||||
return '\n'.join(parts)
|
||||
|
||||
return self._get_cached_text(_generate_text)
|
||||
|
||||
def html(self) -> str:
|
||||
"""Generate container HTML."""
|
||||
content = '\n'.join(child.html() for child in self.children)
|
||||
style_attr = self._generate_style_attr()
|
||||
class_attr = f' class="{self.semantic_role}"' if self.semantic_role else ''
|
||||
return f'<{self.tag_name}{style_attr}{class_attr}>{content}</{self.tag_name}>'
|
||||
|
||||
def _generate_style_attr(self) -> str:
|
||||
"""Generate style attribute."""
|
||||
if not self.style:
|
||||
return ''
|
||||
|
||||
styles = []
|
||||
if self.style.margin_top:
|
||||
styles.append(f'margin-top: {self.style.margin_top}px')
|
||||
if self.style.margin_bottom:
|
||||
styles.append(f'margin-bottom: {self.style.margin_bottom}px')
|
||||
if self.style.padding_left:
|
||||
styles.append(f'padding-left: {self.style.padding_left}px')
|
||||
|
||||
if styles:
|
||||
return f' style="{"; ".join(styles)}"'
|
||||
return ''
|
||||
|
||||
|
||||
@dataclass
|
||||
class SectionNode(ContainerNode):
|
||||
"""Document section node."""
|
||||
type: NodeType = field(default=NodeType.SECTION, init=False)
|
||||
section_name: Optional[str] = None
|
||||
tag_name: str = field(default='section', init=False)
|
||||
|
||||
def __post_init__(self):
|
||||
if self.section_name:
|
||||
self.set_metadata('section_name', self.section_name)
|
||||
|
||||
|
||||
@dataclass
|
||||
class ListNode(Node):
|
||||
"""List node (ordered or unordered)."""
|
||||
type: NodeType = field(default=NodeType.LIST, init=False)
|
||||
ordered: bool = False
|
||||
|
||||
def text(self) -> str:
|
||||
"""Extract list text."""
|
||||
parts = []
|
||||
for i, child in enumerate(self.children):
|
||||
if self.ordered:
|
||||
prefix = f"{i+1}. "
|
||||
else:
|
||||
prefix = "• "
|
||||
text = child.text()
|
||||
if text:
|
||||
parts.append(f"{prefix}{text}")
|
||||
return '\n'.join(parts)
|
||||
|
||||
def html(self) -> str:
|
||||
"""Generate list HTML."""
|
||||
tag = 'ol' if self.ordered else 'ul'
|
||||
items = '\n'.join(child.html() for child in self.children)
|
||||
return f'<{tag}>\n{items}\n</{tag}>'
|
||||
|
||||
|
||||
@dataclass
|
||||
class ListItemNode(Node):
|
||||
"""List item node."""
|
||||
type: NodeType = field(default=NodeType.LIST_ITEM, init=False)
|
||||
|
||||
def text(self) -> str:
|
||||
"""Extract list item text."""
|
||||
parts = []
|
||||
for child in self.children:
|
||||
text = child.text()
|
||||
if text:
|
||||
parts.append(text)
|
||||
return ' '.join(parts)
|
||||
|
||||
def html(self) -> str:
|
||||
"""Generate list item HTML."""
|
||||
content = ''.join(child.html() for child in self.children)
|
||||
return f'<li>{content}</li>'
|
||||
|
||||
|
||||
@dataclass
|
||||
class LinkNode(Node):
|
||||
"""Hyperlink node."""
|
||||
type: NodeType = field(default=NodeType.LINK, init=False)
|
||||
href: Optional[str] = None
|
||||
title: Optional[str] = None
|
||||
|
||||
def text(self) -> str:
|
||||
"""Extract link text."""
|
||||
if isinstance(self.content, str):
|
||||
return self.content
|
||||
|
||||
parts = []
|
||||
for child in self.children:
|
||||
text = child.text()
|
||||
if text:
|
||||
parts.append(text)
|
||||
return ' '.join(parts)
|
||||
|
||||
def html(self) -> str:
|
||||
"""Generate link HTML."""
|
||||
content = self.text()
|
||||
href_attr = f' href="{self.href}"' if self.href else ''
|
||||
title_attr = f' title="{self.title}"' if self.title else ''
|
||||
return f'<a{href_attr}{title_attr}>{content}</a>'
|
||||
|
||||
|
||||
@dataclass
|
||||
class ImageNode(Node):
|
||||
"""Image node."""
|
||||
type: NodeType = field(default=NodeType.IMAGE, init=False)
|
||||
src: Optional[str] = None
|
||||
alt: Optional[str] = None
|
||||
width: Optional[int] = None
|
||||
height: Optional[int] = None
|
||||
|
||||
def text(self) -> str:
|
||||
"""Extract image alt text."""
|
||||
return self.alt or ''
|
||||
|
||||
def html(self) -> str:
|
||||
"""Generate image HTML."""
|
||||
src_attr = f' src="{self.src}"' if self.src else ''
|
||||
alt_attr = f' alt="{self.alt}"' if self.alt else ''
|
||||
width_attr = f' width="{self.width}"' if self.width else ''
|
||||
height_attr = f' height="{self.height}"' if self.height else ''
|
||||
return f'<img{src_attr}{alt_attr}{width_attr}{height_attr}>'
|
||||
Reference in New Issue
Block a user