Initial commit

2025-12-09 12:13:01 +01:00
commit 8e654ed209
13332 changed files with 2695056 additions and 0 deletions
--- a/venv/lib/python3.10/site-packages/edgar/documents/nodes.py
+++ b/venv/lib/python3.10/site-packages/edgar/documents/nodes.py
@@ -0,0 +1,456 @@
+"""
+Node hierarchy for the document tree.
+"""
+
+import uuid
+from abc import ABC, abstractmethod
+from dataclasses import dataclass, field
+from typing import List, Optional, Dict, Any, Callable, Iterator
+
+from edgar.documents.types import NodeType, SemanticType, Style
+from edgar.documents.cache_mixin import CacheableMixin
+
+
+@dataclass
+class Node(ABC):
+    """
+    Base node class for document tree.
+    
+    All nodes in the document inherit from this class and implement
+    the abstract methods for text and HTML generation.
+    """
+    
+    # Identity
+    id: str = field(default_factory=lambda: str(uuid.uuid4()))
+    type: NodeType = NodeType.DOCUMENT
+    
+    # Hierarchy
+    parent: Optional['Node'] = field(default=None, repr=False)
+    children: List['Node'] = field(default_factory=list, repr=False)
+    
+    # Content
+    content: Any = None
+    metadata: Dict[str, Any] = field(default_factory=dict)
+    style: Style = field(default_factory=Style)
+    
+    # Semantic info
+    semantic_type: Optional[SemanticType] = None
+    semantic_role: Optional[str] = None
+    
+    def add_child(self, child: 'Node') -> None:
+        """Add child node, maintaining parent reference."""
+        child.parent = self
+        self.children.append(child)
+    
+    def remove_child(self, child: 'Node') -> None:
+        """Remove child node."""
+        if child in self.children:
+            self.children.remove(child)
+            child.parent = None
+    
+    def insert_child(self, index: int, child: 'Node') -> None:
+        """Insert child at specific index."""
+        child.parent = self
+        self.children.insert(index, child)
+    
+    @abstractmethod
+    def text(self) -> str:
+        """Extract text content from node and its children."""
+        pass
+    
+    @abstractmethod
+    def html(self) -> str:
+        """Generate HTML representation of node."""
+        pass
+    
+    def find(self, predicate: Callable[['Node'], bool]) -> List['Node']:
+        """Find all nodes matching predicate."""
+        results = []
+        if predicate(self):
+            results.append(self)
+        for child in self.children:
+            results.extend(child.find(predicate))
+        return results
+    
+    def find_first(self, predicate: Callable[['Node'], bool]) -> Optional['Node']:
+        """Find first node matching predicate."""
+        if predicate(self):
+            return self
+        for child in self.children:
+            result = child.find_first(predicate)
+            if result:
+                return result
+        return None
+    
+    def xpath(self, expression: str) -> List['Node']:
+        """
+        Simple XPath-like node selection.
+        
+        Supports:
+        - //node_type - Find all nodes of type
+        - /node_type - Direct children of type
+        - [@attr=value] - Attribute matching
+        """
+        # Simple implementation - can be extended
+        if expression.startswith('//'):
+            node_type = expression[2:].lower()
+            return self.find(lambda n: n.type.name.lower() == node_type)
+        elif expression.startswith('/'):
+            node_type = expression[1:].lower()
+            return [c for c in self.children if c.type.name.lower() == node_type]
+        return []
+    
+    def walk(self) -> Iterator['Node']:
+        """Walk the tree depth-first."""
+        yield self
+        for child in self.children:
+            yield from child.walk()
+    
+    @property
+    def depth(self) -> int:
+        """Get depth of node in tree."""
+        depth = 0
+        current = self.parent
+        while current:
+            depth += 1
+            current = current.parent
+        return depth
+    
+    @property
+    def path(self) -> str:
+        """Get path from root to this node."""
+        parts = []
+        current = self
+        while current:
+            parts.append(current.type.name)
+            current = current.parent
+        return '/'.join(reversed(parts))
+    
+    def get_metadata(self, key: str, default: Any = None) -> Any:
+        """Get metadata value with default."""
+        return self.metadata.get(key, default)
+    
+    def set_metadata(self, key: str, value: Any) -> None:
+        """Set metadata value."""
+        self.metadata[key] = value
+    
+    def has_metadata(self, key: str) -> bool:
+        """Check if metadata key exists."""
+        return key in self.metadata
+
+
+@dataclass
+class DocumentNode(Node, CacheableMixin):
+    """Root document node."""
+    type: NodeType = field(default=NodeType.DOCUMENT, init=False)
+
+    def text(self) -> str:
+        """Extract all text from document with caching."""
+        def _generate_text():
+            parts = []
+            for child in self.children:
+                text = child.text()
+                if text:
+                    parts.append(text)
+            return '\n\n'.join(parts)
+
+        return self._get_cached_text(_generate_text)
+    
+    def html(self) -> str:
+        """Generate complete HTML document."""
+        body_content = '\n'.join(child.html() for child in self.children)
+        return f"""<!DOCTYPE html>
+<html>
+<head>
+    <meta charset="utf-8">
+    <title>Document</title>
+</head>
+<body>
+{body_content}
+</body>
+</html>"""
+
+
+@dataclass
+class TextNode(Node):
+    """Plain text content node."""
+    type: NodeType = field(default=NodeType.TEXT, init=False)
+    content: str = ""
+    
+    def text(self) -> str:
+        """Return text content."""
+        return self.content
+    
+    def html(self) -> str:
+        """Generate HTML for text."""
+        # Escape HTML entities
+        text = self.content
+        text = text.replace('&', '&amp;')
+        text = text.replace('<', '&lt;')
+        text = text.replace('>', '&gt;')
+        return text
+
+
+@dataclass
+class ParagraphNode(Node, CacheableMixin):
+    """Paragraph node."""
+    type: NodeType = field(default=NodeType.PARAGRAPH, init=False)
+
+    def text(self) -> str:
+        """Extract paragraph text with intelligent spacing and caching."""
+        def _generate_text():
+            parts = []
+            for i, child in enumerate(self.children):
+                text = child.text()
+                if text:
+                    # For the first child, just add the text
+                    if i == 0:
+                        parts.append(text)
+                    else:
+                        # For subsequent children, check if previous child had tail whitespace
+                        prev_child = self.children[i - 1]
+                        should_add_space = False
+
+                        # Add space if previous child had tail whitespace
+                        if hasattr(prev_child, 'get_metadata') and prev_child.get_metadata('has_tail_whitespace'):
+                            should_add_space = True
+
+                        # Add space if current text starts with space (preserve intended spacing)
+                        elif text.startswith(' '):
+                            should_add_space = True
+                            # Remove the leading space from text since we're adding it as separation
+                            text = text.lstrip()
+
+                        # Add space if previous text ends with punctuation (sentence boundaries)
+                        elif parts and parts[-1].rstrip()[-1:] in '.!?:;':
+                            should_add_space = True
+
+                        # Add space between adjacent inline elements if the current text starts with a letter/digit
+                        # This handles cases where whitespace was stripped but spacing is semantically important
+                        elif (text and text[0].isalpha() and
+                              parts and parts[-1] and not parts[-1].endswith(' ') and
+                              hasattr(child, 'get_metadata') and child.get_metadata('original_tag') in ['span', 'a', 'em', 'strong', 'i', 'b']):
+                            should_add_space = True
+
+                        if should_add_space:
+                            parts.append(' ' + text)
+                        else:
+                            # Concatenate directly without space
+                            if parts:
+                                parts[-1] += text
+                            else:
+                                parts.append(text)
+
+            return ''.join(parts)
+
+        return self._get_cached_text(_generate_text)
+    
+    def html(self) -> str:
+        """Generate paragraph HTML."""
+        content = ''.join(child.html() for child in self.children)
+        style_attr = self._generate_style_attr()
+        return f'<p{style_attr}>{content}</p>'
+    
+    def _generate_style_attr(self) -> str:
+        """Generate style attribute from style object."""
+        if not self.style:
+            return ''
+        
+        styles = []
+        if self.style.text_align:
+            styles.append(f'text-align: {self.style.text_align}')
+        if self.style.margin_top:
+            styles.append(f'margin-top: {self.style.margin_top}px')
+        if self.style.margin_bottom:
+            styles.append(f'margin-bottom: {self.style.margin_bottom}px')
+        
+        if styles:
+            return f' style="{"; ".join(styles)}"'
+        return ''
+
+
+@dataclass
+class HeadingNode(Node):
+    """Heading node with level."""
+    type: NodeType = field(default=NodeType.HEADING, init=False)
+    level: int = 1
+    
+    def text(self) -> str:
+        """Extract heading text."""
+        if isinstance(self.content, str):
+            return self.content
+        
+        parts = []
+        for child in self.children:
+            text = child.text()
+            if text:
+                parts.append(text)
+        return ' '.join(parts)
+    
+    def html(self) -> str:
+        """Generate heading HTML."""
+        level = max(1, min(6, self.level))  # Ensure level is 1-6
+        content = self.text()
+        style_attr = self._generate_style_attr()
+        return f'<h{level}{style_attr}>{content}</h{level}>'
+    
+    def _generate_style_attr(self) -> str:
+        """Generate style attribute."""
+        styles = []
+        if self.style.text_align:
+            styles.append(f'text-align: {self.style.text_align}')
+        if self.style.color:
+            styles.append(f'color: {self.style.color}')
+        if styles:
+            return f' style="{"; ".join(styles)}"'
+        return ''
+
+
+@dataclass
+class ContainerNode(Node, CacheableMixin):
+    """Generic container node (div, section, etc.)."""
+    type: NodeType = field(default=NodeType.CONTAINER, init=False)
+    tag_name: str = 'div'
+
+    def text(self) -> str:
+        """Extract text from container with caching."""
+        def _generate_text():
+            parts = []
+            for child in self.children:
+                text = child.text()
+                if text:
+                    parts.append(text)
+            return '\n'.join(parts)
+
+        return self._get_cached_text(_generate_text)
+    
+    def html(self) -> str:
+        """Generate container HTML."""
+        content = '\n'.join(child.html() for child in self.children)
+        style_attr = self._generate_style_attr()
+        class_attr = f' class="{self.semantic_role}"' if self.semantic_role else ''
+        return f'<{self.tag_name}{style_attr}{class_attr}>{content}</{self.tag_name}>'
+    
+    def _generate_style_attr(self) -> str:
+        """Generate style attribute."""
+        if not self.style:
+            return ''
+        
+        styles = []
+        if self.style.margin_top:
+            styles.append(f'margin-top: {self.style.margin_top}px')
+        if self.style.margin_bottom:
+            styles.append(f'margin-bottom: {self.style.margin_bottom}px')
+        if self.style.padding_left:
+            styles.append(f'padding-left: {self.style.padding_left}px')
+        
+        if styles:
+            return f' style="{"; ".join(styles)}"'
+        return ''
+
+
+@dataclass 
+class SectionNode(ContainerNode):
+    """Document section node."""
+    type: NodeType = field(default=NodeType.SECTION, init=False)
+    section_name: Optional[str] = None
+    tag_name: str = field(default='section', init=False)
+    
+    def __post_init__(self):
+        if self.section_name:
+            self.set_metadata('section_name', self.section_name)
+
+
+@dataclass
+class ListNode(Node):
+    """List node (ordered or unordered)."""
+    type: NodeType = field(default=NodeType.LIST, init=False)
+    ordered: bool = False
+    
+    def text(self) -> str:
+        """Extract list text."""
+        parts = []
+        for i, child in enumerate(self.children):
+            if self.ordered:
+                prefix = f"{i+1}. "
+            else:
+                prefix = "• "
+            text = child.text()
+            if text:
+                parts.append(f"{prefix}{text}")
+        return '\n'.join(parts)
+    
+    def html(self) -> str:
+        """Generate list HTML."""
+        tag = 'ol' if self.ordered else 'ul'
+        items = '\n'.join(child.html() for child in self.children)
+        return f'<{tag}>\n{items}\n</{tag}>'
+
+
+@dataclass
+class ListItemNode(Node):
+    """List item node."""
+    type: NodeType = field(default=NodeType.LIST_ITEM, init=False)
+    
+    def text(self) -> str:
+        """Extract list item text."""
+        parts = []
+        for child in self.children:
+            text = child.text()
+            if text:
+                parts.append(text)
+        return ' '.join(parts)
+    
+    def html(self) -> str:
+        """Generate list item HTML."""
+        content = ''.join(child.html() for child in self.children)
+        return f'<li>{content}</li>'
+
+
+@dataclass
+class LinkNode(Node):
+    """Hyperlink node."""
+    type: NodeType = field(default=NodeType.LINK, init=False)
+    href: Optional[str] = None
+    title: Optional[str] = None
+    
+    def text(self) -> str:
+        """Extract link text."""
+        if isinstance(self.content, str):
+            return self.content
+        
+        parts = []
+        for child in self.children:
+            text = child.text()
+            if text:
+                parts.append(text)
+        return ' '.join(parts)
+    
+    def html(self) -> str:
+        """Generate link HTML."""
+        content = self.text()
+        href_attr = f' href="{self.href}"' if self.href else ''
+        title_attr = f' title="{self.title}"' if self.title else ''
+        return f'<a{href_attr}{title_attr}>{content}</a>'
+
+
+@dataclass
+class ImageNode(Node):
+    """Image node."""
+    type: NodeType = field(default=NodeType.IMAGE, init=False)
+    src: Optional[str] = None
+    alt: Optional[str] = None
+    width: Optional[int] = None
+    height: Optional[int] = None
+    
+    def text(self) -> str:
+        """Extract image alt text."""
+        return self.alt or ''
+    
+    def html(self) -> str:
+        """Generate image HTML."""
+        src_attr = f' src="{self.src}"' if self.src else ''
+        alt_attr = f' alt="{self.alt}"' if self.alt else ''
+        width_attr = f' width="{self.width}"' if self.width else ''
+        height_attr = f' height="{self.height}"' if self.height else ''
+        return f'<img{src_attr}{alt_attr}{width_attr}{height_attr}>'