Initial commit

2025-12-09 12:13:01 +01:00
commit 8e654ed209
13332 changed files with 2695056 additions and 0 deletions
--- a/venv/lib/python3.10/site-packages/edgar/documents/processors/init.py
+++ b/venv/lib/python3.10/site-packages/edgar/documents/processors/init.py
@@ -0,0 +1,11 @@
+"""
+Document processors for preprocessing and postprocessing.
+"""
+
+from edgar.documents.processors.preprocessor import HTMLPreprocessor
+from edgar.documents.processors.postprocessor import DocumentPostprocessor
+
+__all__ = [
+    'HTMLPreprocessor',
+    'DocumentPostprocessor'
+]
--- a/venv/lib/python3.10/site-packages/edgar/documents/processors/pycache/init.cpython-310.pyc
+++ b/venv/lib/python3.10/site-packages/edgar/documents/processors/pycache/init.cpython-310.pyc
--- a/venv/lib/python3.10/site-packages/edgar/documents/processors/pycache/postprocessor.cpython-310.pyc
+++ b/venv/lib/python3.10/site-packages/edgar/documents/processors/pycache/postprocessor.cpython-310.pyc
--- a/venv/lib/python3.10/site-packages/edgar/documents/processors/pycache/preprocessor.cpython-310.pyc
+++ b/venv/lib/python3.10/site-packages/edgar/documents/processors/pycache/preprocessor.cpython-310.pyc
--- a/venv/lib/python3.10/site-packages/edgar/documents/processors/postprocessor.py
+++ b/venv/lib/python3.10/site-packages/edgar/documents/processors/postprocessor.py
@@ -0,0 +1,283 @@
+"""
+Document postprocessor for final processing after parsing.
+"""
+
+from typing import List, Set
+from edgar.documents.config import ParserConfig
+from edgar.documents.document import Document
+from edgar.documents.nodes import Node, TextNode, ParagraphNode, HeadingNode
+from edgar.documents.types import NodeType
+
+
+class DocumentPostprocessor:
+    """
+    Postprocesses parsed documents to improve quality.
+    
+    Handles:
+    - Adjacent node merging
+    - Empty node removal
+    - Heading level normalization
+    - Section detection enhancement
+    - Metadata enrichment
+    """
+    
+    def __init__(self, config: ParserConfig):
+        """
+        Initialize postprocessor with configuration.
+
+        Args:
+            config: Parser configuration. The methods of this class read its
+                ``merge_adjacent_nodes``, ``detect_sections`` and
+                ``eager_section_extraction`` attributes.
+        """
+        self.config = config
+    
+    def process(self, document: Document) -> Document:
+        """
+        Postprocess document.
+        
+        Args:
+            document: Parsed document
+            
+        Returns:
+            Processed document
+        """
+        # Remove empty nodes
+        self._remove_empty_nodes(document.root)
+        
+        # Merge adjacent text nodes if configured
+        if self.config.merge_adjacent_nodes:
+            self._merge_adjacent_nodes(document.root)
+        
+        # Normalize heading levels
+        self._normalize_heading_levels(document.root)
+        
+        # Enhance section detection if configured
+        if self.config.detect_sections:
+            self._enhance_sections(document)
+        
+        # Add document statistics
+        self._add_statistics(document)
+        
+        # Validate document structure
+        self._validate_structure(document)
+        
+        return document
+    
+    def _remove_empty_nodes(self, node: Node):
+        """Remove empty nodes from tree."""
+        # Process children first (bottom-up)
+        children_to_remove = []
+        
+        for child in node.children:
+            self._remove_empty_nodes(child)
+            
+            # Check if child is empty
+            if self._is_empty_node(child):
+                children_to_remove.append(child)
+        
+        # Remove empty children
+        for child in children_to_remove:
+            node.remove_child(child)
+    
+    def _is_empty_node(self, node: Node) -> bool:
+        """Check if node is empty and can be removed."""
+        # Never remove table nodes
+        if node.type == NodeType.TABLE:
+            return False
+        
+        # Never remove nodes with metadata
+        if node.metadata:
+            return False
+        
+        # Check text nodes
+        if isinstance(node, TextNode):
+            return not node.text().strip()
+        
+        # Check other nodes with text content
+        if hasattr(node, 'content') and isinstance(node.content, str):
+            return not node.content.strip()
+        
+        # Check container nodes
+        if not node.children:
+            # Empty container with no children
+            return True
+        
+        return False
+    
+    def _merge_adjacent_nodes(self, node: Node):
+        """Merge adjacent text nodes with similar properties."""
+        if not node.children:
+            return
+        
+        # Process children first
+        for child in node.children:
+            self._merge_adjacent_nodes(child)
+        
+        # Merge adjacent text nodes
+        merged_children = []
+        i = 0
+        
+        while i < len(node.children):
+            current = node.children[i]
+            
+            # Look for mergeable nodes
+            if self._can_merge(current):
+                # Collect all adjacent mergeable nodes
+                merge_group = [current]
+                j = i + 1
+                
+                while j < len(node.children) and self._can_merge_with(current, node.children[j]):
+                    merge_group.append(node.children[j])
+                    j += 1
+                
+                # Merge if we have multiple nodes
+                if len(merge_group) > 1:
+                    merged = self._merge_nodes(merge_group)
+                    merged_children.append(merged)
+                    i = j
+                else:
+                    merged_children.append(current)
+                    i += 1
+            else:
+                merged_children.append(current)
+                i += 1
+        
+        # Update children
+        node.children = merged_children
+        
+        # Update parent references
+        for child in node.children:
+            child.parent = node
+    
+    def _can_merge(self, node: Node) -> bool:
+        """Check if node can be merged."""
+        # Only merge TextNodes, not ParagraphNodes
+        return isinstance(node, TextNode) and not node.metadata
+    
+    def _can_merge_with(self, node1: Node, node2: Node) -> bool:
+        """Check if two nodes can be merged."""
+        # Must be same type
+        if type(node1) != type(node2):
+            return False
+        
+        # Must have compatible styles
+        if not self._compatible_styles(node1.style, node2.style):
+            return False
+        
+        # Must not have metadata
+        if node1.metadata or node2.metadata:
+            return False
+        
+        return True
+    
+    def _compatible_styles(self, style1, style2) -> bool:
+        """Check if two styles are compatible for merging."""
+        # For now, just check key properties
+        return (
+            style1.font_size == style2.font_size and
+            style1.font_weight == style2.font_weight and
+            style1.text_align == style2.text_align
+        )
+    
+    def _merge_nodes(self, nodes: List[Node]) -> Node:
+        """Merge multiple nodes into one."""
+        if not nodes:
+            return None
+        
+        # Use first node as base
+        merged = nodes[0]
+        
+        # Merge content
+        if isinstance(merged, TextNode):
+            texts = [n.text() for n in nodes]
+            merged.content = '\n'.join(texts)
+        elif isinstance(merged, ParagraphNode):
+            # Merge all children
+            for node in nodes[1:]:
+                merged.children.extend(node.children)
+        
+        return merged
+    
+    def _normalize_heading_levels(self, node: Node):
+        """Normalize heading levels to ensure proper hierarchy."""
+        # Collect all headings
+        headings = []
+        self._collect_headings(node, headings)
+        
+        if not headings:
+            return
+        
+        # Analyze heading structure
+        levels_used = set(h.level for h in headings)
+        
+        # If we're missing level 1, promote headings
+        if 1 not in levels_used and levels_used:
+            min_level = min(levels_used)
+            adjustment = min_level - 1
+            
+            for heading in headings:
+                heading.level = max(1, heading.level - adjustment)
+    
+    def _collect_headings(self, node: Node, headings: List[HeadingNode]):
+        """Collect all heading nodes."""
+        if isinstance(node, HeadingNode):
+            headings.append(node)
+        
+        for child in node.children:
+            self._collect_headings(child, headings)
+    
+    def _enhance_sections(self, document: Document):
+        """Enhance section detection and metadata."""
+        # Only extract sections eagerly if configured to do so
+        if not self.config.eager_section_extraction:
+            return
+            
+        # Force section extraction to populate cache
+        _ = document.sections
+        
+        # Add section metadata to nodes
+        for section_name, section in document.sections.items():
+            # Add section name to all nodes in section
+            for node in section.node.walk():
+                node.set_metadata('section', section_name)
+    
+    def _add_statistics(self, document: Document):
+        """Add document statistics to metadata."""
+        stats = {
+            'node_count': sum(1 for _ in document.root.walk()),
+            'text_length': len(document.text()),
+            'table_count': len(document.tables),
+            'heading_count': len(document.headings),
+        }
+        
+        # Only add section count if sections were extracted
+        if self.config.eager_section_extraction:
+            stats['section_count'] = len(document.sections)
+        
+        document.metadata.statistics = stats
+    
+    def _validate_structure(self, document: Document):
+        """Validate document structure and fix issues."""
+        issues = []
+        
+        # Check for orphaned nodes
+        for node in document.root.walk():
+            if node != document.root and node.parent is None:
+                issues.append(f"Orphaned node: {node.type}")
+                # Fix by adding to root
+                document.root.add_child(node)
+        
+        # Check for circular references
+        visited = set()
+        
+        def check_cycles(node: Node, path: Set[str]):
+            if node.id in path:
+                issues.append(f"Circular reference detected: {node.type}")
+                return
+            
+            path.add(node.id)
+            visited.add(node.id)
+            
+            for child in node.children:
+                if child.id not in visited:
+                    check_cycles(child, path.copy())
+        
+        check_cycles(document.root, set())
+        
+        # Store validation results
+        if issues:
+            document.metadata.validation_issues = issues
--- a/venv/lib/python3.10/site-packages/edgar/documents/processors/preprocessor.py
+++ b/venv/lib/python3.10/site-packages/edgar/documents/processors/preprocessor.py
@@ -0,0 +1,242 @@
+"""
+HTML preprocessor for cleaning and normalizing HTML before parsing.
+"""
+
+import re
+
+from edgar.documents.config import ParserConfig
+from edgar.documents.utils.html_utils import remove_xml_declaration
+
+
+class HTMLPreprocessor:
+    """
+    Preprocesses HTML to fix common issues and normalize content.
+    
+    Handles:
+    - Character encoding issues
+    - Malformed HTML
+    - Excessive whitespace
+    - Script/style removal
+    - Entity normalization
+    """
+    
+    def __init__(self, config: ParserConfig):
+        """
+        Initialize preprocessor with configuration.
+
+        Args:
+            config: Parser configuration. ``process`` reads its
+                ``preserve_whitespace`` attribute to decide whether
+                whitespace normalization runs.
+        """
+        self.config = config
+        
+        # Pre-compile regex patterns for performance
+        # (built once per instance and looked up by name in the helpers)
+        self._compiled_patterns = self._compile_patterns()
+    
+    def _compile_patterns(self):
+        """
+        Pre-compile frequently used regex patterns.
+
+        Returns:
+            Dict mapping a pattern name to its compiled regular expression.
+            The helper methods of this class look patterns up by these names.
+        """
+        return {
+            # Encoding and cleanup
+            # ASCII control characters except tab, line feed and carriage
+            # return, plus DEL (0x7F)
+            'control_chars': re.compile(r'[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]'),
+
+            # Script/style removal
+            # Non-greedy with DOTALL: each element is matched together with
+            # its content, across line breaks, up to the first closing tag
+            'script_tags': re.compile(r'<script[^>]*>.*?</script>', re.IGNORECASE | re.DOTALL),
+            'style_tags': re.compile(r'<style[^>]*>.*?</style>', re.IGNORECASE | re.DOTALL),
+            'link_tags': re.compile(r'<link[^>]*>', re.IGNORECASE),
+            'comments': re.compile(r'<!--.*?-->', re.DOTALL),
+            'ix_hidden': re.compile(r'<ix:hidden[^>]*>.*?</ix:hidden>', re.IGNORECASE | re.DOTALL),
+            'ix_header': re.compile(r'<ix:header[^>]*>.*?</ix:header>', re.IGNORECASE | re.DOTALL),
+
+            # Malformed tags
+            # NOTE: 'br_tags' and 'hr_tags' require '>' directly after the tag
+            # name, so they match only the bare forms <br> and <hr>
+            'br_tags': re.compile(r'<br(?![^>]*/)>', re.IGNORECASE),
+            # <img ...> / <input ...> whose last character before '>' is
+            # not '/'; group 1 captures everything after the tag name
+            'img_tags': re.compile(r'<img([^>]+)(?<!/)>', re.IGNORECASE),
+            'input_tags': re.compile(r'<input([^>]+)(?<!/)>', re.IGNORECASE),
+            'hr_tags': re.compile(r'<hr(?![^>]*/)>', re.IGNORECASE),
+            # Directly nested attribute-less paragraph tags
+            'nested_p_open': re.compile(r'<p>\s*<p>', re.IGNORECASE),
+            'nested_p_close': re.compile(r'</p>\s*</p>', re.IGNORECASE),
+
+            # Whitespace normalization
+            'multiple_spaces': re.compile(r'[ \t]+'),
+            'multiple_newlines': re.compile(r'\n{3,}'),
+            # Any tag (group 1) together with the whitespace on both sides
+            'spaces_around_tags': re.compile(r'\s*(<[^>]+>)\s*'),
+
+            # Block element newlines - combined pattern for opening tags
+            'block_open_tags': re.compile(
+                r'(<(?:div|p|h[1-6]|table|tr|ul|ol|li|blockquote)[^>]*>)',
+                re.IGNORECASE
+            ),
+            # Block element newlines - combined pattern for closing tags
+            'block_close_tags': re.compile(
+                r'(</(?:div|p|h[1-6]|table|tr|ul|ol|li|blockquote)>)',
+                re.IGNORECASE
+            ),
+
+            # Empty tags removal - combined pattern for all removable tags
+            # NOTE: the closing tag name is not tied to the opening tag name;
+            # any opening tag from the list followed by any closing tag from
+            # the list (with only whitespace in between) matches
+            'empty_tags': re.compile(
+                r'<(?:span|div|p|font|b|i|u|strong|em)\b[^>]*>\s*</(?:span|div|p|font|b|i|u|strong|em)>',
+                re.IGNORECASE
+            ),
+            # Self-closing form of the same tags, plus trailing whitespace
+            'empty_self_closing': re.compile(
+                r'<(?:span|div|p|font|b|i|u|strong|em)\b[^>]*/>\s*',
+                re.IGNORECASE
+            ),
+
+            # Common issues
+            # Three or more consecutive <br> tags (whitespace allowed between)
+            'multiple_br': re.compile(r'(<br\s*/?>[\s\n]*){3,}', re.IGNORECASE),
+            # Whitespace directly before punctuation; group 1 is the mark
+            'space_before_punct': re.compile(r'\s+([.,;!?])'),
+            # Punctuation (group 1) directly followed by an ASCII capital
+            # (group 2). NOTE: the pattern itself does not distinguish text
+            # from markup, so it also matches e.g. the ';' ending an entity
+            'missing_space_after_punct': re.compile(r'([.,;!?])([A-Z])'),
+        }
+    
+    def process(self, html: str) -> str:
+        """
+        Preprocess HTML content.
+        
+        Args:
+            html: Raw HTML content
+            
+        Returns:
+            Cleaned HTML ready for parsing
+        """
+        # Remove BOM if present
+        if html.startswith('\ufeff'):
+            html = html[1:]
+        
+        # Remove XML declaration if present
+        html = remove_xml_declaration(html)
+        
+        # Fix common character encoding issues
+        html = self._fix_encoding_issues(html)
+        
+        # Remove script and style tags
+        html = self._remove_script_style(html)
+        
+        # Normalize entities
+        html = self._normalize_entities(html)
+        
+        # Fix malformed tags
+        html = self._fix_malformed_tags(html)
+        
+        # Normalize whitespace if not preserving
+        if not self.config.preserve_whitespace:
+            html = self._normalize_whitespace(html)
+        
+        # Remove empty tags
+        html = self._remove_empty_tags(html)
+        
+        # Fix common HTML issues
+        html = self._fix_common_issues(html)
+        
+        return html
+    
+    def _fix_encoding_issues(self, html: str) -> str:
+        """Fix common character encoding issues."""
+        # Replace Windows-1252 characters with Unicode equivalents
+        replacements = {
+            '\x91': "'",  # Left single quote
+            '\x92': "'",  # Right single quote
+            '\x93': '"',  # Left double quote
+            '\x94': '"',  # Right double quote
+            '\x95': '•',  # Bullet
+            '\x96': '–',  # En dash
+            '\x97': '—',  # Em dash
+            '\xa0': ' ',  # Non-breaking space
+        }
+        
+        for old, new in replacements.items():
+            html = html.replace(old, new)
+        
+        # Remove other control characters
+        html = self._compiled_patterns['control_chars'].sub('', html)
+        
+        return html
+    
+    def _remove_script_style(self, html: str) -> str:
+        """Remove script and style tags with content."""
+        # Use pre-compiled patterns for better performance
+        html = self._compiled_patterns['script_tags'].sub('', html)
+        html = self._compiled_patterns['style_tags'].sub('', html)
+        html = self._compiled_patterns['link_tags'].sub('', html)
+        html = self._compiled_patterns['comments'].sub('', html)
+        html = self._compiled_patterns['ix_hidden'].sub('', html)
+        html = self._compiled_patterns['ix_header'].sub('', html)
+
+        return html
+    
+    def _normalize_entities(self, html: str) -> str:
+        """Normalize HTML entities."""
+        # Common entity replacements
+        entities = {
+            '&nbsp;': ' ',
+            '&ensp;': ' ',
+            '&emsp;': '  ',
+            '&thinsp;': ' ',
+            '&#160;': ' ',
+            '&#32;': ' ',
+            '&zwj;': '',  # Zero-width joiner
+            '&zwnj;': '',  # Zero-width non-joiner
+            '&#8203;': '',  # Zero-width space
+        }
+        
+        for entity, replacement in entities.items():
+            html = html.replace(entity, replacement)
+        
+        # Fix double-encoded entities
+        html = html.replace('&amp;amp;', '&amp;')
+        html = html.replace('&amp;nbsp;', ' ')
+        html = html.replace('&amp;lt;', '&lt;')
+        html = html.replace('&amp;gt;', '&gt;')
+        
+        return html
+    
+    def _fix_malformed_tags(self, html: str) -> str:
+        """Fix common malformed tag issues."""
+        # Use pre-compiled patterns for better performance
+        html = self._compiled_patterns['br_tags'].sub('<br/>', html)
+        html = self._compiled_patterns['img_tags'].sub(r'<img\1/>', html)
+        html = self._compiled_patterns['input_tags'].sub(r'<input\1/>', html)
+        html = self._compiled_patterns['hr_tags'].sub('<hr/>', html)
+        html = self._compiled_patterns['nested_p_open'].sub('<p>', html)
+        html = self._compiled_patterns['nested_p_close'].sub('</p>', html)
+
+        return html
+    
+    def _normalize_whitespace(self, html: str) -> str:
+        """Normalize whitespace in HTML."""
+        # Use pre-compiled patterns for better performance
+        # Replace multiple spaces with single space
+        html = self._compiled_patterns['multiple_spaces'].sub(' ', html)
+
+        # Replace multiple newlines with double newline
+        html = self._compiled_patterns['multiple_newlines'].sub('\n\n', html)
+
+        # Remove spaces around tags
+        html = self._compiled_patterns['spaces_around_tags'].sub(r'\1', html)
+
+        # Add newlines around block elements for readability
+        # Using combined patterns instead of looping over individual tags
+        html = self._compiled_patterns['block_open_tags'].sub(r'\n\1', html)
+        html = self._compiled_patterns['block_close_tags'].sub(r'\1\n', html)
+
+        # Clean up excessive newlines (apply again after adding newlines)
+        html = self._compiled_patterns['multiple_newlines'].sub('\n\n', html)
+
+        return html.strip()
+    
+    def _remove_empty_tags(self, html: str) -> str:
+        """Remove empty tags that don't contribute content."""
+        # Use pre-compiled combined patterns instead of looping
+        html = self._compiled_patterns['empty_tags'].sub('', html)
+        html = self._compiled_patterns['empty_self_closing'].sub('', html)
+
+        return html
+    
+    def _fix_common_issues(self, html: str) -> str:
+        """Fix other common HTML issues."""
+        # Use pre-compiled patterns for better performance
+        html = self._compiled_patterns['multiple_br'].sub('<br/><br/>', html)
+        html = self._compiled_patterns['space_before_punct'].sub(r'\1', html)
+        html = self._compiled_patterns['missing_space_after_punct'].sub(r'\1 \2', html)
+
+        # Remove zero-width spaces (simple string replace is faster than regex)
+        html = html.replace('\u200b', '')
+        html = html.replace('\ufeff', '')
+
+        # Fix common typos in tags (simple string replace is faster than regex)
+        html = html.replace('<tabel', '<table')
+        html = html.replace('</tabel>', '</table>')
+
+        return html