Initial commit

2025-12-09 12:13:01 +01:00
commit 8e654ed209
13332 changed files with 2695056 additions and 0 deletions
--- a/venv/lib/python3.10/site-packages/edgar/documents/migration.py
+++ b/venv/lib/python3.10/site-packages/edgar/documents/migration.py
@@ -0,0 +1,318 @@
+"""
+Migration and compatibility layer for transitioning from old parser to new.
+
+NOTE: This compatibility layer is documented for user migration from v1.x → v2.0
+It is intentionally not used internally but kept for user convenience.
+Do not remove without versioning consideration.
+"""
+
+from typing import Optional, List, Dict, Any
+import warnings
+from edgar.documents import HTMLParser, Document, ParserConfig
+from edgar.documents.search import DocumentSearch
+
+
+class LegacyHTMLDocument:
+    """
+    Compatibility wrapper that mimics the old Document API.
+
+    Wraps a new-style ``Document`` and exposes the old attribute and method
+    names on top of it. Most members issue a ``DeprecationWarning`` that
+    names the new-API replacement before delegating to the wrapped document.
+    """
+
+    def __init__(self, new_document: Document):
+        """Store the wrapped new-style document and enable warnings."""
+        self._doc = new_document
+        # When False, _deprecation_warning() becomes a no-op.
+        self._warn_on_use = True
+
+    def _deprecation_warning(self, old_method: str, new_method: Optional[str] = None):
+        """
+        Issue a deprecation warning for an old-API member.
+
+        Args:
+            old_method: Old member name, rendered after ``Document.``
+            new_method: Replacement to suggest; omitted from the message if falsy
+        """
+        if not self._warn_on_use:
+            return
+
+        msg = f"Document.{old_method} is deprecated."
+        if new_method:
+            msg += f" Use {new_method} instead."
+        # stacklevel=3 skips this helper and the wrapper member that called
+        # it, so the warning is attributed to the user's call site.
+        warnings.warn(msg, DeprecationWarning, stacklevel=3)
+
+    @property
+    def text(self) -> str:
+        """Get document text (old API); delegates to ``Document.text()``."""
+        self._deprecation_warning("text", "Document.text()")
+        return self._doc.text()
+
+    def get_text(self, clean: bool = True) -> str:
+        """
+        Get text with options (old API).
+
+        Args:
+            clean: Accepted for signature compatibility only; it is not
+                forwarded, and the result is always ``Document.text()``
+        """
+        self._deprecation_warning("get_text()", "Document.text()")
+        return self._doc.text()
+
+    @property
+    def tables(self) -> List[Any]:
+        """Get tables (old API); returns the wrapped document's ``tables``."""
+        self._deprecation_warning("tables", "Document.tables")
+        return self._doc.tables
+
+    def find_all(self, tag: str) -> List[Any]:
+        """
+        Find elements by tag (old API).
+
+        Only heading tags (h1-h6), 'p' and 'table' are mapped to node types.
+        Every heading tag maps to the same node type, so the heading level is
+        not used as a filter. Any other tag yields an empty list.
+
+        Args:
+            tag: HTML tag name (case-insensitive)
+
+        Returns:
+            Result of ``Document.root.find()`` for the mapped node type,
+            or an empty list when the tag is not mapped
+        """
+        self._deprecation_warning("find_all()", "Document.root.find()")
+
+        # Map old tag names to node types
+        from edgar.documents.types import NodeType
+
+        tag_map = {
+            'h1': NodeType.HEADING,
+            'h2': NodeType.HEADING,
+            'h3': NodeType.HEADING,
+            'h4': NodeType.HEADING,
+            'h5': NodeType.HEADING,
+            'h6': NodeType.HEADING,
+            'p': NodeType.PARAGRAPH,
+            'table': NodeType.TABLE,
+        }
+
+        node_type = tag_map.get(tag.lower())
+        # Compare against None explicitly so that a falsy enum member could
+        # never be mistaken for "tag not mapped".
+        if node_type is None:
+            return []
+
+        return self._doc.root.find(lambda n: n.type == node_type)
+
+    def search(self, pattern: str) -> List[str]:
+        """
+        Search document (old API).
+
+        Runs ``DocumentSearch(document).search(pattern)`` and returns the
+        ``text`` attribute of each result.
+        """
+        self._deprecation_warning("search()", "DocumentSearch.search()")
+
+        search = DocumentSearch(self._doc)
+        return [r.text for r in search.search(pattern)]
+
+    @property
+    def sections(self) -> Dict[str, Any]:
+        """
+        Get sections (old API).
+
+        Converts each section of the wrapped document into a plain dict with
+        'title', 'text', 'start' and 'end' keys. Unlike the other members,
+        this accessor does not issue a deprecation warning.
+        """
+        # Convert new sections to old format
+        return {
+            name: {
+                'title': section.title,
+                'text': section.text(),
+                'start': section.start_offset,
+                'end': section.end_offset,
+            }
+            for name, section in self._doc.sections.items()
+        }
+
+    def to_markdown(self) -> str:
+        """Convert to markdown (old API) via ``MarkdownRenderer().render()``."""
+        self._deprecation_warning("to_markdown()", "MarkdownRenderer.render()")
+
+        from edgar.documents.renderers import MarkdownRenderer
+        renderer = MarkdownRenderer()
+        return renderer.render(self._doc)
+
+
+class LegacySECHTMLParser:
+    """
+    Stand-in for the old SECHTMLParser built on the new HTMLParser.
+
+    Old-style config dicts are translated to a ParserConfig, parsing is
+    delegated to HTMLParser, and results are wrapped in LegacyHTMLDocument.
+    """
+
+    def __init__(self, config: Optional[Dict[str, Any]] = None):
+        """Create the underlying HTMLParser from an optional old-style config."""
+        self._parser = HTMLParser(self._convert_config(config))
+        self._warn_on_use = True
+
+    def _convert_config(self, old_config: Optional[Dict[str, Any]]) -> ParserConfig:
+        """
+        Translate an old-style config dict into a ParserConfig.
+
+        A missing or empty dict yields a default ParserConfig. Only the keys
+        listed in the mapping below are copied; other keys are ignored.
+        """
+        converted = ParserConfig()
+        if not old_config:
+            return converted
+
+        # (old config key, ParserConfig attribute) pairs
+        key_to_attr = (
+            ('clean_text', 'clean_text'),
+            ('extract_tables', 'table_extraction'),
+            ('preserve_layout', 'preserve_whitespace'),
+        )
+        for legacy_key, attr_name in key_to_attr:
+            if legacy_key in old_config:
+                setattr(converted, attr_name, old_config[legacy_key])
+
+        return converted
+
+    def parse(self, html: str) -> LegacyHTMLDocument:
+        """Parse an HTML string (old API) and wrap the resulting document."""
+        if self._warn_on_use:
+            message = "SECHTMLParser is deprecated. Use HTMLParser instead."
+            # stacklevel=2 attributes the warning to the caller of parse().
+            warnings.warn(message, DeprecationWarning, stacklevel=2)
+
+        return LegacyHTMLDocument(self._parser.parse(html))
+
+    def parse_file(self, filepath: str) -> LegacyHTMLDocument:
+        """Parse an HTML file (old API) and wrap the resulting document."""
+        if self._warn_on_use:
+            message = (
+                "SECHTMLParser.parse_file() is deprecated. "
+                "Use HTMLParser.parse_file() instead."
+            )
+            # stacklevel=2 attributes the warning to the caller of parse_file().
+            warnings.warn(message, DeprecationWarning, stacklevel=2)
+
+        return LegacyHTMLDocument(self._parser.parse_file(filepath))
+
+
+def migrate_parser_usage(code: str) -> str:
+    """
+    Helper to migrate code from old parser to new.
+
+    This is a purely textual rewrite (no parsing of the input), so the
+    result should be reviewed by hand. Running it again on already-migrated
+    code leaves ``document.text()`` calls untouched.
+
+    Args:
+        code: Python code using old parser
+
+    Returns:
+        Updated code using new parser
+    """
+    import re
+
+    literal_replacements = [
+        # Import statements
+        ("from edgar.files.html import SECHTMLParser",
+         "from edgar.documents import HTMLParser"),
+
+        ("from edgar.files.html import Document",
+         "from edgar.documents import Document"),
+
+        # Class instantiation
+        ("SECHTMLParser(", "HTMLParser("),
+
+        # Method calls
+        ("document.get_text(", "document.text("),
+        ("document.find_all(", "document.root.find(lambda n: n.tag == "),
+        ("document.to_markdown(", "MarkdownRenderer().render(document"),
+
+        # Config changes
+        ("extract_tables=", "table_extraction="),
+        ("preserve_layout=", "preserve_whitespace="),
+    ]
+
+    migrated = code
+    for old, new in literal_replacements:
+        migrated = migrated.replace(old, new)
+
+    # Turn the old ``document.text`` property access into a call. The
+    # lookahead skips occurrences that are already calls (``document.text(``)
+    # and longer attribute names (``document.text_length``), which a plain
+    # substring replace would corrupt.
+    migrated = re.sub(
+        r"document\.text(?!\w|[ \t]*\()",
+        "document.text()",
+        migrated,
+    )
+
+    return migrated
+
+
+class MigrationGuide:
+    """
+    Static helpers that describe how to move from the old parser API
+    to the new one.
+    """
+
+    @staticmethod
+    def check_compatibility(old_parser_instance) -> Dict[str, Any]:
+        """
+        Report migration compatibility for an old parser instance.
+
+        The argument is not inspected; the same fixed report is built on
+        every call.
+
+        Returns:
+            Dict with 'can_migrate', 'warnings' and 'recommendations' keys
+        """
+        recommendations = [
+            "Replace SECHTMLParser with HTMLParser",
+            "Update document.text to document.text()",
+            "Use DocumentSearch for search functionality",
+            "Use MarkdownRenderer for markdown conversion",
+        ]
+        report: Dict[str, Any] = {'can_migrate': True, 'warnings': []}
+        report['recommendations'] = recommendations
+        return report
+
+    @staticmethod
+    def print_migration_guide():
+        """Write the migration guide text to standard output."""
+        print("""
+        HTML Parser Migration Guide
+        ==========================
+        
+        The new HTML parser provides significant improvements:
+        - 10x performance improvement
+        - Better table parsing
+        - Reliable section detection
+        - Advanced search capabilities
+        
+        Key Changes:
+        -----------
+        
+        1. Imports:
+           OLD: from edgar.files.html import SECHTMLParser, Document
+           NEW: from edgar.documents import HTMLParser, Document
+        
+        2. Parser Creation:
+           OLD: parser = SECHTMLParser()
+           NEW: parser = HTMLParser()
+        
+        3. Document Text:
+           OLD: document.text or document.get_text()
+           NEW: document.text()
+        
+        4. Search:
+           OLD: document.search(pattern)
+           NEW: search = DocumentSearch(document)
+                results = search.search(pattern)
+        
+        5. Tables:
+           OLD: document.tables
+           NEW: document.tables (same, but returns richer TableNode objects)
+        
+        6. Sections:
+           OLD: document.sections
+           NEW: document.sections (returns Section objects with more features)
+        
+        7. Markdown:
+           OLD: document.to_markdown()
+           NEW: renderer = MarkdownRenderer()
+                markdown = renderer.render(document)
+        
+        Compatibility:
+        -------------
+        
+        For gradual migration, use the compatibility layer:
+        
+        from edgar.documents.migration import LegacySECHTMLParser
+        parser = LegacySECHTMLParser()  # Works like old parser
+        
+        This will issue deprecation warnings to help you migrate.
+        
+        Performance Config:
+        ------------------
+        
+        For best performance:
+        parser = HTMLParser.create_for_performance()
+        
+        For best accuracy:
+        parser = HTMLParser.create_for_accuracy()
+        
+        For AI/LLM processing:
+        parser = HTMLParser.create_for_ai()
+        """)
+
+
+# Compatibility aliases
+#
+# NOTE: ``SECHTMLParser`` is deliberately NOT bound as a module attribute.
+# A module-level ``__getattr__`` is only consulted when normal attribute
+# lookup fails, so an eager ``SECHTMLParser = LegacySECHTMLParser`` binding
+# would make the deprecation warning below unreachable. The name is served
+# lazily by ``__getattr__`` instead, which also covers
+# ``from edgar.documents.migration import SECHTMLParser``.
+# It is therefore not picked up by ``from ... import *``.
+HTMLDocument = LegacyHTMLDocument
+
+
+# Auto-migration for common imports
+def __getattr__(name):
+    """
+    Provide compatibility imports with warnings.
+
+    Args:
+        name: Attribute requested on this module that was not found by
+            normal lookup
+
+    Returns:
+        LegacySECHTMLParser when ``name`` is "SECHTMLParser"
+
+    Raises:
+        AttributeError: For any other name
+    """
+    if name == "SECHTMLParser":
+        warnings.warn(
+            "Importing SECHTMLParser from edgar.documents.migration is deprecated. "
+            "Use HTMLParser from edgar.documents instead.",
+            DeprecationWarning,
+            stacklevel=2
+        )
+        return LegacySECHTMLParser
+
+    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")