# edgartools/venv/lib/python3.10/site-packages/edgar/documents/migration.py

"""
Migration and compatibility layer for transitioning from old parser to new.

NOTE: This compatibility layer is documented for user migration from v1.x → v2.0.
It is intentionally not used internally but is kept for user convenience.
Do not remove it without considering the versioning implications.
"""

from typing import Optional, List, Dict, Any
import warnings
from edgar.documents import HTMLParser, Document, ParserConfig
from edgar.documents.search import DocumentSearch


class LegacyHTMLDocument:
    """
    Compatibility wrapper that mimics the old Document API.

    Wraps a new-style document and exposes the old attribute and method
    names on top of it, so existing code keeps working. Every legacy
    accessor emits a ``DeprecationWarning`` that names the replacement.
    """

    def __init__(self, new_document: "Document"):
        """Initialize with the new document that all calls delegate to."""
        self._doc = new_document
        # When False, the legacy accessors stop emitting deprecation warnings.
        self._warn_on_use = True

    def _deprecation_warning(self, old_method: str, new_method: Optional[str] = None):
        """
        Issue a deprecation warning for a legacy accessor.

        Args:
            old_method: Name of the legacy member, shown after ``Document.``
            new_method: Replacement to suggest; left out of the message if falsy
        """
        if not self._warn_on_use:
            return

        msg = f"Document.{old_method} is deprecated."
        if new_method:
            msg += f" Use {new_method} instead."
        # stacklevel=3 skips this helper and the legacy accessor that called
        # it, so the warning is attributed to the user's own call site.
        warnings.warn(msg, DeprecationWarning, stacklevel=3)

    @property
    def text(self) -> str:
        """Get document text (old API, where ``text`` was a property)."""
        self._deprecation_warning("text", "Document.text()")
        return self._doc.text()

    def get_text(self, clean: bool = True) -> str:
        """
        Get text with options (old API).

        ``clean`` is accepted so old call sites keep working, but it is not
        forwarded: the wrapped document's ``text()`` is called without
        arguments.
        """
        self._deprecation_warning("get_text()", "Document.text()")
        return self._doc.text()

    @property
    def tables(self) -> List[Any]:
        """Get tables (old API); returns the wrapped document's ``tables``."""
        self._deprecation_warning("tables", "Document.tables")
        return self._doc.tables

    def find_all(self, tag: str) -> List[Any]:
        """
        Find elements by tag (old API).

        Only ``h1``/``h2``/``h3``, ``p`` and ``table`` are recognised
        (case-insensitively). All heading tags map to the same node type, so
        heading levels are not distinguished. Any other tag yields ``[]``.
        """
        self._deprecation_warning("find_all()", "Document.root.find()")

        # Map old tag names to node types
        from edgar.documents.types import NodeType

        tag_map = {
            'h1': NodeType.HEADING,
            'h2': NodeType.HEADING,
            'h3': NodeType.HEADING,
            'p': NodeType.PARAGRAPH,
            'table': NodeType.TABLE,
        }

        node_type = tag_map.get(tag.lower())
        # Explicit None check: a mapped node type must not be skipped merely
        # because the enum member happens to evaluate as falsy.
        if node_type is None:
            return []

        return self._doc.root.find(lambda n: n.type == node_type)

    def search(self, pattern: str) -> List[str]:
        """Search document (old API); returns the ``text`` of each result."""
        self._deprecation_warning("search()", "DocumentSearch.search()")

        search = DocumentSearch(self._doc)
        results = search.search(pattern)
        return [r.text for r in results]

    @property
    def sections(self) -> Dict[str, Any]:
        """
        Get sections (old API).

        Returns:
            Dict mapping each section name to a plain dict with the keys
            ``title``, ``text``, ``start`` and ``end``.
        """
        # Warn like every other legacy accessor: callers receive old-format
        # dicts here rather than the new section objects.
        self._deprecation_warning("sections", "Document.sections")

        # Convert new sections to old format
        return {
            name: {
                'title': section.title,
                'text': section.text(),
                'start': section.start_offset,
                'end': section.end_offset,
            }
            for name, section in self._doc.sections.items()
        }

    def to_markdown(self) -> str:
        """Convert to markdown (old API) by rendering the wrapped document."""
        self._deprecation_warning("to_markdown()", "MarkdownRenderer.render()")

        from edgar.documents.renderers import MarkdownRenderer
        renderer = MarkdownRenderer()
        return renderer.render(self._doc)


class LegacySECHTMLParser:
    """
    Drop-in stand-in for the old SECHTMLParser.

    Accepts the old dict-style configuration, forwards parsing to the new
    HTMLParser and wraps every result in a LegacyHTMLDocument.
    """

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """Build the underlying new parser from an optional old-style config."""
        self._parser = HTMLParser(self._convert_config(config))
        self._warn_on_use = True

    def _convert_config(self, old_config: Optional[Dict[str, Any]]) -> ParserConfig:
        """
        Translate an old-style config dict into a ParserConfig.

        A missing or empty dict yields a default ParserConfig. Keys other
        than the three listed below are ignored.
        """
        if not old_config:
            return ParserConfig()

        translated = ParserConfig()

        # (old config key, attribute on the new config), applied in order
        key_to_attr = (
            ('clean_text', 'clean_text'),
            ('extract_tables', 'table_extraction'),
            ('preserve_layout', 'preserve_whitespace'),
        )
        for legacy_key, attr_name in key_to_attr:
            if legacy_key in old_config:
                setattr(translated, attr_name, old_config[legacy_key])

        return translated

    def _warn_deprecated(self, message: str) -> None:
        """Emit ``message`` as a DeprecationWarning unless warnings are off."""
        if self._warn_on_use:
            # stacklevel=3 skips this helper and the public method calling
            # it, so the warning points at the user's call site.
            warnings.warn(message, DeprecationWarning, stacklevel=3)

    def parse(self, html: str) -> LegacyHTMLDocument:
        """Parse an HTML string (old API) and wrap the resulting document."""
        self._warn_deprecated("SECHTMLParser is deprecated. Use HTMLParser instead.")
        return LegacyHTMLDocument(self._parser.parse(html))

    def parse_file(self, filepath: str) -> LegacyHTMLDocument:
        """Parse an HTML file (old API) and wrap the resulting document."""
        self._warn_deprecated(
            "SECHTMLParser.parse_file() is deprecated. Use HTMLParser.parse_file() instead."
        )
        return LegacyHTMLDocument(self._parser.parse_file(filepath))


def migrate_parser_usage(code: str) -> str:
    """
    Helper to migrate code from old parser to new.

    This is a best-effort textual rewrite (the Python source is not parsed),
    so the result should still be reviewed by hand. The rewrite is
    idempotent: running it on already-migrated code leaves it unchanged.

    Args:
        code: Python code using old parser

    Returns:
        Updated code using new parser
    """
    import re  # local import, like the other lazy imports in this module

    # Import statements are rewritten as plain prefixes so that any names
    # following on the same line (e.g. ", Document") are carried over.
    import_rewrites = [
        ("from edgar.files.html import SECHTMLParser",
         "from edgar.documents import HTMLParser"),

        ("from edgar.files.html import Document",
         "from edgar.documents import Document"),
    ]

    # A receiver expression ending in "document" (``document``,
    # ``self.document``, ...) that is not the tail of a longer expression.
    receiver = r"(?<![\w.])(?P<recv>(?:\w+\.)*\w*document)"

    pattern_rewrites = [
        # Class instantiation; the lookbehind keeps longer names such as
        # LegacySECHTMLParser( intact.
        (r"(?<!\w)SECHTMLParser\(", "HTMLParser("),

        # ``text`` became a method: skip occurrences that are already calls
        # or that are only the prefix of a longer attribute name.
        (r"document\.text(?!\w|\s*\()", "document.text()"),
        (r"document\.get_text\(", "document.text("),
        (r"document\.find_all\(", "document.root.find(lambda n: n.tag == "),

        # The receiver becomes the renderer's first argument. The
        # no-argument form is handled first so no dangling ", " is produced.
        (receiver + r"\.to_markdown\(\s*\)", r"MarkdownRenderer().render(\g<recv>)"),
        (receiver + r"\.to_markdown\(", r"MarkdownRenderer().render(\g<recv>, "),

        # Config changes (whole keyword names only)
        (r"(?<!\w)extract_tables=", "table_extraction="),
        (r"(?<!\w)preserve_layout=", "preserve_whitespace="),
    ]

    migrated = code
    for old, new in import_rewrites:
        migrated = migrated.replace(old, new)
    for pattern, replacement in pattern_rewrites:
        migrated = re.sub(pattern, replacement, migrated)

    return migrated


class MigrationGuide:
    """
    Static helpers that describe how to move from the old parser to the new.
    """

    @staticmethod
    def check_compatibility(old_parser_instance) -> Dict[str, Any]:
        """
        Report migration advice for an old parser instance.

        The instance itself is not inspected; the same generic advice is
        returned for every argument, as a freshly built dict.

        Returns:
            Dict with the keys ``can_migrate``, ``warnings`` and
            ``recommendations``
        """
        recommendations = [
            "Replace SECHTMLParser with HTMLParser",
            "Update document.text to document.text()",
            "Use DocumentSearch for search functionality",
            "Use MarkdownRenderer for markdown conversion",
        ]
        return dict(
            can_migrate=True,
            warnings=[],
            recommendations=recommendations,
        )

    @staticmethod
    def print_migration_guide():
        """Write the migration guide text to standard output."""
        # The text is printed exactly as written, including the indentation.
        print("""
        HTML Parser Migration Guide
        ==========================

        The new HTML parser provides significant improvements:
        - 10x performance improvement
        - Better table parsing
        - Reliable section detection
        - Advanced search capabilities

        Key Changes:
        -----------

        1. Imports:
           OLD: from edgar.files.html import SECHTMLParser, Document
           NEW: from edgar.documents import HTMLParser, Document

        2. Parser Creation:
           OLD: parser = SECHTMLParser()
           NEW: parser = HTMLParser()

        3. Document Text:
           OLD: document.text or document.get_text()
           NEW: document.text()

        4. Search:
           OLD: document.search(pattern)
           NEW: search = DocumentSearch(document)
                results = search.search(pattern)

        5. Tables:
           OLD: document.tables
           NEW: document.tables (same, but returns richer TableNode objects)

        6. Sections:
           OLD: document.sections
           NEW: document.sections (returns Section objects with more features)

        7. Markdown:
           OLD: document.to_markdown()
           NEW: renderer = MarkdownRenderer()
                markdown = renderer.render(document)

        Compatibility:
        -------------

        For gradual migration, use the compatibility layer:

        from edgar.documents.migration import LegacySECHTMLParser
        parser = LegacySECHTMLParser()  # Works like old parser

        This will issue deprecation warnings to help you migrate.

        Performance Config:
        ------------------

        For best performance:
        parser = HTMLParser.create_for_performance()

        For best accuracy:
        parser = HTMLParser.create_for_accuracy()

        For AI/LLM processing:
        parser = HTMLParser.create_for_ai()
        """)


# Compatibility aliases
#
# NOTE: ``SECHTMLParser`` is deliberately NOT bound as a module global here.
# The module-level ``__getattr__`` below is only consulted when normal
# attribute lookup fails, so an eager ``SECHTMLParser = LegacySECHTMLParser``
# alias would shadow it and its deprecation warning would never be emitted.
HTMLDocument = LegacyHTMLDocument


# Auto-migration for common imports
def __getattr__(name):
    """
    Provide compatibility imports with warnings.

    Called for module attributes that are not found through normal lookup.
    ``SECHTMLParser`` resolves to ``LegacySECHTMLParser`` after emitting a
    ``DeprecationWarning``.

    Raises:
        AttributeError: for any other unknown attribute name
    """
    if name == "SECHTMLParser":
        warnings.warn(
            "Importing SECHTMLParser from edgar.documents.migration is deprecated. "
            "Use HTMLParser from edgar.documents instead.",
            DeprecationWarning,
            # Attribute the warning to the code doing the import/attribute
            # access rather than to this hook.
            stacklevel=2
        )
        return LegacySECHTMLParser

    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")