Files
edgartools/venv/lib/python3.10/site-packages/edgar/documents/migration.py
2025-12-09 12:13:01 +01:00

318 lines
9.9 KiB
Python

"""
Migration and compatibility layer for transitioning from the old parser to the new one.

NOTE: This compatibility layer is documented for users migrating from v1.x → v2.0.
It is intentionally not used internally, but is kept for user convenience.
Do not remove it without considering the versioning implications.
"""
from typing import Optional, List, Dict, Any
import warnings
from edgar.documents import HTMLParser, Document, ParserConfig
from edgar.documents.search import DocumentSearch
class LegacyHTMLDocument:
    """
    Compatibility wrapper that mimics the old Document API.

    This allows existing code to work with the new parser
    while providing deprecation warnings. Every legacy accessor emits a
    ``DeprecationWarning`` (unless ``_warn_on_use`` is False) and then
    delegates to the wrapped new-style document.
    """

    def __init__(self, new_document: "Document"):
        """Initialize with new document.

        Args:
            new_document: The new-parser document all legacy calls are
                delegated to.
        """
        self._doc = new_document
        # Set to False to silence the deprecation warnings of this wrapper.
        self._warn_on_use = True

    def _deprecation_warning(self, old_method: str, new_method: Optional[str] = None):
        """Issue deprecation warning.

        Args:
            old_method: Name of the legacy member being used.
            new_method: Optional replacement to recommend in the message.
        """
        if self._warn_on_use:
            msg = f"Document.{old_method} is deprecated."
            if new_method:
                msg += f" Use {new_method} instead."
            # stacklevel=3 skips this helper and the legacy accessor that
            # called it, so the warning is attributed to the user's call site.
            warnings.warn(msg, DeprecationWarning, stacklevel=3)

    @property
    def text(self) -> str:
        """Get document text (old API)."""
        self._deprecation_warning("text", "Document.text()")
        return self._doc.text()

    def get_text(self, clean: bool = True) -> str:
        """Get text with options (old API).

        Args:
            clean: Accepted for backward compatibility only; it is not
                forwarded to the new API and has no effect.
        """
        self._deprecation_warning("get_text()", "Document.text()")
        return self._doc.text()

    @property
    def tables(self) -> List[Any]:
        """Get tables (old API)."""
        self._deprecation_warning("tables", "Document.tables")
        return self._doc.tables

    def find_all(self, tag: str) -> List[Any]:
        """Find elements by tag (old API).

        Only 'h1', 'h2', 'h3', 'p' and 'table' are recognised (matched
        case-insensitively). All heading tags map to the same node type and
        the predicate compares the node type only, so the heading level is
        not distinguished.

        Args:
            tag: Old-style HTML tag name.

        Returns:
            Whatever ``Document.root.find`` returns for the mapped node type,
            or an empty list for an unrecognised tag.
        """
        self._deprecation_warning("find_all()", "Document.root.find()")
        # Map old tag names to node types
        from edgar.documents.types import NodeType
        tag_map = {
            'h1': NodeType.HEADING,
            'h2': NodeType.HEADING,
            'h3': NodeType.HEADING,
            'p': NodeType.PARAGRAPH,
            'table': NodeType.TABLE,
        }
        node_type = tag_map.get(tag.lower())
        # Compare against None explicitly so the lookup does not depend on
        # the truthiness of the enum member.
        if node_type is None:
            return []
        return self._doc.root.find(lambda n: n.type == node_type)

    def search(self, pattern: str) -> List[str]:
        """Search document (old API).

        Returns:
            The ``text`` attribute of every result produced by
            ``DocumentSearch.search``.
        """
        self._deprecation_warning("search()", "DocumentSearch.search()")
        search = DocumentSearch(self._doc)
        results = search.search(pattern)
        return [r.text for r in results]

    @property
    def sections(self) -> Dict[str, Any]:
        """Get sections (old API).

        Returns:
            A dict keyed by section name; each value is a dict with the
            keys 'title', 'text', 'start' and 'end'.
        """
        # Warn like every other legacy accessor of this wrapper.
        self._deprecation_warning("sections", "Document.sections (Section objects)")
        # Convert new sections to old format
        return {
            name: {
                'title': section.title,
                'text': section.text(),
                'start': section.start_offset,
                'end': section.end_offset,
            }
            for name, section in self._doc.sections.items()
        }

    def to_markdown(self) -> str:
        """Convert to markdown (old API)."""
        self._deprecation_warning("to_markdown()", "MarkdownRenderer.render()")
        from edgar.documents.renderers import MarkdownRenderer
        renderer = MarkdownRenderer()
        return renderer.render(self._doc)
class LegacySECHTMLParser:
    """
    Stand-in for the old SECHTMLParser.

    Translates an old dict-based configuration into a ParserConfig, delegates
    the actual parsing to HTMLParser, and returns the results wrapped in
    LegacyHTMLDocument.
    """

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """Create the underlying HTMLParser from an optional old-style config dict."""
        self._parser = HTMLParser(self._convert_config(config))
        # Controls whether parse()/parse_file() emit deprecation warnings.
        self._warn_on_use = True

    def _convert_config(self, old_config: Optional[Dict[str, Any]]) -> ParserConfig:
        """Translate an old config dict into a ParserConfig.

        A missing or empty dict yields a default ParserConfig. Old keys that
        are not listed in the translation table below are ignored.
        """
        converted = ParserConfig()
        if not old_config:
            return converted

        # (old config key, attribute on ParserConfig that receives its value)
        translation = (
            ('clean_text', 'clean_text'),
            ('extract_tables', 'table_extraction'),
            ('preserve_layout', 'preserve_whitespace'),
        )
        for legacy_key, attribute in translation:
            if legacy_key in old_config:
                setattr(converted, attribute, old_config[legacy_key])
        return converted

    def parse(self, html: str) -> LegacyHTMLDocument:
        """Parse an HTML string and wrap the result for the old API."""
        if self._warn_on_use:
            # stacklevel=2 attributes the warning to the caller of parse().
            warnings.warn(
                "SECHTMLParser is deprecated. Use HTMLParser instead.",
                category=DeprecationWarning,
                stacklevel=2,
            )
        return LegacyHTMLDocument(self._parser.parse(html))

    def parse_file(self, filepath: str) -> LegacyHTMLDocument:
        """Parse an HTML file and wrap the result for the old API."""
        if self._warn_on_use:
            # stacklevel=2 attributes the warning to the caller of parse_file().
            warnings.warn(
                "SECHTMLParser.parse_file() is deprecated. Use HTMLParser.parse_file() instead.",
                category=DeprecationWarning,
                stacklevel=2,
            )
        return LegacyHTMLDocument(self._parser.parse_file(filepath))
def migrate_parser_usage(code: str) -> str:
    """
    Helper to migrate code from old parser to new.

    This is a best-effort textual rewrite based on regular expressions, not
    a syntax-aware transformation, so the result should be reviewed by hand.
    Method-call rewrites only apply to receivers whose name ends in
    ``document``. No import for ``MarkdownRenderer`` is added.

    Args:
        code: Python code using old parser

    Returns:
        Updated code using new parser. ``document.text()`` calls that are
        already migrated are left unchanged, so running the helper twice
        does not produce ``document.text()()``.
    """
    import re  # local import: only this helper needs it

    # (pattern, replacement template) pairs, applied in order. None of the
    # templates contains a backslash other than the intentional \g<1> group
    # reference, so they are safe to use as re.sub templates.
    rules = [
        # Import statements
        (re.escape("from edgar.files.html import SECHTMLParser"),
         "from edgar.documents import HTMLParser"),
        (re.escape("from edgar.files.html import Document"),
         "from edgar.documents import Document"),
        # Class instantiation. The look-behind keeps longer identifiers such
        # as LegacySECHTMLParser( from being rewritten into a class name
        # that does not exist.
        (r"(?<!\w)SECHTMLParser\(", "HTMLParser("),
        # Method calls. `text` must be a complete attribute name (so
        # `document.text_length` is untouched) and must not already be
        # followed by a call.
        (r"document\.text\b(?!\s*\()", "document.text()"),
        (r"document\.get_text\(", "document.text("),
        (r"document\.find_all\(", "document.root.find(lambda n: n.tag == "),
        # Capture the whole (possibly dotted) receiver so that e.g.
        # `self.document.to_markdown()` passes `self.document` to render().
        (r"((?:\w+\.)*\w*document)\.to_markdown\(",
         r"MarkdownRenderer().render(\g<1>"),
        # Config changes
        (r"extract_tables=", "table_extraction="),
        (r"preserve_layout=", "preserve_whitespace="),
    ]

    migrated = code
    for pattern, replacement in rules:
        migrated = re.sub(pattern, replacement, migrated)
    return migrated
class MigrationGuide:
    """
    Static helpers describing how to move from the old parser API to the new one.
    """

    @staticmethod
    def check_compatibility(old_parser_instance) -> Dict[str, Any]:
        """
        Report whether an old parser instance can be migrated.

        The argument is not inspected; the same static report is returned
        for every input.

        Returns:
            Dict with the keys 'can_migrate', 'warnings' and 'recommendations'.
        """
        report: Dict[str, Any] = {'can_migrate': True, 'warnings': []}
        report['recommendations'] = [
            "Replace SECHTMLParser with HTMLParser",
            "Update document.text to document.text()",
            "Use DocumentSearch for search functionality",
            "Use MarkdownRenderer for markdown conversion",
        ]
        return report

    @staticmethod
    def print_migration_guide():
        """Write the migration guide text to standard output."""
        # The literal below is emitted verbatim; its lines are intentionally
        # not indented with the surrounding code.
        print("""
HTML Parser Migration Guide
==========================
The new HTML parser provides significant improvements:
- 10x performance improvement
- Better table parsing
- Reliable section detection
- Advanced search capabilities
Key Changes:
-----------
1. Imports:
OLD: from edgar.files.html import SECHTMLParser, Document
NEW: from edgar.documents import HTMLParser, Document
2. Parser Creation:
OLD: parser = SECHTMLParser()
NEW: parser = HTMLParser()
3. Document Text:
OLD: document.text or document.get_text()
NEW: document.text()
4. Search:
OLD: document.search(pattern)
NEW: search = DocumentSearch(document)
results = search.search(pattern)
5. Tables:
OLD: document.tables
NEW: document.tables (same, but returns richer TableNode objects)
6. Sections:
OLD: document.sections
NEW: document.sections (returns Section objects with more features)
7. Markdown:
OLD: document.to_markdown()
NEW: renderer = MarkdownRenderer()
markdown = renderer.render(document)
Compatibility:
-------------
For gradual migration, use the compatibility layer:
from edgar.documents.migration import LegacySECHTMLParser
parser = LegacySECHTMLParser() # Works like old parser
This will issue deprecation warnings to help you migrate.
Performance Config:
------------------
For best performance:
parser = HTMLParser.create_for_performance()
For best accuracy:
parser = HTMLParser.create_for_accuracy()
For AI/LLM processing:
parser = HTMLParser.create_for_ai()
""")
# Compatibility aliases
HTMLDocument = LegacyHTMLDocument
# NOTE: ``SECHTMLParser`` is deliberately NOT bound as a module global here.
# A module-level ``__getattr__`` is only consulted when normal attribute
# lookup fails, so a real ``SECHTMLParser`` global would shadow the hook
# below and its deprecation warning would never be emitted. The name is
# still resolvable through attribute access and
# ``from edgar.documents.migration import SECHTMLParser``; it is no longer
# exported by ``from ... import *``.


# Auto-migration for common imports
def __getattr__(name):
    """Provide compatibility imports with warnings.

    Called by Python only for module attributes that are not found through
    normal lookup.

    Args:
        name: The attribute name being looked up on this module.

    Returns:
        ``LegacySECHTMLParser`` when ``name`` is "SECHTMLParser" (after
        emitting a ``DeprecationWarning``).

    Raises:
        AttributeError: For any other name.
    """
    if name == "SECHTMLParser":
        # stacklevel=2 attributes the warning to the code performing the
        # attribute access instead of to this hook.
        warnings.warn(
            "Importing SECHTMLParser from edgar.documents.migration is deprecated. "
            "Use HTMLParser from edgar.documents instead.",
            DeprecationWarning,
            stacklevel=2
        )
        return LegacySECHTMLParser
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")