Initial commit
This commit is contained in:
318
venv/lib/python3.10/site-packages/edgar/documents/migration.py
Normal file
318
venv/lib/python3.10/site-packages/edgar/documents/migration.py
Normal file
@@ -0,0 +1,318 @@
|
||||
"""
Migration and compatibility layer for transitioning from the old parser to the new one.

NOTE: This compatibility layer is documented for user migration from v1.x → v2.0.
It is intentionally not used internally but is kept for user convenience.
Do not remove it without considering the versioning implications.
"""
|
||||
|
||||
from typing import Optional, List, Dict, Any
|
||||
import warnings
|
||||
from edgar.documents import HTMLParser, Document, ParserConfig
|
||||
from edgar.documents.search import DocumentSearch
|
||||
|
||||
|
||||
class LegacyHTMLDocument:
    """
    Compatibility wrapper that mimics the old Document API.

    This allows existing code to work with the new parser
    while providing deprecation warnings. Every legacy accessor delegates
    to the wrapped new-style document.
    """

    def __init__(self, new_document: "Document"):
        """Initialize with new document.

        Args:
            new_document: The new-style document that all legacy calls
                are forwarded to.
        """
        self._doc = new_document
        # When False, _deprecation_warning() becomes a no-op.
        self._warn_on_use = True

    def _deprecation_warning(self, old_method: str, new_method: Optional[str] = None):
        """Issue deprecation warning.

        Args:
            old_method: Legacy member name, rendered as ``Document.<old_method>``.
            new_method: Replacement to recommend; omitted from the message
                when empty or None.
        """
        if not self._warn_on_use:
            return

        msg = f"Document.{old_method} is deprecated."
        if new_method:
            msg += f" Use {new_method} instead."
        # stacklevel=3 skips this helper and the legacy accessor that called
        # it, so the warning is attributed to the user's own call site.
        warnings.warn(msg, DeprecationWarning, stacklevel=3)

    @property
    def text(self) -> str:
        """Get document text (old API)."""
        self._deprecation_warning("text", "Document.text()")
        return self._doc.text()

    def get_text(self, clean: bool = True) -> str:
        """Get text with options (old API).

        Args:
            clean: Accepted for backward compatibility only; it is not
                forwarded, the wrapped document's ``text()`` is always
                called without arguments.
        """
        self._deprecation_warning("get_text()", "Document.text()")
        return self._doc.text()

    @property
    def tables(self) -> List[Any]:
        """Get tables (old API)."""
        self._deprecation_warning("tables", "Document.tables")
        return self._doc.tables

    def find_all(self, tag: str) -> List[Any]:
        """Find elements by tag (old API).

        Only ``h1``-``h3``, ``p`` and ``table`` are mapped (case-insensitive).
        All mapped heading tags resolve to the same node type, so the
        heading level is not distinguished.

        Args:
            tag: Legacy HTML tag name.

        Returns:
            The result of ``Document.root.find`` for a mapped tag, or an
            empty list for any other tag.
        """
        self._deprecation_warning("find_all()", "Document.root.find()")

        # Map old tag names to node types
        from edgar.documents.types import NodeType

        tag_map = {
            'h1': NodeType.HEADING,
            'h2': NodeType.HEADING,
            'h3': NodeType.HEADING,
            'p': NodeType.PARAGRAPH,
            'table': NodeType.TABLE,
        }

        node_type = tag_map.get(tag.lower())
        # Compare against None explicitly: an enum member may be falsy
        # (e.g. an int-valued member equal to 0) and must still be searched.
        if node_type is None:
            return []

        return self._doc.root.find(lambda n: n.type == node_type)

    def search(self, pattern: str) -> List[str]:
        """Search document (old API).

        Returns:
            The ``text`` attribute of every result produced by
            ``DocumentSearch(...).search(pattern)``.
        """
        self._deprecation_warning("search()", "DocumentSearch.search()")

        search = DocumentSearch(self._doc)
        return [r.text for r in search.search(pattern)]

    @property
    def sections(self) -> Dict[str, Any]:
        """Get sections (old API).

        Returns:
            A dict keyed by section name whose values are plain dicts with
            the keys ``title``, ``text``, ``start`` and ``end``.
        """
        # Warn like every other legacy accessor; the old dict format differs
        # from the new Section objects, so callers need the migration hint.
        self._deprecation_warning("sections", "Document.sections")

        # Convert new sections to old format
        return {
            name: {
                'title': section.title,
                'text': section.text(),
                'start': section.start_offset,
                'end': section.end_offset,
            }
            for name, section in self._doc.sections.items()
        }

    def to_markdown(self) -> str:
        """Convert to markdown (old API)."""
        self._deprecation_warning("to_markdown()", "MarkdownRenderer.render()")

        # Imported lazily, as in the rest of this compatibility layer.
        from edgar.documents.renderers import MarkdownRenderer

        renderer = MarkdownRenderer()
        return renderer.render(self._doc)
|
||||
|
||||
|
||||
class LegacySECHTMLParser:
    """
    Compatibility wrapper for old SECHTMLParser.

    Translates the legacy configuration dict and forwards the legacy
    parse methods to the new parser, wrapping each result in a
    LegacyHTMLDocument.
    """

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """Build the underlying new parser from an optional legacy config dict."""
        self._parser = HTMLParser(self._convert_config(config))
        # When False, the parse methods emit no deprecation warning.
        self._warn_on_use = True

    def _convert_config(self, old_config: Optional[Dict[str, Any]]) -> ParserConfig:
        """Translate a legacy config dict into a ParserConfig.

        Only ``clean_text``, ``extract_tables`` and ``preserve_layout`` are
        recognised; any other key is ignored. A missing or empty dict yields
        a default ParserConfig.
        """
        converted = ParserConfig()
        if not old_config:
            return converted

        # (legacy dict key, attribute on the new config), applied in order.
        key_to_attribute = (
            ('clean_text', 'clean_text'),
            ('extract_tables', 'table_extraction'),
            ('preserve_layout', 'preserve_whitespace'),
        )
        for legacy_key, attribute in key_to_attribute:
            if legacy_key in old_config:
                setattr(converted, attribute, old_config[legacy_key])

        return converted

    def _warn_deprecated(self, message: str) -> None:
        """Emit ``message`` as a DeprecationWarning unless warnings are disabled."""
        if self._warn_on_use:
            # stacklevel=3 skips this helper and the public parse method so
            # the warning points at the caller of parse()/parse_file().
            warnings.warn(message, DeprecationWarning, stacklevel=3)

    def parse(self, html: str) -> LegacyHTMLDocument:
        """Parse an HTML string (old API) and wrap the result for legacy callers."""
        self._warn_deprecated("SECHTMLParser is deprecated. Use HTMLParser instead.")
        return LegacyHTMLDocument(self._parser.parse(html))

    def parse_file(self, filepath: str) -> LegacyHTMLDocument:
        """Parse an HTML file (old API) and wrap the result for legacy callers."""
        self._warn_deprecated(
            "SECHTMLParser.parse_file() is deprecated. Use HTMLParser.parse_file() instead."
        )
        return LegacyHTMLDocument(self._parser.parse_file(filepath))
|
||||
|
||||
|
||||
def migrate_parser_usage(code: str) -> str:
    """
    Helper to migrate code from old parser to new.

    This is a purely textual rewrite: it does not parse the code and it
    does not add any import the rewritten code may need (for example
    ``MarkdownRenderer``). The rewrite is idempotent, so running it on
    already-migrated code leaves that code unchanged.

    Args:
        code: Python code using old parser

    Returns:
        Updated code using new parser
    """
    # Local import keeps this helper self-contained; the module already
    # uses function-scope imports elsewhere.
    import re

    # Rules whose old form cannot be confused with the new form or with a
    # longer identifier are applied as plain substring replacements.
    literal_replacements = [
        # Import statements
        ("from edgar.files.html import SECHTMLParser",
         "from edgar.documents import HTMLParser"),
        ("from edgar.files.html import Document",
         "from edgar.documents import Document"),

        # Method calls
        ("document.get_text(", "document.text("),
        ("document.find_all(", "document.root.find(lambda n: n.tag == "),
        ("document.to_markdown(", "MarkdownRenderer().render(document"),
    ]

    # Rules that need boundary checks to avoid corrupting other names.
    pattern_replacements = [
        # Class instantiation; must not touch e.g. ``LegacySECHTMLParser(``.
        (r"(?<!\w)SECHTMLParser\(", "HTMLParser("),

        # Property -> method. Skip ``document.text(`` (already a call, which
        # would otherwise become ``document.text()()``) and longer names
        # such as ``document.text_length``.
        (r"document\.text\b(?!\()", "document.text()"),

        # Config changes; must not touch e.g. ``should_extract_tables=``.
        (r"(?<!\w)extract_tables=", "table_extraction="),
        (r"(?<!\w)preserve_layout=", "preserve_whitespace="),
    ]

    migrated = code
    for old, new in literal_replacements:
        migrated = migrated.replace(old, new)
    for pattern, new in pattern_replacements:
        migrated = re.sub(pattern, new, migrated)

    return migrated
|
||||
|
||||
|
||||
class MigrationGuide:
    """
    Provides migration guidance and utilities.

    Both members are static: one returns a fixed compatibility report,
    the other prints a fixed, human-readable migration guide.
    """

    @staticmethod
    def check_compatibility(old_parser_instance) -> Dict[str, Any]:
        """
        Check if old parser instance can be migrated.

        The argument is not inspected; the same report is returned for
        every input.

        Returns:
            Dict with the keys ``can_migrate``, ``warnings`` and
            ``recommendations``
        """
        recommendations = [
            "Replace SECHTMLParser with HTMLParser",
            "Update document.text to document.text()",
            "Use DocumentSearch for search functionality",
            "Use MarkdownRenderer for markdown conversion",
        ]
        report: Dict[str, Any] = {
            'can_migrate': True,
            'warnings': [],
            'recommendations': recommendations,
        }
        return report

    @staticmethod
    def print_migration_guide():
        """Print the migration guide to standard output."""
        # NOTE(review): leading whitespace inside this literal could not be
        # recovered from the extracted source; the lines are reproduced
        # flush-left. Confirm the exact layout against the repository.
        guide_text = """
HTML Parser Migration Guide
==========================

The new HTML parser provides significant improvements:
- 10x performance improvement
- Better table parsing
- Reliable section detection
- Advanced search capabilities

Key Changes:
-----------

1. Imports:
OLD: from edgar.files.html import SECHTMLParser, Document
NEW: from edgar.documents import HTMLParser, Document

2. Parser Creation:
OLD: parser = SECHTMLParser()
NEW: parser = HTMLParser()

3. Document Text:
OLD: document.text or document.get_text()
NEW: document.text()

4. Search:
OLD: document.search(pattern)
NEW: search = DocumentSearch(document)
results = search.search(pattern)

5. Tables:
OLD: document.tables
NEW: document.tables (same, but returns richer TableNode objects)

6. Sections:
OLD: document.sections
NEW: document.sections (returns Section objects with more features)

7. Markdown:
OLD: document.to_markdown()
NEW: renderer = MarkdownRenderer()
markdown = renderer.render(document)

Compatibility:
-------------

For gradual migration, use the compatibility layer:

from edgar.documents.migration import LegacySECHTMLParser
parser = LegacySECHTMLParser() # Works like old parser

This will issue deprecation warnings to help you migrate.

Performance Config:
------------------

For best performance:
parser = HTMLParser.create_for_performance()

For best accuracy:
parser = HTMLParser.create_for_accuracy()

For AI/LLM processing:
parser = HTMLParser.create_for_ai()
"""

        print(guide_text)
|
||||
|
||||
|
||||
# Compatibility aliases
#
# ``SECHTMLParser`` is deliberately NOT bound eagerly at module level. A
# module-level ``__getattr__`` is consulted only when normal attribute lookup
# fails, so an eager ``SECHTMLParser = LegacySECHTMLParser`` binding would
# shadow the hook below and its deprecation warning could never be emitted.
# The name is still importable: it resolves through ``__getattr__``.
# It is no longer picked up by ``from ... import *``, which only sees names
# actually bound in the module namespace.
HTMLDocument = LegacyHTMLDocument


# Auto-migration for common imports
def __getattr__(name):
    """Provide compatibility imports with warnings.

    Args:
        name: Attribute requested on this module that was not found by
            normal lookup.

    Returns:
        ``LegacySECHTMLParser`` when ``name`` is ``"SECHTMLParser"``.

    Raises:
        AttributeError: For any other name.
    """
    if name != "SECHTMLParser":
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")

    warnings.warn(
        "Importing SECHTMLParser from edgar.documents.migration is deprecated. "
        "Use HTMLParser from edgar.documents instead.",
        DeprecationWarning,
        # Attribute the warning to the code performing the import/lookup.
        stacklevel=2
    )
    return LegacySECHTMLParser
|
||||
Reference in New Issue
Block a user