318 lines
9.9 KiB
Python
318 lines
9.9 KiB
Python
"""
|
|
Migration and compatibility layer for transitioning from old parser to new.
|
|
|
|
NOTE: This compatibility layer is documented for user migration from v1.x → v2.0
|
|
It is intentionally not used internally but kept for user convenience.
|
|
Do not remove without versioning consideration.
|
|
"""
|
|
|
|
from typing import Optional, List, Dict, Any
|
|
import warnings
|
|
from edgar.documents import HTMLParser, Document, ParserConfig
|
|
from edgar.documents.search import DocumentSearch
|
|
|
|
|
|
class LegacyHTMLDocument:
    """
    Compatibility wrapper that mimics the old Document API.

    Every accessor delegates to the wrapped new-style document and emits a
    ``DeprecationWarning`` naming the replacement API, so existing code keeps
    working while it is being migrated.
    """

    def __init__(self, new_document: "Document"):
        """Initialize with new document.

        Args:
            new_document: Document produced by the new parser; all legacy
                accessors on this wrapper read from it.
        """
        self._doc = new_document
        # When False, _deprecation_warning() becomes a no-op.
        self._warn_on_use = True

    def _deprecation_warning(self, old_method: str, new_method: Optional[str] = None):
        """Issue deprecation warning.

        Args:
            old_method: Name of the legacy member, as it should appear after
                ``Document.`` in the message.
            new_method: Replacement to recommend; omitted from the message
                when empty or None.
        """
        if not self._warn_on_use:
            return

        msg = f"Document.{old_method} is deprecated."
        if new_method:
            msg += f" Use {new_method} instead."
        # stacklevel=3 skips this helper and the legacy accessor that called
        # it, so the warning is attributed to the user's own call site.
        warnings.warn(msg, DeprecationWarning, stacklevel=3)

    @property
    def text(self) -> str:
        """Get document text (old API)."""
        self._deprecation_warning("text", "Document.text()")
        return self._doc.text()

    def get_text(self, clean: bool = True) -> str:
        """Get text with options (old API).

        Args:
            clean: Accepted only so old call signatures keep working; it is
                not forwarded to the new document and has no effect.

        Returns:
            The result of ``Document.text()`` on the wrapped document.
        """
        self._deprecation_warning("get_text()", "Document.text()")
        return self._doc.text()

    @property
    def tables(self) -> List[Any]:
        """Get tables (old API)."""
        self._deprecation_warning("tables", "Document.tables")
        return self._doc.tables

    def find_all(self, tag: str) -> List[Any]:
        """Find elements by tag (old API).

        Args:
            tag: HTML tag name, matched case-insensitively. Only ``h1``,
                ``h2``, ``h3``, ``p`` and ``table`` are mapped.

        Returns:
            Whatever ``Document.root.find`` returns for nodes of the mapped
            node type, or an empty list for an unmapped tag. All three
            heading tags map to the same node type, so heading level is not
            distinguished.
        """
        self._deprecation_warning("find_all()", "Document.root.find()")

        # Map old tag names to node types
        # NOTE(review): function-scope import, presumably to avoid a circular
        # import with edgar.documents - confirm before hoisting.
        from edgar.documents.types import NodeType

        tag_map = {
            'h1': NodeType.HEADING,
            'h2': NodeType.HEADING,
            'h3': NodeType.HEADING,
            'p': NodeType.PARAGRAPH,
            'table': NodeType.TABLE,
        }

        node_type = tag_map.get(tag.lower())
        # Compare against None explicitly so a falsy enum member (e.g. one
        # whose value is 0) is still treated as a valid mapping.
        if node_type is None:
            return []

        return self._doc.root.find(lambda n: n.type == node_type)

    def search(self, pattern: str) -> List[str]:
        """Search document (old API).

        Args:
            pattern: Passed unchanged to ``DocumentSearch.search``.

        Returns:
            The ``text`` attribute of each search result, in result order.
        """
        self._deprecation_warning("search()", "DocumentSearch.search()")

        search = DocumentSearch(self._doc)
        results = search.search(pattern)
        return [r.text for r in results]

    @property
    def sections(self) -> Dict[str, Any]:
        """Get sections (old API).

        Returns:
            Dict keyed by section name; each value is a plain dict with the
            keys ``title``, ``text``, ``start`` and ``end`` built from the
            corresponding new-style section object.
        """
        # Warn like every other legacy accessor: the dict format returned
        # here is the old API, not the new one.
        self._deprecation_warning("sections", "Document.sections (Section objects)")

        # Convert new sections to old format
        return {
            name: {
                'title': section.title,
                'text': section.text(),
                'start': section.start_offset,
                'end': section.end_offset,
            }
            for name, section in self._doc.sections.items()
        }

    def to_markdown(self) -> str:
        """Convert to markdown (old API)."""
        self._deprecation_warning("to_markdown()", "MarkdownRenderer.render()")

        from edgar.documents.renderers import MarkdownRenderer

        renderer = MarkdownRenderer()
        return renderer.render(self._doc)
|
|
|
|
|
|
class LegacySECHTMLParser:
    """
    Drop-in stand-in for the old SECHTMLParser.

    Translates an old-style config dict into a ParserConfig, delegates the
    actual parsing to HTMLParser, and wraps every result in
    LegacyHTMLDocument so callers keep seeing the old document API.
    """

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """Build the underlying HTMLParser from an optional old-style config dict."""
        self._parser = HTMLParser(self._convert_config(config))
        # Gates the DeprecationWarning emitted by parse() and parse_file().
        self._warn_on_use = True

    def _convert_config(self, old_config: Optional[Dict[str, Any]]) -> ParserConfig:
        """Translate an old config dict into a ParserConfig.

        Only the keys listed below are carried over; any other key in
        ``old_config`` is ignored. A missing or empty dict yields a default
        ParserConfig.
        """
        converted = ParserConfig()
        if not old_config:
            return converted

        # (old dict key, attribute on the new ParserConfig)
        key_to_attribute = (
            ('clean_text', 'clean_text'),
            ('extract_tables', 'table_extraction'),
            ('preserve_layout', 'preserve_whitespace'),
        )
        for legacy_key, attribute in key_to_attribute:
            if legacy_key in old_config:
                setattr(converted, attribute, old_config[legacy_key])

        return converted

    def parse(self, html: str) -> LegacyHTMLDocument:
        """Parse an HTML string and return it wrapped in the legacy document API."""
        if self._warn_on_use:
            # stacklevel=2 attributes the warning to the caller of parse().
            warnings.warn(
                "SECHTMLParser is deprecated. Use HTMLParser instead.",
                DeprecationWarning,
                stacklevel=2,
            )

        return LegacyHTMLDocument(self._parser.parse(html))

    def parse_file(self, filepath: str) -> LegacyHTMLDocument:
        """Parse the HTML file at ``filepath`` and return it wrapped in the legacy document API."""
        if self._warn_on_use:
            # stacklevel=2 attributes the warning to the caller of parse_file().
            warnings.warn(
                "SECHTMLParser.parse_file() is deprecated. Use HTMLParser.parse_file() instead.",
                DeprecationWarning,
                stacklevel=2,
            )

        return LegacyHTMLDocument(self._parser.parse_file(filepath))
|
|
|
|
|
|
def migrate_parser_usage(code: str) -> str:
    """
    Helper to migrate code from old parser to new.

    This is a purely textual rewrite (no parsing of the input), so it only
    recognises the exact spellings listed below, e.g. a variable literally
    named ``document``. It does not add an import for ``MarkdownRenderer``
    when it rewrites ``to_markdown()`` calls.

    The rewrite is idempotent: running it on already-migrated code leaves
    that code unchanged.

    Args:
        code: Python code using old parser

    Returns:
        Updated code using new parser
    """
    import re

    literal_replacements = [
        # Import statements
        ("from edgar.files.html import SECHTMLParser",
         "from edgar.documents import HTMLParser"),

        ("from edgar.files.html import Document",
         "from edgar.documents import Document"),

        # Class instantiation
        ("SECHTMLParser(", "HTMLParser("),

        # Method calls
        ("document.get_text(", "document.text("),
        ("document.find_all(", "document.root.find(lambda n: n.tag == "),
        ("document.to_markdown(", "MarkdownRenderer().render(document"),

        # Config changes
        ("extract_tables=", "table_extraction="),
        ("preserve_layout=", "preserve_whitespace="),
    ]

    migrated = code
    for old, new in literal_replacements:
        migrated = migrated.replace(old, new)

    # Property access -> method call. A plain str.replace would also hit
    # "document.text()" (producing "document.text()()") and longer names such
    # as "document.text_length", so only rewrite when "text" is the whole
    # attribute name and is not already being called.
    migrated = re.sub(
        r"document\.text(?!\w)(?!\s*\()",
        "document.text()",
        migrated,
    )

    return migrated
|
|
|
|
|
|
class MigrationGuide:
    """
    Static helpers that describe how to move from the old parser to the new one.
    """

    @staticmethod
    def check_compatibility(old_parser_instance) -> Dict[str, Any]:
        """
        Report whether an old parser instance can be migrated.

        The argument is not inspected; the same fixed report is returned for
        every input.

        Returns:
            Dict with the keys ``can_migrate`` (always True), ``warnings``
            (always an empty list) and ``recommendations`` (list of
            human-readable migration steps).
        """
        migration_steps = [
            "Replace SECHTMLParser with HTMLParser",
            "Update document.text to document.text()",
            "Use DocumentSearch for search functionality",
            "Use MarkdownRenderer for markdown conversion",
        ]
        report: Dict[str, Any] = {}
        report['can_migrate'] = True
        report['warnings'] = []
        report['recommendations'] = migration_steps
        return report

    @staticmethod
    def print_migration_guide():
        """Write the human-readable migration guide to standard output."""
        guide_text = """
        HTML Parser Migration Guide
        ==========================

        The new HTML parser provides significant improvements:
        - 10x performance improvement
        - Better table parsing
        - Reliable section detection
        - Advanced search capabilities

        Key Changes:
        -----------

        1. Imports:
           OLD: from edgar.files.html import SECHTMLParser, Document
           NEW: from edgar.documents import HTMLParser, Document

        2. Parser Creation:
           OLD: parser = SECHTMLParser()
           NEW: parser = HTMLParser()

        3. Document Text:
           OLD: document.text or document.get_text()
           NEW: document.text()

        4. Search:
           OLD: document.search(pattern)
           NEW: search = DocumentSearch(document)
                results = search.search(pattern)

        5. Tables:
           OLD: document.tables
           NEW: document.tables (same, but returns richer TableNode objects)

        6. Sections:
           OLD: document.sections
           NEW: document.sections (returns Section objects with more features)

        7. Markdown:
           OLD: document.to_markdown()
           NEW: renderer = MarkdownRenderer()
                markdown = renderer.render(document)

        Compatibility:
        -------------

        For gradual migration, use the compatibility layer:

        from edgar.documents.migration import LegacySECHTMLParser
        parser = LegacySECHTMLParser() # Works like old parser

        This will issue deprecation warnings to help you migrate.

        Performance Config:
        ------------------

        For best performance:
        parser = HTMLParser.create_for_performance()

        For best accuracy:
        parser = HTMLParser.create_for_accuracy()

        For AI/LLM processing:
        parser = HTMLParser.create_for_ai()
        """

        print(guide_text)
|
|
|
|
|
|
# Compatibility aliases
#
# NOTE: ``SECHTMLParser`` is deliberately not bound as a plain module
# attribute. Module-level ``__getattr__`` (PEP 562) is consulted only when
# normal attribute lookup fails, so an eager ``SECHTMLParser = ...`` alias
# would make the deprecation warning below unreachable. The name still
# resolves through ``__getattr__`` for attribute access and
# ``from ... import SECHTMLParser``.
HTMLDocument = LegacyHTMLDocument


# Auto-migration for common imports
def __getattr__(name):
    """Provide compatibility imports with warnings.

    Called by Python only for module attributes that are not defined
    normally.

    Args:
        name: The attribute being looked up on this module.

    Returns:
        ``LegacySECHTMLParser`` when ``name`` is ``"SECHTMLParser"``, after
        emitting a ``DeprecationWarning``.

    Raises:
        AttributeError: For any other name, matching the standard message
            for a missing module attribute.
    """
    if name == "SECHTMLParser":
        warnings.warn(
            "Importing SECHTMLParser from edgar.documents.migration is deprecated. "
            "Use HTMLParser from edgar.documents instead.",
            DeprecationWarning,
            stacklevel=2
        )
        return LegacySECHTMLParser

    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")