Files
edgartools/venv/lib/python3.10/site-packages/edgar/documents/utils/html_utils.py
2025-12-09 12:13:01 +01:00

97 lines
2.9 KiB
Python

"""
HTML utility functions for document parsing.
This module consolidates common HTML processing utilities used across
the parser, preprocessor, and simple parser implementations.
"""
import lxml.html
from typing import Optional
def remove_xml_declaration(html: str) -> str:
    """
    Remove a leading XML declaration from HTML if present.

    SEC HTML documents sometimes include XML declarations like:
        <?xml version="1.0" encoding="UTF-8"?>
    These can interfere with HTML parsing, so the declaration is cut off
    the front of the document.

    The declaration is detected after ignoring leading whitespace. When it
    is removed, any whitespace that preceded it is removed as well, since
    everything up to and including the closing ``?>`` is dropped.

    Args:
        html: HTML string that may start with an XML declaration.

    Returns:
        The HTML string with the leading XML declaration removed. The input
        is returned unchanged when it does not start with ``<?xml`` or when
        the declaration is never terminated by ``?>``.

    Examples:
        >>> html = '<?xml version="1.0"?><!DOCTYPE html><html>...'
        >>> remove_xml_declaration(html)
        '<!DOCTYPE html><html>...'
        >>> html = '<!DOCTYPE html><html>...'  # No XML declaration
        >>> remove_xml_declaration(html)
        '<!DOCTYPE html><html>...'
        >>> remove_xml_declaration('<?xml version="1.0"')  # Unterminated
        '<?xml version="1.0"'
    """
    # Only the prefix matters for detection, so skip leading whitespace.
    if not html.lstrip().startswith('<?xml'):
        return html

    terminator = html.find('?>')
    if terminator == -1:
        # Unterminated declaration (e.g. truncated input): slicing from
        # find() + 2 would silently chop off the first character, so leave
        # the document untouched instead.
        return html

    # Skip past the two-character '?>' terminator.
    return html[terminator + 2:]
def create_lxml_parser(
    remove_blank_text: bool = True,
    remove_comments: bool = True,
    recover: bool = True,
    encoding: Optional[str] = 'utf-8'
) -> lxml.html.HTMLParser:
    """
    Build an lxml HTMLParser from a small set of options.

    This is a factory that forwards its arguments as keyword arguments to
    ``lxml.html.HTMLParser`` so that callers share one place where parser
    settings are assembled.

    Args:
        remove_blank_text: Forwarded as ``remove_blank_text``; asks the
            parser to drop blank text nodes between tags. Default True.
        remove_comments: Forwarded as ``remove_comments``; asks the parser
            to drop HTML comments from the tree. Default True.
        recover: Forwarded as ``recover``; enables error recovery for
            malformed HTML. Default True.
        encoding: Forwarded as ``encoding`` when not None. Default 'utf-8'.
            When None, no ``encoding`` argument is passed at all, leaving
            the choice to lxml's own default behavior.

    Returns:
        A new ``lxml.html.HTMLParser`` instance configured as requested.

    Examples:
        >>> # Default parser settings
        >>> parser = create_lxml_parser()
        >>> # Parser that keeps blank text and comments
        >>> parser = create_lxml_parser(
        ...     remove_blank_text=False,
        ...     remove_comments=False
        ... )
        >>> # Parser constructed without an explicit encoding
        >>> parser = create_lxml_parser(encoding=None)
    """
    parser_options = dict(
        remove_blank_text=remove_blank_text,
        remove_comments=remove_comments,
        recover=recover,
    )

    # No encoding requested: leave the keyword out rather than passing None.
    if encoding is None:
        return lxml.html.HTMLParser(**parser_options)

    return lxml.html.HTMLParser(encoding=encoding, **parser_options)