Initial commit

2025-12-09 12:13:01 +01:00
commit 8e654ed209
13332 changed files with 2695056 additions and 0 deletions
--- a/venv/lib/python3.10/site-packages/edgar/documents/utils/html_utils.py
+++ b/venv/lib/python3.10/site-packages/edgar/documents/utils/html_utils.py
@@ -0,0 +1,96 @@
+"""
+HTML utility functions for document parsing.
+
+This module consolidates common HTML processing utilities used across
+the parser, preprocessor, and simple parser implementations.
+"""
+
+import lxml.html
+from typing import Optional
+
+
+def remove_xml_declaration(html: str) -> str:
+    """
+    Remove a leading XML declaration from HTML if present.
+
+    SEC HTML documents sometimes include XML declarations like:
+        <?xml version="1.0" encoding="UTF-8"?>
+
+    These can interfere with HTML parsing, so the declaration is removed
+    together with any whitespace before it. Input that starts with
+    '<?xml' but never closes with '?>' is returned unchanged.
+
+    Args:
+        html: HTML string that may start with an XML declaration
+
+    Returns:
+        The text after the first '?>' when a declaration is found,
+        otherwise the input unchanged
+
+    Examples:
+        >>> remove_xml_declaration('<?xml version="1.0"?><html>...')
+        '<html>...'
+        >>> remove_xml_declaration('<?xml version="1.0"')  # unterminated
+        '<?xml version="1.0"'
+    """
+    if not html.lstrip().startswith('<?xml'):
+        return html
+    xml_end = html.find('?>')
+    if xml_end == -1:  # unterminated declaration: nothing safe to cut
+        return html
+    return html[xml_end + 2:]
+
+
+def create_lxml_parser(
+    remove_blank_text: bool = True,
+    remove_comments: bool = True,
+    recover: bool = True,
+    encoding: Optional[str] = 'utf-8'
+) -> lxml.html.HTMLParser:
+    """
+    Build an lxml HTMLParser with the project's standard settings.
+
+    Every caller that needs an HTML parser can go through this factory
+    so the same options are applied consistently.
+
+    Args:
+        remove_blank_text: Passed to HTMLParser; discards blank text
+            nodes between tags. Defaults to True.
+        remove_comments: Passed to HTMLParser; discards HTML comments
+            from the parsed tree. Defaults to True.
+        recover: Passed to HTMLParser; keeps parsing through malformed
+            markup instead of failing. Defaults to True because SEC
+            filings often contain non-standard HTML.
+        encoding: Encoding forwarded to HTMLParser. Defaults to
+            'utf-8'. When None, the keyword is not passed at all and
+            lxml falls back to its own default behavior.
+
+    Returns:
+        A new lxml.html.HTMLParser configured with the given options
+
+    Examples:
+        >>> # Defaults: drop blank text and comments, recover on errors
+        >>> parser = create_lxml_parser()
+
+        >>> # Keep whitespace and comments (e.g. for XBRL content)
+        >>> parser = create_lxml_parser(
+        ...     remove_blank_text=False,
+        ...     remove_comments=False
+        ... )
+
+        >>> # Do not force an encoding
+        >>> parser = create_lxml_parser(encoding=None)
+
+    Note:
+        Keep recover=True for SEC documents unless strict parsing is
+        explicitly wanted.
+    """
+    options = dict(
+        remove_blank_text=remove_blank_text,
+        remove_comments=remove_comments,
+        recover=recover,
+    )
+    if encoding is None:
+        # Leave the keyword out entirely so lxml applies its own default
+        return lxml.html.HTMLParser(**options)
+    return lxml.html.HTMLParser(encoding=encoding, **options)