Initial commit

2025-12-09 12:13:01 +01:00
commit 8e654ed209
13332 changed files with 2695056 additions and 0 deletions
--- a/venv/lib/python3.10/site-packages/edgar/sgml/tools.py
+++ b/venv/lib/python3.10/site-packages/edgar/sgml/tools.py
@@ -0,0 +1,82 @@
+import base64
+import re
+
+__all__ = ['extract_text_between_tags', 'get_content_between_tags', 'strip_tags', 'is_xml', 'decode_uu']
+
+def extract_text_between_tags(content: str, tag: str) -> str:
+    """
+    Extracts text from provided content between the specified HTML/XML tags.
+
+    :param content: The text content to search through
+    :param tag: The tag to extract the content from
+    :return: The extracted text between the tags
+    """
+    tag_start = f'<{tag}>'
+    tag_end = f'</{tag}>'
+    is_tag = False
+    extracted_content = ""
+
+    for line in content.splitlines():
+        if line.startswith(tag_start):
+            is_tag = True
+            continue  # Skip the start tag line
+        elif line.startswith(tag_end):
+            break  # Stop reading if end tag is found
+        elif is_tag:
+            extracted_content += line + '\n'  # Add line to result
+
+    return extracted_content.strip()
+
+
+def get_content_between_tags(content: str, outer_tag: str = None) -> str:
+    """
+    Extract content between specified tags, starting from most nested tags.
+
+    Args:
+        content: Raw content containing tagged sections
+        outer_tag: Optional specific tag to extract from (e.g. 'XBRL', 'TEXT')
+
+    Returns:
+        str: Content between the specified tags, or innermost content if no tag specified
+    """
+    known_tags = ["PDF", "XBRL", "XML", "TEXT"]  # Ordered from most nested to least nested
+
+    if outer_tag:
+        # Extract content for specific tag
+        pattern = f'<{outer_tag}>(.*?)</{outer_tag}>'
+        match = re.search(pattern, content, re.DOTALL)
+        return match.group(1).strip() if match else ''
+
+    # If no tag specified, find the first matching tag from most nested to least
+    for tag in known_tags:
+        pattern = f'<{tag}>(.*?)</{tag}>'
+        match = re.search(pattern, content, re.DOTALL)
+        if match:
+            return match.group(1).strip()
+
+    return ''
+
+
+def strip_tags(text: str, start_tag: str, end_tag: str) -> str:
+    """Strip XML/HTML tags from text if present."""
+    if text.startswith(start_tag) and text.endswith(end_tag):
+        return text[len(start_tag):-len(end_tag)].strip()
+    return text
+
+def is_xml(filename: str) -> bool:
+    """Check if a file is XML based on the file extension.
+    .xsd, .xml, .xbrl
+    """
+    return filename.lower().endswith(('.xsd', '.xml', '.xbrl'))
+
+
+def decode_uu(uu_content):
+    lines = uu_content.split('\n')
+    data = ''
+    for line in lines[1:]:  # Skip "begin" line
+        if line.startswith('`') or line.startswith('end'):
+            break
+        # Convert UU to base64 padding
+        data += ''.join([chr(((ord(c) - 32) & 63) + 32) for c in line.strip()])
+
+    return base64.b64decode(data)