Initial commit
This commit is contained in:
82
venv/lib/python3.10/site-packages/edgar/sgml/tools.py
Normal file
82
venv/lib/python3.10/site-packages/edgar/sgml/tools.py
Normal file
@@ -0,0 +1,82 @@
|
||||
import base64
|
||||
import re
|
||||
|
||||
__all__ = ['extract_text_between_tags', 'get_content_between_tags', 'strip_tags', 'is_xml', 'decode_uu']
|
||||
|
||||
def extract_text_between_tags(content: str, tag: str) -> str:
|
||||
"""
|
||||
Extracts text from provided content between the specified HTML/XML tags.
|
||||
|
||||
:param content: The text content to search through
|
||||
:param tag: The tag to extract the content from
|
||||
:return: The extracted text between the tags
|
||||
"""
|
||||
tag_start = f'<{tag}>'
|
||||
tag_end = f'</{tag}>'
|
||||
is_tag = False
|
||||
extracted_content = ""
|
||||
|
||||
for line in content.splitlines():
|
||||
if line.startswith(tag_start):
|
||||
is_tag = True
|
||||
continue # Skip the start tag line
|
||||
elif line.startswith(tag_end):
|
||||
break # Stop reading if end tag is found
|
||||
elif is_tag:
|
||||
extracted_content += line + '\n' # Add line to result
|
||||
|
||||
return extracted_content.strip()
|
||||
|
||||
|
||||
def get_content_between_tags(content: str, outer_tag: str = None) -> str:
|
||||
"""
|
||||
Extract content between specified tags, starting from most nested tags.
|
||||
|
||||
Args:
|
||||
content: Raw content containing tagged sections
|
||||
outer_tag: Optional specific tag to extract from (e.g. 'XBRL', 'TEXT')
|
||||
|
||||
Returns:
|
||||
str: Content between the specified tags, or innermost content if no tag specified
|
||||
"""
|
||||
known_tags = ["PDF", "XBRL", "XML", "TEXT"] # Ordered from most nested to least nested
|
||||
|
||||
if outer_tag:
|
||||
# Extract content for specific tag
|
||||
pattern = f'<{outer_tag}>(.*?)</{outer_tag}>'
|
||||
match = re.search(pattern, content, re.DOTALL)
|
||||
return match.group(1).strip() if match else ''
|
||||
|
||||
# If no tag specified, find the first matching tag from most nested to least
|
||||
for tag in known_tags:
|
||||
pattern = f'<{tag}>(.*?)</{tag}>'
|
||||
match = re.search(pattern, content, re.DOTALL)
|
||||
if match:
|
||||
return match.group(1).strip()
|
||||
|
||||
return ''
|
||||
|
||||
|
||||
def strip_tags(text: str, start_tag: str, end_tag: str) -> str:
|
||||
"""Strip XML/HTML tags from text if present."""
|
||||
if text.startswith(start_tag) and text.endswith(end_tag):
|
||||
return text[len(start_tag):-len(end_tag)].strip()
|
||||
return text
|
||||
|
||||
def is_xml(filename: str) -> bool:
|
||||
"""Check if a file is XML based on the file extension.
|
||||
.xsd, .xml, .xbrl
|
||||
"""
|
||||
return filename.lower().endswith(('.xsd', '.xml', '.xbrl'))
|
||||
|
||||
|
||||
def decode_uu(uu_content):
|
||||
lines = uu_content.split('\n')
|
||||
data = ''
|
||||
for line in lines[1:]: # Skip "begin" line
|
||||
if line.startswith('`') or line.startswith('end'):
|
||||
break
|
||||
# Convert UU to base64 padding
|
||||
data += ''.join([chr(((ord(c) - 32) & 63) + 32) for c in line.strip()])
|
||||
|
||||
return base64.b64decode(data)
|
||||
Reference in New Issue
Block a user