Initial commit

This commit is contained in:
kdusek
2025-12-09 12:13:01 +01:00
commit 8e654ed209
13332 changed files with 2695056 additions and 0 deletions

View File

@@ -0,0 +1,950 @@
import http.server
import os
import re
import signal
import socketserver
import tempfile
import time
import webbrowser
import zipfile
from functools import lru_cache
from pathlib import Path
from threading import Thread
from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union
if TYPE_CHECKING:
from edgar.company_reports import Report
from edgar.sgml.sgml_common import FilingSGML, SGMLDocument
import textwrap
from bs4 import BeautifulSoup
from pydantic import BaseModel
from rich import box
from rich.columns import Columns
from rich.console import Group
from rich.panel import Panel
from rich.table import Column, Table
from rich.text import Text
from edgar.core import binary_extensions, has_html_content, sec_dot_gov, text_extensions
from edgar.files.html_documents import get_clean_html
from edgar.files.markdown import to_markdown
from edgar.httpclient import async_http_client
from edgar.httprequests import download_file, download_file_async, get_with_retry
from edgar.richtools import print_rich, print_xml, repr_rich, rich_to_text
# Data-file descriptions that identify the XBRL instance document of a filing.
xbrl_document_types = [
    'XBRL INSTANCE DOCUMENT',
    'XBRL INSTANCE FILE',
    'EXTRACTED XBRL INSTANCE DOCUMENT',
]
# Names exported by a star-import of this module.
__all__ = ['Attachment', 'Attachments', 'FilingHomepage', 'FilerInfo', 'AttachmentServer', 'sec_document_url', 'get_document_type']
def sec_document_url(attachment_url: str) -> str:
    """Return the absolute URL for an attachment path.

    Every occurrence of ``ix?doc=/`` or ``ix.xhtml?doc=/`` is removed from
    *attachment_url*, and the result is appended to ``sec_dot_gov``.
    """
    viewer_marker = re.compile(r"ix(\.xhtml)?\?doc=/")
    plain_path = viewer_marker.sub("", attachment_url)
    return f"{sec_dot_gov}{plain_path}"
def sequence_sort_key(x):
    """Return a sort key for an object carrying a ``sequence_number`` string.

    Numeric sequence numbers sort first (numerically), non-numeric ones
    follow (as strings), and blank or whitespace-only ones sort last.
    """
    raw = x.sequence_number
    if not raw.strip():
        # Blank sequence numbers go to the very end.
        return (float('inf'), '')
    try:
        numeric = float(raw)
    except ValueError:
        # Not a number: keep the raw text so these compare among themselves.
        return (1, raw)
    return (0, numeric)
# Icon shown next to an attachment, keyed by SEC document type or by
# lower-case file extension (including the dot).
FILE_TYPE_SYMBOLS: Dict[str, str] = {
    # Main filing document
    "10-K": "📄",

    # Numbered exhibits
    "EX-21.1": "📎",
    "EX-23.1": "📎",
    "EX-31.1": "📎",
    "EX-31.2": "📎",
    "EX-32.1": "📎",
    "EX-97.1": "📎",

    # XBRL exhibits
    "EX-101.SCH": "🔰",  # schema
    "EX-101.CAL": "📊",  # calculations
    "EX-101.DEF": "📚",  # definitions
    "EX-101.LAB": "📎",  # labels
    "EX-101.PRE": "📈",  # presentation

    # Generic document types
    "XML": "🔷",
    "HTML": "🌍",
    "GRAPHIC": "🎨",
    "EXCEL": "📊",
    "JSON": "📝",
    "ZIP": "📦",
    "CSS": "📃",
    "JS": "📄",

    # File extensions, and PDF in both forms
    ".css": "📃",
    ".js": "📄",
    "PDF": "📕",
    ".pdf": "📕",

    # Other document types
    "INFORMATION TABLE": "📊",
}
def get_extension(filename: str) -> str:
    """Return everything from the last dot of *filename* onward, or '' if it has no dot."""
    last_dot = filename.rfind('.')
    return '' if last_dot < 0 else filename[last_dot:]
def get_document_type(filename: str, declared_document_type: str) -> str:
    """
    Sometimes the SEC gets the document type wrong. This function uses the extension to determine the document type

    Only generic, format-like declared types (XML, HTML, PDF, ...) are
    corrected; any other declared type (e.g. an exhibit code) is returned
    unchanged.

    Args:
        filename: The attachment file name.
        declared_document_type: The document type listed for the attachment.
    Returns:
        The upper-cased extension of *filename* (with HTM/HTML normalized to
        "HTML") when the declared type is generic and the file has an
        extension; otherwise *declared_document_type* unchanged.
    """
    # "XSLX" is a historical misspelling of "XLSX"; it is kept so inputs that
    # matched before still match.
    generic_types = ("XML", "HTML", "PDF", "HTM", "JS", "CSS", "ZIP",
                     "XLS", "XLSX", "XSLX", "JSON")
    if declared_document_type.upper() not in generic_types:
        return declared_document_type

    _, dot, suffix = filename.rpartition('.')
    if not dot or not suffix:
        # No usable extension: the declared type is the best information we have.
        return declared_document_type

    document_type = suffix.upper()
    if document_type in ("HTM", "HTML"):
        return "HTML"
    return document_type
def get_file_icon(file_type: str, sequence: str = None, filename: str = None) -> str:
    """
    Pick the Unicode symbol to display for an attachment.

    Args:
        file_type: The type of the file from SEC filing
        sequence: The sequence number of the file in the filing
        filename: The name of the file, used to look up an icon by extension

    Returns:
        The scroll symbol when sequence is "1"; otherwise the symbol for an
        XBRL exhibit (EX-101.*), the exhibit symbol for any other EX-* type,
        or the symbol registered for the file extension. When none of these
        yields an icon, the symbol registered for file_type is used, with
        "📄" as the final default. A single-character icon is returned with
        one trailing space as padding.
    """
    default_icon = "📄"

    if sequence == "1":
        chosen = "📜"  # main filing document
    elif file_type.startswith("EX-101."):
        chosen = FILE_TYPE_SYMBOLS.get(file_type, default_icon)
    elif file_type.startswith("EX-"):
        chosen = "📋"  # any other exhibit
    else:
        # Extension lookup only applies when a filename was supplied.
        chosen = FILE_TYPE_SYMBOLS.get(get_extension(filename)) if filename else None

    if not chosen:
        chosen = FILE_TYPE_SYMBOLS.get(file_type, default_icon)

    if len(chosen) == 1:
        chosen = f"{chosen} "
    return chosen
class FilerInfo(BaseModel):
    """Name, CIK, identification text and addresses of one filer on a filing homepage."""
    company_name: str
    cik: str
    identification: str
    addresses: List[str]

    def __rich__(self):
        """Render the filer as a panel titled with the company name."""
        # At most the first two addresses are shown, as before; slicing
        # instead of indexing avoids an IndexError when a filer has fewer
        # than two addresses.
        return Panel(
            Columns([self.identification, Text(" "), *self.addresses[:2]]),
            title=self.company_name
        )

    def __repr__(self):
        return repr_rich(self.__rich__())
class Attachment:
    """
    A class to represent an attachment in an SEC filing
    """

    def __init__(self,
                 sequence_number: str,
                 description: str,
                 document: str,
                 ixbrl: bool,
                 path: str,
                 document_type: str,
                 size: Optional[int],
                 sgml_document: Optional['SGMLDocument'] = None,
                 purpose: Optional[str] = None,
                 filing_sgml: Optional['FilingSGML'] = None):
        self.sequence_number = sequence_number
        self.description = description
        self.document = document
        self.ixbrl = ixbrl
        self.path = path
        self.document_type = document_type
        self.size = size
        self.sgml_document: Optional['SGMLDocument'] = sgml_document
        self.sgml = filing_sgml
        self.purpose = purpose
        # Allows tests to override content via property patching
        self._content_override = None

    @property
    def content(self):
        """
        The attachment content.

        Resolution order: an override installed through the setter, then an
        empty string for synthetic paths starting with "/test/", then the
        content of the SGML document if one was supplied, and finally a
        download from ``url``. Nothing is cached here, so each access may
        repeat the download.
        """
        # If tests have overridden content using the property's setter, honor it
        override = getattr(self, "_content_override", None)
        if override is not None:
            if isinstance(override, property) and override.fget is not None:
                return override.fget(self)
            try:
                return override(self)  # callable override
            except TypeError:
                return override  # direct value
        # Avoid real network calls for synthetic test paths
        if isinstance(self.path, str) and self.path.startswith("/test/"):
            return ""
        if self.sgml_document:
            return self.sgml_document.content
        return download_file(self.url)

    @content.setter
    def content(self, value):
        # Enable tests to patch instance property via unittest.mock.patch.object
        self._content_override = value

    @content.deleter
    def content(self):
        self._content_override = None

    @property
    def url(self):
        """The absolute URL of the attachment, derived from ``path``."""
        return sec_document_url(self.path)

    @property
    def extension(self):
        """The actual extension of the filing document
        Usually one of .xml or .html or .pdf or .txt or .paper
        """
        return os.path.splitext(self.document)[1]

    @property
    def display_extension(self) -> str:
        """This is the extension displayed in the html e.g. "es220296680_4-davis.html"
        The actual extension would be "es220296680_4-davis.xml", that displays as html in the browser
        """
        return os.path.splitext(self.document)[1]

    def validate_sequence_number(self, v):
        """Return *v* if it is all digits or empty; raise ValueError otherwise."""
        if not v.isdigit() and v != '':
            raise ValueError('sequence_number must be digits or an empty string')
        return v

    def is_text(self) -> bool:
        """Is this a text document"""
        return self.extension in text_extensions

    def is_xml(self) -> bool:
        """Is the extension one of .xsd, .xml or .xbrl (case-insensitive)"""
        return self.extension.lower() in [".xsd", ".xml", ".xbrl"]

    def is_html(self) -> bool:
        """Is the extension .htm or .html (case-insensitive)"""
        return self.extension.lower() in [".htm", ".html"]

    def is_binary(self) -> bool:
        """Is this a binary document"""
        return self.extension in binary_extensions

    @property
    def empty(self):
        """Some older filings have no document url. So effectively this attachment is empty"""
        return self.document is None or self.document.strip() == ''

    def download(self, path: Optional[Union[str, Path]] = None) -> Optional[Union[str, bytes]]:
        """
        Download the file to a specified path.
        If the path is not provided, return the downloaded content as text or bytes.
        If the path is a directory, the file is saved with its original name in that directory.
        If the path is a file, the file is saved with the given path name.

        Returns:
            The content when *path* is None, otherwise the saved file path as a string.
        """
        if path is None:
            return self.content

        target = Path(path)
        file_path = target / self.document if target.is_dir() else target

        # Read the property exactly once: every access may hit the network.
        content = self.content
        if isinstance(content, bytes):
            file_path.write_bytes(content)
        else:
            # utf-8 matches what Attachments.download writes and does not
            # depend on the platform's locale encoding.
            file_path.write_text(content, encoding='utf-8')
        return str(file_path)

    def view(self):
        """
        Print the attachment to the console.

        A report attachment (with filing SGML available) is displayed through
        its report object, if one is found. Otherwise text attachments are
        printed as a parsed document (HTML) or as XML; anything else is
        silently ignored.
        """
        # Check if this is a report
        if self.is_report() and self.sgml:
            report = self.sgml.filing_summary.reports.get_by_filename(self.document)
            if report:
                report.view()
            return

        if not self.is_text():
            return
        content = self.content
        if self.is_html() or has_html_content(content):
            from edgar import Document
            document = Document.parse(content)
            print_rich(document)
        elif self.is_xml():
            print_xml(content)

    def is_report(self) -> bool:
        """Does the document name start like a report page, i.e. ``R<digits>.htm``"""
        # A missing document name cannot be a report (and re.match rejects None).
        return bool(self.document) and re.match(r"R\d+\.htm", self.document) is not None

    def text(self):
        """
        Return the attachment as plain text, or None if it is not a text document.

        A report found in the filing SGML supplies its own text; HTML content
        is parsed and rendered to text; other text content is returned as is.
        """
        # Check if this is a report
        if self.is_report() and self.sgml:
            report = self.sgml.filing_summary.reports.get_by_filename(self.document)
            if report:
                return report.text()
        if self.is_text():
            content = self.content
            if self.is_html() or has_html_content(content):
                from edgar import Document
                document = Document.parse(content)
                return rich_to_text(document)
            return content
        return None

    def markdown(self, include_page_breaks: bool = False, start_page_number: int = 0) -> Optional[str]:
        """
        Convert the attachment to markdown format if it's HTML content.
        Args:
            include_page_breaks: If True, include page break delimiters in the markdown
            start_page_number: Starting page number for page break markers (default: 0)
        Returns:
            None if the attachment is not HTML or cannot be converted.
        """
        if not self.is_html():
            return None
        content = self.content
        if not content:
            return None
        # Check if content has HTML structure
        if not has_html_content(content):
            return None
        # Use the same approach as Filing.markdown() but with page break support
        clean_html = get_clean_html(content)
        if clean_html:
            return to_markdown(clean_html, include_page_breaks=include_page_breaks, start_page_number=start_page_number)
        return None

    def __rich__(self):
        icon = get_file_icon(self.document_type, self.sequence_number, self.document)
        text = Text.assemble((f"{self.sequence_number:<3} ", "dim italic"),
                             " ",
                             (self.document, "bold"),
                             " ", (self.purpose or self.description, "grey54"),
                             " ",
                             (icon, ""),
                             " ",
                             (self.document_type,
                              "bold deep_sky_blue1" if self.sequence_number == "1" else "")
                             )
        return Panel(text, box=box.ROUNDED, width=200, expand=False)

    def __repr__(self):
        return repr_rich(self.__rich__())

    def __str__(self):
        return repr_rich(self.__rich__())
class Attachments:
    """
    A class to represent the attachments of an SEC filing

    Holds the document files and (optional) data files of a filing, plus the
    list of primary documents. Iteration, ``len()`` and indexing operate on
    documents followed by data files.
    """

    def __init__(self,
                 document_files: List['Attachment'],
                 data_files: Optional[List['Attachment']],
                 primary_documents: List['Attachment'],
                 sgml: Optional['FilingSGML'] = None):
        self.documents = document_files
        self.data_files = data_files
        self._attachments = document_files + (data_files or [])
        self.primary_documents = primary_documents
        self.sgml = sgml
        # Cursor for the legacy ``next(attachments)`` protocol only
        self.n = 0

    def __getitem__(self, item: Union[int, str]):
        """
        Get the attachment by sequence number as set in the SEC filing SGML file,
        or by document name when *item* is a non-numeric string.

        Raises:
            KeyError: if nothing matches (including unsupported key types).
        """
        if isinstance(item, int) or (isinstance(item, str) and item.isdigit()):
            return self.get_by_sequence(item)
        if isinstance(item, str):
            for doc in self._attachments:
                if doc.document == item:
                    return doc
        raise KeyError(f"Document not found: {item}")

    def get_by_sequence(self, sequence: Union[str, int]):
        """
        Get the attachment by sequence number starting at 1
        The sequence number is the exact sequence number in the filing

        Raises:
            KeyError: if no attachment has that sequence number.
        """
        wanted = str(sequence)
        for doc in self._attachments:
            if doc.sequence_number == wanted:
                return doc
        raise KeyError(f"Document not found: {sequence}")

    def get_by_index(self, index: int):
        """
        Get the attachment by its zero-based position among documents followed by data files
        """
        return self._attachments[index]

    def get_report(self, filename: str) -> Optional['Report']:
        """
        Get a report by filename, or None when no filing SGML or reports are available
        """
        if self.sgml:
            reports = self.sgml.filing_summary.reports
            if reports:
                return reports.get_by_filename(filename)
        return None

    @property
    def primary_html_document(self) -> Optional['Attachment']:
        """Get the primary html document on the filing"""
        for doc in self.primary_documents:
            if doc.display_extension == ".html" or doc.display_extension == '.htm':
                return doc
        # Most filings have html primary documents. Some don't.
        # E.g. Form's 3,4,5 do when loaded directly from edgar but not when loaded from local files
        # However, there are unusual filings with endings like ".fil" that require a return. So return the first one
        if len(self.primary_documents) > 0:
            return self.primary_documents[0]
        return None

    @property
    def primary_xml_document(self) -> Optional['Attachment']:
        """Get the primary xml document on the filing"""
        for doc in self.primary_documents:
            if doc.display_extension == ".xml":
                return doc
        return None

    @property
    def text_document(self):
        """The last document described as "Complete submission text file", or None"""
        for doc in reversed(self.documents):
            if doc.description == "Complete submission text file":
                return doc
        return None

    @property
    def exhibits(self):
        """
        Get all the exhibits in the filing.
        This is the primary document plus all the documents listed as EX-XX
        """
        primary = self.primary_html_document
        # A filing without any primary document must not contribute a None entry
        primary_documents = [primary] if primary is not None else []
        exhibits_documents = self.query("re.match('EX-', document_type)", False).documents
        return Attachments(
            document_files=primary_documents + exhibits_documents,
            data_files=[],
            primary_documents=primary_documents,
            sgml=self.sgml)

    @property
    def graphics(self):
        """All attachments whose document type is GRAPHIC"""
        return self.query("document_type=='GRAPHIC'")

    def query(self, query_str: str, include_data_files: bool = True):
        """
        Query attachments based on a simple query string.
        Supports conditions on 'document', 'description', and 'document_type'.
        Example query: "document.endswith('.htm') and 'RELEASE' in description and document_type in ['EX-99.1', 'EX-99', 'EX-99.01']"

        SECURITY: the query string is executed with ``eval``. Only pass
        trusted, developer-written queries; never forward user input here.

        Returns:
            A new Attachments containing the matching documents (and data
            files, unless include_data_files is False).
        """
        allowed_attrs = {'document', 'description', 'document_type'}
        # Patterns for finding attribute names and re.match calls
        attr_regex = re.compile(rf"\b({'|'.join(allowed_attrs)})\b")
        match_regex = re.compile(r"re\.match\('(.*)', (\w+)\)")

        # The rewrite does not depend on the attachment, so do it once.
        # Replace attribute references with attachment attributes
        expression = attr_regex.sub(lambda m: f"attachment.{m.group(0)}", query_str)
        # Handle regex match explicitly
        match = match_regex.search(expression)
        if match:
            pattern, attr = match.groups()
            expression = expression.replace(f"re.match('{pattern}', {attr})",
                                            f"re.match(r'{pattern}', attachment.{attr})")

        def matches(attachment):
            # NOTE(security): eval of a caller-supplied expression, see docstring
            return eval(expression, {"re": re, "attachment": attachment})

        # Evaluate the query for documents and data files
        new_documents = [attachment for attachment in self.documents if matches(attachment)]
        if include_data_files:
            new_data_files = [attachment for attachment in self.data_files
                              if matches(attachment)] if self.data_files else None
        else:
            new_data_files = []
        return Attachments(document_files=new_documents, data_files=new_data_files,
                           primary_documents=self.primary_documents, sgml=self.sgml)

    @staticmethod
    async def _download_all_attachments(attachments: List['Attachment']):
        """Download all attachments concurrently; results are in the same order as the input."""
        import asyncio
        async with async_http_client() as client:
            return await asyncio.gather(
                *[download_file_async(client, attachment.url, as_text=attachment.is_text()) for attachment in attachments])

    def download(self, path: Union[str, Path], archive: bool = False):
        """
        Download all the attachments to a specified path.
        If archive is False, path must be an existing directory and each file is saved there under its original name.
        If archive is True, the attachments are saved in a zip file at path.
        When filing SGML is available the download is delegated to it.
        path: str or Path - The path to save the attachments
        archive: bool (default False) - If True, save the attachments in a zip file

        Raises:
            ValueError: if archive is True and path is a directory, or
                archive is False and path is not a directory.
        """
        if self.sgml:
            self.sgml.download(path, archive)
            return

        # Ensure path is a Path object
        path = Path(path)
        # Validate before downloading so a bad path does not cost a full download
        if archive and path.is_dir():
            raise ValueError("Path must be a zip file name to create zipfile")
        if not archive and not path.is_dir():
            raise ValueError("Path must be a directory")

        import asyncio
        try:
            loop = asyncio.get_event_loop()
        except RuntimeError:
            # No current event loop (e.g. non-main thread, newer Pythons): create one
            loop = asyncio.new_event_loop()
            asyncio.set_event_loop(loop)
        downloaded_files = loop.run_until_complete(Attachments._download_all_attachments(self._attachments))

        if archive:
            with zipfile.ZipFile(path, 'w') as zipf:
                for attachment, downloaded in zip(self._attachments, downloaded_files, strict=False):
                    if isinstance(downloaded, bytes):
                        zipf.writestr(attachment.document, downloaded)
                    else:
                        zipf.writestr(attachment.document, downloaded.encode('utf-8'))
        else:
            for attachment, downloaded in zip(self._attachments, downloaded_files, strict=False):
                file_path = path / attachment.document
                if isinstance(downloaded, bytes):
                    file_path.write_bytes(downloaded)
                else:
                    file_path.write_text(downloaded, encoding='utf-8')

    def serve(self, port: int = 8000) -> Tuple[Thread, socketserver.TCPServer, str]:
        """
        Serve the attachment on a local server
        The server can be stopped using CTRL-C
        port: int (default 8000) - The port to serve the attachment

        The attachments are downloaded into a temporary directory that lives
        for the duration of this call; the call blocks until the server
        thread ends.
        """
        with tempfile.TemporaryDirectory() as temp_dir:
            temp_path = Path(temp_dir)
            self.download(temp_path)

            class Handler(http.server.SimpleHTTPRequestHandler):
                def __init__(self, *args, **kwargs):
                    super().__init__(*args, directory=temp_dir, **kwargs)

            primary_html = os.path.basename(self.primary_html_document.path)
            url = f'http://localhost:{port}/{primary_html}'
            httpd = socketserver.TCPServer(("", port), Handler)

            def serve_forever():
                with httpd:
                    httpd.serve_forever()

            thread = Thread(target=serve_forever)
            thread.daemon = True
            thread.start()
            # Wait for the server to start
            time.sleep(1)

            def signal_handler(sig, frame):
                httpd.shutdown()
                thread.join()

            signal.signal(signal.SIGINT, signal_handler)
            webbrowser.open(url)
            # Keep the main thread alive to handle signals
            while thread.is_alive():
                time.sleep(0.1)
            return thread, httpd, url

    def markdown(self, include_page_breaks: bool = False, start_page_number: int = 0) -> Dict[str, str]:
        """
        Convert all HTML attachments to markdown format.
        Args:
            include_page_breaks: If True, include page break delimiters in the markdown
            start_page_number: Starting page number for page break markers (default: 0)
        Returns:
            A dictionary mapping attachment document names to their markdown content.
            Only includes attachments that can be successfully converted to markdown.
        """
        markdown_attachments = {}
        for attachment in self._attachments:
            if attachment.is_html():
                md_content = attachment.markdown(include_page_breaks=include_page_breaks, start_page_number=start_page_number)
                if md_content:
                    markdown_attachments[attachment.document] = md_content
        return markdown_attachments

    def __len__(self):
        return len(self._attachments)

    def __iter__(self):
        # Hand out an independent iterator so nested or concurrent loops over
        # the same object do not share a cursor. The legacy cursor is still
        # reset for callers that use next(attachments) directly.
        self.n = 0
        return iter(self._attachments)

    def __next__(self):
        # Legacy protocol kept for callers that call next() on this object
        if self.n < len(self):
            _attachment = self._attachments[self.n]
            assert _attachment is not None
            self.n += 1
            return _attachment
        raise StopIteration

    def __rich__(self):
        # Document files
        document_table = Table(Column('Seq', header_style="dim"),
                               Column('Document', header_style="dim"),
                               Column('Description', header_style="dim", min_width=60),
                               Column('Type', header_style="dim", min_width=16),
                               title='Attachments',
                               row_styles=["", "bold"],
                               box=box.SIMPLE_HEAD)
        highlight = "bold deep_sky_blue1"
        all_attachments = sorted(self.documents + (self.data_files or []), key=sequence_sort_key)
        for attachment in all_attachments:
            # The attachment with sequence number "1" is highlighted
            is_first = attachment.sequence_number == "1"
            # Get the file icon for each attachment
            icon = get_file_icon(file_type=attachment.document_type,
                                 sequence=attachment.sequence_number,
                                 filename=attachment.document)
            sequence_number = attachment.sequence_number
            description = "\n".join(textwrap.wrap(attachment.purpose or attachment.description, 100))
            document_table.add_row(
                Text(sequence_number, style=highlight) if is_first else sequence_number,
                Text(attachment.document, style=highlight) if is_first else attachment.document,
                Text(description, style=highlight) if is_first else description,
                Text.assemble((icon, ""), " ", (attachment.document_type, highlight if is_first else "")),
            )
        return document_table

    def __repr__(self):
        return repr_rich(self.__rich__())

    @classmethod
    def load(cls, soup: 'BeautifulSoup'):
        """
        Load the attachments from the SEC filing home page

        The first ``tableFile`` table holds the document files and the
        second, if present, the data files.
        """
        tables = soup.find_all('table', class_='tableFile')

        def parse_table(table, documents: bool):
            min_seq = None
            # The list of attachments which are primary. This is the first document in the filing
            # Plus additional document with the same sequence number
            primary_documents: List['Attachment'] = []
            attachments = []
            for row in table.find_all('tr')[1:]:  # Skip header row
                cols = row.find_all('td')
                if len(cols) < 5:
                    # Not a complete attachment row; nothing can be parsed from it
                    continue
                sequence_number = cols[0].text.strip().replace('\xa0', '-')
                description = cols[1].text.strip()
                # The document text is the text of the document link.
                document_text = cols[2].text.strip()
                document = document_text.split(' ')[0].strip()
                iXbrl = 'iXBRL' in document_text
                # Some older filings have no document link at all
                link = cols[2].a
                path = link['href'].strip() if link is not None and link.get('href') else ''
                document_type = cols[3].text.strip()
                size = cols[4].text.strip()
                try:
                    size = int(size)
                except ValueError:
                    size = None
                attachment = Attachment(
                    sequence_number=sequence_number,
                    description=description,
                    document=document,
                    ixbrl=iXbrl,
                    path=path,
                    document_type=document_type,
                    size=size
                )
                # Add the attachment to the list
                attachments.append(attachment)
                # If this is the first document, set it as the primary document
                if documents:
                    if min_seq is None:
                        min_seq = sequence_number
                    if sequence_number == min_seq:
                        primary_documents.append(attachment)
            return attachments, primary_documents

        if tables:
            document_files, primary_documents = parse_table(tables[0], documents=True)
        else:
            document_files, primary_documents = [], []
        if len(tables) > 1:
            data_files, _ = parse_table(tables[1], documents=False)
        else:
            data_files = None
        return cls(document_files, data_files, primary_documents)
class AttachmentServer:
    """
    Serve the attachments of a filing from a local HTTP server.

    Construction downloads the attachments into a temporary directory, binds
    the server socket on ``port`` and installs a SIGINT handler. ``start()``
    runs the server and opens the primary html document in the browser.
    """

    def __init__(self, attachments: Attachments, port: int = 8000):
        self.attachments = attachments
        self.port = port
        self.thread = None
        self.httpd = None
        self.url = None
        # Held on the instance so stop() can remove the downloaded files
        self._temp_dir = None
        self.setup()

    def setup(self):
        """Download the attachments and prepare (but do not start) the server thread."""
        self._temp_dir = tempfile.TemporaryDirectory()
        serve_dir = self._temp_dir.name
        self.attachments.download(Path(serve_dir))

        class Handler(http.server.SimpleHTTPRequestHandler):
            def __init__(self, *args, **kwargs):
                super().__init__(*args, directory=serve_dir, **kwargs)

        primary_html = os.path.basename(self.attachments.primary_html_document.path)
        self.url = f'http://localhost:{self.port}/{primary_html}'
        self.httpd = socketserver.TCPServer(("", self.port), Handler)

        def serve_forever():
            with self.httpd:
                self.httpd.serve_forever()

        self.thread = Thread(target=serve_forever)
        self.thread.daemon = True
        signal.signal(signal.SIGINT, self.signal_handler)

    def start(self):
        """Start serving, open the browser, and block until the server thread ends."""
        self.thread.start()
        webbrowser.open(self.url)
        # Keep the main thread alive to handle signals
        while self.thread.is_alive():
            time.sleep(0.1)

    def stop(self):
        """Shut the server down and remove the temporary directory."""
        if self.thread is not None and self.thread.is_alive():
            self.httpd.shutdown()
            self.thread.join()
        elif self.httpd is not None:
            # shutdown() blocks forever unless serve_forever() is running in
            # another thread, and join() fails on a thread that was never
            # started; just release the socket instead.
            self.httpd.server_close()
        if self._temp_dir is not None:
            self._temp_dir.cleanup()
            self._temp_dir = None

    def signal_handler(self, sig, frame):
        """SIGINT handler: stop the server and exit the program."""
        self.stop()
        # The builtin exit() is only injected by the site module; raising
        # SystemExit directly works in every environment.
        raise SystemExit(0)  # Ensure the program exits
class FilingHomepage:
    """
    The homepage (index page) of an SEC filing: its url, the parsed page and its attachments.
    """

    # Marks "not parsed yet". None cannot be used because get_filing_dates()
    # may legitimately return None.
    _NOT_LOADED = object()
    # Per-instance caches (shadowed by instance attributes once filled).
    # Caching on the instance, rather than with lru_cache on the method,
    # does not keep instances alive and does not evict across instances.
    _filers = _NOT_LOADED
    _filing_dates = _NOT_LOADED

    def __init__(self,
                 url: str,
                 soup: 'BeautifulSoup',
                 attachments: 'Attachments'):
        self.attachments = attachments
        self.url = url
        self._soup = soup

    def open(self):
        """Open the homepage url in the web browser."""
        webbrowser.open(self.url)

    @property
    def documents(self):
        return self.attachments.documents

    @property
    def datafiles(self):
        return self.attachments.data_files

    @property
    def primary_html_document(self) -> Optional['Attachment']:
        """Get the primary html document on the filing"""
        return self.attachments.primary_html_document

    @property
    def primary_xml_document(self) -> Optional['Attachment']:
        """Get the primary xml document on the filing"""
        return self.attachments.primary_xml_document

    @property
    def primary_documents(self):
        return self.attachments.primary_documents

    @property
    def text_document(self):
        return self.attachments.text_document

    @property
    def xbrl_document(self):
        """Find and return the xbrl document, or None if there is none."""
        if self.datafiles is None:
            return None
        for datafile in reversed(self.datafiles):
            if datafile.description in xbrl_document_types:
                return datafile
        return None

    def get_filers(self) -> List['FilerInfo']:
        """Return the filers listed on the page (parsed once, then cached on the instance)."""
        if self._filers is self._NOT_LOADED:
            self._filers = self._parse_filers()
        return self._filers

    def _parse_filers(self) -> List['FilerInfo']:
        """Build a FilerInfo for every ``filerDiv`` element of the page."""
        filer_infos = []
        for filer_div in self._soup.find_all("div", id="filerDiv"):
            company_info_div = filer_div.find("div", class_="companyInfo")

            # Get the company name
            company_name = ""
            cik = ""
            company_name_span = (company_info_div.find("span", class_="companyName")
                                 if company_info_div is not None else None)
            if company_name_span:
                full_text = company_name_span.text.strip()
                # Split the text into company name and CIK
                parts = full_text.split('CIK: ')
                cik_tokens = parts[1].split() if len(parts) > 1 else []
                cik = cik_tokens[0] if cik_tokens else ""
                # Clean up the company name
                company_name = parts[0].strip().replace("\n", "").replace("(Filer)", "").strip()

            # Get the identification information
            identification = ""
            ident_info_div = (company_info_div.find("p", class_="identInfo")
                              if company_info_div is not None else None)
            if ident_info_div is not None:
                # Replace <br> with newlines (this edits the parsed page in place)
                for br in ident_info_div.find_all("br"):
                    br.replace_with("\n")
                identification = ident_info_div.text

            # Get the mailing information
            mailer_divs = filer_div.find_all("div", class_="mailer")
            # For each mailer_div.text remove multiple spaces after a newline
            addresses = [re.sub(r'\n\s+', '\n', mailer_div.text.strip())
                         for mailer_div in mailer_divs]

            # Create the filer info
            filer_infos.append(FilerInfo(company_name=company_name, cik=cik,
                                         identification=identification, addresses=addresses))
        return filer_infos

    @property
    def period_of_report(self) -> Optional[str]:
        "Get the period of report"
        dates = self.get_filing_dates()
        # get_filing_dates() returns None when the page has no date section
        return dates[2] if dates else None

    def get_filing_dates(self) -> Optional[Tuple[str, str, Optional[str]]]:
        """
        Return (filing date, accepted date, period of report) as shown on the page.

        The period is None when the page does not show one; the whole result
        is None when the dates cannot be found. Parsed once, then cached on
        the instance.
        """
        if self._filing_dates is self._NOT_LOADED:
            self._filing_dates = self._parse_filing_dates()
        return self._filing_dates

    def _parse_filing_dates(self) -> Optional[Tuple[str, str, Optional[str]]]:
        """Read the dates from the ``formGrouping`` elements of the page."""
        # Find the form grouping divs
        grouping_divs = self._soup.find_all("div", class_="formGrouping")
        if len(grouping_divs) == 0:
            return None
        info_divs = grouping_divs[0].find_all("div", class_="info")
        if len(info_divs) < 2:
            # Both the filing date and the accepted date are required
            return None
        filing_date = info_divs[0].text.strip()
        accepted_date = info_divs[1].text.strip()
        if len(grouping_divs) > 1:
            first_info_div = grouping_divs[1].find("div", class_="info")
            if first_info_div:
                return filing_date, accepted_date, first_info_div.text.strip()
        return filing_date, accepted_date, None

    @classmethod
    def load(cls, url: str):
        """Fetch and parse the homepage at *url*."""
        response = get_with_retry(url)
        soup = BeautifulSoup(response.text, 'html.parser')
        attachments = Attachments.load(soup)
        return cls(url, soup, attachments)

    def __repr__(self):
        return repr_rich(self.__rich__())

    def __rich__(self):
        return Panel(
            Group(
                self.attachments,
                Group(
                    *[filer_info.__rich__() for filer_info in self.get_filers()]
                )
            ))