Initial commit
This commit is contained in:
950
venv/lib/python3.10/site-packages/edgar/attachments.py
Normal file
950
venv/lib/python3.10/site-packages/edgar/attachments.py
Normal file
@@ -0,0 +1,950 @@
|
||||
import http.server
|
||||
import os
|
||||
import re
|
||||
import signal
|
||||
import socketserver
|
||||
import tempfile
|
||||
import time
|
||||
import webbrowser
|
||||
import zipfile
|
||||
from functools import lru_cache
|
||||
from pathlib import Path
|
||||
from threading import Thread
|
||||
from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from edgar.company_reports import Report
|
||||
from edgar.sgml.sgml_common import FilingSGML, SGMLDocument
|
||||
|
||||
import textwrap
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
from pydantic import BaseModel
|
||||
from rich import box
|
||||
from rich.columns import Columns
|
||||
from rich.console import Group
|
||||
from rich.panel import Panel
|
||||
from rich.table import Column, Table
|
||||
from rich.text import Text
|
||||
|
||||
from edgar.core import binary_extensions, has_html_content, sec_dot_gov, text_extensions
|
||||
from edgar.files.html_documents import get_clean_html
|
||||
from edgar.files.markdown import to_markdown
|
||||
from edgar.httpclient import async_http_client
|
||||
from edgar.httprequests import download_file, download_file_async, get_with_retry
|
||||
from edgar.richtools import print_rich, print_xml, repr_rich, rich_to_text
|
||||
|
||||
# Data-file descriptions that mark an attachment as the XBRL instance document
xbrl_document_types = [
    'XBRL INSTANCE DOCUMENT',
    'XBRL INSTANCE FILE',
    'EXTRACTED XBRL INSTANCE DOCUMENT',
]
|
||||
|
||||
# Public names exported by this module
__all__ = ['Attachment', 'Attachments', 'FilingHomepage', 'FilerInfo', 'AttachmentServer', 'sec_document_url', 'get_document_type']
|
||||
|
||||
|
||||
def sec_document_url(attachment_url: str) -> str:
    """
    Build the full url of an attachment from its path.

    A leading "ix?doc=/" or "ix.xhtml?doc=/" segment is stripped from the path
    before it is prefixed with ``sec_dot_gov``.
    """
    viewer_segment = r"ix(\.xhtml)?\?doc=/"
    document_path = re.sub(viewer_segment, "", attachment_url)
    return f"{sec_dot_gov}{document_path}"
|
||||
|
||||
def sequence_sort_key(x):
    """
    Sort key ordering attachments by their ``sequence_number``.

    Numeric sequence numbers sort first in numeric order, non-numeric ones
    follow in string order, and blank ones are placed last.
    """
    raw = x.sequence_number
    if not raw.strip():
        # Empty or whitespace-only: infinity sorts after every other group
        return (float('inf'), '')
    try:
        numeric = float(raw)
    except ValueError:
        # Not a number: keep the text, grouped after the numeric entries
        return (1, raw)
    return (0, numeric)
|
||||
|
||||
|
||||
# Icons shown next to attachments, keyed by SEC document type, generic file
# type, or file extension (including the dot).
FILE_TYPE_SYMBOLS: Dict[str, str] = {
    # Main SEC filing document
    "10-K": "📄",

    # Numbered exhibits all share the paperclip
    **dict.fromkeys(
        ("EX-21.1", "EX-23.1", "EX-31.1", "EX-31.2", "EX-32.1", "EX-97.1"),
        "📎",
    ),

    # XBRL-related exhibits
    "EX-101.SCH": "🔰",  # schema
    "EX-101.CAL": "📊",  # calculations
    "EX-101.DEF": "📚",  # definitions
    "EX-101.LAB": "📎",  # labels
    "EX-101.PRE": "📈",  # presentation

    # Generic file types and extensions
    "XML": "🔷",
    "HTML": "🌍",
    "GRAPHIC": "🎨",
    "EXCEL": "📊",
    "JSON": "📝",
    "ZIP": "📦",
    "CSS": "📃",
    "JS": "📄",
    ".css": "📃",
    ".js": "📄",
    "PDF": "📕",
    ".pdf": "📕",
    "INFORMATION TABLE": "📊",
}
|
||||
|
||||
|
||||
def get_extension(filename: str) -> str:
    """Return everything from the last dot of *filename* onwards, or '' when it has no dot."""
    last_dot = filename.rfind('.')
    if last_dot == -1:
        return ''
    return filename[last_dot:]
|
||||
|
||||
def get_document_type(filename: str, declared_document_type: str) -> str:
    """
    Sometimes the SEC gets the document type wrong. When the declared type is
    one of the generic file types, derive the type from the file extension.

    Args:
        filename: The attachment file name
        declared_document_type: The document type declared in the filing

    Returns:
        "HTML" for .htm/.html files, otherwise the upper-cased extension.
        The declared type is returned unchanged when it is not a generic file
        type or when the filename has no usable extension.
    """
    # "XSLX" is a historical misspelling of "XLSX"; it stays accepted so that
    # existing inputs keep behaving the same.
    generic_types = ("XML", "HTML", "PDF", "HTM", "JS", "CSS", "ZIP", "XLS", "XLSX", "XSLX", "JSON")
    if declared_document_type.upper() not in generic_types:
        return declared_document_type

    _, dot, suffix = filename.rpartition('.')
    if not dot or not suffix:
        # No extension to go on: trust the declared type rather than return ''
        return declared_document_type

    document_type = suffix.upper()
    if document_type in ("HTM", "HTML"):
        return "HTML"
    return document_type
|
||||
|
||||
def get_file_icon(file_type: str, sequence: str = None, filename: str = None) -> str:
    """
    Pick the Unicode symbol displayed for an attachment.

    Args:
        file_type: The type of the file from SEC filing
        sequence: The sequence number of the file in the filing
        filename: The name of the file, used to look up an icon by extension

    Returns:
        "📜" when sequence is "1" (the main filing document); the mapped
        symbol for "EX-101." types; "📋" for any other "EX-" type; otherwise
        the symbol for the filename extension or for the file type itself.
        "📄" is the default when nothing matches. A single-character symbol
        gets one trailing space appended as padding.
    """
    symbol = None
    if sequence == "1":
        symbol = "📜"
    elif file_type.startswith("EX-101."):
        # XBRL exhibits have their own entries in the table
        symbol = FILE_TYPE_SYMBOLS.get(file_type, "📄")
    elif file_type.startswith("EX-"):
        symbol = "📋"
    elif filename:
        # A miss leaves symbol empty so the file type lookup below applies
        symbol = FILE_TYPE_SYMBOLS.get(get_extension(filename))

    if not symbol:
        symbol = FILE_TYPE_SYMBOLS.get(file_type, "📄")

    if len(symbol) == 1:
        symbol = f"{symbol} "
    return symbol
|
||||
|
||||
|
||||
class FilerInfo(BaseModel):
    """Information about one filer: name, CIK, identification text and addresses."""

    company_name: str
    cik: str
    identification: str
    addresses: List[str]

    def __rich__(self):
        # Unpack however many addresses there are; indexing [0] and [1]
        # raised IndexError for filers with fewer than two address blocks.
        return Panel(
            Columns([self.identification, Text(" "), *self.addresses]),
            title=self.company_name
        )

    def __repr__(self):
        return repr_rich(self.__rich__())
|
||||
|
||||
|
||||
class Attachment:
    """
    A class to represent an attachment in an SEC filing

    The content comes from the SGML document when one was supplied; otherwise
    it is downloaded from ``url`` each time ``content`` is read.
    """

    def __init__(self,
                 sequence_number: str,
                 description: str,
                 document: str,
                 ixbrl: bool,
                 path: str,
                 document_type: str,
                 size: Optional[int],
                 sgml_document: Optional['SGMLDocument'] = None,
                 purpose: Optional[str] = None,
                 filing_sgml: Optional['FilingSGML'] = None):
        self.sequence_number = sequence_number
        self.description = description
        self.document = document
        self.ixbrl = ixbrl
        self.path = path
        self.document_type = document_type
        self.size = size
        self.sgml_document: Optional['SGMLDocument'] = sgml_document
        self.sgml = filing_sgml
        self.purpose = purpose
        # Allows tests to override content via property patching
        self._content_override = None

    @property
    def content(self):
        """The attachment content: an override if set, else SGML content, else a download."""
        # If tests have overridden content using the property's setter, honor it
        override = getattr(self, "_content_override", None)
        if override is not None:
            if isinstance(override, property) and override.fget is not None:
                return override.fget(self)
            try:
                return override(self)  # callable override
            except TypeError:
                return override  # direct value

        # Avoid real network calls for synthetic test paths
        if isinstance(self.path, str) and self.path.startswith("/test/"):
            return ""

        if self.sgml_document:
            return self.sgml_document.content
        return download_file(self.url)

    @content.setter
    def content(self, value):
        # Enable tests to patch instance property via unittest.mock.patch.object
        self._content_override = value

    @content.deleter
    def content(self):
        self._content_override = None

    @property
    def url(self):
        """The full url of the attachment, built from ``path``."""
        return sec_document_url(self.path)

    @property
    def extension(self):
        """The actual extension of the filing document
        Usually one of .xml or .html or .pdf or .txt or .paper
        """
        return os.path.splitext(self.document)[1]

    @property
    def display_extension(self) -> str:
        """This is the extension displayed in the html e.g. "es220296680_4-davis.html"
        The actual extension would be "es220296680_4-davis.xml", that displays as html in the browser

        NOTE(review): currently computed exactly like ``extension``.
        """
        return os.path.splitext(self.document)[1]

    def validate_sequence_number(self, v):
        """Return *v* if it is all digits or empty, otherwise raise ValueError."""
        if not v.isdigit() and v != '':
            raise ValueError('sequence_number must be digits or an empty string')
        return v

    def is_text(self) -> bool:
        """Is this a text document"""
        return self.extension in text_extensions

    def is_xml(self):
        """Is this an .xsd, .xml or .xbrl document (case-insensitive)"""
        return self.extension.lower() in [".xsd", ".xml", ".xbrl"]

    def is_html(self):
        """Is this an .htm or .html document (case-insensitive)"""
        return self.extension.lower() in [".htm", ".html"]

    def is_binary(self) -> bool:
        """Is this a binary document"""
        return self.extension in binary_extensions

    @property
    def empty(self):
        """Some older filings have no document url. So effectively this attachment is empty"""
        return self.document is None or self.document.strip() == ''

    def download(self, path: Optional[Union[str, Path]] = None) -> Optional[Union[str, bytes]]:
        """
        Download the file to a specified path.
        If the path is not provided, return the downloaded content as text or bytes.
        If the path is a directory, the file is saved with its original name in that directory.
        If the path is a file, the file is saved with the given path name.

        Returns:
            The content when no path is given, otherwise the path of the saved file as a string.
        """
        # Read the content exactly once: every read of the property may
        # trigger a fresh download.
        content = self.content
        if path is None:
            return content

        # Ensure path is a Path object
        path = Path(path)

        # Determine if the path is a directory or a file
        file_path = path / self.document if path.is_dir() else path

        # Save the file
        if isinstance(content, bytes):
            file_path.write_bytes(content)
        else:
            # utf-8 rather than the platform default, matching Attachments.download
            file_path.write_text(content, encoding='utf-8')

        return str(file_path)

    def view(self):
        """Print the attachment: as a report when available, else as HTML or XML text."""
        # Check if this is a report
        if self.is_report() and self.sgml:
            report = self.sgml.filing_summary.reports.get_by_filename(self.document)
            if report:
                report.view()
                return

        # No report could be shown, so fall back to the content like text() does
        if not self.is_text():
            return
        content = self.content
        if self.is_html() or has_html_content(content):
            from edgar import Document
            document = Document.parse(content)
            print_rich(document)
        elif self.is_xml():
            print_xml(content)

    def is_report(self):
        """Return a match object if the document name looks like "R<digits>.htm", else None."""
        if not self.document:
            return None
        return re.match(r"R\d+\.htm", self.document)

    def text(self):
        """Return the attachment as text, or None when it is not a text document."""
        # Check if this is a report
        if self.is_report() and self.sgml:
            report = self.sgml.filing_summary.reports.get_by_filename(self.document)
            if report:
                return report.text()

        if self.is_text():
            content = self.content
            if self.is_html() or has_html_content(content):
                from edgar import Document
                document = Document.parse(content)
                return rich_to_text(document)
            return content
        return None

    def markdown(self, include_page_breaks: bool = False, start_page_number: int = 0) -> Optional[str]:
        """
        Convert the attachment to markdown format if it's HTML content.

        Args:
            include_page_breaks: If True, include page break delimiters in the markdown
            start_page_number: Starting page number for page break markers (default: 0)

        Returns:
            None if the attachment is not HTML or cannot be converted.
        """
        if not self.is_html():
            return None

        content = self.content
        if not content:
            return None

        # Check if content has HTML structure
        if not has_html_content(content):
            return None

        # Use the same approach as Filing.markdown() but with page break support
        clean_html = get_clean_html(content)
        if clean_html:
            return to_markdown(clean_html, include_page_breaks=include_page_breaks,
                               start_page_number=start_page_number)

        return None

    def __rich__(self):
        icon = get_file_icon(self.document_type, self.sequence_number, self.document)
        text = Text.assemble((f"{self.sequence_number:<3} ", "dim italic"),
                             " ",
                             (self.document, "bold"),
                             " ", (self.purpose or self.description, "grey54"),
                             " ",
                             (icon, ""),
                             " ",
                             (self.document_type,
                              "bold deep_sky_blue1" if self.sequence_number == "1" else "")
                             )
        return Panel(text, box=box.ROUNDED, width=200, expand=False)

    def __repr__(self):
        return repr_rich(self.__rich__())

    def __str__(self):
        return repr_rich(self.__rich__())
|
||||
|
||||
|
||||
class Attachments:
    """
    A class to represent the attachments of an SEC filing

    Holds the document files and the optional data files, plus the subset of
    documents considered primary.
    """

    def __init__(self,
                 document_files: List['Attachment'],
                 data_files: Optional[List['Attachment']],
                 primary_documents: List['Attachment'],
                 sgml: Optional['FilingSGML'] = None):
        self.documents = document_files
        self.data_files = data_files
        self._attachments = document_files + (data_files or [])
        self.primary_documents = primary_documents
        self.sgml = sgml
        # Cursor used only by the legacy __next__ protocol
        self.n = 0

    def __getitem__(self, item: Union[int, str]):
        """
        Get the attachment by sequence number as set in the SEC filing SGML file,
        or by document name when given a non-numeric string.

        Raises:
            KeyError: if no attachment matches
        """
        if isinstance(item, int) or item.isdigit():
            return self.get_by_sequence(item)
        elif isinstance(item, str):
            for doc in self._attachments:
                if doc.document == item:
                    return doc
        raise KeyError(f"Document not found: {item}")

    def get_by_sequence(self, sequence: Union[str, int]):
        """
        Get the attachment by sequence number starting at 1
        The sequence number is the exact sequence number in the filing

        Raises:
            KeyError: if no attachment has that sequence number
        """
        wanted = str(sequence)
        for doc in self._attachments:
            if doc.sequence_number == wanted:
                return doc
        raise KeyError(f"Document not found: {sequence}")

    def get_by_index(self, index: int):
        """
        Get the attachment by its position in the list (0-based, as for a Python list)
        """
        return self._attachments[index]

    def get_report(self, filename: str) -> Optional['Report']:
        """
        Get a report by filename. Returns None when there is no SGML or no reports.
        """
        if self.sgml:
            reports = self.sgml.filing_summary.reports
            if reports:
                return reports.get_by_filename(filename)
        return None

    @property
    def primary_html_document(self) -> Optional['Attachment']:
        """Get the primary html document on the filing"""
        for doc in self.primary_documents:
            if doc.display_extension == ".html" or doc.display_extension == '.htm':
                return doc
        # Most filings have html primary documents. Some don't.
        # E.g. Form's 3,4,5 do when loaded directly from edgar but not when loaded from local files
        # However, there are unusual filings with endings like ".fil" that require a return.
        # So return the first one
        if len(self.primary_documents) > 0:
            return self.primary_documents[0]
        return None

    @property
    def primary_xml_document(self) -> Optional['Attachment']:
        """Get the primary xml document on the filing"""
        for doc in self.primary_documents:
            if doc.display_extension == ".xml":
                return doc
        return None

    @property
    def text_document(self):
        """The last document described as "Complete submission text file", or None."""
        for doc in reversed(self.documents):
            if doc.description == "Complete submission text file":
                return doc
        return None

    @property
    def exhibits(self):
        """
        Get all the exhibits in the filing.
        This is the primary document plus all the documents listed as EX-XX
        """
        primary = self.primary_html_document
        # Leave the primary out entirely when there is none, rather than
        # putting None into the attachment lists.
        primary_documents = [primary] if primary is not None else []
        exhibits_documents = self.query("re.match('EX-', document_type)", False).documents
        return Attachments(
            document_files=primary_documents + exhibits_documents,
            data_files=[],
            primary_documents=primary_documents,
            sgml=self.sgml)

    @property
    def graphics(self):
        """All attachments whose document type is GRAPHIC."""
        return self.query("document_type=='GRAPHIC'")

    def query(self, query_str: str, include_data_files: bool = True):
        """
        Query attachments based on a simple query string.
        Supports conditions on 'document', 'description', and 'document_type'.
        Example query: "document.endswith('.htm') and 'RELEASE' in description and document_type in ['EX-99.1', 'EX-99', 'EX-99.01']"

        Args:
            query_str: A Python expression evaluated once per attachment
            include_data_files: If False the result has an empty data file list

        Returns:
            A new Attachments with the matching documents (and data files)
        """
        allowed_attrs = {'document', 'description', 'document_type'}

        # Precompile regex for finding attributes and match patterns
        attr_regex = re.compile(rf"\b({'|'.join(allowed_attrs)})\b")
        match_regex = re.compile(r"re\.match\('(.*)', (\w+)\)")

        def safe_eval(attachment, query):
            # Replace attribute references with attachment attributes
            query = attr_regex.sub(lambda m: f"attachment.{m.group(0)}", query)

            # Handle regex match explicitly
            match = match_regex.search(query)
            if match:
                pattern, attr = match.groups()
                query = query.replace(f"re.match('{pattern}', {attr})",
                                      f"re.match(r'{pattern}', attachment.{attr})")

            # SECURITY NOTE: despite the helper's name this is a plain eval()
            # with builtins available. Only pass trusted query strings.
            return eval(query, {"re": re, "attachment": attachment})

        # Evaluate the query for documents and data files
        new_documents = [attachment for attachment in self.documents if safe_eval(attachment, query_str)]
        if include_data_files:
            new_data_files = [attachment for attachment in self.data_files if
                              safe_eval(attachment, query_str)] if self.data_files else None
        else:
            new_data_files = []

        return Attachments(document_files=new_documents, data_files=new_data_files,
                           primary_documents=self.primary_documents, sgml=self.sgml)

    @staticmethod
    async def _download_all_attachments(attachments: List['Attachment']):
        """Download all the given attachments concurrently, in input order."""
        import asyncio

        async with async_http_client() as client:
            return await asyncio.gather(
                *[download_file_async(client, attachment.url, as_text=attachment.is_text())
                  for attachment in attachments])

    def download(self, path: Union[str, Path], archive: bool = False):
        """
        Download all the attachments to a specified path.
        If archive is False the path must be an existing directory; each file
        is saved with its original name in that directory.
        If archive is True, the attachments are saved in a zip file at path.
        When the filing SGML is available the download is delegated to it.

        path: str or Path - The path to save the attachments
        archive: bool (default False) - If True, save the attachments in a zip file

        Raises:
            ValueError: if path is a directory when archiving, or is not a
                directory when not archiving
        """
        if self.sgml:
            self.sgml.download(path, archive)
            return

        # Ensure path is a Path object and reject a bad target before
        # spending time on the downloads
        path = Path(path)
        if archive and path.is_dir():
            raise ValueError("Path must be a zip file name to create zipfile")
        if not archive and not path.is_dir():
            raise ValueError("Path must be a directory")

        import asyncio
        try:
            loop = asyncio.get_event_loop()
        except RuntimeError:
            # No current loop for this thread (e.g. a worker thread)
            loop = asyncio.new_event_loop()
            asyncio.set_event_loop(loop)
        downloaded_files = loop.run_until_complete(Attachments._download_all_attachments(self._attachments))

        if archive:
            with zipfile.ZipFile(path, 'w') as zipf:
                for attachment, downloaded in zip(self._attachments, downloaded_files, strict=False):
                    if isinstance(downloaded, bytes):
                        zipf.writestr(attachment.document, downloaded)
                    else:
                        zipf.writestr(attachment.document, downloaded.encode('utf-8'))
        else:
            for attachment, downloaded in zip(self._attachments, downloaded_files, strict=False):
                file_path = path / attachment.document
                if isinstance(downloaded, bytes):
                    file_path.write_bytes(downloaded)
                else:
                    file_path.write_text(downloaded, encoding='utf-8')

    def serve(self, port: int = 8000) -> Tuple[Thread, socketserver.TCPServer, str]:
        """
        Serve the attachments on a local server and open the primary document in the browser.
        This call blocks until the server is stopped using CTRL-C.
        port: int (default 8000) - The port to serve the attachment

        Returns:
            The server thread, the server and the url that was opened
        """
        with tempfile.TemporaryDirectory() as temp_dir:
            temp_path = Path(temp_dir)
            self.download(temp_path)

            class Handler(http.server.SimpleHTTPRequestHandler):
                def __init__(self, *args, **kwargs):
                    super().__init__(*args, directory=temp_dir, **kwargs)

            primary_html = os.path.basename(self.primary_html_document.path)

            url = f'http://localhost:{port}/{primary_html}'

            httpd = socketserver.TCPServer(("", port), Handler)

            def serve_forever():
                with httpd:
                    httpd.serve_forever()

            thread = Thread(target=serve_forever)
            thread.daemon = True
            thread.start()

            # Wait for the server to start
            time.sleep(1)

            def signal_handler(sig, frame):
                httpd.shutdown()
                thread.join()

            previous_handler = signal.signal(signal.SIGINT, signal_handler)
            try:
                webbrowser.open(url)

                # Keep the main thread alive to handle signals
                while thread.is_alive():
                    time.sleep(0.1)
            finally:
                # Hand CTRL-C back to whoever owned it before serving
                if previous_handler is not None:
                    signal.signal(signal.SIGINT, previous_handler)

            return thread, httpd, url

    def markdown(self, include_page_breaks: bool = False, start_page_number: int = 0) -> Dict[str, str]:
        """
        Convert all HTML attachments to markdown format.

        Args:
            include_page_breaks: If True, include page break delimiters in the markdown
            start_page_number: Starting page number for page break markers (default: 0)

        Returns:
            A dictionary mapping attachment document names to their markdown content.
            Only includes attachments that can be successfully converted to markdown.
        """
        markdown_attachments = {}

        for attachment in self._attachments:
            if attachment.is_html():
                md_content = attachment.markdown(include_page_breaks=include_page_breaks,
                                                 start_page_number=start_page_number)
                if md_content:
                    markdown_attachments[attachment.document] = md_content

        return markdown_attachments

    def __len__(self):
        return len(self._attachments)

    def __iter__(self):
        # Each loop gets its own iterator so nested loops over the same
        # object do not share a cursor. The legacy cursor is still reset for
        # code that calls next() on the object itself.
        self.n = 0
        return iter(self._attachments)

    def __next__(self):
        if self.n < len(self):
            _attachment = self._attachments[self.n]
            assert _attachment is not None

            self.n += 1
            return _attachment
        else:
            raise StopIteration

    def __rich__(self):
        # Document files
        document_table = Table(Column('Seq', header_style="dim"),
                               Column('Document', header_style="dim"),
                               Column('Description', header_style="dim", min_width=60),
                               Column('Type', header_style="dim", min_width=16),
                               title='Attachments',
                               row_styles=["", "bold"],
                               box=box.SIMPLE_HEAD)
        all_attachments = sorted(self.documents + (self.data_files or []), key=sequence_sort_key)
        primary_style = "bold deep_sky_blue1"

        for attachment in all_attachments:
            # Get the file icon for each attachment
            icon = get_file_icon(file_type=attachment.document_type,
                                 sequence=attachment.sequence_number,
                                 filename=attachment.document)
            # The row with sequence number "1" is highlighted
            is_primary = attachment.sequence_number == "1"
            description = "\n".join(textwrap.wrap(attachment.purpose or attachment.description, 100))
            type_cell = Text.assemble((icon, ""), " ",
                                      (attachment.document_type, primary_style if is_primary else ""))
            if is_primary:
                document_table.add_row(Text(attachment.sequence_number, style=primary_style),
                                       Text(attachment.document, style=primary_style),
                                       Text(description, style=primary_style),
                                       type_cell)
            else:
                document_table.add_row(attachment.sequence_number,
                                       attachment.document,
                                       description,
                                       type_cell)

        return document_table

    def __repr__(self):
        return repr_rich(self.__rich__())

    @classmethod
    def load(cls, soup: 'BeautifulSoup'):
        """
        Load the attachments from the SEC filing home page

        The first table with class "tableFile" holds the documents, the second
        (when present) the data files.
        """
        tables = soup.find_all('table', class_='tableFile')

        def parse_table(table, documents: bool):
            min_seq = None
            # The list of attachments which are primary. This is the first document in the filing
            # Plus additional document with the same sequence number
            primary_documents: List['Attachment'] = []

            attachments = []
            for row in table.find_all('tr')[1:]:  # Skip header row
                cols = row.find_all('td')
                if len(cols) < 5:
                    # Not a complete attachment row
                    continue
                sequence_number = cols[0].text.strip().replace('\xa0', '-')

                description = cols[1].text.strip()
                # The document text is the text of the document link.
                document_text = cols[2].text.strip()
                document = document_text.split(' ')[0].strip()
                iXbrl = 'iXBRL' in document_text
                # Some rows have no link at all; use an empty path for those
                link = cols[2].a
                path = (link.get('href') or '').strip() if link is not None else ''
                document_type = cols[3].text.strip()

                try:
                    size = int(cols[4].text.strip())
                except ValueError:
                    size = None

                attachment = Attachment(
                    sequence_number=sequence_number,
                    description=description,
                    document=document,
                    ixbrl=iXbrl,
                    path=path,
                    document_type=document_type,
                    size=size
                )
                # Add the attachment to the list
                attachments.append(attachment)

                # If this is the first document, set it as the primary document
                if documents:
                    if min_seq is None:
                        min_seq = sequence_number
                    if sequence_number == min_seq:
                        primary_documents.append(attachment)
            return attachments, primary_documents

        if tables:
            document_files, primary_documents = parse_table(tables[0], documents=True)
        else:
            document_files, primary_documents = [], []

        if len(tables) > 1:
            data_files, _ = parse_table(tables[1], documents=False)
        else:
            data_files = None

        return cls(document_files, data_files, primary_documents)
|
||||
|
||||
|
||||
class AttachmentServer:
    """
    Serves the attachments of a filing from a temporary directory on a local port.

    Constructing the server downloads the attachments and binds the port;
    ``start()`` begins serving and blocks until the server thread ends.
    """

    def __init__(self, attachments: 'Attachments', port: int = 8000):
        self.attachments = attachments
        self.port = port
        self.thread = None
        self.httpd = None
        self.url = None
        # Kept on the instance so stop() can remove the downloaded files
        self._temp_dir = None
        self.setup()

    def setup(self):
        """Download the attachments, create the server and install the SIGINT handler."""
        self._temp_dir = tempfile.TemporaryDirectory()
        serve_dir = self._temp_dir.name
        self.attachments.download(Path(serve_dir))

        class Handler(http.server.SimpleHTTPRequestHandler):
            def __init__(self, *args, **kwargs):
                super().__init__(*args, directory=serve_dir, **kwargs)

        primary_html = os.path.basename(self.attachments.primary_html_document.path)

        self.url = f'http://localhost:{self.port}/{primary_html}'

        self.httpd = socketserver.TCPServer(("", self.port), Handler)

        def serve_forever():
            with self.httpd:
                self.httpd.serve_forever()

        self.thread = Thread(target=serve_forever)
        self.thread.daemon = True

        signal.signal(signal.SIGINT, self.signal_handler)

    def start(self):
        """Start serving, open the primary document in the browser and block until stopped."""
        self.thread.start()
        webbrowser.open(self.url)

        # Keep the main thread alive to handle signals
        while self.thread.is_alive():
            time.sleep(0.1)

    def stop(self):
        """Stop the server and delete the temporary directory."""
        if self.thread.is_alive():
            self.httpd.shutdown()
            self.thread.join()
        else:
            # shutdown() waits for serve_forever() to finish, so calling it on
            # a server that was never started would block forever. Just
            # release the socket.
            self.httpd.server_close()
        if self._temp_dir is not None:
            self._temp_dir.cleanup()
            self._temp_dir = None

    def signal_handler(self, sig, frame):
        self.stop()
        # Ensure the program exits. SystemExit is raised directly because the
        # exit() helper is only present when the site module is loaded.
        raise SystemExit(0)
|
||||
|
||||
|
||||
|
||||
class FilingHomepage:
    """
    The parsed home (index) page of a filing: its attachments, filers and filing dates.
    """

    def __init__(self,
                 url: str,
                 soup: 'BeautifulSoup',
                 attachments: 'Attachments'):
        self.attachments = attachments
        self.url = url
        self._soup = soup
        # Per-instance cache of parsed results. A dict is used so that a
        # cached None (no dates on the page) is told apart from "not parsed".
        self._parsed = {}

    def open(self):
        """Open the filing home page in the browser."""
        webbrowser.open(self.url)

    @property
    def documents(self):
        return self.attachments.documents

    @property
    def datafiles(self):
        return self.attachments.data_files

    @property
    def primary_html_document(self) -> Optional['Attachment']:
        """Get the primary html document on the filing"""
        return self.attachments.primary_html_document

    @property
    def primary_xml_document(self) -> Optional['Attachment']:
        """Get the primary xml document on the filing"""
        return self.attachments.primary_xml_document

    @property
    def primary_documents(self):
        return self.attachments.primary_documents

    @property
    def text_document(self):
        return self.attachments.text_document

    @property
    def xbrl_document(self):
        """Find and return the xbrl document, or None when there is none."""
        if self.datafiles is None:
            return None
        for datafile in reversed(self.datafiles):
            if datafile.description in xbrl_document_types:
                return datafile
        return None

    def get_filers(self):
        """Return the list of FilerInfo parsed from the page (parsed once per instance)."""
        if 'filers' not in self._parsed:
            self._parsed['filers'] = self._parse_filers()
        return self._parsed['filers']

    def _parse_filers(self):
        """Parse every "filerDiv" block of the page into a FilerInfo."""
        filer_infos = []
        for filer_div in self._soup.find_all("div", id="filerDiv"):
            company_name = ""
            cik = ""
            identification = ""

            company_info_div = filer_div.find("div", class_="companyInfo")
            if company_info_div is not None:
                # Get the company name
                company_name_span = company_info_div.find("span", class_="companyName")
                if company_name_span:
                    full_text = company_name_span.text.strip()
                    # Split the text into company name and CIK
                    parts = full_text.split('CIK: ')
                    company_name = parts[0].strip()
                    cik = parts[1].split()[0] if len(parts) > 1 else ""

                    # Clean up the company name
                    company_name = company_name.replace("\n", "").replace("(Filer)", "").strip()

                # Get the identification information
                ident_info_div = company_info_div.find("p", class_="identInfo")
                if ident_info_div is not None:
                    # Replace <br> with newlines
                    for br in ident_info_div.find_all("br"):
                        br.replace_with("\n")
                    identification = ident_info_div.text

            # Get the mailing information, collapsing the indentation after each newline
            mailer_divs = filer_div.find_all("div", class_="mailer")
            addresses = [re.sub(r'\n\s+', '\n', mailer_div.text.strip())
                         for mailer_div in mailer_divs]

            # Create the filer info
            filer_infos.append(FilerInfo(company_name=company_name, cik=cik,
                                         identification=identification, addresses=addresses))

        return filer_infos

    @property
    def period_of_report(self) -> Optional[str]:
        """Get the period of report, or None when the page does not show one."""
        dates = self.get_filing_dates()
        if dates is None:
            return None
        return dates[2]

    def get_filing_dates(self) -> Optional[Tuple[str, str, Optional[str]]]:
        """
        Return (filing date, accepted date, period of report) as shown on the page.

        The period is None when the page has no second form grouping. The
        whole result is None when the dates cannot be found. Parsed once per
        instance.
        """
        if 'filing_dates' not in self._parsed:
            self._parsed['filing_dates'] = self._parse_filing_dates()
        return self._parsed['filing_dates']

    def _parse_filing_dates(self) -> Optional[Tuple[str, str, Optional[str]]]:
        """Read the dates from the "formGrouping" divs of the page."""
        # Find the form grouping divs
        grouping_divs = self._soup.find_all("div", class_="formGrouping")
        if len(grouping_divs) == 0:
            return None
        info_divs = grouping_divs[0].find_all("div", class_="info")
        if len(info_divs) < 2:
            # Filing and accepted date are both expected in the first grouping
            return None
        filing_date = info_divs[0].text.strip()
        accepted_date = info_divs[1].text.strip()

        if len(grouping_divs) > 1:
            first_info_div = grouping_divs[1].find("div", class_="info")
            if first_info_div:
                period = first_info_div.text.strip()
                return filing_date, accepted_date, period
        return filing_date, accepted_date, None

    @classmethod
    def load(cls, url: str):
        """Fetch the page at *url* and parse it into a FilingHomepage."""
        response = get_with_retry(url)
        soup = BeautifulSoup(response.text, 'html.parser')
        attachments = Attachments.load(soup)
        return cls(url, soup, attachments)

    def __repr__(self):
        return repr_rich(self.__rich__())

    def __rich__(self):
        return Panel(
            Group(
                self.attachments,
                Group(
                    *[filer_info.__rich__() for filer_info in self.get_filers()]
                )
            ))
|
||||
Reference in New Issue
Block a user