"""Attachments of an SEC filing: parsing, querying, rendering, downloading and serving."""
import http.server
import os
import re
import signal
import socketserver
import tempfile
import textwrap
import time
import webbrowser
import zipfile
from functools import lru_cache
from pathlib import Path
from threading import Thread
from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union

if TYPE_CHECKING:
    from edgar.company_reports import Report
    from edgar.sgml.sgml_common import FilingSGML, SGMLDocument

from bs4 import BeautifulSoup
from pydantic import BaseModel
from rich import box
from rich.columns import Columns
from rich.console import Group
from rich.panel import Panel
from rich.table import Column, Table
from rich.text import Text

from edgar.core import binary_extensions, has_html_content, sec_dot_gov, text_extensions
from edgar.files.html_documents import get_clean_html
from edgar.files.markdown import to_markdown
from edgar.httpclient import async_http_client
from edgar.httprequests import download_file, download_file_async, get_with_retry
from edgar.richtools import print_rich, print_xml, repr_rich, rich_to_text

# Descriptions that identify the XBRL instance document among a filing's data files
xbrl_document_types = ['XBRL INSTANCE DOCUMENT', 'XBRL INSTANCE FILE', 'EXTRACTED XBRL INSTANCE DOCUMENT']

__all__ = ['Attachment', 'Attachments', 'FilingHomepage', 'FilerInfo', 'AttachmentServer', 'sec_document_url', 'get_document_type']

# Sentinel for "not computed yet" where None is itself a valid cached value
_UNSET = object()


def sec_document_url(attachment_url: str) -> str:
    """Return the absolute sec.gov url for an attachment path.

    The inline-XBRL viewer prefix ("ix?doc=/" or "ix.xhtml?doc=/") is removed so
    the url points at the underlying document.
    """
    # Remove "ix?doc=/" or "ix.xhtml?doc=/" from the filing url
    attachment_url = re.sub(r"ix(\.xhtml)?\?doc=/", "", attachment_url)
    return f"{sec_dot_gov}{attachment_url}"


def sequence_sort_key(x):
    """Sort key ordering attachments by sequence number.

    Numeric sequences sort first (numerically), non-numeric sequences next
    (as strings), and empty or missing sequences last.
    """
    seq = x.sequence_number or ''
    if seq.strip() == '':
        # Handle empty or whitespace-only strings
        return (float('inf'), '')  # Sort to end using infinity
    try:
        return (0, float(seq))  # Convert to number for numeric sorting
    except ValueError:
        return (1, seq)


# Mapping of SEC filing file types to Unicode symbols
FILE_TYPE_SYMBOLS: Dict[str, str] = {
    # Main SEC filing documents
    "10-K": "📄",
    "EX-21.1": "📎",
    "EX-23.1": "📎",
    "EX-31.1": "📎",
    "EX-31.2": "📎",
    "EX-32.1": "📎",
    "EX-97.1": "📎",
    # XBRL-related documents
    "EX-101.SCH": "🔰",
    "EX-101.CAL": "📊",
    "EX-101.DEF": "📚",
    "EX-101.LAB": "📎",
    "EX-101.PRE": "📈",
    # Common file types
    "XML": "🔷",
    "HTML": "🌍",
    "GRAPHIC": "🎨",
    "EXCEL": "📊",
    "JSON": "📝",
    "ZIP": "📦",
    "CSS": "📃",
    "JS": "📄",
    ".css": "📃",
    ".js": "📄",
    "PDF": "📕",
    ".pdf": "📕",
    "INFORMATION TABLE": "📊",
}


def get_extension(filename: str) -> str:
    """Extract the file extension including the dot, or '' when there is none."""
    if '.' in filename:
        return filename[filename.rindex('.'):]
    return ''


def get_document_type(filename: str, declared_document_type: str) -> str:
    """
    Sometimes the SEC gets the document type wrong.
    This function uses the extension to determine the document type.

    When the declared type is a generic file type (XML, HTML, PDF, ...) the type
    is derived from the filename extension instead; "HTM" and "HTML" both map to
    "HTML". If the filename has no extension the declared type is returned
    unchanged rather than an empty string.
    """
    # "XSLX" is kept for backward compatibility; "XLSX" is the real extension
    generic_types = {"XML", "HTML", "PDF", "HTM", "JS", "CSS", "ZIP", "XLS", "XSLX", "XLSX", "JSON"}
    if declared_document_type.upper() in generic_types:
        document_type = get_extension(filename)[1:].upper()
        if not document_type:
            return declared_document_type
        if document_type in ("HTM", "HTML"):
            return "HTML"
        return document_type
    return declared_document_type


def get_file_icon(file_type: str, sequence: Optional[str] = None, filename: Optional[str] = None) -> str:
    """
    Get the Unicode symbol for a given file type and sequence number.

    Args:
        file_type: The type of the file from SEC filing
        sequence: The sequence number of the file in the filing
        filename: The name of the file to extract the extension

    Returns:
        Unicode symbol corresponding to the file type.
        If sequence is "1", returns "📜" (scroll) to indicate main filing document.
        Returns "📄" (document) as default if type not found.
    """
    icon = None
    if sequence == "1":
        icon = "📜"  # Scroll emoji for main document
    elif file_type.startswith("EX-101."):
        # XBRL exhibit (EX-101.*)
        icon = FILE_TYPE_SYMBOLS.get(file_type, "📄")
    elif file_type.startswith("EX-"):
        # Regular exhibit
        icon = "📋"
    elif filename:
        # Extension keys in the table are lower-case, so normalise before lookup
        icon = FILE_TYPE_SYMBOLS.get(get_extension(filename).lower())
    if not icon:
        icon = FILE_TYPE_SYMBOLS.get(file_type, "📄")
    # Pad single-character icons with a trailing space
    icon = f"{icon} " if len(icon) == 1 else icon
    return icon


class FilerInfo(BaseModel):
    """Name, CIK, identification text and addresses of one filer on a filing home page."""
    company_name: str
    cik: str
    identification: str
    addresses: List[str]

    def __rich__(self):
        # Show at most the first two addresses; a filer may have fewer
        return Panel(
            Columns([self.identification, Text(" "), *self.addresses[:2]]),
            title=self.company_name
        )

    def __repr__(self):
        return repr_rich(self.__rich__())


class Attachment:
    """
    A class to represent an attachment in an SEC filing
    """

    def __init__(self,
                 sequence_number: str,
                 description: str,
                 document: str,
                 ixbrl: bool,
                 path: str,
                 document_type: str,
                 size: Optional[int],
                 sgml_document: Optional['SGMLDocument'] = None,
                 purpose: Optional[str] = None,
                 filing_sgml: Optional['FilingSGML'] = None):
        self.sequence_number = sequence_number
        self.description = description
        self.document = document
        self.ixbrl = ixbrl
        self.path = path
        self.document_type = document_type
        self.size = size
        self.sgml_document: Optional['SGMLDocument'] = sgml_document
        self.sgml = filing_sgml
        self.purpose = purpose
        # Allows tests to override content via property patching
        self._content_override = None

    @property
    def content(self):
        """The attachment content: from the SGML document when present, else downloaded from `url`."""
        # If tests have overridden content using the property's setter, honor it
        override = getattr(self, "_content_override", None)
        if override is not None:
            if isinstance(override, property) and override.fget is not None:
                return override.fget(self)
            try:
                return override(self)  # callable override
            except TypeError:
                return override  # direct value

        # Avoid real network calls for synthetic test paths
        if isinstance(self.path, str) and self.path.startswith("/test/"):
            return ""

        if self.sgml_document:
            return self.sgml_document.content
        return download_file(self.url)

    @content.setter
    def content(self, value):
        # Enable tests to patch instance property via unittest.mock.patch.object
        self._content_override = value

    @content.deleter
    def content(self):
        self._content_override = None

    @property
    def url(self):
        """The absolute sec.gov url of this attachment."""
        return sec_document_url(self.path)

    @property
    def extension(self):
        """The actual extension of the filing document
         Usually one of .xml or .html or .pdf or .txt or .paper
         """
        return os.path.splitext(self.document)[1]

    @property
    def display_extension(self) -> str:
        """This is the extension displayed in the html e.g. "es220296680_4-davis.html"
        The actual extension would be "es220296680_4-davis.xml", that displays as html in the browser
        """
        return os.path.splitext(self.document)[1]

    def validate_sequence_number(self, v):
        """Return `v` if it is all digits or empty; raise ValueError otherwise."""
        if not v.isdigit() and v != '':
            raise ValueError('sequence_number must be digits or an empty string')
        return v

    def is_text(self) -> bool:
        """Is this a text document"""
        return self.extension in text_extensions

    def is_xml(self):
        """Is this an xml document (.xsd, .xml or .xbrl)"""
        return self.extension.lower() in [".xsd", ".xml", ".xbrl"]

    def is_html(self):
        """Is this an html document (.htm or .html)"""
        return self.extension.lower() in [".htm", ".html"]

    def is_binary(self) -> bool:
        """Is this a binary document"""
        return self.extension in binary_extensions

    @property
    def empty(self):
        """Some older filings have no document url. So effectively this attachment is empty"""
        return self.document is None or self.document.strip() == ''

    def download(self, path: Optional[Union[str, Path]] = None) -> Optional[Union[str, bytes]]:
        """
        Download the file to a specified path.
        If the path is not provided, return the downloaded content as text or bytes.
        If the path is a directory, the file is saved with its original name in that directory.
        If the path is a file, the file is saved with the given path name.

        Returns:
            The content when `path` is None, otherwise the path of the written file as a string.
        """
        # Read the content once: every access to `self.content` may trigger a download
        content = self.content
        if path is None:
            return content

        # Ensure path is a Path object
        path = Path(path)

        # A directory receives the file under its original name
        file_path = path / self.document if path.is_dir() else path

        # Save the file; text is written as UTF-8, matching Attachments.download
        if isinstance(content, bytes):
            file_path.write_bytes(content)
        else:
            file_path.write_text(content, encoding='utf-8')

        return str(file_path)

    def _get_report(self):
        """Return the report for this document from the filing summary, or None."""
        if self.is_report() and self.sgml:
            return self.sgml.filing_summary.reports.get_by_filename(self.document)
        return None

    def view(self):
        """Print the attachment to the console: as a report, as html, or as xml."""
        # Check if this is a report
        report = self._get_report()
        if report:
            report.view()
            return

        # Fall back to rendering the raw content, as text() does
        if not self.is_text():
            return
        content = self.content
        if self.is_html() or has_html_content(content):
            from edgar import Document
            document = Document.parse(content)
            print_rich(document)
        elif self.is_xml():
            print_xml(content)

    def is_report(self):
        """Return a truthy match when the document is named like a report (R<digits>.htm), else None."""
        if not self.document:
            return None
        return re.match(r"R\d+\.htm", self.document)

    def text(self):
        """Return the attachment as plain text, or None if it is not a text document."""
        # Check if this is a report
        report = self._get_report()
        if report:
            return report.text()

        if self.is_text():
            content = self.content
            if self.is_html() or has_html_content(content):
                from edgar import Document
                document = Document.parse(content)
                return rich_to_text(document)
            return content
        return None

    def markdown(self, include_page_breaks: bool = False, start_page_number: int = 0) -> Optional[str]:
        """
        Convert the attachment to markdown format if it's HTML content.

        Args:
            include_page_breaks: If True, include page break delimiters in the markdown
            start_page_number: Starting page number for page break markers (default: 0)

        Returns:
            None if the attachment is not HTML or cannot be converted.
        """
        if not self.is_html():
            return None

        content = self.content
        if not content:
            return None

        # Check if content has HTML structure
        if not has_html_content(content):
            return None

        # Use the same approach as Filing.markdown() but with page break support
        clean_html = get_clean_html(content)
        if clean_html:
            return to_markdown(clean_html, include_page_breaks=include_page_breaks,
                               start_page_number=start_page_number)
        return None

    def __rich__(self):
        icon = get_file_icon(self.document_type, self.sequence_number, self.document)
        text = Text.assemble(
            (f"{self.sequence_number:<3} ", "dim italic"),
            " ",
            (self.document, "bold"),
            " ",
            (self.purpose or self.description, "grey54"),
            " ",
            (icon, ""),
            " ",
            (self.document_type, "bold deep_sky_blue1" if self.sequence_number == "1" else "")
        )
        return Panel(text, box=box.ROUNDED, width=200, expand=False)

    def __repr__(self):
        return repr_rich(self.__rich__())

    def __str__(self):
        return repr_rich(self.__rich__())


class Attachments:
    """
    A class to represent the attachments of an SEC filing
    """

    def __init__(self,
                 document_files: List[Attachment],
                 data_files: Optional[List[Attachment]],
                 primary_documents: List[Attachment],
                 sgml: Optional['FilingSGML'] = None):
        self.documents = document_files
        self.data_files = data_files
        self._attachments = document_files + (data_files or [])
        self.primary_documents = primary_documents
        self.sgml = sgml
        # Cursor used only when the object itself is advanced with next()
        self.n = 0

    def __getitem__(self, item: Union[int, str]):
        """
        Get the attachment by sequence number as set in the SEC filing SGML file.
        A non-numeric string is treated as a document name.

        Raises:
            KeyError: if no attachment matches.
        """
        if isinstance(item, int) or item.isdigit():
            return self.get_by_sequence(item)
        elif isinstance(item, str):
            for doc in self._attachments:
                if doc.document == item:
                    return doc
        raise KeyError(f"Document not found: {item}")

    def get_by_sequence(self, sequence: Union[str, int]):
        """
        Get the attachment by sequence number starting at 1
        The sequence number is the exact sequence number in the filing

        Raises:
            KeyError: if no attachment has that sequence number.
        """
        wanted = str(sequence)
        for doc in self._attachments:
            if doc.sequence_number == wanted:
                return doc
        raise KeyError(f"Document not found: {sequence}")

    def get_by_index(self, index: int):
        """
        Get the attachment by its zero-based position in the list of attachments
        (documents first, then data files)
        """
        return self._attachments[index]

    def get_report(self, filename: str) -> Optional['Report']:
        """
        Get a report by filename, or None when there is no SGML or no reports
        """
        if self.sgml:
            reports = self.sgml.filing_summary.reports
            if reports:
                return reports.get_by_filename(filename)
        return None

    @property
    def primary_html_document(self) -> Optional[Attachment]:
        """Get the primary html document on the filing"""
        for doc in self.primary_documents:
            if doc.display_extension == ".html" or doc.display_extension == '.htm':
                return doc
        # Most filings have html primary documents. Some don't.
        # E.g. Form's 3,4,5 do when loaded directly from edgar but not when loaded from local files
        # However, there are unusual filings with endings like ".fil" that require a return.
        # So return the first one
        if len(self.primary_documents) > 0:
            return self.primary_documents[0]
        return None

    @property
    def primary_xml_document(self) -> Optional[Attachment]:
        """Get the primary xml document on the filing"""
        for doc in self.primary_documents:
            if doc.display_extension == ".xml":
                return doc
        return None

    @property
    def text_document(self):
        """Get the complete submission text file, searching from the end of the documents"""
        for doc in reversed(self.documents):
            if doc.description == "Complete submission text file":
                return doc
        return None

    @property
    def exhibits(self):
        """
        Get all the exhibits in the filing.
        This is the primary document plus all the documents listed as EX-XX
        """
        primary = self.primary_html_document
        # A filing without a primary document must not put None in the list
        primary_documents = [primary] if primary is not None else []
        exhibits_documents = self.query("re.match('EX-', document_type)", False).documents
        return Attachments(
            document_files=primary_documents + exhibits_documents,
            data_files=[],
            primary_documents=primary_documents,
            sgml=self.sgml)

    @property
    def graphics(self):
        """Get the attachments whose document type is GRAPHIC"""
        return self.query("document_type=='GRAPHIC'")

    def query(self, query_str: str, include_data_files: bool = True):
        """
        Query attachments based on a simple query string.
        Supports conditions on 'document', 'description', and 'document_type'.
        Example query: "document.endswith('.htm') and 'RELEASE' in description and document_type in ['EX-99.1', 'EX-99', 'EX-99.01']"

        Args:
            query_str: A Python expression over the attributes listed above
            include_data_files: If False the result has an empty data file list

        Returns:
            A new Attachments holding only the matching attachments.
        """
        allowed_attrs = {'document', 'description', 'document_type'}

        # Precompile regex for finding attributes and match patterns
        attr_regex = re.compile(rf"\b({'|'.join(allowed_attrs)})\b")
        match_regex = re.compile(r"re\.match\('(.*)', (\w+)\)")

        def safe_eval(attachment, query):
            # Replace attribute references with attachment attributes
            query = attr_regex.sub(lambda m: f"attachment.{m.group(0)}", query)

            # Handle regex match explicitly
            match = match_regex.search(query)
            if match:
                pattern, attr = match.groups()
                query = query.replace(f"re.match('{pattern}', {attr})", f"re.match(r'{pattern}', attachment.{attr})")

            # SECURITY: eval() executes arbitrary Python. Despite the helper's name the
            # expression is not sandboxed - never pass untrusted input as query_str.
            return eval(query, {"re": re, "attachment": attachment})

        # Evaluate the query for documents and data files
        new_documents = [attachment for attachment in self.documents if safe_eval(attachment, query_str)]
        if include_data_files:
            new_data_files = [attachment for attachment in self.data_files if
                              safe_eval(attachment, query_str)] if self.data_files else None
        else:
            new_data_files = []

        return Attachments(document_files=new_documents,
                           data_files=new_data_files,
                           primary_documents=self.primary_documents,
                           sgml=self.sgml)

    @staticmethod
    async def _download_all_attachments(attachments: List[Attachment]):
        """Download all the given attachments concurrently, in the order given."""
        import asyncio

        async with async_http_client() as client:
            return await asyncio.gather(
                *[download_file_async(client, attachment.url, as_text=attachment.is_text())
                  for attachment in attachments])

    def download(self, path: Union[str, Path], archive: bool = False):
        """
        Download all the attachments to a specified path.
        If archive is False the path must be an existing directory and each file
        is saved there with its original name.
        If archive is True, the attachments are saved in a zip file and the path
        must be the name of that zip file.

        path: str or Path - The path to save the attachments
        archive: bool (default False) - If True, save the attachments in a zip file

        Raises:
            ValueError: if the path does not fit the chosen mode.
        """
        if self.sgml:
            self.sgml.download(path, archive)
            return

        import asyncio

        # Newer Python versions raise RuntimeError instead of implicitly creating a
        # loop when none is set, so create one explicitly in that case.
        try:
            loop = asyncio.get_event_loop()
        except RuntimeError:
            loop = asyncio.new_event_loop()
            asyncio.set_event_loop(loop)
        downloaded_files = loop.run_until_complete(Attachments._download_all_attachments(self._attachments))

        # Ensure path is a Path object
        path = Path(path)

        if archive:
            if path.is_dir():
                raise ValueError("Path must be a zip file name to create zipfile")
            with zipfile.ZipFile(path, 'w') as zipf:
                for attachment, downloaded in zip(self._attachments, downloaded_files, strict=False):
                    if isinstance(downloaded, bytes):
                        zipf.writestr(attachment.document, downloaded)
                    else:
                        zipf.writestr(attachment.document, downloaded.encode('utf-8'))
        else:
            if not path.is_dir():
                raise ValueError("Path must be a directory")
            for attachment, downloaded in zip(self._attachments, downloaded_files, strict=False):
                file_path = path / attachment.document
                if isinstance(downloaded, bytes):
                    file_path.write_bytes(downloaded)
                else:
                    file_path.write_text(downloaded, encoding='utf-8')

    def serve(self, port: int = 8000) -> Tuple[Thread, socketserver.TCPServer, str]:
        """
        Serve the attachment on a local server
        The server can be stopped using CTRL-C
        port: int (default 8000) - The port to serve the attachment

        This call blocks until the server thread ends, then returns the
        thread, the server and the url that was opened.
        """
        # The temporary directory lives for as long as this call is blocked
        with tempfile.TemporaryDirectory() as temp_dir:
            temp_path = Path(temp_dir)
            self.download(temp_path)

            class Handler(http.server.SimpleHTTPRequestHandler):

                def __init__(self, *args, **kwargs):
                    super().__init__(*args, directory=temp_dir, **kwargs)

            primary_html = os.path.basename(self.primary_html_document.path)

            url = f'http://localhost:{port}/{primary_html}'

            httpd = socketserver.TCPServer(("", port), Handler)

            def serve_forever():
                with httpd:
                    httpd.serve_forever()

            thread = Thread(target=serve_forever)
            thread.daemon = True
            thread.start()

            # Wait for the server to start
            time.sleep(1)

            def signal_handler(sig, frame):
                httpd.shutdown()
                thread.join()

            signal.signal(signal.SIGINT, signal_handler)

            webbrowser.open(url)

            # Keep the main thread alive to handle signals
            while thread.is_alive():
                time.sleep(0.1)

            return thread, httpd, url

    def markdown(self, include_page_breaks: bool = False, start_page_number: int = 0) -> Dict[str, str]:
        """
        Convert all HTML attachments to markdown format.

        Args:
            include_page_breaks: If True, include page break delimiters in the markdown
            start_page_number: Starting page number for page break markers (default: 0)

        Returns:
            A dictionary mapping attachment document names to their markdown content.
            Only includes attachments that can be successfully converted to markdown.
        """
        markdown_attachments = {}
        for attachment in self._attachments:
            if attachment.is_html():
                md_content = attachment.markdown(include_page_breaks=include_page_breaks,
                                                 start_page_number=start_page_number)
                if md_content:
                    markdown_attachments[attachment.document] = md_content
        return markdown_attachments

    def __len__(self):
        return len(self._attachments)

    def __iter__(self):
        # Hand out an independent iterator so nested or repeated loops do not share
        # a cursor; the cursor used by __next__ is still reset as before.
        self.n = 0
        return iter(self._attachments)

    def __next__(self):
        # Kept so that calling next() directly on the object continues to work
        if self.n < len(self):
            _attachment = self._attachments[self.n]
            assert _attachment is not None
            self.n += 1
            return _attachment
        else:
            raise StopIteration

    def __rich__(self):
        # Document files
        document_table = Table(Column('Seq', header_style="dim"),
                               Column('Document', header_style="dim"),
                               Column('Description', header_style="dim", min_width=60),
                               Column('Type', header_style="dim", min_width=16),
                               title='Attachments',
                               row_styles=["", "bold"],
                               box=box.SIMPLE_HEAD)
        all_attachments = sorted(self.documents + (self.data_files or []), key=sequence_sort_key)
        primary_style = "bold deep_sky_blue1"
        for attachment in all_attachments:
            # The main document (sequence "1") is highlighted across the whole row
            is_primary = attachment.sequence_number == "1"
            # Get the file icon for each attachment
            icon = get_file_icon(file_type=attachment.document_type,
                                 sequence=attachment.sequence_number,
                                 filename=attachment.document)
            sequence_number = attachment.sequence_number
            description = "\n".join(textwrap.wrap(attachment.purpose or attachment.description, 100))
            document_table.add_row(
                Text(sequence_number, style=primary_style) if is_primary else sequence_number,
                Text(attachment.document, style=primary_style) if is_primary else attachment.document,
                Text(description, style=primary_style) if is_primary else description,
                Text.assemble((icon, ""), " ", (attachment.document_type, primary_style if is_primary else "")),
            )
        return document_table

    def __repr__(self):
        return repr_rich(self.__rich__())

    @classmethod
    def load(cls, soup: BeautifulSoup):
        """
        Load the attachments from the SEC filing home page.
        The first 'tableFile' table holds the documents, the second (if any) the data files.
        """
        tables = soup.find_all('table', class_='tableFile')

        def parse_table(table, documents: bool):
            min_seq = None
            # The list of attachments which are primary. This is the first document in the filing
            # Plus additional document with the same sequence number
            primary_documents: List[Attachment] = []

            rows = table.find_all('tr')[1:]  # Skip header row
            attachments = []
            for row in rows:
                cols = row.find_all('td')
                sequence_number = cols[0].text.strip().replace('\xa0', '-')
                description = cols[1].text.strip()
                # The document text is the text of the document link.
                document_text = cols[2].text.strip()
                document = document_text.split(' ')[0].strip()
                iXbrl = 'iXBRL' in document_text
                path = cols[2].a['href'].strip()
                document_type = cols[3].text.strip()
                size = cols[4].text.strip()
                try:
                    size = int(size)
                except ValueError:
                    size = None

                attachment = Attachment(
                    sequence_number=sequence_number,
                    description=description,
                    document=document,
                    ixbrl=iXbrl,
                    path=path,
                    document_type=document_type,
                    size=size
                )
                # Add the attachment to the list
                attachments.append(attachment)

                # The first document, and any sharing its sequence number, is primary
                if documents:
                    if min_seq is None:
                        min_seq = sequence_number
                    if sequence_number == min_seq:
                        primary_documents.append(attachment)

            return attachments, primary_documents

        if tables:
            document_files, primary_documents = parse_table(tables[0], documents=True)
        else:
            document_files, primary_documents = [], []
        if len(tables) > 1:
            data_files, _ = parse_table(tables[1], documents=False)
        else:
            data_files = None

        return cls(document_files, data_files, primary_documents)


class AttachmentServer:
    """Serve a filing's attachments from a temporary directory on a local HTTP server."""

    def __init__(self, attachments: Attachments, port: int = 8000):
        self.attachments = attachments
        self.port = port
        self.thread = None
        self.httpd = None
        self.url = None
        self._temp_dir = None
        self.setup()

    def setup(self):
        """Download the attachments, create the server and register the SIGINT handler."""
        # Keep the TemporaryDirectory on the instance so it outlives this method
        # and can be removed in stop()
        self._temp_dir = tempfile.TemporaryDirectory()
        directory = self._temp_dir.name
        self.attachments.download(Path(directory))

        class Handler(http.server.SimpleHTTPRequestHandler):
            def __init__(self, *args, **kwargs):
                super().__init__(*args, directory=directory, **kwargs)

        primary_html = os.path.basename(self.attachments.primary_html_document.path)
        self.url = f'http://localhost:{self.port}/{primary_html}'

        self.httpd = socketserver.TCPServer(("", self.port), Handler)

        def serve_forever():
            with self.httpd:
                self.httpd.serve_forever()

        self.thread = Thread(target=serve_forever)
        self.thread.daemon = True

        signal.signal(signal.SIGINT, self.signal_handler)

    def start(self):
        """Start serving, open the browser and block until the server thread ends."""
        self.thread.start()
        webbrowser.open(self.url)

        # Keep the main thread alive to handle signals
        while self.thread.is_alive():
            time.sleep(0.1)

    def stop(self):
        """Stop the server (if running) and remove the temporary directory."""
        if self.thread is not None and self.thread.is_alive():
            self.httpd.shutdown()
            self.thread.join()
        elif self.httpd is not None:
            # shutdown() waits for serve_forever() to finish and would block forever
            # if the server was never started, so just release the socket
            self.httpd.server_close()
        if self._temp_dir is not None:
            self._temp_dir.cleanup()

    def signal_handler(self, sig, frame):
        self.stop()
        # Ensure the program exits
        raise SystemExit(0)


class FilingHomepage:
    """The home (index) page of a filing: its url, parsed html and attachments."""

    def __init__(self,
                 url: str,
                 soup: BeautifulSoup,
                 attachments: Attachments):
        self.attachments = attachments
        self.url = url
        self._soup = soup
        # Per-instance caches; a method-level lru_cache would keep instances alive
        self._filers_cache = None
        self._filing_dates_cache = _UNSET

    def open(self):
        """Open the filing home page in the browser."""
        webbrowser.open(self.url)

    @property
    def documents(self):
        return self.attachments.documents

    @property
    def datafiles(self):
        return self.attachments.data_files

    @property
    def primary_html_document(self) -> Optional[Attachment]:
        """Get the primary html document on the filing"""
        return self.attachments.primary_html_document

    @property
    def primary_xml_document(self) -> Optional[Attachment]:
        """Get the primary xml document on the filing"""
        return self.attachments.primary_xml_document

    @property
    def primary_documents(self):
        return self.attachments.primary_documents

    @property
    def text_document(self):
        return self.attachments.text_document

    @property
    def xbrl_document(self):
        """Find and return the xbrl document, or None if there is none."""
        if self.datafiles is None:
            return None
        for datafile in reversed(self.datafiles):
            if datafile.description in xbrl_document_types:
                return datafile
        return None

    def get_filers(self):
        """Return the list of FilerInfo parsed from the page; parsed once per instance."""
        cached = getattr(self, "_filers_cache", None)
        if cached is None:
            cached = self._parse_filers()
            self._filers_cache = cached
        return cached

    def _parse_filers(self) -> List[FilerInfo]:
        """Parse every 'filerDiv' on the page into a FilerInfo."""
        filer_infos = []
        for filer_div in self._soup.find_all("div", id="filerDiv"):
            company_name = ""
            cik = ""
            identification = ""

            company_info_div = filer_div.find("div", class_="companyInfo")
            if company_info_div is not None:
                # Get the company name
                company_name_span = company_info_div.find("span", class_="companyName")
                if company_name_span:
                    full_text = company_name_span.text.strip()
                    # Split the text into company name and CIK
                    parts = full_text.split('CIK: ')
                    company_name = parts[0].strip()
                    cik_tokens = parts[1].split() if len(parts) > 1 else []
                    cik = cik_tokens[0] if cik_tokens else ""
                    # Clean up the company name
                    company_name = company_name.replace("\n", "").replace("(Filer)", "").strip()

                # Get the identification information
                ident_info_div = company_info_div.find("p", class_="identInfo")
                if ident_info_div is not None:
                    # Replace <br> with newlines
                    for br in ident_info_div.find_all("br"):
                        br.replace_with("\n")
                    identification = ident_info_div.text

            # Get the mailing information
            mailer_divs = filer_div.find_all("div", class_="mailer")
            # For each mailer_div.text remove multiple spaces after a newline
            addresses = [re.sub(r'\n\s+', '\n', mailer_div.text.strip())
                         for mailer_div in mailer_divs]

            # Create the filer info
            filer_infos.append(FilerInfo(company_name=company_name, cik=cik,
                                         identification=identification, addresses=addresses))

        return filer_infos

    @property
    def period_of_report(self) -> Optional[str]:
        "Get the period of report"
        filing_dates = self.get_filing_dates()
        # There are no dates at all when the page has no form groupings
        if filing_dates is None:
            return None
        _, _, period = filing_dates
        return period

    def get_filing_dates(self) -> Optional[Tuple[str, str, Optional[str]]]:
        """Return (filing date, accepted date, period) or None; parsed once per instance."""
        cached = getattr(self, "_filing_dates_cache", _UNSET)
        if cached is _UNSET:
            cached = self._parse_filing_dates()
            self._filing_dates_cache = cached
        return cached

    def _parse_filing_dates(self) -> Optional[Tuple[str, str, Optional[str]]]:
        """Read the dates from the 'formGrouping' divs of the page."""
        # Find the form grouping divs
        grouping_divs = self._soup.find_all("div", class_="formGrouping")
        if len(grouping_divs) == 0:
            return None
        date_grouping_div = grouping_divs[0]
        info_divs = date_grouping_div.find_all("div", class_="info")
        filing_date = info_divs[0].text.strip()
        accepted_date = info_divs[1].text.strip()

        # The period, when present, is the first info div of the second grouping
        if len(grouping_divs) > 1:
            period_grouping_div = grouping_divs[1]
            first_info_div = period_grouping_div.find("div", class_="info")
            if first_info_div:
                period = first_info_div.text.strip()
                return filing_date, accepted_date, period

        return filing_date, accepted_date, None

    @classmethod
    def load(cls, url: str):
        """Fetch the filing home page at `url` and parse it."""
        response = get_with_retry(url)
        soup = BeautifulSoup(response.text, 'html.parser')
        attachments = Attachments.load(soup)
        return cls(url, soup, attachments)

    def __repr__(self):
        return repr_rich(self.__rich__())

    def __rich__(self):
        return Panel(
            Group(
                self.attachments,
                Group(
                    *[filer_info.__rich__() for filer_info in self.get_filers()]
                )
            ))