import re
import zipfile
from collections import defaultdict
from functools import cached_property
from pathlib import Path
from typing import TYPE_CHECKING, DefaultDict, Dict, Iterator, List, Optional, Tuple, Union

if TYPE_CHECKING:
    from edgar._filings import Filing

from edgar.attachments import Attachment, Attachments, get_document_type
from edgar.httprequests import stream_with_retry
from edgar.sgml.filing_summary import FilingSummary
from edgar.sgml.sgml_header import FilingHeader
from edgar.sgml.sgml_parser import SGMLDocument, SGMLFormatType, SGMLParser
from edgar.sgml.tools import is_xml

__all__ = ['iter_documents', 'list_documents', 'FilingSGML', 'FilingHeader']


def parse_document(document_str: str) -> SGMLDocument:
    """
    Parse a single SGML <DOCUMENT> section, maintaining raw content.

    Args:
        document_str: The text between <DOCUMENT> and </DOCUMENT> tags.

    Returns:
        SGMLDocument with type/sequence/filename/description extracted and
        the raw section text preserved.
    """
    # FIX: each field pattern must anchor on its SGML tag. The previous
    # patterns were all the bare r'([^<\n]+)', so every field captured the
    # same first run of text in the document section.
    type_match = re.search(r'<TYPE>([^<\n]+)', document_str)
    sequence_match = re.search(r'<SEQUENCE>([^<\n]+)', document_str)
    filename_match = re.search(r'<FILENAME>([^<\n]+)', document_str)
    description_match = re.search(r'<DESCRIPTION>([^<\n]+)', document_str)

    return SGMLDocument(
        type=type_match.group(1).strip() if type_match else "",
        sequence=sequence_match.group(1).strip() if sequence_match else "",
        filename=filename_match.group(1).strip() if filename_match else "",
        description=description_match.group(1).strip() if description_match else "",
        raw_content=document_str
    )


def read_content(source: Union[str, Path]) -> Iterator[str]:
    """
    Read content from either a URL or file path, yielding lines as strings.
    Automatically handles gzip-compressed files with .gz extension.

    Args:
        source: Either a URL string or a file path

    Yields:
        str: Lines of content from the source

    Raises:
        TooManyRequestsError: If the server returns a 429 response
        FileNotFoundError: If the file path doesn't exist
        gzip.BadGzipFile: If the file is not a valid gzip file
    """
    if isinstance(source, str) and (source.startswith('http://') or source.startswith('https://')):
        # Handle URL using stream_with_retry
        for response in stream_with_retry(source):
            # Process each line from the response
            for line in response.iter_lines():
                if line is None:
                    continue
                # FIX: iter_lines may yield bytes depending on the HTTP client;
                # concatenating bytes + str raised TypeError. Decode first so
                # callers always receive str lines.
                if isinstance(line, bytes):
                    line = line.decode('utf-8', errors='replace')
                yield line + "\n"
    else:
        # Handle file path
        path = Path(source)

        # Check if the file is gzip-compressed
        if str(path).endswith('.gz'):
            import gzip
            with gzip.open(path, 'rt', encoding='utf-8', errors='replace') as file:
                yield from file
        else:
            # Regular file handling
            with path.open('r', encoding='utf-8', errors='replace') as file:
                yield from file


def read_content_as_string(source: Union[str, Path]) -> str:
    """
    Read content from either a URL or file path into a string.
    Uses existing read_content generator function.

    Args:
        source: Either a URL string or a file path

    Returns:
        str: Full content as string

    Raises:
        TooManyRequestsError: If the server returns a 429 response
        FileNotFoundError: If file path doesn't exist
    """
    # Convert lines from read_content to string
    lines = []
    for line in read_content(source):
        # Handle both string and bytes defensively
        if isinstance(line, bytes):
            lines.append(line.decode('utf-8', errors='replace'))
        else:
            lines.append(line)

    return ''.join(lines)


def iter_documents(source: Union[str, Path]) -> Iterator[SGMLDocument]:
    """
    Stream SGML documents from either a URL or file path, yielding parsed documents.

    Args:
        source: Either a URL string or a file path (string or Path object)

    Yields:
        SGMLDocument objects containing the parsed content

    Raises:
        ValueError: If the source is invalid
        ConnectionError: If URL retrieval fails after retries
        FileNotFoundError: If the file path doesn't exist
    """
    try:
        content = ''.join(read_content(source))

        # FIX: the pattern must be delimited by the SGML <DOCUMENT> tags; the
        # bare lazy group r'([\s\S]*?)' matched only empty strings.
        document_pattern = re.compile(r'<DOCUMENT>([\s\S]*?)</DOCUMENT>')

        for match in document_pattern.finditer(content):
            document = parse_document(match.group(1))
            if document:
                yield document
    except (ValueError, ConnectionError, FileNotFoundError) as e:
        raise type(e)(f"Error processing source {source}: {str(e)}") from e


def list_documents(source: Union[str, Path]) -> list[SGMLDocument]:
    """
    Convenience method to parse all documents from a source into a list.

    Args:
        source: Either a URL string or a file path

    Returns:
        List of SGMLDocument objects
    """
    return list(iter_documents(source))


def parse_file(source: Union[str, Path]) -> list[SGMLDocument]:
    """
    Convenience method to parse all documents from a source into a list.
    Alias of list_documents, kept for backward compatibility.

    Args:
        source: Either a URL string or a file path

    Returns:
        List of SGMLDocument objects
    """
    return list(iter_documents(source))


def parse_submission_text(content: str) -> Tuple[FilingHeader, DefaultDict[str, List[SGMLDocument]]]:
    """
    Parses the raw submission text and returns the filing header along with a dictionary
    mapping document sequence numbers to lists of SGMLDocument objects.

    Args:
        content (str): The raw text content of the submission.

    Returns:
        Tuple[FilingHeader, DefaultDict[str, List[SGMLDocument]]]: A tuple where the first element
        is the FilingHeader object representing the parsed header information, and the second element
        is a defaultdict mapping document sequence identifiers to their corresponding list of
        SGMLDocument objects.

    Details:
        - For submissions with the SGMLFormatType.SUBMISSION format, the function uses the pre-parsed
          filer data to create the FilingHeader.
        - For SEC-DOCUMENT formatted content, the header is initially parsed from the SGML text; if this
          fails, the header is parsed again with preprocessing enabled.
        - The function creates an SGMLDocument for each parsed document and groups them by their
          sequence identifier.

    Raises:
        Exception: Any exceptions raised during header parsing (handled internally by attempting to
        preprocess the header in case of failure).
    """
    # Create parser and get structure including header and documents
    parser = SGMLParser()
    parsed_data = parser.parse(content)

    # Create FilingHeader using already parsed data
    if parsed_data['format'] == SGMLFormatType.SUBMISSION:
        # For submission format, we already have parsed filer data
        header = FilingHeader.parse_submission_format_header(parsed_data=parsed_data)
    else:
        # For SEC-DOCUMENT format, pass the header text to the
        # specialized header parser since we need additional processing
        try:
            header = FilingHeader.parse_from_sgml_text(parsed_data['header'])
        except Exception:
            # Retry with preprocessing — some headers need cleanup first
            header = FilingHeader.parse_from_sgml_text(parsed_data['header'], preprocess=True)

    # Create document dictionary grouped by sequence identifier
    documents = defaultdict(list)
    for doc_data in parsed_data['documents']:
        doc = SGMLDocument.from_parsed_data(doc_data)
        documents[doc.sequence].append(doc)

    return header, documents


class FilingSGML:
    """
    Main class that parses and provides access to both the header
    and documents from an SGML filing.
    """
    # NOTE: '__dict__' must stay in __slots__ — cached_property requires a
    # per-instance __dict__ to store its computed values.
    __slots__ = ('header', '_documents_by_sequence', '__dict__')

    def __init__(self, header: FilingHeader, documents: defaultdict[str, List[SGMLDocument]]):
        """
        Initialize FilingSGML with parsed header and documents.

        Args:
            header (FilingHeader): Parsed header information
            documents: Dictionary of parsed document lists keyed by sequence
        """
        self.header: FilingHeader = header
        self._documents_by_sequence: defaultdict[str, List[SGMLDocument]] = documents
        # Secondary index for O(1) lookup by filename
        self._documents_by_name: Dict[str, SGMLDocument] = {
            doc.filename: doc
            for doc_lst in documents.values()
            for doc in doc_lst
        }

    @property
    def accession_number(self):
        return self.header.accession_number

    @property
    def cik(self):
        return self.header.cik

    @cached_property
    def entity(self):
        from edgar.entity import Entity
        cik = self.cik
        if cik:
            return Entity(cik)

    @property
    def form(self):
        return self.header.form

    @property
    def filing_date(self):
        return self.header.filing_date

    @property
    def date_as_of_change(self):
        return self.header.date_as_of_change

    @property
    def period_of_report(self):
        return self.header.period_of_report

    @property
    def effective_date(self):
        return self.header.filing_metadata.get('EFFECTIVE DATE')

    @property
    def path(self):
        """
        Get the root path of the filing on the EDGAR archive.
        """
        if self.accession_number:
            return f"/Archives/edgar/data/{self.header.cik}/{self.accession_number.replace('-', '')}"
        else:
            return "/"

    def html(self):
        """Return the text of the primary HTML document, if present and non-binary."""
        html_document = self.attachments.primary_html_document
        if html_document and not html_document.is_binary() and not html_document.empty:
            html_text = self.get_content(html_document.document)
            if isinstance(html_text, bytes):
                html_text = html_text.decode('utf-8')
            return html_text

    def xml(self):
        """Return the text of the primary XML document, if present and non-binary."""
        xml_document = self.attachments.primary_xml_document
        if xml_document and not xml_document.is_binary() and not xml_document.empty:
            xml_text = self.get_content(xml_document.document)
            if isinstance(xml_text, bytes):
                xml_text = xml_text.decode('utf-8')
            return xml_text

    def get_content(self, filename: str) -> Optional[str]:
        """
        Get the content of a document by its filename.
        Returns None if no document with that filename exists.
        """
        document = self._documents_by_name.get(filename)
        if document:
            return document.content

    @cached_property
    def attachments(self) -> Attachments:
        """
        Get all attachments from the filing, partitioned into document files,
        data files and primary documents.
        """
        is_datafile = False
        documents, datafiles, primary_files = [], [], []

        # Get the filing summary (may be None)
        filing_summary = self.filing_summary

        for sequence, document_lst in self._documents_by_sequence.items():
            for document in document_lst:
                attachment = Attachment(
                    sequence_number=sequence,
                    ixbrl=False,
                    path=f"{self.path}/{document.filename}",
                    document=document.filename,
                    document_type=get_document_type(filename=document.filename,
                                                    declared_document_type=document.type),
                    description=document.description,
                    size=None,
                    sgml_document=document,
                    filing_sgml=self
                )
                # Add the report purpose from the filing summary if available
                if filing_summary:
                    report = filing_summary.get_reports_by_filename(document.filename)
                    if report:
                        attachment.purpose = report.short_name
                # Sequence "1" is the primary document
                if sequence == "1":
                    primary_files.append(attachment)
                    documents.append(attachment)
                else:
                    # Once the first XML data file is seen, every subsequent
                    # attachment is treated as a data file (sticky flag).
                    if not is_datafile:
                        is_datafile = is_xml(filename=document.filename)
                    if is_datafile:
                        datafiles.append(attachment)
                    else:
                        documents.append(attachment)

        return Attachments(document_files=documents,
                           data_files=datafiles,
                           primary_documents=primary_files,
                           sgml=self)

    @cached_property
    def filing_summary(self):
        """Parse and return the FilingSummary.xml attachment, if present."""
        summary_attachment = self._documents_by_name.get("FilingSummary.xml")
        if summary_attachment:
            filing_summary = FilingSummary.parse(summary_attachment.content)
            filing_summary.reports._filing_summary = filing_summary
            filing_summary._filing_sgml = self
            return filing_summary

    def download(self, path: Union[str, Path], archive: bool = False):
        """
        Download all the attachments to a specified path.
        If the path is a directory, the file is saved with its original name in that directory.
        If the path is a file, the file is saved with the given path name.
        If archive is True, the attachments are saved in a zip file.

        path: str or Path - The path to save the attachments
        archive: bool (default False) - If True, save the attachments in a zip file
        """
        # FIX: the signature accepts str but the body called Path methods
        # directly; coerce so str paths work as advertised.
        path = Path(path)
        if archive:
            if path.is_dir():
                raise ValueError("Path must be a zip file name to create zipfile")
            else:
                with zipfile.ZipFile(path, 'w') as zipf:
                    for document in self._documents_by_name.values():
                        zipf.writestr(document.filename, document.content)
        else:
            if path.is_dir():
                for document in self._documents_by_name.values():
                    file_path = path / document.filename
                    content = document.content
                    if isinstance(content, bytes):
                        file_path.write_bytes(content)
                    else:
                        file_path.write_text(content, encoding='utf-8')
            else:
                raise ValueError("Path must be a directory")

    @property
    def primary_documents(self):
        """
        Get the primary documents from the filing.
        """
        return self.attachments.primary_documents

    @classmethod
    def from_source(cls, source: Union[str, Path]) -> "FilingSGML":
        """
        Create FilingSGML instance from either a URL or file path.
        Parses both header and documents.

        Args:
            source: Either a URL string or a file path

        Returns:
            FilingSGML: New instance with parsed header and documents

        Raises:
            ValueError: If header section cannot be found
            IOError: If file cannot be read
        """
        # Read content once
        content = read_content_as_string(source)

        # Parse header and documents
        header, documents = parse_submission_text(content)

        # Create FilingSGML instance
        return cls(header=header, documents=documents)

    @classmethod
    def from_text(cls, full_text_submission: str) -> "FilingSGML":
        """
        Create FilingSGML instance from either full text submission.
        Parses both header and documents.

        Args:
            full_text_submission: String containing full text submission

        Returns:
            FilingSGML: New instance with parsed header and documents

        Raises:
            ValueError: If header section cannot be found
        """
        # Parse header and documents
        header, documents = parse_submission_text(full_text_submission)

        # Create FilingSGML instance
        return cls(header=header, documents=documents)

    def get_document_by_sequence(self, sequence: str) -> Optional[SGMLDocument]:
        """
        Get a document by its sequence number.
        Direct dictionary lookup for O(1) performance; returns the first
        document when a sequence has several.
        """
        results = self._documents_by_sequence.get(sequence)
        if results and len(results) > 0:
            return results[0]

    def get_document_by_name(self, filename: str) -> Optional[SGMLDocument]:
        """
        Get a document by its filename.
        Direct dictionary lookup for O(1) performance.
        """
        return self._documents_by_name.get(filename)

    @classmethod
    def from_filing(cls, filing: 'Filing') -> 'FilingSGML':
        """Create from a Filing object that provides text_url."""
        filing_sgml = cls.from_source(filing.text_url)
        # Backfill header metadata missing from the SGML with what the
        # Filing object knows.
        if not filing_sgml.accession_number:
            filing_sgml.header.filing_metadata.update('ACCESSION NUMBER', filing.accession_no)
        if not filing_sgml.header.filing_metadata.get("CIK"):
            filing_sgml.header.filing_metadata.update('CIK', str(filing.cik).zfill(10))
        if not filing_sgml.header.form:
            filing_sgml.header.filing_metadata.update("CONFORMED SUBMISSION TYPE", filing.form)
        return filing_sgml

    def __str__(self) -> str:
        """String representation with basic filing info."""
        doc_count = len(self._documents_by_name)
        return f"FilingSGML(accession={self.header.accession_number}, document_count={doc_count})"

    def __repr__(self) -> str:
        return str(self)

    def get_document_sequences(self) -> List[str]:
        """
        Get all document sequences.
        Using list() is more efficient than sorted() when order doesn't matter.
        """
        return list(self._documents_by_sequence.keys())

    def get_all_document_types(self) -> List[str]:
        """
        Get unique document types in filing. Using set for deduplication.
        """
        # FIX: values() holds lists of documents, so the previous
        # comprehension read .type off a list; flatten first.
        return list({doc.type
                     for doc_lst in self._documents_by_sequence.values()
                     for doc in doc_lst})

    def get_document_count(self) -> int:
        """Get total number of documents."""
        # FIX: len() of the dict counted sequences, not documents.
        return sum(len(doc_lst) for doc_lst in self._documents_by_sequence.values())