Initial commit
This commit is contained in:
482
venv/lib/python3.10/site-packages/edgar/sgml/sgml_common.py
Normal file
482
venv/lib/python3.10/site-packages/edgar/sgml/sgml_common.py
Normal file
@@ -0,0 +1,482 @@
|
||||
import re
|
||||
import zipfile
|
||||
from collections import defaultdict
|
||||
from functools import cached_property
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING, DefaultDict, Dict, Iterator, List, Optional, Tuple, Union
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from edgar._filings import Filing
|
||||
|
||||
from edgar.attachments import Attachment, Attachments, get_document_type
|
||||
from edgar.httprequests import stream_with_retry
|
||||
from edgar.sgml.filing_summary import FilingSummary
|
||||
from edgar.sgml.sgml_header import FilingHeader
|
||||
from edgar.sgml.sgml_parser import SGMLDocument, SGMLFormatType, SGMLParser
|
||||
from edgar.sgml.tools import is_xml
|
||||
|
||||
__all__ = ['iter_documents', 'list_documents', 'FilingSGML', 'FilingHeader']
|
||||
|
||||
|
||||
def parse_document(document_str: str) -> SGMLDocument:
    """
    Build an SGMLDocument from the text of one SGML document section.

    The TYPE, SEQUENCE, FILENAME and DESCRIPTION tags are each searched for
    independently. A tag's value is the text that follows it up to the next
    '<' or newline, stripped of surrounding whitespace; a tag that is not
    found yields an empty string. The section text is stored unchanged as
    the raw content.
    """
    # One pattern per SGMLDocument field, keyed by the keyword argument name
    field_patterns = {
        'type': r'<TYPE>([^<\n]+)',
        'sequence': r'<SEQUENCE>([^<\n]+)',
        'filename': r'<FILENAME>([^<\n]+)',
        'description': r'<DESCRIPTION>([^<\n]+)',
    }

    field_values = {}
    for field_name, pattern in field_patterns.items():
        found = re.search(pattern, document_str)
        field_values[field_name] = found.group(1).strip() if found else ""

    return SGMLDocument(raw_content=document_str, **field_values)
|
||||
|
||||
|
||||
def read_content(source: Union[str, Path]) -> Iterator[str]:
    """
    Read content from either a URL or file path, yielding lines as strings.

    A string starting with 'http://' or 'https://' is streamed through
    stream_with_retry; anything else is treated as a file path. File paths
    ending in '.gz' are opened with gzip. Files are decoded as UTF-8 with
    undecodable bytes replaced.

    Args:
        source: Either a URL string or a file path

    Yields:
        str: Lines of content from the source. Lines read from a URL always
            end with a newline appended here; lines read from a file are
            yielded exactly as the file object produces them.

    Raises:
        FileNotFoundError: If the file path doesn't exist
        gzip.BadGzipFile: If a '.gz' file is not a valid gzip file
        Any exception raised by stream_with_retry for URL sources propagates
        unchanged (NOTE(review): the previous docs listed TooManyRequestsError
        for HTTP 429 - confirm against stream_with_retry).
    """
    if isinstance(source, str) and source.startswith(('http://', 'https://')):
        for response in stream_with_retry(source):
            for line in response.iter_lines():
                if line is None:
                    continue
                # iter_lines() may hand back bytes depending on the HTTP
                # client; normalise to str so the concatenation below cannot
                # fail and callers always receive text.
                if isinstance(line, bytes):
                    line = line.decode('utf-8', errors='replace')
                yield line + "\n"
        return

    path = Path(source)
    if str(path).endswith('.gz'):
        # Imported lazily: only needed for compressed local files
        import gzip
        with gzip.open(path, 'rt', encoding='utf-8', errors='replace') as file:
            yield from file
    else:
        with path.open('r', encoding='utf-8', errors='replace') as file:
            yield from file
|
||||
|
||||
|
||||
def read_content_as_string(source: Union[str, Path]) -> str:
    """
    Return the full content of a URL or file path as a single string.

    The lines produced by read_content are concatenated in order. Any line
    that arrives as bytes is decoded as UTF-8 with undecodable bytes
    replaced; string lines are used as-is.

    Args:
        source: Either a URL string or a file path

    Returns:
        str: Full content as string

    Raises:
        Whatever read_content raises for the given source.
    """
    return ''.join(
        piece.decode('utf-8', errors='replace') if isinstance(piece, bytes) else piece
        for piece in read_content(source)
    )
|
||||
|
||||
|
||||
def iter_documents(source: Union[str, Path]) -> Iterator[SGMLDocument]:
    """
    Yield a parsed SGMLDocument for every <DOCUMENT>...</DOCUMENT> section
    found in the content of a URL or file path.

    The whole source is read into memory before matching, so documents are
    yielded lazily but the input is not streamed.

    Args:
        source: Either a URL string or a file path (string or Path object)

    Yields:
        SGMLDocument objects containing the parsed content

    Raises:
        ValueError, ConnectionError, FileNotFoundError: Re-raised as the same
            exception type with the source included in the message and the
            original exception attached as the cause.
    """
    try:
        full_text = ''.join(read_content(source))
        # Non-greedy body so each match stops at the first closing tag
        for section in re.finditer(r'<DOCUMENT>([\s\S]*?)</DOCUMENT>', full_text):
            parsed = parse_document(section.group(1))
            if not parsed:
                continue
            yield parsed
    except (ValueError, ConnectionError, FileNotFoundError) as e:
        raise type(e)(f"Error processing source {source}: {str(e)}") from e
|
||||
|
||||
def list_documents(source: Union[str, Path]) -> list[SGMLDocument]:
    """
    Collect every document produced by iter_documents into a list.

    Args:
        source: Either a URL string or a file path

    Returns:
        List of SGMLDocument objects, in the order iter_documents yields them
    """
    documents = list(iter_documents(source))
    return documents
|
||||
|
||||
|
||||
def parse_file(source: Union[str, Path]) -> list[SGMLDocument]:
    """
    Parse a source and return all of its documents as a list.

    Behaves the same as list_documents: the source is handed to
    iter_documents and the results are materialised.

    Args:
        source: Either a URL string or a file path

    Returns:
        List of SGMLDocument objects
    """
    return [*iter_documents(source)]
|
||||
|
||||
def parse_submission_text(content: str) -> Tuple[FilingHeader, DefaultDict[str, List[SGMLDocument]]]:
    """
    Parse raw submission text into a filing header and its documents.

    Args:
        content (str): The raw text content of the submission.

    Returns:
        Tuple[FilingHeader, DefaultDict[str, List[SGMLDocument]]]:
            The parsed FilingHeader, and a defaultdict mapping each document
            sequence identifier to the list of SGMLDocument objects that
            carry that sequence.

    Details:
        - When the parser reports SGMLFormatType.SUBMISSION, the header is
          built from the already-parsed data.
        - For any other format the header text is handed to
          FilingHeader.parse_from_sgml_text; if that raises, it is retried
          once with preprocess=True.
        - Documents are created with SGMLDocument.from_parsed_data and
          grouped by their sequence attribute.

    Raises:
        Exception: A failure of the first header parse is swallowed and
            retried; a failure of the retry (or of any other step)
            propagates to the caller.
    """
    parsed = SGMLParser().parse(content)

    is_submission_format = parsed['format'] == SGMLFormatType.SUBMISSION
    if not is_submission_format:
        # Header arrives as text here and needs the dedicated header parser;
        # fall back to the preprocessing variant when the plain parse fails.
        try:
            header = FilingHeader.parse_from_sgml_text(parsed['header'])
        except Exception:
            header = FilingHeader.parse_from_sgml_text(parsed['header'], preprocess=True)
    else:
        # Submission format: filer data is already parsed
        header = FilingHeader.parse_submission_format_header(parsed_data=parsed)

    by_sequence = defaultdict(list)
    for entry in parsed['documents']:
        sgml_doc = SGMLDocument.from_parsed_data(entry)
        by_sequence[sgml_doc.sequence].append(sgml_doc)

    return header, by_sequence
|
||||
|
||||
|
||||
|
||||
class FilingSGML:
    """
    Main class that parses and provides access to both the header and documents
    from an SGML filing.

    Documents are indexed two ways: by sequence identifier (each sequence maps
    to a list of documents) and by filename (one document per filename; when
    several documents share a filename the last one seen is kept).
    """
    # '__dict__' stays in __slots__ because cached_property values and
    # _documents_by_name are stored in the instance dictionary.
    __slots__ = ('header', '_documents_by_sequence', '__dict__')

    def __init__(self, header: "FilingHeader", documents: "defaultdict[str, List[SGMLDocument]]"):
        """
        Initialize FilingSGML with parsed header and documents.

        Args:
            header (FilingHeader): Parsed header information
            documents: Mapping of sequence identifier to the list of parsed
                documents carrying that sequence
        """
        self.header: "FilingHeader" = header
        self._documents_by_sequence: "defaultdict[str, List[SGMLDocument]]" = documents
        self._documents_by_name: "Dict[str, SGMLDocument]" = {
            doc.filename: doc for doc_lst in documents.values() for doc in doc_lst
        }

    @property
    def accession_number(self):
        """Accession number as reported by the header."""
        return self.header.accession_number

    @property
    def cik(self):
        """CIK as reported by the header."""
        return self.header.cik

    @cached_property
    def entity(self):
        """Entity built from the header CIK, or None when the CIK is falsy."""
        # Imported lazily, as in the original (presumably to avoid an import
        # cycle with edgar.entity - TODO confirm).
        from edgar.entity import Entity
        cik = self.cik
        if cik:
            return Entity(cik)
        return None

    @property
    def form(self):
        """Form type as reported by the header."""
        return self.header.form

    @property
    def filing_date(self):
        """Filing date as reported by the header."""
        return self.header.filing_date

    @property
    def date_as_of_change(self):
        """Date-as-of-change as reported by the header."""
        return self.header.date_as_of_change

    @property
    def period_of_report(self):
        """Period of report as reported by the header."""
        return self.header.period_of_report

    @property
    def effective_date(self):
        """The 'EFFECTIVE DATE' entry of the header's filing metadata, if any."""
        return self.header.filing_metadata.get('EFFECTIVE DATE')

    @property
    def path(self):
        """
        Get the root path of the filing.

        Built from the header CIK and the accession number with dashes
        removed; a placeholder is returned when there is no accession number.
        """
        if self.accession_number:
            return f"/Archives/edgar/data/{self.header.cik}/{self.accession_number.replace('-', '')}"
        else:
            return "/<SGML FILE>"

    def _attachment_text(self, attachment):
        """
        Return the content of *attachment* as text.

        Returns None when the attachment is missing, binary or empty, or when
        no document with that filename is stored. Bytes content is decoded as
        UTF-8.
        """
        if not attachment or attachment.is_binary() or attachment.empty:
            return None
        text = self.get_content(attachment.document)
        if isinstance(text, bytes):
            text = text.decode('utf-8')
        return text

    def html(self):
        """Text of the primary HTML document, or None if unavailable."""
        return self._attachment_text(self.attachments.primary_html_document)

    def xml(self):
        """Text of the primary XML document, or None if unavailable."""
        return self._attachment_text(self.attachments.primary_xml_document)

    def get_content(self, filename: str) -> Optional[str]:
        """
        Get the content of a document by its filename.

        Returns None when no document with that filename exists. The value is
        whatever the document's ``content`` attribute holds (callers in this
        class also handle it being bytes).
        """
        document = self._documents_by_name.get(filename)
        if document:
            return document.content
        return None

    @cached_property
    def attachments(self) -> "Attachments":
        """
        Get all attachments from the filing.

        Every stored document becomes an Attachment. Documents with sequence
        "1" are primary documents (and also listed as document files). For
        the remaining documents, once an XML filename is encountered that
        document and every later one are classified as data files; earlier
        ones are document files.
        """
        is_datafile = False
        documents, datafiles, primary_files = [], [], []

        # Used to attach a report short name to matching attachments
        filing_summary = self.filing_summary

        for sequence, document_lst in self._documents_by_sequence.items():
            for document in document_lst:
                attachment = Attachment(
                    sequence_number=sequence,
                    ixbrl=False,
                    path=f"{self.path}/{document.filename}",
                    document=document.filename,
                    document_type=get_document_type(filename=document.filename, declared_document_type=document.type),
                    description=document.description,
                    size=None,
                    sgml_document=document,
                    filing_sgml=self
                )
                # Add from the filing summary if available
                if filing_summary:
                    report = filing_summary.get_reports_by_filename(document.filename)
                    if report:
                        attachment.purpose = report.short_name
                # Check if the document is a primary document
                if sequence == "1":
                    primary_files.append(attachment)
                    documents.append(attachment)
                else:
                    # The flag is sticky: it is only re-evaluated until the
                    # first XML file is seen.
                    if not is_datafile:
                        is_datafile = is_xml(filename=document.filename)
                    if is_datafile:
                        datafiles.append(attachment)
                    else:
                        documents.append(attachment)

        return Attachments(document_files=documents, data_files=datafiles, primary_documents=primary_files, sgml=self)

    @cached_property
    def filing_summary(self):
        """
        Parsed 'FilingSummary.xml' document, or None when the filing has none.

        The parsed summary is linked back to this filing and to its own
        reports collection before being returned.
        """
        summary_attachment = self._documents_by_name.get("FilingSummary.xml")
        if summary_attachment:
            filing_summary = FilingSummary.parse(summary_attachment.content)
            filing_summary.reports._filing_summary = filing_summary
            filing_summary._filing_sgml = self
            return filing_summary
        return None

    def download(self, path: Union[str, Path], archive: bool = False):
        """
        Download all the attachments to a specified path.

        Without ``archive`` the path must be an existing directory; each
        document is written there under its own filename (bytes content is
        written as-is, text content as UTF-8).
        With ``archive=True`` the path must be a zip file name (not a
        directory); all documents are written into that zip file.

        Args:
            path: str or Path - The path to save the attachments
            archive: bool (default False) - If True, save the attachments in a zip file

        Raises:
            ValueError: If ``archive`` is True and path is a directory, or
                ``archive`` is False and path is not a directory.
        """
        # Accept plain strings as the annotation promises; str has no is_dir()
        target = Path(path)

        if archive:
            if target.is_dir():
                raise ValueError("Path must be a zip file name to create zipfile")
            with zipfile.ZipFile(target, 'w') as zipf:
                for document in self._documents_by_name.values():
                    zipf.writestr(document.filename, document.content)
            return

        if not target.is_dir():
            raise ValueError("Path must be a directory")
        for document in self._documents_by_name.values():
            file_path = target / document.filename
            content = document.content
            if isinstance(content, bytes):
                file_path.write_bytes(content)
            else:
                file_path.write_text(content, encoding='utf-8')

    @property
    def primary_documents(self):
        """
        Get the primary documents from the filing.
        """
        return self.attachments.primary_documents

    @classmethod
    def from_source(cls, source: Union[str, Path]) -> "FilingSGML":
        """
        Create FilingSGML instance from either a URL or file path.
        Parses both header and documents.

        Args:
            source: Either a URL string or a file path

        Returns:
            FilingSGML: New instance with parsed header and documents

        Raises:
            Exceptions raised while reading the source or parsing the
            submission text propagate to the caller.
        """
        # Read content once, then parse header and documents from it
        content = read_content_as_string(source)
        header, documents = parse_submission_text(content)
        return cls(header=header, documents=documents)

    @classmethod
    def from_text(cls, full_text_submission: str) -> "FilingSGML":
        """
        Create FilingSGML instance from a full text submission.
        Parses both header and documents.

        Args:
            full_text_submission: String containing full text submission

        Returns:
            FilingSGML: New instance with parsed header and documents

        Raises:
            Exceptions raised while parsing the submission text propagate to
            the caller.
        """
        header, documents = parse_submission_text(full_text_submission)
        return cls(header=header, documents=documents)

    def get_document_by_sequence(self, sequence: str) -> "Optional[SGMLDocument]":
        """
        Get the first document stored under a sequence number.

        Returns None when the sequence is unknown or has no documents.
        """
        # .get() rather than indexing so the defaultdict is not grown by a miss
        results = self._documents_by_sequence.get(sequence)
        if results:
            return results[0]
        return None

    def get_document_by_name(self, filename: str) -> "Optional[SGMLDocument]":
        """
        Get a document by its filename, or None when there is no such document.
        """
        return self._documents_by_name.get(filename)

    @classmethod
    def from_filing(cls, filing: 'Filing') -> 'FilingSGML':
        """
        Create from a Filing object that provides text_url.

        After parsing, missing accession number, CIK and form values are
        filled in from the Filing object.
        """
        filing_sgml = cls.from_source(filing.text_url)
        # NOTE(review): update() is called here with (key, value). If
        # filing_metadata is a plain dict this raises TypeError and should be
        # item assignment instead - confirm against FilingHeader.
        if not filing_sgml.accession_number:
            filing_sgml.header.filing_metadata.update('ACCESSION NUMBER', filing.accession_no)
        if not filing_sgml.header.filing_metadata.get("CIK"):
            filing_sgml.header.filing_metadata.update('CIK', str(filing.cik).zfill(10))
        if not filing_sgml.header.form:
            filing_sgml.header.filing_metadata.update("CONFORMED SUBMISSION TYPE", filing.form)
        return filing_sgml

    def __str__(self) -> str:
        """String representation with basic filing info."""
        doc_count = len(self._documents_by_name)
        return f"FilingSGML(accession={self.header.accession_number}, document_count={doc_count})"

    def __repr__(self) -> str:
        return str(self)

    def get_document_sequences(self) -> List[str]:
        """
        Get all document sequences, in the order they are stored (unsorted).
        """
        return list(self._documents_by_sequence.keys())

    def get_all_document_types(self) -> List[str]:
        """
        Get unique document types in filing.

        Each sequence maps to a list of documents, so the lists are flattened
        before the types are de-duplicated. Order is not guaranteed.
        """
        return list({
            doc.type
            for doc_lst in self._documents_by_sequence.values()
            for doc in doc_lst
        })

    def get_document_count(self) -> int:
        """Get total number of documents across all sequences."""
        return sum(len(doc_lst) for doc_lst in self._documents_by_sequence.values())
|
||||
Reference in New Issue
Block a user