from datetime import datetime
from typing import Any, Dict, List, Optional

import orjson as json
import pandas as pd
from bs4 import BeautifulSoup, Comment, Tag
from pydantic import BaseModel
from rich import box
from rich.console import Group
from rich.panel import Panel
from rich.table import Column, Table
from rich.text import Text

from edgar._party import Address, get_addresses_as_columns
from edgar.core import sec_dot_gov
from edgar.formatting import display_size
from edgar.httprequests import download_file
from edgar.reference import describe_form
from edgar.richtools import repr_rich

__all__ = ['FilingDirectory', 'IndexHeaders', 'ReportingOwner', 'CompanyData', 'FilingValues', 'FormerCompany']


class FilingDirectory:
    """
    The location for the filing on SEC EDGAR and detailed locations and timestamps for the files in the filing
    Sourced from the index.json file in the filing directory
    """

    def __init__(self, name: str, parent_dir: str, items: pd.DataFrame):
        """Store the directory name, its parent directory and the table of files."""
        self.name = name
        self.parent_dir = parent_dir
        self.items = items

    @property
    def accession_number(self):
        "Convert 000121390024004875 to 0001213900-24-004875"
        # The last path segment of the directory name is the undashed accession number
        accession_no = self.name.split("/")[-1]
        return f"{accession_no[:10]}-{accession_no[10:12]}-{accession_no[12:]}"

    @property
    def index_headers(self):
        """Download and return the `-index-headers.html` file for this filing directory."""
        return download_file(f"{sec_dot_gov}/{self.name}/{self.accession_number}-index-headers.html")

    @classmethod
    def load(cls, basedir: str):
        """
        Build a FilingDirectory from the `index.json` found under `basedir`.

        The `directory.item` entries become a DataFrame restricted to the
        columns Name, LastModified and Size.
        """
        index_url = f"{basedir}/index.json"
        index = json.loads(download_file(index_url))
        directory_json = index['directory']
        items = (pd.DataFrame(data=directory_json['item'])
                 .rename(columns={"name": "Name", "last-modified": "LastModified", "size": "Size"})
                 .filter(["Name", "LastModified", "Size"])
                 )
        directory: FilingDirectory = FilingDirectory(
            name=directory_json['name'],
            parent_dir=directory_json['parent-dir'],
            items=items
        )
        return directory

    def __len__(self):
        """Return the number of rows in the items table."""
        return len(self.items)

    def __rich__(self):
        """Render the directory listing as a rich Table with one row per item."""
        table = Table(
            "Name", "LastModified", "Size",
            title=Text(f"Filing Directory {self.name}", style="bold"),
            row_styles=["", "bold"],
            box=box.SIMPLE)
        for _, row in self.items.iterrows():
            table.add_row(row['Name'], row['LastModified'], display_size(row['Size']))
        return table

    def __repr__(self):
        return repr_rich(self.__rich__())


# The classes below represent the SEC filing headers of a filing.
# The headers are extracted from the HTML file of the filing.
# This is the file `-index-headers.html`


class CompanyData(BaseModel):
    """The company data section of a filing header."""
    conformed_name: str
    cik: str
    assigned_sic: Optional[str] = None
    organization_name: Optional[str] = None
    irs_number: Optional[str] = None
    fiscal_year_end: Optional[str] = None

    @property
    def name(self):
        """Alias for `conformed_name`."""
        return self.conformed_name

    def __rich__(self):
        """Render the conformed name and CIK as a single-row table."""
        table = Table(Column("", style="bold deep_sky_blue1"), "", box=box.ROUNDED, show_header=False, )
        table.add_row(self.conformed_name, self.cik)
        return table


class FilingValues(BaseModel):
    """The filing values section of a filing header."""
    form_type: str
    act: str
    file_number: str
    film_number: str


class FormerCompany(BaseModel):
    """A former company name and the date it was changed."""
    former_conformed_name: str
    date_changed: str


class Filer(BaseModel):
    """The filer section of a filing header."""
    company_data: CompanyData
    filing_values: FilingValues
    business_address: Address
    mail_address: Address
    former_company: List[FormerCompany]

    def __rich__(self):
        """Render the company data and both addresses inside a "Filer" panel."""
        contents = [self.company_data,
                    get_addresses_as_columns(business_address=self.business_address,
                                             mailing_address=self.mail_address)
                    ]
        return Panel(Group(*contents), title="Filer", style="bold", box=box.ROUNDED)

    def __repr__(self):
        return repr_rich(self.__rich__())


class SubjectCompany(BaseModel):
    """The subject company section of a filing header."""
    company_data: CompanyData
    filing_values: FilingValues
    business_address: Address
    mail_address: Address
    former_company: List[FormerCompany]

    def __rich__(self):
        """Render the company data and both addresses inside a "Subject Company" panel."""
        contents = [self.company_data,
                    get_addresses_as_columns(business_address=self.business_address,
                                             mailing_address=self.mail_address)
                    ]
        return Panel(Group(*contents), title="Subject Company", style="bold", box=box.ROUNDED)

    def __repr__(self):
        return repr_rich(self.__rich__())


class OwnerData(BaseModel):
    """The owner data section of a reporting owner."""
    conformed_name: str
    cik: str
    organization_name: Optional[str] = None

    @property
    def name(self):
        """Alias for `conformed_name`."""
        return self.conformed_name

    def __rich__(self):
        """Render the conformed name and CIK as a single-row table."""
        table = Table(Column("", style="bold deep_sky_blue1"), "", box=box.ROUNDED, show_header=False, )
        table.add_row(self.conformed_name, self.cik)
        return table

    def __repr__(self):
        return repr_rich(self.__rich__())


class ReportingOwner(BaseModel):
    """The reporting owner section of a filing header."""
    company_data: Optional[CompanyData]
    owner_data: Optional[OwnerData]
    filing_values: FilingValues
    mail_address: Address

    def __rich__(self):
        """Render whichever of company/owner data is present plus the mail address."""
        contents = []
        if self.company_data:
            contents.append(self.company_data)
        if self.owner_data:
            contents.append(self.owner_data)
        contents.append(Text(str(self.mail_address)))
        return Panel(Group(*contents), title="Reporting Owner", style="bold", box=box.ROUNDED)

    def __repr__(self):
        return repr_rich(self.__rich__())


class Issuer(BaseModel):
    """The issuer section of a filing header."""
    company_data: CompanyData
    mail_address: Address
    business_address: Address


# Tags (lower-cased, dashes replaced by underscores) that open a nested section
# rather than carrying a value on the same line
nested_tags = [
    'filer',
    'issuer',
    'subject_company',
    'reporting_owner',
    'owner_data',
    'company_data',
    'filing_values',
    'business_address',
    'mail_address',
    'former_company'
]

# Nested sections that are collected into a list when they occur more than once.
# Every other nested section keeps only its last occurrence.
_REPEATABLE_NESTED_TAGS = frozenset({'former_company'})


class IndexHeaders(BaseModel):
    """
    Represent the SEC filing headers of a filing.
    This is parsed from the comment section of the HTML file `-index-headers.html`
    """
    filing_date: str
    acceptance_datetime: datetime
    accession_number: str
    form: str
    public_document_count: int
    period: Optional[str] = None
    items: List[str]
    date_of_filing_date_change: Optional[str] = None
    effectiveness_date: Optional[str] = None
    filer: Optional[Filer] = None
    reporting_owner: Optional[ReportingOwner] = None
    subject_company: Optional[SubjectCompany] = None
    issuer: Optional[Issuer] = None

    @property
    def company_name(self):
        """
        Return the conformed name of the filer, else the subject company,
        else the issuer, else an empty string.
        """
        if self.filer:
            return self.filer.company_data.conformed_name
        elif self.subject_company:
            return self.subject_company.company_data.conformed_name
        elif self.issuer:
            return self.issuer.company_data.conformed_name
        return ""

    @property
    def title(self):
        """Return "<form> - <company name> <accession number>"."""
        return f"{self.form} - {self.company_name} {self.accession_number}"

    @staticmethod
    def _prepare_address(data: Dict[str, Any], address_type: str) -> Address:
        """
        Prepare an address object from the data dictionary.

        The `address_type` entry is removed from `data`; its `zip` and `state`
        keys are renamed to `zipcode` and `state_or_country` (defaulting to '').
        """
        address_dict = data.pop(address_type, {})
        address_dict['zipcode'] = address_dict.pop('zip', '')
        address_dict['state_or_country'] = address_dict.pop('state', '')
        return Address(**address_dict)

    @staticmethod
    def _parse_header_lines(lines: List[str]) -> Dict[str, Any]:
        """
        Parse the lines of the SEC-HEADER block into a nested dictionary.

        Tag names are lower-cased and dashes become underscores. Tags listed in
        `nested_tags` open a new dictionary that stays current until a closing
        tag is seen; all other tags store the text that follows them on the line.
        A value tag that repeats within a section becomes a list of values.
        """
        data: Dict[str, Any] = {}
        # Stack of open sections; the root dictionary is never popped
        stack: List[Dict[str, Any]] = [data]
        class_name: Optional[str] = None

        for line in lines:
            line = line.strip()

            # Blank lines carry no content
            if not line:
                continue

            # Skip the main SEC-HEADER tag
            if line.startswith("<SEC-HEADER>"):
                continue

            # Handle closing tags by popping the context stack
            if line.startswith("</"):
                if len(stack) > 1:  # Ensure we don't pop the root context
                    stack.pop()
                continue

            current = stack[-1]

            # Handle opening tags and values
            if line.startswith("<"):
                tag = line[1:].split(">")[0]
                class_name = tag.lower().replace("-", "_")
                value = line[len(tag) + 2:].strip()

                if class_name not in nested_tags:
                    # A value tag: store it, promoting to a list on repetition
                    if class_name not in current:
                        current[class_name] = value
                    else:
                        if not isinstance(current[class_name], list):
                            current[class_name] = [current[class_name]]
                        current[class_name].append(value)
                else:
                    # Create a new context for nested tags
                    new_context: Dict[str, Any] = {}
                    if class_name in _REPEATABLE_NESTED_TAGS:
                        siblings = current.get(class_name)
                        if not isinstance(siblings, list):
                            siblings = current[class_name] = []
                        siblings.append(new_context)
                    else:
                        current[class_name] = new_context
                    stack.append(new_context)
            elif class_name is not None:
                # Text without a tag is stored under the most recently seen tag name.
                # Text appearing before any tag has nothing to attach to and is ignored.
                current[class_name] = line

        return data

    @staticmethod
    def _build_filing_values(section: Dict[str, Any]) -> FilingValues:
        """Build FilingValues from `section["filing_values"]`, defaulting missing fields to ''."""
        values = section["filing_values"]
        return FilingValues(
            form_type=values.get("form_type", ""),
            act=values.get("act", ""),
            file_number=values.get("file_number", ""),
            film_number=values.get("film_number", "")
        )

    @staticmethod
    def _build_former_companies(section: Dict[str, Any]) -> List[FormerCompany]:
        """Remove `former_company` from `section` and return it as a list of FormerCompany."""
        raw = section.pop("former_company", [])
        if isinstance(raw, dict):
            raw = [raw]
        if not isinstance(raw, list):
            return []

        former_companies = []
        for fc in raw:
            if isinstance(fc, dict):
                former_companies.append(FormerCompany(
                    former_conformed_name=fc.get("former_conformed_name", ""),
                    date_changed=fc.get("date_changed", "")
                ))
            elif isinstance(fc, str):
                former_companies.append(FormerCompany(former_conformed_name=fc, date_changed=''))
        return former_companies

    @classmethod
    def load(cls, header_text: str):
        """
        Load the IndexHeaders from the HTML file content.

        The header is read from the first HTML comment in `header_text`.
        Raises IndexError when the content has no comment and KeyError when the
        `acceptance_datetime` or `type` entries (or a section's `filing_values`)
        are missing.
        """
        soup = BeautifulSoup(header_text, 'html.parser')

        # The SEC-HEADER block is taken from the first comment node
        comment_text = soup.find_all(string=lambda text: isinstance(text, Comment))[0].strip()
        data = cls._parse_header_lines(comment_text.split("\n"))

        # Process Filer if present
        filer = None
        filer_data = data.pop("filer", None)
        if filer_data:
            filer = Filer(
                company_data=CompanyData(**filer_data.pop("company_data", {})),
                filing_values=cls._build_filing_values(filer_data),
                business_address=cls._prepare_address(filer_data, "business_address"),
                mail_address=cls._prepare_address(filer_data, "mail_address"),
                former_company=cls._build_former_companies(filer_data)
            )

        # Process SubjectCompany if present
        subject_company = None
        subject_company_data = data.pop("subject_company", None)
        if subject_company_data:
            subject_company = SubjectCompany(
                company_data=CompanyData(**subject_company_data.pop("company_data", {})),
                filing_values=cls._build_filing_values(subject_company_data),
                business_address=cls._prepare_address(subject_company_data, "business_address"),
                mail_address=cls._prepare_address(subject_company_data, "mail_address"),
                former_company=cls._build_former_companies(subject_company_data)
            )

        # Process ReportingOwner if present
        reporting_owner = None
        reporting_owner_data = data.pop("reporting_owner", None)
        if reporting_owner_data:
            # A reporting owner may carry owner data, company data, or both
            owner_data = reporting_owner_data.pop("owner_data", None)
            company_data = reporting_owner_data.pop("company_data", None)
            reporting_owner = ReportingOwner(
                company_data=CompanyData(**company_data) if company_data else None,
                owner_data=OwnerData(**owner_data) if owner_data else None,
                filing_values=cls._build_filing_values(reporting_owner_data),
                mail_address=cls._prepare_address(reporting_owner_data, "mail_address")
            )

        # Process Issuer if present
        issuer = None
        issuer_data = data.pop("issuer", None)
        if issuer_data:
            company_data = CompanyData(**issuer_data.pop("company_data", {}))
            business_address = cls._prepare_address(issuer_data, "business_address")
            mail_address = cls._prepare_address(issuer_data, "mail_address")
            issuer = Issuer(
                company_data=company_data,
                business_address=business_address,
                mail_address=mail_address
            )

        # Ensure items is a list
        items = data.pop("items", [])
        if isinstance(items, str):
            items = [items]

        # Convert acceptance_datetime to datetime object
        acceptance_datetime_str = data.pop("acceptance_datetime")
        acceptance_datetime = (datetime.strptime(acceptance_datetime_str, '%Y%m%d%H%M%S')
                               if acceptance_datetime_str else None)

        # Reformat filing_date from YYYYMMDD to YYYY-MM-DD
        filing_date_str = data.pop("filing_date", None)
        filing_date = (datetime.strptime(filing_date_str, '%Y%m%d').strftime('%Y-%m-%d')
                       if filing_date_str else None)

        date_of_change_str = data.pop("date_of_filing_date_change", None)
        if date_of_change_str:
            data["date_of_filing_date_change"] = (datetime.strptime(date_of_change_str, '%Y%m%d')
                                                  .strftime('%Y-%m-%d'))

        # The type is really the form
        data["form"] = data.pop("type")

        # The public document count is an integer
        data["public_document_count"] = int(data.pop("public_document_count", 0))

        # Prepare the final dictionary for IndexHeaders initialization
        sec_header_data = {
            **data,
            "filing_date": filing_date,
            "acceptance_datetime": acceptance_datetime,
            "items": items,
            "filer": filer,
            "subject_company": subject_company,
            "reporting_owner": reporting_owner,
            "issuer": issuer
        }

        # The <pre> block contains the HTML for the documents
        #documents = IndexHeaders._extract_documents_from_pre(soup.find("pre"))

        # Initialize IndexHeaders with the parsed data
        return cls(**sec_header_data)

    @staticmethod
    def _extract_documents_from_pre(pre_tag: Tag):
        """
        Look up the `type` element of every `document` element in a `<pre>` block.

        The text of `pre_tag` is re-parsed with the same 'html.parser' backend
        used when loading the headers, so results do not depend on which
        optional parsers happen to be installed.

        :param pre_tag: the `<pre>` tag, or None when the page has none
        :return: a list with one `find("type")` result per `document` element
                 (an entry is None when that document has no `type` element)
        """
        if pre_tag is None:
            return []
        soup = BeautifulSoup(pre_tag.text, 'html.parser')
        return [document_tag.find("type") for document_tag in soup.find_all("document")]

    @staticmethod
    def _extract_comment_text(soup):
        """
        Return the stripped text of the first comment node in `soup`.

        :param soup: a parsed document supporting `find_all(string=...)`
        :return: the first comment's text with surrounding whitespace removed,
                 or None when the document contains no comment
        """
        comment_nodes = soup.find_all(string=lambda node: isinstance(node, Comment))
        first_comment = next(iter(comment_nodes), None)
        if first_comment is None:
            return None
        return first_comment.strip()

    @staticmethod
    def _extract_accession_number(title: str):
        import re
        match = re.search(r'SEC EDGAR Submission (\d{10}-\d{2}-\d{6})', title)
        if match:
            return match.group(1)
        return None

    def __rich__(self):
        """
        Render the headers as a rich Panel.

        The panel holds a summary table (filing date, acceptance datetime,
        document count), then the filer, subject company and reporting owner
        when present, then a table of items when there are any.
        """
        # Summary Information
        overview = Table("Filing Date", "Acceptance Datetime", "Documents", box=box.ROUNDED)
        accepted_at = self.acceptance_datetime.strftime('%Y-%m-%d %H:%M:%S')
        overview.add_row(self.filing_date, accepted_at, str(self.public_document_count))

        renderables = [overview]
        # Parties are shown in a fixed order; absent ones are left out
        for party in (self.filer, self.subject_company, self.reporting_owner):
            if party:
                renderables.append(party)

        if self.items:
            items_table = Table("Items", box=box.ROUNDED)
            for entry in self.items:
                items_table.add_row(entry)
            renderables.append(items_table)

        return Panel(
            Group(*renderables),
            box=box.ROUNDED,
            title=self.title,
            subtitle=describe_form(self.form),
            style="bold"
        )

    def __repr__(self):
        """Return the text produced by `repr_rich` for this object's rich rendering."""
        rendered = self.__rich__()
        return repr_rich(rendered)