Initial commit

This commit is contained in:
kdusek
2025-12-09 12:13:01 +01:00
commit 8e654ed209
13332 changed files with 2695056 additions and 0 deletions

View File

@@ -0,0 +1,536 @@
from datetime import datetime
from typing import Any, Dict, List, Optional
import orjson as json
import pandas as pd
from bs4 import BeautifulSoup, Comment, Tag
from pydantic import BaseModel
from rich import box
from rich.console import Group
from rich.panel import Panel
from rich.table import Column, Table
from rich.text import Text
from edgar._party import Address, get_addresses_as_columns
from edgar.core import sec_dot_gov
from edgar.formatting import display_size
from edgar.httprequests import download_file
from edgar.reference import describe_form
from edgar.richtools import repr_rich
# Names exported by a star-import of this module. Classes defined in this file
# but not listed here (e.g. Filer, SubjectCompany, OwnerData, Issuer) are still
# importable by explicit name.
__all__ = ['FilingDirectory', 'IndexHeaders', 'ReportingOwner', 'CompanyData', 'FilingValues', 'FormerCompany']
class FilingDirectory:
    """
    Where a filing lives on SEC EDGAR, together with the name, last-modified
    timestamp and size of every file listed for it.
    Built from the index.json file in the filing directory.
    """

    def __init__(self, name: str, parent_dir: str, items: pd.DataFrame):
        self.name = name
        self.parent_dir = parent_dir
        self.items = items

    @property
    def accession_number(self):
        "Dashed accession number, e.g. 000121390024004875 becomes 0001213900-24-004875"
        # Only the last path segment of the directory name is used.
        *_, digits = self.name.split("/")
        head, middle, tail = digits[:10], digits[10:12], digits[12:]
        return f"{head}-{middle}-{tail}"

    @property
    def index_headers(self):
        "Download the `<accession-number>-index-headers.html` file of this directory"
        headers_url = f"{sec_dot_gov}/{self.name}/{self.accession_number}-index-headers.html"
        return download_file(headers_url)

    @classmethod
    def load(cls, basedir: str):
        "Download `<basedir>/index.json` and build a FilingDirectory from its 'directory' entry"
        listing = json.loads(download_file(f"{basedir}/index.json"))['directory']
        column_names = {"name": "Name", "last-modified": "LastModified", "size": "Size"}
        renamed = pd.DataFrame(data=listing['item']).rename(columns=column_names)
        # Keep only the three display columns, in this order.
        items = renamed.filter(["Name", "LastModified", "Size"])
        directory: FilingDirectory = FilingDirectory(
            name=listing['name'],
            parent_dir=listing['parent-dir'],
            items=items,
        )
        return directory

    def __len__(self):
        "Number of rows in the items table"
        return len(self.items)

    def __rich__(self):
        "Render the directory listing as a rich Table, one row per file"
        heading = Text(f"Filing Directory {self.name}", style="bold")
        table = Table("Name", "LastModified", "Size",
                      title=heading,
                      row_styles=["", "bold"],
                      box=box.SIMPLE)
        for _index, entry in self.items.iterrows():
            cells = (entry['Name'], entry['LastModified'], display_size(entry['Size']))
            table.add_row(*cells)
        return table

    def __repr__(self):
        renderable = self.__rich__()
        return repr_rich(renderable)
"""
Represent the SEC filing headers of a filing.
The headers are extracted from the HTML file of the filing. This is the file `<accession-number>-index-headers.html`
"""
class CompanyData(BaseModel):
    """Company details held under a `company_data` entry of the parsed filing header"""
    conformed_name: str
    cik: str
    assigned_sic: Optional[str] = None
    organization_name: Optional[str] = None
    irs_number: Optional[str] = None
    fiscal_year_end: Optional[str] = None

    @property
    def name(self):
        "Alias for `conformed_name`"
        return self.conformed_name

    def __rich__(self):
        "Render the conformed name and CIK as a single headerless table row"
        name_column = Column("", style="bold deep_sky_blue1")
        table = Table(name_column, "", box=box.ROUNDED, show_header=False)
        table.add_row(self.conformed_name, self.cik)
        return table
class FilingValues(BaseModel):
    """
    Form type, act, file number and film number of a filing header section.
    All four fields are required strings.
    """
    form_type: str
    act: str
    file_number: str
    film_number: str
class FormerCompany(BaseModel):
    """A former conformed name of a company and the date it changed, both kept as strings"""
    former_conformed_name: str
    date_changed: str
class Filer(BaseModel):
    """The filer of a filing: company data, filing values, addresses and former companies"""
    company_data: CompanyData
    filing_values: FilingValues
    business_address: Address
    mail_address: Address
    former_company: List[FormerCompany]

    def __rich__(self):
        "Render the company data above the business/mailing addresses in a 'Filer' panel"
        addresses = get_addresses_as_columns(business_address=self.business_address,
                                             mailing_address=self.mail_address)
        body = Group(self.company_data, addresses)
        return Panel(body, title="Filer", style="bold", box=box.ROUNDED)

    def __repr__(self):
        renderable = self.__rich__()
        return repr_rich(renderable)
class SubjectCompany(BaseModel):
    """The subject company of a filing: company data, filing values, addresses and former companies"""
    company_data: CompanyData
    filing_values: FilingValues
    business_address: Address
    mail_address: Address
    former_company: List[FormerCompany]

    def __rich__(self):
        "Render the company data above the business/mailing addresses in a 'Subject Company' panel"
        addresses = get_addresses_as_columns(business_address=self.business_address,
                                             mailing_address=self.mail_address)
        body = Group(self.company_data, addresses)
        return Panel(body, title="Subject Company", style="bold", box=box.ROUNDED)

    def __repr__(self):
        renderable = self.__rich__()
        return repr_rich(renderable)
class OwnerData(BaseModel):
    """Owner details held under an `owner_data` entry of the parsed filing header"""
    conformed_name: str
    cik: str
    organization_name: Optional[str] = None

    @property
    def name(self):
        "Alias for `conformed_name`"
        return self.conformed_name

    def __rich__(self):
        "Render the conformed name and CIK as a single headerless table row"
        name_column = Column("", style="bold deep_sky_blue1")
        table = Table(name_column, "", box=box.ROUNDED, show_header=False)
        table.add_row(self.conformed_name, self.cik)
        return table

    def __repr__(self):
        renderable = self.__rich__()
        return repr_rich(renderable)
class ReportingOwner(BaseModel):
    """A reporting owner: optional company and owner data, filing values and a mail address"""
    company_data: Optional[CompanyData]
    owner_data: Optional[OwnerData]
    filing_values: FilingValues
    mail_address: Address

    def __rich__(self):
        "Render whichever of company/owner data is set, then the mail address, in a panel"
        parts = [part for part in (self.company_data, self.owner_data) if part]
        parts.append(Text(str(self.mail_address)))
        body = Group(*parts)
        return Panel(body, title="Reporting Owner", style="bold", box=box.ROUNDED)

    def __repr__(self):
        renderable = self.__rich__()
        return repr_rich(renderable)
class Issuer(BaseModel):
    """The issuer of a filing: company data with a mail address and a business address"""
    company_data: CompanyData
    mail_address: Address
    business_address: Address
# Header tag names (lower-cased, with '-' replaced by '_') that IndexHeaders.load
# treats as containers: such a tag opens a new nested dict instead of holding a
# value. Any tag not listed here is stored as a plain string value.
nested_tags = [
    'filer',
    'issuer',
    'subject_company',
    'reporting_owner',
    'owner_data',
    'company_data',
    'filing_values',
    'business_address',
    'mail_address',
    'former_company'
]
class IndexHeaders(BaseModel):
    """
    Represent the SEC filing headers of a filing.
    This is parsed from the comment section of the HTML file `<accession-number>-index-headers.html`
    """
    filing_date: str
    acceptance_datetime: datetime
    accession_number: str
    form: str
    public_document_count: int
    period: Optional[str] = None
    items: List[str]
    date_of_filing_date_change: Optional[str] = None
    effectiveness_date: Optional[str] = None
    filer: Optional[Filer] = None
    reporting_owner: Optional[ReportingOwner] = None
    subject_company: Optional[SubjectCompany] = None
    issuer: Optional[Issuer] = None

    @property
    def company_name(self):
        """
        The conformed name of the filer, else the subject company, else the issuer.
        Returns an empty string when none of the three is set.
        """
        if self.filer:
            return self.filer.company_data.conformed_name
        elif self.subject_company:
            return self.subject_company.company_data.conformed_name
        elif self.issuer:
            return self.issuer.company_data.conformed_name
        return ""

    @property
    def title(self):
        """Short title in the form '<form> - <company name> <accession number>'"""
        return f"{self.form} - {self.company_name} {self.accession_number}"

    @staticmethod
    def _prepare_address(data: Dict[str, Any], address_type: str) -> Address:
        """
        Prepare an address object from the data dictionary.

        Removes `address_type` from `data` and renames the parsed 'zip' and
        'state' keys to the 'zipcode' and 'state_or_country' keyword arguments
        passed to Address (both default to '' when absent).
        """
        address_dict = data.pop(address_type, {})
        address_dict['zipcode'] = address_dict.pop('zip', '')
        address_dict['state_or_country'] = address_dict.pop('state', '')
        return Address(**address_dict)

    @staticmethod
    def _parse_header_lines(lines: List[str]) -> Dict[str, Any]:
        """
        Parse the tagged header lines into a nested dictionary.

        Keys are the tag names lower-cased with '-' replaced by '_'. A tag listed
        in `nested_tags` opens a nested dict that stays current until the next
        closing tag; any other tag stores the text after '>' as its value, and a
        repeated value tag is collected into a list.
        """
        data: Dict[str, Any] = {}
        stack = [data]
        last_key: Optional[str] = None
        for raw_line in lines:
            line = raw_line.strip()
            # A blank line carries no information; it must not clobber the
            # value of the tag that preceded it.
            if not line:
                continue
            # Skip the main SEC-HEADER tag
            if line.startswith("<SEC-HEADER>"):
                continue
            # Handle closing tags by popping the context stack
            if line.startswith("</"):
                if len(stack) > 1:  # Ensure we don't pop the root context
                    stack.pop()
                continue
            current = stack[-1]
            if not line.startswith("<"):
                # Free text: replaces the value of the most recently seen tag in
                # the current context. Text seen before any tag is ignored.
                # NOTE(review): replacing (rather than appending) mirrors the
                # previous behaviour - confirm whether continuation lines exist.
                if last_key is not None:
                    current[last_key] = line
                continue
            tag = line[1:].split(">")[0]
            key = tag.lower().replace("-", "_")
            last_key = key
            if key in nested_tags:
                # Create a new context for nested tags
                child: Dict[str, Any] = {}
                if key == "former_company":
                    # A company may list several former names, so every
                    # FORMER-COMPANY section is kept rather than only the last.
                    siblings = current.get(key)
                    if not isinstance(siblings, list):
                        siblings = current[key] = []
                    siblings.append(child)
                else:
                    current[key] = child
                stack.append(child)
                continue
            value = line[len(tag) + 2:].strip()
            if key not in current:
                current[key] = value
            else:
                # Repeated value tag (e.g. several ITEMS lines): collect a list
                if not isinstance(current[key], list):
                    current[key] = [current[key]]
                current[key].append(value)
        return data

    @staticmethod
    def _build_filing_values(section: Dict[str, Any]) -> FilingValues:
        """Build FilingValues from a section; missing values (or a missing sub-section) become ''."""
        values = section.get("filing_values") or {}
        return FilingValues(
            form_type=values.get("form_type", ""),
            act=values.get("act", ""),
            file_number=values.get("file_number", ""),
            film_number=values.get("film_number", "")
        )

    @staticmethod
    def _build_former_companies(section: Dict[str, Any]) -> List[FormerCompany]:
        """Remove 'former_company' from a section and convert its entries to FormerCompany objects."""
        raw = section.pop("former_company", [])
        if isinstance(raw, dict):
            # Tolerate a single section stored directly as a dict
            raw = [raw]
        if not isinstance(raw, list):
            return []
        former_companies = []
        for entry in raw:
            if isinstance(entry, dict):
                former_companies.append(FormerCompany(
                    former_conformed_name=entry.get("former_conformed_name", ""),
                    date_changed=entry.get("date_changed", "")))
            elif isinstance(entry, str):
                former_companies.append(FormerCompany(former_conformed_name=entry, date_changed=''))
        return former_companies

    @classmethod
    def _company_section_kwargs(cls, section: Dict[str, Any]) -> Dict[str, Any]:
        """Build the keyword arguments shared by Filer and SubjectCompany from a parsed section."""
        return {
            "company_data": CompanyData(**section.pop("company_data", {})),
            "filing_values": cls._build_filing_values(section),
            "business_address": cls._prepare_address(section, "business_address"),
            "mail_address": cls._prepare_address(section, "mail_address"),
            "former_company": cls._build_former_companies(section),
        }

    @staticmethod
    def _reformat_date(value: str) -> str:
        """Convert a 'YYYYMMDD' string to 'YYYY-MM-DD'."""
        return datetime.strptime(value, '%Y%m%d').strftime('%Y-%m-%d')

    @classmethod
    def load(cls, header_text: str):
        """
        Load the IndexHeaders from the HTML file content.

        :param header_text: the HTML content of the index headers file
        :return: the IndexHeaders parsed from the first HTML comment in `header_text`
        :raises IndexError: if `header_text` contains no HTML comment
        :raises KeyError: if the header has no ACCEPTANCE-DATETIME or TYPE tag
        :raises ValueError: if a date value does not match its expected format
        """
        soup = BeautifulSoup(header_text, 'html.parser')
        # The header information is read from the first HTML comment
        comment_text = soup.find_all(string=lambda text: isinstance(text, Comment))[0].strip()
        data = cls._parse_header_lines(comment_text.split("\n"))

        # Filer and SubjectCompany share the same structure
        filer_data = data.pop("filer", None)
        filer = Filer(**cls._company_section_kwargs(filer_data)) if filer_data else None

        subject_company_data = data.pop("subject_company", None)
        subject_company = (SubjectCompany(**cls._company_section_kwargs(subject_company_data))
                           if subject_company_data else None)

        # Process ReportingOwner if present
        reporting_owner_data = data.pop("reporting_owner", None)
        reporting_owner = None
        if reporting_owner_data:
            owner_data = reporting_owner_data.pop("owner_data", None)
            company_data = reporting_owner_data.pop("company_data", None)
            owner_data_obj = OwnerData(**owner_data) if owner_data else None
            company_data_obj = CompanyData(**company_data) if company_data else None
            reporting_owner = ReportingOwner(
                company_data=company_data_obj,
                owner_data=owner_data_obj,
                filing_values=cls._build_filing_values(reporting_owner_data),
                mail_address=cls._prepare_address(reporting_owner_data, "mail_address")
            )

        # Process Issuer if present
        issuer_data = data.pop("issuer", None)
        issuer = None
        if issuer_data:
            issuer = Issuer(
                company_data=CompanyData(**issuer_data.pop("company_data", {})),
                business_address=cls._prepare_address(issuer_data, "business_address"),
                mail_address=cls._prepare_address(issuer_data, "mail_address")
            )

        # Ensure items is a list (a single ITEMS tag is parsed as a plain string)
        items = data.pop("items", [])
        if isinstance(items, str):
            items = [items]

        # Convert acceptance_datetime to datetime object
        acceptance_datetime_str = data.pop("acceptance_datetime")
        acceptance_datetime = (datetime.strptime(acceptance_datetime_str, '%Y%m%d%H%M%S')
                               if acceptance_datetime_str else None)

        # The date fields are kept as 'YYYY-MM-DD' strings
        filing_date_str = data.pop("filing_date", None)
        filing_date = cls._reformat_date(filing_date_str) if filing_date_str else None
        date_of_change_str = data.pop("date_of_filing_date_change", None)
        if date_of_change_str:
            data["date_of_filing_date_change"] = cls._reformat_date(date_of_change_str)

        # The type is really the form
        data["form"] = data.pop("type")
        # The public document count is an integer
        data["public_document_count"] = int(data.pop("public_document_count", 0))

        # Prepare the final dictionary for IndexHeaders initialization
        sec_header_data = {
            **data,
            "filing_date": filing_date,
            "acceptance_datetime": acceptance_datetime,
            "items": items,
            "filer": filer,
            "subject_company": subject_company,
            "reporting_owner": reporting_owner,
            "issuer": issuer
        }
        # Initialize IndexHeaders with the parsed data
        return cls(**sec_header_data)

    @staticmethod
    def _extract_documents_from_pre(pre_tag: Tag):
        """
        Walk the <document> tags found in the text of `pre_tag`.

        Currently a stub: nothing is collected or returned, and `load` does not call it.
        """
        # Name the parser explicitly, as `load` does, instead of letting bs4 guess
        soup = BeautifulSoup(pre_tag.text, 'html.parser')
        document_tags = soup.find_all("document")
        for document_tag in document_tags:
            document_tag.find("type")

    @staticmethod
    def _extract_comment_text(soup):
        """Return the stripped text of the first HTML comment in `soup`, or None if there is none."""
        comments = soup.find_all(string=lambda text: isinstance(text, Comment))
        if comments:
            return comments[0].strip()
        return None

    @staticmethod
    def _extract_accession_number(title: str):
        """Return the accession number following 'SEC EDGAR Submission ' in `title`, or None."""
        import re
        match = re.search(r'SEC EDGAR Submission (\d{10}-\d{2}-\d{6})', title)
        if match:
            return match.group(1)
        return None

    def __rich__(self):
        """Render the summary table, the parties present and the items inside a titled panel."""
        # Summary Information
        summary_table = Table("Filing Date", "Acceptance Datetime", "Documents", box=box.ROUNDED)
        summary_table.add_row(
            self.filing_date,
            datetime.strftime(self.acceptance_datetime, '%Y-%m-%d %H:%M:%S'),
            str(self.public_document_count))
        main_contents = [summary_table]
        if self.filer:
            main_contents.append(self.filer)
        if self.subject_company:
            main_contents.append(self.subject_company)
        if self.reporting_owner:
            main_contents.append(self.reporting_owner)
        if self.items:
            items_table = Table("Items", box=box.ROUNDED)
            for item in self.items:
                items_table.add_row(item)
            main_contents.append(items_table)
        main_panel: Panel = Panel(
            Group(*main_contents),
            box=box.ROUNDED,
            title=self.title,
            subtitle=describe_form(self.form),
            style="bold"
        )
        return main_panel

    def __repr__(self):
        return repr_rich(self.__rich__())