Initial commit

This commit is contained in:
kdusek
2025-12-09 12:13:01 +01:00
commit 8e654ed209
13332 changed files with 2695056 additions and 0 deletions

View File

@@ -0,0 +1,746 @@
import io
import logging
from pathlib import Path
from typing import Any, Dict, Generator, Optional
from lxml import etree as ET
from .data import (
ClassInfo,
IncludedManager,
PrimaryDoc,
ProxyTable,
ProxyVoteTable,
ReportSeriesClassInfo,
SeriesReport,
VoteCategory,
VoteRecord,
)
log = logging.getLogger(__name__)
class BaseExtractor:
    """Base class for XML extractors.

    Parses raw XML bytes with a recovering lxml parser at construction time
    and provides shared, None-safe text/float lookup helpers for subclasses.
    """

    def __init__(self, xml_bytes: bytes):
        """Initialize the extractor with raw XML bytes and parse the root element.

        Args:
            xml_bytes: Raw XML document content.

        Raises:
            ValueError: If the XML cannot be parsed even with recovery enabled.
        """
        self.xml_bytes = xml_bytes
        try:
            # Use a recovering parser for robustness against minor XML issues.
            parser = ET.XMLParser(recover=True)
            self.root: Optional[ET._Element] = ET.fromstring(xml_bytes, parser=parser)
        except ET.XMLSyntaxError as e:
            # lxml raises XMLSyntaxError (ElementTree's ParseError is not part
            # of the lxml API), so catch the lxml exception and normalize it to
            # the ValueError contract callers rely on.
            raise ValueError(f"Error parsing XML bytes during initial check: {e}") from e
        if self.root is None:
            # With recover=True, severely malformed input can yield a None root
            # instead of raising; surface that as the same ValueError contract.
            raise ValueError(
                "Error parsing XML bytes during initial check: "
                "Failed to parse XML: root element is None even after recovery."
            )

    @classmethod
    def from_file(cls, xml_file_path: Path) -> "BaseExtractor":
        """Factory method to create an extractor instance from an XML file path.

        Raises:
            FileNotFoundError: If the file does not exist.
        """
        if not xml_file_path.exists():
            raise FileNotFoundError(f"XML file not found: {xml_file_path}")
        xml_bytes = xml_file_path.read_bytes()
        return cls(xml_bytes)

    def extract(self) -> Any:
        """Main extraction method to be implemented by subclasses."""
        raise NotImplementedError("Subclasses must implement the extract method.")

    def _get_optional_text(
        self,
        element: Optional[ET._Element],
        xpath: str,
        namespaces: Optional[Dict[str, str]] = None,
    ) -> Optional[str]:
        """Safely get stripped text from a child element.

        Returns None when *element* is None, the xpath matches nothing, or the
        matched element has no/empty text.
        """
        if element is None:
            return None
        # find() requires a mapping; substitute an empty dict when no
        # namespaces were supplied.
        ns = namespaces if namespaces is not None else {}
        found_element = element.find(xpath, namespaces=ns)
        if found_element is not None and found_element.text:
            return found_element.text.strip()
        return None

    def _get_required_text(
        self,
        element: ET._Element,
        xpath: str,
        namespaces: Optional[Dict[str, str]] = None,
    ) -> str:
        """Get text from an element, raising ValueError if not found or empty."""
        text = self._get_optional_text(element, xpath, namespaces)
        if text is None:
            # Include the offending element's XML in the error for diagnosis.
            element_str = ET.tostring(element, pretty_print=True).decode()
            raise ValueError(
                f"Required text not found for xpath: {xpath} in element: {element_str}"
            )
        return text

    def _get_optional_float(
        self,
        element: Optional[ET._Element],
        xpath: str,
        namespaces: Optional[Dict[str, str]] = None,
    ) -> Optional[float]:
        """Safely get a float from an element.

        Returns None if the text is missing or not parseable as a float.
        """
        text = self._get_optional_text(element, xpath, namespaces)
        if text is None:
            return None
        try:
            return float(text)
        except ValueError:
            # Unparseable numeric text is treated the same as absent.
            return None

    def _get_required_float(
        self,
        element: ET._Element,
        xpath: str,
        namespaces: Optional[Dict[str, str]] = None,
    ) -> float:
        """Get a float from an element, raising ValueError if missing/invalid."""
        val = self._get_optional_float(element, xpath, namespaces)
        if val is None:
            element_str = ET.tostring(element, pretty_print=True).decode()
            raise ValueError(
                f"Required float not found or invalid for xpath: {xpath} in element: {element_str}"
            )
        return val
# Namespaces for different filing formats
# Prefix map used with lxml find()/findall() throughout the primary_doc
# extractor: "npx" is the N-PX form schema, "com" holds shared EDGAR common
# types (address fields etc.).
PRIMARY_DOC_NAMESPACES: Dict[str, str] = {
    "npx": "http://www.sec.gov/edgar/npx",
    "com": "http://www.sec.gov/edgar/common",
}
class PrimaryDocExtractor(BaseExtractor):
    """
    Extracts data from a primary_doc.xml string into a PrimaryDoc dataclass.
    Handles different XML formats including standard N-PX and amendment filings.
    """

    def __init__(self, xml_bytes: bytes):
        """Initialize the extractor with raw XML *bytes*."""
        super().__init__(xml_bytes)
        # Prefix for all namespaced lookups below; resolved through
        # PRIMARY_DOC_NAMESPACES.
        self.doc_prefix = "npx"

    def _get_submission_type(self) -> Optional[str]:
        """Extract the submission_type from the XML to determine filing type."""
        npx_prefix = "npx"
        path = f"{npx_prefix}:headerData/{npx_prefix}:submissionType"
        return self._get_optional_text(self.root, path, PRIMARY_DOC_NAMESPACES)

    def extract(self) -> PrimaryDoc:
        """
        Parses the XML and populates the PrimaryDoc dataclass.
        Handles both standard N-PX filings and amendment filings with different structures.

        Returns:
            PrimaryDoc: Fully populated filing record; optional sections that
            are absent in the XML yield None (or empty lists) in the result.

        Raises:
            ValueError: If the root was not parsed or a structurally required
            element (<headerData>, <formData>, <coverPage>, <reportingPerson>,
            its <address>, or <signaturePage>) or required text is missing.
        """
        if self.root is None:
            raise ValueError("XML root not parsed. Cannot extract.")
        prefix = self.doc_prefix
        submission_type = self._get_submission_type()
        # Amendment filings carry a "/A" suffix in the submission type
        # (e.g. "N-PX/A"); presumably per EDGAR convention — TODO confirm.
        is_amendment_submission = "/A" in submission_type if submission_type else False

        # --- Locate major structural sections; required ones raise if absent.
        header_data = self.root.find(f"{prefix}:headerData", PRIMARY_DOC_NAMESPACES)
        if header_data is None:
            raise ValueError("Required <headerData> element not found in XML.")
        form_data = self.root.find(f"{prefix}:formData", PRIMARY_DOC_NAMESPACES)
        if form_data is None:
            raise ValueError("Required <formData> element not found in XML.")
        cover_page = form_data.find(f"{prefix}:coverPage", PRIMARY_DOC_NAMESPACES)
        if cover_page is None:
            raise ValueError(
                f"Required <coverPage> element not found in XML using prefix {prefix}."
            )
        reporting_person = cover_page.find(
            f"{prefix}:reportingPerson", PRIMARY_DOC_NAMESPACES
        )
        if reporting_person is None:
            raise ValueError(
                "Required <reportingPerson> element not found in <coverPage>."
            )
        reporting_person_address = reporting_person.find(
            f"{prefix}:address", PRIMARY_DOC_NAMESPACES
        )
        if reporting_person_address is None:
            raise ValueError(
                "Required <address> element not found in <reportingPerson>."
            )
        # Agent-for-service section is optional; its address is only looked up
        # when the section itself exists.
        agent_for_service = cover_page.find(
            f"{prefix}:agentForService", PRIMARY_DOC_NAMESPACES
        )
        agent_for_service_address = (
            agent_for_service.find(f"{prefix}:address", PRIMARY_DOC_NAMESPACES)
            if agent_for_service is not None
            else None
        )
        signature_page = form_data.find(
            f"{prefix}:signaturePage", PRIMARY_DOC_NAMESPACES
        )
        if signature_page is None:
            raise ValueError(
                f"Required <signaturePage> element not found in XML using prefix {prefix}."
            )
        # Remaining sections are all optional and handled with None guards.
        summary_page = form_data.find(f"{prefix}:summaryPage", PRIMARY_DOC_NAMESPACES)
        series_page = form_data.find(f"{prefix}:seriesPage", PRIMARY_DOC_NAMESPACES)
        report_info = cover_page.find(f"{prefix}:reportInfo", PRIMARY_DOC_NAMESPACES)
        explanatory_info = cover_page.find(
            f"{prefix}:explanatoryInformation", PRIMARY_DOC_NAMESPACES
        )
        amendment_info = cover_page.find(
            f"{prefix}:amendmentInfo", PRIMARY_DOC_NAMESPACES
        )
        contact = header_data.find(f"{prefix}:contact", PRIMARY_DOC_NAMESPACES)

        # --- Other included managers (summaryPage/otherManagers2), if any.
        included_managers = []
        if summary_page is not None:
            other_managers_section = summary_page.find(
                f"{prefix}:otherManagers2", PRIMARY_DOC_NAMESPACES
            )
            if other_managers_section is not None:
                # Each <investmentManagers> child describes one additional
                # manager included in this filing.
                for manager_elem in other_managers_section.findall(
                    f"{prefix}:investmentManagers", PRIMARY_DOC_NAMESPACES
                ):
                    manager = IncludedManager(
                        # serialNo and name are treated as required; file
                        # numbers are optional per the lookups below.
                        serial_no=self._get_required_text(
                            manager_elem, f"{prefix}:serialNo", PRIMARY_DOC_NAMESPACES
                        ),
                        form13f_file_number=self._get_optional_text(
                            manager_elem,
                            f"{prefix}:form13FFileNumber",
                            PRIMARY_DOC_NAMESPACES,
                        ),
                        name=self._get_required_text(
                            manager_elem, f"{prefix}:name", PRIMARY_DOC_NAMESPACES
                        ),
                        sec_file_number=self._get_optional_text(
                            manager_elem,
                            f"{prefix}:secFileNumber",
                            PRIMARY_DOC_NAMESPACES,
                        ),
                    )
                    included_managers.append(manager)

        # --- Reported series/class identifiers (headerData/seriesClass).
        report_series_class_infos = []
        series_class_section = header_data.find(
            f"{prefix}:seriesClass", PRIMARY_DOC_NAMESPACES
        )
        if series_class_section is not None:
            report_series_class = series_class_section.find(
                f"{prefix}:reportSeriesClass", PRIMARY_DOC_NAMESPACES
            )
            if report_series_class is not None:
                for rpt_series_class_info_elem in report_series_class.findall(
                    f"{prefix}:rptSeriesClassInfo", PRIMARY_DOC_NAMESPACES
                ):
                    series_id = self._get_required_text(
                        rpt_series_class_info_elem,
                        f"{prefix}:seriesId",
                        PRIMARY_DOC_NAMESPACES,
                    )
                    # Each series may list multiple share classes.
                    class_infos = []
                    for class_info_elem in rpt_series_class_info_elem.findall(
                        f"{prefix}:classInfo", PRIMARY_DOC_NAMESPACES
                    ):
                        class_id = self._get_required_text(
                            class_info_elem, f"{prefix}:classId", PRIMARY_DOC_NAMESPACES
                        )
                        class_infos.append(ClassInfo(class_id=class_id))
                    report_series_class_infos.append(
                        ReportSeriesClassInfo(
                            series_id=series_id, class_infos=class_infos
                        )
                    )

        # --- Per-series reports (seriesPage/seriesDetails), if present.
        series_reports = []
        if series_page is not None:
            series_details = series_page.find(
                f"{prefix}:seriesDetails", PRIMARY_DOC_NAMESPACES
            )
            if series_details is not None:
                for series_report_elem in series_details.findall(
                    f"{prefix}:seriesReports", PRIMARY_DOC_NAMESPACES
                ):
                    series_reports.append(
                        SeriesReport(
                            id_of_series=self._get_required_text(
                                series_report_elem,
                                f"{prefix}:idOfSeries",
                                PRIMARY_DOC_NAMESPACES,
                            ),
                            name_of_series=self._get_optional_text(
                                series_report_elem,
                                f"{prefix}:nameOfSeries",
                                PRIMARY_DOC_NAMESPACES,
                            ),
                            lei_of_series=self._get_optional_text(
                                series_report_elem,
                                f"{prefix}:leiOfSeries",
                                PRIMARY_DOC_NAMESPACES,
                            ),
                        )
                    )

        # --- Assemble the result. Several fields chain `or` lookups to cover
        # alternative tag names seen across filing format variants.
        return PrimaryDoc(
            cik=self._get_required_text(
                header_data,
                f"{prefix}:filerInfo/{prefix}:filer/{prefix}:issuerCredentials/{prefix}:cik",
                PRIMARY_DOC_NAMESPACES,
            ),
            submission_type=submission_type or "",
            period_of_report=self._get_required_text(
                header_data,
                f"{prefix}:filerInfo/{prefix}:periodOfReport",
                PRIMARY_DOC_NAMESPACES,
            ),
            fund_name=self._get_required_text(
                reporting_person, f"{prefix}:name", PRIMARY_DOC_NAMESPACES
            ),
            phone_number=self._get_optional_text(
                reporting_person, f"{prefix}:phoneNumber", PRIMARY_DOC_NAMESPACES
            ),
            # Reporting person address uses the "com" (EDGAR common) namespace.
            street1=self._get_required_text(
                reporting_person_address, "com:street1", PRIMARY_DOC_NAMESPACES
            ),
            street2=self._get_optional_text(
                reporting_person_address, "com:street2", PRIMARY_DOC_NAMESPACES
            ),
            city=self._get_required_text(
                reporting_person_address, "com:city", PRIMARY_DOC_NAMESPACES
            ),
            state=self._get_required_text(
                reporting_person_address, "com:stateOrCountry", PRIMARY_DOC_NAMESPACES
            ),
            zip_code=self._get_required_text(
                reporting_person_address, "com:zipCode", PRIMARY_DOC_NAMESPACES
            ),
            # Fallback: <reportingCrdNumber> first, then legacy <crdNumber>.
            crd_number=self._get_optional_text(
                cover_page, f"{prefix}:reportingCrdNumber", PRIMARY_DOC_NAMESPACES
            )
            or self._get_optional_text(
                cover_page, f"{prefix}:crdNumber", PRIMARY_DOC_NAMESPACES
            ),
            # Fallback chain across cover page variants, then the header's
            # filer fileNumber.
            filer_sec_file_number=self._get_optional_text(
                cover_page, f"{prefix}:reportingSecFileNumber", PRIMARY_DOC_NAMESPACES
            )
            or self._get_optional_text(
                cover_page, f"{prefix}:filerSecFileNumber", PRIMARY_DOC_NAMESPACES
            )
            or self._get_optional_text(
                header_data,
                f"{prefix}:filerInfo/{prefix}:filer/{prefix}:fileNumber",
                PRIMARY_DOC_NAMESPACES,
            ),
            # Fallback: <lei> first, then <leiNumber>.
            lei_number=self._get_optional_text(
                cover_page, f"{prefix}:lei", PRIMARY_DOC_NAMESPACES
            )
            or self._get_optional_text(
                cover_page, f"{prefix}:leiNumber", PRIMARY_DOC_NAMESPACES
            ),
            report_calendar_year=self._get_required_text(
                cover_page, f"{prefix}:reportCalendarYear", PRIMARY_DOC_NAMESPACES
            ),
            # reportType is required only when <reportInfo> exists at all.
            report_type=(
                self._get_required_text(
                    report_info, f"{prefix}:reportType", PRIMARY_DOC_NAMESPACES
                )
                if report_info is not None
                else None
            ),
            confidential_treatment=(
                self._get_optional_text(
                    report_info,
                    f"{prefix}:confidentialTreatment",
                    PRIMARY_DOC_NAMESPACES,
                )
                if report_info is not None
                else None
            ),
            notice_explanation=(
                self._get_optional_text(
                    explanatory_info,
                    f"{prefix}:noticeExplanation",
                    PRIMARY_DOC_NAMESPACES,
                )
                if explanatory_info is not None
                else None
            ),
            npx_file_number=self._get_optional_text(
                cover_page, f"{prefix}:fileNumber", PRIMARY_DOC_NAMESPACES
            ),
            explanatory_choice=(
                self._get_optional_text(
                    explanatory_info,
                    f"{prefix}:explanatoryChoice",
                    PRIMARY_DOC_NAMESPACES,
                )
                if explanatory_info is not None
                else None
            ),
            # NOTE(review): defaults to the string "0" (not None) when the
            # summary page is absent.
            other_included_managers_count=(
                self._get_optional_text(
                    summary_page,
                    f"{prefix}:otherIncludedManagersCount",
                    PRIMARY_DOC_NAMESPACES,
                )
                if summary_page is not None
                else "0"
            ),
            signer_name=self._get_required_text(
                signature_page, f"{prefix}:txSignature", PRIMARY_DOC_NAMESPACES
            ),
            signer_title=self._get_required_text(
                signature_page, f"{prefix}:txTitle", PRIMARY_DOC_NAMESPACES
            ),
            signature_date=self._get_required_text(
                signature_page, f"{prefix}:txAsOfDate", PRIMARY_DOC_NAMESPACES
            ),
            tx_printed_signature=self._get_optional_text(
                signature_page, f"{prefix}:txPrintedSignature", PRIMARY_DOC_NAMESPACES
            ),
            # Agent-for-service fields are all optional and guarded on the
            # presence of their parent elements resolved earlier.
            agent_for_service_name=(
                self._get_optional_text(
                    agent_for_service, f"{prefix}:name", PRIMARY_DOC_NAMESPACES
                )
                if agent_for_service is not None
                else None
            ),
            agent_for_service_address_street1=(
                self._get_optional_text(
                    agent_for_service_address, "com:street1", PRIMARY_DOC_NAMESPACES
                )
                if agent_for_service_address is not None
                else None
            ),
            agent_for_service_address_street2=(
                self._get_optional_text(
                    agent_for_service_address, "com:street2", PRIMARY_DOC_NAMESPACES
                )
                if agent_for_service_address is not None
                else None
            ),
            agent_for_service_address_city=(
                self._get_optional_text(
                    agent_for_service_address, "com:city", PRIMARY_DOC_NAMESPACES
                )
                if agent_for_service_address is not None
                else None
            ),
            agent_for_service_address_state_country=(
                self._get_optional_text(
                    agent_for_service_address,
                    "com:stateOrCountry",
                    PRIMARY_DOC_NAMESPACES,
                )
                if agent_for_service_address is not None
                else None
            ),
            agent_for_service_address_zip_code=(
                self._get_optional_text(
                    agent_for_service_address, "com:zipCode", PRIMARY_DOC_NAMESPACES
                )
                if agent_for_service_address is not None
                else None
            ),
            # None (unknown) when the submission type itself was absent.
            is_amendment=(
                is_amendment_submission if submission_type is not None else None
            ),
            amendment_no=(
                self._get_optional_text(
                    amendment_info, f"{prefix}:amendmentNo", PRIMARY_DOC_NAMESPACES
                )
                if amendment_info is not None
                else None
            ),
            amendment_type=(
                self._get_optional_text(
                    amendment_info, f"{prefix}:amendmentType", PRIMARY_DOC_NAMESPACES
                )
                if amendment_info is not None
                else None
            ),
            conf_denied_expired=(
                self._get_optional_text(
                    amendment_info,
                    f"{prefix}:confDeniedExpired",
                    PRIMARY_DOC_NAMESPACES,
                )
                if amendment_info is not None
                else None
            ),
            de_novo_request_choice=self._get_optional_text(
                header_data,
                f"{prefix}:filerInfo/{prefix}:deNovoRequestChoice",
                PRIMARY_DOC_NAMESPACES,
            ),
            year_or_quarter=self._get_optional_text(
                cover_page, f"{prefix}:yearOrQuarter", PRIMARY_DOC_NAMESPACES
            ),
            included_managers=included_managers,
            registrant_type=self._get_optional_text(
                header_data,
                f"{prefix}:filerInfo/{prefix}:registrantType",
                PRIMARY_DOC_NAMESPACES,
            ),
            live_test_flag=self._get_optional_text(
                header_data,
                f"{prefix}:filerInfo/{prefix}:liveTestFlag",
                PRIMARY_DOC_NAMESPACES,
            ),
            ccc=self._get_optional_text(
                header_data,
                f"{prefix}:filerInfo/{prefix}:filer/{prefix}:issuerCredentials/{prefix}:ccc",
                PRIMARY_DOC_NAMESPACES,
            ),
            contact_name=(
                self._get_optional_text(
                    contact, f"{prefix}:name", PRIMARY_DOC_NAMESPACES
                )
                if contact is not None
                else None
            ),
            contact_phone_number=(
                self._get_optional_text(
                    contact, f"{prefix}:phoneNumber", PRIMARY_DOC_NAMESPACES
                )
                if contact is not None
                else None
            ),
            contact_email_address=(
                self._get_optional_text(
                    contact, f"{prefix}:emailAddress", PRIMARY_DOC_NAMESPACES
                )
                if contact is not None
                else None
            ),
            override_internet_flag=self._get_optional_text(
                header_data, f"{prefix}:overrideInternetFlag", PRIMARY_DOC_NAMESPACES
            ),
            confirming_copy_flag=self._get_optional_text(
                header_data, f"{prefix}:confirmingCopyFlag", PRIMARY_DOC_NAMESPACES
            ),
            investment_company_type=self._get_optional_text(
                header_data,
                f"{prefix}:filerInfo/{prefix}:investmentCompanyType",
                PRIMARY_DOC_NAMESPACES,
            ),
            # Guarded double lookup: flag lives under
            # seriesClass/reportSeriesClass, both of which may be absent.
            rpt_include_all_series_flag=(
                self._get_optional_text(
                    series_class_section.find(
                        f"{prefix}:reportSeriesClass", PRIMARY_DOC_NAMESPACES
                    ),
                    f"{prefix}:rptIncludeAllSeriesFlag",
                    PRIMARY_DOC_NAMESPACES,
                )
                if series_class_section is not None
                and series_class_section.find(
                    f"{prefix}:reportSeriesClass", PRIMARY_DOC_NAMESPACES
                )
                is not None
                else None
            ),
            series_count=self._get_optional_text(
                series_page, f"{prefix}:seriesCount", PRIMARY_DOC_NAMESPACES
            ),
            report_series_class_infos=report_series_class_infos,
            series_reports=series_reports,
        )
# Define the namespace for easier access
# "inf" is the N-PX proxy-vote information table schema; used for every
# lookup inside <proxyTable> elements.
PROXY_VOTE_TABLE_NAMESPACES = {
    "inf": "http://www.sec.gov/edgar/document/npxproxy/informationtable"
}
class ProxyVoteTableExtractor(BaseExtractor):
    """
    Extracts proxy vote information from SEC N-PX proxy vote table XML data.

    Uses lxml.etree.iterparse so that large documents are consumed one
    <proxyTable> element at a time; each processed element is cleared and
    detached from its parent to keep memory bounded.
    """

    def __init__(self, xml_bytes: bytes):
        """Initialize the extractor with raw XML *bytes*."""
        super().__init__(xml_bytes)
        # iterparse filters on fully qualified (Clark-notation) tag names.
        ns_uri = PROXY_VOTE_TABLE_NAMESPACES["inf"]
        self.proxy_table_iter_tag = f"{{{ns_uri}}}proxyTable"

    def _collect_categories(self, table_elem: ET._Element) -> list:
        """Build the VoteCategory list from <voteCategories>, if present."""
        ns = PROXY_VOTE_TABLE_NAMESPACES
        result = []
        container = table_elem.find("inf:voteCategories", namespaces=ns)
        if container is None:
            return result
        for cat_elem in container.findall("inf:voteCategory", namespaces=ns):
            cat_type = self._get_optional_text(cat_elem, "inf:categoryType", ns)
            # Skip entries with missing/empty category text.
            if cat_type:
                result.append(VoteCategory(category_type=cat_type))
        return result

    def _collect_vote_records(self, table_elem: ET._Element) -> list:
        """Build the VoteRecord list from <vote>, keeping only complete rows."""
        ns = PROXY_VOTE_TABLE_NAMESPACES
        result = []
        container = table_elem.find("inf:vote", namespaces=ns)
        if container is None:
            return result
        for rec_elem in container.findall("inf:voteRecord", namespaces=ns):
            how = self._get_optional_text(rec_elem, "inf:howVoted", ns)
            shares = self._get_optional_float(rec_elem, "inf:sharesVoted", ns)
            recommendation = self._get_optional_text(
                rec_elem, "inf:managementRecommendation", ns
            )
            # A record is only kept when all three fields are present.
            if how is not None and shares is not None and recommendation is not None:
                result.append(
                    VoteRecord(
                        how_voted=how,
                        shares_voted=shares,
                        management_recommendation=recommendation,
                    )
                )
        return result

    def _collect_other_managers(self, table_elem: ET._Element) -> list:
        """Build the list of non-empty manager ids from <voteManager>."""
        ns = PROXY_VOTE_TABLE_NAMESPACES
        result = []
        manager_section = table_elem.find("inf:voteManager", namespaces=ns)
        if manager_section is None:
            return result
        for container in manager_section.findall("inf:otherManagers", namespaces=ns):
            for om_elem in container.findall("inf:otherManager", namespaces=ns):
                manager_id = om_elem.text.strip() if om_elem.text else None
                if manager_id:
                    result.append(manager_id)
        return result

    def _extract_proxy_table_generator(self) -> Generator[ProxyTable, None, None]:
        """
        Parses the XML and yields ProxyTable objects.

        Elements with missing required fields or invalid numerics are logged
        and skipped rather than aborting the whole parse.

        Yields:
            Generator[ProxyTable, None, None]: A generator of ProxyTable
            dataclass instances.
        """
        ns = PROXY_VOTE_TABLE_NAMESPACES
        context = ET.iterparse(
            io.BytesIO(self.xml_bytes),
            events=("end",),
            tag=self.proxy_table_iter_tag,
            recover=True,
        )
        for _, element in context:
            try:
                # Required fields first (raise ValueError on absence), then
                # the optional scalar fields and nested collections.
                yield ProxyTable(
                    issuer_name=self._get_required_text(
                        element, "inf:issuerName", ns
                    ),
                    meeting_date=self._get_required_text(
                        element, "inf:meetingDate", ns
                    ),
                    vote_description=self._get_required_text(
                        element, "inf:voteDescription", ns
                    ),
                    shares_voted=self._get_required_float(
                        element, "inf:sharesVoted", ns
                    ),
                    shares_on_loan=self._get_required_float(
                        element, "inf:sharesOnLoan", ns
                    ),
                    cusip=self._get_optional_text(element, "inf:cusip", ns),
                    isin=self._get_optional_text(element, "inf:isin", ns),
                    figi=self._get_optional_text(element, "inf:figi", ns),
                    other_vote_description=self._get_optional_text(
                        element, "inf:otherVoteDescription", ns
                    ),
                    vote_source=self._get_optional_text(
                        element, "inf:voteSource", ns
                    ),
                    vote_series=self._get_optional_text(
                        element, "inf:voteSeries", ns
                    ),
                    vote_other_info=self._get_optional_text(
                        element, "inf:voteOtherInfo", ns
                    ),
                    vote_categories=self._collect_categories(element),
                    vote_records=self._collect_vote_records(element),
                    other_managers=self._collect_other_managers(element),
                )
            except (ValueError, TypeError) as e:
                log.error(
                    "Skipping proxyTable due to missing/invalid data or parsing error: %s on element %s", e, element.tag if element is not None else 'Unknown Element'
                )
            # Release the processed subtree so memory use stays flat even on
            # very large filings.
            if element is not None:
                element.clear()
                parent = element.getparent()
                if parent is not None:
                    parent.remove(element)
        del context

    def extract(self) -> ProxyVoteTable:
        """
        Extracts all ProxyTable instances from the XML and returns them in a ProxyVoteTable container.
        This is the main public method for this extractor.
        """
        tables = list(self._extract_proxy_table_generator())
        return ProxyVoteTable(proxy_tables=tables)