Initial commit
This commit is contained in:
746
venv/lib/python3.10/site-packages/edgar/npx/parsing.py
Normal file
746
venv/lib/python3.10/site-packages/edgar/npx/parsing.py
Normal file
@@ -0,0 +1,746 @@
|
||||
import io
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, Generator, Optional
|
||||
|
||||
from lxml import etree as ET
|
||||
|
||||
from .data import (
|
||||
ClassInfo,
|
||||
IncludedManager,
|
||||
PrimaryDoc,
|
||||
ProxyTable,
|
||||
ProxyVoteTable,
|
||||
ReportSeriesClassInfo,
|
||||
SeriesReport,
|
||||
VoteCategory,
|
||||
VoteRecord,
|
||||
)
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class BaseExtractor:
    """Shared base for the N-PX XML extractors.

    Parses the raw document bytes once on construction using a recovering
    lxml parser, and provides small text/float lookup helpers that the
    concrete subclasses build on.
    """

    def __init__(self, xml_bytes: bytes):
        """Initialize the extractor with raw XML bytes and parse the root element."""
        self.xml_bytes = xml_bytes
        try:
            # recover=True lets lxml tolerate minor well-formedness problems.
            lenient_parser = ET.XMLParser(recover=True)
            self.root: Optional[ET._Element] = ET.fromstring(
                xml_bytes, parser=lenient_parser
            )
            if self.root is None:
                # Even the recovering parser could not salvage a root element.
                raise ET.ParseError(
                    "Failed to parse XML: root element is None even after recovery."
                )
        except ET.ParseError as e:
            raise ValueError(f"Error parsing XML bytes during initial check: {e}") from e

    @classmethod
    def from_file(cls, xml_file_path: Path) -> "BaseExtractor":
        """Factory: build an extractor by reading the XML bytes at *xml_file_path*."""
        if not xml_file_path.exists():
            raise FileNotFoundError(f"XML file not found: {xml_file_path}")
        return cls(xml_file_path.read_bytes())

    def extract(self) -> Any:
        """Main extraction hook; concrete subclasses must override this."""
        raise NotImplementedError("Subclasses must implement the extract method.")

    def _get_optional_text(
        self,
        element: Optional[ET._Element],
        xpath: str,
        namespaces: Optional[Dict[str, str]] = None,
    ) -> Optional[str]:
        """Return the stripped text at *xpath* under *element*, or None when absent/empty."""
        if element is None:
            return None
        node = element.find(xpath, namespaces=namespaces or {})
        if node is None or not node.text:
            return None
        return node.text.strip()

    def _get_required_text(
        self,
        element: ET._Element,
        xpath: str,
        namespaces: Optional[Dict[str, str]] = None,
    ) -> str:
        """Like ``_get_optional_text`` but a missing/empty value raises ValueError."""
        value = self._get_optional_text(element, xpath, namespaces)
        if value is not None:
            return value
        # Dump the offending element so the error is actionable.
        element_str = ET.tostring(element, pretty_print=True).decode()
        raise ValueError(
            f"Required text not found for xpath: {xpath} in element: {element_str}"
        )

    def _get_optional_float(
        self,
        element: Optional[ET._Element],
        xpath: str,
        namespaces: Optional[Dict[str, str]] = None,
    ) -> Optional[float]:
        """Return the value at *xpath* parsed as float, or None when absent or unparsable."""
        raw = self._get_optional_text(element, xpath, namespaces)
        if raw is None:
            return None
        try:
            return float(raw)
        except ValueError:
            return None

    def _get_required_float(
        self,
        element: ET._Element,
        xpath: str,
        namespaces: Optional[Dict[str, str]] = None,
    ) -> float:
        """Like ``_get_optional_float`` but a missing/invalid value raises ValueError."""
        number = self._get_optional_float(element, xpath, namespaces)
        if number is not None:
            return number
        element_str = ET.tostring(element, pretty_print=True).decode()
        raise ValueError(
            f"Required float not found or invalid for xpath: {xpath} in element: {element_str}"
        )
|
||||
|
||||
|
||||
# Namespaces for different filing formats
# Namespace map for every primary_doc.xml lookup below: "npx" covers the
# EDGAR N-PX schema elements; "com" covers the shared address/common elements.
PRIMARY_DOC_NAMESPACES: Dict[str, str] = {
    "npx": "http://www.sec.gov/edgar/npx",
    "com": "http://www.sec.gov/edgar/common",
}
|
||||
|
||||
|
||||
class PrimaryDocExtractor(BaseExtractor):
    """
    Extracts data from a primary_doc.xml string into a PrimaryDoc dataclass.
    Handles different XML formats including standard N-PX and amendment filings.

    All lookups use PRIMARY_DOC_NAMESPACES; the "npx" prefix is fixed via
    ``self.doc_prefix`` and interpolated into every xpath below.
    """

    def __init__(self, xml_bytes: bytes):
        """Initialize the extractor with raw XML *bytes*."""
        super().__init__(xml_bytes)
        # Prefix used to build every namespaced xpath in extract().
        self.doc_prefix = "npx"

    def _get_submission_type(self) -> Optional[str]:
        """Extract the submission_type from the XML to determine filing type."""
        npx_prefix = "npx"
        path = f"{npx_prefix}:headerData/{npx_prefix}:submissionType"
        return self._get_optional_text(self.root, path, PRIMARY_DOC_NAMESPACES)

    def extract(self) -> PrimaryDoc:
        """
        Parses the XML and populates the PrimaryDoc dataclass.
        Handles both standard N-PX filings and amendment filings with different structures.

        Raises:
            ValueError: when a structurally required element (<headerData>,
                <formData>, <coverPage>, <reportingPerson>, its <address>, or
                <signaturePage>) is missing, or when a required text field is
                absent/empty.
        """
        if self.root is None:
            raise ValueError("XML root not parsed. Cannot extract.")

        prefix = self.doc_prefix
        submission_type = self._get_submission_type()
        # Amendments are flagged by "/A" in the submission type (e.g. "N-PX/A").
        is_amendment_submission = "/A" in submission_type if submission_type else False

        # --- Structurally required sections -------------------------------
        header_data = self.root.find(f"{prefix}:headerData", PRIMARY_DOC_NAMESPACES)
        if header_data is None:
            raise ValueError("Required <headerData> element not found in XML.")

        form_data = self.root.find(f"{prefix}:formData", PRIMARY_DOC_NAMESPACES)
        if form_data is None:
            raise ValueError("Required <formData> element not found in XML.")

        cover_page = form_data.find(f"{prefix}:coverPage", PRIMARY_DOC_NAMESPACES)
        if cover_page is None:
            raise ValueError(
                f"Required <coverPage> element not found in XML using prefix {prefix}."
            )

        reporting_person = cover_page.find(
            f"{prefix}:reportingPerson", PRIMARY_DOC_NAMESPACES
        )
        if reporting_person is None:
            raise ValueError(
                "Required <reportingPerson> element not found in <coverPage>."
            )

        reporting_person_address = reporting_person.find(
            f"{prefix}:address", PRIMARY_DOC_NAMESPACES
        )
        if reporting_person_address is None:
            raise ValueError(
                "Required <address> element not found in <reportingPerson>."
            )

        # Agent-for-service is optional; its address only exists when it does.
        agent_for_service = cover_page.find(
            f"{prefix}:agentForService", PRIMARY_DOC_NAMESPACES
        )
        agent_for_service_address = (
            agent_for_service.find(f"{prefix}:address", PRIMARY_DOC_NAMESPACES)
            if agent_for_service is not None
            else None
        )

        signature_page = form_data.find(
            f"{prefix}:signaturePage", PRIMARY_DOC_NAMESPACES
        )
        if signature_page is None:
            raise ValueError(
                f"Required <signaturePage> element not found in XML using prefix {prefix}."
            )

        # --- Optional sections (absent in some filing variants) -----------
        summary_page = form_data.find(f"{prefix}:summaryPage", PRIMARY_DOC_NAMESPACES)
        series_page = form_data.find(f"{prefix}:seriesPage", PRIMARY_DOC_NAMESPACES)
        report_info = cover_page.find(f"{prefix}:reportInfo", PRIMARY_DOC_NAMESPACES)
        explanatory_info = cover_page.find(
            f"{prefix}:explanatoryInformation", PRIMARY_DOC_NAMESPACES
        )
        amendment_info = cover_page.find(
            f"{prefix}:amendmentInfo", PRIMARY_DOC_NAMESPACES
        )
        contact = header_data.find(f"{prefix}:contact", PRIMARY_DOC_NAMESPACES)

        # --- Other included managers (summaryPage/otherManagers2) ---------
        included_managers = []
        if summary_page is not None:
            other_managers_section = summary_page.find(
                f"{prefix}:otherManagers2", PRIMARY_DOC_NAMESPACES
            )
            if other_managers_section is not None:
                # Corrected XPath to use :manager as per original logic for the parent element
                for manager_elem in other_managers_section.findall(
                    f"{prefix}:investmentManagers", PRIMARY_DOC_NAMESPACES
                ):
                    manager = IncludedManager(
                        # Assuming these sub-elements (serialNo, etc.) exist under each 'manager' element
                        # and align with the IncludedManager dataclass fields.
                        serial_no=self._get_required_text(
                            manager_elem, f"{prefix}:serialNo", PRIMARY_DOC_NAMESPACES
                        ),
                        form13f_file_number=self._get_optional_text(
                            manager_elem,
                            f"{prefix}:form13FFileNumber",
                            PRIMARY_DOC_NAMESPACES,
                        ),
                        name=self._get_required_text(
                            manager_elem, f"{prefix}:name", PRIMARY_DOC_NAMESPACES
                        ),
                        sec_file_number=self._get_optional_text(
                            manager_elem,
                            f"{prefix}:secFileNumber",
                            PRIMARY_DOC_NAMESPACES,
                        ),
                    )
                    included_managers.append(manager)

        # --- Series/class identifiers (headerData/seriesClass) ------------
        report_series_class_infos = []
        series_class_section = header_data.find(
            f"{prefix}:seriesClass", PRIMARY_DOC_NAMESPACES
        )
        if series_class_section is not None:
            report_series_class = series_class_section.find(
                f"{prefix}:reportSeriesClass", PRIMARY_DOC_NAMESPACES
            )
            if report_series_class is not None:
                for rpt_series_class_info_elem in report_series_class.findall(
                    f"{prefix}:rptSeriesClassInfo", PRIMARY_DOC_NAMESPACES
                ):
                    series_id = self._get_required_text(
                        rpt_series_class_info_elem,
                        f"{prefix}:seriesId",
                        PRIMARY_DOC_NAMESPACES,
                    )
                    class_infos = []
                    for class_info_elem in rpt_series_class_info_elem.findall(
                        f"{prefix}:classInfo", PRIMARY_DOC_NAMESPACES
                    ):
                        class_id = self._get_required_text(
                            class_info_elem, f"{prefix}:classId", PRIMARY_DOC_NAMESPACES
                        )
                        class_infos.append(ClassInfo(class_id=class_id))
                    report_series_class_infos.append(
                        ReportSeriesClassInfo(
                            series_id=series_id, class_infos=class_infos
                        )
                    )

        # --- Per-series reports (formData/seriesPage/seriesDetails) -------
        series_reports = []
        if series_page is not None:
            series_details = series_page.find(
                f"{prefix}:seriesDetails", PRIMARY_DOC_NAMESPACES
            )
            if series_details is not None:
                for series_report_elem in series_details.findall(
                    f"{prefix}:seriesReports", PRIMARY_DOC_NAMESPACES
                ):
                    series_reports.append(
                        SeriesReport(
                            id_of_series=self._get_required_text(
                                series_report_elem,
                                f"{prefix}:idOfSeries",
                                PRIMARY_DOC_NAMESPACES,
                            ),
                            name_of_series=self._get_optional_text(
                                series_report_elem,
                                f"{prefix}:nameOfSeries",
                                PRIMARY_DOC_NAMESPACES,
                            ),
                            lei_of_series=self._get_optional_text(
                                series_report_elem,
                                f"{prefix}:leiOfSeries",
                                PRIMARY_DOC_NAMESPACES,
                            ),
                        )
                    )

        # --- Assemble the flat PrimaryDoc --------------------------------
        # NOTE(review): several fields below try multiple tag names chained
        # with `or` (crd_number, filer_sec_file_number, lei_number) —
        # presumably to cover differing schema versions; verify against the
        # SEC N-PX technical specification.
        return PrimaryDoc(
            cik=self._get_required_text(
                header_data,
                f"{prefix}:filerInfo/{prefix}:filer/{prefix}:issuerCredentials/{prefix}:cik",
                PRIMARY_DOC_NAMESPACES,
            ),
            submission_type=submission_type or "",
            period_of_report=self._get_required_text(
                header_data,
                f"{prefix}:filerInfo/{prefix}:periodOfReport",
                PRIMARY_DOC_NAMESPACES,
            ),
            fund_name=self._get_required_text(
                reporting_person, f"{prefix}:name", PRIMARY_DOC_NAMESPACES
            ),
            phone_number=self._get_optional_text(
                reporting_person, f"{prefix}:phoneNumber", PRIMARY_DOC_NAMESPACES
            ),
            # Reporting-person address fields live in the "com" namespace.
            street1=self._get_required_text(
                reporting_person_address, "com:street1", PRIMARY_DOC_NAMESPACES
            ),
            street2=self._get_optional_text(
                reporting_person_address, "com:street2", PRIMARY_DOC_NAMESPACES
            ),
            city=self._get_required_text(
                reporting_person_address, "com:city", PRIMARY_DOC_NAMESPACES
            ),
            state=self._get_required_text(
                reporting_person_address, "com:stateOrCountry", PRIMARY_DOC_NAMESPACES
            ),
            zip_code=self._get_required_text(
                reporting_person_address, "com:zipCode", PRIMARY_DOC_NAMESPACES
            ),
            # Fallback chain: first matching tag name wins.
            crd_number=self._get_optional_text(
                cover_page, f"{prefix}:reportingCrdNumber", PRIMARY_DOC_NAMESPACES
            )
            or self._get_optional_text(
                cover_page, f"{prefix}:crdNumber", PRIMARY_DOC_NAMESPACES
            ),
            filer_sec_file_number=self._get_optional_text(
                cover_page, f"{prefix}:reportingSecFileNumber", PRIMARY_DOC_NAMESPACES
            )
            or self._get_optional_text(
                cover_page, f"{prefix}:filerSecFileNumber", PRIMARY_DOC_NAMESPACES
            )
            or self._get_optional_text(
                header_data,
                f"{prefix}:filerInfo/{prefix}:filer/{prefix}:fileNumber",
                PRIMARY_DOC_NAMESPACES,
            ),
            lei_number=self._get_optional_text(
                cover_page, f"{prefix}:lei", PRIMARY_DOC_NAMESPACES
            )
            or self._get_optional_text(
                cover_page, f"{prefix}:leiNumber", PRIMARY_DOC_NAMESPACES
            ),
            report_calendar_year=self._get_required_text(
                cover_page, f"{prefix}:reportCalendarYear", PRIMARY_DOC_NAMESPACES
            ),
            # reportType is required only when <reportInfo> is present at all.
            report_type=(
                self._get_required_text(
                    report_info, f"{prefix}:reportType", PRIMARY_DOC_NAMESPACES
                )
                if report_info is not None
                else None
            ),
            confidential_treatment=(
                self._get_optional_text(
                    report_info,
                    f"{prefix}:confidentialTreatment",
                    PRIMARY_DOC_NAMESPACES,
                )
                if report_info is not None
                else None
            ),
            notice_explanation=(
                self._get_optional_text(
                    explanatory_info,
                    f"{prefix}:noticeExplanation",
                    PRIMARY_DOC_NAMESPACES,
                )
                if explanatory_info is not None
                else None
            ),
            npx_file_number=self._get_optional_text(
                cover_page, f"{prefix}:fileNumber", PRIMARY_DOC_NAMESPACES
            ),
            explanatory_choice=(
                self._get_optional_text(
                    explanatory_info,
                    f"{prefix}:explanatoryChoice",
                    PRIMARY_DOC_NAMESPACES,
                )
                if explanatory_info is not None
                else None
            ),
            # Defaults to the string "0" (not int) when no summary page exists.
            other_included_managers_count=(
                self._get_optional_text(
                    summary_page,
                    f"{prefix}:otherIncludedManagersCount",
                    PRIMARY_DOC_NAMESPACES,
                )
                if summary_page is not None
                else "0"
            ),
            signer_name=self._get_required_text(
                signature_page, f"{prefix}:txSignature", PRIMARY_DOC_NAMESPACES
            ),
            signer_title=self._get_required_text(
                signature_page, f"{prefix}:txTitle", PRIMARY_DOC_NAMESPACES
            ),
            signature_date=self._get_required_text(
                signature_page, f"{prefix}:txAsOfDate", PRIMARY_DOC_NAMESPACES
            ),
            tx_printed_signature=self._get_optional_text(
                signature_page, f"{prefix}:txPrintedSignature", PRIMARY_DOC_NAMESPACES
            ),
            agent_for_service_name=(
                self._get_optional_text(
                    agent_for_service, f"{prefix}:name", PRIMARY_DOC_NAMESPACES
                )
                if agent_for_service is not None
                else None
            ),
            agent_for_service_address_street1=(
                self._get_optional_text(
                    agent_for_service_address, "com:street1", PRIMARY_DOC_NAMESPACES
                )
                if agent_for_service_address is not None
                else None
            ),
            agent_for_service_address_street2=(
                self._get_optional_text(
                    agent_for_service_address, "com:street2", PRIMARY_DOC_NAMESPACES
                )
                if agent_for_service_address is not None
                else None
            ),
            agent_for_service_address_city=(
                self._get_optional_text(
                    agent_for_service_address, "com:city", PRIMARY_DOC_NAMESPACES
                )
                if agent_for_service_address is not None
                else None
            ),
            agent_for_service_address_state_country=(
                self._get_optional_text(
                    agent_for_service_address,
                    "com:stateOrCountry",
                    PRIMARY_DOC_NAMESPACES,
                )
                if agent_for_service_address is not None
                else None
            ),
            agent_for_service_address_zip_code=(
                self._get_optional_text(
                    agent_for_service_address, "com:zipCode", PRIMARY_DOC_NAMESPACES
                )
                if agent_for_service_address is not None
                else None
            ),
            # None (unknown) when the submission type itself was missing.
            is_amendment=(
                is_amendment_submission if submission_type is not None else None
            ),
            amendment_no=(
                self._get_optional_text(
                    amendment_info, f"{prefix}:amendmentNo", PRIMARY_DOC_NAMESPACES
                )
                if amendment_info is not None
                else None
            ),
            amendment_type=(
                self._get_optional_text(
                    amendment_info, f"{prefix}:amendmentType", PRIMARY_DOC_NAMESPACES
                )
                if amendment_info is not None
                else None
            ),
            conf_denied_expired=(
                self._get_optional_text(
                    amendment_info,
                    f"{prefix}:confDeniedExpired",
                    PRIMARY_DOC_NAMESPACES,
                )
                if amendment_info is not None
                else None
            ),
            de_novo_request_choice=self._get_optional_text(
                header_data,
                f"{prefix}:filerInfo/{prefix}:deNovoRequestChoice",
                PRIMARY_DOC_NAMESPACES,
            ),
            year_or_quarter=self._get_optional_text(
                cover_page, f"{prefix}:yearOrQuarter", PRIMARY_DOC_NAMESPACES
            ),
            included_managers=included_managers,
            registrant_type=self._get_optional_text(
                header_data,
                f"{prefix}:filerInfo/{prefix}:registrantType",
                PRIMARY_DOC_NAMESPACES,
            ),
            live_test_flag=self._get_optional_text(
                header_data,
                f"{prefix}:filerInfo/{prefix}:liveTestFlag",
                PRIMARY_DOC_NAMESPACES,
            ),
            ccc=self._get_optional_text(
                header_data,
                f"{prefix}:filerInfo/{prefix}:filer/{prefix}:issuerCredentials/{prefix}:ccc",
                PRIMARY_DOC_NAMESPACES,
            ),
            contact_name=(
                self._get_optional_text(
                    contact, f"{prefix}:name", PRIMARY_DOC_NAMESPACES
                )
                if contact is not None
                else None
            ),
            contact_phone_number=(
                self._get_optional_text(
                    contact, f"{prefix}:phoneNumber", PRIMARY_DOC_NAMESPACES
                )
                if contact is not None
                else None
            ),
            contact_email_address=(
                self._get_optional_text(
                    contact, f"{prefix}:emailAddress", PRIMARY_DOC_NAMESPACES
                )
                if contact is not None
                else None
            ),
            override_internet_flag=self._get_optional_text(
                header_data, f"{prefix}:overrideInternetFlag", PRIMARY_DOC_NAMESPACES
            ),
            confirming_copy_flag=self._get_optional_text(
                header_data, f"{prefix}:confirmingCopyFlag", PRIMARY_DOC_NAMESPACES
            ),
            investment_company_type=self._get_optional_text(
                header_data,
                f"{prefix}:filerInfo/{prefix}:investmentCompanyType",
                PRIMARY_DOC_NAMESPACES,
            ),
            # NOTE(review): reportSeriesClass is looked up twice here (once
            # for the guard, once for the value).
            rpt_include_all_series_flag=(
                self._get_optional_text(
                    series_class_section.find(
                        f"{prefix}:reportSeriesClass", PRIMARY_DOC_NAMESPACES
                    ),
                    f"{prefix}:rptIncludeAllSeriesFlag",
                    PRIMARY_DOC_NAMESPACES,
                )
                if series_class_section is not None
                and series_class_section.find(
                    f"{prefix}:reportSeriesClass", PRIMARY_DOC_NAMESPACES
                )
                is not None
                else None
            ),
            series_count=self._get_optional_text(
                series_page, f"{prefix}:seriesCount", PRIMARY_DOC_NAMESPACES
            ),
            report_series_class_infos=report_series_class_infos,
            series_reports=series_reports,
        )
|
||||
|
||||
|
||||
# Define the namespace for easier access
# "inf" maps to the N-PX proxy-vote information-table schema; used by
# ProxyVoteTableExtractor for every find/findall lookup below.
PROXY_VOTE_TABLE_NAMESPACES = {
    "inf": "http://www.sec.gov/edgar/document/npxproxy/informationtable"
}
|
||||
|
||||
|
||||
class ProxyVoteTableExtractor(BaseExtractor):
    """
    Extracts proxy vote information from SEC N-PX proxy vote table XML data.
    Uses lxml.etree.iterparse for memory-efficient parsing of potentially large files.
    """

    def __init__(self, xml_bytes: bytes):
        """Initialize the extractor with raw XML *bytes*."""
        super().__init__(xml_bytes)
        # The tag for iterparse should be the fully qualified name of the proxyTable element.
        # Clark notation: "{namespace-uri}proxyTable".
        self.proxy_table_iter_tag = (
            f"{{{PROXY_VOTE_TABLE_NAMESPACES['inf']}}}proxyTable"
        )

    def _extract_proxy_table_generator(self) -> Generator[ProxyTable, None, None]:
        """
        Parses the XML and yields ProxyTable objects.
        This is a generator method, renamed to avoid conflict if extract() was also a generator.

        A <proxyTable> that is missing required data is logged and skipped
        rather than aborting the whole parse. Each processed element is
        cleared and detached from its parent to keep memory bounded.

        Yields:
            Generator[ProxyTable, None, None]: A generator of ProxyTable dataclass instances.
        """
        xml_file_like = io.BytesIO(self.xml_bytes)

        # Only "end" events for the fully-qualified proxyTable tag are
        # delivered; recover=True tolerates minor XML defects.
        context = ET.iterparse(
            xml_file_like,
            events=("end",),
            tag=self.proxy_table_iter_tag,
            recover=True,
        )

        for _, element in context:
            try:
                # Required fields: absence raises ValueError (caught below).
                issuer_name = self._get_required_text(
                    element, "inf:issuerName", PROXY_VOTE_TABLE_NAMESPACES
                )
                meeting_date = self._get_required_text(
                    element, "inf:meetingDate", PROXY_VOTE_TABLE_NAMESPACES
                )
                vote_description = self._get_required_text(
                    element, "inf:voteDescription", PROXY_VOTE_TABLE_NAMESPACES
                )
                shares_voted_val = self._get_required_float(
                    element, "inf:sharesVoted", PROXY_VOTE_TABLE_NAMESPACES
                )
                shares_on_loan_val = self._get_required_float(
                    element, "inf:sharesOnLoan", PROXY_VOTE_TABLE_NAMESPACES
                )

                # Optional security identifiers and vote metadata.
                cusip = self._get_optional_text(
                    element, "inf:cusip", PROXY_VOTE_TABLE_NAMESPACES
                )
                isin = self._get_optional_text(
                    element, "inf:isin", PROXY_VOTE_TABLE_NAMESPACES
                )
                figi = self._get_optional_text(
                    element, "inf:figi", PROXY_VOTE_TABLE_NAMESPACES
                )
                other_vote_desc = self._get_optional_text(
                    element, "inf:otherVoteDescription", PROXY_VOTE_TABLE_NAMESPACES
                )
                vote_source = self._get_optional_text(
                    element, "inf:voteSource", PROXY_VOTE_TABLE_NAMESPACES
                )
                vote_series = self._get_optional_text(
                    element, "inf:voteSeries", PROXY_VOTE_TABLE_NAMESPACES
                )
                vote_other_info = self._get_optional_text(
                    element, "inf:voteOtherInfo", PROXY_VOTE_TABLE_NAMESPACES
                )

                # Vote categories: only entries with a non-empty categoryType kept.
                vote_categories_list = []
                vote_categories_element = element.find(
                    "inf:voteCategories", namespaces=PROXY_VOTE_TABLE_NAMESPACES
                )
                if vote_categories_element is not None:
                    for cat_elem in vote_categories_element.findall(
                        "inf:voteCategory", namespaces=PROXY_VOTE_TABLE_NAMESPACES
                    ):
                        category_type = self._get_optional_text(
                            cat_elem, "inf:categoryType", PROXY_VOTE_TABLE_NAMESPACES
                        )
                        if category_type:
                            vote_categories_list.append(
                                VoteCategory(category_type=category_type)
                            )

                # Vote records: a record is kept only when all three of
                # howVoted, sharesVoted and managementRecommendation exist.
                vote_records_list = []
                vote_element = element.find(
                    "inf:vote", namespaces=PROXY_VOTE_TABLE_NAMESPACES
                )
                if vote_element is not None:
                    for rec_elem in vote_element.findall(
                        "inf:voteRecord", namespaces=PROXY_VOTE_TABLE_NAMESPACES
                    ):
                        how_voted = self._get_optional_text(
                            rec_elem, "inf:howVoted", PROXY_VOTE_TABLE_NAMESPACES
                        )
                        shares_voted_rec = self._get_optional_float(
                            rec_elem, "inf:sharesVoted", PROXY_VOTE_TABLE_NAMESPACES
                        )
                        mgmt_rec = self._get_optional_text(
                            rec_elem,
                            "inf:managementRecommendation",
                            PROXY_VOTE_TABLE_NAMESPACES,
                        )

                        if (
                            how_voted is not None
                            and shares_voted_rec is not None
                            and mgmt_rec is not None
                        ):
                            vote_records_list.append(
                                VoteRecord(
                                    how_voted=how_voted,
                                    shares_voted=shares_voted_rec,
                                    management_recommendation=mgmt_rec,
                                )
                            )

                # Other managers: collect non-empty otherManager id strings.
                other_managers_list = []
                vote_manager_element = element.find(
                    "inf:voteManager", namespaces=PROXY_VOTE_TABLE_NAMESPACES
                )
                if vote_manager_element is not None:
                    for other_managers_container in vote_manager_element.findall(
                        "inf:otherManagers", namespaces=PROXY_VOTE_TABLE_NAMESPACES
                    ):
                        for om_elem in other_managers_container.findall(
                            "inf:otherManager", namespaces=PROXY_VOTE_TABLE_NAMESPACES
                        ):
                            manager_id = om_elem.text.strip() if om_elem.text else None
                            if manager_id:
                                other_managers_list.append(manager_id)

                proxy_table_data = ProxyTable(
                    issuer_name=issuer_name,
                    meeting_date=meeting_date,
                    vote_description=vote_description,
                    shares_voted=shares_voted_val,
                    shares_on_loan=shares_on_loan_val,
                    cusip=cusip,
                    isin=isin,
                    figi=figi,
                    other_vote_description=other_vote_desc,
                    vote_source=vote_source,
                    vote_series=vote_series,
                    vote_other_info=vote_other_info,
                    vote_categories=vote_categories_list,
                    vote_records=vote_records_list,
                    other_managers=other_managers_list,
                )
                yield proxy_table_data

            except (ValueError, TypeError) as e:
                # Best-effort: log and skip this table; continue with the rest.
                log.error(
                    "Skipping proxyTable due to missing/invalid data or parsing error: %s on element %s", e, element.tag if element is not None else 'Unknown Element'
                )

            # Free memory for the processed subtree: clear the element and
            # detach it from its parent (lxml keeps siblings otherwise).
            if element is not None:
                element.clear()
                parent = element.getparent()
                if parent is not None:
                    parent.remove(element)

        del context

    def extract(self) -> ProxyVoteTable:
        """
        Extracts all ProxyTable instances from the XML and returns them in a ProxyVoteTable container.
        This is the main public method for this extractor.
        """
        all_proxy_tables = list(self._extract_proxy_table_generator())
        return ProxyVoteTable(proxy_tables=all_proxy_tables)
|
||||
Reference in New Issue
Block a user