import io import logging from pathlib import Path from typing import Any, Dict, Generator, Optional from lxml import etree as ET from .data import ( ClassInfo, IncludedManager, PrimaryDoc, ProxyTable, ProxyVoteTable, ReportSeriesClassInfo, SeriesReport, VoteCategory, VoteRecord, ) log = logging.getLogger(__name__) class BaseExtractor: """Base class for XML extractors.""" def __init__(self, xml_bytes: bytes): """Initialize the extractor with raw XML bytes and parse the root element.""" self.xml_bytes = xml_bytes try: # Use a recovering parser for robustness against minor XML issues. parser = ET.XMLParser(recover=True) self.root: Optional[ET._Element] = ET.fromstring(xml_bytes, parser=parser) if self.root is None: # This might occur if the XML is severely malformed beyond recovery. raise ET.ParseError( "Failed to parse XML: root element is None even after recovery." ) except ET.ParseError as e: raise ValueError(f"Error parsing XML bytes during initial check: {e}") from e @classmethod def from_file(cls, xml_file_path: Path) -> "BaseExtractor": """Factory method to create an extractor instance from an XML file path.""" if not xml_file_path.exists(): raise FileNotFoundError(f"XML file not found: {xml_file_path}") xml_bytes = xml_file_path.read_bytes() return cls(xml_bytes) def extract(self) -> Any: """Main extraction method to be implemented by subclasses.""" raise NotImplementedError("Subclasses must implement the extract method.") def _get_optional_text( self, element: Optional[ET._Element], xpath: str, namespaces: Optional[Dict[str, str]] = None, ) -> Optional[str]: """Safely get text from an element, returning None if not found or empty.""" if element is None: return None # Ensure namespaces is not None if used; provide an empty dict as default for find if None. ns = namespaces if namespaces is not None else {} found_element = element.find(xpath, namespaces=ns) if found_element is not None and found_element.text: return found_element.text.strip() return None def _get_required_text( self, element: ET._Element, xpath: str, namespaces: Optional[Dict[str, str]] = None, ) -> str: """Get text from an element, raising ValueError if not found or empty.""" text = self._get_optional_text(element, xpath, namespaces) if text is None: element_str = ET.tostring(element, pretty_print=True).decode() raise ValueError( f"Required text not found for xpath: {xpath} in element: {element_str}" ) return text def _get_optional_float( self, element: Optional[ET._Element], xpath: str, namespaces: Optional[Dict[str, str]] = None, ) -> Optional[float]: """Safely get float from an element, returning None if not found or not a valid float.""" text = self._get_optional_text(element, xpath, namespaces) if text is None: return None try: return float(text) except ValueError: return None def _get_required_float( self, element: ET._Element, xpath: str, namespaces: Optional[Dict[str, str]] = None, ) -> float: """Get float from an element, raising ValueError if not found or not a valid float.""" val = self._get_optional_float(element, xpath, namespaces) if val is None: element_str = ET.tostring(element, pretty_print=True).decode() raise ValueError( f"Required float not found or invalid for xpath: {xpath} in element: {element_str}" ) return val # Namespaces for different filing formats PRIMARY_DOC_NAMESPACES: Dict[str, str] = { "npx": "http://www.sec.gov/edgar/npx", "com": "http://www.sec.gov/edgar/common", } class PrimaryDocExtractor(BaseExtractor): """ Extracts data from a primary_doc.xml string into a PrimaryDoc dataclass. Handles different XML formats including standard N-PX and amendment filings. """ def __init__(self, xml_bytes: bytes): """Initialize the extractor with raw XML *bytes*.""" super().__init__(xml_bytes) self.doc_prefix = "npx" def _get_submission_type(self) -> Optional[str]: """Extract the submission_type from the XML to determine filing type.""" npx_prefix = "npx" path = f"{npx_prefix}:headerData/{npx_prefix}:submissionType" return self._get_optional_text(self.root, path, PRIMARY_DOC_NAMESPACES) def extract(self) -> PrimaryDoc: """ Parses the XML and populates the PrimaryDoc dataclass. Handles both standard N-PX filings and amendment filings with different structures. """ if self.root is None: raise ValueError("XML root not parsed. Cannot extract.") prefix = self.doc_prefix submission_type = self._get_submission_type() is_amendment_submission = "/A" in submission_type if submission_type else False header_data = self.root.find(f"{prefix}:headerData", PRIMARY_DOC_NAMESPACES) if header_data is None: raise ValueError("Required element not found in XML.") form_data = self.root.find(f"{prefix}:formData", PRIMARY_DOC_NAMESPACES) if form_data is None: raise ValueError("Required element not found in XML.") cover_page = form_data.find(f"{prefix}:coverPage", PRIMARY_DOC_NAMESPACES) if cover_page is None: raise ValueError( f"Required element not found in XML using prefix {prefix}." ) reporting_person = cover_page.find( f"{prefix}:reportingPerson", PRIMARY_DOC_NAMESPACES ) if reporting_person is None: raise ValueError( "Required element not found in ." ) reporting_person_address = reporting_person.find( f"{prefix}:address", PRIMARY_DOC_NAMESPACES ) if reporting_person_address is None: raise ValueError( "Required
element not found in ." ) agent_for_service = cover_page.find( f"{prefix}:agentForService", PRIMARY_DOC_NAMESPACES ) agent_for_service_address = ( agent_for_service.find(f"{prefix}:address", PRIMARY_DOC_NAMESPACES) if agent_for_service is not None else None ) signature_page = form_data.find( f"{prefix}:signaturePage", PRIMARY_DOC_NAMESPACES ) if signature_page is None: raise ValueError( f"Required element not found in XML using prefix {prefix}." ) summary_page = form_data.find(f"{prefix}:summaryPage", PRIMARY_DOC_NAMESPACES) series_page = form_data.find(f"{prefix}:seriesPage", PRIMARY_DOC_NAMESPACES) report_info = cover_page.find(f"{prefix}:reportInfo", PRIMARY_DOC_NAMESPACES) explanatory_info = cover_page.find( f"{prefix}:explanatoryInformation", PRIMARY_DOC_NAMESPACES ) amendment_info = cover_page.find( f"{prefix}:amendmentInfo", PRIMARY_DOC_NAMESPACES ) contact = header_data.find(f"{prefix}:contact", PRIMARY_DOC_NAMESPACES) included_managers = [] if summary_page is not None: other_managers_section = summary_page.find( f"{prefix}:otherManagers2", PRIMARY_DOC_NAMESPACES ) if other_managers_section is not None: # Corrected XPath to use :manager as per original logic for the parent element for manager_elem in other_managers_section.findall( f"{prefix}:investmentManagers", PRIMARY_DOC_NAMESPACES ): manager = IncludedManager( # Assuming these sub-elements (serialNo, etc.) exist under each 'manager' element # and align with the IncludedManager dataclass fields. serial_no=self._get_required_text( manager_elem, f"{prefix}:serialNo", PRIMARY_DOC_NAMESPACES ), form13f_file_number=self._get_optional_text( manager_elem, f"{prefix}:form13FFileNumber", PRIMARY_DOC_NAMESPACES, ), name=self._get_required_text( manager_elem, f"{prefix}:name", PRIMARY_DOC_NAMESPACES ), sec_file_number=self._get_optional_text( manager_elem, f"{prefix}:secFileNumber", PRIMARY_DOC_NAMESPACES, ), ) included_managers.append(manager) report_series_class_infos = [] series_class_section = header_data.find( f"{prefix}:seriesClass", PRIMARY_DOC_NAMESPACES ) if series_class_section is not None: report_series_class = series_class_section.find( f"{prefix}:reportSeriesClass", PRIMARY_DOC_NAMESPACES ) if report_series_class is not None: for rpt_series_class_info_elem in report_series_class.findall( f"{prefix}:rptSeriesClassInfo", PRIMARY_DOC_NAMESPACES ): series_id = self._get_required_text( rpt_series_class_info_elem, f"{prefix}:seriesId", PRIMARY_DOC_NAMESPACES, ) class_infos = [] for class_info_elem in rpt_series_class_info_elem.findall( f"{prefix}:classInfo", PRIMARY_DOC_NAMESPACES ): class_id = self._get_required_text( class_info_elem, f"{prefix}:classId", PRIMARY_DOC_NAMESPACES ) class_infos.append(ClassInfo(class_id=class_id)) report_series_class_infos.append( ReportSeriesClassInfo( series_id=series_id, class_infos=class_infos ) ) series_reports = [] if series_page is not None: series_details = series_page.find( f"{prefix}:seriesDetails", PRIMARY_DOC_NAMESPACES ) if series_details is not None: for series_report_elem in series_details.findall( f"{prefix}:seriesReports", PRIMARY_DOC_NAMESPACES ): series_reports.append( SeriesReport( id_of_series=self._get_required_text( series_report_elem, f"{prefix}:idOfSeries", PRIMARY_DOC_NAMESPACES, ), name_of_series=self._get_optional_text( series_report_elem, f"{prefix}:nameOfSeries", PRIMARY_DOC_NAMESPACES, ), lei_of_series=self._get_optional_text( series_report_elem, f"{prefix}:leiOfSeries", PRIMARY_DOC_NAMESPACES, ), ) ) return PrimaryDoc( cik=self._get_required_text( header_data, f"{prefix}:filerInfo/{prefix}:filer/{prefix}:issuerCredentials/{prefix}:cik", PRIMARY_DOC_NAMESPACES, ), submission_type=submission_type or "", period_of_report=self._get_required_text( header_data, f"{prefix}:filerInfo/{prefix}:periodOfReport", PRIMARY_DOC_NAMESPACES, ), fund_name=self._get_required_text( reporting_person, f"{prefix}:name", PRIMARY_DOC_NAMESPACES ), phone_number=self._get_optional_text( reporting_person, f"{prefix}:phoneNumber", PRIMARY_DOC_NAMESPACES ), street1=self._get_required_text( reporting_person_address, "com:street1", PRIMARY_DOC_NAMESPACES ), street2=self._get_optional_text( reporting_person_address, "com:street2", PRIMARY_DOC_NAMESPACES ), city=self._get_required_text( reporting_person_address, "com:city", PRIMARY_DOC_NAMESPACES ), state=self._get_required_text( reporting_person_address, "com:stateOrCountry", PRIMARY_DOC_NAMESPACES ), zip_code=self._get_required_text( reporting_person_address, "com:zipCode", PRIMARY_DOC_NAMESPACES ), crd_number=self._get_optional_text( cover_page, f"{prefix}:reportingCrdNumber", PRIMARY_DOC_NAMESPACES ) or self._get_optional_text( cover_page, f"{prefix}:crdNumber", PRIMARY_DOC_NAMESPACES ), filer_sec_file_number=self._get_optional_text( cover_page, f"{prefix}:reportingSecFileNumber", PRIMARY_DOC_NAMESPACES ) or self._get_optional_text( cover_page, f"{prefix}:filerSecFileNumber", PRIMARY_DOC_NAMESPACES ) or self._get_optional_text( header_data, f"{prefix}:filerInfo/{prefix}:filer/{prefix}:fileNumber", PRIMARY_DOC_NAMESPACES, ), lei_number=self._get_optional_text( cover_page, f"{prefix}:lei", PRIMARY_DOC_NAMESPACES ) or self._get_optional_text( cover_page, f"{prefix}:leiNumber", PRIMARY_DOC_NAMESPACES ), report_calendar_year=self._get_required_text( cover_page, f"{prefix}:reportCalendarYear", PRIMARY_DOC_NAMESPACES ), report_type=( self._get_required_text( report_info, f"{prefix}:reportType", PRIMARY_DOC_NAMESPACES ) if report_info is not None else None ), confidential_treatment=( self._get_optional_text( report_info, f"{prefix}:confidentialTreatment", PRIMARY_DOC_NAMESPACES, ) if report_info is not None else None ), notice_explanation=( self._get_optional_text( explanatory_info, f"{prefix}:noticeExplanation", PRIMARY_DOC_NAMESPACES, ) if explanatory_info is not None else None ), npx_file_number=self._get_optional_text( cover_page, f"{prefix}:fileNumber", PRIMARY_DOC_NAMESPACES ), explanatory_choice=( self._get_optional_text( explanatory_info, f"{prefix}:explanatoryChoice", PRIMARY_DOC_NAMESPACES, ) if explanatory_info is not None else None ), other_included_managers_count=( self._get_optional_text( summary_page, f"{prefix}:otherIncludedManagersCount", PRIMARY_DOC_NAMESPACES, ) if summary_page is not None else "0" ), signer_name=self._get_required_text( signature_page, f"{prefix}:txSignature", PRIMARY_DOC_NAMESPACES ), signer_title=self._get_required_text( signature_page, f"{prefix}:txTitle", PRIMARY_DOC_NAMESPACES ), signature_date=self._get_required_text( signature_page, f"{prefix}:txAsOfDate", PRIMARY_DOC_NAMESPACES ), tx_printed_signature=self._get_optional_text( signature_page, f"{prefix}:txPrintedSignature", PRIMARY_DOC_NAMESPACES ), agent_for_service_name=( self._get_optional_text( agent_for_service, f"{prefix}:name", PRIMARY_DOC_NAMESPACES ) if agent_for_service is not None else None ), agent_for_service_address_street1=( self._get_optional_text( agent_for_service_address, "com:street1", PRIMARY_DOC_NAMESPACES ) if agent_for_service_address is not None else None ), agent_for_service_address_street2=( self._get_optional_text( agent_for_service_address, "com:street2", PRIMARY_DOC_NAMESPACES ) if agent_for_service_address is not None else None ), agent_for_service_address_city=( self._get_optional_text( agent_for_service_address, "com:city", PRIMARY_DOC_NAMESPACES ) if agent_for_service_address is not None else None ), agent_for_service_address_state_country=( self._get_optional_text( agent_for_service_address, "com:stateOrCountry", PRIMARY_DOC_NAMESPACES, ) if agent_for_service_address is not None else None ), agent_for_service_address_zip_code=( self._get_optional_text( agent_for_service_address, "com:zipCode", PRIMARY_DOC_NAMESPACES ) if agent_for_service_address is not None else None ), is_amendment=( is_amendment_submission if submission_type is not None else None ), amendment_no=( self._get_optional_text( amendment_info, f"{prefix}:amendmentNo", PRIMARY_DOC_NAMESPACES ) if amendment_info is not None else None ), amendment_type=( self._get_optional_text( amendment_info, f"{prefix}:amendmentType", PRIMARY_DOC_NAMESPACES ) if amendment_info is not None else None ), conf_denied_expired=( self._get_optional_text( amendment_info, f"{prefix}:confDeniedExpired", PRIMARY_DOC_NAMESPACES, ) if amendment_info is not None else None ), de_novo_request_choice=self._get_optional_text( header_data, f"{prefix}:filerInfo/{prefix}:deNovoRequestChoice", PRIMARY_DOC_NAMESPACES, ), year_or_quarter=self._get_optional_text( cover_page, f"{prefix}:yearOrQuarter", PRIMARY_DOC_NAMESPACES ), included_managers=included_managers, registrant_type=self._get_optional_text( header_data, f"{prefix}:filerInfo/{prefix}:registrantType", PRIMARY_DOC_NAMESPACES, ), live_test_flag=self._get_optional_text( header_data, f"{prefix}:filerInfo/{prefix}:liveTestFlag", PRIMARY_DOC_NAMESPACES, ), ccc=self._get_optional_text( header_data, f"{prefix}:filerInfo/{prefix}:filer/{prefix}:issuerCredentials/{prefix}:ccc", PRIMARY_DOC_NAMESPACES, ), contact_name=( self._get_optional_text( contact, f"{prefix}:name", PRIMARY_DOC_NAMESPACES ) if contact is not None else None ), contact_phone_number=( self._get_optional_text( contact, f"{prefix}:phoneNumber", PRIMARY_DOC_NAMESPACES ) if contact is not None else None ), contact_email_address=( self._get_optional_text( contact, f"{prefix}:emailAddress", PRIMARY_DOC_NAMESPACES ) if contact is not None else None ), override_internet_flag=self._get_optional_text( header_data, f"{prefix}:overrideInternetFlag", PRIMARY_DOC_NAMESPACES ), confirming_copy_flag=self._get_optional_text( header_data, f"{prefix}:confirmingCopyFlag", PRIMARY_DOC_NAMESPACES ), investment_company_type=self._get_optional_text( header_data, f"{prefix}:filerInfo/{prefix}:investmentCompanyType", PRIMARY_DOC_NAMESPACES, ), rpt_include_all_series_flag=( self._get_optional_text( series_class_section.find( f"{prefix}:reportSeriesClass", PRIMARY_DOC_NAMESPACES ), f"{prefix}:rptIncludeAllSeriesFlag", PRIMARY_DOC_NAMESPACES, ) if series_class_section is not None and series_class_section.find( f"{prefix}:reportSeriesClass", PRIMARY_DOC_NAMESPACES ) is not None else None ), series_count=self._get_optional_text( series_page, f"{prefix}:seriesCount", PRIMARY_DOC_NAMESPACES ), report_series_class_infos=report_series_class_infos, series_reports=series_reports, ) # Define the namespace for easier access PROXY_VOTE_TABLE_NAMESPACES = { "inf": "http://www.sec.gov/edgar/document/npxproxy/informationtable" } class ProxyVoteTableExtractor(BaseExtractor): """ Extracts proxy vote information from SEC N-PX proxy vote table XML data. Uses lxml.etree.iterparse for memory-efficient parsing of potentially large files. """ def __init__(self, xml_bytes: bytes): """Initialize the extractor with raw XML *bytes*.""" super().__init__(xml_bytes) # The tag for iterparse should be the fully qualified name of the proxyTable element. self.proxy_table_iter_tag = ( f"{{{PROXY_VOTE_TABLE_NAMESPACES['inf']}}}proxyTable" ) def _extract_proxy_table_generator(self) -> Generator[ProxyTable, None, None]: """ Parses the XML and yields ProxyTable objects. This is a generator method, renamed to avoid conflict if extract() was also a generator. Yields: Generator[ProxyTable, None, None]: A generator of ProxyTable dataclass instances. """ xml_file_like = io.BytesIO(self.xml_bytes) context = ET.iterparse( xml_file_like, events=("end",), tag=self.proxy_table_iter_tag, recover=True, ) for _, element in context: try: issuer_name = self._get_required_text( element, "inf:issuerName", PROXY_VOTE_TABLE_NAMESPACES ) meeting_date = self._get_required_text( element, "inf:meetingDate", PROXY_VOTE_TABLE_NAMESPACES ) vote_description = self._get_required_text( element, "inf:voteDescription", PROXY_VOTE_TABLE_NAMESPACES ) shares_voted_val = self._get_required_float( element, "inf:sharesVoted", PROXY_VOTE_TABLE_NAMESPACES ) shares_on_loan_val = self._get_required_float( element, "inf:sharesOnLoan", PROXY_VOTE_TABLE_NAMESPACES ) cusip = self._get_optional_text( element, "inf:cusip", PROXY_VOTE_TABLE_NAMESPACES ) isin = self._get_optional_text( element, "inf:isin", PROXY_VOTE_TABLE_NAMESPACES ) figi = self._get_optional_text( element, "inf:figi", PROXY_VOTE_TABLE_NAMESPACES ) other_vote_desc = self._get_optional_text( element, "inf:otherVoteDescription", PROXY_VOTE_TABLE_NAMESPACES ) vote_source = self._get_optional_text( element, "inf:voteSource", PROXY_VOTE_TABLE_NAMESPACES ) vote_series = self._get_optional_text( element, "inf:voteSeries", PROXY_VOTE_TABLE_NAMESPACES ) vote_other_info = self._get_optional_text( element, "inf:voteOtherInfo", PROXY_VOTE_TABLE_NAMESPACES ) vote_categories_list = [] vote_categories_element = element.find( "inf:voteCategories", namespaces=PROXY_VOTE_TABLE_NAMESPACES ) if vote_categories_element is not None: for cat_elem in vote_categories_element.findall( "inf:voteCategory", namespaces=PROXY_VOTE_TABLE_NAMESPACES ): category_type = self._get_optional_text( cat_elem, "inf:categoryType", PROXY_VOTE_TABLE_NAMESPACES ) if category_type: vote_categories_list.append( VoteCategory(category_type=category_type) ) vote_records_list = [] vote_element = element.find( "inf:vote", namespaces=PROXY_VOTE_TABLE_NAMESPACES ) if vote_element is not None: for rec_elem in vote_element.findall( "inf:voteRecord", namespaces=PROXY_VOTE_TABLE_NAMESPACES ): how_voted = self._get_optional_text( rec_elem, "inf:howVoted", PROXY_VOTE_TABLE_NAMESPACES ) shares_voted_rec = self._get_optional_float( rec_elem, "inf:sharesVoted", PROXY_VOTE_TABLE_NAMESPACES ) mgmt_rec = self._get_optional_text( rec_elem, "inf:managementRecommendation", PROXY_VOTE_TABLE_NAMESPACES, ) if ( how_voted is not None and shares_voted_rec is not None and mgmt_rec is not None ): vote_records_list.append( VoteRecord( how_voted=how_voted, shares_voted=shares_voted_rec, management_recommendation=mgmt_rec, ) ) other_managers_list = [] vote_manager_element = element.find( "inf:voteManager", namespaces=PROXY_VOTE_TABLE_NAMESPACES ) if vote_manager_element is not None: for other_managers_container in vote_manager_element.findall( "inf:otherManagers", namespaces=PROXY_VOTE_TABLE_NAMESPACES ): for om_elem in other_managers_container.findall( "inf:otherManager", namespaces=PROXY_VOTE_TABLE_NAMESPACES ): manager_id = om_elem.text.strip() if om_elem.text else None if manager_id: other_managers_list.append(manager_id) proxy_table_data = ProxyTable( issuer_name=issuer_name, meeting_date=meeting_date, vote_description=vote_description, shares_voted=shares_voted_val, shares_on_loan=shares_on_loan_val, cusip=cusip, isin=isin, figi=figi, other_vote_description=other_vote_desc, vote_source=vote_source, vote_series=vote_series, vote_other_info=vote_other_info, vote_categories=vote_categories_list, vote_records=vote_records_list, other_managers=other_managers_list, ) yield proxy_table_data except (ValueError, TypeError) as e: log.error( "Skipping proxyTable due to missing/invalid data or parsing error: %s on element %s", e, element.tag if element is not None else 'Unknown Element' ) if element is not None: element.clear() parent = element.getparent() if parent is not None: parent.remove(element) del context def extract(self) -> ProxyVoteTable: """ Extracts all ProxyTable instances from the XML and returns them in a ProxyVoteTable container. This is the main public method for this extractor. """ all_proxy_tables = list(self._extract_proxy_table_generator()) return ProxyVoteTable(proxy_tables=all_proxy_tables)