""" Data structures and functions for working with fund data. This module provides the FundData class and related functions for accessing and manipulating fund data. """ import logging import re from functools import lru_cache from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union if TYPE_CHECKING: from bs4 import Tag import pandas as pd import pyarrow as pa from bs4 import BeautifulSoup from edgar._filings import Filings from edgar.datatools import drop_duplicates_pyarrow from edgar.entity.data import EntityData from edgar.funds.core import FundClass, FundCompany, FundSeries from edgar.httprequests import download_text log = logging.getLogger(__name__) # # Direct implementations to replace legacy module dependencies # # Direct implementations of fund-related functionality # These replace the legacy module dependencies # URL constants for fund searches fund_series_search_url = "https://www.sec.gov/cgi-bin/series?company=" fund_class_or_series_search_url = "https://www.sec.gov/cgi-bin/browse-edgar?CIK={}" fund_series_direct_url = "https://www.sec.gov/cgi-bin/browse-edgar?CIK={}&scd=series" class _FundDTO: """ Data Transfer Object for fund information. Internal class used to return fund data from direct implementations. This is not part of the public API and should not be used directly. Use the Fund class from edgar.funds.core instead. """ def __init__(self, company_cik, company_name, name, series, ticker, class_contract_id, class_contract_name): self.company_cik = company_cik self.company_name = company_name self.name = name self.series = series self.ticker = ticker self.class_contract_id = class_contract_id self.class_contract_name = class_contract_name def __str__(self): return f"{self.name} - {self.ticker} [{self.class_contract_id}]" # Parse SGML fund data (directly implemented) def parse_fund_data(series_sgml_data: str) -> pd.DataFrame: """ Parse the SGML text containing fund series and class information. Args: series_sgml_data: SGML text with SERIES-AND-CLASSES-CONTRACTS-DATA Returns: DataFrame with parsed fund information Example SGML data: 0001090372 S000071967 Jacob Forward ETF C000227599 Jacob Forward ETF JFWD """ # Regular expressions to match each relevant tag series_re = re.compile(r'(.*?)', re.DOTALL) data_re = re.compile(r'<([^>]+)>([^<]*)') # Extract SERIES blocks series_blocks = series_re.findall(series_sgml_data) # Create an empty DataFrame columns = [ "OWNER-CIK", "SERIES-ID", "SERIES-NAME", "CLASS-CONTRACT-ID", "CLASS-CONTRACT-NAME", "CLASS-CONTRACT-TICKER-SYMBOL" ] # Extract information from SERIES blocks and append to DataFrame rows = [] for block in series_blocks: data_matches = data_re.findall(block) data_dict = {tag: value.strip() for tag, value in data_matches} class_contract_data = { "CLASS-CONTRACT-ID": data_dict.get("CLASS-CONTRACT-ID", ""), "CLASS-CONTRACT-NAME": data_dict.get("CLASS-CONTRACT-NAME", ""), "CLASS-CONTRACT-TICKER-SYMBOL": data_dict.get("CLASS-CONTRACT-TICKER-SYMBOL", "") } # Merge SERIES and CLASS-CONTRACT data row_data = {**data_dict, **class_contract_data} rows.append(row_data) # Create DataFrame and select relevant columns df = pd.DataFrame(rows, columns=columns).iloc[:, :6] # Rename columns for consistency return (df.rename(columns={ "OWNER-CIK": "CIK", "SERIES-ID": "SeriesID", "SERIES-NAME": "Fund", "CLASS-CONTRACT-ID": "ContractID", "CLASS-CONTRACT-NAME": "Class", "CLASS-CONTRACT-TICKER-SYMBOL": "Ticker" }) .filter(["Fund", "Ticker", "SeriesID", "ContractID", "Class", "CIK"]) ) # Direct implementation of FundCompanyInfo class _FundCompanyInfo: """ Internal helper class representing the fund company. This is parsed from the results page when we get the fund class or series. Not part of the public API - use the Fund class from edgar.funds.core instead. """ def __init__(self, name: str, cik: str, ident_info: Dict[str, str], addresses: List[str], filings: Filings): self.name: str = name self.cik: str = cik self.ident_info: Dict[str, str] = ident_info self.addresses: List[str] = addresses self.filings = filings @property def state(self): return self.ident_info.get("State location", None) @property def state_of_incorporation(self): return self.ident_info.get("State of Inc.", None) @lru_cache(maxsize=1) def id_and_name(self, contract_or_series: str) -> Optional[Tuple[str, str]]: class_contract_str = self.ident_info.get(contract_or_series, None) if not class_contract_str: return None match = re.match(r'([CS]\d+)(?:\s(.*))?', class_contract_str) # Storing the results in variables if matched, with a default for description if not present cik = match.group(1) if match else "" cik_description = match.group(2) if match and match.group(2) else "" return cik, cik_description @classmethod def from_html(cls, company_info_html: Union[str, 'Tag']): soup = BeautifulSoup(company_info_html, features="html.parser") # Parse the fund company info content_div = soup.find("div", {"id": "contentDiv"}) if content_div is None: # Should not reach here, but this is precautionary log.warning("Did not find div with id 'contentDiv'") return None ident_info_dict = {} company_info_div = content_div.find("div", class_="companyInfo") company_name_tag = company_info_div.find('span', class_='companyName') company_name = company_name_tag.text.split('CIK')[0].strip() cik = company_name_tag.a.text.split(' ')[0] # Extract the identifying information for tag in company_info_div.find_all('br'): tag.replace_with('\n') ident_info = company_info_div.find('p', class_='identInfo') ident_line = ident_info.get_text().replace("|", "\n").strip() for line in ident_line.split("\n"): if ":" in line: key, value = line.split(":") ident_info_dict[key.strip()] = value.strip().replace("\xa0", " ") # Addresses mailer_divs = content_div.find_all("div", class_="mailer") addresses = [re.sub(r'\n\s+', '\n', mailer_div.text.strip()) for mailer_div in mailer_divs] filing_index = cls._extract_filings(soup, company_name, cik) filings = Filings(filing_index=filing_index) return cls(name=company_name, cik=cik, filings=filings, ident_info=ident_info_dict, addresses=addresses) @classmethod def _extract_filings(cls, soup, company_name: str, cik: str): from datetime import datetime import pyarrow as pa filings_table = soup.find("table", class_="tableFile2") rows = filings_table.find_all("tr")[1:] forms, accession_nos, filing_dates = [], [], [] for row in rows: cells = row.find_all("td") form = cells[0].text forms.append(form) # Get the link href from cell[1] link = cells[1].find("a") href = link.attrs["href"] accession_no = href.split("/")[-1].replace("-index.htm", "") accession_nos.append(accession_no) # Get the filing_date filing_date = datetime.strptime(cells[3].text, '%Y-%m-%d') filing_dates.append(filing_date) schema = pa.schema([ ('form', pa.string()), ('company', pa.string()), ('cik', pa.int32()), ('filing_date', pa.date32()), ('accession_number', pa.string()), ]) # Create an empty table with the defined schema filing_index = pa.Table.from_arrays(arrays=[ pa.array(forms, type=pa.string()), pa.array([company_name] * len(forms), type=pa.string()), pa.array([int(cik)] * len(forms), type=pa.int32()), pa.array(filing_dates, type=pa.date32()), pa.array(accession_nos, type=pa.string()), ], schema=schema) return filing_index # Direct implementation of FundClassOrSeries and subclasses class _FundClassOrSeries: """ Internal base class for fund classes and series. Not part of the public API - use the FundClass and FundSeries classes from edgar.funds.core instead. """ def __init__(self, company_info: '_FundCompanyInfo', contract_or_series: str): self.fund = company_info self._contract_or_series = contract_or_series @property def fund_cik(self): return self.fund.cik @property def fund_name(self): return self.fund.name @lru_cache(maxsize=1) def _id_and_name(self) -> Optional[Tuple[str, str]]: class_contract_str = self.fund.ident_info.get(self._contract_or_series, None) if not class_contract_str: return None match = re.match(r'([CS]\d+)(?:\s(.*))?', class_contract_str) # Storing the results in variables if matched, with a default for description if not present cik = match.group(1) if match else "" cik_description = match.group(2) if match and match.group(2) else "" return cik, cik_description @property def id(self): id_and_name = self._id_and_name() if id_and_name: return id_and_name[0] return None @property def name(self): id_and_name = self._id_and_name() if id_and_name: return id_and_name[1] return None @property def description(self): return f"{self.fund_name} {self.id} {self.name}" @property def filings(self): return self.fund.filings class _FundClass(_FundClassOrSeries): """ Internal implementation of fund class (contract) information. Not part of the public API - use the FundClass class from edgar.funds.core instead. """ def __init__(self, company_info: '_FundCompanyInfo'): super().__init__(company_info, "Class/Contract") @property def ticker(self): return self.fund.ident_info.get("Ticker Symbol", None) @property def description(self): return f"{self.fund_name} {self.id} {self.name} {self.ticker or ''}" class _FundSeries(_FundClassOrSeries): """ Internal implementation of fund series information. Not part of the public API - use the FundSeries class from edgar.funds.core instead. """ def __init__(self, company_info: '_FundCompanyInfo'): super().__init__(company_info, "Series") # Direct implementation of get_fund_with_filings def direct_get_fund_with_filings(contract_or_series_id: str): """ Get fund class or series information including filings from the SEC website. Args: contract_or_series_id: Series ID (S...) or Class ID (C...) Returns: FundClass or FundSeries object, or None if not found """ # URL template to search for a fund by class or series ID fund_class_or_series_search_url = "https://www.sec.gov/cgi-bin/browse-edgar?CIK={}" if not re.match(r"[CS]\d+", contract_or_series_id): return None base_url = fund_class_or_series_search_url.format(contract_or_series_id) # Start at 0 and download 100 search_url = base_url + "&start=0&count=100" try: fund_text = download_text(search_url) if "No matching" in fund_text: return None # Company Info company_info = _FundCompanyInfo.from_html(fund_text) # Get the remaining filings start, count = 101, 100 filing_index = company_info.filings.data while True: # Get the next page next_page = base_url + f"&start={start}&count={count}" fund_text = download_text(next_page) soup = BeautifulSoup(fund_text, features="html.parser") filing_index_on_page = _FundCompanyInfo._extract_filings(soup, company_info.name, company_info.cik) if len(filing_index_on_page) == 0: break filing_index = pa.concat_tables([filing_index, filing_index_on_page]) start += count # Drop duplicate filings by accession number filing_index = drop_duplicates_pyarrow(filing_index, column_name='accession_number') company_info.filings = Filings(filing_index=filing_index) if contract_or_series_id.startswith('C'): return _FundClass(company_info) else: return _FundSeries(company_info) except Exception as e: log.warning("Error retrieving fund information for %s: %s", contract_or_series_id, e) return None @lru_cache(maxsize=16) def get_fund_object(identifier: str) -> Optional[Union[FundCompany, FundSeries, FundCompany]]: """ Get a Fund related object by it's identifier. Args: identifier: A CIK, a series id (e.g. 'S000001234') or class id or Fund ticker (e.g. 'VFINX') Returns: A FundCompany or FundSeries or FundClass """ if re.match(r'^[CS]\d+$', identifier): identifier_type = 'Series' if identifier.startswith('S') else 'Class' fund_search_url = fund_series_search_url + f"&CIK={identifier}" elif re.match(r"^[A-Z]{4}X$", identifier): identifier_type = 'Class' fund_search_url = fund_series_search_url + f"&ticker={identifier}" elif re.match(r"^0\d{9}$", identifier): identifier_type = 'FundCompany' fund_search_url = fund_series_search_url + f"&CIK={identifier}" else: log.warning("Invalid fund identifier %s", identifier) return None # Download the fund page fund_text = download_text(fund_search_url) soup = BeautifulSoup(fund_text, "html.parser") if 'To retrieve filings, click on the CIK' not in soup.text: return None tables = soup.find_all("table") # The fund table is the 6th table on the page if len(tables) < 6: log.warning("Expected fund table not found for %s", identifier) return None fund_table = tables[5] all_series = [] fund_company:Optional[FundCompany] = None current_series:Optional[FundSeries] = None current_class:Optional[FundClass] = None for tr in fund_table.find_all('tr')[4:]: # Skip the first 4 rows as they contain headers row_data = [td.get_text().strip() for td in tr.find_all('td') if td.get_text().strip()] if not row_data: continue if re.match(r'^0\d{9}$', row_data[0]): fund_company = FundCompany(cik_or_identifier=row_data[0], fund_name=row_data[1], all_series=all_series) elif re.match(r'^S\d+$', row_data[0]): current_series = FundSeries(series_id=row_data[0], name=row_data[1], fund_company=fund_company) fund_company.all_series.append(current_series) elif re.match(r'^C\d+$', row_data[0]): class_id, class_name = row_data[0], row_data[1] ticker = row_data[2] if len(row_data) > 2 else None current_class = FundClass(class_id=class_id, name=class_name, ticker=ticker) current_class.series = current_series current_series.fund_classes.append(current_class) if identifier_type == "FundCompany": return fund_company elif identifier_type == "Series": return current_series elif identifier_type == "Class": return current_class def is_fund_ticker(identifier: str) -> bool: """ Check if an identifier is a fund ticker. Args: identifier: The identifier to check Returns: True if it's a fund ticker, False otherwise """ # Use our own implementation if identifier and isinstance(identifier, str): return bool(re.match(r"^[A-Z]{4}X$", identifier)) return False class FundData(EntityData): """ Fund-specific data container. Contains specialized properties and methods for fund entities. """ def __init__(self, **kwargs): super().__init__(**kwargs) self.series_id = kwargs.get('series_id') self.class_ids = kwargs.get('class_ids', []) self._fund_classes = kwargs.get('fund_classes', []) @property def is_fund(self) -> bool: """Check if entity is a fund.""" return True def resolve_fund_identifier(identifier): """ Convert fund tickers or series IDs to CIK. Args: identifier: Fund ticker, Series ID, or CIK Returns: CIK as integer or original identifier if conversion not possible """ if isinstance(identifier, str): # Handle Series ID (S000XXXXX) if identifier.startswith('S') and identifier[1:].isdigit(): try: # Try our direct implementation fund_info = direct_get_fund_with_filings(identifier) if fund_info and hasattr(fund_info, 'fund_cik'): return int(fund_info.fund_cik) except Exception as e: log.warning("Error resolving series ID %s: %s", identifier, e) # Handle Class ID (C000XXXXX) if identifier.startswith('C') and identifier[1:].isdigit(): try: # Try our direct implementation fund_info = direct_get_fund_with_filings(identifier) if fund_info and hasattr(fund_info, 'fund_cik'): return int(fund_info.fund_cik) except Exception as e: log.warning("Error resolving class ID %s: %s", identifier, e) # Handle fund ticker if is_fund_ticker(identifier): try: # Use our direct implementation for tickers fund_info = (identifier) if fund_info and hasattr(fund_info, 'company_cik'): return int(fund_info.company_cik) except Exception as e: log.warning("Error resolving fund ticker %s: %s", identifier, e) return identifier def get_fund_information(header): """ Extract fund information from a filing header. Args: header: Filing header Returns: Fund series and contract information """ # Import FundSeriesAndContracts here to avoid circular imports from edgar.funds import FundSeriesAndContracts if not header or not hasattr(header, 'text'): return FundSeriesAndContracts() try: # Try our direct implementation first header_text = header.text series_and_classes_contracts_text = re.search( r'(.*?)', header_text, re.DOTALL ) if series_and_classes_contracts_text: # Use our directly implemented parse_fund_data df = parse_fund_data(series_and_classes_contracts_text.group(1)) return FundSeriesAndContracts(df) except Exception as e: log.debug("Error parsing fund information directly: %s", e) # Fallback implementation - extract fund information from header directly using regex try: # Try to extract fund information from the header text with regex if header and hasattr(header, 'text'): # Look for SERIES-ID and CONTRACT-ID in the header series_matches = re.findall(r'SERIES-ID[^>]*>([^<]+)', str(header.text)) contract_matches = re.findall(r'CONTRACT-ID[^>]*>([^<]+)', str(header.text)) name_matches = re.findall(r'FILER[^>]*>.*?COMPANY-DATA[^>]*>.*?CONFORMED-NAME[^>]*>([^<]+)', str(header.text)) ticker_matches = re.findall(r'TICKER-SYMBOL[^>]*>([^<]+)', str(header.text)) # If we found any matches, create a DataFrame with the information if series_matches or contract_matches: data = [] # Join series and contract IDs as rows for i in range(max(len(series_matches), len(contract_matches))): series_id = series_matches[i] if i < len(series_matches) else None contract_id = contract_matches[i] if i < len(contract_matches) else None fund_name = name_matches[0] if name_matches else None ticker = ticker_matches[0] if ticker_matches else None data.append({ 'SeriesID': series_id, 'ContractID': contract_id, 'Fund': fund_name, 'Ticker': ticker, 'Class': f"Class {contract_id[-1].upper()}" if contract_id else None }) if data: return FundSeriesAndContracts(pd.DataFrame(data)) except Exception as e: log.warning("Error in fallback get_fund_information: %s", e) # Return an empty container if everything else fails return FundSeriesAndContracts() def parse_series_and_classes_from_html(html_content: str, cik:str) -> List[Dict]: """ Parse series and class information from the SEC series listing HTML page. This parses HTML content from the URL https://www.sec.gov/cgi-bin/browse-edgar?CIK=XXXX&scd=series which contains a structured listing of all series and classes for a fund company. Args: html_content: HTML content from the SEC webpage fund: Fund entity to associate with the series/classes Returns: List of dictionaries containing series and class information """ import re from bs4 import BeautifulSoup soup = BeautifulSoup(html_content, 'html.parser') series_data = [] # Debug information log.debug("Parsing series HTML content for fund %s", cik) # The table structure in this specific page has series and classes # organized in a specific way with indentation levels try: # Find the main table - in Kinetics HTML, it's the main table in the content area tables = soup.find_all('table') # Find the table that's likely to contain the series information # In SEC pages, it's typically the one with class/contract and series information table = None for t in tables: # Look for rows with series or class info if t.find('tr') and re.search(r'Series|Class/Contract', str(t)): table = t break if not table: log.warning("No suitable table found in series HTML content") return [] current_series = None series_data = [] # Loop through all rows and process them rows = table.find_all('tr') # Debug information log.debug("Found %d rows in the table", len(rows)) # Process all rows since the table structure might vary for _row_idx, row in enumerate(rows): cells = row.find_all('td') if not cells or len(cells) < 3: continue # Check if this is a series row - marked by an S000 ID in a cell with a link series_cell = None series_id = None series_name = None # Series IDs are normally in the form S###### for cell in cells: # Look for tags with S IDs links = cell.find_all('a', href=True) for link in links: if re.search(r'S\d{6,}', link.text): series_id = re.search(r'S\d{6,}', link.text).group(0) series_cell = cell break if series_cell: break # If we found a series ID, extract its name and create a series entry if series_id: # Try to find the series name in the next cell or in the same row series_name = None for cell in cells: # Look for a cell with a link that's not the series ID if cell != series_cell and cell.find('a'): # Check if the link text doesn't match the series ID - it's likely the name link_text = cell.find('a').text.strip() if link_text and series_id not in link_text: series_name = link_text break # If we couldn't find a name, use a default if not series_name: series_name = f"Series {series_id}" # Create a new series entry current_series = { 'series_id': series_id, 'series_name': series_name, 'classes': [] } series_data.append(current_series) log.debug("Found series: %s - %s", series_id, series_name) # Check if this row contains a class - marked by a C000 ID # Classes appear after a series and are indented elif current_series: class_id = None class_name = None class_ticker = "" # Look for class IDs in the form C###### for cell in cells: # Search for C IDs in links links = cell.find_all('a', href=True) for link in links: if re.search(r'C\d{6,}', link.text): class_id = re.search(r'C\d{6,}', link.text).group(0) break if class_id: break if class_id: # Find the class name - usually in a cell after the ID for cell_idx, cell in enumerate(cells): if class_id in str(cell) and cell_idx + 1 < len(cells): # Class name is often in the next cell class_name = cells[cell_idx + 1].text.strip() break parts = class_name.split("\n") class_name = parts[1] if len(parts) > 2: class_ticker = parts[2].strip() # If we couldn't find a name, use a default if not class_name: class_name = f"Class {class_id}" # Add this class to the current series current_series['classes'].append({ 'class_id': class_id, 'class_name': class_name, 'ticker': class_ticker }) log.debug("Found class: %s - %s (%s)", class_id, class_name, class_ticker) # Debug information log.debug("Found %d series with classes", len(series_data)) except Exception as e: log.warning("Error parsing series HTML: %s", e) import traceback log.debug(traceback.format_exc()) return series_data def get_series_and_classes_from_sec(cik: Union[str, int]) -> List[Dict]: """ Directly fetch and parse series and class information from the SEC website. This uses the SEC's series listing page which provides a comprehensive view of all series and classes for a fund company. Args: cik: The CIK of the fund company Returns: List of dictionaries containing parsed series and class information """ # Format CIK properly for URL cik_str = str(cik).zfill(10) url = fund_series_direct_url.format(cik_str) # Download the HTML content html_content = download_text(url) # Check if we received valid content if 'No matching' in html_content or 'series for cik' not in html_content.lower(): log.debug("No series information found for CIK %s", cik) return [] return parse_series_and_classes_from_html(html_content, cik)