Files
edgartools/venv/lib/python3.10/site-packages/edgar/funds/data.py
2025-12-09 12:13:01 +01:00

805 lines
28 KiB
Python

"""
Data structures and functions for working with fund data.
This module provides the FundData class and related functions for
accessing and manipulating fund data.
"""
import logging
import re
from functools import lru_cache
from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union
if TYPE_CHECKING:
from bs4 import Tag
import pandas as pd
import pyarrow as pa
from bs4 import BeautifulSoup
from edgar._filings import Filings
from edgar.datatools import drop_duplicates_pyarrow
from edgar.entity.data import EntityData
from edgar.funds.core import FundClass, FundCompany, FundSeries
from edgar.httprequests import download_text
log = logging.getLogger(__name__)
#
# Direct implementations of fund-related functionality.
# These replace the legacy module dependencies.
#
# URL constants for fund searches
# Series search endpoint; get_fund_object appends "&CIK=<id>" or "&ticker=<symbol>" to it.
fund_series_search_url = "https://www.sec.gov/cgi-bin/series?company="
# browse-edgar page template; direct_get_fund_with_filings formats an identical
# template with a class (C...) or series (S...) ID.
fund_class_or_series_search_url = "https://www.sec.gov/cgi-bin/browse-edgar?CIK={}"
# Series listing template; get_series_and_classes_from_sec fills in the CIK
# zero-padded to 10 digits.
fund_series_direct_url = "https://www.sec.gov/cgi-bin/browse-edgar?CIK={}&scd=series"
class _FundDTO:
    """
    Plain value holder describing a single fund class.

    Used internally to hand fund data back from the direct implementations
    in this module. It is not part of the public API; use the Fund class
    from edgar.funds.core instead.
    """

    def __init__(self, company_cik, company_name, name, series, ticker,
                 class_contract_id, class_contract_name):
        # Owning company
        self.company_cik, self.company_name = company_cik, company_name
        # Fund / series level
        self.name, self.series, self.ticker = name, series, ticker
        # Share class (contract) level
        self.class_contract_id = class_contract_id
        self.class_contract_name = class_contract_name

    def __str__(self):
        # Rendered as "<name> - <ticker> [<class contract id>]"
        return "{} - {} [{}]".format(self.name, self.ticker, self.class_contract_id)
# Parse SGML fund data (directly implemented)
def parse_fund_data(series_sgml_data: str) -> pd.DataFrame:
    """
    Parse the SGML text containing fund series and class information.

    One row is produced per CLASS-CONTRACT inside each SERIES block, so a
    series with several share classes yields several rows that share the
    same series-level values. A series without any closed CLASS-CONTRACT
    block yields a single row built from the whole series block.

    Args:
        series_sgml_data: SGML text with SERIES-AND-CLASSES-CONTRACTS-DATA

    Returns:
        DataFrame with the columns Fund, Ticker, SeriesID, ContractID,
        Class and CIK. Missing class-level values are empty strings.

    Example SGML data:
        <SERIES-AND-CLASSES-CONTRACTS-DATA>
        <EXISTING-SERIES-AND-CLASSES-CONTRACTS>
        <SERIES>
        <OWNER-CIK>0001090372
        <SERIES-ID>S000071967
        <SERIES-NAME>Jacob Forward ETF
        <CLASS-CONTRACT>
        <CLASS-CONTRACT-ID>C000227599
        <CLASS-CONTRACT-NAME>Jacob Forward ETF
        <CLASS-CONTRACT-TICKER-SYMBOL>JFWD
        </CLASS-CONTRACT>
        </SERIES>
        </EXISTING-SERIES-AND-CLASSES-CONTRACTS>
        </SERIES-AND-CLASSES-CONTRACTS-DATA>
    """
    series_re = re.compile(r'<SERIES>(.*?)</SERIES>', re.DOTALL)
    class_re = re.compile(r'<CLASS-CONTRACT>(.*?)</CLASS-CONTRACT>', re.DOTALL)
    # Matches "<TAG>value" pairs; the value runs up to the next tag
    data_re = re.compile(r'<([^>]+)>([^<]*)')

    series_fields = ("OWNER-CIK", "SERIES-ID", "SERIES-NAME")
    class_fields = ("CLASS-CONTRACT-ID", "CLASS-CONTRACT-NAME", "CLASS-CONTRACT-TICKER-SYMBOL")

    def tag_values(text: str) -> Dict[str, str]:
        return {tag: value.strip() for tag, value in data_re.findall(text)}

    rows = []
    for block in series_re.findall(series_sgml_data):
        series_values = tag_values(block)
        # Fall back to the whole series block when no closed CLASS-CONTRACT
        # section exists, so such input still produces one row.
        class_blocks = class_re.findall(block) or [block]
        for class_block in class_blocks:
            class_values = tag_values(class_block)
            row = {field: series_values[field] for field in series_fields if field in series_values}
            row.update({field: class_values.get(field, "") for field in class_fields})
            rows.append(row)

    df = pd.DataFrame(rows, columns=[*series_fields, *class_fields])
    # Rename columns for consistency and put them in display order
    renamed = df.rename(columns={
        "OWNER-CIK": "CIK",
        "SERIES-ID": "SeriesID",
        "SERIES-NAME": "Fund",
        "CLASS-CONTRACT-ID": "ContractID",
        "CLASS-CONTRACT-NAME": "Class",
        "CLASS-CONTRACT-TICKER-SYMBOL": "Ticker",
    })
    return renamed.filter(["Fund", "Ticker", "SeriesID", "ContractID", "Class", "CIK"])
# Direct implementation of FundCompanyInfo
class _FundCompanyInfo:
    """
    Internal helper class representing the fund company.

    This is parsed from the results page when we get the fund class or series.
    Not part of the public API - use the Fund class from edgar.funds.core instead.
    """

    def __init__(self,
                 name: str,
                 cik: str,
                 ident_info: Dict[str, str],
                 addresses: List[str],
                 filings: 'Filings'):
        self.name: str = name
        self.cik: str = cik
        # Key/value pairs scraped from the page's "identInfo" paragraph
        self.ident_info: Dict[str, str] = ident_info
        self.addresses: List[str] = addresses
        self.filings = filings

    @property
    def state(self):
        """Value of the "State location" entry, or None when absent."""
        return self.ident_info.get("State location", None)

    @property
    def state_of_incorporation(self):
        """Value of the "State of Inc." entry, or None when absent."""
        return self.ident_info.get("State of Inc.", None)

    # Deliberately not wrapped in lru_cache: a cache on an instance method is
    # keyed on `self`, keeps the instance alive, and the work here is one regex.
    def id_and_name(self, contract_or_series: str) -> Optional[Tuple[str, str]]:
        """
        Split an ident_info entry such as "S000001234 Some Fund" into its ID and name.

        Args:
            contract_or_series: The ident_info key to read (e.g. "Series" or "Class/Contract")

        Returns:
            None when the key is missing or empty; otherwise an (id, name) tuple.
            Either element is "" when it cannot be extracted.
        """
        class_contract_str = self.ident_info.get(contract_or_series, None)
        if not class_contract_str:
            return None
        match = re.match(r'([CS]\d+)(?:\s(.*))?', class_contract_str)
        if not match:
            return "", ""
        return match.group(1), match.group(2) or ""

    @staticmethod
    def _parse_ident_info(ident_text: str) -> Dict[str, str]:
        """Turn the "Key: value | Key: value" text of the identInfo paragraph into a dict."""
        parsed: Dict[str, str] = {}
        for line in ident_text.replace("|", "\n").strip().split("\n"):
            # Split on the first colon only so values that contain ':' survive
            key, separator, value = line.partition(":")
            if separator:
                parsed[key.strip()] = value.strip().replace("\xa0", " ")
        return parsed

    @classmethod
    def from_html(cls, company_info_html: Union[str, 'Tag']):
        """
        Build the company info from the HTML of a browse-edgar results page.

        Args:
            company_info_html: The page HTML

        Returns:
            A _FundCompanyInfo, or None when the expected page structure
            (the "contentDiv" div or the company name inside it) is missing.
        """
        soup = BeautifulSoup(company_info_html, features="html.parser")

        content_div = soup.find("div", {"id": "contentDiv"})
        if content_div is None:
            # Should not reach here, but this is precautionary
            log.warning("Did not find div with id 'contentDiv'")
            return None

        company_info_div = content_div.find("div", class_="companyInfo")
        company_name_tag = (company_info_div.find('span', class_='companyName')
                            if company_info_div is not None else None)
        if company_name_tag is None or company_name_tag.a is None:
            log.warning("Did not find the company name in div 'companyInfo'")
            return None
        company_name = company_name_tag.text.split('CIK')[0].strip()
        cik = company_name_tag.a.text.split(' ')[0]

        # Turn <br> into newlines so each identifying item lands on its own line
        for tag in company_info_div.find_all('br'):
            tag.replace_with('\n')
        ident_info = company_info_div.find('p', class_='identInfo')
        ident_info_dict = cls._parse_ident_info(ident_info.get_text() if ident_info is not None else "")

        # Addresses
        addresses = [re.sub(r'\n\s+', '\n', mailer_div.text.strip())
                     for mailer_div in content_div.find_all("div", class_="mailer")]

        filing_index = cls._extract_filings(soup, company_name, cik)
        return cls(name=company_name,
                   cik=cik,
                   filings=Filings(filing_index=filing_index),
                   ident_info=ident_info_dict,
                   addresses=addresses)

    @classmethod
    def _extract_filings(cls, soup, company_name: str, cik: str):
        """
        Build a pyarrow filing index from the "tableFile2" table of a results page.

        Args:
            soup: Parsed results page
            company_name: Value written to every row's "company" column
            cik: Value written (as an int) to every row's "cik" column

        Returns:
            pyarrow Table with the columns form, company, cik, filing_date and
            accession_number. The table is empty when the page has no filings table.
        """
        from datetime import datetime

        import pyarrow as pa

        forms, accession_nos, filing_dates = [], [], []
        filings_table = soup.find("table", class_="tableFile2")
        # Treat a missing table as "no filings" rather than failing
        rows = filings_table.find_all("tr")[1:] if filings_table is not None else []
        for row in rows:
            cells = row.find_all("td")
            link = cells[1].find("a") if len(cells) > 3 else None
            if link is None:
                # Not a filing row (too few cells or no document link)
                continue
            # The accession number is the last path segment of the index link
            accession_no = link.attrs["href"].split("/")[-1].replace("-index.htm", "")
            filing_date = datetime.strptime(cells[3].text.strip(), '%Y-%m-%d')
            forms.append(cells[0].text)
            accession_nos.append(accession_no)
            filing_dates.append(filing_date)

        schema = pa.schema([
            ('form', pa.string()),
            ('company', pa.string()),
            ('cik', pa.int32()),
            ('filing_date', pa.date32()),
            ('accession_number', pa.string()),
        ])
        return pa.Table.from_arrays(arrays=[
            pa.array(forms, type=pa.string()),
            pa.array([company_name] * len(forms), type=pa.string()),
            pa.array([int(cik)] * len(forms), type=pa.int32()),
            pa.array(filing_dates, type=pa.date32()),
            pa.array(accession_nos, type=pa.string()),
        ], schema=schema)
# Direct implementation of FundClassOrSeries and subclasses
class _FundClassOrSeries:
    """
    Internal base class for fund classes and series.

    Wraps the company info parsed from a results page and exposes the ID and
    name stored under one of its ident_info keys.
    Not part of the public API - use the FundClass and FundSeries classes
    from edgar.funds.core instead.
    """

    def __init__(self, company_info: '_FundCompanyInfo', contract_or_series: str):
        self.fund = company_info
        # ident_info key holding "<id> <name>" for this object
        self._contract_or_series = contract_or_series

    @property
    def fund_cik(self):
        return self.fund.cik

    @property
    def fund_name(self):
        return self.fund.name

    # Deliberately not wrapped in lru_cache: a cache on an instance method is
    # keyed on `self`, keeps the instance alive and can return stale data if
    # ident_info changes. The lookup is a single regex match.
    def _id_and_name(self) -> Optional[Tuple[str, str]]:
        """
        Return (id, name) parsed from the ident_info entry, or None when the
        entry is missing or empty. Either element is "" when it cannot be extracted.
        """
        class_contract_str = self.fund.ident_info.get(self._contract_or_series, None)
        if not class_contract_str:
            return None
        match = re.match(r'([CS]\d+)(?:\s(.*))?', class_contract_str)
        if not match:
            return "", ""
        return match.group(1), match.group(2) or ""

    @property
    def id(self):
        """The class or series ID, or None when ident_info has no entry."""
        id_and_name = self._id_and_name()
        return id_and_name[0] if id_and_name else None

    @property
    def name(self):
        """The class or series name, or None when ident_info has no entry."""
        id_and_name = self._id_and_name()
        return id_and_name[1] if id_and_name else None

    @property
    def description(self):
        return f"{self.fund_name} {self.id} {self.name}"

    @property
    def filings(self):
        return self.fund.filings
class _FundClass(_FundClassOrSeries):
    """
    Fund class (contract) view over a parsed company page.

    Reads its ID and name from the "Class/Contract" ident_info entry.
    Internal only - use the FundClass class from edgar.funds.core instead.
    """

    def __init__(self, company_info: '_FundCompanyInfo'):
        super().__init__(company_info, contract_or_series="Class/Contract")

    @property
    def ticker(self):
        """The "Ticker Symbol" ident_info entry, or None when absent."""
        ident_info = self.fund.ident_info
        return ident_info.get("Ticker Symbol")

    @property
    def description(self):
        # Same as the base description with the ticker (or "") appended
        ticker_text = self.ticker or ''
        return "{} {} {} {}".format(self.fund_name, self.id, self.name, ticker_text)
class _FundSeries(_FundClassOrSeries):
    """
    Fund series view over a parsed company page.

    Reads its ID and name from the "Series" ident_info entry.
    Internal only - use the FundSeries class from edgar.funds.core instead.
    """

    def __init__(self, company_info: '_FundCompanyInfo'):
        super().__init__(company_info, contract_or_series="Series")
# Direct implementation of get_fund_with_filings
def direct_get_fund_with_filings(contract_or_series_id: str):
    """
    Get fund class or series information including filings from the SEC website.

    The first results page supplies the company info; further pages are
    fetched until one yields no filings, and the combined filing index is
    de-duplicated by accession number.

    Args:
        contract_or_series_id: Series ID (S...) or Class ID (C...)

    Returns:
        _FundClass or _FundSeries object. None when the ID is malformed, the
        SEC reports no match, the page cannot be parsed, or any error occurs
        (errors are logged as warnings).
    """
    # URL template to search for a fund by class or series ID
    fund_class_or_series_search_url = "https://www.sec.gov/cgi-bin/browse-edgar?CIK={}"

    # Anchored so that only a complete ID is ever formatted into the URL
    if not isinstance(contract_or_series_id, str) or not re.match(r"^[CS]\d+$", contract_or_series_id):
        return None

    base_url = fund_class_or_series_search_url.format(contract_or_series_id)
    page_size = 100

    try:
        # First page: offsets 0 .. page_size - 1
        fund_text = download_text(f"{base_url}&start=0&count={page_size}")
        if "No matching" in fund_text:
            return None

        # Company Info
        company_info = _FundCompanyInfo.from_html(fund_text)
        if company_info is None:
            return None

        # Get the remaining filings. The next page starts right after the
        # first one; starting at page_size + 1 would skip one filing.
        filing_index = company_info.filings.data
        start = page_size
        while True:
            next_page = f"{base_url}&start={start}&count={page_size}"
            soup = BeautifulSoup(download_text(next_page), features="html.parser")
            filing_index_on_page = _FundCompanyInfo._extract_filings(soup, company_info.name, company_info.cik)
            if len(filing_index_on_page) == 0:
                break
            filing_index = pa.concat_tables([filing_index, filing_index_on_page])
            start += page_size

        # Drop duplicate filings by accession number
        filing_index = drop_duplicates_pyarrow(filing_index, column_name='accession_number')
        company_info.filings = Filings(filing_index=filing_index)

        if contract_or_series_id.startswith('C'):
            return _FundClass(company_info)
        return _FundSeries(company_info)
    except Exception as e:
        # Best-effort lookup: report the problem and signal "not found"
        log.warning("Error retrieving fund information for %s: %s", contract_or_series_id, e)
        return None
@lru_cache(maxsize=16)
def get_fund_object(identifier: str) -> Optional[Union[FundCompany, FundSeries, FundClass]]:
    """
    Get a Fund related object by its identifier.

    Downloads the SEC series search page for the identifier and builds the
    company / series / class objects listed in its fund table.

    Args:
        identifier: A 10-digit zero-padded CIK, a series id (e.g. 'S000001234'),
            a class id (e.g. 'C000001234') or a fund ticker (e.g. 'VFINX')

    Returns:
        A FundCompany for a CIK, a FundSeries for a series id, a FundClass for
        a class id or ticker. For series and classes the row matching the
        identifier is preferred; when no row matches, the last one listed is
        returned. None when the identifier is invalid or the page has no fund table.
    """
    if re.match(r'^[CS]\d+$', identifier):
        identifier_type = 'Series' if identifier.startswith('S') else 'Class'
        fund_search_url = fund_series_search_url + f"&CIK={identifier}"
    elif re.match(r"^[A-Z]{4}X$", identifier):
        identifier_type = 'Class'
        fund_search_url = fund_series_search_url + f"&ticker={identifier}"
    elif re.match(r"^0\d{9}$", identifier):
        identifier_type = 'FundCompany'
        fund_search_url = fund_series_search_url + f"&CIK={identifier}"
    else:
        log.warning("Invalid fund identifier %s", identifier)
        return None

    # Download the fund page
    fund_text = download_text(fund_search_url)
    soup = BeautifulSoup(fund_text, "html.parser")
    if 'To retrieve filings, click on the CIK' not in soup.text:
        return None

    tables = soup.find_all("table")
    # The fund table is the 6th table on the page
    if len(tables) < 6:
        log.warning("Expected fund table not found for %s", identifier)
        return None
    fund_table = tables[5]

    all_series = []
    fund_company: Optional[FundCompany] = None
    current_series: Optional[FundSeries] = None
    current_class: Optional[FundClass] = None
    # Rows whose ID (or ticker) equals the requested identifier
    matched_series: Optional[FundSeries] = None
    matched_class: Optional[FundClass] = None

    for tr in fund_table.find_all('tr')[4:]:  # Skip the first 4 rows as they contain headers
        cell_texts = (td.get_text().strip() for td in tr.find_all('td'))
        row_data = [text for text in cell_texts if text]
        # Every row of interest carries at least an ID and a name
        if len(row_data) < 2:
            continue
        row_id, row_name = row_data[0], row_data[1]

        if re.match(r'^0\d{9}$', row_id):
            fund_company = FundCompany(cik_or_identifier=row_id, fund_name=row_name, all_series=all_series)
        elif re.match(r'^S\d+$', row_id):
            current_series = FundSeries(series_id=row_id, name=row_name, fund_company=fund_company)
            if fund_company is not None:
                fund_company.all_series.append(current_series)
            if row_id == identifier:
                matched_series = current_series
        elif re.match(r'^C\d+$', row_id):
            ticker = row_data[2] if len(row_data) > 2 else None
            current_class = FundClass(class_id=row_id, name=row_name, ticker=ticker)
            current_class.series = current_series
            if current_series is not None:
                current_series.fund_classes.append(current_class)
            if identifier in (row_id, ticker):
                matched_class = current_class

    if identifier_type == "FundCompany":
        return fund_company
    if identifier_type == "Series":
        return matched_series if matched_series is not None else current_series
    # identifier_type == "Class"
    return matched_class if matched_class is not None else current_class
def is_fund_ticker(identifier: str) -> bool:
    """
    Tell whether an identifier has the shape of a fund ticker.

    A fund ticker here is four uppercase letters followed by 'X'.

    Args:
        identifier: The identifier to check

    Returns:
        True if it's a fund ticker, False otherwise (including for empty
        or non-string input)
    """
    # Guard clause: only non-empty strings can be tickers
    if not (identifier and isinstance(identifier, str)):
        return False
    return re.match(r"^[A-Z]{4}X$", identifier) is not None
class FundData(EntityData):
    """
    Entity data specialised for funds.

    Adds the fund's series id, class ids and fund classes on top of what
    EntityData stores.
    """

    def __init__(self, **kwargs):
        # The full keyword set is forwarded to EntityData unchanged
        super().__init__(**kwargs)
        self._fund_classes = kwargs.get('fund_classes', [])
        self.class_ids = kwargs.get('class_ids', [])
        self.series_id = kwargs.get('series_id', None)

    @property
    def is_fund(self) -> bool:
        """Always True: a FundData instance describes a fund."""
        return True
def resolve_fund_identifier(identifier):
    """
    Convert fund tickers or series/class IDs to a CIK.

    Args:
        identifier: Fund ticker, Series ID, Class ID, or CIK

    Returns:
        CIK as integer, or the original identifier if conversion is not
        possible (non-string input, unknown format, or a failed lookup)
    """
    if not isinstance(identifier, str):
        return identifier

    # Handle Series ID (S000XXXXX) and Class ID (C000XXXXX)
    if identifier[:1] in ('S', 'C') and identifier[1:].isdigit():
        id_kind = "series" if identifier.startswith('S') else "class"
        try:
            fund_info = direct_get_fund_with_filings(identifier)
            if fund_info and hasattr(fund_info, 'fund_cik'):
                return int(fund_info.fund_cik)
        except Exception as e:
            log.warning("Error resolving %s ID %s: %s", id_kind, identifier, e)

    # Handle fund ticker
    if is_fund_ticker(identifier):
        try:
            # get_fund_object is the ticker-capable lookup in this module
            # (the previous code assigned the identifier itself and so never resolved).
            fund_info = get_fund_object(identifier)
            cik = getattr(fund_info, 'company_cik', None)
            if cik is None:
                # get_fund_object sets `.series` on the class it returns.
                # NOTE(review): `.fund_company` and `.cik` are assumed attribute
                # names on FundSeries / FundCompany - confirm against edgar.funds.core.
                series = getattr(fund_info, 'series', None)
                company = getattr(series, 'fund_company', None)
                cik = getattr(company, 'cik', None)
            if cik is not None:
                return int(cik)
        except Exception as e:
            log.warning("Error resolving fund ticker %s: %s", identifier, e)

    return identifier
def get_fund_information(header):
    """
    Build the fund series/contract container for a filing header.

    The SERIES-AND-CLASSES-CONTRACTS-DATA section of the header text is
    parsed when present. Otherwise individual tags are picked out of the
    text with regular expressions. An empty container is returned when
    neither approach finds anything.

    Args:
        header: Filing header

    Returns:
        Fund series and contract information
    """
    # Import FundSeriesAndContracts here to avoid circular imports
    from edgar.funds import FundSeriesAndContracts

    if not header or not hasattr(header, 'text'):
        return FundSeriesAndContracts()

    # Preferred path: the structured SGML section
    try:
        sgml_section = re.search(
            r'<SERIES-AND-CLASSES-CONTRACTS-DATA>(.*?)</SERIES-AND-CLASSES-CONTRACTS-DATA>',
            header.text,
            re.DOTALL
        )
        if sgml_section:
            parsed = parse_fund_data(sgml_section.group(1))
            return FundSeriesAndContracts(parsed)
    except Exception as e:
        log.debug("Error parsing fund information directly: %s", e)

    # Fallback path: pull individual tags straight out of the header text
    try:
        if header and hasattr(header, 'text'):
            header_str = str(header.text)
            series_ids = re.findall(r'SERIES-ID[^>]*>([^<]+)', header_str)
            contract_ids = re.findall(r'CONTRACT-ID[^>]*>([^<]+)', header_str)
            names = re.findall(r'FILER[^>]*>.*?COMPANY-DATA[^>]*>.*?CONFORMED-NAME[^>]*>([^<]+)', header_str)
            tickers = re.findall(r'TICKER-SYMBOL[^>]*>([^<]+)', header_str)

            if series_ids or contract_ids:
                # Only the first name and ticker found are used, on every row
                fund_name = names[0] if names else None
                ticker = tickers[0] if tickers else None

                # Pair series and contract IDs by position, padding the shorter list with None
                row_count = max(len(series_ids), len(contract_ids))
                padded_series = series_ids + [None] * (row_count - len(series_ids))
                padded_contracts = contract_ids + [None] * (row_count - len(contract_ids))
                records = [
                    {
                        'SeriesID': series_id,
                        'ContractID': contract_id,
                        'Fund': fund_name,
                        'Ticker': ticker,
                        'Class': f"Class {contract_id[-1].upper()}" if contract_id else None
                    }
                    for series_id, contract_id in zip(padded_series, padded_contracts)
                ]
                if records:
                    return FundSeriesAndContracts(pd.DataFrame(records))
    except Exception as e:
        log.warning("Error in fallback get_fund_information: %s", e)

    # Return an empty container if everything else fails
    return FundSeriesAndContracts()
def parse_series_and_classes_from_html(html_content: str, cik: str) -> List[Dict]:
    """
    Parse series and class information from the SEC series listing HTML page.

    This parses HTML content from the URL https://www.sec.gov/cgi-bin/browse-edgar?CIK=XXXX&scd=series
    which contains a structured listing of all series and classes for a fund company.

    Args:
        html_content: HTML content from the SEC webpage
        cik: CIK of the fund company; only used in log messages

    Returns:
        List of dictionaries, one per series, with the keys 'series_id',
        'series_name' and 'classes'. Each entry of 'classes' is a dictionary
        with 'class_id', 'class_name' and 'ticker'. If an unexpected error
        occurs, whatever was parsed up to that point is returned.
    """
    import re

    from bs4 import BeautifulSoup

    soup = BeautifulSoup(html_content, 'html.parser')
    series_data = []
    log.debug("Parsing series HTML content for fund %s", cik)

    # Series IDs look like S###### and class IDs like C######
    series_id_re = re.compile(r'S\d{6,}')
    class_id_re = re.compile(r'C\d{6,}')

    try:
        # Use the first table that has rows and mentions series or class/contract
        table = None
        for candidate in soup.find_all('table'):
            if candidate.find('tr') and re.search(r'Series|Class/Contract', str(candidate)):
                table = candidate
                break
        if table is None:
            log.warning("No suitable table found in series HTML content")
            return []

        rows = table.find_all('tr')
        log.debug("Found %d rows in the table", len(rows))

        current_series = None
        for row in rows:
            cells = row.find_all('td')
            if len(cells) < 3:
                continue

            # A row with a linked series ID starts a new series
            series_id, series_cell = _find_linked_id(cells, series_id_re)
            if series_id:
                series_name = _find_series_name(cells, series_cell, series_id)
                current_series = {
                    'series_id': series_id,
                    'series_name': series_name,
                    'classes': []
                }
                series_data.append(current_series)
                log.debug("Found series: %s - %s", series_id, series_name)
                continue

            # Class rows only count once a series has been seen
            if current_series is None:
                continue
            class_id, _ = _find_linked_id(cells, class_id_re)
            if not class_id:
                continue

            class_name, class_ticker = _class_name_and_ticker(cells, class_id)
            current_series['classes'].append({
                'class_id': class_id,
                'class_name': class_name,
                'ticker': class_ticker
            })
            log.debug("Found class: %s - %s (%s)", class_id, class_name, class_ticker)

        log.debug("Found %d series with classes", len(series_data))
    except Exception as e:
        log.warning("Error parsing series HTML: %s", e)
        import traceback
        log.debug(traceback.format_exc())

    return series_data


def _find_linked_id(cells, id_pattern):
    """Return (id, cell) for the first link text matching id_pattern, or (None, None)."""
    for cell in cells:
        for link in cell.find_all('a', href=True):
            match = id_pattern.search(link.text)
            if match:
                return match.group(0), cell
    return None, None


def _find_series_name(cells, series_cell, series_id):
    """Return the text of the first link outside series_cell that is not the series ID."""
    for cell in cells:
        link = cell.find('a') if cell != series_cell else None
        if link is None:
            continue
        link_text = link.text.strip()
        if link_text and series_id not in link_text:
            return link_text
    # If we couldn't find a name, use a default
    return f"Series {series_id}"


def _class_name_and_ticker(cells, class_id):
    """
    Return (name, ticker) read from the cell following the one that holds class_id.

    The following cell's text is split on newlines: the second line is taken
    as the name and the third, when present, as the ticker. A single-line
    cell is used as the name as is. The name defaults to "Class <id>" and
    the ticker to "" when they cannot be found.
    """
    default_name = f"Class {class_id}"
    next_cell_text = None
    for cell_idx, cell in enumerate(cells[:-1]):
        if class_id in str(cell):
            next_cell_text = cells[cell_idx + 1].text.strip()
            break
    if not next_cell_text:
        # No following cell, or an empty one: nothing to split
        return default_name, ""

    parts = next_cell_text.split("\n")
    class_name = parts[1] if len(parts) > 1 else parts[0]
    class_ticker = parts[2].strip() if len(parts) > 2 else ""
    return class_name or default_name, class_ticker
def get_series_and_classes_from_sec(cik: Union[str, int]) -> List[Dict]:
    """
    Download the SEC series listing page for a fund company and parse it.

    The listing page shows all series and classes of the company; the
    parsing itself is done by parse_series_and_classes_from_html.

    Args:
        cik: The CIK of the fund company

    Returns:
        List of dictionaries containing parsed series and class information,
        or an empty list when the page holds no series listing
    """
    # The URL takes the CIK zero-padded to 10 digits
    padded_cik = str(cik).zfill(10)
    page = download_text(fund_series_direct_url.format(padded_cik))

    # A usable page has no "No matching" notice and mentions "series for cik"
    has_series_listing = 'No matching' not in page and 'series for cik' in page.lower()
    if not has_series_listing:
        log.debug("No series information found for CIK %s", cik)
        return []

    return parse_series_and_classes_from_html(page, cik)