Initial commit
This commit is contained in:
@@ -0,0 +1,40 @@
|
||||
"""
|
||||
13F Holdings Report Parser
|
||||
|
||||
Parses SEC Form 13F-HR (Quarterly Holdings Report) filings from institutional investment managers.
|
||||
Supports both XML format (2013+) and TXT format (2012 and earlier).
|
||||
"""
|
||||
|
||||
from edgar.thirteenf.models import (
|
||||
ThirteenF,
|
||||
THIRTEENF_FORMS,
|
||||
FilingManager,
|
||||
OtherManager,
|
||||
CoverPage,
|
||||
SummaryPage,
|
||||
Signature,
|
||||
PrimaryDocument13F,
|
||||
format_date,
|
||||
)
|
||||
|
||||
# For backward compatibility, also export parser functions
|
||||
from edgar.thirteenf.parsers import (
|
||||
parse_primary_document_xml,
|
||||
parse_infotable_xml,
|
||||
parse_infotable_txt,
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
'ThirteenF',
|
||||
'THIRTEENF_FORMS',
|
||||
'FilingManager',
|
||||
'OtherManager',
|
||||
'CoverPage',
|
||||
'SummaryPage',
|
||||
'Signature',
|
||||
'PrimaryDocument13F',
|
||||
'format_date',
|
||||
'parse_primary_document_xml',
|
||||
'parse_infotable_xml',
|
||||
'parse_infotable_txt',
|
||||
]
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,210 @@
|
||||
"""Portfolio manager lookup functionality for 13F filings."""
|
||||
|
||||
import json
|
||||
from functools import lru_cache
|
||||
from pathlib import Path
|
||||
|
||||
__all__ = [
|
||||
'lookup_portfolio_managers',
|
||||
'is_filing_signer_likely_portfolio_manager',
|
||||
]
|
||||
|
||||
|
||||
def lookup_portfolio_managers(company_name: str, cik: int = None, include_approximate: bool = False) -> list[dict]:
    """
    Look up known portfolio managers for a company.

    Uses a curated database of well-known fund managers loaded from an external
    JSON file. The data is compiled from public sources and may be incomplete
    or out of date.

    Args:
        company_name: Company name to search for.
        cik: Optional CIK; when given it is tried first for more accurate matching.
        include_approximate: If True, non-active managers are included as well.

    Returns:
        list[dict]: Portfolio manager records, or an empty list when nothing
        matches or the database cannot be loaded.
    """
    try:
        database = _load_portfolio_manager_db()

        # A CIK hit is authoritative, so prefer it over fuzzy name matching.
        if cik:
            cik_matches = _search_manager_database_by_cik(database, cik, include_approximate)
            if cik_matches:
                return cik_matches

        # No CIK supplied (or no CIK hit): fall back to name-based matching.
        return _search_manager_database(database, company_name, include_approximate)
    except Exception as exc:
        # Best-effort lookup: a broken database must never break the caller.
        import warnings
        warnings.warn(f"Could not load portfolio manager database: {exc}")
        return []
|
||||
|
||||
|
||||
@lru_cache(maxsize=1)
|
||||
def _load_portfolio_manager_db() -> dict:
|
||||
"""
|
||||
Load the portfolio manager database from external JSON file.
|
||||
|
||||
Returns:
|
||||
dict: The loaded database, or empty dict if file not found
|
||||
"""
|
||||
# Try to load from external JSON file
|
||||
data_file = Path(__file__).parent.parent / 'reference' / 'data' / 'portfolio_managers.json'
|
||||
|
||||
if data_file.exists():
|
||||
try:
|
||||
with open(data_file, 'r', encoding='utf-8') as f:
|
||||
return json.load(f)
|
||||
except (json.JSONDecodeError, IOError) as e:
|
||||
import warnings
|
||||
warnings.warn(f"Could not parse portfolio manager database: {e}")
|
||||
return {}
|
||||
else:
|
||||
# Fallback to basic hardcoded database for backwards compatibility
|
||||
return {
|
||||
"metadata": {
|
||||
"version": "fallback",
|
||||
"description": "Minimal fallback database",
|
||||
"total_companies": 3,
|
||||
"last_updated": "2024-12-01"
|
||||
},
|
||||
"managers": {
|
||||
"berkshire_hathaway": {
|
||||
"company_name": "Berkshire Hathaway Inc",
|
||||
"match_patterns": ["berkshire hathaway", "brk", "berkshire"],
|
||||
"managers": [
|
||||
{
|
||||
"name": "Warren Buffett",
|
||||
"title": "Chairman & CEO",
|
||||
"status": "active",
|
||||
"confidence": "high",
|
||||
"last_verified": "2024-12-01"
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
def _search_manager_database(db: dict, company_name: str, include_approximate: bool = False) -> list[dict]:
|
||||
"""
|
||||
Search the manager database for a company.
|
||||
|
||||
Args:
|
||||
db: The loaded database dictionary
|
||||
company_name: Company name to search for
|
||||
include_approximate: Whether to include non-active managers
|
||||
|
||||
Returns:
|
||||
list[dict]: List of matching managers
|
||||
"""
|
||||
if not db or 'managers' not in db:
|
||||
return []
|
||||
|
||||
managers_data = db['managers']
|
||||
normalized_name = company_name.lower()
|
||||
|
||||
# Search through all companies
|
||||
for company_key, company_data in managers_data.items():
|
||||
# Check match patterns
|
||||
match_patterns = company_data.get('match_patterns', [company_key])
|
||||
|
||||
for pattern in match_patterns:
|
||||
if pattern.lower() in normalized_name:
|
||||
managers = company_data.get('managers', [])
|
||||
|
||||
if include_approximate:
|
||||
return managers
|
||||
else:
|
||||
# Only return active managers unless requested otherwise
|
||||
return [m for m in managers if m.get('status') == 'active']
|
||||
|
||||
# No matches found
|
||||
return []
|
||||
|
||||
|
||||
def _search_manager_database_by_cik(db: dict, cik: int, include_approximate: bool = False) -> list[dict]:
|
||||
"""
|
||||
Search the manager database by CIK (more accurate than name matching).
|
||||
|
||||
Args:
|
||||
db: The loaded database dictionary
|
||||
cik: The CIK to search for
|
||||
include_approximate: Whether to include non-active managers
|
||||
|
||||
Returns:
|
||||
list[dict]: List of matching managers
|
||||
"""
|
||||
if not db or 'managers' not in db:
|
||||
return []
|
||||
|
||||
managers_data = db['managers']
|
||||
|
||||
# Search through all companies for CIK match
|
||||
for _company_key, company_data in managers_data.items():
|
||||
company_cik = company_data.get('cik')
|
||||
if company_cik == cik:
|
||||
managers = company_data.get('managers', [])
|
||||
|
||||
if include_approximate:
|
||||
return managers
|
||||
else:
|
||||
# Only return active managers unless requested otherwise
|
||||
return [m for m in managers if m.get('status') == 'active']
|
||||
|
||||
# No CIK matches found
|
||||
return []
|
||||
|
||||
|
||||
def is_filing_signer_likely_portfolio_manager(filing_signer_title: str) -> bool:
|
||||
"""
|
||||
Determine if the filing signer is likely to be a portfolio manager.
|
||||
|
||||
This uses heuristics based on the signer's title to assess whether they
|
||||
might be involved in investment decisions rather than just administrative functions.
|
||||
|
||||
Args:
|
||||
filing_signer_title: The title of the person who signed the filing
|
||||
|
||||
Returns:
|
||||
bool: True if signer appears to be investment-focused, False if administrative
|
||||
|
||||
Example:
|
||||
>>> is_filing_signer_likely_portfolio_manager("Chief Financial Officer")
|
||||
False
|
||||
>>> is_filing_signer_likely_portfolio_manager("Portfolio Manager")
|
||||
True
|
||||
"""
|
||||
if not filing_signer_title:
|
||||
return False
|
||||
|
||||
title = filing_signer_title.upper()
|
||||
|
||||
# Investment-focused titles
|
||||
investment_titles = [
|
||||
'PORTFOLIO MANAGER', 'FUND MANAGER', 'INVESTMENT MANAGER',
|
||||
'CHIEF INVESTMENT OFFICER', 'CIO', 'MANAGING DIRECTOR',
|
||||
'CHAIRMAN', 'CEO', 'PRESIDENT', 'FOUNDER'
|
||||
]
|
||||
|
||||
# Administrative titles
|
||||
admin_titles = [
|
||||
'CFO', 'CCO', 'COMPLIANCE', 'SECRETARY', 'TREASURER',
|
||||
'VICE PRESIDENT', 'VP', 'ASSISTANT', 'COUNSEL'
|
||||
]
|
||||
|
||||
# Check for investment titles first
|
||||
for inv_title in investment_titles:
|
||||
if inv_title in title:
|
||||
return True
|
||||
|
||||
# Check for administrative titles
|
||||
for admin_title in admin_titles:
|
||||
if admin_title in title:
|
||||
return False
|
||||
|
||||
# If unclear, err on the side of caution
|
||||
return False
|
||||
484
venv/lib/python3.10/site-packages/edgar/thirteenf/models.py
Normal file
484
venv/lib/python3.10/site-packages/edgar/thirteenf/models.py
Normal file
@@ -0,0 +1,484 @@
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime
|
||||
from decimal import Decimal
|
||||
from functools import lru_cache
|
||||
from typing import List, Union
|
||||
|
||||
import pyarrow.compute as pc
|
||||
|
||||
from edgar._party import Address
|
||||
|
||||
__all__ = [
|
||||
'FilingManager',
|
||||
'OtherManager',
|
||||
'CoverPage',
|
||||
'SummaryPage',
|
||||
'Signature',
|
||||
'PrimaryDocument13F',
|
||||
'ThirteenF',
|
||||
'THIRTEENF_FORMS',
|
||||
'format_date',
|
||||
]
|
||||
|
||||
# All SEC form types treated as 13F filings: holdings reports (HR), notices (NT),
# and confidential-treatment variants (CTR), each with its amendment ("/A") form.
THIRTEENF_FORMS = ['13F-HR', "13F-HR/A", "13F-NT", "13F-NT/A", "13F-CTR", "13F-CTR/A"]
|
||||
|
||||
|
||||
def format_date(date: Union[str, datetime]) -> str:
    """Return the date as a "YYYY-MM-DD" string; string inputs pass through unchanged."""
    return date if isinstance(date, str) else date.strftime("%Y-%m-%d")
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class FilingManager:
    """The institutional manager (firm) that filed the 13F, as given on the cover page."""

    name: str  # legal name of the filing manager
    address: Address  # business address of the filing manager
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class OtherManager:
    """Another manager listed on the report whose holdings are included in this filing."""

    cik: str  # SEC Central Index Key of the other manager
    name: str  # name of the other manager
    file_number: str  # file number as reported on the filing
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class CoverPage:
    """Cover page section of the 13F primary document."""

    report_calendar_or_quarter: str  # the calendar quarter the report covers, as reported
    report_type: str  # report type string as reported in the filing
    filing_manager: FilingManager  # the firm filing the report
    other_managers: List[OtherManager]  # other managers whose holdings are included
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class SummaryPage:
    """Summary page section of the 13F primary document."""

    other_included_managers_count: int  # number of other managers included in this report
    total_value: Decimal  # total value of all holdings (in thousands of dollars — see ThirteenF.total_value)
    total_holdings: int  # total number of holdings reported
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class Signature:
    """Signature block of the 13F primary document."""

    name: str  # name of the person who signed the filing
    title: str  # business title of the signer
    phone: str  # contact phone number
    signature: str  # the signature text itself
    city: str  # city where the filing was signed
    state_or_country: str  # state or country where the filing was signed
    date: str  # date of signing
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class PrimaryDocument13F:
    """Parsed primary document of a 13F filing: cover page, summary page, and signature."""

    report_period: datetime  # end date of the period the report covers
    cover_page: CoverPage  # cover page section
    summary_page: SummaryPage  # summary page section
    signature: Signature  # signature block
    additional_information: str  # free-text additional information, if any
|
||||
|
||||
|
||||
class ThirteenF:
    """
    A 13F-HR is a quarterly report filed by institutional investment managers that have over $100 million in qualifying
    assets under management. The report is filed with the Securities & Exchange Commission (SEC) and discloses all
    the firm's equity holdings that it held at the end of the quarter. The report is due within 45 days of the end
    of the quarter. The 13F-HR is a public document that is available on the SEC's website.
    """

    def __init__(self, filing, use_latest_period_of_report=False):
        """
        Wrap a 13F filing object.

        Args:
            filing: A filing whose ``form`` is one of THIRTEENF_FORMS.
            use_latest_period_of_report: When True, resolve to the related filing
                (same filing date and form) with the latest period of report
                instead of the filing passed in.
        """
        # Deferred import avoids a circular dependency with the parser module.
        from edgar.thirteenf.parsers.primary_xml import parse_primary_document_xml

        assert filing.form in THIRTEENF_FORMS, f"Form {filing.form} is not a valid 13F form"
        # The filing might not be the filing for the current period. We need to use the related filing filed on the same
        # date as the current filing that has the latest period of report
        self._related_filings = filing.related_filings().filter(filing_date=filing.filing_date, form=filing.form)
        self._actual_filing = filing  # The filing passed in
        if use_latest_period_of_report:
            # Use the last related filing.
            # It should also be the one that has the CONFORMED_PERIOD_OF_REPORT closest to filing_date
            self.filing = self._related_filings[-1]
        else:
            # Use the exact filing that was passed in
            self.filing = self._actual_filing

        # Parse primary document if XML is available (2013+ filings)
        # For older TXT-only filings (2012 and earlier), primary_form_information will be None
        primary_xml = self.filing.xml()
        self.primary_form_information = parse_primary_document_xml(primary_xml) if primary_xml else None

    def has_infotable(self):
        """True when this is a holdings report (13F-HR or its amendment), which carries an information table."""
        return self.filing.form in ['13F-HR', "13F-HR/A"]

    @property
    def form(self):
        """The SEC form type of the underlying filing (e.g. '13F-HR')."""
        return self.filing.form

    # NOTE(review): stacking @property over @lru_cache on instance methods caches
    # by `self` and keeps that instance alive for the cache lifetime; with
    # maxsize=1 only a single instance's value stays cached at a time.
    # functools.cached_property may be a better fit — confirm before changing.
    @property
    @lru_cache(maxsize=1)
    def infotable_xml(self):
        """Returns XML content if available (2013+ filings)"""
        if self.has_infotable():
            result = self._get_infotable_from_attachment()
            # Accept only genuine information-table XML, not an unrelated attachment.
            if result and result[0] and result[1] == 'xml' and "informationTable" in result[0]:
                return result[0]
        return None

    def _get_infotable_from_attachment(self):
        """
        Use the filing homepage to get the infotable file.
        Returns a tuple of (content, format) where format is 'xml' or 'txt'.
        Returns (None, None) when no information table attachment is found.
        """
        if self.has_infotable():
            # Try XML format first (2013+)
            query = "document_type=='INFORMATION TABLE' and document.lower().endswith('.xml')"
            attachments = self.filing.attachments.query(query)
            if len(attachments) > 0:
                return (attachments.get_by_index(0).download(), 'xml')

            # Fall back to TXT format (2012 and earlier)
            # The primary document itself contains the table in TXT format
            # Try various description patterns first
            query = "description=='FORM 13F' or description=='INFORMATION TABLE'"
            attachments = self.filing.attachments.query(query)
            if len(attachments) > 0:
                # Filter for .txt files only
                txt_attachments = [att for att in attachments if att.document.lower().endswith('.txt')]
                if txt_attachments:
                    return (txt_attachments[0].download(), 'txt')

            # Final fallback: For older filings, descriptions may be unreliable
            # Look for sequence number 1 with .txt extension
            try:
                att = self.filing.attachments.get_by_sequence(1)
                if att and att.document.lower().endswith('.txt'):
                    return (att.download(), 'txt')
            except (KeyError, AttributeError):
                # No attachment with that sequence, or an attachments API mismatch:
                # treat as "no information table found".
                pass

        return (None, None)

    @property
    @lru_cache(maxsize=1)
    def infotable_txt(self):
        """Returns TXT content if available (pre-2013 filings)"""
        if self.has_infotable():
            result = self._get_infotable_from_attachment()
            if result and result[0] and result[1] == 'txt':
                return result[0]

            # Fallback: Some filings have the information table embedded in the main HTML
            # instead of as a separate attachment. Try to extract it from the main HTML.
            if not result or not result[0]:
                html = self.filing.html()
                if html and "Form 13F Information Table" in html:
                    return html
        return None

    @property
    @lru_cache(maxsize=1)
    def infotable_html(self):
        """HTML information table content, when one is attached.

        NOTE(review): raises IndexError when no matching attachment exists, and
        implicitly returns None for non-HR forms — confirm whether callers
        expect an exception or a None here.
        """
        if self.has_infotable():
            query = "document_type=='INFORMATION TABLE' and document.lower().endswith('.html')"
            attachments = self.filing.attachments.query(query)
            return attachments[0].download()

    @property
    @lru_cache(maxsize=1)
    def infotable(self):
        """
        Returns the information table as a pandas DataFrame.
        Supports both XML format (2013+) and TXT format (2012 and earlier).
        Returns None for forms without an information table.
        """
        # Deferred imports avoid circular dependencies with the parser modules.
        from edgar.thirteenf.parsers.infotable_xml import parse_infotable_xml
        from edgar.thirteenf.parsers.infotable_txt import parse_infotable_txt

        if self.has_infotable():
            # Try XML format first
            if self.infotable_xml:
                return parse_infotable_xml(self.infotable_xml)
            # Fall back to TXT format
            elif self.infotable_txt:
                return parse_infotable_txt(self.infotable_txt)
        return None

    @property
    def accession_number(self):
        """The accession number of the underlying filing."""
        return self.filing.accession_no

    @property
    def total_value(self):
        """Total value of holdings in thousands of dollars"""
        if self.primary_form_information:
            return self.primary_form_information.summary_page.total_value
        # For TXT-only filings, calculate from infotable
        infotable = self.infotable
        if infotable is not None and len(infotable) > 0:
            return Decimal(int(infotable['Value'].sum()))
        return None

    @property
    def total_holdings(self):
        """Total number of holdings"""
        if self.primary_form_information:
            return self.primary_form_information.summary_page.total_holdings
        # For TXT-only filings, count from infotable
        infotable = self.infotable
        if infotable is not None:
            return len(infotable)
        return None

    @property
    def report_period(self):
        """Report period end date as a "YYYY-MM-DD" string, or None when unavailable."""
        if self.primary_form_information:
            return format_date(self.primary_form_information.report_period)
        # For TXT-only filings, use CONFORMED_PERIOD_OF_REPORT from filing header
        if hasattr(self.filing, 'period_of_report') and self.filing.period_of_report:
            return format_date(self.filing.period_of_report)
        return None

    @property
    def filing_date(self):
        """The filing date as a "YYYY-MM-DD" string."""
        return format_date(self.filing.filing_date)

    @property
    def investment_manager(self):
        # This is really the firm e.g. Spark Growth Management Partners II, LLC
        if self.primary_form_information:
            return self.primary_form_information.cover_page.filing_manager
        return None

    @property
    def signer(self):
        # This is the person who signed the filing. Could be the Reporting Manager but could be someone else
        # like the CFO
        if self.primary_form_information:
            return self.primary_form_information.signature.name
        return None

    # Enhanced manager name properties for better clarity
    @property
    def management_company_name(self) -> str:
        """
        The legal name of the investment management company that filed the 13F.

        This is the institutional entity (e.g., "Berkshire Hathaway Inc", "Vanguard Group Inc")
        that is legally responsible for managing the assets, not an individual person's name.

        Returns:
            str: The legal name of the management company, or company name from filing if not available

        Example:
            >>> thirteen_f.management_company_name
            'Berkshire Hathaway Inc'
        """
        if self.investment_manager:
            return self.investment_manager.name
        # For TXT-only filings, use company name from filing
        return self.filing.company

    @property
    def filing_signer_name(self) -> str:
        """
        The name of the individual who signed the 13F filing.

        This is typically an administrative officer (CFO, CCO, Compliance Officer, etc.)
        rather than the famous portfolio manager. For example, Berkshire Hathaway's 13F
        is signed by "Marc D. Hamburg" (SVP), not Warren Buffett.

        Returns:
            str: The name of the person who signed the filing

        Example:
            >>> thirteen_f.filing_signer_name
            'Marc D. Hamburg'
        """
        return self.signer

    @property
    def filing_signer_title(self) -> str:
        """
        The business title of the individual who signed the 13F filing.

        Common titles include: CFO, CCO, Senior Vice President, Chief Compliance Officer,
        Secretary, Treasurer, etc. This helps distinguish administrative signers from
        portfolio managers.

        Returns:
            str: The business title of the filing signer, or None if not available

        Example:
            >>> thirteen_f.filing_signer_title
            'Senior Vice President'
        """
        if self.primary_form_information:
            return self.primary_form_information.signature.title
        return None

    @property
    def manager_name(self) -> str:
        """
        DEPRECATED: Use management_company_name instead.

        Returns the management company name for backwards compatibility.
        This property name was misleading as it suggested an individual manager's name.

        Returns:
            str: The management company name

        Warning:
            This property is deprecated and may be removed in future versions.
            Use management_company_name for the company name, or see get_portfolio_managers()
            if you need information about individual portfolio managers.
        """
        import warnings
        warnings.warn(
            "manager_name is deprecated and misleading. Use management_company_name for the "
            "company name, or get_portfolio_managers() for individual manager information.",
            DeprecationWarning,
            stacklevel=2
        )
        return self.management_company_name

    def get_portfolio_managers(self, include_approximate: bool = False) -> list[dict]:
        """
        Get information about the actual portfolio managers for this fund.

        Note: 13F filings do not contain individual portfolio manager names.
        This method provides a curated mapping for well-known funds based on
        public information. Results may not be current or complete.

        Args:
            include_approximate (bool): If True, includes approximate/historical
                manager information even if not current

        Returns:
            list[dict]: List of portfolio manager information with keys:
                'name', 'title', 'status', 'source', 'last_updated'

        Example:
            >>> thirteen_f.get_portfolio_managers()
            [
                {
                    'name': 'Warren Buffett',
                    'title': 'Chairman & CEO',
                    'status': 'active',
                    'source': 'public_records',
                    'last_updated': '2024-01-01'
                }
            ]
        """
        # Deferred import avoids a circular dependency with manager_lookup.
        from edgar.thirteenf.manager_lookup import lookup_portfolio_managers
        return lookup_portfolio_managers(
            self.management_company_name,
            getattr(self.filing, 'cik', None),
            include_approximate=include_approximate
        )

    def _lookup_portfolio_managers(self, company_name: str, include_approximate: bool = False) -> list[dict]:
        """
        Private method for testing - looks up portfolio managers by company name.

        Args:
            company_name: Name of the management company
            include_approximate: Whether to include approximate/historical data

        Returns:
            list[dict]: List of portfolio manager information
        """
        from edgar.thirteenf.manager_lookup import lookup_portfolio_managers
        return lookup_portfolio_managers(company_name, cik=None, include_approximate=include_approximate)

    def get_manager_info_summary(self) -> dict:
        """
        Get a comprehensive summary of all available manager information.

        This provides a clear breakdown of what information is available from the 13F
        filing versus external sources, helping users understand the data limitations.

        Returns:
            dict: Summary with keys 'from_13f_filing', 'external_sources', 'limitations'

        Example:
            >>> thirteen_f.get_manager_info_summary()
            {
                'from_13f_filing': {
                    'management_company': 'Berkshire Hathaway Inc',
                    'filing_signer': 'Marc D. Hamburg',
                    'signer_title': 'Senior Vice President'
                },
                'external_sources': {
                    'portfolio_managers': [
                        {'name': 'Warren Buffett', 'title': 'Chairman & CEO', 'status': 'active'}
                    ]
                },
                'limitations': [
                    '13F filings do not contain individual portfolio manager names',
                    'External manager data may not be current or complete',
                    'Filing signer is typically an administrative officer, not the portfolio manager'
                ]
            }
        """
        portfolio_managers = self.get_portfolio_managers()

        return {
            'from_13f_filing': {
                'management_company': self.management_company_name,
                'filing_signer': self.filing_signer_name,
                'signer_title': self.filing_signer_title,
                'form': self.form,
                'period_of_report': str(self.report_period)
            },
            'external_sources': {
                'portfolio_managers': portfolio_managers,
                'manager_count': len(portfolio_managers)
            },
            'limitations': [
                '13F filings do not contain individual portfolio manager names',
                'External manager data may not be current or complete',
                'Filing signer is typically an administrative officer, not the portfolio manager',
                'Portfolio manager information is sourced from public records and may be outdated'
            ]
        }

    def is_filing_signer_likely_portfolio_manager(self) -> bool:
        """
        Determine if the filing signer is likely to be a portfolio manager.

        This uses heuristics based on the signer's title to assess whether they
        might be involved in investment decisions rather than just administrative functions.

        Returns:
            bool: True if signer appears to be investment-focused, False if administrative

        Example:
            >>> thirteen_f.is_filing_signer_likely_portfolio_manager()
            False  # For administrative titles like CFO, CCO, etc.
        """
        from edgar.thirteenf.manager_lookup import is_filing_signer_likely_portfolio_manager
        return is_filing_signer_likely_portfolio_manager(self.filing_signer_title)

    # NOTE(review): lru_cache on an instance method keys on `self` and keeps up
    # to 8 ThirteenF instances alive for the cache lifetime (ruff B019) — confirm
    # whether a per-instance cache was intended.
    @lru_cache(maxsize=8)
    def previous_holding_report(self):
        """Return the previous holdings report in this filing's related-filings chain, or None."""
        # NOTE(review): report_period is a "YYYY-MM-DD" string (len 10) or None,
        # so this condition can never be true — and raises TypeError when
        # report_period is None. It looks like it was meant to be
        # len(self._related_filings) == 1; confirm before changing.
        if len(self.report_period) == 1:
            return None
        # Look in the related filings data for the row with this accession number
        idx = pc.equal(self._related_filings.data['accession_number'], self.accession_number).index(True).as_py()
        if idx == 0:
            # Already the earliest related filing: nothing before it.
            return None
        previous_filing = self._related_filings[idx - 1]
        return ThirteenF(previous_filing, use_latest_period_of_report=False)

    def __rich__(self):
        """Rich renderable for pretty console display."""
        from edgar.thirteenf.rendering import render_rich
        return render_rich(self)

    def __repr__(self):
        """Plain-text repr derived from the rich rendering."""
        from edgar.richtools import repr_rich
        return repr_rich(self.__rich__())
|
||||
|
||||
|
||||
# For backward compatibility, expose the parser functions as static methods on
# ThirteenF. Named shim functions with ordinary deferred imports replace the
# previous `__import__(..., fromlist=...)` lambdas, which were hard to read and
# hid the import target from static tooling. The deferred imports avoid a
# circular dependency between models and the parser modules.
def _compat_parse_primary_document_xml(xml):
    """Backward-compatible shim for ThirteenF.parse_primary_document_xml."""
    from edgar.thirteenf.parsers.primary_xml import parse_primary_document_xml
    return parse_primary_document_xml(xml)


def _compat_parse_infotable_xml(xml):
    """Backward-compatible shim for ThirteenF.parse_infotable_xml."""
    from edgar.thirteenf.parsers.infotable_xml import parse_infotable_xml
    return parse_infotable_xml(xml)


def _compat_parse_infotable_txt(txt):
    """Backward-compatible shim for ThirteenF.parse_infotable_txt."""
    from edgar.thirteenf.parsers.infotable_txt import parse_infotable_txt
    return parse_infotable_txt(txt)


ThirteenF.parse_primary_document_xml = staticmethod(_compat_parse_primary_document_xml)
ThirteenF.parse_infotable_xml = staticmethod(_compat_parse_infotable_xml)
ThirteenF.parse_infotable_txt = staticmethod(_compat_parse_infotable_txt)
|
||||
@@ -0,0 +1,11 @@
|
||||
"""13F filing parsers for different document formats."""
|
||||
|
||||
from .primary_xml import parse_primary_document_xml
|
||||
from .infotable_xml import parse_infotable_xml
|
||||
from .infotable_txt import parse_infotable_txt
|
||||
|
||||
__all__ = [
|
||||
'parse_primary_document_xml',
|
||||
'parse_infotable_xml',
|
||||
'parse_infotable_txt',
|
||||
]
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,119 @@
|
||||
"""TXT format information table parsers with automatic format detection.
|
||||
|
||||
Supports two TXT formats from 2012 filings:
|
||||
- Format 1 (Multiline): Company names can span multiple lines
|
||||
- Format 2 (Columnar): All data on single line with <S> and <C> tags
|
||||
"""
|
||||
|
||||
import re
|
||||
import pandas as pd
|
||||
|
||||
from .format_multiline import parse_multiline_format
|
||||
from .format_columnar import parse_columnar_format
|
||||
|
||||
__all__ = ['parse_infotable_txt']
|
||||
|
||||
|
||||
def parse_infotable_txt(infotable_txt: str) -> pd.DataFrame:
    """
    Parse a TXT-format information table, auto-detecting which TXT variant it uses.

    Supported variants:
    - Multiline (Format 1): company names may span several lines (Berkshire style).
    - Columnar (Format 2): all data for a holding on a single line (JANA style).

    Args:
        infotable_txt: TXT content containing the information table.

    Returns:
        pd.DataFrame: Holdings data with the same structure as the XML parser.
    """
    # Pick the concrete parser based on the detected layout, then delegate.
    parser = parse_columnar_format if _is_columnar_format(infotable_txt) else parse_multiline_format
    return parser(infotable_txt)
|
||||
|
||||
|
||||
def _is_columnar_format(infotable_txt: str) -> bool:
|
||||
"""
|
||||
Detect if this is columnar format by looking for <S> tags in data rows.
|
||||
|
||||
Columnar format has <S> at the start of each data row, followed by data.
|
||||
Multiline format only has <S> and <C> in the header row.
|
||||
|
||||
Args:
|
||||
infotable_txt: TXT content to analyze
|
||||
|
||||
Returns:
|
||||
bool: True if columnar format, False if multiline format
|
||||
"""
|
||||
# Find the Form 13F Information Table section (case-insensitive)
|
||||
match = re.search(r'FORM\s+13F\s+INFORMATION\s+TABLE', infotable_txt, re.IGNORECASE)
|
||||
if not match:
|
||||
return False
|
||||
|
||||
# Extract tables (case-insensitive)
|
||||
# Note: Search from beginning since <TABLE> tag may come before the header text
|
||||
table_pattern = r'<TABLE>(.*?)</TABLE>'
|
||||
tables = re.findall(table_pattern, infotable_txt, re.DOTALL | re.IGNORECASE)
|
||||
|
||||
if len(tables) == 0:
|
||||
return False
|
||||
|
||||
# Determine which table to check
|
||||
# If 2+ tables: check second table (first holdings table, after managers table)
|
||||
# If 1 table: check that single table
|
||||
if len(tables) >= 2:
|
||||
holdings_table = tables[1]
|
||||
else:
|
||||
holdings_table = tables[0]
|
||||
|
||||
lines = holdings_table.split('\n')
|
||||
|
||||
# Count data rows with <S> tags that also have CUSIPs
|
||||
# In columnar format, data rows start with <S> and have CUSIP on same line
|
||||
# In multiline format, only header has <S>, and CUSIP is on second line of company
|
||||
data_rows_with_s_and_cusip = 0
|
||||
data_rows_checked = 0
|
||||
|
||||
for line in lines:
|
||||
line = line.strip()
|
||||
line_upper = line.upper()
|
||||
|
||||
# Skip empty lines, CAPTION, and header rows (case-insensitive)
|
||||
if not line or '<CAPTION>' in line_upper:
|
||||
continue
|
||||
|
||||
# Skip if this looks like a header (has <S> but no digits)
|
||||
if '<S>' in line_upper and not re.search(r'\d', line):
|
||||
continue
|
||||
|
||||
# Check if this line has both <S> tag and a CUSIP (9 chars with digit, with or without spaces)
|
||||
cusip_match = re.search(r'\b([A-Za-z0-9]{9})\b', line)
|
||||
has_valid_cusip = cusip_match and any(c.isdigit() for c in cusip_match.group(1))
|
||||
|
||||
# Also check for spaced CUSIPs
|
||||
if not has_valid_cusip:
|
||||
spaced_matches = re.finditer(r'\b([A-Za-z0-9 ]{9,15})\b', line)
|
||||
for match in spaced_matches:
|
||||
cleaned = match.group(1).replace(' ', '')
|
||||
if len(cleaned) == 9 and any(c.isdigit() for c in cleaned):
|
||||
has_valid_cusip = True
|
||||
break
|
||||
|
||||
if '<S>' in line_upper and has_valid_cusip:
|
||||
data_rows_with_s_and_cusip += 1
|
||||
data_rows_checked += 1
|
||||
elif has_valid_cusip:
|
||||
# Has CUSIP but no <S> - multiline format
|
||||
data_rows_checked += 1
|
||||
|
||||
# If we've checked 3 data rows, that's enough to decide
|
||||
if data_rows_checked >= 3:
|
||||
break
|
||||
|
||||
# If most data rows with CUSIPs also have <S> tags, it's columnar format
|
||||
if data_rows_checked > 0 and data_rows_with_s_and_cusip >= data_rows_checked * 0.5:
|
||||
return True
|
||||
|
||||
return False
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,286 @@
|
||||
"""Parser for columnar TXT format (Format 2) used in some 2012 filings.
|
||||
|
||||
This format has <S> and <C> tags for each field, with all data on a single line.
|
||||
|
||||
Example:
|
||||
<S> <C> <C> <C> <C>
|
||||
AETNA INC NEW COM 00817Y108 92,760 2,342,435 SH SOLE 2,238,895 103,540 0
|
||||
"""
|
||||
|
||||
import re
|
||||
import pandas as pd
|
||||
|
||||
from edgar.reference import cusip_ticker_mapping
|
||||
|
||||
__all__ = ['parse_columnar_format']
|
||||
|
||||
|
||||
def parse_columnar_format(infotable_txt: str) -> pd.DataFrame:
    """
    Parse columnar TXT format (Format 2) information table.

    This parser handles the format where all data is on a single line with
    <S> and <C> tags marking column boundaries.

    Overall flow:
      1. Locate the "FORM 13F INFORMATION TABLE" header (bail out if absent).
      2. Collect every <TABLE>...</TABLE> section; skip the first one when
         there are several (it is usually the "other managers" list).
      3. For each holdings table, scan line by line, anchoring each data row
         on its CUSIP (9 alphanumeric chars containing at least one digit,
         possibly with embedded spaces), then split the row into
         issuer/class/value/shares/discretion/voting fields.

    Args:
        infotable_txt: TXT content containing the information table

    Returns:
        pd.DataFrame: Holdings data with same structure as XML parser
        (empty DataFrame when the header, tables, or parsable rows are missing)
    """
    # Find the Form 13F Information Table section (case-insensitive)
    match = re.search(r'FORM\s+13F\s+INFORMATION\s+TABLE', infotable_txt, re.IGNORECASE)
    if not match:
        return pd.DataFrame()

    # Extract all table content between <TABLE> and </TABLE> tags (case-insensitive)
    # Note: Search from beginning since <TABLE> tag may come before the header text
    table_pattern = r'<TABLE>(.*?)</TABLE>'
    tables = re.findall(table_pattern, infotable_txt, re.DOTALL | re.IGNORECASE)

    if len(tables) == 0:
        return pd.DataFrame()

    # Determine which tables to process:
    # - If 2+ tables: Skip first table (usually managers list), process rest
    # - If 1 table: Check if it has holdings data (CUSIPs with <S> tags), if so process it
    if len(tables) >= 2:
        holdings_tables = tables[1:]  # Skip first table (managers)
    elif len(tables) == 1:
        # Check if the single table has holdings data (contains CUSIPs with <S> tags)
        # Look for lines that have both <S> tag and valid CUSIP (with or without spaces)
        potential_lines = [line for line in tables[0].split('\n') if '<S>' in line.upper()]
        has_data = False
        for line in potential_lines[:10]:  # Check first 10 <S> lines
            # Try non-spaced CUSIPs first
            cusip_match = re.search(r'\b([A-Za-z0-9]{9})\b', line)
            if cusip_match and any(c.isdigit() for c in cusip_match.group(1)):
                has_data = True
                break
            # Try spaced CUSIPs
            spaced_matches = re.finditer(r'\b([A-Za-z0-9 ]{9,15})\b', line)
            for match in spaced_matches:
                cleaned = match.group(1).replace(' ', '')
                if len(cleaned) == 9 and any(c.isdigit() for c in cleaned):
                    has_data = True
                    break
            if has_data:
                break
        if has_data:
            holdings_tables = tables  # Process the single table
        else:
            return pd.DataFrame()  # No holdings data
    else:
        return pd.DataFrame()

    parsed_rows = []

    for holdings_table in holdings_tables:
        # Skip if this is the totals table (very short, < 200 chars)
        if len(holdings_table.strip()) < 200:
            continue

        lines = holdings_table.split('\n')

        for line in lines:
            line = line.strip()

            # Skip empty lines, CAPTION lines, header rows (case-insensitive)
            line_upper = line.upper()
            if not line or '<CAPTION>' in line_upper:
                continue

            # Skip header rows with just tags (case-insensitive)
            # Header rows have <S> but no valid CUSIPs (9 chars with at least one digit, with or without spaces)
            if line_upper.startswith('<S>'):
                # Check for normal 9-char CUSIP
                has_cusip = False
                cusip_check = re.search(r'\b([A-Za-z0-9]{9})\b', line)
                if cusip_check and any(c.isdigit() for c in cusip_check.group(1)):
                    has_cusip = True

                # If not found, check for spaced CUSIP
                if not has_cusip:
                    spaced_check = re.finditer(r'\b([A-Za-z0-9 ]{9,15})\b', line)
                    for match in spaced_check:
                        cleaned = match.group(1).replace(' ', '')
                        if len(cleaned) == 9 and any(c.isdigit() for c in cleaned):
                            has_cusip = True
                            break

                if not has_cusip:
                    continue

            # Skip residual title/total rows that survive the tag checks
            if line.startswith(('Total', 'Title', 'NAME OF ISSUER', 'of', 'Market Value')):
                continue

            # Look for data rows with <S> tag and a CUSIP (case-insensitive)
            if '<S>' not in line_upper:
                continue

            # CUSIP is a reliable anchor - it's always 9 alphanumeric characters (case-insensitive)
            # Must contain at least one digit to avoid matching company names or words like "SPONSORED"
            # Some filings have spaces in CUSIPs: "00724F 10 1" should be "00724F101"
            # Find ALL potential CUSIP sequences (with or without spaces), then pick the first valid one

            # First try without spaces (faster path)
            cusip_match = None
            cusip = None
            all_cusip_matches = re.finditer(r'\b([A-Za-z0-9]{9})\b', line)
            for match in all_cusip_matches:
                if any(c.isdigit() for c in match.group(1)):
                    cusip_match = match
                    cusip = match.group(1)
                    break

            # If not found, try matching with spaces and cleaning
            if not cusip_match:
                # Match sequences of 9-15 chars that might contain spaces
                spaced_matches = re.finditer(r'\b([A-Za-z0-9 ]{9,15})\b', line)
                for match in spaced_matches:
                    cleaned = match.group(1).replace(' ', '')
                    # Check if cleaned version is exactly 9 chars and has a digit
                    if len(cleaned) == 9 and any(c.isdigit() for c in cleaned):
                        cusip_match = match
                        cusip = cleaned  # Use cleaned version
                        break

            if not cusip_match:
                continue

            # Remove SGML tags and split by whitespace
            # Replace <S> and <C> with spaces to help with splitting
            cleaned_line = line.replace('<S>', ' ').replace('<C>', ' ')
            parts = cleaned_line.split()

            # Filter out empty parts
            parts = [p for p in parts if p.strip()]

            if len(parts) < 10:  # Need at least issuer, class, cusip, value, shares, type, discretion, sole, shared, none
                continue

            try:
                # Find CUSIP position in parts
                # cusip already set above (either from direct match or cleaned from spaced match)
                # Try to find it in parts - it might be spaced or not spaced
                cusip_idx = None
                cusip_span = 1  # How many elements the CUSIP occupies in parts

                # First try to find cleaned CUSIP as a single element
                if cusip in parts:
                    cusip_idx = parts.index(cusip)
                else:
                    # Try to find the original spaced version as a single element
                    original_cusip = cusip_match.group(1)
                    if original_cusip in parts:
                        cusip_idx = parts.index(original_cusip)
                    else:
                        # For spaced CUSIPs split across multiple parts (e.g., "00724F 10 1" -> ["00724F", "10", "1"])
                        # Look for a sequence of parts that, when joined, matches the cleaned CUSIP
                        for i in range(len(parts) - 2):  # Need at least 3 parts for a split CUSIP
                            # Try joining 2-4 consecutive parts
                            for span in range(2, 5):
                                if i + span > len(parts):
                                    break
                                joined = ''.join(parts[i:i+span])
                                if joined == cusip:
                                    cusip_idx = i
                                    cusip_span = span
                                    break
                            if cusip_idx is not None:
                                break

                if cusip_idx is None:
                    continue

                # Before CUSIP: Issuer name and class
                # Everything before CUSIP minus the last word (which is the class)
                before_cusip = parts[:cusip_idx]
                if len(before_cusip) < 2:
                    continue

                # Last part before CUSIP is the class, rest is issuer name
                title_class = before_cusip[-1]
                issuer_name = ' '.join(before_cusip[:-1])

                # After CUSIP: value, shares, type (SH/PRN), discretion, sole, shared, none
                # Skip cusip_span elements for spaced CUSIPs (e.g., ["00724F", "10", "1"])
                after_cusip = parts[cusip_idx + cusip_span:]

                if len(after_cusip) < 7:
                    continue

                # Parse fields after CUSIP
                # Expected order: VALUE SHARES TYPE DISCRETION ... SOLE SHARED NONE
                value_str = after_cusip[0].replace(',', '').replace('$', '')
                shares_str = after_cusip[1].replace(',', '')

                # '-' denotes an empty column in this format, treated as zero
                value = int(value_str) if value_str and value_str != '-' else 0
                shares = int(shares_str) if shares_str and shares_str != '-' else 0

                # Type (SH/PRN) is typically at index 2
                share_type = after_cusip[2] if len(after_cusip) > 2 else 'SH'
                if share_type == 'SH':
                    share_type_full = 'Shares'
                elif share_type == 'PRN':
                    share_type_full = 'Principal'
                else:
                    # NOTE(review): unknown type codes silently default to 'Shares'
                    share_type_full = 'Shares'

                # Find investment discretion (typically "SOLE", "SHARED", "DEFINED", or compound like "SHARED-DEFINED")
                # It's the first non-numeric field after type
                # NOTE(review): discretion_idx is assigned but never read afterwards
                discretion_idx = 3
                investment_discretion = ''
                for i in range(3, len(after_cusip) - 3):  # Last 3 are voting columns
                    part = after_cusip[i]
                    if part and part not in ['-'] and not part.replace(',', '').isdigit():
                        investment_discretion = part
                        discretion_idx = i
                        break

                # Voting columns are the last 3 fields
                # (always true here since len(after_cusip) >= 7 was checked above)
                if len(after_cusip) >= 3:
                    none_voting_str = after_cusip[-1].replace(',', '')
                    shared_voting_str = after_cusip[-2].replace(',', '')
                    sole_voting_str = after_cusip[-3].replace(',', '')

                    non_voting = int(none_voting_str) if none_voting_str and none_voting_str != '-' else 0
                    shared_voting = int(shared_voting_str) if shared_voting_str and shared_voting_str != '-' else 0
                    sole_voting = int(sole_voting_str) if sole_voting_str and sole_voting_str != '-' else 0
                else:
                    sole_voting = 0
                    shared_voting = 0
                    non_voting = 0

                # Create row dict (column names match the XML parser's output)
                row_dict = {
                    'Issuer': issuer_name,
                    'Class': title_class,
                    'Cusip': cusip,
                    'Value': value,
                    'SharesPrnAmount': shares,
                    'Type': share_type_full,
                    'PutCall': '',
                    'InvestmentDiscretion': investment_discretion,
                    'SoleVoting': sole_voting,
                    'SharedVoting': shared_voting,
                    'NonVoting': non_voting
                }

                parsed_rows.append(row_dict)

            # NOTE(review): 'e' is captured but unused
            except (ValueError, IndexError) as e:
                # Skip rows that don't parse correctly
                continue

    # Create DataFrame
    if not parsed_rows:
        return pd.DataFrame()

    table = pd.DataFrame(parsed_rows)

    # Add ticker symbols using CUSIP mapping
    cusip_mapping = cusip_ticker_mapping(allow_duplicate_cusips=False)
    table['Ticker'] = table.Cusip.map(cusip_mapping.Ticker)

    return table
|
||||
@@ -0,0 +1,273 @@
|
||||
"""Parser for multiline TXT format (Format 1) used in some 2012 filings.
|
||||
|
||||
This format has company names that can span multiple lines, with the CUSIP
|
||||
appearing on the same line as the continuation of the company name.
|
||||
|
||||
Example:
|
||||
AMERICAN
|
||||
EXPRESS CO COM 025816109 110999 1952142 Shared-Defined...
|
||||
"""
|
||||
|
||||
import re
|
||||
import pandas as pd
|
||||
|
||||
from edgar.reference import cusip_ticker_mapping
|
||||
|
||||
__all__ = ['parse_multiline_format']
|
||||
|
||||
|
||||
def parse_multiline_format(infotable_txt: str) -> pd.DataFrame:
    """
    Parse multiline TXT format (Format 1) information table.

    This parser handles the format where company names can span multiple lines,
    with the CUSIP appearing on the line that contains the continuation.

    Overall flow:
      1. Locate the "FORM 13F INFORMATION TABLE" header (bail out if absent).
      2. Collect every <TABLE>...</TABLE> section; skip the first when there
         are several (usually the "other managers" list).
      3. Scan each table line by line. A line containing a valid CUSIP
         (9 alphanumeric chars with at least one digit, possibly spaced)
         is a data row; a line without one is buffered as the first part of
         a multi-line issuer name and prepended to the next data row.

    Args:
        infotable_txt: TXT content containing the information table

    Returns:
        pd.DataFrame: Holdings data with same structure as XML parser
        (empty DataFrame when the header, tables, or parsable rows are missing)
    """
    # Find the Form 13F Information Table section (case-insensitive)
    match = re.search(r'FORM\s+13F\s+INFORMATION\s+TABLE', infotable_txt, re.IGNORECASE)
    if not match:
        return pd.DataFrame()

    # Extract all table content between <TABLE> and </TABLE> tags (case-insensitive)
    # Note: Search from beginning since <TABLE> tag may come before the header text
    table_pattern = r'<TABLE>(.*?)</TABLE>'
    tables = re.findall(table_pattern, infotable_txt, re.DOTALL | re.IGNORECASE)

    if len(tables) == 0:
        return pd.DataFrame()

    # Determine which tables to process:
    # - If 2+ tables: Skip first table (usually managers list), process rest
    # - If 1 table: Check if it has holdings data (CUSIPs), if so process it
    if len(tables) >= 2:
        holdings_tables = tables[1:]  # Skip first table (managers)
    elif len(tables) == 1:
        # Check if the single table has holdings data (contains CUSIPs with digits)
        # Look for 9-char alphanumeric sequences (with or without spaces) that contain at least one digit
        potential_cusips = re.findall(r'\b([A-Za-z0-9]{9})\b', tables[0])
        # Also check for spaced CUSIPs
        spaced_cusips = re.findall(r'\b([A-Za-z0-9 ]{9,15})\b', tables[0])
        spaced_cusips_cleaned = [c.replace(' ', '') for c in spaced_cusips if len(c.replace(' ', '')) == 9]

        has_valid_cusips = (
            any(any(c.isdigit() for c in cusip) for cusip in potential_cusips) or
            any(any(c.isdigit() for c in cusip) for cusip in spaced_cusips_cleaned)
        )
        if has_valid_cusips:
            holdings_tables = tables  # Process the single table
        else:
            return pd.DataFrame()  # No holdings data
    else:
        return pd.DataFrame()

    parsed_rows = []

    for holdings_table in holdings_tables:
        # Skip if this is the totals table (very short, < 200 chars)
        if len(holdings_table.strip()) < 200:
            continue

        # Reset pending issuer parts for each table
        pending_issuer_parts = []

        lines = holdings_table.split('\n')

        for line in lines:
            # NOTE(review): orig_line is never used after this point
            orig_line = line
            line = line.strip()

            # Skip empty lines, CAPTION lines, header rows (case-insensitive)
            line_upper = line.upper()
            if not line or '<CAPTION>' in line_upper or '<S>' in line_upper or '<C>' in line_upper:
                continue

            # Skip separator lines (made of dashes and spaces)
            if all(c in '- ' for c in line):
                continue

            # Skip header/title rows
            # NOTE(review): line_upper is recomputed here; value is identical to above
            line_upper = line.upper()
            if line.startswith(('Total', 'Title', 'Name of Issuer', 'of', 'Market Value')):
                continue

            # Skip column header rows (contain keywords like COLUMN, VOTING AUTHORITY, SHRS OR PRN, etc.)
            if any(keyword in line_upper for keyword in ['COLUMN 1', 'COLUMN 2', 'VOTING AUTHORITY', 'SHRS OR', 'NAME OF ISSUER', 'FORM 13F', 'INFORMATION TABLE']):
                continue

            # Try to parse as a data row
            # CUSIP is a reliable anchor - it's always 9 alphanumeric characters (case-insensitive)
            # Must contain at least one digit to avoid matching company names like "Berkshire" or "SPONSORED"
            # Some filings have spaces in CUSIPs: "00724F 10 1" should be "00724F101"
            # Find ALL potential CUSIP sequences (with or without spaces), then pick the first valid one

            # First try without spaces (faster path)
            cusip_match = None
            cusip = None
            all_cusip_matches = re.finditer(r'\b([A-Za-z0-9]{9})\b', line)
            for match in all_cusip_matches:
                if any(c.isdigit() for c in match.group(1)):
                    cusip_match = match
                    cusip = match.group(1)
                    break

            # If not found, try matching with spaces and cleaning
            if not cusip_match:
                # Match sequences of 9-15 chars that might contain spaces
                spaced_matches = re.finditer(r'\b([A-Za-z0-9 ]{9,15})\b', line)
                for match in spaced_matches:
                    cleaned = match.group(1).replace(' ', '')
                    # Check if cleaned version is exactly 9 chars and has a digit
                    if len(cleaned) == 9 and any(c.isdigit() for c in cleaned):
                        cusip_match = match
                        cusip = cleaned  # Use cleaned version
                        break

            if cusip_match:
                # This line contains a CUSIP, so it has the main data
                # cusip already set above (either from direct match or cleaned from spaced match)
                cusip_pos = cusip_match.start()

                # Everything before CUSIP is issuer name + class
                before_cusip = line[:cusip_pos].strip()
                # Everything after CUSIP is the numeric data
                # Use match.end() to handle spaced CUSIPs correctly (e.g., "00724F 10 1")
                after_cusip = line[cusip_match.end():].strip()

                # Split before_cusip into issuer parts
                # Combine with any pending issuer parts from previous line
                before_parts = before_cusip.split()

                # If we have pending parts, this completes a multi-line company name
                if pending_issuer_parts:
                    before_parts = pending_issuer_parts + before_parts
                    pending_issuer_parts = []

                if len(before_parts) < 2:
                    # Not enough data, skip
                    continue

                # Extract class and issuer name
                # Common patterns:
                # - "COMPANY NAME COM" → class="COM", issuer="COMPANY NAME"
                # - "COMPANY NAME SPONSORED ADR" → class="SPONSORED ADR", issuer="COMPANY NAME"
                # - "COMPANY NAME CL A" → class="CL A", issuer="COMPANY NAME"

                if len(before_parts) >= 3 and before_parts[-2] == 'SPONSORED' and before_parts[-1] == 'ADR':
                    title_class = 'SPONSORED ADR'
                    issuer_parts = before_parts[:-2]
                elif len(before_parts) >= 3 and before_parts[-2] == 'CL':
                    title_class = 'CL ' + before_parts[-1]
                    issuer_parts = before_parts[:-2]
                elif len(before_parts) >= 5 and ' '.join(before_parts[-4:]).startswith('LIB CAP COM'):
                    # "LIBERTY MEDIA CORPORATION LIB CAP COM A"
                    title_class = ' '.join(before_parts[-4:])
                    issuer_parts = before_parts[:-4]
                elif len(before_parts) >= 2:
                    # Default: last word/token is the class
                    title_class = before_parts[-1]
                    issuer_parts = before_parts[:-1]
                else:
                    # Only one part - skip this row
                    continue

                issuer_name = ' '.join(issuer_parts)

                # Skip if issuer name is empty
                if not issuer_name:
                    continue

                # Parse the numeric data after CUSIP
                # Flexible format handling since empty columns may not appear
                # Expected order: VALUE SHARES [TYPE] [DISCRETION] [MANAGERS] [SOLE] [SHARED] [NONE]
                data_parts = after_cusip.split()

                if len(data_parts) < 2:  # At minimum need value and shares
                    continue

                try:
                    # Value and Shares are always the first two fields
                    value_str = data_parts[0].replace(',', '').replace('$', '')
                    shares_str = data_parts[1].replace(',', '')

                    # '-' denotes an empty column, treated as zero
                    value = int(value_str) if value_str and value_str != '-' else 0
                    shares = float(shares_str) if shares_str and shares_str != '-' else 0

                    # Parse voting columns from the end (look for numeric values)
                    # Work backwards from end to find up to 3 numeric voting columns
                    voting_values = []
                    for i in range(len(data_parts) - 1, 1, -1):  # Start from end, skip first 2 (value/shares)
                        part = data_parts[i].replace(',', '').replace('.', '')
                        if part.replace('-', '').isdigit():
                            # This is a numeric value (could be voting)
                            val_str = data_parts[i].replace(',', '')
                            try:
                                voting_values.insert(0, float(val_str) if val_str != '-' else 0)
                                if len(voting_values) == 3:
                                    break
                            except ValueError:
                                break
                        else:
                            # Non-numeric, stop looking for voting columns
                            break

                    # Assign voting values (may have 0-3 values)
                    # Order after the backwards scan is SOLE, SHARED, NONE
                    sole_voting = int(voting_values[0]) if len(voting_values) >= 1 else 0
                    shared_voting = int(voting_values[1]) if len(voting_values) >= 2 else 0
                    non_voting = int(voting_values[2]) if len(voting_values) >= 3 else 0

                    # Find investment discretion by looking for non-numeric field after position 2
                    # It's typically "Shared-Defined", "SOLE", "Defined", etc.
                    # Skip position 2 which might be TYPE (SH/PRN)
                    investment_discretion = ''
                    num_voting_at_end = len(voting_values)
                    for i in range(2, len(data_parts) - num_voting_at_end):
                        part = data_parts[i]
                        # Investment discretion contains letters and is not a known type marker
                        if part and part not in ['-', 'SH', 'PRN'] and not part.replace(',', '').replace('.', '').isdigit():
                            investment_discretion = part
                            break

                    # Create row dict (column names match the XML parser's output)
                    # NOTE(review): Type is hard-coded to 'Shares' here; PRN rows are
                    # not distinguished in this format — confirm against sample filings
                    row_dict = {
                        'Issuer': issuer_name,
                        'Class': title_class,
                        'Cusip': cusip,
                        'Value': value,
                        'SharesPrnAmount': shares,
                        'Type': 'Shares',
                        'PutCall': '',
                        'InvestmentDiscretion': investment_discretion,
                        'SoleVoting': sole_voting,
                        'SharedVoting': shared_voting,
                        'NonVoting': non_voting
                    }

                    parsed_rows.append(row_dict)

                except (ValueError, IndexError):
                    # Skip rows that don't parse correctly
                    continue

            else:
                # No CUSIP on this line - might be first part of a multi-line company name
                # Store it for the next line
                if line and not line.startswith(('Total', 'Title')):
                    pending_issuer_parts = line.split()

    # Create DataFrame
    if not parsed_rows:
        return pd.DataFrame()

    table = pd.DataFrame(parsed_rows)

    # Add ticker symbols using CUSIP mapping
    cusip_mapping = cusip_ticker_mapping(allow_duplicate_cusips=False)
    table['Ticker'] = table.Cusip.map(cusip_mapping.Ticker)

    return table
|
||||
@@ -0,0 +1,56 @@
|
||||
"""Parser for 13F information table XML format."""
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from edgar.reference import cusip_ticker_mapping
|
||||
from edgar.xmltools import child_text, find_element
|
||||
|
||||
__all__ = ['parse_infotable_xml']
|
||||
|
||||
|
||||
def parse_infotable_xml(infotable_xml: str) -> pd.DataFrame:
    """
    Parse the infotable xml and return a pandas DataFrame.

    Each <infoTable> element becomes one row of holdings data; a Ticker
    column is appended by mapping each CUSIP through the reference
    CUSIP-to-ticker table.

    Args:
        infotable_xml: XML content of the information table

    Returns:
        pd.DataFrame: Holdings data with columns matching the XML structure
    """
    root = find_element(infotable_xml, "informationTable")
    rows = []
    # Maps the SEC's sshPrnamtType codes to display names
    shares_or_principal = {"SH": "Shares", "PRN": "Principal"}
    for info_tag in root.find_all("infoTable"):
        info_table = dict()

        info_table['Issuer'] = child_text(info_tag, "nameOfIssuer")
        info_table['Class'] = child_text(info_tag, "titleOfClass")
        info_table['Cusip'] = child_text(info_tag, "cusip")
        # Parse via float first so a decimal string like "1234.0" doesn't raise;
        # this is consistent with how the voting-authority fields below are parsed
        # (previously a bare int() which would fail on such values).
        info_table['Value'] = int(float(child_text(info_tag, "value")))

        # Shares or principal amount (kept as raw text from the filing)
        shares_tag = info_tag.find("shrsOrPrnAmt")
        info_table['SharesPrnAmount'] = child_text(shares_tag, "sshPrnamt")

        # "SH" -> "Shares", "PRN" -> "Principal"; unknown codes map to None
        ssh_prnamt_type = child_text(shares_tag, "sshPrnamtType")
        info_table['Type'] = shares_or_principal.get(ssh_prnamt_type)

        # putCall is optional; default to empty string when absent
        info_table["PutCall"] = child_text(info_tag, "putCall") or ""
        info_table['InvestmentDiscretion'] = child_text(info_tag, "investmentDiscretion")

        # Voting authority counts — float-then-int tolerates decimal strings
        voting_auth_tag = info_tag.find("votingAuthority")
        info_table['SoleVoting'] = int(float(child_text(voting_auth_tag, "Sole")))
        info_table['SharedVoting'] = int(float(child_text(voting_auth_tag, "Shared")))
        info_table['NonVoting'] = int(float(child_text(voting_auth_tag, "None")))
        rows.append(info_table)

    table = pd.DataFrame(rows)

    # Add the ticker symbol by mapping each CUSIP through the reference data
    cusip_mapping = cusip_ticker_mapping(allow_duplicate_cusips=False)
    table['Ticker'] = table.Cusip.map(cusip_mapping.Ticker)

    return table
|
||||
@@ -0,0 +1,118 @@
|
||||
"""Parser for 13F primary document XML format."""
|
||||
|
||||
from datetime import datetime
|
||||
from decimal import Decimal
|
||||
from functools import lru_cache
|
||||
|
||||
from edgar._party import Address
|
||||
from edgar.thirteenf.models import (
|
||||
FilingManager,
|
||||
OtherManager,
|
||||
CoverPage,
|
||||
SummaryPage,
|
||||
Signature,
|
||||
PrimaryDocument13F
|
||||
)
|
||||
from edgar.xmltools import child_text, find_element
|
||||
|
||||
__all__ = ['parse_primary_document_xml']
|
||||
|
||||
|
||||
@lru_cache(maxsize=8)
def parse_primary_document_xml(primary_document_xml: str):
    """
    Parse the primary 13F XML document.

    Walks the <edgarSubmission> tree, extracting the report period from the
    header, then the cover page (filing manager and its address, any other
    managers), the summary page (counts/totals, defaulting to 0 when the
    section is absent), and the signature block.

    Results are memoized on the full XML string (lru_cache, 8 entries) so
    repeated access to the same filing doesn't re-parse.

    Args:
        primary_document_xml: XML content of the primary document

    Returns:
        PrimaryDocument13F: Parsed primary document data

    Raises:
        ValueError: if periodOfReport is not in MM-DD-YYYY format
    """
    root = find_element(primary_document_xml, "edgarSubmission")
    # Header data: periodOfReport is formatted MM-DD-YYYY in EDGAR filings
    header_data = root.find("headerData")
    filer_info = header_data.find("filerInfo")
    report_period = datetime.strptime(child_text(filer_info, "periodOfReport"), "%m-%d-%Y")

    # Form Data
    form_data = root.find("formData")
    cover_page_el = form_data.find("coverPage")

    report_calendar_or_quarter = child_text(form_data, "reportCalendarOrQuarter")
    report_type = child_text(cover_page_el, "reportType")

    # Filing Manager
    filing_manager_el = cover_page_el.find("filingManager")

    # Address of the filing manager
    address_el = filing_manager_el.find("address")
    address = Address(
        street1=child_text(address_el, "street1"),
        street2=child_text(address_el, "street2"),
        city=child_text(address_el, "city"),
        state_or_country=child_text(address_el, "stateOrCountry"),
        zipcode=child_text(address_el, "zipCode")
    )
    filing_manager = FilingManager(name=child_text(filing_manager_el, "name"), address=address)
    # Other managers (optional section; empty list when absent)
    other_manager_info_el = cover_page_el.find("otherManagersInfo")
    other_managers = [
        OtherManager(
            cik=child_text(other_manager_el, "cik"),
            name=child_text(other_manager_el, "name"),
            file_number=child_text(other_manager_el, "form13FFileNumber")
        )
        for other_manager_el in other_manager_info_el.find_all("otherManager")
    ] if other_manager_info_el else []

    # Summary Page (optional; e.g. notice filings may omit it)
    summary_page_el = form_data.find("summaryPage")
    if summary_page_el:
        # Each field may be missing; when present it stays a string until
        # converted below, so empty strings fall through as falsy
        other_included_managers_count = child_text(summary_page_el,
                                                   "otherIncludedManagersCount")
        if other_included_managers_count:
            other_included_managers_count = int(other_included_managers_count)

        total_holdings = child_text(summary_page_el, "tableEntryTotal")
        if total_holdings:
            total_holdings = int(total_holdings)

        # Decimal preserves exact monetary totals
        total_value = child_text(summary_page_el, "tableValueTotal")
        if total_value:
            total_value = Decimal(total_value)
    else:
        other_included_managers_count = 0
        total_holdings = 0
        total_value = 0

    # Signature Block
    signature_block_el = form_data.find("signatureBlock")
    signature = Signature(
        name=child_text(signature_block_el, "name"),
        title=child_text(signature_block_el, "title"),
        phone=child_text(signature_block_el, "phone"),
        city=child_text(signature_block_el, "city"),
        signature=child_text(signature_block_el, "signature"),
        state_or_country=child_text(signature_block_el, "stateOrCountry"),
        date=child_text(signature_block_el, "signatureDate")
    )

    # Assemble the parsed document; 'or 0' guards against empty-string
    # summary values that skipped the int/Decimal conversion above
    parsed_primary_doc = PrimaryDocument13F(
        report_period=report_period,
        cover_page=CoverPage(
            filing_manager=filing_manager,
            report_calendar_or_quarter=report_calendar_or_quarter,
            report_type=report_type,
            other_managers=other_managers
        ),
        signature=signature,
        summary_page=SummaryPage(
            other_included_managers_count=other_included_managers_count or 0,
            total_holdings=total_holdings or 0,
            total_value=total_value or 0
        ),
        additional_information=child_text(cover_page_el, "additionalInformation")
    )

    return parsed_primary_doc
|
||||
@@ -0,0 +1,90 @@
|
||||
"""Rich rendering for 13F holdings reports."""
|
||||
|
||||
from rich import box
|
||||
from rich.console import Group
|
||||
from rich.panel import Panel
|
||||
from rich.table import Column, Table
|
||||
|
||||
__all__ = ['render_rich', 'infotable_summary']
|
||||
|
||||
|
||||
def infotable_summary(thirteen_f):
    """
    Build a display-ready summary of a 13F information table.

    Selects the display columns, renames SharesPrnAmount to Shares, fills
    missing Type/Ticker values with placeholders, and sorts holdings by
    Value in descending order.

    Args:
        thirteen_f: ThirteenF instance

    Returns:
        pd.DataFrame or None: Summary of holdings sorted by value, or None
        when the filing has no (non-empty) information table.
    """
    if not thirteen_f.has_infotable():
        return None

    holdings = thirteen_f.infotable
    if holdings is None or len(holdings) == 0:
        return None

    display_columns = ['Issuer', 'Class', 'Cusip', 'Ticker', 'Value', 'SharesPrnAmount',
                       'Type', 'PutCall', 'SoleVoting', 'SharedVoting', 'NonVoting']
    summary = holdings.filter(display_columns).rename(columns={'SharesPrnAmount': 'Shares'})

    # Placeholders for missing values so the rendered table has no gaps
    summary = summary.assign(
        Type=summary.Type.fillna('-'),
        Ticker=summary.Ticker.fillna(''),
    )
    return summary.sort_values('Value', ascending=False)
|
||||
|
||||
|
||||
def render_rich(thirteen_f):
    """
    Build a Rich Panel display for a 13F filing.

    The panel stacks a one-row headline table (period, manager, signer,
    totals, accession number, filing date) above a per-holding detail table
    when an information table is available.

    Args:
        thirteen_f: ThirteenF instance

    Returns:
        Panel: Rich Panel containing filing summary and holdings table
    """
    panel_title = f"{thirteen_f.form} Holding Report for {thirteen_f.filing.company} for period {thirteen_f.report_period}"

    # Headline facts about the filing, one row only.
    header_table = Table(
        "Report Period",
        Column("Investment Manager", style="bold deep_sky_blue1"),
        "Signed By",
        "Holdings",
        "Value",
        "Accession Number",
        "Filed",
        box=box.SIMPLE,
    )

    if thirteen_f.investment_manager:
        manager_name = thirteen_f.investment_manager.name
    else:
        manager_name = thirteen_f.manager_name
    value_text = f"${thirteen_f.total_value:,.0f}" if thirteen_f.total_value else "-"

    header_table.add_row(
        thirteen_f.report_period,
        manager_name,
        thirteen_f.signer or "-",
        str(thirteen_f.total_holdings or "-"),
        value_text,
        thirteen_f.filing.accession_no,
        thirteen_f.filing_date,
    )

    renderables = [header_table]

    # Detail table of individual holdings (only when an infotable exists).
    holdings = infotable_summary(thirteen_f)
    if holdings is not None:
        detail_table = Table("", "Issuer", "Class", "Cusip", "Ticker", "Value", "Type", "Shares", "Put/Call",
                             row_styles=["bold", ""],
                             box=box.SIMPLE)
        for position, holding in enumerate(holdings.itertuples()):
            detail_table.add_row(
                str(position),
                holding.Issuer,
                holding.Class,
                holding.Cusip,
                holding.Ticker,
                f"${holding.Value:,.0f}",
                holding.Type,
                f"{int(holding.Shares):,.0f}",
                holding.PutCall,
            )
        renderables.append(detail_table)

    return Panel(
        Group(*renderables), title=panel_title, subtitle=panel_title
    )
|
||||
Reference in New Issue
Block a user