Initial commit

This commit is contained in:
kdusek
2025-12-09 12:13:01 +01:00
commit 8e654ed209
13332 changed files with 2695056 additions and 0 deletions

View File

@@ -0,0 +1,11 @@
"""13F filing parsers for different document formats."""
from .primary_xml import parse_primary_document_xml
from .infotable_xml import parse_infotable_xml
from .infotable_txt import parse_infotable_txt
__all__ = [
'parse_primary_document_xml',
'parse_infotable_xml',
'parse_infotable_txt',
]

View File

@@ -0,0 +1,119 @@
"""TXT format information table parsers with automatic format detection.
Supports two TXT formats from 2012 filings:
- Format 1 (Multiline): Company names can span multiple lines
- Format 2 (Columnar): All data on single line with <S> and <C> tags
"""
import re
import pandas as pd
from .format_multiline import parse_multiline_format
from .format_columnar import parse_columnar_format
__all__ = ['parse_infotable_txt']
def parse_infotable_txt(infotable_txt: str) -> pd.DataFrame:
    """
    Parse a TXT-format information table, choosing the right parser automatically.

    Two 2012-era layouts are supported:
    - Multiline (Berkshire-style): company names may wrap across lines
    - Columnar (JANA-style): every field of a holding sits on a single line

    Args:
        infotable_txt: TXT content containing the information table

    Returns:
        pd.DataFrame: Holdings data with the same structure as the XML parser
    """
    parser = (
        parse_columnar_format
        if _is_columnar_format(infotable_txt)
        else parse_multiline_format
    )
    return parser(infotable_txt)
def _is_columnar_format(infotable_txt: str) -> bool:
    """
    Decide whether a TXT information table uses the columnar layout.

    Columnar tables put an <S> tag at the start of every data row, while
    multiline tables only carry <S>/<C> tags in the header row.  The check
    samples up to three CUSIP-bearing rows from the holdings table and
    reports columnar when at least half of them also carry an <S> tag.

    Args:
        infotable_txt: TXT content to analyze

    Returns:
        bool: True for columnar format, False for multiline (or no table)
    """
    # The document must contain the information-table heading at all.
    if re.search(r'FORM\s+13F\s+INFORMATION\s+TABLE', infotable_txt, re.IGNORECASE) is None:
        return False
    # Pull out every <TABLE>...</TABLE> section.  The tag may appear before
    # the heading text, so the whole document is scanned.
    found_tables = re.findall(r'<TABLE>(.*?)</TABLE>', infotable_txt, re.DOTALL | re.IGNORECASE)
    if not found_tables:
        return False
    # With two or more tables the first one lists managers, so the second is
    # the first holdings table; otherwise inspect the only table present.
    target_table = found_tables[1] if len(found_tables) >= 2 else found_tables[0]
    tagged_rows = 0   # CUSIP rows that also carry an <S> tag
    sampled_rows = 0  # all CUSIP rows examined so far
    for raw_row in target_table.split('\n'):
        row = raw_row.strip()
        row_upper = row.upper()
        # Ignore blanks and <CAPTION> lines.
        if not row or '<CAPTION>' in row_upper:
            continue
        # A row with an <S> tag but no digit anywhere is a header row.
        if '<S>' in row_upper and re.search(r'\d', row) is None:
            continue
        # Does this row carry a CUSIP?  First look at the first contiguous
        # 9-character alphanumeric token (must contain a digit) ...
        first_hit = re.search(r'\b([A-Za-z0-9]{9})\b', row)
        has_cusip = bool(first_hit and any(ch.isdigit() for ch in first_hit.group(1)))
        # ... then fall back to spaced CUSIPs such as "00724F 10 1".
        if not has_cusip:
            for spaced_hit in re.finditer(r'\b([A-Za-z0-9 ]{9,15})\b', row):
                compact = spaced_hit.group(1).replace(' ', '')
                if len(compact) == 9 and any(ch.isdigit() for ch in compact):
                    has_cusip = True
                    break
        if has_cusip:
            sampled_rows += 1
            if '<S>' in row_upper:
                tagged_rows += 1
        # Three sampled data rows are enough evidence either way.
        if sampled_rows >= 3:
            break
    # Columnar when at least half the sampled CUSIP rows carry an <S> tag.
    return sampled_rows > 0 and tagged_rows >= sampled_rows * 0.5

View File

@@ -0,0 +1,286 @@
"""Parser for columnar TXT format (Format 2) used in some 2012 filings.
This format has <S> and <C> tags for each field, with all data on a single line.
Example:
<S> <C> <C> <C> <C>
AETNA INC NEW COM 00817Y108 92,760 2,342,435 SH SOLE 2,238,895 103,540 0
"""
import re
import pandas as pd
from edgar.reference import cusip_ticker_mapping
__all__ = ['parse_columnar_format']
def _find_cusip(line):
    """Locate the first valid CUSIP in *line*.

    A CUSIP is exactly 9 alphanumeric characters and must contain at least
    one digit (this rejects 9-letter words such as "SPONSORED" or
    "WISCONSIN").  Some 2012 filings insert spaces inside the CUSIP
    ("00724F 10 1" -> "00724F101"), so a spaced fallback pattern is tried
    when no contiguous match is found.

    Args:
        line: A single line of table text.

    Returns:
        (cleaned_cusip, text_as_matched) or None when the line contains no
        valid CUSIP.  text_as_matched is the raw substring (possibly spaced).
    """
    # Fast path: contiguous 9-character token.  Scan every candidate, not
    # just the first, so an all-letter word earlier in the line (e.g. an
    # issuer name like "WISCONSIN") does not hide a real CUSIP after it.
    for candidate in re.finditer(r'\b([A-Za-z0-9]{9})\b', line):
        token = candidate.group(1)
        if any(c.isdigit() for c in token):
            return token, token
    # Fallback: sequences of 9-15 chars that may contain embedded spaces.
    for candidate in re.finditer(r'\b([A-Za-z0-9 ]{9,15})\b', line):
        cleaned = candidate.group(1).replace(' ', '')
        if len(cleaned) == 9 and any(c.isdigit() for c in cleaned):
            return cleaned, candidate.group(1)
    return None


def parse_columnar_format(infotable_txt: str) -> pd.DataFrame:
    """
    Parse columnar TXT format (Format 2) information table.

    Handles the layout where every field of a holding sits on a single line,
    with <S>/<C> tags marking column boundaries.

    Args:
        infotable_txt: TXT content containing the information table

    Returns:
        pd.DataFrame: Holdings data with the same structure as the XML
        parser; empty DataFrame when no information table or holdings exist.
    """
    # The document must contain the information-table heading (case-insensitive).
    if not re.search(r'FORM\s+13F\s+INFORMATION\s+TABLE', infotable_txt, re.IGNORECASE):
        return pd.DataFrame()
    # Extract all content between <TABLE> and </TABLE> tags.  Scan the whole
    # text since the <TABLE> tag may come before the heading.
    tables = re.findall(r'<TABLE>(.*?)</TABLE>', infotable_txt, re.DOTALL | re.IGNORECASE)
    if not tables:
        return pd.DataFrame()
    # With 2+ tables the first is usually the managers list - skip it.
    # With a single table, only proceed when it actually has holdings rows
    # (lines carrying both an <S> tag and a valid CUSIP).
    if len(tables) >= 2:
        holdings_tables = tables[1:]
    else:
        tagged_lines = [ln for ln in tables[0].split('\n') if '<S>' in ln.upper()]
        if not any(_find_cusip(ln) for ln in tagged_lines[:10]):
            return pd.DataFrame()
        holdings_tables = tables
    parsed_rows = []
    for holdings_table in holdings_tables:
        # Skip the totals table (very short, < 200 chars).
        if len(holdings_table.strip()) < 200:
            continue
        for raw_line in holdings_table.split('\n'):
            line = raw_line.strip()
            line_upper = line.upper()
            # Skip blank lines, <CAPTION> lines and title/total rows.
            if not line or '<CAPTION>' in line_upper:
                continue
            if line.startswith(('Total', 'Title', 'NAME OF ISSUER', 'of', 'Market Value')):
                continue
            # Data rows carry an <S> tag and a CUSIP; header rows carry the
            # tags but no CUSIP, so _find_cusip filters them out as well.
            if '<S>' not in line_upper:
                continue
            found = _find_cusip(line)
            if found is None:
                continue
            cusip, raw_cusip = found
            # Strip the SGML tags (replaced with spaces so adjacent fields
            # still split cleanly) and tokenize.
            parts = line.replace('<S>', ' ').replace('<C>', ' ').split()
            # Minimum token count: issuer, class, cusip, value, shares,
            # type, discretion, sole, shared, none.
            if len(parts) < 10:
                continue
            try:
                # Locate the CUSIP inside the token list.  It may occupy one
                # token (contiguous or raw form), or several tokens for
                # spaced CUSIPs (e.g. ["00724F", "10", "1"]).
                cusip_idx = None
                cusip_span = 1  # number of tokens the CUSIP occupies
                if cusip in parts:
                    cusip_idx = parts.index(cusip)
                elif raw_cusip in parts:
                    cusip_idx = parts.index(raw_cusip)
                else:
                    for i in range(len(parts) - 2):
                        # Try joining 2-4 consecutive tokens.
                        for span in range(2, 5):
                            if i + span > len(parts):
                                break
                            if ''.join(parts[i:i + span]) == cusip:
                                cusip_idx = i
                                cusip_span = span
                                break
                        if cusip_idx is not None:
                            break
                if cusip_idx is None:
                    continue
                # Tokens before the CUSIP: issuer name, with the share class
                # as the last token.
                before_cusip = parts[:cusip_idx]
                if len(before_cusip) < 2:
                    continue
                title_class = before_cusip[-1]
                issuer_name = ' '.join(before_cusip[:-1])
                # Tokens after the CUSIP:
                # VALUE SHARES TYPE DISCRETION ... SOLE SHARED NONE
                after_cusip = parts[cusip_idx + cusip_span:]
                if len(after_cusip) < 7:
                    continue
                value_str = after_cusip[0].replace(',', '').replace('$', '')
                shares_str = after_cusip[1].replace(',', '')
                value = int(value_str) if value_str and value_str != '-' else 0
                shares = int(shares_str) if shares_str and shares_str != '-' else 0
                # Shares-or-principal marker (SH/PRN); anything else maps to
                # "Shares" as a conservative default.
                share_type = after_cusip[2] if len(after_cusip) > 2 else 'SH'
                share_type_full = {'SH': 'Shares', 'PRN': 'Principal'}.get(share_type, 'Shares')
                # Investment discretion is the first non-numeric field after
                # the type marker (e.g. "SOLE", "SHARED-DEFINED").
                investment_discretion = ''
                for i in range(3, len(after_cusip) - 3):  # last 3 are voting columns
                    field = after_cusip[i]
                    if field and field not in ['-'] and not field.replace(',', '').isdigit():
                        investment_discretion = field
                        break
                # Voting authority: the last three fields (sole, shared, none).
                none_voting_str = after_cusip[-1].replace(',', '')
                shared_voting_str = after_cusip[-2].replace(',', '')
                sole_voting_str = after_cusip[-3].replace(',', '')
                non_voting = int(none_voting_str) if none_voting_str and none_voting_str != '-' else 0
                shared_voting = int(shared_voting_str) if shared_voting_str and shared_voting_str != '-' else 0
                sole_voting = int(sole_voting_str) if sole_voting_str and sole_voting_str != '-' else 0
                parsed_rows.append({
                    'Issuer': issuer_name,
                    'Class': title_class,
                    'Cusip': cusip,
                    'Value': value,
                    'SharesPrnAmount': shares,
                    'Type': share_type_full,
                    'PutCall': '',
                    'InvestmentDiscretion': investment_discretion,
                    'SoleVoting': sole_voting,
                    'SharedVoting': shared_voting,
                    'NonVoting': non_voting
                })
            except (ValueError, IndexError):
                # Skip rows that don't parse cleanly.
                continue
    if not parsed_rows:
        return pd.DataFrame()
    table = pd.DataFrame(parsed_rows)
    # Add ticker symbols via the CUSIP -> ticker mapping.
    cusip_mapping = cusip_ticker_mapping(allow_duplicate_cusips=False)
    table['Ticker'] = table.Cusip.map(cusip_mapping.Ticker)
    return table

View File

@@ -0,0 +1,273 @@
"""Parser for multiline TXT format (Format 1) used in some 2012 filings.
This format has company names that can span multiple lines, with the CUSIP
appearing on the same line as the continuation of the company name.
Example:
AMERICAN
EXPRESS CO COM 025816109 110999 1952142 Shared-Defined...
"""
import re
import pandas as pd
from edgar.reference import cusip_ticker_mapping
__all__ = ['parse_multiline_format']
def parse_multiline_format(infotable_txt: str) -> pd.DataFrame:
    """
    Parse multiline TXT format (Format 1) information table.

    Handles the layout where a company name can span multiple lines: the
    first fragment of the name sits alone on one line and the rest of the
    row (name continuation, class, CUSIP, numeric data) follows on the next
    line.  A small amount of state (``pending_issuer_parts``) carries the
    dangling name fragment forward to the CUSIP-bearing line.

    Args:
        infotable_txt: TXT content containing the information table

    Returns:
        pd.DataFrame: Holdings data with the same structure as the XML
        parser; empty DataFrame when no information table or holdings exist.
    """
    # The document must contain the information-table heading (case-insensitive).
    match = re.search(r'FORM\s+13F\s+INFORMATION\s+TABLE', infotable_txt, re.IGNORECASE)
    if not match:
        return pd.DataFrame()
    # Extract all content between <TABLE> and </TABLE> tags (case-insensitive).
    # Search the whole text since the <TABLE> tag may come before the heading.
    table_pattern = r'<TABLE>(.*?)</TABLE>'
    tables = re.findall(table_pattern, infotable_txt, re.DOTALL | re.IGNORECASE)
    if len(tables) == 0:
        return pd.DataFrame()
    # Determine which tables to process:
    # - 2+ tables: skip the first (usually the managers list), process the rest
    # - 1 table: only process it if it actually contains CUSIP-like tokens
    if len(tables) >= 2:
        holdings_tables = tables[1:]
    elif len(tables) == 1:
        # Look for 9-char alphanumeric sequences (with or without embedded
        # spaces) that contain at least one digit - i.e. plausible CUSIPs.
        potential_cusips = re.findall(r'\b([A-Za-z0-9]{9})\b', tables[0])
        spaced_cusips = re.findall(r'\b([A-Za-z0-9 ]{9,15})\b', tables[0])
        spaced_cusips_cleaned = [c.replace(' ', '') for c in spaced_cusips if len(c.replace(' ', '')) == 9]
        has_valid_cusips = (
            any(any(c.isdigit() for c in cusip) for cusip in potential_cusips) or
            any(any(c.isdigit() for c in cusip) for cusip in spaced_cusips_cleaned)
        )
        if has_valid_cusips:
            holdings_tables = tables
        else:
            return pd.DataFrame()  # no holdings data in the single table
    else:
        return pd.DataFrame()
    parsed_rows = []
    for holdings_table in holdings_tables:
        # Skip the totals table (very short, < 200 chars).
        if len(holdings_table.strip()) < 200:
            continue
        # Reset the dangling-name state for each table.
        pending_issuer_parts = []
        lines = holdings_table.split('\n')
        for line in lines:
            orig_line = line  # kept for debugging; not otherwise used
            line = line.strip()
            # Skip blanks, <CAPTION> lines, and tag-only header rows:
            # in this format <S>/<C> tags only appear in the header.
            line_upper = line.upper()
            if not line or '<CAPTION>' in line_upper or '<S>' in line_upper or '<C>' in line_upper:
                continue
            # Skip separator lines made purely of dashes and spaces.
            if all(c in '- ' for c in line):
                continue
            # (Recomputed; identical to the value above.)
            line_upper = line.upper()
            # Skip header/title rows.
            if line.startswith(('Total', 'Title', 'Name of Issuer', 'of', 'Market Value')):
                continue
            # Skip column-header rows by keyword.
            if any(keyword in line_upper for keyword in ['COLUMN 1', 'COLUMN 2', 'VOTING AUTHORITY', 'SHRS OR', 'NAME OF ISSUER', 'FORM 13F', 'INFORMATION TABLE']):
                continue
            # Use the CUSIP as the row anchor: 9 alphanumeric characters,
            # at least one digit (rejects 9-letter words like "SPONSORED").
            # Some filings space the CUSIP out: "00724F 10 1" -> "00724F101".
            cusip_match = None
            cusip = None
            # Fast path: contiguous 9-char tokens (scan all candidates).
            all_cusip_matches = re.finditer(r'\b([A-Za-z0-9]{9})\b', line)
            for match in all_cusip_matches:
                if any(c.isdigit() for c in match.group(1)):
                    cusip_match = match
                    cusip = match.group(1)
                    break
            # Fallback: spaced CUSIPs, cleaned to 9 characters.
            if not cusip_match:
                spaced_matches = re.finditer(r'\b([A-Za-z0-9 ]{9,15})\b', line)
                for match in spaced_matches:
                    cleaned = match.group(1).replace(' ', '')
                    if len(cleaned) == 9 and any(c.isdigit() for c in cleaned):
                        cusip_match = match
                        cusip = cleaned  # use the cleaned, space-free version
                        break
            if cusip_match:
                # This line contains a CUSIP, so it holds the main data.
                cusip_pos = cusip_match.start()
                # Text before the CUSIP: issuer name (tail) + share class.
                before_cusip = line[:cusip_pos].strip()
                # Text after the CUSIP: numeric data.  match.end() handles
                # spaced CUSIPs correctly (e.g. "00724F 10 1").
                after_cusip = line[cusip_match.end():].strip()
                before_parts = before_cusip.split()
                # Prepend any name fragment carried over from the previous
                # line - this completes a multi-line company name.
                if pending_issuer_parts:
                    before_parts = pending_issuer_parts + before_parts
                    pending_issuer_parts = []
                if len(before_parts) < 2:
                    # Not enough tokens for issuer + class; skip.
                    continue
                # Split issuer name from share class.  Recognized patterns:
                #   "COMPANY NAME COM"            -> class "COM"
                #   "COMPANY NAME SPONSORED ADR"  -> class "SPONSORED ADR"
                #   "COMPANY NAME CL A"           -> class "CL A"
                #   "... LIB CAP COM A"           -> 4-token Liberty Media class
                if len(before_parts) >= 3 and before_parts[-2] == 'SPONSORED' and before_parts[-1] == 'ADR':
                    title_class = 'SPONSORED ADR'
                    issuer_parts = before_parts[:-2]
                elif len(before_parts) >= 3 and before_parts[-2] == 'CL':
                    title_class = 'CL ' + before_parts[-1]
                    issuer_parts = before_parts[:-2]
                elif len(before_parts) >= 5 and ' '.join(before_parts[-4:]).startswith('LIB CAP COM'):
                    # e.g. "LIBERTY MEDIA CORPORATION LIB CAP COM A"
                    title_class = ' '.join(before_parts[-4:])
                    issuer_parts = before_parts[:-4]
                elif len(before_parts) >= 2:
                    # Default: the last token is the class.
                    title_class = before_parts[-1]
                    issuer_parts = before_parts[:-1]
                else:
                    # Only one token - cannot split; skip this row.
                    continue
                issuer_name = ' '.join(issuer_parts)
                if not issuer_name:
                    continue
                # Numeric data after the CUSIP.  Columns are flexible since
                # empty columns may be omitted entirely:
                #   VALUE SHARES [TYPE] [DISCRETION] [MANAGERS] [SOLE] [SHARED] [NONE]
                data_parts = after_cusip.split()
                if len(data_parts) < 2:  # at minimum need value and shares
                    continue
                try:
                    # Value and shares are always the first two fields.
                    value_str = data_parts[0].replace(',', '').replace('$', '')
                    shares_str = data_parts[1].replace(',', '')
                    value = int(value_str) if value_str and value_str != '-' else 0
                    # NOTE(review): shares parsed as float here (the columnar
                    # parser uses int) - presumably to tolerate decimals;
                    # confirm downstream dtype expectations.
                    shares = float(shares_str) if shares_str and shares_str != '-' else 0
                    # Collect up to 3 trailing numeric fields as the voting
                    # columns, walking backwards from the end of the row and
                    # stopping at the first non-numeric token.
                    voting_values = []
                    for i in range(len(data_parts) - 1, 1, -1):  # skip first 2 (value/shares)
                        part = data_parts[i].replace(',', '').replace('.', '')
                        if part.replace('-', '').isdigit():
                            val_str = data_parts[i].replace(',', '')
                            try:
                                # insert(0, ...) keeps left-to-right order:
                                # [sole, shared, none]
                                voting_values.insert(0, float(val_str) if val_str != '-' else 0)
                                if len(voting_values) == 3:
                                    break
                            except ValueError:
                                break
                        else:
                            # Non-numeric token: no more voting columns.
                            break
                    # Assign whatever voting values were found (0-3 of them).
                    sole_voting = int(voting_values[0]) if len(voting_values) >= 1 else 0
                    shared_voting = int(voting_values[1]) if len(voting_values) >= 2 else 0
                    non_voting = int(voting_values[2]) if len(voting_values) >= 3 else 0
                    # Investment discretion: first non-numeric field after
                    # position 2 (which may be the SH/PRN type marker) and
                    # before the trailing voting columns.  Typically
                    # "Shared-Defined", "SOLE", "Defined", etc.
                    investment_discretion = ''
                    num_voting_at_end = len(voting_values)
                    for i in range(2, len(data_parts) - num_voting_at_end):
                        part = data_parts[i]
                        if part and part not in ['-', 'SH', 'PRN'] and not part.replace(',', '').replace('.', '').isdigit():
                            investment_discretion = part
                            break
                    # Assemble the output row (same schema as the XML parser).
                    row_dict = {
                        'Issuer': issuer_name,
                        'Class': title_class,
                        'Cusip': cusip,
                        'Value': value,
                        'SharesPrnAmount': shares,
                        'Type': 'Shares',
                        'PutCall': '',
                        'InvestmentDiscretion': investment_discretion,
                        'SoleVoting': sole_voting,
                        'SharedVoting': shared_voting,
                        'NonVoting': non_voting
                    }
                    parsed_rows.append(row_dict)
                except (ValueError, IndexError):
                    # Skip rows that don't parse cleanly.
                    continue
            else:
                # No CUSIP on this line - treat it as the first fragment of a
                # multi-line company name and hold it for the next line.
                # (Any previous unconsumed fragment is overwritten.)
                if line and not line.startswith(('Total', 'Title')):
                    pending_issuer_parts = line.split()
    if not parsed_rows:
        return pd.DataFrame()
    table = pd.DataFrame(parsed_rows)
    # Add ticker symbols via the CUSIP -> ticker mapping.
    cusip_mapping = cusip_ticker_mapping(allow_duplicate_cusips=False)
    table['Ticker'] = table.Cusip.map(cusip_mapping.Ticker)
    return table

View File

@@ -0,0 +1,56 @@
"""Parser for 13F information table XML format."""
import pandas as pd
from edgar.reference import cusip_ticker_mapping
from edgar.xmltools import child_text, find_element
__all__ = ['parse_infotable_xml']
def parse_infotable_xml(infotable_xml: str) -> pd.DataFrame:
    """
    Parse the infotable xml and return a pandas DataFrame.

    Args:
        infotable_xml: XML content of the information table

    Returns:
        pd.DataFrame: one row per <infoTable> element with columns
            Issuer, Class, Cusip, Value, SharesPrnAmount, Type, PutCall,
            InvestmentDiscretion, SoleVoting, SharedVoting, NonVoting, Ticker
    """
    root = find_element(infotable_xml, "informationTable")
    rows = []
    # "SH" -> "Shares", "PRN" -> "Principal"; unknown codes map to None.
    shares_or_principal = {"SH": "Shares", "PRN": "Principal"}
    for info_tag in root.find_all("infoTable"):
        info_table = dict()
        info_table['Issuer'] = child_text(info_tag, "nameOfIssuer")
        info_table['Class'] = child_text(info_tag, "titleOfClass")
        info_table['Cusip'] = child_text(info_tag, "cusip")
        # Go through float first so values with a decimal component
        # (e.g. "92760.0") do not raise - consistent with the voting
        # columns below, which already parse via int(float(...)).
        info_table['Value'] = int(float(child_text(info_tag, "value")))
        # Shares or principal amount
        shares_tag = info_tag.find("shrsOrPrnAmt")
        # NOTE(review): kept as the raw string from the XML (not cast to a
        # number) to preserve the existing output dtype for callers.
        info_table['SharesPrnAmount'] = child_text(shares_tag, "sshPrnamt")
        # Shares-or-principal type code
        ssh_prnamt_type = child_text(shares_tag, "sshPrnamtType")
        info_table['Type'] = shares_or_principal.get(ssh_prnamt_type)
        info_table["PutCall"] = child_text(info_tag, "putCall") or ""
        info_table['InvestmentDiscretion'] = child_text(info_tag, "investmentDiscretion")
        # Voting authority: values may carry decimals, so parse via float.
        voting_auth_tag = info_tag.find("votingAuthority")
        info_table['SoleVoting'] = int(float(child_text(voting_auth_tag, "Sole")))
        info_table['SharedVoting'] = int(float(child_text(voting_auth_tag, "Shared")))
        info_table['NonVoting'] = int(float(child_text(voting_auth_tag, "None")))
        rows.append(info_table)
    table = pd.DataFrame(rows)
    # Add the ticker symbol via the CUSIP -> ticker mapping.
    cusip_mapping = cusip_ticker_mapping(allow_duplicate_cusips=False)
    table['Ticker'] = table.Cusip.map(cusip_mapping.Ticker)
    return table

View File

@@ -0,0 +1,118 @@
"""Parser for 13F primary document XML format."""
from datetime import datetime
from decimal import Decimal
from functools import lru_cache
from edgar._party import Address
from edgar.thirteenf.models import (
FilingManager,
OtherManager,
CoverPage,
SummaryPage,
Signature,
PrimaryDocument13F
)
from edgar.xmltools import child_text, find_element
__all__ = ['parse_primary_document_xml']
@lru_cache(maxsize=8)
def parse_primary_document_xml(primary_document_xml: str):
    """
    Parse the primary 13F XML document.

    Results are memoized on the XML string (lru_cache, up to 8 documents)
    since the same document may be parsed repeatedly.

    Args:
        primary_document_xml: XML content of the primary document

    Returns:
        PrimaryDocument13F: Parsed primary document data
    """
    root = find_element(primary_document_xml, "edgarSubmission")
    # Header data: the filer info carries the report period (MM-DD-YYYY).
    header_data = root.find("headerData")
    filer_info = header_data.find("filerInfo")
    report_period = datetime.strptime(child_text(filer_info, "periodOfReport"), "%m-%d-%Y")
    # Form data and cover page
    form_data = root.find("formData")
    cover_page_el = form_data.find("coverPage")
    report_calendar_or_quarter = child_text(form_data, "reportCalendarOrQuarter")
    report_type = child_text(cover_page_el, "reportType")
    # Filing manager and its address
    filing_manager_el = cover_page_el.find("filingManager")
    address_el = filing_manager_el.find("address")
    address = Address(
        street1=child_text(address_el, "street1"),
        street2=child_text(address_el, "street2"),
        city=child_text(address_el, "city"),
        state_or_country=child_text(address_el, "stateOrCountry"),
        zipcode=child_text(address_el, "zipCode")
    )
    filing_manager = FilingManager(name=child_text(filing_manager_el, "name"), address=address)
    # Other managers (optional section).  Compare against None explicitly:
    # XML element objects can evaluate as falsy when they have no children,
    # which would wrongly treat a present element as missing.
    other_manager_info_el = cover_page_el.find("otherManagersInfo")
    other_managers = [
        OtherManager(
            cik=child_text(other_manager_el, "cik"),
            name=child_text(other_manager_el, "name"),
            file_number=child_text(other_manager_el, "form13FFileNumber")
        )
        for other_manager_el in other_manager_info_el.find_all("otherManager")
    ] if other_manager_info_el is not None else []
    # Summary page (optional); default all counts to zero when absent.
    summary_page_el = form_data.find("summaryPage")
    if summary_page_el is not None:
        other_included_managers_count = child_text(summary_page_el,
                                                   "otherIncludedManagersCount")
        if other_included_managers_count:
            other_included_managers_count = int(other_included_managers_count)
        total_holdings = child_text(summary_page_el, "tableEntryTotal")
        if total_holdings:
            total_holdings = int(total_holdings)
        total_value = child_text(summary_page_el, "tableValueTotal")
        if total_value:
            total_value = Decimal(total_value)
    else:
        other_included_managers_count = 0
        total_holdings = 0
        total_value = 0
    # Signature block
    signature_block_el = form_data.find("signatureBlock")
    signature = Signature(
        name=child_text(signature_block_el, "name"),
        title=child_text(signature_block_el, "title"),
        phone=child_text(signature_block_el, "phone"),
        city=child_text(signature_block_el, "city"),
        signature=child_text(signature_block_el, "signature"),
        state_or_country=child_text(signature_block_el, "stateOrCountry"),
        date=child_text(signature_block_el, "signatureDate")
    )
    # Assemble the parsed document.  The `or 0` fallbacks cover summary-page
    # fields that were present but empty (child_text returned None/"").
    parsed_primary_doc = PrimaryDocument13F(
        report_period=report_period,
        cover_page=CoverPage(
            filing_manager=filing_manager,
            report_calendar_or_quarter=report_calendar_or_quarter,
            report_type=report_type,
            other_managers=other_managers
        ),
        signature=signature,
        summary_page=SummaryPage(
            other_included_managers_count=other_included_managers_count or 0,
            total_holdings=total_holdings or 0,
            total_value=total_value or 0
        ),
        additional_information=child_text(cover_page_el, "additionalInformation")
    )
    return parsed_primary_doc