Initial commit

kdusek committed 2025-12-09 12:13:01 +01:00
commit 8e654ed209
13332 changed files with 2695056 additions and 0 deletions


@@ -0,0 +1,45 @@
from edgar import Company
from collections import defaultdict

# Get Apple facts
aapl = Company("AAPL")
facts = aapl.facts
raw_facts = facts._facts

print("Analyzing period durations for FY facts:\n")

# Group facts by (fiscal_year, fiscal_period, period_end)
fact_groups = defaultdict(list)
for fact in raw_facts:
    if fact.statement_type == 'IncomeStatement' and fact.fiscal_period == 'FY':
        if fact.fiscal_year and fact.fiscal_year >= 2019 and fact.fiscal_year <= 2021:
            if 'RevenueFromContract' in str(fact.concept) and 'Liability' not in str(fact.concept):
                key = (fact.fiscal_year, fact.fiscal_period, fact.period_end)
                fact_groups[key].append(fact)

# Analyze each group
for key in sorted(fact_groups.keys()):
    year, period, end_date = key
    facts_in_group = fact_groups[key]
    if len(facts_in_group) > 1:
        print(f"\nFY {year} ending {end_date}: {len(facts_in_group)} facts")
        for fact in facts_in_group:
            duration = None
            if fact.period_start and fact.period_end:
                duration = (fact.period_end - fact.period_start).days
            period_type = "Annual" if duration and duration > 300 else "Quarterly" if duration else "Unknown"
            print(f"  ${fact.value:,.0f} - Duration: {duration} days ({period_type})")
            print(f"    Period: {fact.period_start} to {fact.period_end}")
            print(f"    Filed: {fact.filing_date}")
            if hasattr(fact, 'form'):
                print(f"    Form: {fact.form}")
            if hasattr(fact, 'accession'):
                print(f"    Accession: {fact.accession}")

print("\n\nSummary:")
print("The issue: Both annual and quarterly revenue are marked as 'FY'")
print("Solution: Use period duration to distinguish:")
print("  - Annual: period_start to period_end > 300 days")
print("  - Quarterly: period_start to period_end < 100 days")


@@ -0,0 +1,57 @@
from edgar import Company
from collections import defaultdict

# Get Apple facts
aapl = Company("AAPL")
facts = aapl.facts
raw_facts = facts._facts

# Check all FY income statement facts for 2019-2024
print("Checking FY facts and their period_end dates:\n")
print("fiscal_year | fiscal_period | period_end | period_end.year | Match?")
print("-" * 70)

fy_facts = defaultdict(list)
for fact in raw_facts:
    if fact.statement_type == 'IncomeStatement' and fact.fiscal_period == 'FY':
        if fact.fiscal_year and fact.fiscal_year >= 2019:
            fy_facts[fact.fiscal_year].append(fact)

# Show all FY entries grouped by fiscal_year
for year in sorted(fy_facts.keys(), reverse=True):
    facts_for_year = fy_facts[year]
    # Get unique period_end dates for this fiscal year
    unique_ends = set()
    for fact in facts_for_year:
        if fact.period_end:
            unique_ends.add(fact.period_end)
    print(f"\nFY {year} has {len(unique_ends)} unique period_end dates:")
    for end_date in sorted(unique_ends):
        if end_date:
            match = "✓" if end_date.year == year else "✗"
            print(f"  {year:4d} | FY | {end_date} | {end_date.year} | {match}")

# Now check if we have the correct matches
print("\n\nChecking if we have correct year matches:")
correct_matches = defaultdict(set)
for fact in raw_facts:
    if fact.statement_type == 'IncomeStatement' and fact.fiscal_period == 'FY':
        if fact.period_end and fact.fiscal_year:
            if fact.period_end.year == fact.fiscal_year:
                correct_matches[fact.fiscal_year].add(fact.period_end)

print("\nFiscal years with matching period_end.year:")
for year in sorted(correct_matches.keys(), reverse=True)[:6]:
    for end_date in correct_matches[year]:
        print(f"  FY {year} -> {end_date}")

# Check revenue values for correct matches
print("\n\nRevenue values for CORRECT year matches:")
for fact in raw_facts:
    if fact.statement_type == 'IncomeStatement' and fact.fiscal_period == 'FY':
        if fact.period_end and fact.fiscal_year:
            if fact.period_end.year == fact.fiscal_year:
                if 'RevenueFromContract' in str(fact.concept) and 'Liability' not in str(fact.concept):
                    if fact.fiscal_year >= 2019 and fact.fiscal_year <= 2024:
                        print(f"  FY {fact.fiscal_year} (ends {fact.period_end}): ${fact.value:,.0f}")


@@ -0,0 +1,172 @@
#!/usr/bin/env python3
"""
Check which renderer is actually being used in the MSFT table.
"""
import sys
sys.path.insert(0, '/Users/dwight/PycharmProjects/edgartools')

from edgar.documents.parser import HTMLParser
from edgar.documents.config import ParserConfig
from edgar.documents.table_nodes import TableNode


def check_renderer_usage():
    print("🔍 CHECKING WHICH RENDERER IS ACTUALLY BEING USED")
    print("=" * 60)
    try:
        # Parse with default config
        with open('/Users/dwight/PycharmProjects/edgartools/data/html/MSFT.10-K.html', 'r') as f:
            html_content = f.read()

        # Check what the default config actually has
        config = ParserConfig()
        print(f"Default ParserConfig.fast_table_rendering: {config.fast_table_rendering}")

        parser = HTMLParser(config)
        document = parser.parse(html_content)

        # Find target table
        target_table = None

        def find_target(node):
            nonlocal target_table
            if isinstance(node, TableNode):
                try:
                    if "Weighted average outstanding shares" in node.text():
                        target_table = node
                        return
                except Exception:
                    pass
            if hasattr(node, 'children'):
                for child in node.children:
                    find_target(child)

        find_target(document.root)
        if not target_table:
            print("❌ Target table not found")
            return

        print("✅ Found target table")
        print(f"Table has _config: {'✓' if hasattr(target_table, '_config') else '✗'}")
        if hasattr(target_table, '_config'):
            print(f"Table config fast_table_rendering: {target_table._config.fast_table_rendering}")

        # Test the decision logic in TableNode.text()
        print(f"\n🔍 TRACING TableNode.text() DECISION LOGIC:")

        # Check if cache exists
        has_cache = hasattr(target_table, '_text_cache') and target_table._text_cache is not None
        print(f"Has cached text: {has_cache}")
        if has_cache:
            print("❗ Using cached result - clearing cache to test renderer...")
            target_table._text_cache = None

        # Check the config decision
        config_obj = getattr(target_table, '_config', None)
        should_use_fast = config_obj and getattr(config_obj, 'fast_table_rendering', False)
        print(f"Config object exists: {'✓' if config_obj else '✗'}")
        print(f"Should use fast rendering: {'✓' if should_use_fast else '✗'}")

        # Test both renderers directly
        print(f"\n🧪 TESTING BOTH RENDERERS DIRECTLY:")

        # Test Rich renderer
        try:
            print("Rich renderer test:")
            rich_table = target_table.render(width=195)
            from edgar.richtools import rich_to_text
            rich_text = rich_to_text(rich_table)
            rich_has_pipes = '|' in rich_text
            print(f"  Rich output has pipes: {'✓' if rich_has_pipes else '✗'}")
            print(f"  Rich output length: {len(rich_text)} chars")
            print(f"  Rich preview: {rich_text[:80]}...")
        except Exception as e:
            print(f"  Rich renderer error: {e}")

        # Test Fast renderer
        try:
            print("Fast renderer test:")
            fast_text = target_table._fast_text_rendering()
            fast_has_pipes = '|' in fast_text
            print(f"  Fast output has pipes: {'✓' if fast_has_pipes else '✗'}")
            print(f"  Fast output length: {len(fast_text)} chars")
            print(f"  Fast preview: {fast_text[:80]}...")
        except Exception as e:
            print(f"  Fast renderer error: {e}")

        # Test current text() method
        print("Current text() method:")
        current_text = target_table.text()
        current_has_pipes = '|' in current_text
        print(f"  Current output has pipes: {'✓' if current_has_pipes else '✗'}")
        print(f"  Current output length: {len(current_text)} chars")
        print(f"  Current preview: {current_text[:80]}...")

        # Determine which renderer is actually being used
        if current_has_pipes and len(current_text) < 2000:
            print(f"\n🎯 CONCLUSION: Currently using FAST RENDERER ✅")
        elif not current_has_pipes and len(current_text) > 1500:
            print(f"\n🎯 CONCLUSION: Currently using RICH RENDERER ❌")
        else:
            print(f"\n🤔 CONCLUSION: Unclear which renderer is being used")
    except Exception as e:
        print(f"❌ Error: {e}")
        import traceback
        traceback.print_exc()


def test_explicit_configurations():
    """Test with explicit fast and rich configurations."""
    print(f"\n🧪 TESTING EXPLICIT CONFIGURATIONS")
    print("=" * 60)
    configs = [
        ("Explicit Fast", ParserConfig(fast_table_rendering=True)),
        ("Explicit Rich", ParserConfig(fast_table_rendering=False)),
    ]
    try:
        with open('/Users/dwight/PycharmProjects/edgartools/data/html/MSFT.10-K.html', 'r') as f:
            html_content = f.read()

        for config_name, config in configs:
            print(f"\n🔧 {config_name} (fast_table_rendering={config.fast_table_rendering}):")
            parser = HTMLParser(config)
            document = parser.parse(html_content)

            # Find table
            target_table = None

            def find_target(node):
                nonlocal target_table
                if isinstance(node, TableNode):
                    try:
                        if "Weighted average outstanding shares" in node.text():
                            target_table = node
                            return
                    except Exception:
                        pass
                if hasattr(node, 'children'):
                    for child in node.children:
                        find_target(child)

            find_target(document.root)
            if target_table:
                table_text = target_table.text()
                has_pipes = '|' in table_text
                print(f"  Output has pipes: {'✓' if has_pipes else '✗'}")
                print(f"  Output length: {len(table_text)} chars")
                print(f"  Preview: {table_text[:60]}...")
            else:
                print("  ❌ Table not found")
    except Exception as e:
        print(f"❌ Error: {e}")


if __name__ == "__main__":
    check_renderer_usage()
    test_explicit_configurations()


@@ -0,0 +1,46 @@
from edgar import Company
from collections import defaultdict
import json

# Get Apple facts
aapl = Company("AAPL")
facts = aapl.facts

# Get raw facts data - access the internal facts list
raw_facts = facts._facts

# Look for Revenue facts in 2019 and 2020
revenue_facts = []
for fact in raw_facts:
    if fact.concept and 'Revenue' in fact.concept:
        if fact.fiscal_year in [2019, 2020]:
            revenue_facts.append({
                'concept': fact.concept,
                'value': fact.value,
                'fy': fact.fiscal_year,
                'fp': fact.fiscal_period,
                'period_end': str(fact.period_end) if fact.period_end else None,
                'period_duration': getattr(fact, 'period_duration', None),
                'statement': fact.statement_type,
                'filing_date': str(fact.filing_date) if fact.filing_date else None
            })

print("Revenue facts for 2019-2020:")
print(json.dumps(revenue_facts, indent=2, default=str))

# Group by fiscal year and period
by_year_period = defaultdict(list)
for fact in revenue_facts:
    key = f"{fact['fy']}-{fact['fp']}"
    by_year_period[key].append(fact)

print("\n\nGrouped by fiscal year and period:")
for key in sorted(by_year_period.keys()):
    print(f"\n{key}:")
    for fact in by_year_period[key]:
        print(f"  {fact['concept']}: ${fact['value']:,} (duration: {fact['period_duration']} days)")

# Now check what the income statement method returns
print("\n\nIncome statement for 2019-2020 (annual=True):")
income = facts.income_statement(annual=True, periods=6)
print(income)


@@ -0,0 +1,89 @@
from edgar import Company
from collections import defaultdict

# Get Apple facts
aapl = Company("AAPL")
facts = aapl.facts

# Get raw facts data - access the internal facts list
raw_facts = facts._facts

# Look for all facts in the Income Statement for 2019-2020
income_facts = defaultdict(lambda: defaultdict(list))
for fact in raw_facts:
    if fact.statement_type == 'IncomeStatement':
        if fact.fiscal_year in [2019, 2020]:
            key = f"{fact.fiscal_year}-{fact.fiscal_period}"
            income_facts[fact.concept][key].append({
                'value': fact.value,
                'period_end': fact.period_end,
                'filing_date': fact.filing_date
            })

# Find Revenue/Revenues concepts
revenue_concepts = []
for concept in income_facts.keys():
    if 'Revenue' in concept and 'Contract' not in concept:
        revenue_concepts.append(concept)

print("Revenue concepts found:", revenue_concepts)
print("\nRevenue values by year-period:")
for concept in revenue_concepts:
    print(f"\n{concept}:")
    for period in sorted(income_facts[concept].keys()):
        facts_list = income_facts[concept][period]
        for f in facts_list:
            print(f"  {period}: ${f['value']:,}")

# Check what periods are actually marked as FY
print("\n\nAll FY periods in Income Statement:")
fy_periods = set()
for fact in raw_facts:
    if fact.statement_type == 'IncomeStatement' and fact.fiscal_period == 'FY':
        fy_periods.add((fact.fiscal_year, fact.fiscal_period, fact.period_end))

for year, period, end_date in sorted(fy_periods):
    print(f"  {year} {period} (ends {end_date})")

# Now check what exact facts are selected for 2019 and 2020
print("\n\nChecking what's selected for income statement:")
from edgar.entity.enhanced_statement import EnhancedStatementBuilder
builder = EnhancedStatementBuilder()
stmt_facts = [f for f in raw_facts if f.statement_type == 'IncomeStatement']

# Build period info like the builder does
period_info = {}
period_facts_map = defaultdict(list)
for fact in stmt_facts:
    period_key = (fact.fiscal_year, fact.fiscal_period)
    period_label = f"{fact.fiscal_period} {fact.fiscal_year}"
    period_facts_map[period_label].append(fact)
    if period_key not in period_info:
        period_info[period_key] = {
            'label': period_label,
            'end_date': fact.period_end,
            'is_annual': fact.fiscal_period == 'FY',
            'filing_date': fact.filing_date,
            'fiscal_year': fact.fiscal_year,
            'fiscal_period': fact.fiscal_period
        }

# Get annual periods
annual_periods = [(pk, info) for pk, info in period_info.items() if info['is_annual']]
annual_periods.sort(key=lambda x: x[0][0] if x[0][0] else 0, reverse=True)

print("\nAnnual periods found (sorted newest first):")
for (year, period), info in annual_periods[:10]:
    print(f"  {info['label']} - ends {info['end_date']}")

# Check if there are any revenue facts for FY 2019 and FY 2020
print("\n\nRevenue facts for FY periods:")
for fact in raw_facts:
    if fact.statement_type == 'IncomeStatement' and fact.fiscal_period == 'FY':
        if fact.fiscal_year in [2019, 2020] and 'Revenue' in str(fact.concept):
            print(f"  {fact.fiscal_year} {fact.fiscal_period}: {fact.concept} = ${fact.value:,}")


@@ -0,0 +1,37 @@
from edgar import Company
from collections import defaultdict

# Get Apple facts
aapl = Company("AAPL")
facts = aapl.facts
raw_facts = facts._facts

# Check how period_info is built
stmt_facts = [f for f in raw_facts if f.statement_type == 'IncomeStatement']

# Track all unique combinations
all_combos = set()
period_end_by_key = defaultdict(set)
for fact in stmt_facts:
    if fact.fiscal_period == 'FY' and fact.fiscal_year and fact.fiscal_year >= 2019:
        period_key = (fact.fiscal_year, fact.fiscal_period)
        all_combos.add((fact.fiscal_year, fact.fiscal_period, fact.period_end))
        period_end_by_key[period_key].add(fact.period_end)

print("Period keys and their different period_end dates:")
for key in sorted(period_end_by_key.keys(), reverse=True):
    year, period = key
    if year >= 2019 and year <= 2024:
        ends = period_end_by_key[key]
        print(f"\n({year}, '{period}'): {len(ends)} different period_ends")
        for end in sorted(ends):
            match = "✓" if end and end.year == year else "✗"
            print(f"  {end} {match}")

# The problem: the period_info dict only keeps ONE entry per key
print("\n\nProblem: The current code builds period_info as a dict,")
print("so it only keeps ONE fact per (fiscal_year, fiscal_period) key!")
print("We lose all the other period_end variations when we do:")
print("    if period_key not in period_info:")
print("        period_info[period_key] = {...}  # Only the first one is kept!")


@@ -0,0 +1,83 @@
from edgar import Company
from edgar.entity.enhanced_statement import EnhancedStatementBuilder
from collections import defaultdict

# Get Apple facts
aapl = Company("AAPL")
facts = aapl.facts
raw_facts = facts._facts

# Build statement manually to debug
builder = EnhancedStatementBuilder()
stmt_facts = [f for f in raw_facts if f.statement_type == 'IncomeStatement']

# Build period info with the new key structure
period_info = {}
period_facts = defaultdict(list)
for fact in stmt_facts:
    period_key = (fact.fiscal_year, fact.fiscal_period, fact.period_end)
    if period_key not in period_info:
        period_info[period_key] = {
            'label': f"{fact.fiscal_period} {fact.fiscal_year}",
            'end_date': fact.period_end,
            'is_annual': fact.fiscal_period == 'FY',
            'filing_date': fact.filing_date,
            'fiscal_year': fact.fiscal_year,
            'fiscal_period': fact.fiscal_period
        }
    period_facts[period_key].append(fact)

# Apply the annual filtering logic
period_list = [(pk, info) for pk, info in period_info.items()]
true_annual_periods = []
for pk, info in period_list:
    if not info['is_annual']:
        continue
    fiscal_year = pk[0]
    period_end_date = pk[2]
    # Check if fiscal_year matches period_end.year
    if not (period_end_date and period_end_date.year == fiscal_year):
        continue
    # Check duration
    period_fact_list = period_facts.get(pk, [])
    if period_fact_list:
        sample_fact = period_fact_list[0]
        if sample_fact.period_start and sample_fact.period_end:
            duration = (sample_fact.period_end - sample_fact.period_start).days
            if duration > 300:
                true_annual_periods.append((pk, info))
                # Find revenue for this period
                for fact in period_fact_list:
                    if 'RevenueFromContract' in str(fact.concept) and 'Liability' not in str(fact.concept):
                        print(f"Selected: FY {fiscal_year} ends {period_end_date}: ${fact.value:,.0f} (duration: {duration} days)")
                        break

print(f"\nTotal true annual periods found: {len(true_annual_periods)}")

# Check what's in the final selection: keep the latest period_end per fiscal year
annual_by_year = {}
for pk, info in true_annual_periods:
    fiscal_year = pk[0]
    period_end_date = pk[2]
    if fiscal_year not in annual_by_year or period_end_date > annual_by_year[fiscal_year][0][2]:
        annual_by_year[fiscal_year] = (pk, info)

sorted_periods = sorted(annual_by_year.items(), key=lambda x: x[0], reverse=True)
selected = [pair for year, pair in sorted_periods[:6]]

print(f"\nFinal selected periods:")
for (year, period, end), info in selected:
    print(f"  FY {year} ends {end}")
    # Find revenue for this period
    for fact in period_facts[(year, period, end)]:
        if 'RevenueFromContract' in str(fact.concept) and 'Liability' not in str(fact.concept):
            duration = (fact.period_end - fact.period_start).days if fact.period_start else None
            print(f"    Revenue: ${fact.value:,.0f} (duration: {duration} days)")
            break


@@ -0,0 +1,33 @@
from edgar import Company

# Get Apple facts and display the income statement
aapl = Company("AAPL")
facts = aapl.facts

print("Testing with annual=True, periods=6:")
income = facts.income_statement(annual=True, periods=6)

# Get the internal data
items = income.items

# Find the Total Revenue item
for item in items:
    if "Revenue" in item.label and "Total" in item.label:
        print(f"\n{item.label}:")
        print(f"  Values: {item.values}")
        print(f"  Periods: {income.periods}")
        # Show what values we have
        for period, value in zip(income.periods, item.values):
            if value:
                print(f"  {period}: {value}")

# Let's also check what raw facts we have
print("\n\nChecking raw facts for FY 2019 and FY 2020:")
raw_facts = facts._facts
for fact in raw_facts:
    if fact.statement_type == 'IncomeStatement' and fact.fiscal_period == 'FY':
        if fact.fiscal_year in [2019, 2020]:
            if 'RevenueFromContract' in str(fact.concept) and 'Liability' not in str(fact.concept):
                match = "✓" if fact.period_end and fact.period_end.year == fact.fiscal_year else "✗"
                print(f"  FY {fact.fiscal_year} ends {fact.period_end}: ${fact.value:,.0f} {match}")


@@ -0,0 +1,71 @@
from edgar import Company
from edgar.entity.enhanced_statement import EnhancedStatementBuilder
from collections import defaultdict

# Get Apple facts
aapl = Company("AAPL")
facts = aapl.facts
raw_facts = facts._facts

# Build statement manually to debug
builder = EnhancedStatementBuilder()
stmt_facts = [f for f in raw_facts if f.statement_type == 'IncomeStatement']

# Build period info
period_info = {}
period_facts_map = defaultdict(list)
for fact in stmt_facts:
    period_key = (fact.fiscal_year, fact.fiscal_period)
    period_label = f"{fact.fiscal_period} {fact.fiscal_year}"
    period_facts_map[period_label].append(fact)
    if period_key not in period_info:
        period_info[period_key] = {
            'label': period_label,
            'end_date': fact.period_end,
            'is_annual': fact.fiscal_period == 'FY',
            'filing_date': fact.filing_date,
            'fiscal_year': fact.fiscal_year,
            'fiscal_period': fact.fiscal_period
        }

# Create list of periods
period_list = [(pk, info) for pk, info in period_info.items()]

# Filter for annual
annual_periods = [(pk, info) for pk, info in period_list if info['is_annual']]
print(f"Total annual periods before sort: {len(annual_periods)}")

# Sort by end_date
annual_periods.sort(key=lambda x: x[1]['end_date'], reverse=True)

print("\nFirst 10 annual periods after sorting by end_date:")
for i, ((year, period), info) in enumerate(annual_periods[:10]):
    print(f"  {i}: FY {year} - ends {info['end_date']}")

# Deduplicate by fiscal year
seen_years = set()
unique_annual_periods = []
for pk, info in annual_periods:
    fiscal_year = pk[0]
    if fiscal_year not in seen_years:
        seen_years.add(fiscal_year)
        unique_annual_periods.append((pk, info))
        print(f"  Keeping: FY {fiscal_year} ending {info['end_date']}")

print(f"\nUnique annual periods: {len(unique_annual_periods)}")
print("\nFirst 6 unique periods:")
for (year, period), info in unique_annual_periods[:6]:
    print(f"  FY {year} - ends {info['end_date']}")

# Check what revenue value we have for those periods
print("\nRevenue values for selected periods:")
for (year, fp), info in unique_annual_periods[:6]:
    period_label = info['label']
    # Find revenue fact for this period
    for fact in period_facts_map[period_label]:
        if 'RevenueFromContract' in str(fact.concept) and 'Liability' not in str(fact.concept):
            print(f"  {period_label}: {fact.concept} = ${fact.value:,}")
            break


@@ -0,0 +1,71 @@
from edgar import Company
from edgar.entity.enhanced_statement import EnhancedStatementBuilder
from collections import defaultdict

# Get Apple facts
aapl = Company("AAPL")
facts = aapl.facts
raw_facts = facts._facts

# Build statement manually to debug
builder = EnhancedStatementBuilder()
stmt_facts = [f for f in raw_facts if f.statement_type == 'IncomeStatement']

# Build period info
period_info = {}
period_facts_map = defaultdict(list)
for fact in stmt_facts:
    period_key = (fact.fiscal_year, fact.fiscal_period)
    period_label = f"{fact.fiscal_period} {fact.fiscal_year}"
    period_facts_map[period_label].append(fact)
    if period_key not in period_info:
        period_info[period_key] = {
            'label': period_label,
            'end_date': fact.period_end,
            'is_annual': fact.fiscal_period == 'FY',
            'filing_date': fact.filing_date,
            'fiscal_year': fact.fiscal_year,
            'fiscal_period': fact.fiscal_period
        }

# Apply the fix logic
period_list = [(pk, info) for pk, info in period_info.items()]
annual_periods = [(pk, info) for pk, info in period_list if info['is_annual']]
print(f"Total annual periods: {len(annual_periods)}")

# Apply the matching logic
correct_annual_periods = {}
for pk, info in annual_periods:
    fiscal_year = pk[0]
    if info['end_date'] and info['end_date'].year == fiscal_year:
        if fiscal_year not in correct_annual_periods or \
           info['end_date'] > correct_annual_periods[fiscal_year][1]['end_date']:
            correct_annual_periods[fiscal_year] = (pk, info)
            print(f"  Selected FY {fiscal_year}: ends {info['end_date']}")

print(f"\nCorrect annual periods found: {len(correct_annual_periods)}")

# Sort and select
sorted_periods = sorted(correct_annual_periods.items(), key=lambda x: x[0], reverse=True)
selected_period_info = [pair for year, pair in sorted_periods[:6]]

print(f"\nSelected {len(selected_period_info)} periods:")
for (year, period), info in selected_period_info:
    print(f"  {info['label']}")

# Check what revenue facts we have for these periods
print("\nRevenue facts for selected periods:")
for (year, fp), info in selected_period_info:
    period_label = info['label']
    revenue_found = False
    for fact in period_facts_map[period_label]:
        if 'RevenueFromContract' in str(fact.concept) and 'Liability' not in str(fact.concept):
            print(f"  {period_label}: ${fact.value:,.0f}")
            revenue_found = True
            break
    if not revenue_found:
        print(f"  {period_label}: No revenue found")


@@ -0,0 +1,262 @@
#!/usr/bin/env python3
"""
Debug script to investigate table parsing/rendering issues in MSFT 10-K.
Focus on the "Weighted average outstanding shares of common stock (B)" table.
"""
import sys
sys.path.insert(0, '/Users/dwight/PycharmProjects/edgartools')

from edgar.documents.parser import HTMLParser
from edgar.documents.config import ParserConfig
from edgar.documents.table_nodes import TableNode
from bs4 import BeautifulSoup


def find_table_in_html():
    """Find and examine the table HTML structure around the target text."""
    print("🔍 EXAMINING TABLE HTML STRUCTURE")
    print("=" * 50)
    try:
        # Read the MSFT file
        with open('/Users/dwight/PycharmProjects/edgartools/data/html/MSFT.10-K.html', 'r') as f:
            html_content = f.read()
        print(f"File size: {len(html_content)} characters")

        # Find the table containing our target text
        soup = BeautifulSoup(html_content, 'html.parser')

        # Search for the specific text (string= replaces the deprecated text= argument)
        target_elements = soup.find_all(string=lambda text: text and "Weighted average outstanding shares of common stock" in text)
        print(f"\nFound {len(target_elements)} elements with target text")

        for i, element in enumerate(target_elements):
            print(f"\n📍 Element {i+1}:")
            print(f"  Text: {element.strip()[:80]}...")

            # Find the containing table
            parent = element.parent
            while parent and parent.name != 'table':
                parent = parent.parent

            if parent and parent.name == 'table':
                print("  Found containing table!")
                # Analyze the table structure
                rows = parent.find_all('tr')
                print(f"  Table has {len(rows)} rows")
                # Look at first few rows
                for j, row in enumerate(rows[:5]):
                    cells = row.find_all(['td', 'th'])
                    print(f"    Row {j+1}: {len(cells)} cells")
                    for k, cell in enumerate(cells[:3]):  # First 3 cells
                        cell_text = cell.get_text().strip()[:30].replace('\n', ' ')
                        print(f"      Cell {k+1}: '{cell_text}...'")
                return parent
            else:
                print("  No containing table found")
        return None
    except Exception as e:
        print(f"❌ Error examining HTML: {e}")
        import traceback
        traceback.print_exc()
        return None


def test_parser_on_msft():
    """Test the document parser on the MSFT file."""
    print("\n🚀 TESTING DOCUMENT PARSER")
    print("=" * 50)
    try:
        # Read the MSFT file
        with open('/Users/dwight/PycharmProjects/edgartools/data/html/MSFT.10-K.html', 'r') as f:
            html_content = f.read()

        # Parse with different configurations
        configs_to_test = [
            ("Default", ParserConfig()),
            ("Performance", ParserConfig.for_performance()),
            ("Accuracy", ParserConfig.for_accuracy()),
        ]
        for config_name, config in configs_to_test:
            print(f"\n🧪 Testing with {config_name} config...")
            parser = HTMLParser(config)
            document = parser.parse(html_content)
            print("  Document parsed successfully")
            print(f"  Root children: {len(document.root.children)}")

            # Find tables with our target text
            matching_tables = []

            def find_target_tables(node):
                if isinstance(node, TableNode):
                    table_text = node.text()
                    if "Weighted average outstanding shares of common stock" in table_text:
                        matching_tables.append(node)
                for child in node.children:
                    find_target_tables(child)

            find_target_tables(document.root)
            print(f"  Found {len(matching_tables)} table(s) with target text")

            for i, table in enumerate(matching_tables):
                print(f"\n  📋 Table {i+1}:")
                print(f"    Headers: {len(table.headers)} row(s)")
                print(f"    Data rows: {len(table.rows)}")
                print(f"    Table type: {table.table_type}")

                # Show table structure
                if table.headers:
                    print("    Header structure:")
                    for j, header_row in enumerate(table.headers):
                        print(f"      Row {j+1}: {len(header_row)} cells")
                        for k, cell in enumerate(header_row[:3]):
                            cell_text = cell.text().strip()[:20].replace('\n', ' ')
                            print(f"        Cell {k+1}: '{cell_text}...'")

                print("    First few data rows:")
                for j, row in enumerate(table.rows[:3]):
                    print(f"      Row {j+1}: {len(row.cells)} cells")
                    for k, cell in enumerate(row.cells[:3]):
                        cell_text = cell.text().strip()[:20].replace('\n', ' ')
                        print(f"        Cell {k+1}: '{cell_text}...'")

                # Get the text output
                table_text = table.text()
                print(f"\n    Text output ({len(table_text)} chars):")
                print("    " + "-" * 40)
                # Show first few lines
                lines = table_text.split('\n')
                for line_num, line in enumerate(lines[:10]):
                    print(f"    {line_num+1:2d}: {line}")
                if len(lines) > 10:
                    print(f"    ... ({len(lines)-10} more lines)")
                print("    " + "-" * 40)

                # Check for issues
                issues = []
                if len(table_text.strip()) == 0:
                    issues.append("Empty text output")
                if "Weighted average outstanding shares" not in table_text:
                    issues.append("Missing target text in output")
                if table_text.count('|') < 5:  # Should have multiple columns
                    issues.append("Possibly missing column separators")
                if len(lines) < 3:
                    issues.append("Very few output lines")

                if issues:
                    print(f"    ⚠️ Issues detected: {', '.join(issues)}")
                    return table  # Return problematic table for further analysis
                else:
                    print("    ✅ Table appears to render correctly")
        return None
    except Exception as e:
        print(f"❌ Parser test failed: {e}")
        import traceback
        traceback.print_exc()
        return None


def analyze_table_structure(table):
    """Deep analysis of a problematic table."""
    print("\n🔬 DEEP TABLE ANALYSIS")
    print("=" * 50)
    if not table:
        print("No table to analyze")
        return

    print(f"Table type: {table.table_type}")
    print(f"Caption: {table.caption}")
    print(f"Summary: {table.summary}")

    # Analyze headers
    print(f"\n📋 HEADERS ({len(table.headers)} rows):")
    for i, header_row in enumerate(table.headers):
        print(f"  Row {i+1} ({len(header_row)} cells):")
        for j, cell in enumerate(header_row):
            print(f"    Cell {j+1}: colspan={cell.colspan}, rowspan={cell.rowspan}")
            print(f"      text='{cell.text()[:40]}...'")
            print(f"      is_header={cell.is_header}")

    # Analyze data rows
    print(f"\n📊 DATA ROWS ({len(table.rows)} rows):")
    for i, row in enumerate(table.rows[:5]):  # First 5 rows
        print(f"  Row {i+1} ({len(row.cells)} cells):")
        for j, cell in enumerate(row.cells):
            print(f"    Cell {j+1}: colspan={cell.colspan}, rowspan={cell.rowspan}")
            print(f"      text='{cell.text()[:40]}...'")
            print(f"      is_numeric={cell.is_numeric}")
    if len(table.rows) > 5:
        print(f"  ... and {len(table.rows)-5} more rows")

    # Test different rendering approaches
    print(f"\n🖼️ TESTING DIFFERENT RENDERERS:")

    # Rich renderer
    try:
        rich_table = table.render(width=120)
        from edgar.richtools import rich_to_text
        rich_text = rich_to_text(rich_table)
        print(f"  Rich renderer: {len(rich_text)} chars")
        print(f"    Preview: {rich_text[:100]}...")
    except Exception as e:
        print(f"  Rich renderer failed: {e}")

    # Fast renderer
    try:
        fast_text = table._fast_text_rendering()
        print(f"  Fast renderer: {len(fast_text)} chars")
        print(f"    Preview: {fast_text[:100]}...")
    except Exception as e:
        print(f"  Fast renderer failed: {e}")

    # Compare outputs
    try:
        current_text = table.text()
        print(f"  Current text() method: {len(current_text)} chars")
        if "Weighted average outstanding shares" in current_text:
            print("    ✅ Contains target text")
        else:
            print("    ❌ Missing target text")
    except Exception as e:
        print(f"  Current text() method failed: {e}")


if __name__ == "__main__":
    print("🎯 DEBUGGING MSFT TABLE PARSING ISSUE")
    print("Target: 'Weighted average outstanding shares of common stock (B)' table")
    print()

    # Step 1: Examine HTML structure
    table_element = find_table_in_html()

    # Step 2: Test parser with different configurations
    problematic_table = test_parser_on_msft()

    # Step 3: Deep analysis if issues found
    if problematic_table:
        analyze_table_structure(problematic_table)
        print(f"\n🎯 CONCLUSION:")
        print("A problematic table was identified. Check the analysis above")
        print("for specific issues with parsing or rendering.")
    else:
        print(f"\n✅ CONCLUSION:")
        print("No obvious parsing issues were detected. The table appears to")
        print("be parsing and rendering correctly with the current parser.")
        print("If there are still issues, they may be subtle formatting problems.")


@@ -0,0 +1,159 @@
#!/usr/bin/env python3
"""
Debug why Rich table rendering is still producing poor structure even with headers detected.
"""
import sys
sys.path.insert(0, '/Users/dwight/PycharmProjects/edgartools')

from edgar.documents.parser import HTMLParser
from edgar.documents.config import ParserConfig
from edgar.documents.table_nodes import TableNode


def debug_rich_rendering_issue():
    print("🔍 DEBUGGING RICH RENDERING WITH DETECTED HEADERS")
    print("=" * 60)
    try:
        with open('/Users/dwight/PycharmProjects/edgartools/data/html/MSFT.10-K.html', 'r') as f:
            html_content = f.read()

        config = ParserConfig()
        parser = HTMLParser(config)
        document = parser.parse(html_content)

        # Find target table
        target_table = None

        def find_target(node):
            nonlocal target_table
            if isinstance(node, TableNode):
                try:
                    if "Weighted average outstanding shares" in node.text():
                        target_table = node
                        return
                except Exception:
                    pass
            if hasattr(node, 'children'):
                for child in node.children:
                    find_target(child)

        find_target(document.root)
        if not target_table:
            print("❌ Target table not found")
            return None

        print("✅ Found target table")
        print(f"Headers: {len(target_table.headers)}")
        print(f"Data rows: {len(target_table.rows)}")

        # Examine the table structure in detail
        print(f"\n🔍 DETAILED TABLE STRUCTURE ANALYSIS:")

        # Check headers
        if target_table.headers:
            for i, header_row in enumerate(target_table.headers):
                print(f"\nHeader row {i+1}: {len(header_row)} cells")
                for j, cell in enumerate(header_row[:8]):  # First 8 cells
                    print(f"  Cell {j+1}: '{cell.text()}' (colspan={cell.colspan}, rowspan={cell.rowspan})")

        # Check data row structure
        print(f"\n📊 DATA ROW ANALYSIS:")
        for i, row in enumerate(target_table.rows[:5]):  # First 5 data rows
            content_cells = [j for j, cell in enumerate(row.cells) if cell.text().strip()]
            print(f"Row {i+1}: {len(row.cells)} total cells, content in positions {content_cells}")
            # Show first few cells with content
            for j in content_cells[:3]:
                if j < len(row.cells):
                    cell = row.cells[j]
                    print(f"  Cell {j+1}: '{cell.text()[:30]}...' (align={cell.align})")

        # Check table dimensions
        max_cols = max(len(row.cells) for row in target_table.rows) if target_table.rows else 0
        header_cols = len(target_table.headers[0]) if target_table.headers else 0
        print(f"\n📏 TABLE DIMENSIONS:")
        print(f"  Header columns: {header_cols}")
        print(f"  Max data columns: {max_cols}")
        print(f"  Dimension mismatch: {'YES' if header_cols != max_cols else 'NO'}")

        # Count empty vs content cells
        total_cells = sum(len(row.cells) for row in target_table.rows)
        empty_cells = sum(1 for row in target_table.rows for cell in row.cells if not cell.text().strip())
        print(f"  Total data cells: {total_cells}")
        print(f"  Empty data cells: {empty_cells} ({empty_cells/total_cells*100:.1f}%)")

        # Test Rich table creation manually
        print(f"\n🎨 TESTING RICH TABLE CREATION:")
        try:
            rich_table = target_table.render(width=120)
            print("✅ Rich table created successfully")
            print(f"Rich table type: {type(rich_table)}")
            # Check Rich table properties
            if hasattr(rich_table, 'columns'):
                print(f"Rich columns: {len(rich_table.columns)}")
            if hasattr(rich_table, 'rows'):
                print(f"Rich rows: {len(rich_table.rows)}")
        except Exception as e:
            print(f"❌ Rich table creation failed: {e}")
            import traceback
            traceback.print_exc()
            return None

        # Test text conversion
        print(f"\n📝 TESTING TEXT CONVERSION:")
        try:
            from edgar.richtools import rich_to_text
            rich_text = rich_to_text(rich_table)
            lines = rich_text.split('\n')
            print(f"Text output: {len(lines)} lines, {len(rich_text)} chars")

            # Analyze line types
            empty_lines = sum(1 for line in lines if not line.strip())
            border_lines = sum(1 for line in lines if any(c in line for c in '┌┐└┘├┤│─'))
            content_lines = sum(1 for line in lines if line.strip() and not all(c in '┌┐└┘├┤│─ ' for c in line))
            print(f"  Empty lines: {empty_lines}")
            print(f"  Border lines: {border_lines}")
            print(f"  Content lines: {content_lines}")

            # Show actual structure
            print(f"\nFirst 10 lines of output:")
            for i, line in enumerate(lines[:10]):
                line_type = "EMPTY" if not line.strip() else "BORDER" if any(c in line for c in '┌┐└┘├┤│─') else "CONTENT"
                print(f"  {i+1:2d} [{line_type:7}]: {line[:60]}{'...' if len(line) > 60 else ''}")

            # The problem might be that Rich is creating a table but with poor formatting.
            # Let's see if we can identify the issue.
            if border_lines < 3:
                print(f"\n❌ DIAGNOSIS: Very few border lines - Rich table structure is poor")
                print("This suggests the table has structural issues that prevent proper rendering.")
                print("Possible causes:")
                print("1. Column count mismatch between headers and data")
                print("2. Too many empty cells causing poor layout")
                print("3. Cell spanning issues")
                print("4. Table too wide for rendering width")
            else:
                print(f"\n✅ Rich table structure appears normal")
        except Exception as e:
            print(f"❌ Text conversion failed: {e}")
            return None

        return target_table
    except Exception as e:
        print(f"❌ Error: {e}")
        import traceback
        traceback.print_exc()
        return None


if __name__ == "__main__":
    debug_rich_rendering_issue()
    print(f"\n🎯 NEXT STEPS:")
    print("Based on the analysis above, we can identify specific issues preventing")
    print("proper Rich table rendering and address them systematically.")


@@ -0,0 +1,61 @@
from edgar import Company
from edgar.entity.enhanced_statement import EnhancedStatementBuilder
from collections import defaultdict

# Get Apple facts
aapl = Company("AAPL")
facts = aapl.facts

# Build the income statement
builder = EnhancedStatementBuilder()
stmt = builder.build_multi_period_statement(
    facts=facts._facts,
    statement_type='IncomeStatement',
    periods=6,
    annual=True
)
print(f"Selected periods: {stmt.periods}")
print("\nChecking Revenue item values:")

# Find the revenue item
for item in stmt.items:
    if item.label and 'Revenue' in item.label and 'Total' in item.label:
        print(f"\n{item.label}:")
        for period, value in zip(stmt.periods, item.values):
            print(f"  {period}: {value}")
        # Check what concept this maps to
        if hasattr(item, 'concept'):
            print(f"  Concept: {item.concept}")

# Now let's check what facts are in period_facts_by_label
print("\n\nChecking what facts are in the FY 2020 period:")

# Recreate what the builder does
raw_facts = facts._facts
stmt_facts = [f for f in raw_facts if f.statement_type == 'IncomeStatement']

# Build period_facts with the new key structure
period_facts = defaultdict(list)
for fact in stmt_facts:
    period_key = (fact.fiscal_year, fact.fiscal_period, fact.period_end)
    period_facts[period_key].append(fact)

# Look for FY 2020 periods
for key in period_facts.keys():
    if key[0] == 2020 and key[1] == 'FY':
        if key[2] and key[2].year == 2020:  # Correct match
            print(f"\nKey: {key}")
            # Check revenue facts in this period
            for fact in period_facts[key]:
                if 'RevenueFromContract' in str(fact.concept) and 'Liability' not in str(fact.concept):
                    duration = None
                    if fact.period_start:
                        duration = (fact.period_end - fact.period_start).days
                    print(f"  Revenue: ${fact.value:,.0f} (duration: {duration})")

# The issue might be in how period_facts_by_label is built
print("\n\nChecking period_facts_by_label mapping:")
# This is what happens in the builder after selection: it remaps from
# period_key to label, but multiple keys can have the same label!
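
# --- Hedged sketch (not in the original script): if two period keys share the
# label "FY 2020", the label remap silently merges them. One assumed fix is to
# fold period_end into the label; `unique_label` is a hypothetical helper.
def unique_label(fiscal_period, fiscal_year, period_end):
    """Build a label that stays unique when one fiscal year has several period_ends."""
    return f"{fiscal_period} {fiscal_year} (ended {period_end})"

# e.g. unique_label('FY', 2020, date(2020, 9, 26)) -> 'FY 2020 (ended 2020-09-26)'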


@@ -0,0 +1,190 @@
#!/usr/bin/env python3
"""
Debug the table structure to understand why we're getting so many empty columns.
"""
import sys
sys.path.insert(0, '/Users/dwight/PycharmProjects/edgartools')

from edgar.documents.parser import HTMLParser
from edgar.documents.config import ParserConfig
from edgar.documents.table_nodes import TableNode


def analyze_table_structure():
    print("🔍 ANALYZING TABLE STRUCTURE")
    print("=" * 50)
    try:
        with open('/Users/dwight/PycharmProjects/edgartools/data/html/MSFT.10-K.html', 'r') as f:
            html_content = f.read()

        config = ParserConfig(fast_table_rendering=True)
        parser = HTMLParser(config)
        document = parser.parse(html_content)

        # Find target table
        target_table = None

        def find_target(node):
            nonlocal target_table
            if isinstance(node, TableNode):
                try:
                    if "Weighted average outstanding shares" in node.text():
                        target_table = node
                        return
                except Exception:
                    pass
            if hasattr(node, 'children'):
                for child in node.children:
                    find_target(child)

        find_target(document.root)
        if not target_table:
            print("❌ Target table not found")
            return None

        print("✅ Found target table")

        # Analyze the structure
        print(f"\nTable structure:")
        print(f"  Headers: {len(target_table.headers)} rows")
        print(f"  Data rows: {len(target_table.rows)}")

        # Analyze header structure
        print(f"\n📋 HEADER ANALYSIS:")
        for i, header_row in enumerate(target_table.headers):
            print(f"  Header row {i+1}: {len(header_row)} cells")
            for j, cell in enumerate(header_row[:10]):  # First 10 cells
                text = cell.text().strip()
                display_text = text[:20] if text else "[EMPTY]"
                print(f"    Cell {j+1}: '{display_text}' (colspan={cell.colspan})")

        # Analyze data rows
        print(f"\n📊 DATA ROW ANALYSIS:")
        for i, row in enumerate(target_table.rows[:5]):  # First 5 rows
            print(f"  Row {i+1}: {len(row.cells)} cells")
            for j, cell in enumerate(row.cells[:10]):  # First 10 cells
                text = cell.text().strip()
                display_text = text[:20] if text else "[EMPTY]"
                print(f"    Cell {j+1}: '{display_text}' (colspan={cell.colspan})")

        # Count empty vs filled cells
        total_cells = 0
        empty_cells = 0
        for header_row in target_table.headers:
            for cell in header_row:
                total_cells += 1
                if not cell.text().strip():
                    empty_cells += 1
        for row in target_table.rows:
            for cell in row.cells:
                total_cells += 1
                if not cell.text().strip():
                    empty_cells += 1

        print(f"\n📊 CELL STATISTICS:")
        print(f"  Total cells: {total_cells}")
        print(f"  Empty cells: {empty_cells}")
        print(f"  Filled cells: {total_cells - empty_cells}")
        print(f"  Empty percentage: {empty_cells/total_cells*100:.1f}%")

        # Check the maximum number of non-empty cells in any row
        max_meaningful_cols = 0
        for row in target_table.rows:
            meaningful_cols = sum(1 for c in row.cells if c.text().strip())
            max_meaningful_cols = max(max_meaningful_cols, meaningful_cols)
        print(f"  Maximum meaningful columns in any row: {max_meaningful_cols}")

        return target_table
    except Exception as e:
        print(f"❌ Error: {e}")
        import traceback
        traceback.print_exc()
        return None


def test_column_filtering():
    """Test filtering out empty columns."""
    print(f"\n🔧 TESTING COLUMN FILTERING")
    print("=" * 50)
    target_table = analyze_table_structure()
    if not target_table:
        return None

    # Analyze which columns actually have content
    if not target_table.rows:
        print("No data rows to analyze")
        return None

    max_cols = max(len(row.cells) for row in target_table.rows)
    print(f"Maximum columns: {max_cols}")

    # Check each column for meaningful content
    meaningful_columns = []
    for col_idx in range(max_cols):
        has_content = False
        # Check headers
        for header_row in target_table.headers:
            if col_idx < len(header_row) and header_row[col_idx].text().strip():
                has_content = True
                break
        # Check data rows
        if not has_content:
            for row in target_table.rows:
                if col_idx < len(row.cells) and row.cells[col_idx].text().strip():
                    has_content = True
                    break
        if has_content:
            meaningful_columns.append(col_idx)

    print(f"Meaningful columns: {meaningful_columns} ({len(meaningful_columns)} total)")

    # Test rendering with only meaningful columns
    print(f"\n📊 FILTERED TABLE PREVIEW:")
    # Show first data row with only meaningful columns
    if target_table.rows:
        first_row = target_table.rows[0]
        filtered_cells = []
        for col_idx in meaningful_columns:
            if col_idx < len(first_row.cells):
                cell_text = first_row.cells[col_idx].text().strip()
                filtered_cells.append(cell_text if cell_text else "[EMPTY]")
            else:
                filtered_cells.append("[MISSING]")
        print("First row filtered:", " | ".join(filtered_cells))

    return meaningful_columns


if __name__ == "__main__":
    print("🎯 DEBUGGING TABLE STRUCTURE ISSUE")
    print("Focus: Understanding why we get so many empty columns")
    print()
    meaningful_cols = test_column_filtering()
    if meaningful_cols:
        print(f"\n🎯 FINDINGS:")
        print("The table has many empty spacing columns.")
        print(f"Only {len(meaningful_cols)} out of many columns have actual content.")
        print("The FastTableRenderer should filter out empty columns.")
        print(f"\n🔧 SOLUTION:")
        print("Update FastTableRenderer to:")
        print("1. Identify columns with meaningful content")
        print("2. Filter out purely empty/spacing columns")
        print("3. Only render the meaningful columns")
    else:
        print("❌ Could not analyze column structure")


@@ -0,0 +1,225 @@
#!/usr/bin/env python3
"""
Debug why tables are losing their structure during parsing.
"""
import sys
sys.path.insert(0, '/Users/dwight/PycharmProjects/edgartools')

from edgar.documents.parser import HTMLParser
from edgar.documents.config import ParserConfig
from edgar.documents.table_nodes import TableNode
from bs4 import BeautifulSoup


def examine_raw_html_table():
    """Examine the raw HTML structure of the problematic table."""
    print("🔍 EXAMINING RAW HTML TABLE STRUCTURE")
    print("=" * 55)
    try:
        with open('/Users/dwight/PycharmProjects/edgartools/data/html/MSFT.10-K.html', 'r') as f:
            html_content = f.read()

        # Find the table HTML
        soup = BeautifulSoup(html_content, 'html.parser')

        # Look for the table containing our target text
        target_elements = soup.find_all(string=lambda text: text and "Weighted average outstanding shares" in text)
        if not target_elements:
            print("❌ Target text not found in HTML")
            return None

        target_element = target_elements[0]

        # Find the containing table
        table_element = target_element
        while table_element and table_element.name != 'table':
            table_element = table_element.parent

        if not table_element:
            print("❌ No containing table found")
            return None

        print("✅ Found containing HTML table")

        # Analyze the HTML table structure
        rows = table_element.find_all('tr')
        print(f"HTML table has {len(rows)} rows")

        # Look for thead/tbody structure
        thead = table_element.find('thead')
        tbody = table_element.find('tbody')
        print(f"Has <thead>: {'✓' if thead else '✗'}")
        print(f"Has <tbody>: {'✓' if tbody else '✗'}")

        # Analyze first few rows
        print(f"\nFirst few rows analysis:")
        for i, row in enumerate(rows[:10]):
            cells = row.find_all(['td', 'th'])
            cell_info = []
            for cell in cells[:5]:  # First 5 cells
                text = cell.get_text().strip()[:20]
                tag = cell.name
                colspan = cell.get('colspan', '1')
                cell_info.append(f"{tag}({colspan}):'{text}'")
            print(f"  Row {i+1}: {len(cells)} cells - {', '.join(cell_info)}")
            if len(cells) > 5:
                print(f"    ... and {len(cells)-5} more cells")

        # Check if there are any TH (header) cells
        th_cells = table_element.find_all('th')
        print(f"\nTotal <th> header cells: {len(th_cells)}")

        # Look for potential header patterns
        header_candidates = []
        for i, row in enumerate(rows[:5]):  # Check first 5 rows for headers
            cells = row.find_all(['td', 'th'])
            row_text = ' '.join(cell.get_text().strip() for cell in cells).strip()
            if any(keyword in row_text.lower() for keyword in ['year', 'ended', '2025', '2024', '2023']):
                header_candidates.append(i)
                print(f"  Potential header row {i+1}: {row_text[:80]}...")

        return table_element
    except Exception as e:
        print(f"❌ Error: {e}")
        import traceback
        traceback.print_exc()
        return None


def debug_table_parsing_pipeline():
    """Debug how the table gets processed through the parsing pipeline."""
    print(f"\n🔧 DEBUGGING TABLE PARSING PIPELINE")
    print("=" * 55)
    try:
        with open('/Users/dwight/PycharmProjects/edgartools/data/html/MSFT.10-K.html', 'r') as f:
            html_content = f.read()

        config = ParserConfig(fast_table_rendering=False)
        parser = HTMLParser(config)
        document = parser.parse(html_content)

        # Find target table
        target_table = None

        def find_target(node):
            nonlocal target_table
            if isinstance(node, TableNode):
                try:
                    if "Weighted average outstanding shares" in node.text():
                        target_table = node
                        return
                except Exception:
                    pass
            if hasattr(node, 'children'):
                for child in node.children:
                    find_target(child)

        find_target(document.root)
        if not target_table:
            print("❌ Target table not found in parsed document")
            return None

        print("✅ Found target table in parsed document")

        # Analyze how the table was parsed
        print(f"\nParsed table analysis:")
        print(f"  Table type: {target_table.table_type}")
        print(f"  Has headers: {'✓' if target_table.headers else '✗'}")
        print(f"  Header rows: {len(target_table.headers)}")
        print(f"  Data rows: {len(target_table.rows)}")
        print(f"  Caption: {target_table.caption}")

        # Check if headers were detected
        if target_table.headers:
            print(f"\n  Header structure:")
            for i, header_row in enumerate(target_table.headers):
                header_texts = [cell.text().strip()[:20] for cell in header_row]
                print(f"    Header row {i+1}: {header_texts}")
        else:
            print(f"\n  ❌ NO HEADERS DETECTED - This is likely the problem!")
            print("  The parser failed to identify header rows in the HTML table.")

            # Check if any of the first few data rows look like headers
            print(f"\n  First few data rows (might be misclassified headers):")
            for i, row in enumerate(target_table.rows[:5]):
                row_texts = [cell.text().strip()[:20] for cell in row.cells[:5]]
                print(f"    Data row {i+1}: {row_texts}")
                # Check if this row looks like a header
                row_text = ' '.join(cell.text().strip() for cell in row.cells)
                if any(keyword in row_text.lower() for keyword in ['year', 'ended', '2025', '2024', '2023', 'millions']):
                    print("      ⚠️ This looks like it should be a header row!")

        # Test manual header detection
        print(f"\n🔍 MANUAL HEADER DETECTION TEST:")
        potential_headers = []
        for i, row in enumerate(target_table.rows[:5]):
            row_text = ' '.join(cell.text().strip() for cell in row.cells).strip()
            # Score this row as a potential header
            header_score = 0

            # Check for typical header keywords
            header_keywords = ['millions', 'year ended', 'june 30', '2025', '2024', '2023']
            for keyword in header_keywords:
                if keyword in row_text.lower():
                    header_score += 1

            # Check for mostly empty cells (common in header spacing rows)
            empty_cells = sum(1 for cell in row.cells if not cell.text().strip())
            if empty_cells / len(row.cells) > 0.7:  # More than 70% empty
                header_score -= 1

            # Check for meaningful content vs pure spacing
            meaningful_cells = sum(1 for cell in row.cells if len(cell.text().strip()) > 2)
            if meaningful_cells >= 2:  # At least 2 cells with meaningful content
                header_score += 1

            potential_headers.append((i, row, header_score, row_text))
            print(f"  Row {i+1}: score={header_score}, text='{row_text[:60]}...'")

        # Find the best header candidate
        best_header = max(potential_headers, key=lambda x: x[2])
        if best_header[2] > 0:
            print(f"\n  ✅ Best header candidate: Row {best_header[0]+1} (score={best_header[2]})")
            print(f"  Text: {best_header[3]}")
        else:
            print(f"\n  ❌ No good header candidates found")

        return target_table
    except Exception as e:
        print(f"❌ Error: {e}")
        import traceback
        traceback.print_exc()
        return None


if __name__ == "__main__":
    print("🎯 DEBUGGING TABLE STRUCTURE PARSING")
    print("Focus: Why tables lose structure during parsing")
    print()

    # Step 1: Examine raw HTML
    html_table = examine_raw_html_table()

    # Step 2: Debug the parsing pipeline
    parsed_table = debug_table_parsing_pipeline()

    print(f"\n🎯 DIAGNOSIS:")
    if html_table and parsed_table:
        print("The table exists in HTML and is being parsed into a TableNode.")
        print("The issue is likely in header detection - the parser isn't")
        print("properly identifying which rows should be headers vs data.")
        print(f"\n🔧 SOLUTION:")
        print("1. Improve header detection logic in table parsing")
        print("2. Look for rows with year indicators (2025, 2024, 2023) as headers")
        print("3. Handle tables without explicit <th> tags better")
        print("4. Keep Rich rendering as default for beautiful output")
    else:
        print("Basic table parsing is failing - need to investigate further.")


@@ -0,0 +1,209 @@
"""
Check specific edge cases in our solution
"""
from edgar import Company
def check_instant_facts():
"""Check how we handle instant facts (balance sheet items)"""
print("\n1. INSTANT FACTS (Balance Sheet Items)")
print("-" * 50)
aapl = Company("AAPL")
facts = aapl.facts._facts
# Look for balance sheet instant facts
instant_count = 0
duration_count = 0
for fact in facts:
if fact.statement_type == 'BalanceSheet' and fact.fiscal_period == 'FY':
if fact.fiscal_year == 2023:
if fact.period_start:
duration_count += 1
else:
instant_count += 1
print(f" Balance Sheet FY 2023 facts:")
print(f" - With duration (period_start exists): {duration_count}")
print(f" - Instant (no period_start): {instant_count}")
print(f" ✓ Our solution handles instant facts correctly (no duration check)")
def check_fiscal_year_boundaries():
"""Check companies with different fiscal year ends"""
print("\n2. FISCAL YEAR BOUNDARY ISSUES")
print("-" * 50)
# Microsoft has June year-end
msft = Company("MSFT")
facts = msft.facts._facts
print(" Microsoft (June year-end):")
for fact in facts:
if fact.statement_type == 'IncomeStatement' and fact.fiscal_period == 'FY':
if fact.fiscal_year == 2023 and 'Revenue' in str(fact.concept):
if fact.period_start and fact.period_end:
duration = (fact.period_end - fact.period_start).days
if duration > 300:
print(f" FY 2023: {fact.period_start} to {fact.period_end}")
print(f" Period end year: {fact.period_end.year}")
print(f" Fiscal year: {fact.fiscal_year}")
match = "" if fact.period_end.year == fact.fiscal_year else ""
print(f" Year match: {match}")
break
# Walmart has January year-end
print("\n Walmart (January year-end):")
wmt = Company("WMT")
facts = wmt.facts._facts
for fact in facts:
if fact.statement_type == 'IncomeStatement' and fact.fiscal_period == 'FY':
if fact.fiscal_year == 2023 and 'Revenue' in str(fact.concept):
if fact.period_start and fact.period_end:
duration = (fact.period_end - fact.period_start).days
if duration > 300:
print(f" FY 2023: {fact.period_start} to {fact.period_end}")
print(f" Period end year: {fact.period_end.year}")
print(f" Fiscal year: {fact.fiscal_year}")
match = "" if fact.period_end.year == fact.fiscal_year else ""
print(f" Year match: {match}")
break
def check_duration_edge_cases():
"""Check edge cases around our 300-day threshold"""
print("\n3. DURATION EDGE CASES")
print("-" * 50)
# Collect all annual durations across companies
test_tickers = ['AAPL', 'MSFT', 'WMT', 'JNJ', 'TSLA']
all_durations = []
for ticker in test_tickers:
try:
company = Company(ticker)
facts = company.facts._facts
for fact in facts:
if fact.statement_type == 'IncomeStatement' and fact.fiscal_period == 'FY':
if fact.fiscal_year >= 2020 and 'Revenue' in str(fact.concept):
if fact.period_start and fact.period_end:
duration = (fact.period_end - fact.period_start).days
if duration > 200: # Collect all potentially annual
all_durations.append((ticker, duration))
except:
pass
# Analyze distribution
from collections import Counter
duration_counts = Counter([d for _, d in all_durations])
print(" Duration distribution for FY Revenue facts:")
for duration in sorted(set([d for _, d in all_durations])):
count = duration_counts[duration]
if duration < 300:
status = "❌ Would be filtered out"
elif duration > 400:
status = "⚠️ Unusually long"
else:
status = "✓ Accepted as annual"
print(f" {duration} days: {count} facts - {status}")
# Check if any annual facts are < 300 days
short_annuals = [d for _, d in all_durations if d >= 250 and d < 300]
if short_annuals:
print(f"\n ⚠️ WARNING: Found {len(short_annuals)} facts between 250-300 days")
print(f" These might be annual but would be filtered out")
def check_leap_year_impact():
"""Check if leap years affect our logic"""
print("\n4. LEAP YEAR IMPACT")
print("-" * 50)
# 2020 was a leap year
aapl = Company("AAPL")
facts = aapl.facts._facts
leap_year_durations = []
regular_year_durations = []
for fact in facts:
if fact.statement_type == 'IncomeStatement' and fact.fiscal_period == 'FY':
if 'Revenue' in str(fact.concept):
if fact.period_start and fact.period_end:
duration = (fact.period_end - fact.period_start).days
if duration > 300:
if fact.fiscal_year == 2020:
leap_year_durations.append(duration)
elif fact.fiscal_year in [2019, 2021]:
regular_year_durations.append(duration)
if leap_year_durations and regular_year_durations:
print(f" Leap year (2020) durations: {set(leap_year_durations)}")
print(f" Regular year durations: {set(regular_year_durations)}")
print(f" ✓ Difference is minimal, 300-day threshold handles both")
def check_amended_filings():
"""Check how amended filings affect our logic"""
print("\n5. AMENDED FILINGS")
print("-" * 50)
# Look for duplicate facts from amendments
aapl = Company("AAPL")
facts = aapl.facts._facts
# Track facts by fiscal year and duration
from collections import defaultdict
facts_by_year_duration = defaultdict(list)
for fact in facts:
if fact.statement_type == 'IncomeStatement' and fact.fiscal_period == 'FY':
if fact.fiscal_year == 2023 and 'Revenue' in str(fact.concept):
if fact.period_start and fact.period_end:
duration = (fact.period_end - fact.period_start).days
if duration > 300:
key = (fact.fiscal_year, duration, fact.period_end)
facts_by_year_duration[key].append({
'value': fact.value,
'filing_date': fact.filing_date,
'accession': fact.accession if hasattr(fact, 'accession') else None
})
# Check for duplicates
for key, facts_list in facts_by_year_duration.items():
if len(facts_list) > 1:
year, duration, end_date = key
print(f" Found {len(facts_list)} facts for FY {year} ({duration} days, ends {end_date}):")
for f in facts_list:
print(f" Value: ${f['value']:,.0f}, Filed: {f['filing_date']}")
print(" ⚠️ Multiple facts for same period - might need to pick latest filing")
# Run all checks
if __name__ == "__main__":
print("=" * 60)
print("EDGE CASE ANALYSIS FOR DURATION-BASED SOLUTION")
print("=" * 60)
check_instant_facts()
check_fiscal_year_boundaries()
check_duration_edge_cases()
check_leap_year_impact()
check_amended_filings()
print("\n" + "=" * 60)
print("SUMMARY OF FINDINGS")
print("=" * 60)
print("\n✓ STRENGTHS:")
print(" 1. 300-day threshold works well for standard annual periods (363-365 days)")
print(" 2. Instant facts (balance sheet) handled correctly")
print(" 3. Leap years don't cause issues")
print("\n⚠️ POTENTIAL ISSUES:")
print(" 1. Fiscal year boundary: Some companies' FY doesn't match calendar year")
print(" - WMT FY 2023 ends in Jan 2023 (year mismatch)")
print(" 2. Amended filings might create duplicates")
print(" 3. No handling for multi-year aggregates (>400 days)")
print("\nRECOMMENDED IMPROVEMENTS:")
print(" 1. For fiscal year matching, be more flexible:")
print(" - Allow FY to match period_end.year OR period_end.year + 1")
print(" 2. When duplicates exist, prefer latest filing_date")
print(" 3. Add upper bound check (duration < 400) to exclude multi-year")

View File

@@ -0,0 +1,170 @@
#!/usr/bin/env python3
"""
Test that the table parsing issue is actually fixed with proper config propagation.
"""
import sys
sys.path.insert(0, '/Users/dwight/PycharmProjects/edgartools')
from edgar.documents.parser import HTMLParser
from edgar.documents.config import ParserConfig
from edgar.documents.table_nodes import TableNode
def test_msft_table_with_proper_config():
"""Test MSFT table with proper config propagation."""
print("🧪 TESTING MSFT TABLE WITH PROPER CONFIG")
print("=" * 60)
try:
# Parse the document with explicit config
with open('/Users/dwight/PycharmProjects/edgartools/data/html/MSFT.10-K.html', 'r') as f:
html_content = f.read()
# Test with explicit fast rendering config
config = ParserConfig(fast_table_rendering=True)
parser = HTMLParser(config)
document = parser.parse(html_content)
print(f"Config fast_table_rendering: {config.fast_table_rendering}")
# Find the target table
target_table = None
def find_target(node):
nonlocal target_table
if isinstance(node, TableNode):
try:
if "Weighted average outstanding shares" in node.text():
target_table = node
return
except:
pass
if hasattr(node, 'children'):
for child in node.children:
find_target(child)
find_target(document.root)
if not target_table:
print("❌ Target table not found")
return False
print("✅ Found target table!")
# Ensure config is set on the table
target_table._config = config
# Test the output
table_text = target_table.text()
print(f"\nTable output ({len(table_text)} characters):")
print("-" * 40)
print(table_text)
print("-" * 40)
# Check for proper formatting
lines = table_text.split('\n')
pipe_lines = [line for line in lines if '|' in line and line.strip()]
print(f"\nFormatting analysis:")
print(f" Total lines: {len(lines)}")
print(f" Lines with pipes: {len(pipe_lines)}")
print(f" Contains target text: {'' if 'Weighted average outstanding shares' in table_text else ''}")
if len(pipe_lines) > 5 and 'Weighted average outstanding shares' in table_text:
print("✅ TABLE IS PROPERLY FORMATTED!")
return True
else:
print("❌ Table formatting issues persist")
return False
except Exception as e:
print(f"❌ Error: {e}")
import traceback
traceback.print_exc()
return False
def verify_config_propagation():
"""Verify that table nodes receive the config during parsing."""
print(f"\n🔧 VERIFYING CONFIG PROPAGATION")
print("=" * 60)
# We need to check if the HTMLParser properly sets config on table nodes
# This might require modifications to ensure config propagation
print("Checking if TableNodes receive config during parsing...")
# Create a simple test HTML
simple_html = """
<html>
<body>
<table>
<tr><td>Header 1</td><td>Header 2</td></tr>
<tr><td>Data 1</td><td>Data 2</td></tr>
</table>
</body>
</html>
"""
config = ParserConfig(fast_table_rendering=True)
parser = HTMLParser(config)
document = parser.parse(simple_html)
# Find table and check config
table_found = False
config_working = False
def check_table_config(node):
# Track results via nonlocal flags: the recursive calls below discard
# return values, so returning a bool from the TableNode branch would be lost.
nonlocal table_found, config_working
if isinstance(node, TableNode):
table_found = True
has_config = hasattr(node, '_config')
config_matches = has_config and node._config.fast_table_rendering
print(f" Table found: ✅")
print(f" Has _config attribute: {'✅' if has_config else '❌'}")
print(f" Config fast_table_rendering: {'✅' if config_matches else '❌'}")
if not has_config:
print(" 🔧 Setting config manually...")
node._config = config
test_text = node.text()
print(f" Manual config test: {'✅' if '|' in test_text else '❌'}")
print(f" Test output preview: {test_text[:50]}...")
if has_config and config_matches:
config_working = True
return
if hasattr(node, 'children'):
for child in node.children:
check_table_config(child)
check_table_config(document.root)
if not table_found:
print(" ❌ No table found in simple test")
return False
return config_working
if __name__ == "__main__":
print("🎯 FINAL TEST: MSFT TABLE PARSING FIX")
print()
# Test config propagation
config_ok = verify_config_propagation()
# Test MSFT table
table_ok = test_msft_table_with_proper_config()
print(f"\n🏁 FINAL RESULTS:")
print(f" Config propagation: {'' if config_ok else ''}")
print(f" MSFT table formatting: {'' if table_ok else ''}")
if table_ok:
print(f"\n🎉 SUCCESS!")
print("The MSFT table parsing issue has been resolved!")
print("Tables now render with proper pipe formatting.")
else:
print(f"\n🔧 NEEDS WORK:")
if not config_ok:
print("- Config propagation to TableNodes needs to be implemented")
if not table_ok:
print("- Table formatting still has issues")
print("\nRecommended fix: Ensure HTMLParser sets _config on all TableNode instances")

View File

@@ -0,0 +1,196 @@
#!/usr/bin/env python3
"""
Test the improved header detection logic.
"""
import sys
sys.path.insert(0, '/Users/dwight/PycharmProjects/edgartools')
from edgar.documents.parser import HTMLParser
from edgar.documents.config import ParserConfig
from edgar.documents.table_nodes import TableNode
def test_header_detection_improvement():
print("🔧 TESTING IMPROVED HEADER DETECTION")
print("=" * 50)
try:
with open('/Users/dwight/PycharmProjects/edgartools/data/html/MSFT.10-K.html', 'r') as f:
html_content = f.read()
# Use default config (Rich rendering)
config = ParserConfig()
parser = HTMLParser(config)
document = parser.parse(html_content)
# Find target table
target_table = None
def find_target(node):
nonlocal target_table
if isinstance(node, TableNode):
try:
if "Weighted average outstanding shares" in node.text():
target_table = node
return
except:
pass
if hasattr(node, 'children'):
for child in node.children:
find_target(child)
find_target(document.root)
if not target_table:
print("❌ Target table not found")
return False
print("✅ Found target table")
# Check the results
print(f"\nImproved parsing results:")
print(f" Headers detected: {len(target_table.headers)} rows")
print(f" Data rows: {len(target_table.rows)}")
if target_table.headers:
print(f"\n📋 DETECTED HEADERS:")
for i, header_row in enumerate(target_table.headers):
header_texts = [cell.text().strip() for cell in header_row if cell.text().strip()]
print(f" Header row {i+1}: {header_texts}")
else:
print(f"\n❌ Still no headers detected")
return False
# Test Rich rendering with proper headers
print(f"\n🎨 TESTING RICH RENDERING:")
rich_table = target_table.render(width=120)
from edgar.richtools import rich_to_text
rich_text = rich_to_text(rich_table)
# Check if Rich now produces structured output
lines = rich_text.split('\n')
structured_lines = [line for line in lines if any(c in line for c in '┌┐└┘├┤│─')]
print(f" Rich output length: {len(rich_text)} chars")
print(f" Total lines: {len(lines)}")
print(f" Structured lines: {len(structured_lines)}")
if len(structured_lines) > 5:
print(f" ✅ Rich output is now properly structured!")
# Show a sample of the structured output
print(f"\n📊 RICH TABLE SAMPLE:")
for i, line in enumerate(lines[:10]):
if line.strip():
print(f" {line}")
return True
else:
print(f" ❌ Rich output still lacks proper structure")
print(f" Sample lines:")
for i, line in enumerate(lines[:5]):
print(f" {i+1}: {line[:60]}{'...' if len(line) > 60 else ''}")
return False
except Exception as e:
print(f"❌ Error: {e}")
import traceback
traceback.print_exc()
return False
def compare_before_after():
"""Compare table quality across all tables after the fix."""
print(f"\n📊 COMPARING TABLE QUALITY ACROSS ALL TABLES")
print("=" * 50)
try:
with open('/Users/dwight/PycharmProjects/edgartools/data/html/MSFT.10-K.html', 'r') as f:
html_content = f.read()
config = ParserConfig()
parser = HTMLParser(config)
document = parser.parse(html_content)
# Collect all tables
all_tables = []
def collect_tables(node):
if isinstance(node, TableNode):
all_tables.append(node)
if hasattr(node, 'children'):
for child in node.children:
collect_tables(child)
collect_tables(document.root)
print(f"Found {len(all_tables)} total tables")
# Analyze table quality
good_tables = 0
tables_with_headers = 0
from edgar.richtools import rich_to_text
for i, table in enumerate(all_tables):
try:
# Count tables with headers
if table.headers:
tables_with_headers += 1
# Test Rich rendering quality
rich_table = table.render(width=120)
rich_text = rich_to_text(rich_table)
lines = rich_text.split('\n')
structured_lines = [line for line in lines if any(c in line for c in '┌┐└┘├┤│─')]
if len(structured_lines) > 3:
good_tables += 1
except Exception as e:
pass # Skip problematic tables
print(f"\nTable quality summary:")
print(f" Tables with headers: {tables_with_headers}/{len(all_tables)} ({tables_with_headers/len(all_tables)*100:.1f}%)")
print(f" Well-structured tables: {good_tables}/{len(all_tables)} ({good_tables/len(all_tables)*100:.1f}%)")
if tables_with_headers > 0:
print(f" ✅ Header detection is working!")
else:
print(f" ❌ Header detection still needs work")
if good_tables > 0:
print(f" ✅ Some tables now render with proper structure!")
else:
print(f" ❌ Rich rendering still needs improvement")
return tables_with_headers > 0 and good_tables > 0
except Exception as e:
print(f"❌ Error: {e}")
return False
if __name__ == "__main__":
print("🎯 TESTING IMPROVED TABLE PARSING")
print("Focus: Better header detection for Rich table rendering")
print()
# Test specific target table
target_success = test_header_detection_improvement()
# Test overall improvement
overall_success = compare_before_after()
print(f"\n🏁 FINAL RESULTS:")
print(f" Target table fixed: {'' if target_success else ''}")
print(f" Overall improvement: {'' if overall_success else ''}")
if target_success and overall_success:
print(f"\n🎉 SUCCESS!")
print("The table parsing issue has been resolved!")
print("Tables now render with beautiful Rich formatting!")
elif target_success:
print(f"\n🎯 PARTIAL SUCCESS!")
print("The target table is fixed, but more work needed on other tables.")
else:
print(f"\n🔧 MORE WORK NEEDED")
print("Header detection improvements aren't sufficient yet.")

View File

@@ -0,0 +1,194 @@
#!/usr/bin/env python3
"""
Test the improved FastTableRenderer with column filtering.
"""
import sys
sys.path.insert(0, '/Users/dwight/PycharmProjects/edgartools')
from edgar.documents.parser import HTMLParser
from edgar.documents.config import ParserConfig
from edgar.documents.table_nodes import TableNode
def test_improved_rendering():
print("🧪 TESTING IMPROVED FAST TABLE RENDERER")
print("=" * 55)
try:
with open('/Users/dwight/PycharmProjects/edgartools/data/html/MSFT.10-K.html', 'r') as f:
html_content = f.read()
config = ParserConfig(fast_table_rendering=True)
parser = HTMLParser(config)
document = parser.parse(html_content)
# Find target table
target_table = None
def find_target(node):
nonlocal target_table
if isinstance(node, TableNode):
try:
if "Weighted average outstanding shares" in node.text():
target_table = node
return
except:
pass
if hasattr(node, 'children'):
for child in node.children:
find_target(child)
find_target(document.root)
if not target_table:
print("❌ Target table not found")
return False
print("✅ Found target table")
# Clear cache to get fresh rendering
if hasattr(target_table, '_text_cache'):
target_table._text_cache = None
# Get new table text
table_text = target_table.text()
print(f"\nImproved table output ({len(table_text)} characters):")
print("-" * 60)
print(table_text)
print("-" * 60)
# Analyze the improvement
lines = [line for line in table_text.split('\n') if line.strip()]
pipe_lines = [line for line in lines if '|' in line]
if pipe_lines:
# Count columns in the first content line
first_content_line = pipe_lines[0]
column_count = first_content_line.count('|') - 1 # Subtract 1 for border
print(f"\nTable structure analysis:")
print(f" Total lines: {len(lines)}")
print(f" Lines with pipes: {len(pipe_lines)}")
print(f" Columns: {column_count}")
# Check if it looks reasonable (should be ~4 columns: Description, 2025, 2024, 2023)
if 3 <= column_count <= 6:
print(f" ✅ Column count looks reasonable ({column_count} columns)")
else:
print(f" ⚠️ Column count still seems high ({column_count} columns)")
# Check for specific improvements
improvements = []
issues = []
if "Weighted average outstanding shares" in table_text:
improvements.append("Contains target text")
else:
issues.append("Missing target text")
if "|" in table_text:
improvements.append("Has pipe separators")
else:
issues.append("No pipe separators")
# Count empty columns (sequences of | | | with only spaces between)
import re
empty_column_pattern = r'\|\s*\|\s*\|'
empty_sequences = len(re.findall(empty_column_pattern, table_text))
if empty_sequences < 5: # Much fewer than before
improvements.append("Reduced empty columns")
else:
issues.append("Still many empty columns")
if len(table_text) < 2000: # Should be more compact
improvements.append("More compact output")
else:
issues.append("Still verbose output")
print(f"\nQuality assessment:")
if improvements:
print(" ✅ Improvements:")
for improvement in improvements:
print(f" - {improvement}")
if issues:
print(" ⚠️ Remaining issues:")
for issue in issues:
print(f" - {issue}")
# Show sample of first few lines for readability
print(f"\nFirst few lines preview:")
for i, line in enumerate(pipe_lines[:5]):
print(f" {i+1}: {line}")
return len(issues) == 0
except Exception as e:
print(f"❌ Error: {e}")
import traceback
traceback.print_exc()
return False
def compare_with_rich():
"""Compare the improved fast renderer with Rich renderer."""
print(f"\n🔄 COMPARING WITH RICH RENDERER")
print("=" * 55)
try:
with open('/Users/dwight/PycharmProjects/edgartools/data/html/MSFT.10-K.html', 'r') as f:
html_content = f.read()
# Test both renderers
configs = [
("Fast Renderer", ParserConfig(fast_table_rendering=True)),
("Rich Renderer", ParserConfig(fast_table_rendering=False)),
]
for config_name, config in configs:
print(f"\n🔧 {config_name}:")
parser = HTMLParser(config)
document = parser.parse(html_content)
# Find target table
target_table = None
def find_target(node):
nonlocal target_table
if isinstance(node, TableNode):
try:
if "Weighted average outstanding shares" in node.text():
target_table = node
return
except:
pass
if hasattr(node, 'children'):
for child in node.children:
find_target(child)
find_target(document.root)
if target_table:
table_text = target_table.text()
lines = table_text.split('\n')
pipe_lines = [line for line in lines if '|' in line and line.strip()]
print(f" Length: {len(table_text)} chars")
print(f" Lines: {len(lines)}")
print(f" Pipe lines: {len(pipe_lines)}")
print(f" Contains target: {'' if 'Weighted average outstanding shares' in table_text else ''}")
print(f" First line: {lines[0][:60]}..." if lines else " No lines")
else:
print(" ❌ Table not found")
except Exception as e:
print(f"❌ Comparison failed: {e}")
if __name__ == "__main__":
success = test_improved_rendering()
compare_with_rich()
if success:
print(f"\n🎉 SUCCESS!")
print("The improved FastTableRenderer is working well!")
else:
print(f"\n🔧 NEEDS MORE WORK")
print("The renderer still needs improvements.")

View File

@@ -0,0 +1,134 @@
"""
Test our duration-based solution across different companies to identify edge cases
"""
from edgar import Company
from collections import defaultdict
import sys
def analyze_company_periods(ticker, company_name):
"""Analyze period durations for a company"""
print(f"\n{'='*60}")
print(f"Analyzing {company_name} ({ticker})")
print('='*60)
try:
company = Company(ticker)
facts = company.facts
raw_facts = facts._facts
# Find FY facts with different durations
fy_facts_by_duration = defaultdict(list)
for fact in raw_facts:
if fact.statement_type == 'IncomeStatement' and fact.fiscal_period == 'FY':
if fact.fiscal_year and fact.fiscal_year >= 2019:
# Check for revenue facts
if 'Revenue' in str(fact.concept):
duration = None
if fact.period_start and fact.period_end:
duration = (fact.period_end - fact.period_start).days
duration_bucket = "No duration"
if duration:
if duration < 100:
duration_bucket = f"Quarterly (~{duration} days)"
elif duration > 300 and duration < 400:
duration_bucket = f"Annual (~{duration} days)"
elif duration > 180 and duration < 200:
duration_bucket = f"Semi-annual (~{duration} days)"
elif duration > 700:
duration_bucket = f"Multi-year (~{duration} days)"
else:
duration_bucket = f"Other ({duration} days)"
fy_facts_by_duration[duration_bucket].append({
'year': fact.fiscal_year,
'value': fact.value,
'duration': duration,
'period_end': fact.period_end
})
# Report findings
for bucket in sorted(fy_facts_by_duration.keys()):
facts_list = fy_facts_by_duration[bucket]
print(f"\n{bucket}: {len(facts_list)} facts")
# Show a few examples
for fact in facts_list[:3]:
print(f" FY {fact['year']}: ${fact['value']:,.0f}")
return fy_facts_by_duration
except Exception as e:
print(f" Error: {e}")
return None
# Test various types of companies
test_companies = [
('AAPL', 'Apple - Tech Giant'),
('MSFT', 'Microsoft - Different fiscal year end'),
('WMT', 'Walmart - Retail with Jan year end'),
('BAC', 'Bank of America - Financial institution'),
('JNJ', 'Johnson & Johnson - Healthcare'),
('TSLA', 'Tesla - Newer company'),
('AMZN', 'Amazon - E-commerce'),
('XOM', 'Exxon - Energy sector'),
]
# Analyze each company
results = {}
for ticker, name in test_companies:
result = analyze_company_periods(ticker, name)
if result:
results[ticker] = result
# Summary of potential issues
print("\n" + "="*60)
print("POTENTIAL ISSUES WITH OUR SOLUTION")
print("="*60)
print("\n1. DURATION THRESHOLD (>300 days):")
print(" Our fix assumes annual = >300 days")
print(" Potential issues:")
# Check for edge cases around 300 days
for ticker in results:
for bucket in results[ticker]:
if "Other" in bucket or "Semi-annual" in bucket:
print(f" - {ticker} has unusual duration: {bucket}")
print("\n2. NO DURATION DATA:")
print(" Some facts might not have period_start")
for ticker in results:
if "No duration" in results[ticker]:
count = len(results[ticker]["No duration"])
print(f" - {ticker}: {count} facts without duration")
print("\n3. FISCAL YEAR VARIATIONS:")
print(" Companies have different fiscal year ends:")
fiscal_year_ends = {
'AAPL': 'September',
'MSFT': 'June',
'WMT': 'January',
'BAC': 'December',
'JNJ': 'December',
'TSLA': 'December',
'AMZN': 'December',
'XOM': 'December'
}
for ticker, month in fiscal_year_ends.items():
print(f" - {ticker}: Fiscal year ends in {month}")
print("\n4. MULTI-YEAR FACTS:")
print(" Some companies might report multi-year aggregates")
for ticker in results:
# Bucket keys look like "Multi-year (~730 days)", so match by prefix
# rather than an exact "Multi-year" key lookup.
for bucket in results[ticker]:
if bucket.startswith("Multi-year"):
count = len(results[ticker][bucket])
print(f" - {ticker}: {count} multi-year facts found")
print("\nRECOMMENDATIONS:")
print("1. The 300-day threshold works for most companies")
print("2. Consider 350-380 days as 'normal' annual range")
print("3. Handle edge cases:")
print(" - No duration: Could check fiscal_period or use other heuristics")
print(" - Multi-year: Filter out (duration > 400)")
print(" - Semi-annual: Rare but should be filtered for annual=True")

View File

@@ -0,0 +1,145 @@
#!/usr/bin/env python3
"""
Test specific header detection logic on the target table rows.
"""
import sys
sys.path.insert(0, '/Users/dwight/PycharmProjects/edgartools')
import re
from edgar.documents.parser import HTMLParser
from edgar.documents.config import ParserConfig
from edgar.documents.table_nodes import TableNode
def test_header_detection_logic():
print("🔍 TESTING SPECIFIC HEADER DETECTION LOGIC")
print("=" * 50)
try:
with open('/Users/dwight/PycharmProjects/edgartools/data/html/MSFT.10-K.html', 'r') as f:
html_content = f.read()
# Parse document
config = ParserConfig()
parser = HTMLParser(config)
document = parser.parse(html_content)
# Find target table
target_table = None
def find_target(node):
nonlocal target_table
if isinstance(node, TableNode):
try:
if "Weighted average outstanding shares" in node.text():
target_table = node
return
except:
pass
if hasattr(node, 'children'):
for child in node.children:
find_target(child)
find_target(document.root)
if not target_table:
print("❌ Target table not found")
return
print("✅ Found target table")
print(f"Current status: {len(target_table.headers)} headers, {len(target_table.rows)} data rows")
# Test our header detection logic on each of the first few rows
print(f"\n🔧 TESTING HEADER DETECTION ON FIRST 7 ROWS:")
for i, row in enumerate(target_table.rows[:7]):
print(f"\n--- ROW {i+1} ---")
# Get the row text
row_text = ' '.join(cell.text().strip() for cell in row.cells)
print(f"Row text: '{row_text}'")
# Test each part of our header detection logic
score = 0
reasons = []
# 1. Check for year patterns in the combined text
year_pattern = r'\b(19\d{2}|20\d{2})\b'
years_found = re.findall(year_pattern, row_text)
if len(years_found) >= 2:
if 'total' not in row_text.lower()[:20]:
score += 3
reasons.append(f"Multiple years found: {years_found}")
# 2. Enhanced year detection - check individual cells
year_cells = 0
date_phrases = 0
cell_contents = []
for cell in row.cells:
cell_text = cell.text().strip()
cell_contents.append(f"'{cell_text}'")
if cell_text:
# Check for individual years
if re.match(r'^\s*(19\d{2}|20\d{2})\s*$', cell_text):
year_cells += 1
# Check for date phrases
elif 'june 30' in cell_text.lower() or 'december 31' in cell_text.lower():
date_phrases += 1
print(f"Cell contents: {cell_contents[:5]}{'...' if len(cell_contents) > 5 else ''}")
print(f"Year cells: {year_cells}, Date phrases: {date_phrases}")
if year_cells >= 2 or (year_cells >= 1 and date_phrases >= 1):
if 'total' not in row_text.lower()[:20]:
score += 4
reasons.append(f"Enhanced year detection: {year_cells} year cells, {date_phrases} date phrases")
# 3. Check for financial header patterns
row_text_lower = row_text.lower()
financial_patterns = [
r'year\s+ended\s+(june|december|march|september)',
r'(three|six|nine|twelve)\s+months?\s+ended',
r'\(in\s+(millions|thousands|billions)\)',
r'fiscal\s+year\s+ended'
]
for pattern in financial_patterns:
if re.search(pattern, row_text_lower):
score += 2
reasons.append(f"Financial pattern: {pattern}")
# 4. Check for period indicators
period_keywords = ['quarter', 'q1', 'q2', 'q3', 'q4', 'month',
'january', 'february', 'march', 'april', 'may', 'june',
'july', 'august', 'september', 'october', 'november', 'december',
'ended', 'three months', 'six months', 'nine months']
matching_keywords = [kw for kw in period_keywords if kw in row_text_lower]
if matching_keywords:
score += 1
reasons.append(f"Period keywords: {matching_keywords}")
print(f"HEADER SCORE: {score}")
if reasons:
print(f"Reasons: {', '.join(reasons)}")
# Determine if this should be considered a header
should_be_header = score >= 3
print(f"SHOULD BE HEADER: {'YES' if should_be_header else 'NO'}")
if should_be_header and i == 4: # Row 5 (index 4) is our expected header
print("🎯 This matches our expected header row!")
elif should_be_header:
print("⚠️ This would be detected as a header but wasn't expected")
elif i == 4:
print("❌ This should be the header row but isn't being detected!")
return target_table
except Exception as e:
print(f"❌ Error: {e}")
import traceback
traceback.print_exc()
return None
if __name__ == "__main__":
test_header_detection_logic()

View File

@@ -0,0 +1,98 @@
"""
Verify the fiscal year pattern across companies
"""
from edgar import Company
def check_fiscal_year_pattern(ticker, name):
"""Check the relationship between fiscal_year and period_end.year"""
print(f"\n{name} ({ticker}):")
print("-" * 40)
try:
company = Company(ticker)
facts = company.facts._facts
# Collect FY facts with revenue
fy_facts = []
for fact in facts:
if fact.statement_type == 'IncomeStatement' and fact.fiscal_period == 'FY':
if fact.fiscal_year and fact.fiscal_year >= 2019 and fact.fiscal_year <= 2024:
if 'Revenue' in str(fact.concept):
if fact.period_start and fact.period_end:
duration = (fact.period_end - fact.period_start).days
if duration > 300 and duration < 400: # Annual only
fy_facts.append({
'fiscal_year': fact.fiscal_year,
'period_end': fact.period_end,
'period_end_year': fact.period_end.year,
'difference': fact.fiscal_year - fact.period_end.year
})
# Deduplicate and sort
unique_facts = {}
for f in fy_facts:
key = (f['fiscal_year'], f['period_end'])
unique_facts[key] = f
# Analyze the pattern
differences = set()
for f in unique_facts.values():
differences.add(f['difference'])
print(f" Fiscal Year vs Period End Year differences: {sorted(differences)}")
# Show examples
print("\n Examples:")
for f in sorted(unique_facts.values(), key=lambda x: x['fiscal_year'], reverse=True)[:5]:
print(f" FY {f['fiscal_year']} → ends {f['period_end']} (diff: {f['difference']} years)")
# What's the consistent pattern?
if len(differences) == 1:
diff = list(differences)[0]
print(f"\n ✓ Consistent pattern: fiscal_year = period_end.year + {diff}")
else:
print(f"\n ⚠️ Multiple patterns found: {differences}")
return differences
except Exception as e:
print(f" Error: {e}")
return set()
# Test various companies
companies = [
('AAPL', 'Apple (Sept year-end)'),
('MSFT', 'Microsoft (June year-end)'),
('WMT', 'Walmart (Jan year-end)'),
('AMZN', 'Amazon (Dec year-end)'),
('JNJ', 'J&J (Dec year-end)'),
('TSLA', 'Tesla (Dec year-end)'),
]
all_differences = set()
for ticker, name in companies:
diffs = check_fiscal_year_pattern(ticker, name)
all_differences.update(diffs)
print("\n" + "="*60)
print("CONCLUSION")
print("="*60)
if len(all_differences) == 1:
diff = list(all_differences)[0]
print(f"\n✓ ALL companies show the same pattern:")
print(f" fiscal_year = period_end.year + {diff}")
print("\nThis appears to be how the SEC Facts API structures the data!")
print("The 'fiscal_year' field indicates when the data was filed/reported,")
print("not the actual year of the fiscal period.")
else:
print(f"\n⚠️ Different companies show different patterns: {all_differences}")
print("The most common pattern seems to be a 2-year difference.")
print("\nIMPLICATION FOR OUR FIX:")
print("We should NOT require fiscal_year == period_end.year")
print("Instead, we should:")
print("1. Use duration (>300 days) as the primary filter")
print("2. Match facts where fiscal_year is within 0-3 years of period_end.year")
print("3. Deduplicate by keeping the latest period_end for each actual year")

View File

@@ -0,0 +1,99 @@
"""
Entity module for the EdgarTools library.
This module provides the Entity, Company, Fund, and related classes
for working with SEC filers.
"""
# Import for backward compatibility
from edgar.entity.constants import COMPANY_FORMS
from edgar.entity.core import (
Company,
Entity,
SecFiler,
get_company,
get_entity,
public_companies,
)
from edgar.entity.utils import has_company_filings, normalize_cik
from edgar.entity.data import Address, CompanyData, EntityData
from edgar.entity.entity_facts import (
EntityFacts,
NoCompanyFactsFound,
get_company_facts,
)
from edgar.entity.filings import EntityFiling, EntityFilings
from edgar.entity.search import CompanySearchIndex, CompanySearchResults, find_company
from edgar.entity.submissions import (
create_company_from_file,
create_entity_from_file,
create_entity_from_submissions_json,
download_entity_submissions_from_sec,
get_entity_submissions,
)
from edgar.entity.tickers import (
find_cik,
find_ticker,
get_cik_lookup_data,
get_company_tickers,
get_icon_from_ticker,
get_ticker_to_cik_lookup,
)
# Import from the funds package instead of entity.funds
from edgar.funds import FundData, FundSeries
# Aliases for backward compatibility
CompanyFiling = EntityFiling
CompanyFilings = EntityFilings
__all__ = [
# Core classes
'SecFiler',
'Entity',
'Company',
'FundSeries',
# Data classes
'EntityData',
'CompanyData',
'FundData',
'Address',
# Filing classes
'EntityFiling',
'EntityFilings',
'EntityFacts',
# Factory functions
'get_entity',
'get_company',
'public_companies',
# Search functions
'find_company',
'CompanySearchResults',
'CompanySearchIndex',
# Ticker functions
'get_icon_from_ticker',
'get_company_tickers',
'get_ticker_to_cik_lookup',
'get_cik_lookup_data',
'find_cik',
'find_ticker',
# Submission functions
'get_entity_submissions',
'download_entity_submissions_from_sec',
'create_entity_from_submissions_json',
'create_entity_from_file',
'create_company_from_file',
# Fact functions
'get_company_facts',
# Exceptions
'NoCompanyFactsFound',
# Constants and utilities
'COMPANY_FORMS',
'has_company_filings',
'normalize_cik',
# Backwards compatibility
'CompanyFiling',
'CompanyFilings',
]
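# Usage sketch (illustrative; relies only on the names exported above):
#
#     from edgar.entity import Company, find_company
#     aapl = Company("AAPL")                  # look up by ticker
#     filings = aapl.get_filings(form="10-K")
#     results = find_company("Tesla")         # company name search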

View File

@@ -0,0 +1,80 @@
"""
Constants for entity classification and form types.
This module contains constants used throughout the entity package for
determining entity types and form classifications.
"""
# Performance optimization: use set for O(1) lookups
COMPANY_FORMS = {
# Registration statements
"S-1", "S-3", "S-4", "S-8", "S-11",
# Foreign issuers registration forms
"F-1", "F-3", "F-4", "F-6", "F-7", "F-8", "F-9", "F-10", "F-80",
# Foreign form amendments and effectiveness
"F-6EF", "F-6 POS", "F-3ASR", "F-4MEF", "F-10EF", "F-3D", "F-3MEF",
# Exchange Act registration
"10-12B", "10-12G",
# Periodic reports
"10-K", "10-Q", "10-K/A", "10-Q/A",
"20-F", "40-F", # Foreign issuers
"11-K", # Employee benefit plans
# Current reports
"8-K", "6-K",
# Proxy materials
"DEF 14A", "PRE 14A", "DEFA14A", "DEFM14A",
# Other corporate filings
"424B1", "424B2", "424B3", "424B4", "424B5",
"ARS", "NT 10-K", "NT 10-Q",
"SC 13D", "SC 13G", "SC TO-I", "SC TO-T",
"SD", "PX14A6G",
# Specialized corporate filings
"N-CSR", "N-Q", "N-MFP", "N-CEN",
"X-17A-5", "17-H",
"TA-1", "TA-2",
"ATS-N",
# Corporate disclosures
"EFFECT", "FWP", "425", "CB",
"POS AM", "CORRESP", "UPLOAD"
}
# Fund-specific form types
FUND_FORMS = {
# Investment company registration
"N-1A", "N-2", "N-3", "N-4", "N-5", "N-6",
# Investment company periodic reports
"N-CSR", "N-Q", "N-CEN", "N-MFP",
# Investment adviser forms
"ADV", "ADV-E", "ADV-H", "ADV-NR", "ADV-W",
# Private fund forms
"PF", "CPO-PQR", "CTA-PR",
# Municipal advisor forms
"MA", "MA-I", "MA-NR", "MA-W",
# Investment company shareholder reports
"N-30B-2", "N-30D", "485APOS", "485BPOS",
# Variable insurance products
"N-3/A", "N-4/A", "N-6/A",
# Closed-end funds
"N-2/A", "N-5/A",
# Business development companies
"N-6F", "N-54A", "N-54C",
# Exchange-traded funds
"N-1A/A",
# Portfolio holdings
"NPORT-P", "NPORT-EX", "N-PORT", "N-PORT/A"
}
# Individual/insider forms
INDIVIDUAL_FORMS = {
# Ownership reports
"3", "4", "5", "3/A", "4/A", "5/A",
# Beneficial ownership
"SC 13D", "SC 13G", "SC 13D/A", "SC 13G/A",
# Tender offer schedules
"SC TO-I", "SC TO-C", "SC TO-T",
# Investment adviser representatives
"ADV-E", "DRS"
}
# All known form types for validation
ALL_FORM_TYPES = COMPANY_FORMS | FUND_FORMS | INDIVIDUAL_FORMS
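# A hypothetical helper showing how these sets are meant to be consumed; each
# membership test is O(1), which is why sets were chosen over lists above.
def classify_form(form: str) -> str:
    """Roughly classify a form type. The sets overlap (e.g. N-CSR, SC 13D),
    so the order of checks below is a judgment call, not a rule."""
    if form in INDIVIDUAL_FORMS:
        return "individual"
    if form in FUND_FORMS:
        return "fund"
    if form in COMPANY_FORMS:
        return "company"
    return "unknown"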

View File

@@ -0,0 +1,923 @@
"""
Core entity classes for working with SEC filings.
This module provides the main classes for interacting with SEC entities,
including companies, funds, and individuals.
"""
from abc import ABC, abstractmethod
from functools import cached_property
from typing import TYPE_CHECKING, Iterable, List, Optional, Tuple, TypeVar, Union
if TYPE_CHECKING:
import pyarrow
from edgar.entity.enhanced_statement import StructuredStatement
from edgar.entity.filings import EntityFilings
from edgar.enums import FormType, PeriodType
from rich import box
from rich.columns import Columns
from rich.console import Group
from rich.padding import Padding
from rich.panel import Panel
from rich.table import Table
from rich.text import Text
from edgar._filings import Filings
from edgar.company_reports import TenK, TenQ
from edgar.entity.data import Address, CompanyData, EntityData
from edgar.entity.entity_facts import EntityFacts, NoCompanyFactsFound, get_company_facts
from edgar.entity.tickers import get_icon_from_ticker
from edgar.financials import Financials
from edgar.formatting import datefmt, reverse_name
from edgar.reference.tickers import find_cik
from edgar.richtools import Docs, repr_rich
# Import constants and utilities from separate modules
from edgar.entity.constants import COMPANY_FORMS
from edgar.entity.utils import has_company_filings, normalize_cik
# Type variables for better type annotations
T = TypeVar('T')
__all__ = [
'SecFiler',
'Entity',
'Company',
'EntityData',
'CompanyData',
'get_entity',
'get_company',
'NoCompanyFactsFound',
'has_company_filings',
'COMPANY_FORMS',
]
class SecFiler(ABC):
"""
Abstract base class for all SEC filing entities.
This is the root of the entity hierarchy and defines the common interface
that all entity types must implement.
"""
@abstractmethod
def get_filings(self, **kwargs) -> Filings:
"""Get filings for this entity."""
pass
@abstractmethod
def get_facts(self) -> Optional[EntityFacts]:
"""Get structured facts about this entity."""
pass
@property
@abstractmethod
def cik(self) -> int:
"""Get the CIK number for this entity."""
pass
@property
@abstractmethod
def data(self) -> 'EntityData':
"""Get detailed data for this entity."""
pass
class Entity(SecFiler):
"""
Represents any entity that files with the SEC.
This is the base concrete implementation that can be used directly
or specialized for specific entity types.
"""
def __init__(self, cik_or_identifier: Union[str, int]):
# If it's a ticker, convert to CIK first
if isinstance(cik_or_identifier, str) and not cik_or_identifier.isdigit():
cik = find_cik(cik_or_identifier)
if cik is None:
self._cik = -999999999
else:
self._cik = cik
else:
self._cik = normalize_cik(cik_or_identifier)
self._data = None
@property
def cik(self) -> int:
"""Get the CIK number for this entity."""
return self._cik
@property
def name(self):
"""Get the name of the company."""
if hasattr(self.data, 'name'):
return self.data.name
return None
@cached_property
def display_name(self) -> str:
"""Reverse the name if it is a company"""
if self.is_company:
return self.name
return reverse_name(self.name)
@cached_property
def data(self) -> 'EntityData':
"""Get detailed data for this entity."""
if self._data is None:
# Import locally to avoid circular imports
from edgar.entity.submissions import get_entity_submissions
# get_entity_submissions returns the EntityData directly
entity_data = get_entity_submissions(self.cik)
if entity_data:
self._data = entity_data
self._data._not_found = False
else:
# Instead of raising an error, create a default EntityData
#log.warning(f"Could not find entity data for CIK {self.cik}, using placeholder data")
from edgar.entity.data import create_default_entity_data
self._data = create_default_entity_data(self.cik)
self._data._not_found = True
return self._data
def mailing_address(self) -> Optional[Address]:
"""Get the mailing address of the entity."""
if hasattr(self.data, 'mailing_address') and self.data.mailing_address:
return self.data.mailing_address
def business_address(self) -> Optional[Address]:
"""Get the business address of the entity."""
if hasattr(self.data, 'business_address') and self.data.business_address:
return self.data.business_address
@property
def not_found(self) -> bool:
"""
Check if the entity data was not found.
Returns:
True if the entity data could not be found, False otherwise
"""
if not hasattr(self, '_data') or self._data is None:
# We haven't loaded the data yet, so we don't know if it's not found
# Loading the data will set the not_found flag
_ = self.data
return getattr(self._data, '_not_found', False)
@property
def is_company(self) -> bool:
"""
Check if this entity is a company.
Returns:
True if the entity is a company, False otherwise
"""
return self.data.is_company
@property
def is_individual(self) -> bool:
"""
Check if this entity is an individual.
Returns:
True if the entity is an individual, False otherwise
"""
return not self.is_company
def get_filings(self,
*,
year: Union[int, List[int]] = None,
quarter: Union[int, List[int]] = None,
form: Union[str, 'FormType', List[Union[str, 'FormType']]] = None,
accession_number: Union[str, List] = None,
file_number: Union[str, List] = None,
filing_date: Union[str, Tuple[str, str]] = None,
date: Union[str, Tuple[str, str]] = None,
amendments: bool = True,
is_xbrl: bool = None,
is_inline_xbrl: bool = None,
sort_by: Union[str, List[Tuple[str, str]]] = None,
trigger_full_load: bool = True) -> 'EntityFilings':
"""
Get the entity's filings and optionally filter by multiple criteria.
This method has a special behavior for loading filings. When first called,
it only loads the most recent filings. If trigger_full_load=True, it will
automatically fetch all historical filings from the SEC (potentially making
multiple API calls) as needed.
Args:
year: The year or list of years to filter by (e.g. 2023, [2022, 2023])
quarter: The quarter or list of quarters to filter by (1-4, e.g. 4, [3, 4])
form: The form type (e.g. FormType.ANNUAL_REPORT, '10-K', or ['10-Q', '10-K'])
accession_number: The accession number that identifies a filing
file_number: The file number e.g. 001-39504
filing_date: Filter by filing date (YYYY-MM-DD or range)
date: Alias for filing_date
amendments: Whether to include amendments (default: True)
is_xbrl: Whether the filing is XBRL
is_inline_xbrl: Whether the filing is Inline XBRL
sort_by: Sort criteria
trigger_full_load: Whether to load all historical filings if not already loaded
Returns:
Filtered filings matching the criteria
"""
# Simply delegate to the EntityData implementation
# This preserves the lazy-loading behavior while keeping the API clean
return self.data.get_filings(
year=year,
quarter=quarter,
form=form,
accession_number=accession_number,
file_number=file_number,
filing_date=filing_date or date,
amendments=amendments,
is_xbrl=is_xbrl,
is_inline_xbrl=is_inline_xbrl,
sort_by=sort_by,
trigger_full_load=trigger_full_load
)
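# Example (illustrative values; any of these filters can be combined):
#
#     entity = Entity("AAPL")
#     tenks = entity.get_filings(form="10-K", year=[2022, 2023])
#     latest = tenks.latest()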
def get_facts(self, period_type: Optional[Union[str, 'PeriodType']] = None) -> Optional[EntityFacts]:
"""
Get structured facts about this entity.
Args:
period_type: Optional filter by period type. Can be PeriodType enum
or string ('annual', 'quarterly', 'monthly').
Returns:
EntityFacts object, optionally filtered by period type
"""
try:
facts = get_company_facts(self.cik)
if facts and period_type:
# Apply period type filtering to the facts
return facts.filter_by_period_type(period_type)
return facts
except NoCompanyFactsFound:
return None
def get_structured_statement(self,
statement_type: str,
fiscal_year: Optional[int] = None,
fiscal_period: Optional[str] = None,
use_canonical: bool = True,
include_missing: bool = False) -> Optional['StructuredStatement']:
"""
Get a hierarchically structured financial statement.
This method uses learned canonical structures to build complete financial
statements with proper hierarchy and relationships, filling in missing
concepts when requested.
Args:
statement_type: Type of statement ('BalanceSheet', 'IncomeStatement', 'CashFlow')
fiscal_year: Fiscal year to retrieve (defaults to latest)
fiscal_period: Fiscal period ('FY', 'Q1', 'Q2', 'Q3', 'Q4')
use_canonical: Use canonical structure for organization (recommended)
include_missing: Include placeholders for missing canonical concepts
Returns:
StructuredStatement with hierarchical organization or None if no data
Example:
>>> company = Company('AAPL')
>>> stmt = company.get_structured_statement('IncomeStatement', 2024, 'Q4')
>>> print(stmt.get_hierarchical_display())
"""
from edgar.entity.statement_builder import StatementBuilder
facts_data = self.get_facts()
if not facts_data:
return None
# Get all facts
all_facts = facts_data.get_all_facts()
if not all_facts:
return None
# Build the statement
builder = StatementBuilder(cik=str(self.cik))
structured_stmt = builder.build_statement(
facts=all_facts,
statement_type=statement_type,
fiscal_year=fiscal_year,
fiscal_period=fiscal_period,
use_canonical=use_canonical,
include_missing=include_missing
)
# Add company metadata
structured_stmt.company_name = self.name
return structured_stmt
def latest(self, form: str, n=1):
"""Get the latest filing(s) for a given form."""
return self.get_filings(form=form, trigger_full_load=False).latest(n)
def __str__(self):
if hasattr(self, 'data'):
return f"Entity({self.data.name} [{self.cik}])"
return f"Entity(CIK={self.cik})"
def __rich__(self):
return self.data.__rich__()
def __repr__(self):
return repr_rich(self.__rich__())
def __bool__(self):
"""
Allow truthiness check for entities.
Returns False if the entity doesn't exist (has a sentinel CIK value or not_found is True).
This enables code patterns like: `if company: do_something()`
"""
# Check for sentinel CIK value (-999999999) or not_found flag
return self.cik != -999999999 and not self.not_found
class Company(Entity):
"""
Represents a public company that files with the SEC.
Provides company-specific functionality like financial statements,
ticker lookup, etc.
"""
def __init__(self, cik_or_ticker: Union[str, int]):
super().__init__(cik_or_ticker)
@property
def data(self) -> 'EntityData': # We'll return the base type to simplify
"""Get detailed data for this company."""
# For simplicity, return the base EntityData
# Type checkers will still see this as a CompanyData due to the annotation
return super().data
@property
def tickers(self):
"""Get all ticker symbols for this company."""
if hasattr(self.data, 'tickers'):
return self.data.tickers
return []
def get_ticker(self) -> Optional[str]:
"""Get the primary ticker symbol for this company."""
if self.data and self.data.tickers and len(self.data.tickers) > 0:
return self.data.tickers[0]
return None
def get_exchanges(self):
"""Get all exchanges for this company."""
if hasattr(self.data, 'exchanges'):
return self.data.exchanges
return []
def get_financials(self) -> Optional[Financials]:
"""Get financial statements for this company."""
tenk_filing = self.latest_tenk
if tenk_filing is not None:
return tenk_filing.financials
return None
def get_quarterly_financials(self) -> Optional[Financials]:
"""Get quarterly financial statements for this company."""
tenq_filing = self.latest_tenq
if tenq_filing is not None:
return tenq_filing.financials
return None
@property
def fiscal_year_end(self):
"""Get the fiscal year end date for this company."""
if hasattr(self.data, 'fiscal_year_end'):
return self.data.fiscal_year_end
return None
@property
def sic(self):
"""Get the SIC code for this company."""
if hasattr(self.data, 'sic'):
return self.data.sic
return None
@property
def industry(self):
"""Get the industry description for this company."""
if hasattr(self.data, 'sic_description'):
return self.data.sic_description
return None
@property
def latest_tenk(self) -> Optional[TenK]:
"""Get the latest 10-K filing for this company."""
latest_10k = self.get_filings(form='10-K', trigger_full_load=False).latest()
if latest_10k is not None:
return latest_10k.obj()
return None
@property
def latest_tenq(self) -> Optional[TenQ]:
"""Get the latest 10-Q filing for this company."""
latest_10q = self.get_filings(form='10-Q', trigger_full_load=False).latest()
if latest_10q is not None:
return latest_10q.obj()
return None
def get_icon(self):
"""Get the icon for this company's primary ticker, if it has one."""
ticker = self.get_ticker()
if ticker:
return get_icon_from_ticker(ticker)
return None
# Enhanced financial data properties and methods
@property
def facts(self) -> Optional[EntityFacts]:
"""Get enhanced structured facts about this company."""
return self.get_facts()
@property
def docs(self):
"""Access comprehensive Company API documentation."""
return Docs(self)
@property
def public_float(self) -> Optional[float]:
"""Get the public float value for this company."""
facts = self.facts
if facts:
return facts.public_float
return None
@property
def shares_outstanding(self) -> Optional[float]:
"""Get the shares outstanding for this company."""
facts = self.facts
if facts:
return facts.shares_outstanding
return None
def income_statement(self, periods: int = 4, annual: bool = True, as_dataframe: bool = False, concise_format: bool = False):
"""
Get income statement data for this company.
Args:
periods: Number of periods to retrieve
annual: If True, prefer annual periods; if False, get quarterly
as_dataframe: If True, return DataFrame; if False, return MultiPeriodStatement
concise_format: If True, display values as $1.0B, if False display as $1,000,000,000
Returns:
MultiPeriodStatement or DataFrame with income statement data, or None if not available
"""
facts = self.facts
if facts:
try:
return facts.income_statement(periods=periods, annual=annual, as_dataframe=as_dataframe, concise_format=concise_format)
except Exception as e:
from edgar.core import log
log.debug(f"Error getting income statement for {self.name}: {e}")
return None
def balance_sheet(self, periods: int = 4, annual: bool = True, as_dataframe: bool = False, concise_format: bool = False):
"""
Get balance sheet data for this company.
Args:
periods: Number of periods to retrieve
annual: If True, prefer annual periods; if False, get quarterly
as_dataframe: If True, return DataFrame; if False, return MultiPeriodStatement
concise_format: If True, display values as $1.0B, if False display as $1,000,000,000
Returns:
MultiPeriodStatement or DataFrame with balance sheet data, or None if not available
"""
facts = self.facts
if facts:
try:
return facts.balance_sheet(periods=periods, annual=annual, as_dataframe=as_dataframe, concise_format=concise_format)
except Exception as e:
from edgar.core import log
log.debug(f"Error getting balance sheet for {self.name}: {e}")
return None
def cash_flow(self, periods: int = 4, annual: bool = True, as_dataframe: bool = False, concise_format: bool = False):
"""
Get cash flow statement data for this company.
Args:
periods: Number of periods to retrieve
annual: If True, prefer annual periods; if False, get quarterly
as_dataframe: If True, return DataFrame; if False, return MultiPeriodStatement
concise_format: If True, display values as $1.0B, if False display as $1,000,000,000
Returns:
MultiPeriodStatement or DataFrame with cash flow data, or None if not available
"""
facts = self.facts
if facts:
try:
return facts.cash_flow(periods=periods, annual=annual, as_dataframe=as_dataframe, concise_format=concise_format)
except Exception as e:
from edgar.core import log
log.debug(f"Error getting cash flow for {self.name}: {e}")
return None
def __str__(self):
ticker = self.get_ticker()
ticker_str = f" - {ticker}" if ticker else ""
if hasattr(self, 'data'):
return f"Company({self.data.name} [{self.cik}]{ticker_str})"
return f"Company(CIK={self.cik}{ticker_str})"
def __repr__(self):
# Delegate to the rich representation for consistency with the old implementation
return repr_rich(self.__rich__())
def text(self, max_tokens: int = 2000) -> str:
"""
Get AI-optimized plain text representation.
Uses Markdown-KV format (60.7% accuracy, 25% fewer tokens than JSON) optimized
for LLM consumption. For terminal display, use print(company) instead.
Research basis: improvingagents.com/blog/best-input-data-format-for-llms
Args:
max_tokens: Approximate token limit using 4 chars/token heuristic (default: 2000)
Returns:
Markdown-formatted key-value representation optimized for LLMs
Example:
>>> from edgar import Company
>>> company = Company("AAPL")
>>> text = company.text()
>>> print(text)
**Company:** Apple Inc.
**CIK:** 0000320193
**Ticker:** AAPL
**Exchange:** NASDAQ
...
"""
lines = []
# Basic identification
lines.append(f"**Company:** {self.data.name}")
lines.append(f"**CIK:** {str(self.cik).zfill(10)}")
# Ticker and exchange
ticker = self.get_ticker()
if ticker:
lines.append(f"**Ticker:** {ticker}")
if hasattr(self.data, 'exchanges') and self.data.exchanges:
exchanges_str = ", ".join(self.data.exchanges) if isinstance(self.data.exchanges, (list, tuple)) else str(self.data.exchanges)
lines.append(f"**Exchange:** {exchanges_str}")
# Industry classification
if hasattr(self.data, 'sic') and self.data.sic:
sic_desc = getattr(self.data, 'sic_description', '')
if sic_desc:
lines.append(f"**Industry:** {sic_desc} (SIC {self.data.sic})")
else:
lines.append(f"**SIC Code:** {self.data.sic}")
# Entity type
if hasattr(self.data, 'entity_type') and self.data.entity_type:
lines.append(f"**Entity Type:** {self.data.entity_type.title()}")
# Category
if hasattr(self.data, 'category') and self.data.category:
lines.append(f"**Category:** {self.data.category}")
# Fiscal year end
if hasattr(self.data, 'fiscal_year_end') and self.data.fiscal_year_end:
lines.append(f"**Fiscal Year End:** {self._format_fiscal_year_date(self.data.fiscal_year_end)}")
# Business address
if hasattr(self.data, 'business_address') and self.data.business_address:
addr = self.data.business_address
lines.append("")
lines.append("**Business Address:**")
if hasattr(addr, 'street1') and addr.street1:
lines.append(f"{addr.street1}")
if hasattr(addr, 'street2') and addr.street2:
lines.append(f"{addr.street2}")
if hasattr(addr, 'city') and hasattr(addr, 'state_or_country') and addr.city and addr.state_or_country:
zip_code = f" {addr.zip_code}" if hasattr(addr, 'zip_code') and addr.zip_code else ""
lines.append(f"{addr.city}, {addr.state_or_country}{zip_code}")
# Contact information
if hasattr(self.data, 'phone') and self.data.phone:
lines.append(f"**Phone:** {self.data.phone}")
# Mailing address (if different from business address)
if hasattr(self.data, 'mailing_address') and self.data.mailing_address:
mail_addr = self.data.mailing_address
if hasattr(self.data, 'business_address'):
# Only include if different
business_addr = self.data.business_address
if (not hasattr(business_addr, 'street1') or
mail_addr.street1 != business_addr.street1):
lines.append("")
lines.append("**Mailing Address:**")
if hasattr(mail_addr, 'street1') and mail_addr.street1:
lines.append(f"{mail_addr.street1}")
if hasattr(mail_addr, 'city') and hasattr(mail_addr, 'state_or_country'):
zip_code = f" {mail_addr.zip_code}" if hasattr(mail_addr, 'zip_code') and mail_addr.zip_code else ""
lines.append(f"{mail_addr.city}, {mail_addr.state_or_country}{zip_code}")
text = "\n".join(lines)
# Token limiting (4 chars/token heuristic)
max_chars = max_tokens * 4
if len(text) > max_chars:
text = text[:max_chars] + "\n\n[Truncated for token limit]"
return text
def __rich__(self):
"""Creates a rich representation of the company with detailed information."""
# The title of the panel
ticker = self.get_ticker()
if self.data.is_company:
entity_title = Text.assemble("🏢 ",
(self.data.name, "bold green"),
" ",
(ticker if ticker else "", "bold yellow")
)
else:
entity_title = Text.assemble("👤", (self.data.name, "bold green"))
# Primary Information Table
main_info = Table(box=box.SIMPLE_HEAVY, show_header=False, padding=(0, 1))
main_info.add_column("Row", style="") # Single column for the entire row
row_parts = []
row_parts.extend([Text("CIK", style="grey60"), Text(str(self.cik), style="bold deep_sky_blue3")])
if hasattr(self.data, 'entity_type') and self.data.entity_type:
if self.data.is_individual:
row_parts.extend([Text("Type", style="grey60"),
Text("Individual", style="bold yellow")])
else:
row_parts.extend([Text("Type", style="grey60"),
Text(self.data.entity_type.title(), style="bold yellow"),
Text(self._get_operating_type_emoticon(self.data.entity_type), style="bold yellow")])
main_info.add_row(*row_parts)
# Detailed Information Table
details = Table(box=box.SIMPLE, show_header=True, padding=(0, 1))
details.add_column("Category")
details.add_column("Industry")
details.add_column("Fiscal Year End")
details.add_row(
getattr(self.data, 'category', '-') or "-",
f"{getattr(self.data, 'sic', '')}: {getattr(self.data, 'sic_description', '')}" if hasattr(self.data, 'sic') and self.data.sic else "-",
self._format_fiscal_year_date(getattr(self.data, 'fiscal_year_end', '')) if hasattr(self.data, 'fiscal_year_end') and self.data.fiscal_year_end else "-"
)
# Combine main_info and details in a single panel
if self.data.is_company:
basic_info_renderables = [main_info, details]
else:
basic_info_renderables = [main_info]
basic_info_panel = Panel(
Group(*basic_info_renderables),
title="📋 Entity",
border_style="grey50"
)
# Trading Information
if hasattr(self.data, 'tickers') and hasattr(self.data, 'exchanges') and self.data.tickers and self.data.exchanges:
trading_info = Table(box=box.SIMPLE, show_header=True, padding=(0, 1))
trading_info.add_column("Exchange")
trading_info.add_column("Symbol", style="bold yellow")
for exchange, ticker in zip(self.data.exchanges, self.data.tickers, strict=False):
trading_info.add_row(exchange, ticker)
trading_panel = Panel(
trading_info,
title="📈 Exchanges",
border_style="grey50"
)
else:
trading_panel = Panel(
Text("No trading information available", style="grey58"),
title="📈 Trading Information",
border_style="grey50"
)
# Contact Information
contact_info = Table(box=box.SIMPLE, show_header=False, padding=(0, 1))
contact_info.add_column("Label", style="bold grey70")
contact_info.add_column("Value")
has_contact_info = any([
hasattr(self.data, 'phone') and self.data.phone,
hasattr(self.data, 'website') and self.data.website,
hasattr(self.data, 'investor_website') and self.data.investor_website
])
if hasattr(self.data, 'website') and self.data.website:
contact_info.add_row("Website", self.data.website)
if hasattr(self.data, 'investor_website') and self.data.investor_website:
contact_info.add_row("Investor Relations", self.data.investor_website)
if hasattr(self.data, 'phone') and self.data.phone:
contact_info.add_row("Phone", self.data.phone)
# Three-column layout for addresses and contact info
contact_renderables = []
if hasattr(self.data, 'business_address') and not self.data.business_address.empty:
contact_renderables.append(Panel(
Text(str(self.data.business_address)),
title="🏢 Business Address",
border_style="grey50"
))
if hasattr(self.data, 'mailing_address') and not self.data.mailing_address.empty:
contact_renderables.append(Panel(
Text(str(self.data.mailing_address)),
title="📫 Mailing Address",
border_style="grey50"
))
if has_contact_info:
contact_renderables.append(Panel(
contact_info,
title="📞 Contact Information",
border_style="grey50"
))
# Former Names Table (if any exist)
former_names_panel = None
if hasattr(self.data, 'former_names') and self.data.former_names:
former_names_table = Table(box=box.SIMPLE, show_header=False, padding=(0, 1))
former_names_table.add_column("Previous Company Names")
former_names_table.add_column("") # Empty column for better spacing
for former_name in self.data.former_names:
from_date = datefmt(former_name['from'], '%B %Y')
to_date = datefmt(former_name['to'], '%B %Y')
former_names_table.add_row(Text(former_name['name'], style="italic"), f"{from_date} to {to_date}")
former_names_panel = Panel(
former_names_table,
title="📜 Former Names",
border_style="grey50"
)
# Combine all sections using Group
if self.data.is_company:
content_renderables = [Padding("", (1, 0, 0, 0)), basic_info_panel, trading_panel]
if len(contact_renderables):
contact_and_addresses = Columns(contact_renderables, equal=True, expand=True)
content_renderables.append(contact_and_addresses)
if former_names_panel:
content_renderables.append(former_names_panel)
else:
content_renderables = [Padding("", (1, 0, 0, 0)), basic_info_panel]
if len(contact_renderables):
contact_and_addresses = Columns(contact_renderables, equal=True, expand=True)
content_renderables.append(contact_and_addresses)
content = Group(*content_renderables)
# Create the main panel
return Panel(
content,
title=entity_title,
subtitle=Text.assemble(
("SEC Entity Data", "dim"),
"",
("company.docs", "cyan dim"),
(" for usage guide", "dim")
),
border_style="grey50"
)
@staticmethod
def _get_operating_type_emoticon(entity_type: str) -> str:
"""
Generate a meaningful single-width symbol based on the SEC entity type.
All symbols are chosen to be single-width to work well with rich borders.
Args:
entity_type (str): The SEC entity type (case-insensitive)
Returns:
str: A single-width symbol representing the entity type
"""
symbols = {
"operating": "●", # Circle for active operations
"subsidiary": "→", # Arrow showing connection to parent
"inactive": "×", # Cross for inactive
"holding company": "■", # Square for solid corporate structure
"investment company": "$", # Dollar for investment focus
"investment trust": "$", # Dollar for investment focus
"shell": "□", # Empty square for shell
"development stage": "▲", # Triangle for growth/development
"financial services": "¢", # Cent sign for financial services
"reit": "⌂", # House symbol
"spv": "◆", # Diamond for special purpose
"joint venture": "∞" # Infinity for partnership
}
# Clean input: convert to lowercase and strip whitespace
cleaned_type = entity_type.lower().strip()
# Handle some common variations
if "investment" in cleaned_type:
return symbols["investment company"]
if "real estate" in cleaned_type or "reit" in cleaned_type:
return symbols["reit"]
# Return default question mark if type not found
return symbols.get(cleaned_type, "?")
@staticmethod
def _format_fiscal_year_date(date_str):
"""Format fiscal year end date in a human-readable format."""
if not date_str:
return "-"
# Dictionary of months
months = {
"01": "Jan", "02": "Feb", "03": "Mar",
"04": "Apr", "05": "May", "06": "Jun",
"07": "Jul", "08": "Aug", "09": "Sep",
"10": "Oct", "11": "Nov", "12": "Dec"
}
# Extract month and day
month = date_str[:2]
if month not in months:
return date_str
try:
day = str(int(date_str[2:])) # Remove leading zero
return f"{months[month]} {day}"
except (ValueError, IndexError):
return date_str
# Factory functions for backward compatibility
def get_entity(cik_or_identifier: Union[str, int]) -> Entity:
"""
Get any SEC filing entity by CIK or identifier.
Args:
cik_or_identifier: CIK number (as int or str) or other identifier
Returns:
Entity instance
"""
return Entity(cik_or_identifier)
def get_company(cik_or_ticker: Union[str, int]) -> Company:
"""
Get a public company by CIK or ticker.
Args:
cik_or_ticker: CIK number or ticker symbol
Returns:
Company instance
"""
return Company(cik_or_ticker)
def public_companies() -> Iterable[Company]:
"""
Iterator over all known public companies.
Returns:
Iterable of Company objects
"""
from edgar.reference.tickers import get_cik_tickers
df = get_cik_tickers()
for _, row in df.iterrows():
c = Company(row.cik)
yield c
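# Usage sketch (illustrative):
#   entity = get_entity(320193)          # by CIK
#   company = get_company("AAPL")        # by ticker or CIK
#   for company in public_companies():   # iterate all known public companies
#       ...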

View File

@@ -0,0 +1,854 @@
"""
Data classes for the Entity package.
This module contains classes for working with entity data, including
addresses, facts, and other structured data from SEC filings.
"""
import re
from functools import cached_property
from typing import Any, Dict, List, Optional, Tuple, Union
import pyarrow as pa
import pyarrow.compute as pc
from edgar.core import listify, log
from edgar.dates import InvalidDateException
from edgar.entity.filings import EntityFilings
from edgar.filtering import filter_by_date, filter_by_form, filter_by_year_quarter
from edgar.formatting import reverse_name
from edgar.storage import is_using_local_storage
# Module-level import cache for lazy imports
_IMPORT_CACHE = {}
def lazy_import(module_path):
"""
Lazily import a module or attribute and cache the result to avoid repeated imports.
Args:
module_path: String path to the module or attribute
Returns:
The imported module or attribute
"""
if module_path not in _IMPORT_CACHE:
parts = module_path.split('.')
if len(parts) == 1:
# Simple module import
_IMPORT_CACHE[module_path] = __import__(module_path)
else:
# Import from module (potentially nested)
module_name = '.'.join(parts[:-1])
attr_name = parts[-1]
module = __import__(module_name, fromlist=[attr_name])
_IMPORT_CACHE[module_path] = getattr(module, attr_name)
return _IMPORT_CACHE[module_path]
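# Usage sketch (illustrative): repeated calls are served from _IMPORT_CACHE.
#   pa_mod = lazy_import('pyarrow')            # plain module import
#   Panel = lazy_import('rich.panel.Panel')    # attribute import from a nested module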
__all__ = [
'Address',
'EntityData',
'CompanyData',
'preprocess_company',
'parse_entity_submissions',
'extract_company_filings_table',
'create_company_filings',
'create_default_entity_data'
]
def extract_company_filings_table(filings_json: Dict[str, Any]) -> pa.Table:
"""
Extract company filings from the json response.
Args:
filings_json: The JSON data containing filings
Returns:
A PyArrow Table containing the filings data
"""
# Import this here to avoid circular imports
from edgar.core import parse_acceptance_datetime
# Handle case of no data
if not filings_json.get('accessionNumber'):
# Create an empty table with the right schema
schema = pa.schema([
('accession_number', pa.string()),
('filing_date', pa.date32()),
('reportDate', pa.string()),
('acceptanceDateTime', pa.timestamp('us')),
('act', pa.string()),
('form', pa.string()),
('fileNumber', pa.string()),
('items', pa.string()),
('size', pa.string()),
('isXBRL', pa.string()),
('isInlineXBRL', pa.string()),
('primaryDocument', pa.string()),
('primaryDocDescription', pa.string())
])
return pa.Table.from_arrays([[] for _ in range(13)], schema=schema)
else:
# Convert acceptanceDateTime string to datetime
acceptance_datetimes = [
parse_acceptance_datetime(dt) for dt in filings_json['acceptanceDateTime']
]
fields = {
'accession_number': filings_json['accessionNumber'],
'filing_date': pc.cast(pc.strptime(pa.array(filings_json['filingDate']), '%Y-%m-%d', 'us'), pa.date32()),
'reportDate': filings_json['reportDate'],
'acceptanceDateTime': acceptance_datetimes,
'act': filings_json['act'],
'form': filings_json['form'],
'fileNumber': filings_json['fileNumber'],
'items': filings_json['items'],
'size': filings_json['size'],
'isXBRL': filings_json['isXBRL'],
'isInlineXBRL': filings_json['isInlineXBRL'],
'primaryDocument': filings_json['primaryDocument'],
'primaryDocDescription': filings_json['primaryDocDescription']
}
# Create table using dictionary
return pa.Table.from_arrays(
arrays=[pa.array(v) if k not in ['filing_date', 'acceptanceDateTime']
else v for k, v in fields.items()],
names=list(fields.keys())
)
def create_company_filings(filings_json: Dict[str, Any], cik: int, company_name: str) -> EntityFilings:
"""
Extract company filings from the json response.
Args:
filings_json: The JSON data containing filings
cik: The company CIK
company_name: The company name
Returns:
An EntityFilings object containing the filings
"""
recent_filings = extract_company_filings_table(filings_json['recent'])
return EntityFilings(recent_filings, cik=cik, company_name=company_name)
def parse_entity_submissions(cjson: Dict[str, Any]) -> 'CompanyData':
"""
Parse entity submissions from the SEC API.
Args:
cjson: The JSON data from the SEC submissions API
Returns:
A CompanyData object representing the entity
"""
mailing_addr = cjson['addresses']['mailing']
business_addr = cjson['addresses']['business']
cik = cjson['cik']
company_name = cjson["name"]
former_names = cjson.get('formerNames', [])
for former_name in former_names:
former_name['from'] = former_name['from'][:10] if former_name['from'] else former_name['from']
former_name['to'] = former_name['to'][:10] if former_name['to'] else former_name['to']
return CompanyData(
cik=int(cik),
name=company_name,
tickers=cjson['tickers'],
exchanges=cjson['exchanges'],
sic=cjson['sic'],
sic_description=cjson['sicDescription'],
category=cjson['category'].replace("<br>", " | ") if cjson['category'] else None,
fiscal_year_end=cjson['fiscalYearEnd'],
entity_type=cjson['entityType'],
phone=cjson['phone'],
flags=cjson['flags'],
mailing_address=Address(
street1=mailing_addr['street1'],
street2=mailing_addr['street2'],
city=mailing_addr['city'],
state_or_country_desc=mailing_addr['stateOrCountryDescription'],
state_or_country=mailing_addr['stateOrCountry'],
zipcode=mailing_addr['zipCode'],
),
business_address=Address(
street1=business_addr['street1'],
street2=business_addr['street2'],
city=business_addr['city'],
state_or_country_desc=business_addr['stateOrCountryDescription'],
state_or_country=business_addr['stateOrCountry'],
zipcode=business_addr['zipCode'],
),
filings=create_company_filings(cjson['filings'], cik=cik, company_name=company_name),
insider_transaction_for_owner_exists=bool(cjson['insiderTransactionForOwnerExists']),
insider_transaction_for_issuer_exists=bool(cjson['insiderTransactionForIssuerExists']),
ein=cjson['ein'],
description=cjson['description'],
website=cjson['website'],
investor_website=cjson['investorWebsite'],
state_of_incorporation=cjson['stateOfIncorporation'],
state_of_incorporation_description=cjson['stateOfIncorporationDescription'],
former_names=former_names,
files=cjson['filings']['files']
)
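# Usage sketch (illustrative; assumes network access to the SEC submissions API):
#   from edgar.httprequests import download_json
#   cjson = download_json("https://data.sec.gov/submissions/CIK0000320193.json")
#   company_data = parse_entity_submissions(cjson)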
class Address:
"""
Represents a physical address.
This class is optimized for memory usage and performance.
"""
__slots__ = ('street1', 'street2', 'city', 'state_or_country', 'zipcode', 'state_or_country_desc', '_str_cache')
def __init__(self,
street1: str,
street2: Optional[str],
city: str,
state_or_country: str,
zipcode: str,
state_or_country_desc: str
):
"""
Initialize an Address object.
Args:
street1: First line of street address
street2: Second line of street address (optional)
city: City name
state_or_country: State or country code
zipcode: Postal/ZIP code
state_or_country_desc: Human-readable state or country name
"""
# Store empty strings instead of None to avoid type checks later
self.street1: str = street1 or ""
self.street2: Optional[str] = street2 or ""
self.city: str = city or ""
self.state_or_country: str = state_or_country or ""
self.zipcode: str = zipcode or ""
self.state_or_country_desc: str = state_or_country_desc or ""
self._str_cache = None
@property
def empty(self) -> bool:
"""Check if the address is empty. Optimized to avoid multiple attribute checks when possible."""
# Short-circuit on common empty case
if not self.street1:
if not self.city and not self.zipcode:
return True
# Full check
return not (self.street1 or self.street2 or self.city or self.state_or_country or self.zipcode)
def __str__(self):
"""
Generate a formatted string representation of the address.
Caches result for repeated calls.
"""
if self._str_cache is not None:
return self._str_cache
if not self.street1:
self._str_cache = ""
return ""
# Build string only once and cache it
parts = []
parts.append(self.street1)
if self.street2:
parts.append(self.street2)
parts.append(f"{self.city}, {self.state_or_country_desc} {self.zipcode}")
self._str_cache = "\n".join(parts)
return self._str_cache
def __repr__(self):
"""Generate a string representation suitable for debugging."""
# Simplified representation that avoids unnecessary string operations
return f'Address(street1="{self.street1}", street2="{self.street2}", city="{self.city}", zipcode="{self.zipcode}")'
def to_json(self) -> Dict[str, str]:
"""Convert the address to a JSON-serializable dict."""
# Direct dictionary creation is faster than multiple assignments
return {
'street1': self.street1,
'street2': self.street2,
'city': self.city,
'state_or_country': self.state_or_country,
'zipcode': self.zipcode,
'state_or_country_desc': self.state_or_country_desc
}
class EntityData:
"""
Container for entity data loaded from SEC submissions API.
This class provides access to entity metadata and filings.
"""
def __init__(self,
cik: int,
name: str,
tickers: List[str],
exchanges: List[str],
sic: str,
sic_description: str,
ein: str,
entity_type: str,
fiscal_year_end: str,
filings: EntityFilings,
business_address: Address,
mailing_address: Address,
state_of_incorporation: str,
**kwargs):
"""
Initialize a new EntityData instance.
Args:
cik: The CIK number
name: The entity name
sic: The Standard Industrial Classification code
ein: The Employer Identification Number
fiscal_year_end: The fiscal year end date
tickers: List of ticker symbols
exchanges: List of exchanges
entity_type: The entity type
filings: The entity's filings
business_address: The business address
mailing_address: The mailing address
state_of_incorporation: The state of incorporation
**kwargs: Additional attributes
"""
self.cik: int = cik
self.name: str = name
self.sic = sic
self.sic_description: str = sic_description
self.ein: str = ein
self.fiscal_year_end: str = fiscal_year_end
self.tickers: List[str] = tickers
self.exchanges: List[str] = exchanges
self.filings: EntityFilings = filings
self.entity_type = entity_type
self.business_address: Address = business_address
self.mailing_address: Address = mailing_address
self.state_of_incorporation: str = state_of_incorporation
# Store all other attributes
for key, value in kwargs.items():
setattr(self, key, value)
# Initialize lazy loading flag
self._loaded_all_filings: bool = False
self._files = kwargs.get('files', [])
def _load_older_filings(self):
"""
Load older filings that were not included in the initial data.
This method implements the lazy loading behavior of filings.
When first creating an entity, only the most recent filings are loaded
to keep API response times fast. When more filings are needed, this
method will load additional filings from the SEC.
"""
# If we have no files to load, we're done
if not self._files:
return
# Import locally to avoid circular imports using the lazy import cache
download_json = lazy_import('edgar.httprequests.download_json')
# Load additional filings from the SEC
filing_tables = [self.filings.data]
for file in self._files:
submissions = download_json("https://data.sec.gov/submissions/" + file['name'])
filing_table = extract_company_filings_table(submissions)
filing_tables.append(filing_table)
# Combine all filing tables
combined_tables = pa.concat_tables(filing_tables)
# Update filings
EntityFilings = lazy_import('edgar.entity.filings.EntityFilings')
self.filings = EntityFilings(combined_tables, cik=self.cik, company_name=self.name)
def get_filings(self,
year: Union[int, List[int]] = None,
quarter: Union[int, List[int]] = None,
form: Union[str, List] = None,
accession_number: Union[str, List] = None,
file_number: Union[str, List] = None,
filing_date: Union[str, Tuple[str, str]] = None,
date: Union[str, Tuple[str, str]] = None,
amendments: bool = True,
is_xbrl: bool = None,
is_inline_xbrl: bool = None,
sort_by: Union[str, List[Tuple[str, str]]] = None,
trigger_full_load: bool = True
) -> EntityFilings:
"""
Get entity filings with lazy loading behavior.
Args:
year: Filter by year(s) (e.g. 2023, [2022, 2023])
quarter: Filter by quarter(s) (1-4, e.g. 4, [3, 4])
form: Filter by form type(s)
accession_number: Filter by accession number(s)
file_number: Filter by file number(s)
filing_date: Filter by filing date (YYYY-MM-DD or range)
date: Alias for filing_date
amendments: Whether to include amendments (default: True)
is_xbrl: Filter by XBRL status
is_inline_xbrl: Filter by inline XBRL status
sort_by: Sort criteria
trigger_full_load: Whether to load all historical filings if not already loaded
Returns:
Filtered filings
"""
# Lazy loading behavior
if not self._loaded_all_filings and not is_using_local_storage() and trigger_full_load:
self._load_older_filings()
self._loaded_all_filings = True
# Get filings data
company_filings = self.filings.data
# Filter by year/quarter first (most selective)
if year is not None:
company_filings = filter_by_year_quarter(company_filings, year, quarter)
# Filter by accession number
if accession_number:
company_filings = company_filings.filter(
pc.is_in(company_filings['accession_number'], pa.array(listify(accession_number))))
if len(company_filings) >= 1:
# We found the filing(s)
return EntityFilings(company_filings, cik=self.cik, company_name=self.name)
# Filter by form (with amendments support)
if form:
company_filings = filter_by_form(company_filings, form, amendments)
# Filter by file number
if file_number:
company_filings = company_filings.filter(
pc.is_in(company_filings['fileNumber'], pa.array(listify(file_number))))
# Filter by XBRL status
if is_xbrl is not None:
company_filings = company_filings.filter(pc.equal(company_filings['isXBRL'], int(is_xbrl)))
# Filter by inline XBRL status
if is_inline_xbrl is not None:
company_filings = company_filings.filter(pc.equal(company_filings['isInlineXBRL'], int(is_inline_xbrl)))
# Filter by filing date
filing_date = filing_date or date
if filing_date:
try:
company_filings = filter_by_date(company_filings, filing_date, 'filing_date')
except InvalidDateException as e:
log.error(e)
return None
# Sort filings
if sort_by:
company_filings = company_filings.sort_by(sort_by)
# Return filtered filings
return EntityFilings(company_filings, cik=self.cik, company_name=self.name)
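# Usage sketch (illustrative):
#   tenks = entity.get_filings(form="10-K", filing_date="2020-01-01:")   # form + open-ended date range
#   q3_2023 = entity.get_filings(year=2023, quarter=3)                   # filter by year and quarter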
@property
def is_company(self) -> bool:
"""Determine if this entity is a company."""
return not self.is_individual
@cached_property
def is_individual(self) -> bool:
"""
Determine if this entity is an individual.
Tricky logic to detect if a company is an individual or a company.
Companies have an ein, individuals do not. Oddly Warren Buffet has an EIN but not a state of incorporation
There may be other edge cases.
If you have a ticker or exchange you are a company.
"""
# Import locally using the lazy import cache
has_company_filings = lazy_import('edgar.entity.core.has_company_filings')
if len(self.tickers) > 0 or len(self.exchanges) > 0:
return False
elif hasattr(self, 'state_of_incorporation') and self.state_of_incorporation is not None and self.state_of_incorporation != '':
if self.cik == 1033331: # Reed Hastings exception
return True
return False
elif hasattr(self, 'entity_type') and self.entity_type not in ['', 'other']:
return False
elif has_company_filings(self.filings.data['form']):
if self.cik == 315090: # The Warren Buffett exception
return True
return False
elif not hasattr(self, 'ein') or self.ein is None or self.ein == "000000000":
return True
else:
return False
def __str__(self):
return f"EntityData({self.name} [{self.cik}])"
def __repr__(self):
repr_rich = lazy_import('edgar.richtools.repr_rich')
return repr_rich(self.__rich__())
def __rich__(self):
"""Creates a rich representation of the entity with clear information hierarchy."""
# Use lazy imports for rich components
box = lazy_import('rich.box')
Group = lazy_import('rich.console.Group')
Columns = lazy_import('rich.columns.Columns')
Padding = lazy_import('rich.padding.Padding')
Panel = lazy_import('rich.panel.Panel')
Table = lazy_import('rich.table.Table')
Text = lazy_import('rich.text.Text')
find_ticker = lazy_import('edgar.reference.tickers.find_ticker')
zip_longest = lazy_import('itertools.zip_longest')
datefmt = lazy_import('edgar.formatting.datefmt')
# Primary entity identification section
if self.is_company:
ticker = find_ticker(self.cik)
ticker = f"{ticker}" if ticker else ""
# The title of the panel
entity_title = Text.assemble("🏢 ",
(self.display_name, "bold green"),
" ",
(f"[{self.cik}] ", "dim"),
(ticker, "bold yellow")
)
else:
entity_title = Text.assemble("👤", (self.display_name, "bold green"))
# Primary Information Table
main_info = Table(box=box.SIMPLE_HEAVY, show_header=False, padding=(0, 1))
main_info.add_column("Row", style="") # Single column for the entire row
row_parts = []
row_parts.extend([Text("CIK", style="grey60"), Text(str(self.cik), style="bold deep_sky_blue3")])
if hasattr(self, 'entity_type') and self.entity_type:
if self.is_individual:
row_parts.extend([Text("Type", style="grey60"),
Text("Individual", style="bold yellow")])
else:
row_parts.extend([Text("Type", style="grey60"),
Text(self.entity_type.title(), style="bold yellow"),
Text(self._get_operating_type_emoticon(self.entity_type), style="bold yellow")])
main_info.add_row(*row_parts)
# Detailed Information Table
details = Table(box=box.SIMPLE, show_header=True, padding=(0, 1))
details.add_column("Category")
details.add_column("Industry")
details.add_column("Fiscal Year End")
details.add_row(
getattr(self, 'category', '-') or "-",
f"{getattr(self, 'sic', '')}: {getattr(self, 'sic_description', '')}" if hasattr(self, 'sic') and self.sic else "-",
self._format_fiscal_year_date(getattr(self, 'fiscal_year_end', '')) if hasattr(self, 'fiscal_year_end') and self.fiscal_year_end else "-"
)
# Combine main_info and details in a single panel
if self.is_company:
basic_info_renderables = [main_info, details]
else:
basic_info_renderables = [main_info]
basic_info_panel = Panel(
Group(*basic_info_renderables),
title="📋 Entity",
border_style="grey50"
)
# Trading Information
if self.tickers and self.exchanges:
trading_info = Table(box=box.SIMPLE, show_header=True, padding=(0, 1))
trading_info.add_column("Exchange")
trading_info.add_column("Symbol", style="bold yellow")
for exchange, ticker in zip_longest(self.exchanges, self.tickers, fillvalue="-"):
trading_info.add_row(exchange, ticker)
trading_panel = Panel(
trading_info,
title="📈 Exchanges",
border_style="grey50"
)
else:
trading_panel = Panel(
Text("No trading information available", style="grey58"),
title="📈 Trading Information",
border_style="grey50"
)
# Contact Information
contact_info = Table(box=box.SIMPLE, show_header=False, padding=(0, 1))
contact_info.add_column("Label", style="bold grey70")
contact_info.add_column("Value")
has_contact_info = any([
hasattr(self, 'phone') and self.phone,
hasattr(self, 'website') and self.website,
hasattr(self, 'investor_website') and self.investor_website
])
if hasattr(self, 'website') and self.website:
contact_info.add_row("Website", self.website)
if hasattr(self, 'investor_website') and self.investor_website:
contact_info.add_row("Investor Relations", self.investor_website)
if hasattr(self, 'phone') and self.phone:
contact_info.add_row("Phone", self.phone)
# Three-column layout for addresses and contact info
contact_renderables = []
if hasattr(self, 'business_address') and not self.business_address.empty:
contact_renderables.append(Panel(
Text(str(self.business_address)),
title="🏢 Business Address",
border_style="grey50"
))
if hasattr(self, 'mailing_address') and not self.mailing_address.empty:
contact_renderables.append(Panel(
Text(str(self.mailing_address)),
title="📫 Mailing Address",
border_style="grey50"
))
if has_contact_info:
contact_renderables.append(Panel(
contact_info,
title="📞 Contact Information",
border_style="grey50"
))
# Former Names Table (if any exist)
former_names_panel = None
if hasattr(self, 'former_names') and self.former_names:
former_names_table = Table(box=box.SIMPLE, show_header=False, padding=(0, 1))
former_names_table.add_column("Previous Company Names")
former_names_table.add_column("") # Empty column for better spacing
for former_name in self.former_names:
from_date = datefmt(former_name['from'], '%B %Y')
to_date = datefmt(former_name['to'], '%B %Y')
former_names_table.add_row(Text(former_name['name'], style="italic"), f"{from_date} to {to_date}")
former_names_panel = Panel(
former_names_table,
title="📜 Former Names",
border_style="grey50"
)
# Combine all sections using Group
if self.is_company:
content_renderables = [Padding("", (1, 0, 0, 0)), basic_info_panel, trading_panel]
if len(contact_renderables):
contact_and_addresses = Columns(contact_renderables, equal=True, expand=True)
content_renderables.append(contact_and_addresses)
if former_names_panel:
content_renderables.append(former_names_panel)
else:
content_renderables = [Padding("", (1, 0, 0, 0)), basic_info_panel]
if len(contact_renderables):
contact_and_addresses = Columns(contact_renderables, equal=True, expand=True)
content_renderables.append(contact_and_addresses)
content = Group(*content_renderables)
# Create the main panel
return Panel(
content,
title=entity_title,
subtitle="SEC Entity Data",
border_style="grey50"
)
@property
def display_name(self) -> str:
"""Reverse the name if it is a company"""
if self.is_company:
return self.name
return reverse_name(self.name)
@staticmethod
def _get_operating_type_emoticon(entity_type: str) -> str:
"""
Generate a meaningful single-width symbol based on the SEC entity type.
All symbols are chosen to be single-width to work well with rich borders.
Args:
entity_type (str): The SEC entity type (case-insensitive)
Returns:
str: A single-width symbol representing the entity type
"""
symbols = {
"operating": "●", # Circle for active operations
"subsidiary": "→", # Arrow showing connection to parent
"inactive": "×", # Cross for inactive
"holding company": "■", # Square for solid corporate structure
"investment company": "$", # Dollar for investment focus
"investment trust": "$", # Dollar for investment focus
"shell": "□", # Empty square for shell
"development stage": "▲", # Triangle for growth/development
"financial services": "¢", # Cent sign for financial services
"reit": "⌂", # House symbol
"spv": "◆", # Diamond for special purpose
"joint venture": "∞" # Infinity for partnership
}
# Clean input: convert to lowercase and strip whitespace
cleaned_type = entity_type.lower().strip()
# Handle some common variations
if "investment" in cleaned_type:
return symbols["investment company"]
if "real estate" in cleaned_type or "reit" in cleaned_type:
return symbols["reit"]
# Return default question mark if type not found
return symbols.get(cleaned_type, "?")
@staticmethod
def _format_fiscal_year_date(date_str):
"""Format fiscal year end date in a human-readable format."""
if not date_str:
return "-"
# Dictionary of months
months = {
"01": "Jan", "02": "Feb", "03": "Mar",
"04": "Apr", "05": "May", "06": "Jun",
"07": "Jul", "08": "Aug", "09": "Sep",
"10": "Oct", "11": "Nov", "12": "Dec"
}
# Extract month and day
month = date_str[:2]
if month not in months:
return date_str
try:
day = str(int(date_str[2:])) # Remove leading zero
return f"{months[month]} {day}"
except (ValueError, IndexError):
return date_str
class CompanyData(EntityData):
"""
Specialized container for company data loaded from SEC submissions API.
This is a specialized version of EntityData specifically for companies.
It adds company-specific methods and properties.
"""
def __init__(self, **kwargs):
"""Construct a new CompanyData object."""
super().__init__(**kwargs)
@property
def industry(self) -> str:
"""Get the industry description for this company."""
return getattr(self, 'sic_description', '')
def get_ticker(self) -> Optional[str]:
"""Get the primary ticker for this company."""
if self.tickers and len(self.tickers) > 0:
return self.tickers[0]
return None
def __str__(self):
ticker = self.get_ticker()
ticker_str = f" - {ticker}" if ticker else ""
return f"CompanyData({self.name} [{self.cik}]{ticker_str})"
# Compile regex patterns for better performance
_COMPANY_TYPES_PATTERN = re.compile(r"(L\.?L\.?C\.?|Inc\.?|Ltd\.?|L\.?P\.?|/[A-Za-z]{2,3}/?| CORP(ORATION)?|PLC| AG)$",
re.IGNORECASE)
_PUNCTUATION_PATTERN = re.compile(r"\.|,")
def preprocess_company(company: str) -> str:
"""preprocess the company name for storing in the search index"""
comp = _COMPANY_TYPES_PATTERN.sub("", company.lower())
comp = _PUNCTUATION_PATTERN.sub("", comp)
return comp.strip()
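# e.g. preprocess_company("Apple Inc.") -> "apple"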
def create_default_entity_data(cik: int) -> 'EntityData':
"""
Create a default EntityData instance for when entity data cannot be found.
Args:
cik: The CIK number to use for the entity
Returns:
A minimal EntityData instance with default values
"""
# Create a minimal EntityData with blank/empty values
empty_address = Address(
street1="",
street2="",
city="",
state_or_country="",
zipcode="",
state_or_country_desc=""
)
# Import using lazy import cache
empty_company_filings = lazy_import('edgar.entity.filings.empty_company_filings')
# Use the CIK as the name since we don't know the real name
name = f"Entity {cik}"
# Create a minimal entity data
return EntityData(
cik=cik,
name=name,
tickers=[],
exchanges=[],
filings=empty_company_filings(cik, name),
business_address=empty_address,
mailing_address=empty_address,
category="",
sic=None,
sic_description="",
fiscal_year_end="",
entity_type="",
phone="",
flags="",
insider_transaction_for_owner_exists=False,
insider_transaction_for_issuer_exists=False,
ein="",
description="",
website="",
investor_website="",
state_of_incorporation="",
state_of_incorporation_description="",
former_names=[],
files=[]
)

File diff suppressed because it is too large

View File

@@ -0,0 +1,65 @@
#!/usr/bin/env python3
"""
Process the learned canonical structures into a simplified mappings file
optimized for the Facts API.
"""
import json
def process_mappings():
"""Convert canonical structures to simple concept->statement mappings."""
# Load canonical structures
with open('learned_mappings.json', 'r') as f:
canonical = json.load(f)
# Create simplified mappings
mappings = {}
metadata = {
'version': '1.0.0',
'generated': '2025-08-13',
'companies_analyzed': 133,
'source': 'structural_learning_production_run'
}
# Process each statement type
for statement_type, concepts in canonical.items():
for concept_data in concepts:
concept = concept_data['concept']
# Only include high-confidence mappings
if concept_data['occurrence_rate'] >= 0.3: # 30% threshold
mappings[concept] = {
'statement_type': statement_type,
'confidence': concept_data['occurrence_rate'],
'label': concept_data['label'],
'parent': concept_data.get('parent'),
'is_abstract': concept_data.get('is_abstract', False),
'is_total': concept_data.get('is_total', False),
'section': concept_data.get('section'),
'avg_depth': concept_data.get('avg_depth', 0)
}
# Save processed mappings
output = {
'metadata': metadata,
'mappings': mappings
}
with open('statement_mappings_v1.json', 'w') as f:
json.dump(output, f, indent=2)
print(f"Processed {len(mappings)} concept mappings")
print("Statement distribution:")
stmt_counts = {}
for concept, data in mappings.items():
stmt = data['statement_type']
stmt_counts[stmt] = stmt_counts.get(stmt, 0) + 1
for stmt, count in sorted(stmt_counts.items()):
print(f" {stmt}: {count}")
if __name__ == "__main__":
process_mappings()

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@@ -0,0 +1,557 @@
# EntityFiling Class Documentation
## Overview
The `EntityFiling` class extends the base `Filing` class with additional entity-specific metadata and functionality. When you access filings through a `Company` object, you get `EntityFiling` instances that include enriched information from the SEC's company submissions API.
**Key Differences from Base Filing:**
- Additional metadata (items, acceptance datetime, file number, etc.)
- `related_filings()` method to find filings by file number
- XBRL format indicators (is_xbrl, is_inline_xbrl)
- Report date separate from filing date
- Access to entity context
## Getting EntityFilings
### From Company
```python
from edgar import Company
# Get company
company = Company("AAPL")
# Get filings - returns EntityFiling instances
filings = company.get_filings(form="10-K")
filing = filings.latest()
# filing is now an EntityFiling, not base Filing
print(type(filing)) # <class 'edgar.entity.filings.EntityFiling'>
```
### Automatic Enhancement
When you call `company.get_filings()`, the filings are automatically EntityFiling instances with additional metadata.
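A quick check (illustrative snippet):

```python
from edgar import Company

filing = Company("AAPL").get_filings(form="10-Q").latest()
print(filing.report_date, filing.file_number, filing.is_xbrl)
```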
## Common Actions
Quick reference for the most frequently used EntityFiling methods:
### Access Filing Content
```python
# Get HTML content
html = filing.html()
# Get plain text
text = filing.text()
# Get markdown formatted content
markdown = filing.markdown()
```
### Get Structured Data
```python
# Get form-specific object (10-K, 10-Q, 8-K, etc.)
report = filing.obj()
# Get XBRL financial data
xbrl = filing.xbrl()
```
### Entity-Specific Features
```python
# Find related filings (amendments, etc.)
related = filing.related_filings()
# Check XBRL availability
if filing.is_xbrl:
xbrl = filing.xbrl()
# Access entity-specific metadata
print(filing.report_date) # Period end date
print(filing.items) # 8-K items
print(filing.file_number) # SEC file number
```
### View in Browser
```python
# Open filing in web browser
filing.open()
```
### Get Attachments
```python
# Access all filing attachments
attachments = filing.attachments
```
## EntityFiling-Specific Attributes
### Additional Metadata
| Attribute | Type | Description |
|-----------|------|-------------|
| `report_date` | str | Period end date for the report (YYYY-MM-DD) |
| `acceptance_datetime` | str | SEC acceptance timestamp |
| `file_number` | str | SEC file number for tracking related filings |
| `items` | str | 8-K items (e.g., "2.02,9.01") |
| `size` | int | Filing size in bytes |
| `primary_document` | str | Primary document filename |
| `primary_doc_description` | str | Description of primary document |
| `is_xbrl` | bool | Whether filing has XBRL data |
| `is_inline_xbrl` | bool | Whether filing uses inline XBRL |
### Accessing Additional Metadata
```python
filing = company.get_filings(form="10-K").latest()
# Entity-specific attributes
print(f"Report Date: {filing.report_date}")
print(f"Accepted: {filing.acceptance_datetime}")
print(f"File Number: {filing.file_number}")
print(f"Has XBRL: {filing.is_xbrl}")
print(f"Inline XBRL: {filing.is_inline_xbrl}")
print(f"Size: {filing.size:,} bytes")
```
## Working with 8-K Items
The `items` attribute is especially useful for 8-K current reports, which can cover multiple topics.
### Understanding 8-K Items
8-K items indicate what events or information the filing reports:
- **2.02** - Results of Operations and Financial Condition
- **5.02** - Departure/Election of Directors or Officers
- **8.01** - Other Events
- **9.01** - Financial Statements and Exhibits
```python
# Get 8-K filings
filings_8k = company.get_filings(form="8-K")
# Filter by items
for filing in filings_8k:
if filing.items and "2.02" in filing.items:
print(f"Earnings 8-K: {filing.filing_date}")
print(f" Items: {filing.items}")
```
### Important Note on Legacy Filings
**Data Source Limitation**: The `items` value comes from SEC metadata, not from parsing the filing document.
**For Legacy SGML Filings (1999-2001)**: The SEC's historical metadata may be incorrect or incomplete. Modern XML filings (2005+) have accurate metadata.
**Workaround**: For accurate item extraction from legacy SGML 8-K filings, parse the filing text directly:
```python
# For legacy filings, parse the document
filing_text = filing.text()
# Use regex to find items (adjust pattern as needed)
import re
items_pattern = r'Item\s+(\d+\.\d+)'
found_items = re.findall(items_pattern, filing_text, re.IGNORECASE)
```
## Related Filings
### Finding Related Filings by File Number
Use the `file_number` to find amendments, related documents, or filings from the same series:
```python
# Get original filing
filing = company.get_filings(form="10-K").latest()
# Find all related filings (amendments, etc.)
related = filing.related_filings()
print(f"Original filing: {filing.accession_no}")
print(f"Related filings: {len(related)}")
for f in related:
print(f" {f.form} - {f.filing_date}")
```
### Use Cases for Related Filings
**1. Find Amendments:**
```python
# Get original 10-K
filing_10k = company.get_filings(form="10-K").latest()
# Find any amendments
related = filing_10k.related_filings()
amendments = related.filter(form="10-K/A")
if len(amendments) > 0:
print("Filing was amended:")
for amendment in amendments:
print(f" {amendment.filing_date}: {amendment.accession_no}")
```
**2. Track Filing Series:**
```python
# Get S-1 registration
s1 = company.get_filings(form="S-1").latest()
# Find all related S-1 amendments
series = s1.related_filings()
print(f"Registration series: {len(series)} filings")
```
## XBRL Indicators
The `is_xbrl` and `is_inline_xbrl` attributes help determine if structured financial data is available.
### Checking XBRL Availability
```python
filing = company.get_filings(form="10-K").latest()
if filing.is_xbrl:
print("Filing has XBRL data")
if filing.is_inline_xbrl:
print(" Uses inline XBRL format")
xbrl = filing.xbrl() # Parse XBRL data
else:
print(" Uses traditional XBRL format")
else:
print("No XBRL data available")
```
### Filtering by XBRL
```python
# Get only filings with XBRL data
filings = company.get_filings(form="10-Q")
xbrl_filings = [f for f in filings if f.is_xbrl]
print(f"{len(xbrl_filings)} of {len(filings)} have XBRL")
# Check inline XBRL adoption
inline_count = sum(1 for f in xbrl_filings if f.is_inline_xbrl)
print(f"{inline_count} use inline XBRL format")
```
## Report Date vs Filing Date
EntityFiling provides both `report_date` and `filing_date`:
- **`report_date`**: Period end date (what the filing reports on)
- **`filing_date`**: When the filing was submitted to SEC
```python
filing = company.get_filings(form="10-Q").latest()
print(f"Period Ended: {filing.report_date}")
print(f"Filed On: {filing.filing_date}")
# Calculate filing lag
from datetime import datetime
report_dt = datetime.strptime(filing.report_date, '%Y-%m-%d')
filing_dt = datetime.strptime(filing.filing_date, '%Y-%m-%d')
lag_days = (filing_dt - report_dt).days
print(f"Filing lag: {lag_days} days")
```
## Common Workflows
### Analyzing 8-K Patterns
```python
# Get all 8-K filings
filings_8k = company.get_filings(form="8-K")
# Categorize by item
from collections import Counter
item_counts = Counter()
for filing in filings_8k:
if filing.items:
for item in filing.items.split(','):
item_counts[item.strip()] += 1
# Show most common 8-K topics
print("Most common 8-K items:")
for item, count in item_counts.most_common(5):
print(f" Item {item}: {count} filings")
```
### Track Amendment Activity
```python
# Get all 10-K filings including amendments
all_10k = company.get_filings(form=["10-K", "10-K/A"])
# Group by year
from collections import defaultdict
by_year = defaultdict(list)
for filing in all_10k:
year = filing.report_date[:4]
by_year[year].append(filing)
# Check which years had amendments
for year in sorted(by_year.keys(), reverse=True):
filings = by_year[year]
has_amendment = any('/A' in f.form for f in filings)
status = "amended" if has_amendment else "original"
print(f"{year}: {len(filings)} filing(s) - {status}")
```
### Find Earnings Announcements
```python
# Find 8-K filings with earnings (Item 2.02)
earnings_8k = []
for filing in company.get_filings(form="8-K"):
if filing.items and "2.02" in filing.items:
earnings_8k.append(filing)
print(f"Found {len(earnings_8k)} earnings 8-K filings")
# Show filing timeline
for filing in earnings_8k[-5:]: # Last 5
print(f"{filing.report_date}: {filing.filing_date}")
```
### Check XBRL Adoption Timeline
```python
# Track when company started using XBRL
filings = company.get_filings(form="10-K")
for filing in filings:
xbrl_status = "inline XBRL" if filing.is_inline_xbrl else "XBRL" if filing.is_xbrl else "no XBRL"
print(f"{filing.filing_date}: {xbrl_status}")
```
## Integration with Base Filing Features
EntityFiling inherits all methods from the base Filing class:
```python
filing = company.get_filings(form="10-K").latest()
# All base Filing methods work
html = filing.html()
text = filing.text()
markdown = filing.markdown()
xbrl = filing.xbrl()
filing.open()
# PLUS entity-specific features
related = filing.related_filings()
print(f"8-K items: {filing.items}")
print(f"Has XBRL: {filing.is_xbrl}")
```
## Comparison: EntityFiling vs Base Filing
### When You Get Each Type
**EntityFiling** - From Company context:
```python
company = Company("AAPL")
filing = company.get_filings(form="10-K").latest()
# Type: EntityFiling (with extra metadata)
```
**Base Filing** - From general search:
```python
from edgar import get_filings
filings = get_filings(2024, 3, form="10-K")
filing = filings[0]
# Type: Filing (base class)
```
### Feature Comparison
| Feature | Base Filing | EntityFiling |
|---------|-------------|--------------|
| Basic metadata | ✅ | ✅ |
| Content access (html, text) | ✅ | ✅ |
| XBRL parsing | ✅ | ✅ |
| Report date | ❌ | ✅ |
| Acceptance datetime | ❌ | ✅ |
| File number | ❌ | ✅ |
| 8-K items | ❌ | ✅ |
| XBRL indicators | ❌ | ✅ |
| related_filings() | ❌ | ✅ |
## Best Practices
### 1. Use EntityFiling for Company Analysis
When working with a specific company, always access filings through the Company object to get EntityFiling benefits:
```python
# Good - get EntityFiling with metadata
company = Company("AAPL")
filing = company.get_filings(form="10-K").latest()
# Less ideal - get base Filing without metadata
filings = get_filings(2024, 3, form="10-K").filter(ticker="AAPL")
filing = filings[0]
```
### 2. Check XBRL Availability Before Parsing
```python
filing = company.get_filings(form="10-K").latest()
if filing.is_xbrl:
xbrl = filing.xbrl()
statements = xbrl.statements
else:
print("No structured financial data available")
```
### 3. Handle Missing Items Gracefully
```python
# Items may be None or empty string
if filing.items:
items_list = filing.items.split(',')
else:
items_list = []
```
### 4. Use Related Filings to Track Changes
```python
# Find if filing was amended
filing = company.get_filings(form="10-K").latest()
related = filing.related_filings()
amendments = [f for f in related if '/A' in f.form]
if amendments:
print(f"This filing has {len(amendments)} amendment(s)")
latest_amendment = amendments[-1]
print(f"Most recent: {latest_amendment.filing_date}")
```
## Error Handling
### Missing Attributes
Not all filings have all attributes populated:
```python
filing = company.get_filings(form="8-K").latest()
# Some filings may not have items
items = filing.items if filing.items else "Not specified"
# File number should always be present for EntityFiling
if filing.file_number:
print(f"File number: {filing.file_number}")
```
### XBRL Parsing Failures
Even if `is_xbrl` is True, parsing can fail:
```python
if filing.is_xbrl:
try:
xbrl = filing.xbrl()
statements = xbrl.statements
except Exception as e:
print(f"XBRL parsing failed: {e}")
# Fall back to text parsing
text = filing.text()
```
## Performance Considerations
### Efficient Filtering
Use EntityFiling metadata to filter before expensive operations:
```python
# Filter by XBRL availability first
filings = company.get_filings(form="10-Q")
xbrl_filings = [f for f in filings if f.is_xbrl]
# Then parse only those with XBRL
for filing in xbrl_filings:
xbrl = filing.xbrl()
# Process XBRL data...
```
### Batch Operations
When processing many filings, check size first:
```python
filings = company.get_filings()
# Process smaller filings first
sorted_filings = sorted(filings, key=lambda f: f.size)
for filing in sorted_filings[:10]: # Process 10 smallest
html = filing.html()
# Process content...
```
## Troubleshooting
### "EntityFiling has no attribute 'X'"
You're trying to use EntityFiling-specific features on a base Filing object:
```python
# Problem: Base filing doesn't have entity attributes
filings = get_filings(2024, 3)
filing = filings[0]
# filing.report_date # AttributeError!
# Solution: Get from company for EntityFiling
company = Company(filing.cik)
entity_filing = company.get_filings(
accession_number=filing.accession_no
)[0]
# entity_filing.report_date # Works!
```
### Related Filings Returns Empty
The file number might not link to other filings:
```python
related = filing.related_filings()
if len(related) == 0:
print("No related filings found")
# This is normal for standalone filings
else:
print(f"Found {len(related)} related filing(s)")
```
### Items Not Showing for 8-K
Check if it's a legacy filing:
```python
filing = company.get_filings(form="8-K")[0]
if not filing.items:
# Check filing year
filing_year = int(filing.filing_date[:4])
if filing_year < 2005:
print("Legacy SGML filing - items may be missing from metadata")
print("Parse filing text for accurate item identification")
else:
print("Modern filing with no items specified")
```
This comprehensive guide covers the unique features and workflows available when working with EntityFiling objects in edgartools.

View File

@@ -0,0 +1,671 @@
# EntityFilings Class Documentation
## Overview
The `EntityFilings` class extends the base `Filings` class with entity-specific functionality. When you access filings through a `Company` object, you get an `EntityFilings` collection that maintains entity context (CIK and company name) and returns `EntityFiling` instances with enriched metadata.
**Key Differences from Base Filings:**
- Maintains entity context (CIK, company name)
- Returns `EntityFiling` instances (not base `Filing`)
- All filtering/selection methods preserve `EntityFilings` type
- Additional metadata from SEC company submissions API
- Direct access to entity-specific features
## Getting EntityFilings
### From Company
```python
from edgar import Company
# Get company
company = Company("AAPL")
# Get filings - returns EntityFilings collection
filings = company.get_filings()
# filings is EntityFilings, not base Filings
print(type(filings)) # <class 'edgar.entity.filings.EntityFilings'>
# Each filing in the collection is EntityFiling
filing = filings[0]
print(type(filing)) # <class 'edgar.entity.filings.EntityFiling'>
```
### With Form Filters
```python
# Get specific form types
filings_10k = company.get_filings(form="10-K")
filings_8k = company.get_filings(form="8-K")
filings_multi = company.get_filings(form=["10-K", "10-Q"])
```
## Common Actions
Quick reference for the most frequently used EntityFilings methods:
### Get Individual Filings
```python
# Get most recent filing
latest = filings.latest()
# Get multiple recent filings
latest_5 = filings.latest(5)
# Get filing by index
filing = filings[0]
filing = filings.get_filing_at(5)
```
### Filter the Collection
```python
# Filter by form type
annual_reports = filings.filter(form="10-K")
# Filter by date
recent = filings.filter(filing_date="2024-01-01:")
# Exclude amendments
originals_only = filings.filter(amendments=False)
# Combined filters
filtered = filings.filter(
form=["10-K", "10-Q"],
filing_date="2023-01-01:2023-12-31",
amendments=False
)
```
### Navigate Pages
```python
# For large collections (multiple pages)
next_page = filings.next()
prev_page = filings.previous()
```
### Convert to DataFrame
```python
# Export to pandas
df = filings.to_pandas()
# Select specific columns
df = filings.to_pandas('form', 'filing_date', 'accession_number')
```
### Select Subsets
```python
# Get first/last n filings
first_10 = filings.head(10)
last_10 = filings.tail(10)
# Random sample
sample = filings.sample(20)
```
## EntityFilings-Specific Features
### Entity Context
EntityFilings maintains the entity context throughout operations:
```python
filings = company.get_filings()
# Access entity information
print(filings.cik) # Company CIK
print(filings.company_name) # Company name
# Context preserved through operations
filtered = filings.filter(form="10-K")
print(filtered.cik) # Same CIK
print(filtered.company_name) # Same company name
```
### Returns EntityFiling Instances
All methods that return individual filings return `EntityFiling` (not base `Filing`):
```python
# Get latest returns EntityFiling
filing = filings.latest()
print(type(filing)) # EntityFiling
# Indexing returns EntityFiling
filing = filings[0]
print(type(filing)) # EntityFiling
# Access EntityFiling-specific attributes
print(filing.report_date) # Period end date
print(filing.items) # 8-K items
print(filing.is_xbrl) # XBRL indicator
```
### Type Preservation
All collection methods preserve the `EntityFilings` type:
```python
# filter() returns EntityFilings
filtered = filings.filter(form="10-K")
print(type(filtered)) # EntityFilings
# head() returns EntityFilings
first_10 = filings.head(10)
print(type(first_10)) # EntityFilings
# latest(n) with n>1 returns EntityFilings
latest_5 = filings.latest(5)
print(type(latest_5)) # EntityFilings
```
## Core Methods
### latest(n=1)
Get the most recent filing(s):
```python
# Get single latest filing (returns EntityFiling)
latest = filings.latest()
print(f"Most recent: {latest.form} on {latest.filing_date}")
# Get multiple latest filings (returns EntityFilings)
latest_5 = filings.latest(5)
for filing in latest_5:
print(f"{filing.form}: {filing.filing_date}")
```
### filter()
Filter filings by various criteria:
```python
# Filter by form type
filings_10k = filings.filter(form="10-K")
filings_8k = filings.filter(form="8-K")
filings_annual = filings.filter(form=["10-K", "10-K/A"])
# Filter by date
recent = filings.filter(filing_date="2024-01-01:")
date_range = filings.filter(filing_date="2023-01-01:2023-12-31")
specific_date = filings.filter(filing_date="2024-03-15")
# Exclude amendments
no_amendments = filings.filter(amendments=False)
# Filter by accession number
specific = filings.filter(accession_number="0000320193-24-000123")
# Combined filters
filtered = filings.filter(
form="10-Q",
filing_date="2024-01-01:",
amendments=False
)
```
**Note**: Unlike base `Filings.filter()`, `EntityFilings.filter()` doesn't support `cik` or `ticker` parameters since the collection is already scoped to a single entity.
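For example (illustrative):

```python
filings = company.get_filings()          # already scoped to one entity
by_form = filings.filter(form="10-K")    # supported
# filings.filter(ticker="AAPL")          # not supported on EntityFilings
```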
### head(n) / tail(n)
Get first or last n filings:
```python
# Get first 10 filings
first_10 = filings.head(10)
# Get last 10 filings
last_10 = filings.tail(10)
# Both return EntityFilings collections
print(type(first_10)) # EntityFilings
print(type(last_10)) # EntityFilings
```
### sample(n)
Get random sample of filings:
```python
# Get random sample of 20 filings
sample = filings.sample(20)
# Returns EntityFilings collection
print(type(sample)) # EntityFilings
```
### Access by Index
```python
# Direct indexing
first_filing = filings[0]
tenth_filing = filings[9]
# Explicit method
filing = filings.get_filing_at(5)
# All return EntityFiling instances
```
## Pagination
For large filing collections, EntityFilings supports pagination:
### next() / previous()
```python
# Display shows page info if multiple pages
print(filings)
# Shows: "Showing 1 to 50 of 250 filings. Page using ← prev() and next() →"
# Navigate to next page
next_page = filings.next()
# Navigate to previous page
prev_page = filings.previous()
# Both return EntityFilings with new page of data
```
### Page Navigation Example
```python
# Start with first page
current_page = company.get_filings()
print(current_page)
# Move through pages
page_2 = current_page.next()
page_3 = page_2.next()
# Go back
page_2_again = page_3.previous()
# At end of pages
last_page = current_page
while True:
next_page = last_page.next()
if next_page is None:
break
last_page = next_page
```
## Data Conversion & Export
### to_pandas()
Convert to pandas DataFrame:
```python
# All columns
df = filings.to_pandas()
# Specific columns
df = filings.to_pandas('form', 'filing_date', 'accession_number')
# Shows entity-specific columns:
# form, filing_date, reportDate, acceptanceDateTime, fileNumber,
# items, size, primaryDocument, isXBRL, isInlineXBRL, etc.
```
### to_dict()
Convert to dictionary:
```python
# Convert to dict
data = filings.to_dict()
# Limit rows
data = filings.to_dict(max_rows=100)
```
### save() / save_parquet()
Save to Parquet file:
```python
# Save as Parquet
filings.save_parquet("company_filings.parquet")
# Alternative
filings.save("company_filings.parquet")
```
## Common Workflows
### Get Most Recent Annual Report
```python
company = Company("AAPL")
# Get all 10-K filings
filings_10k = company.get_filings(form="10-K")
# Get most recent
latest_10k = filings_10k.latest()
print(f"Latest 10-K: {latest_10k.filing_date}")
print(f"Period: {latest_10k.report_date}")
# Access XBRL if available
if latest_10k.is_xbrl:
xbrl = latest_10k.xbrl()
```
### Analyze Quarterly Reports
```python
# Get all 10-Q filings
filings_10q = company.get_filings(form="10-Q")
# Get last 4 quarters
last_4_quarters = filings_10q.latest(4)
# Analyze each quarter
for filing in last_4_quarters:
print(f"Quarter ending {filing.report_date}:")
print(f" Filed: {filing.filing_date}")
print(f" XBRL: {filing.is_xbrl}")
```
### Find 8-K Earnings Announcements
```python
# Get all 8-K filings
filings_8k = company.get_filings(form="8-K")
# Filter for earnings-related items
earnings_filings = []
for filing in filings_8k:
if filing.items and "2.02" in filing.items:
earnings_filings.append(filing)
print(f"Found {len(earnings_filings)} earnings 8-Ks")
# Show recent earnings dates
for filing in earnings_filings[:5]:
print(f"{filing.filing_date}: Items {filing.items}")
```
### Track Amendment Activity
```python
# Get all 10-K filings including amendments
all_10k = company.get_filings(form=["10-K", "10-K/A"])
# Separate originals from amendments
originals = all_10k.filter(amendments=False)
amendments = all_10k.filter(form="10-K/A")
print(f"Original 10-Ks: {len(originals)}")
print(f"Amended 10-Ks: {len(amendments)}")
# Show amendment details
for amendment in amendments:
print(f"{amendment.filing_date}: {amendment.accession_no}")
```
### Export Filings to DataFrame
```python
# Get recent filings
filings = company.get_filings(form=["10-K", "10-Q"])
# Filter to recent year
recent = filings.filter(filing_date="2024-01-01:")
# Convert to DataFrame
df = recent.to_pandas()
# Analyze
print(f"Total filings: {len(df)}")
print(f"Forms: {df['form'].value_counts()}")
print(f"XBRL filings: {df['isXBRL'].sum()}")
# Export
df.to_csv("aapl_recent_filings.csv", index=False)
```
### Compare XBRL Adoption
```python
import pandas as pd

# Get all annual reports
filings_10k = company.get_filings(form="10-K")
# Convert to DataFrame
df = filings_10k.to_pandas()
# Group by year
df['year'] = pd.to_datetime(df['filing_date']).dt.year
# Check XBRL adoption by year
xbrl_by_year = df.groupby('year').agg({
'isXBRL': 'sum',
'isInlineXBRL': 'sum',
'form': 'count'
}).rename(columns={'form': 'total'})
print(xbrl_by_year)
```
## Display & Representation
### Rich Display
EntityFilings displays as a rich table with pagination info:
```python
print(filings)
```
Shows:
- Table of filings with: #, Form, Description, Filing Date, Accession Number
- Pagination info (if multiple pages): "Showing 1 to 50 of 250 filings"
- Panel title: "Filings for [Company Name] [CIK]"
- Panel subtitle: Date range of filings
### Properties
```python
# Check if empty
if filings.empty:
print("No filings found")
# Get date range
start, end = filings.date_range
print(f"Filings from {start} to {end}")
# Get summary
print(filings.summary)
```
## Comparison: EntityFilings vs Base Filings
### When You Get Each Type
**EntityFilings** - From Company context:
```python
company = Company("AAPL")
filings = company.get_filings()
# Type: EntityFilings (with entity context)
```
**Base Filings** - From general search:
```python
from edgar import get_filings
filings = get_filings(2024, 1, form="10-K")
# Type: Filings (base class)
```
### Feature Comparison
| Feature | Base Filings | EntityFilings |
|---------|-------------|---------------|
| Filter by form | ✅ | ✅ |
| Filter by date | ✅ | ✅ |
| Filter by CIK/ticker | ✅ | ❌ (already scoped to entity) |
| Returns EntityFiling | ❌ | ✅ |
| Entity context (CIK, name) | ❌ | ✅ |
| Type preserved in operations | Filings | EntityFilings |
| From Company.get_filings() | ❌ | ✅ |
## Best Practices
### 1. Use EntityFilings for Company Analysis
When working with a specific company, always use `Company.get_filings()`:
```python
# Good - get EntityFilings with context
company = Company("AAPL")
filings = company.get_filings(form="10-K")
# Less ideal - get base Filings, requires filtering
from edgar import get_filings
all_filings = get_filings(2024, 1, form="10-K")
apple_filings = all_filings.filter(ticker="AAPL")
```
### 2. Check Empty Collections
```python
filings = company.get_filings(form="RARE-FORM")
if filings.empty:
print("No filings found")
else:
latest = filings.latest()
```
### 3. Use latest() for Single Most Recent
```python
# Get single filing
filing = filings.latest()
# Not this (gets collection of 1)
filings_one = filings.head(1)
filing = filings_one[0]
```
### 4. Preserve Type Through Operations
```python
# All these return EntityFilings
filtered = filings.filter(form="10-K")
recent = filtered.filter(filing_date="2024-01-01:")
sample = recent.sample(10)
# All maintain entity context
print(sample.cik) # Still accessible
print(sample.company_name) # Still accessible
```
## Error Handling
### Empty Collections
```python
filings = company.get_filings(form="NONEXISTENT")
if filings.empty:
print("No filings found")
else:
# Safe to access
latest = filings.latest()
```
### Pagination at Boundaries
```python
# At end of pages
last_page = filings
while True:
next_page = last_page.next()
if next_page is None:
print("Reached end of filings")
break
last_page = next_page
```
### Invalid Index
```python
# Check length first
if len(filings) > 5:
filing = filings[5]
else:
print("Collection has fewer than 6 filings")
```
## Performance Considerations
### Efficient Filtering
Filter early to reduce data size:
```python
# Good: filter first, then process
recent_10k = company.get_filings(form="10-K", filing_date="2023-01-01:")
for filing in recent_10k:
process(filing)
# Less efficient: get all, then filter in Python
all_filings = company.get_filings()
for filing in all_filings:
if filing.form == "10-K" and filing.filing_date >= "2023-01-01":
process(filing)
```
### Use Pagination
For very large collections, use pagination:
```python
# Process page by page
current_page = company.get_filings()
while current_page:
# Process current page
for filing in current_page:
process(filing)
# Move to next page
current_page = current_page.next()
```
### DataFrame Conversion
Only convert to pandas when needed:
```python
# Good: operate on EntityFilings directly
filings_10k = filings.filter(form="10-K")
latest = filings_10k.latest()
# Less efficient: convert to DataFrame first
df = filings.to_pandas()
df_10k = df[df['form'] == '10-K']
# Now you've lost EntityFiling functionality
```
## Integration with Company
EntityFilings is the primary interface between Company and Filing objects:
```python
company = Company("AAPL")
# Company.get_filings() returns EntityFilings
filings = company.get_filings()
# EntityFilings contains EntityFiling instances
filing = filings[0]
# EntityFiling knows its entity
entity = filing.get_entity()
# entity is the same Company object
```
This creates a seamless workflow for entity-focused analysis while maintaining proper type separation and functionality at each level.

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@@ -0,0 +1,454 @@
"""
Filings-related classes for the Entity package.
This module contains classes related to SEC filings for entities, including
collections of filings and filing facts.
"""
from typing import List, Union
import pandas as pd
import pyarrow as pa
import pyarrow.compute as pc
from rich.box import SIMPLE
from rich.console import Group
from rich.panel import Panel
from rich.table import Table
from rich.text import Text
from edgar._filings import Filing, Filings, PagingState
from edgar.core import IntString, log
from edgar.formatting import accession_number_text, display_size
from edgar.reference.forms import describe_form
from edgar.richtools import Docs, df_to_rich_table, repr_rich
__all__ = [
'EntityFiling',
'EntityFilings',
'EntityFacts',
'empty_company_filings'
]
class EntityFiling(Filing):
"""
Represents a single SEC filing for an entity.
This extends the base Filing class with additional information
and methods specific to SEC entities.
Attributes:
items (str): Filing items from SEC metadata. For 8-K filings, this indicates
which items are included (e.g., "2.02,9.01").
**Data Source**: This value comes from SEC filing metadata, not from parsing
the filing document itself.
**Legacy SGML Limitation**: For legacy SGML filings (1999-2001), the SEC's
historical metadata may be incorrect or incomplete. Modern XML filings (2005+)
have accurate metadata.
**Workaround for Legacy Filings**: For accurate item extraction from legacy
SGML 8-K filings, parse the filing text directly using regex patterns.
See GitHub Issue #462 for example code.
"""
def __init__(self,
cik: int,
company: str,
form: str,
filing_date: str,
report_date: str,
acceptance_datetime: str,
accession_no: str,
file_number: str,
items: str,
size: int,
primary_document: str,
primary_doc_description: str,
is_xbrl: bool,
is_inline_xbrl: bool):
super().__init__(cik=cik, company=company, form=form, filing_date=filing_date, accession_no=accession_no)
self.report_date = report_date
self.acceptance_datetime = acceptance_datetime
self.file_number: str = file_number
self.items: str = items # See class docstring for important notes on data source and limitations
self.size: int = size
self.primary_document: str = primary_document
self.primary_doc_description: str = primary_doc_description
self.is_xbrl: bool = is_xbrl
self.is_inline_xbrl: bool = is_inline_xbrl
def related_filings(self):
"""Get all the filings related to this one by file number."""
return self.get_entity().get_filings(file_number=self.file_number, sort_by="filing_date")
def __str__(self):
return (f"Filing(company='{self.company}', cik={self.cik}, form='{self.form}', "
f"filing_date='{self.filing_date}', accession_no='{self.accession_no}')"
)
class EntityFilings(Filings):
"""
Collection of SEC filings for an entity.
This extends the base Filings class with additional methods and properties
specific to entity filings.
"""
def __init__(self,
data: pa.Table,
cik: int,
company_name: str,
original_state: PagingState = None):
super().__init__(data, original_state=original_state)
self.cik = cik
self.company_name = company_name
@property
def docs(self):
return Docs(self)
def __getitem__(self, item):
return self.get_filing_at(item)
@property
def empty(self):
return len(self.data) == 0
def get_filing_at(self, item: int):
"""Get the filing at the specified index."""
return EntityFiling(
cik=self.cik,
company=self.company_name,
form=self.data['form'][item].as_py(),
filing_date=self.data['filing_date'][item].as_py(),
report_date=self.data['reportDate'][item].as_py(),
acceptance_datetime=self.data['acceptanceDateTime'][item].as_py(),
accession_no=self.data['accession_number'][item].as_py(),
file_number=self.data['fileNumber'][item].as_py(),
items=self.data['items'][item].as_py(),
size=self.data['size'][item].as_py(),
primary_document=self.data['primaryDocument'][item].as_py(),
primary_doc_description=self.data['primaryDocDescription'][item].as_py(),
is_xbrl=self.data['isXBRL'][item].as_py(),
is_inline_xbrl=self.data['isInlineXBRL'][item].as_py()
)
def filter(self,
form: Union[str, List[str]] = None,
amendments: bool = None,
filing_date: str = None,
date: str = None,
cik: Union[int, str, List[Union[int, str]]] = None,
ticker: Union[str, List[str]] = None,
accession_number: Union[str, List[str]] = None):
"""
Filter the filings based on various criteria.
Args:
form: Filter by form type
amendments: Include amendments
filing_date: Filter by filing date
date: Alias for filing_date
cik: Filter by CIK
ticker: Filter by ticker
accession_number: Filter by accession number
Returns:
Filtered EntityFilings
"""
# The super filter returns Filings. We want EntityFilings
res = super().filter(form=form,
amendments=amendments,
filing_date=filing_date,
date=date,
cik=cik,
ticker=ticker,
accession_number=accession_number)
return EntityFilings(data=res.data, cik=self.cik, company_name=self.company_name)
def latest(self, n: int = 1):
"""
Get the latest n filings.
Args:
n: Number of filings to return
Returns:
            Latest filing(s): a single EntityFiling if there is one result, an EntityFilings collection for multiple, or None if the collection is empty
"""
sort_indices = pc.sort_indices(self.data, sort_keys=[("filing_date", "descending")])
sort_indices_top = sort_indices[:min(n, len(sort_indices))]
latest_filing_index = pc.take(data=self.data, indices=sort_indices_top)
filings = EntityFilings(latest_filing_index,
cik=self.cik,
company_name=self.company_name)
if filings.empty:
return None
if len(filings) == 1:
return filings[0]
else:
return filings
def head(self, n: int):
"""
Get the first n filings.
Args:
n: Number of filings to return
Returns:
EntityFilings containing the first n filings
"""
selection = self._head(n)
return EntityFilings(data=selection, cik=self.cik, company_name=self.company_name)
def tail(self, n: int):
"""
Get the last n filings.
Args:
n: Number of filings to return
Returns:
EntityFilings containing the last n filings
"""
selection = self._tail(n)
return EntityFilings(data=selection, cik=self.cik, company_name=self.company_name)
def sample(self, n: int):
"""
Get a random sample of n filings.
Args:
n: Number of filings to sample
Returns:
EntityFilings containing n random filings
"""
selection = self._sample(n)
return EntityFilings(data=selection, cik=self.cik, company_name=self.company_name)
@staticmethod
def summarize(data) -> pd.DataFrame:
"""
Summarize filing data as a pandas DataFrame.
Args:
data: Filing data to summarize
Returns:
DataFrame with summarized data
"""
return (data
.assign(size=lambda df: df['size'].apply(display_size),
isXBRL=lambda df: df.isXBRL.map({'1': "\u2713", 1: "\u2713"}).fillna(""),
)
.filter(["form", "filing_date", "accession_number", "isXBRL"])
.rename(columns={"filing_date": "filed", "isXBRL": "xbrl"})
)
def next(self):
"""
Show the next page of filings.
Returns:
EntityFilings with the next page of data, or None if at the end
"""
data_page = self.data_pager.next()
if data_page is None:
log.warning("End of data .. use prev() \u2190 ")
return None
start_index, _ = self.data_pager._current_range
filings_state = PagingState(page_start=start_index, num_records=len(self))
return EntityFilings(data_page,
cik=self.cik,
company_name=self.company_name,
original_state=filings_state)
def previous(self):
"""
Show the previous page of filings.
Returns:
EntityFilings with the previous page of data, or None if at the beginning
"""
data_page = self.data_pager.previous()
if data_page is None:
log.warning(" No previous data .. use next() \u2192 ")
return None
start_index, _ = self.data_pager._current_range
filings_state = PagingState(page_start=start_index, num_records=len(self))
return EntityFilings(data_page,
cik=self.cik,
company_name=self.company_name,
original_state=filings_state)
def __repr__(self):
return repr_rich(self.__rich__())
def __rich__(self):
# Create table with appropriate columns and styling
table = Table(
show_header=True,
header_style="bold",
show_edge=True,
expand=False,
padding=(0, 1),
box=SIMPLE,
row_styles=["", "bold"]
)
# Add columns with specific styling and alignment
table.add_column("#", style="dim", justify="right")
table.add_column("Form", width=10, style="bold yellow")
table.add_column("Description", width=60, style="bold blue"),
table.add_column("Filing Date", width=11)
table.add_column("Accession Number", width=20)
# Get current page from data pager
current_page = self.data_pager.current()
# Calculate start index for proper indexing
start_idx = self._original_state.page_start if self._original_state else self.data_pager.start_index
# Iterate through rows in current page
for i in range(len(current_page)):
form = current_page['form'][i].as_py()
description = describe_form(current_page['form'][i].as_py(), prepend_form=False)
row = [
str(start_idx + i),
form,
description,
str(current_page['filing_date'][i].as_py()),
accession_number_text(current_page['accession_number'][i].as_py())
]
table.add_row(*row)
# Show paging information only if there are multiple pages
elements = [table]
if self.data_pager.total_pages > 1:
total_filings = self._original_state.num_records
current_count = len(current_page)
start_num = start_idx + 1
end_num = start_idx + current_count
page_info = Text.assemble(
("Showing ", "dim"),
(f"{start_num:,}", "bold red"),
(" to ", "dim"),
(f"{end_num:,}", "bold red"),
(" of ", "dim"),
(f"{total_filings:,}", "bold"),
(" filings.", "dim"),
(" Page using ", "dim"),
("← prev()", "bold gray54"),
(" and ", "dim"),
("next() →", "bold gray54")
)
elements.extend([Text("\n"), page_info])
# Get the title
title = Text.assemble(
("Filings for ", "bold"),
(f"{self.company_name}", "bold green"),
(" [", "dim"),
(f"{self.cik}", "bold yellow"),
("]", "dim")
)
# Get the subtitle
start_date, end_date = self.date_range
date_range_text = f"Company filings between {start_date:%Y-%m-%d} and {end_date:%Y-%m-%d}" if start_date else "Company filings"
subtitle = Text.assemble(
(date_range_text, "dim"),
"",
("filings.docs", "cyan dim"),
(" for usage guide", "dim")
)
return Panel(
Group(*elements),
title=title,
subtitle=subtitle,
border_style="bold grey54",
expand=False
)
class EntityFacts:
"""
Contains structured facts data about an entity from XBRL filings.
"""
def __init__(self,
cik: int,
name: str,
facts: pa.Table,
fact_meta: pd.DataFrame):
self.cik: int = cik
self.name: str = name
self.facts: pa.Table = facts
self.fact_meta: pd.DataFrame = fact_meta
def to_pandas(self) -> pd.DataFrame:
"""Convert facts to a pandas DataFrame."""
return self.facts.to_pandas()
def __len__(self):
return len(self.facts)
def num_facts(self) -> int:
"""Get the number of facts."""
return len(self.fact_meta)
def __rich__(self):
return Panel(
Group(
df_to_rich_table(self.facts)
), title=f"Company Facts({self.name} [{self.cik}] {len(self.facts):,} total facts)"
)
def __repr__(self):
return repr_rich(self.__rich__())
COMPANY_FILINGS_SCHEMA = pa.schema([
('accession_number', pa.string()),
('filing_date', pa.date32()),
('reportDate', pa.string()),
('acceptanceDateTime', pa.timestamp('us')), # Changed to timestamp
('act', pa.string()),
('form', pa.string()),
('fileNumber', pa.string()),
('items', pa.string()),
('size', pa.string()),
('isXBRL', pa.string()),
('isInlineXBRL', pa.string()),
('primaryDocument', pa.string()),
('primaryDocDescription', pa.string())
])
def empty_company_filings(cik:IntString, company_name:str):
"""
Create an empty filings container.
Args:
cik: The CIK number
company_name: The company name
Returns:
EntityFilings: An empty filings container
"""
table = pa.Table.from_arrays([[] for _ in range(13)], schema=COMPANY_FILINGS_SCHEMA)
return EntityFilings(table, cik=cik, company_name=company_name)
# For backward compatibility
CompanyFiling = EntityFiling
CompanyFilings = EntityFilings
CompanyFacts = EntityFacts

View File

@@ -0,0 +1,137 @@
"""
Loader for learned statement mappings and canonical structures.
This module handles loading and caching of learned mappings from the
structural learning process.
"""
import json
import logging
from functools import lru_cache
from pathlib import Path
from typing import Any, Dict, Optional
log = logging.getLogger(__name__)
@lru_cache(maxsize=1)
def load_learned_mappings() -> Dict[str, Dict[str, Any]]:
"""
Load learned statement mappings from package data.
Returns:
Dictionary of concept -> mapping info
"""
try:
# Get the data file path
data_dir = Path(__file__).parent / 'data'
mappings_file = data_dir / 'statement_mappings_v1.json'
if not mappings_file.exists():
log.warning("Learned mappings file not found: %s", mappings_file)
return {}
with open(mappings_file, 'r') as f:
data = json.load(f)
mappings = data.get('mappings', {})
metadata = data.get('metadata', {})
log.info("Loaded %d learned concept mappings (version: %s)", len(mappings), metadata.get('version', 'unknown'))
return mappings
except Exception as e:
log.error("Error loading learned mappings: %s", e)
return {}
@lru_cache(maxsize=1)
def load_canonical_structures() -> Dict[str, Any]:
"""
Load canonical statement structures.
Returns:
Dictionary of statement_type -> canonical structure
"""
try:
data_dir = Path(__file__).parent / 'data'
structures_file = data_dir / 'learned_mappings.json'
if not structures_file.exists():
log.warning("Canonical structures file not found: %s", structures_file)
return {}
with open(structures_file, 'r') as f:
structures = json.load(f)
log.info("Loaded canonical structures for %d statement types", len(structures))
return structures
except Exception as e:
log.error("Error loading canonical structures: %s", e)
return {}
@lru_cache(maxsize=1)
def load_virtual_trees() -> Dict[str, Any]:
"""
Load virtual presentation trees.
Returns:
Dictionary of statement_type -> virtual tree
"""
try:
data_dir = Path(__file__).parent / 'data'
trees_file = data_dir / 'virtual_trees.json'
if not trees_file.exists():
log.warning("Virtual trees file not found: %s", trees_file)
return {}
with open(trees_file, 'r') as f:
trees = json.load(f)
log.info("Loaded virtual trees for %d statement types", len(trees))
return trees
except Exception as e:
log.error("Error loading virtual trees: %s", e)
return {}
def get_concept_mapping(concept: str) -> Optional[Dict[str, Any]]:
"""
Get mapping information for a specific concept.
Args:
concept: Concept name (without namespace)
Returns:
Mapping info dict or None if not found
"""
mappings = load_learned_mappings()
return mappings.get(concept)
def get_statement_concepts(statement_type: str,
min_confidence: float = 0.5) -> Dict[str, Dict[str, Any]]:
"""
Get all concepts for a specific statement type.
Args:
statement_type: Type of statement (BalanceSheet, IncomeStatement, etc.)
min_confidence: Minimum confidence threshold
Returns:
Dictionary of concept -> mapping info
"""
mappings = load_learned_mappings()
result = {}
for concept, info in mappings.items():
if (info.get('statement_type') == statement_type and
info.get('confidence', 0) >= min_confidence):
result[concept] = info
return result
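# Illustrative usage (assumes the bundled JSON data files are present):
#   info = get_concept_mapping("Revenues")
#   income_concepts = get_statement_concepts("IncomeStatement", min_confidence=0.7)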

View File

@@ -0,0 +1,262 @@
"""
Data models for the enhanced Entity Facts API.
This module provides the unified data models for financial facts,
optimized for both traditional analysis and AI consumption.
"""
from dataclasses import dataclass, field
from datetime import date
from enum import Enum
from typing import Any, Dict, List, Literal, Optional, Union
class DataQuality(Enum):
"""Data quality indicators for facts"""
HIGH = "high" # Direct from XBRL, validated
MEDIUM = "medium" # Derived or calculated
LOW = "low" # Estimated or inferred
@dataclass
class FinancialFact:
"""
Unified fact representation optimized for both traditional analysis and AI consumption.
This class represents a single financial fact with rich contextual information,
quality indicators, and AI-ready metadata.
"""
# Core identification
concept: str # Standardized concept (e.g., 'us-gaap:Revenue')
taxonomy: str # Taxonomy namespace (us-gaap, ifrs, etc.)
label: str # Human-readable label
# Values with proper typing
value: Union[float, int, str] # The actual value
numeric_value: Optional[float] # Numeric representation for calculations
unit: str # Unit of measure (USD, shares, etc.)
scale: Optional[int] = None # Scale factor (thousands=1000, millions=1000000)
# Temporal context
period_start: Optional[date] = None
    period_end: Optional[date] = None
period_type: Literal['instant', 'duration'] = 'instant'
fiscal_year: int = 0
fiscal_period: str = '' # FY, Q1, Q2, Q3, Q4
# Filing context
    filing_date: Optional[date] = None
form_type: str = '' # 10-K, 10-Q, 8-K, etc.
accession: str = '' # SEC accession number
# Quality and provenance
data_quality: DataQuality = DataQuality.MEDIUM
is_audited: bool = False
is_restated: bool = False
is_estimated: bool = False
confidence_score: float = 0.8 # 0.0 to 1.0
# AI-ready context
semantic_tags: List[str] = field(default_factory=list) # ['revenue', 'recurring', 'operating']
business_context: str = '' # "Product revenue from iPhone sales"
calculation_context: Optional[str] = None # "Derived from segment data"
# Optional XBRL specifics
context_ref: Optional[str] = None
dimensions: Dict[str, str] = field(default_factory=dict)
statement_type: Optional[str] = None
line_item_sequence: Optional[int] = None
# Structural metadata (from learned mappings)
depth: Optional[int] = None # Hierarchy depth in statement
parent_concept: Optional[str] = None # Parent concept in hierarchy
section: Optional[str] = None # Statement section (e.g., "Current Assets")
is_abstract: bool = False # Abstract/header item
is_total: bool = False # Total/sum item
presentation_order: Optional[float] = None # Order in presentation
def to_llm_context(self) -> Dict[str, Any]:
"""
Generate rich context for LLM consumption.
Returns a dictionary with formatted values and contextual information
optimized for language model understanding.
"""
# Format the value appropriately
if self.numeric_value is not None:
if self.unit.upper() in ['USD', 'EUR', 'GBP', 'JPY']:
# Currency formatting
formatted_value = f"{self.numeric_value:,.0f}"
if self.scale:
if self.scale == 1000:
formatted_value += " thousand"
elif self.scale == 1000000:
formatted_value += " million"
elif self.scale == 1000000000:
formatted_value += " billion"
else:
formatted_value = f"{self.numeric_value:,.2f}"
else:
formatted_value = str(self.value)
# Format the period
if self.period_type == 'instant':
period_desc = f"as of {self.period_end}"
else:
period_desc = f"for {self.fiscal_period} {self.fiscal_year}"
if self.period_start and self.period_end:
period_desc += f" ({self.period_start} to {self.period_end})"
return {
"concept": self.label,
"value": formatted_value,
"unit": self.unit,
"period": period_desc,
"context": self.business_context,
"quality": self.data_quality.value,
"confidence": self.confidence_score,
"tags": self.semantic_tags,
"source": f"{self.form_type} filed {self.filing_date}" if self.filing_date else "Unknown source",
"is_audited": self.is_audited,
"is_estimated": self.is_estimated,
"dimensions": self.dimensions if self.dimensions else None
}
def get_display_period_key(self) -> str:
"""
Generate a display-friendly period key based on actual period dates.
This method creates period keys like "Q1 2024" based on the actual period
covered by the data, not the filing year. It uses the period_end date to
determine the calendar year and quarter.
Returns:
A period key in format like "Q1 2024", "FY 2023", etc.
"""
if not self.period_end:
# Fallback to fiscal year/period if no period_end
return f"{self.fiscal_period} {self.fiscal_year}"
# Extract calendar year from period_end
calendar_year = self.period_end.year
# For fiscal years, use "FY" prefix
if self.fiscal_period == 'FY':
return f"FY {calendar_year}"
# For quarters, determine the calendar quarter from the end date
if self.fiscal_period in ['Q1', 'Q2', 'Q3', 'Q4']:
end_month = self.period_end.month
# Map end month to calendar quarter
if end_month in [1, 2, 3]:
quarter = 'Q1'
elif end_month in [4, 5, 6]:
quarter = 'Q2'
elif end_month in [7, 8, 9]:
quarter = 'Q3'
else: # 10, 11, 12
quarter = 'Q4'
return f"{quarter} {calendar_year}"
# For other periods, use the fiscal period with calendar year
return f"{self.fiscal_period} {calendar_year}"
def get_formatted_value(self) -> str:
"""
Format the numeric value for display, avoiding scientific notation.
Returns:
Formatted string representation of the value
"""
if self.numeric_value is None:
return str(self.value)
# For currency values
if self.unit.upper() in ['USD', 'EUR', 'GBP', 'JPY', 'CAD', 'CHF']:
# Round to nearest whole number for large values
if abs(self.numeric_value) >= 1000:
return f"{self.numeric_value:,.0f}"
else:
return f"{self.numeric_value:,.2f}"
# For share counts
elif self.unit.lower() in ['shares', 'share']:
return f"{self.numeric_value:,.0f}"
# For percentages and ratios
elif self.unit.lower() in ['pure', 'percent', '%']:
return f"{self.numeric_value:.2f}"
# Default formatting
else:
if abs(self.numeric_value) >= 1000:
return f"{self.numeric_value:,.0f}"
else:
return f"{self.numeric_value:,.2f}"
def __repr__(self) -> str:
"""String representation focusing on key information"""
        value_str = f"{self.numeric_value:,.0f}" if self.numeric_value is not None else str(self.value)
return f"FinancialFact({self.concept}={value_str} {self.unit}, {self.fiscal_period} {self.fiscal_year})"
@dataclass
class ConceptMetadata:
"""
Metadata about a financial concept.
This provides additional context about what a concept represents,
how it's calculated, and how it relates to other concepts.
"""
concept: str # The concept identifier
label: str # Primary display label
definition: str # Detailed definition
# Concept relationships
parent_concepts: List[str] = field(default_factory=list)
child_concepts: List[str] = field(default_factory=list)
calculation_components: List[str] = field(default_factory=list)
# Classification
statement_type: Optional[str] = None # BalanceSheet, IncomeStatement, etc.
is_monetary: bool = True
is_duration: bool = True # True for flow concepts, False for stock concepts
normal_balance: Optional[Literal['debit', 'credit']] = None
# Usage guidance
common_names: List[str] = field(default_factory=list) # Alternative labels
usage_notes: str = '' # Special considerations
typical_scale: Optional[int] = None # Common scale factor
@dataclass
class FactCollection:
"""
A collection of related facts, typically for a specific time period or statement.
This is used internally to group facts for efficient processing and analysis.
"""
facts: List[FinancialFact]
period_key: str # e.g., "2024-Q4", "2024-FY"
statement_type: Optional[str] = None
def get_fact(self, concept: str) -> Optional[FinancialFact]:
"""Get a specific fact by concept"""
for fact in self.facts:
if fact.concept == concept or fact.label == concept:
return fact
return None
def to_dict(self) -> Dict[str, Any]:
"""Convert to dictionary keyed by concept"""
return {
fact.concept: {
'value': fact.numeric_value or fact.value,
'label': fact.label,
'unit': fact.unit
}
for fact in self.facts
}
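# Illustrative usage:
#   collection = FactCollection(facts=[some_fact], period_key="2024-FY")
#   collection.get_fact("us-gaap:Revenue")   # -> the matching FinancialFact, or None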

View File

@@ -0,0 +1,382 @@
"""
Parser for converting SEC API data to the new Entity Facts format.
This module handles the conversion of raw SEC company facts JSON data
into the new unified FinancialFact model.
"""
import logging
from datetime import date, datetime
from typing import Any, Dict, List, Optional
from edgar.entity.entity_facts import EntityFacts
from edgar.entity.mappings_loader import load_learned_mappings
from edgar.entity.models import DataQuality, FinancialFact
log = logging.getLogger(__name__)
class EntityFactsParser:
"""
Parser for converting SEC company facts to EntityFacts.
This class handles the transformation of raw SEC API data into
the new unified fact model with proper typing and AI-ready metadata.
"""
# Concept mapping for common financial statement items
STATEMENT_MAPPING = {
# Income Statement
'Revenue': 'IncomeStatement',
'Revenues': 'IncomeStatement', # Fix for Issue #438 - ensure us-gaap:Revenues maps properly
'RevenueFromContractWithCustomerExcludingAssessedTax': 'IncomeStatement',
'SalesRevenueNet': 'IncomeStatement',
'CostOfRevenue': 'IncomeStatement',
'GrossProfit': 'IncomeStatement',
'OperatingExpenses': 'IncomeStatement',
'OperatingIncomeLoss': 'IncomeStatement',
'NetIncomeLoss': 'IncomeStatement',
'EarningsPerShareDiluted': 'IncomeStatement',
# Balance Sheet
'Assets': 'BalanceSheet',
'AssetsCurrent': 'BalanceSheet',
'CurrentAssets': 'BalanceSheet',
'AssetsNoncurrent': 'BalanceSheet',
'Liabilities': 'BalanceSheet',
'LiabilitiesCurrent': 'BalanceSheet',
'CurrentLiabilities': 'BalanceSheet',
'LiabilitiesNoncurrent': 'BalanceSheet',
'StockholdersEquity': 'BalanceSheet',
'CashAndCashEquivalentsAtCarryingValue': 'BalanceSheet',
# Cash Flow
'NetCashProvidedByUsedInOperatingActivities': 'CashFlow',
'NetCashProvidedByUsedInInvestingActivities': 'CashFlow',
'NetCashProvidedByUsedInFinancingActivities': 'CashFlow',
'CashAndCashEquivalentsPeriodIncreaseDecrease': 'CashFlow'
}
# Semantic tags for concepts
SEMANTIC_TAGS = {
'Revenue': ['revenue', 'sales', 'operating'],
'NetIncomeLoss': ['profit', 'earnings', 'bottom_line'],
'Assets': ['assets', 'resources', 'balance_sheet'],
'CashAndCashEquivalentsAtCarryingValue': ['cash', 'liquidity', 'current_assets']
}
@classmethod
def parse_company_facts(cls, json_data: Dict[str, Any]) -> Optional[EntityFacts]:
"""
Parse SEC company facts JSON into EntityFacts.
Args:
json_data: Raw JSON from SEC API
Returns:
EntityFacts object or None if parsing fails
"""
try:
cik = int(json_data.get('cik', 0))
entity_name = json_data.get('entityName', 'Unknown')
facts = []
# Process facts from different taxonomies
facts_data = json_data.get('facts', {})
for taxonomy, taxonomy_facts in facts_data.items():
for concept, concept_data in taxonomy_facts.items():
# Process units for this concept
units = concept_data.get('units', {})
label = concept_data.get('label', concept)
description = concept_data.get('description', '')
for unit, unit_facts in units.items():
for fact_data in unit_facts:
fact = cls._parse_single_fact(
concept=concept,
taxonomy=taxonomy,
label=label,
description=description,
unit=unit,
fact_data=fact_data
)
if fact:
facts.append(fact)
if not facts:
log.warning("No facts found for CIK %s", cik)
return None
return EntityFacts(cik=cik, name=entity_name, facts=facts)
except Exception as e:
log.error("Error parsing company facts: %s", e)
return None
@classmethod
def _parse_single_fact(cls,
concept: str,
taxonomy: str,
label: str,
description: str,
unit: str,
fact_data: Dict[str, Any]) -> Optional[FinancialFact]:
"""
Parse a single fact from SEC data.
Args:
concept: Concept identifier
taxonomy: Taxonomy namespace
label: Human-readable label
description: Concept description
unit: Unit of measure
fact_data: Raw fact data
Returns:
FinancialFact or None if parsing fails
"""
# Extract core values
value = fact_data.get('val')
if value is None:
return None
# Parse dates
period_end = cls._parse_date(fact_data.get('end'))
period_start = cls._parse_date(fact_data.get('start'))
filing_date = cls._parse_date(fact_data.get('filed'))
# Determine period type
if period_start:
period_type = 'duration'
else:
period_type = 'instant'
# Parse fiscal period info
fiscal_year = cls._parse_fiscal_year(fact_data.get('fy'))
fiscal_period = fact_data.get('fp', '')
# Determine numeric value
numeric_value = None
if isinstance(value, (int, float)):
numeric_value = float(value)
elif isinstance(value, str) and value.replace('-', '').replace('.', '').isdigit():
try:
numeric_value = float(value)
except ValueError:
pass
# Determine statement type
statement_type = cls._determine_statement_type(concept)
# Get semantic tags
semantic_tags = cls._get_semantic_tags(concept)
# Get structural metadata from learned mappings
structural_info = cls._get_structural_info(concept)
# Determine data quality
data_quality = cls._assess_data_quality(fact_data, fiscal_period)
# Create business context
business_context = cls._generate_business_context(label, description, unit)
# Clean unit representation
clean_unit = cls._clean_unit(unit)
# Determine scale
scale = cls._determine_scale(unit)
return FinancialFact(
concept=f"{taxonomy}:{concept}",
taxonomy=taxonomy,
label=label,
value=value,
numeric_value=numeric_value,
unit=clean_unit,
scale=scale,
period_start=period_start,
period_end=period_end,
period_type=period_type,
fiscal_year=fiscal_year,
fiscal_period=fiscal_period,
filing_date=filing_date,
form_type=fact_data.get('form', ''),
accession=fact_data.get('accn', ''),
data_quality=data_quality,
is_audited=fiscal_period == 'FY', # Annual reports are typically audited
is_restated=False, # Would need additional logic to detect
is_estimated=False, # Would need additional logic to detect
confidence_score=0.9 if data_quality == DataQuality.HIGH else 0.7,
semantic_tags=semantic_tags,
business_context=business_context,
statement_type=statement_type,
# Add structural metadata
depth=structural_info.get('depth'),
parent_concept=structural_info.get('parent'),
section=structural_info.get('section'),
is_abstract=structural_info.get('is_abstract', False),
is_total=structural_info.get('is_total', False),
presentation_order=structural_info.get('avg_depth')
)
@staticmethod
def _parse_date(date_str: Optional[str]) -> Optional[date]:
"""Parse date string to date object"""
if not date_str:
return None
try:
# Try common date formats
for fmt in ['%Y-%m-%d', '%Y%m%d', '%m/%d/%Y']:
try:
return datetime.strptime(date_str, fmt).date()
except ValueError:
continue
# If all formats fail, try to parse as ISO format
return datetime.fromisoformat(date_str).date()
except Exception:
return None
@staticmethod
def _parse_fiscal_year(fy_value: Any) -> int:
"""Parse fiscal year value"""
if not fy_value:
return 0
try:
return int(fy_value)
except (ValueError, TypeError):
return 0
@classmethod
def _determine_statement_type(cls, concept: str) -> Optional[str]:
"""
Determine which financial statement a concept belongs to.
First checks static mappings, then falls back to learned mappings
with confidence threshold.
"""
# Remove namespace if present
if ':' in concept:
concept = concept.split(':')[-1]
# Check static mappings first (highest confidence)
if concept in cls.STATEMENT_MAPPING:
return cls.STATEMENT_MAPPING[concept]
# Check learned mappings
try:
learned_mappings = load_learned_mappings()
if concept in learned_mappings:
mapping = learned_mappings[concept]
# Only use high-confidence learned mappings
if mapping.get('confidence', 0) >= 0.5: # 50% threshold
return mapping['statement_type']
except Exception as e:
log.debug("Error loading learned mappings: %s", e)
return None
@classmethod
def _get_semantic_tags(cls, concept: str) -> List[str]:
"""Get semantic tags for a concept"""
# Remove namespace if present
if ':' in concept:
concept = concept.split(':')[-1]
return cls.SEMANTIC_TAGS.get(concept, [])
@classmethod
def _get_structural_info(cls, concept: str) -> Dict[str, Any]:
"""
Get structural metadata for a concept from learned mappings.
Returns dict with depth, parent, section, is_abstract, is_total
"""
# Remove namespace if present
if ':' in concept:
concept = concept.split(':')[-1]
try:
learned_mappings = load_learned_mappings()
if concept in learned_mappings:
mapping = learned_mappings[concept]
return {
'depth': int(mapping.get('avg_depth', 0)) if mapping.get('avg_depth') else None,
'parent': mapping.get('parent'),
'section': mapping.get('section'),
'is_abstract': mapping.get('is_abstract', False),
'is_total': mapping.get('is_total', False)
}
except Exception as e:
log.debug("Error getting structural info: %s", e)
return {}
@staticmethod
def _assess_data_quality(fact_data: Dict[str, Any], fiscal_period: str) -> DataQuality:
"""Assess the quality of a fact"""
# Annual data is typically higher quality
if fiscal_period == 'FY':
return DataQuality.HIGH
# Quarterly data
if fiscal_period in ['Q1', 'Q2', 'Q3', 'Q4']:
return DataQuality.HIGH
# Other data
return DataQuality.MEDIUM
@staticmethod
def _generate_business_context(label: str, description: str, unit: str) -> str:
"""Generate business context for a fact"""
# Handle null/None values
if not label:
label = ""
if not description:
description = ""
# Return description if it's longer and more informative than label
if description and len(description) > len(label):
return description
# Generate context based on label and unit
if label and 'Revenue' in label:
return "Total revenue generated from operations"
elif label and 'Income' in label:
return "Net earnings after all expenses and taxes"
elif label and 'Assets' in label:
return "Total resources owned by the company"
# Return label if available, otherwise empty string
return label if label else ""
@staticmethod
def _clean_unit(unit: str) -> str:
"""Clean and standardize unit representation"""
if not unit:
return ""
unit_mapping = {
'USD': 'USD',
'usd': 'USD',
'pure': 'number',
'shares': 'shares',
'USD/shares': 'USD per share'
}
return unit_mapping.get(unit, unit)
@staticmethod
def _determine_scale(unit: str) -> Optional[int]:
"""Determine scale factor from unit"""
# SEC data is typically already scaled
# This would need more sophisticated logic based on the actual data
return None
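# Illustrative usage (json_data is the raw JSON returned by the SEC company facts API):
#   entity_facts = EntityFactsParser.parse_company_facts(json_data)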

File diff suppressed because it is too large

View File

@@ -0,0 +1,116 @@
"""
Search functionality for SEC entities.
This module provides functions and classes for searching for SEC entities.
"""
from functools import lru_cache
from typing import Any, Dict, List
import pandas as pd
from rich import box
from rich.table import Column, Table
from edgar.entity import Company
from edgar.entity.tickers import get_company_tickers
from edgar.richtools import repr_rich
from edgar.search.datasearch import FastSearch, company_ticker_preprocess, company_ticker_score
__all__ = [
'find_company',
'CompanySearchResults',
'CompanySearchIndex'
]
class CompanySearchResults:
"""
Results from a company search.
"""
def __init__(self, query: str,
search_results: List[Dict[str, Any]]):
self.query: str = query
self.results: pd.DataFrame = pd.DataFrame(search_results, columns=['cik', 'ticker', 'company', 'score'])
@property
def tickers(self):
return self.results.ticker.tolist()
@property
def ciks(self):
return self.results.cik.tolist()
@property
def empty(self):
return self.results.empty
def __len__(self):
return len(self.results)
    def __getitem__(self, item):
        if 0 <= item < len(self):
            row = self.results.iloc[item]
            cik: int = int(row.cik)
            return Company(cik)
        raise IndexError(f"Index {item} is out of range for {len(self)} search results")
def __rich__(self):
table = Table(Column(""),
Column("Ticker", justify="left"),
Column("Name", justify="left"),
Column("Score", justify="left"),
title=f"Search results for '{self.query}'",
box=box.SIMPLE)
for index, row in enumerate(self.results.itertuples()):
table.add_row(str(index), row.ticker.rjust(6), row.company, f"{int(row.score)}%")
return table
def __repr__(self):
return repr_rich(self.__rich__())
class CompanySearchIndex(FastSearch):
"""
Search index for companies.
"""
def __init__(self):
data = get_company_tickers(as_dataframe=False)
super().__init__(data, ['company', 'ticker'],
preprocess_func=company_ticker_preprocess,
score_func=company_ticker_score)
def search(self, query: str, top_n: int = 10, threshold: float = 60) -> CompanySearchResults:
results = super().search(query, top_n, threshold)
return CompanySearchResults(query=query, search_results=results)
def __len__(self):
return len(self.data)
def __hash__(self):
# Combine column names and last 10 values in the 'company' column to create a hash
column_names = tuple(self.data[0].keys())
last_10_companies = tuple(entry['company'] for entry in self.data[-10:])
return hash((column_names, last_10_companies))
def __eq__(self, other):
if not isinstance(other, CompanySearchIndex):
return False
return (self.data[-10:], tuple(self.data[0].keys())) == (other.data[-10:], tuple(other.data[0].keys()))
@lru_cache(maxsize=1)
def _get_company_search_index():
"""Get the company search index."""
return CompanySearchIndex()
@lru_cache(maxsize=16)
def find_company(company: str, top_n: int = 10):
"""
Find a company by name.
Args:
company: The company name or ticker to search for
top_n: The maximum number of results to return
Returns:
CompanySearchResults: The search results
"""
return _get_company_search_index().search(company, top_n=top_n)
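# Illustrative usage:
#   results = find_company("Apple")
#   if not results.empty:
#       company = results[0]   # a Company built from the top-scoring match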

View File

@@ -0,0 +1,495 @@
"""
Financial Statement wrapper classes with rich display and concept-aware formatting.
This module provides Statement classes that wrap pandas DataFrames with:
- Intelligent formatting based on financial concept types
- Rich display for professional presentation
- Access to underlying data for calculations
- LLM-ready context generation
"""
from dataclasses import dataclass
from typing import Any, Dict, List, Optional
import pandas as pd
from rich.box import SIMPLE, SIMPLE_HEAVY
from rich.console import Group
from rich.padding import Padding
from rich.panel import Panel
from rich.table import Table
from rich.text import Text
from .terminal_styles import get_current_scheme
@dataclass
class ConceptFormatting:
"""Formatting rules for specific financial concepts"""
decimal_places: int = 2
show_currency: bool = True
scale_display: bool = True # Show M, B suffixes
percentage: bool = False
class FinancialStatement:
"""
A wrapper around pandas DataFrame for financial statements with intelligent formatting.
This class provides:
- Concept-aware formatting (EPS to 2 decimals, revenue in millions, etc.)
- Rich display for professional presentation
- Access to underlying numeric data
- LLM context generation
"""
# Formatting rules by concept pattern
CONCEPT_FORMATS = {
# Earnings per share - always show decimals
'earningspershare': ConceptFormatting(decimal_places=2, show_currency=False, scale_display=False),
'earnings per share': ConceptFormatting(decimal_places=2, show_currency=False, scale_display=False),
'eps': ConceptFormatting(decimal_places=2, show_currency=False, scale_display=False),
# Ratios and percentages
'ratio': ConceptFormatting(decimal_places=2, show_currency=False, scale_display=False),
'margin': ConceptFormatting(decimal_places=1, show_currency=False, scale_display=False, percentage=True),
'percent': ConceptFormatting(decimal_places=1, show_currency=False, scale_display=False, percentage=True),
# Per-share values
'per share': ConceptFormatting(decimal_places=2, show_currency=False, scale_display=False),
'pershare': ConceptFormatting(decimal_places=2, show_currency=False, scale_display=False),
'book value': ConceptFormatting(decimal_places=2, show_currency=False, scale_display=False),
'dividend': ConceptFormatting(decimal_places=2, show_currency=False, scale_display=False),
# Share counts - show full numbers with commas
'shares outstanding': ConceptFormatting(decimal_places=0, show_currency=False, scale_display=False),
'common stock': ConceptFormatting(decimal_places=0, show_currency=False, scale_display=False),
'weighted average': ConceptFormatting(decimal_places=0, show_currency=False, scale_display=False),
# Large financial amounts - show full numbers with commas
'revenue': ConceptFormatting(decimal_places=0, show_currency=True, scale_display=False),
'income': ConceptFormatting(decimal_places=0, show_currency=True, scale_display=False),
'assets': ConceptFormatting(decimal_places=0, show_currency=True, scale_display=False),
'liabilities': ConceptFormatting(decimal_places=0, show_currency=True, scale_display=False),
}
def __init__(self,
data: pd.DataFrame,
statement_type: str,
entity_name: str = "",
period_lengths: Optional[List[str]] = None,
mixed_periods: bool = False):
"""
Initialize financial statement.
Args:
data: DataFrame with financial data
statement_type: Type of statement (IncomeStatement, BalanceSheet, etc.)
entity_name: Company name
period_lengths: List of period lengths in the data
mixed_periods: Whether data contains mixed period lengths
"""
self.data = data
self.statement_type = statement_type
self.entity_name = entity_name
self.period_lengths = period_lengths or []
self.mixed_periods = mixed_periods
# Store original numeric data
self._numeric_data = data.copy()
def get_concept_formatting(self, concept_label: str) -> ConceptFormatting:
"""
Get formatting rules for a specific concept.
Args:
concept_label: Label of the financial concept
Returns:
ConceptFormatting rules for this concept
"""
label_lower = concept_label.lower()
# Check for exact matches first
for pattern, formatting in self.CONCEPT_FORMATS.items():
if pattern in label_lower:
return formatting
# Default formatting for large amounts - show full numbers with commas
return ConceptFormatting(decimal_places=0, show_currency=True, scale_display=False)
def format_value(self, value: float, concept_label: str) -> str:
"""
Format a single value based on its concept.
Args:
value: Numeric value to format
concept_label: Label of the financial concept
Returns:
Formatted string representation
"""
if pd.isna(value):
return ''
formatting = self.get_concept_formatting(concept_label)
# Handle percentage formatting
if formatting.percentage:
return f"{value:.{formatting.decimal_places}f}%"
# Always use full number formatting with commas - no scaling to preserve precision
if formatting.show_currency:
return f"${value:,.{formatting.decimal_places}f}"
else:
return f"{value:,.{formatting.decimal_places}f}"
def _repr_html_(self) -> str:
"""
Rich HTML representation for Jupyter notebooks.
Returns:
HTML string for rich display
"""
# Create a formatted copy as string DataFrame
formatted_data = pd.DataFrame(index=self.data.index, columns=self.data.columns, dtype=str)
# Apply formatting to each cell
for index in self.data.index:
concept_label = str(index)
for column in self.data.columns:
value = self.data.loc[index, column]
if pd.notna(value) and isinstance(value, (int, float)):
formatted_data.loc[index, column] = self.format_value(value, concept_label)
else:
formatted_data.loc[index, column] = str(value) if pd.notna(value) else ''
# Create HTML with styling
html = f"""
<div style="font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;">
<h3 style="color: #2c3e50; margin-bottom: 10px;">
{self.entity_name} - {self.statement_type.replace('Statement', ' Statement')}
</h3>
"""
# Add period warning if mixed
if self.mixed_periods:
html += """
<div style="background-color: #fff3cd; border: 1px solid #ffeaa7;
padding: 8px; margin-bottom: 10px; border-radius: 4px;">
<strong>⚠️ Mixed Period Lengths:</strong> This statement contains periods of different lengths
({periods}). Consider filtering to comparable periods for accurate analysis.
</div>
""".format(periods=', '.join(self.period_lengths))
# Add the formatted table
html += formatted_data.to_html(classes='financial-statement',
table_id='fs-table',
escape=False)
# Add CSS styling
html += """
<style>
.financial-statement {
border-collapse: collapse;
width: 100%;
font-size: 12px;
margin-top: 10px;
}
.financial-statement th {
background-color: #34495e;
color: white;
padding: 8px;
text-align: right;
font-weight: bold;
}
.financial-statement td {
padding: 6px 8px;
text-align: right;
border-bottom: 1px solid #ecf0f1;
}
.financial-statement tr:hover {
background-color: #f8f9fa;
}
.financial-statement tr:nth-child(even) {
background-color: #fdfdfd;
}
.financial-statement td:first-child {
text-align: left;
font-weight: 500;
}
</style>
</div>
"""
return html
def __str__(self) -> str:
"""
String representation for console display.
Returns:
Formatted string representation
"""
# Create formatted version as string DataFrame
formatted_data = pd.DataFrame(index=self.data.index, columns=self.data.columns, dtype=str)
# Apply formatting to each cell
for index in self.data.index:
concept_label = str(index)
for column in self.data.columns:
value = self.data.loc[index, column]
if pd.notna(value) and isinstance(value, (int, float)):
formatted_data.loc[index, column] = self.format_value(value, concept_label)
else:
formatted_data.loc[index, column] = str(value) if pd.notna(value) else ''
header = f"\n{self.entity_name} - {self.statement_type.replace('Statement', ' Statement')}\n"
header += "=" * len(header.strip()) + "\n"
if self.mixed_periods:
header += f"⚠️ Mixed period lengths: {', '.join(self.period_lengths)}\n\n"
return header + str(formatted_data)
def __rich__(self):
"""Creates a rich representation for professional financial statement display."""
colors = get_current_scheme()
if self.data.empty:
return Panel(
Text("No data available", style=colors["empty_value"]),
title=f"📊 {self.statement_type.replace('Statement', ' Statement')}",
border_style=colors["panel_border"]
)
# Statement type icon mapping
icon_map = {
'IncomeStatement': '💰',
'BalanceSheet': '⚖️',
'CashFlow': '💵',
'Statement': '📊'
}
icon = icon_map.get(self.statement_type, '📊')
# Title with company name and statement type
if self.entity_name:
title = Text.assemble(
icon + " ",
(self.entity_name, colors["company_name"]),
" ",
(self.statement_type.replace('Statement', ' Statement'), colors["statement_type"])
)
else:
title = Text.assemble(
icon + " ",
(self.statement_type.replace('Statement', ' Statement'), colors["statement_type"])
)
# Create the main financial statement table
statement_table = Table(box=SIMPLE, show_header=True, padding=(0, 1))
statement_table.add_column("Line Item", style=colors["total_item"], no_wrap=True, max_width=30)
# Add period columns (limit to reasonable number for display)
periods = list(self.data.columns)
display_periods = periods[:6] # Show max 6 periods for readability
has_more_periods = len(periods) > 6
for period in display_periods:
statement_table.add_column(str(period), justify="right", max_width=15)
# Add rows with formatted values
for index in self.data.index:
concept_label = str(index)
# Truncate long concept names
display_label = concept_label[:28] + "..." if len(concept_label) > 30 else concept_label
row_values = [display_label]
for period in display_periods:
value = self.data.loc[index, period]
if pd.notna(value) and isinstance(value, (int, float)):
formatted_value = self.format_value(value, concept_label)
row_values.append(formatted_value)
else:
row_values.append("-" if pd.isna(value) else str(value)[:12])
statement_table.add_row(*row_values)
# Create summary info panel
info_table = Table(box=SIMPLE_HEAVY, show_header=False, padding=(0, 1))
info_table.add_column("Metric", style=colors["low_confidence_item"])
info_table.add_column("Value", style=colors["total_item"])
info_table.add_row("Line Items", f"{len(self.data.index):,}")
info_table.add_row("Periods", f"{len(self.data.columns):,}")
if self.period_lengths:
info_table.add_row("Period Types", ", ".join(set(self.period_lengths)))
info_panel = Panel(
info_table,
title="📋 Statement Info",
border_style="bright_black"
)
# Create period warning if needed
warning_panel = None
if self.mixed_periods:
warning_text = Text.assemble(
"⚠️ Mixed period lengths detected: ",
(", ".join(self.period_lengths), "yellow"),
"\nConsider filtering to comparable periods for accurate analysis."
)
warning_panel = Panel(
warning_text,
title="🚨 Period Warning",
border_style=colors.get("warning", "yellow")
)
# Subtitle with additional info
subtitle_parts = [f"{len(self.data.index):,} line items"]
if has_more_periods:
subtitle_parts.append(f"showing first {len(display_periods)} of {len(periods)} periods")
subtitle = "".join(subtitle_parts)
# Main statement panel
statement_panel = Panel(
statement_table,
title="📊 Financial Data",
subtitle=subtitle,
border_style="bright_black"
)
# Combine all panels
content_renderables = [
Padding("", (1, 0, 0, 0)),
info_panel
]
if warning_panel:
content_renderables.append(warning_panel)
content_renderables.append(statement_panel)
content = Group(*content_renderables)
return Panel(
content,
title=title,
border_style=colors["panel_border"]
)
def __repr__(self):
"""String representation using rich formatting."""
from edgar.richtools import repr_rich
return repr_rich(self.__rich__())
def to_numeric(self) -> pd.DataFrame:
"""
Get the underlying numeric DataFrame for calculations.
Returns:
DataFrame with original numeric values
"""
return self._numeric_data.copy()
def to_llm_context(self) -> Dict[str, Any]:
"""
Generate LLM-friendly context from the statement.
Returns:
Dictionary with structured financial data for LLM consumption
"""
context = {
"entity_name": self.entity_name,
"statement_type": self.statement_type,
"period_lengths": self.period_lengths,
"mixed_periods": self.mixed_periods,
"periods": list(self.data.columns),
"line_items": {}
}
# Convert each line item to LLM-friendly format
for index in self.data.index:
concept_label = str(index)
line_item = {
"label": concept_label,
"values": {},
"formatting": self.get_concept_formatting(concept_label).__dict__
}
for column in self.data.columns:
value = self.data.loc[index, column]
if pd.notna(value):
line_item["values"][str(column)] = {
"raw_value": float(value),
"formatted_value": self.format_value(value, concept_label)
}
context["line_items"][concept_label] = line_item
return context
def get_concept(self, concept_name: str) -> Optional[pd.Series]:
"""
Get data for a specific concept across all periods.
Args:
concept_name: Name of the concept to retrieve
Returns:
Series with values across periods, or None if not found
"""
# Try exact match first
if concept_name in self.data.index:
return self.data.loc[concept_name]
# Try case-insensitive partial match
concept_lower = concept_name.lower()
for index in self.data.index:
if concept_lower in str(index).lower():
return self.data.loc[index]
return None
def calculate_growth(self, concept_name: str, periods: int = 2) -> Optional[pd.Series]:
"""
Calculate period-over-period growth for a concept.
Args:
concept_name: Name of the concept
periods: Number of periods to calculate growth over
Returns:
Series with growth rates, or None if concept not found
"""
concept_data = self.get_concept(concept_name)
if concept_data is None:
return None
# Calculate percentage change
return concept_data.pct_change(periods=periods) * 100
@property
def shape(self) -> tuple:
"""Get the shape of the underlying data."""
return self.data.shape
@property
def columns(self) -> pd.Index:
"""Get the columns of the underlying data."""
return self.data.columns
@property
def index(self) -> pd.Index:
"""Get the index of the underlying data."""
return self.data.index
@property
def empty(self) -> bool:
"""Check if the underlying DataFrame is empty."""
return self.data.empty
def __len__(self) -> int:
"""Get the length of the underlying DataFrame."""
return len(self.data)
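# Illustrative usage (df is a pandas DataFrame of line items x periods):
#   stmt = FinancialStatement(df, "IncomeStatement", entity_name="Apple Inc.")
#   stmt.format_value(1234567.0, "Revenue")   # -> "$1,234,567"
#   stmt.get_concept("Revenue")               # -> Series across periods, or None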

View File

@@ -0,0 +1,731 @@
"""
Statement Builder for reconstructing financial statements using canonical structures.
This module provides intelligent statement reconstruction using learned canonical
structures and virtual presentation trees.
"""
import logging
from collections import defaultdict
from dataclasses import dataclass, field
from datetime import date
from typing import Any, Dict, List, Optional, Set
from rich import box
from rich.columns import Columns
from rich.console import Group
from rich.padding import Padding
from rich.panel import Panel
from rich.table import Table
from rich.text import Text
from edgar.entity.mappings_loader import load_canonical_structures, load_virtual_trees
from edgar.entity.models import FinancialFact
from edgar.richtools import repr_rich
log = logging.getLogger(__name__)
@dataclass
class StatementItem:
"""A single item in a reconstructed financial statement."""
concept: str
label: str
value: Optional[float]
depth: int
parent_concept: Optional[str]
children: List['StatementItem'] = field(default_factory=list)
# Metadata
is_abstract: bool = False
is_total: bool = False
section: Optional[str] = None
confidence: float = 1.0
source: str = 'fact' # 'fact', 'calculated', 'canonical', 'placeholder'
# Original fact if available
fact: Optional[FinancialFact] = None
def to_dict(self) -> Dict[str, Any]:
"""Convert to dictionary representation."""
return {
'concept': self.concept,
'label': self.label,
'value': self.value,
'depth': self.depth,
'is_abstract': self.is_abstract,
'is_total': self.is_total,
'section': self.section,
'confidence': self.confidence,
'source': self.source,
'children': [child.to_dict() for child in self.children]
}
def get_display_value(self) -> str:
"""Get formatted value for display."""
if self.value is not None:
if abs(self.value) >= 1_000_000_000:
return f"${self.value/1_000_000_000:.1f}B"
elif abs(self.value) >= 1_000_000:
return f"${self.value/1_000_000:.1f}M"
elif abs(self.value) >= 1_000:
return f"${self.value/1_000:.0f}K"
else:
return f"${self.value:.0f}"
elif self.is_abstract:
return ""
elif self.source == 'placeholder':
return "[Missing]"
else:
return "-"
def __rich__(self):
"""Create a rich representation of the statement item."""
from rich.tree import Tree
# Create the node label
if self.is_abstract:
label = Text(self.label, style="bold cyan")
elif self.is_total:
label = Text(self.label, style="bold yellow")
else:
style = "dim" if self.confidence < 0.8 else ""
            confidence_marker = " *" if self.confidence < 0.8 else ""
label = Text(f"{self.label}{confidence_marker}", style=style)
# Add value if present
value_str = self.get_display_value()
if value_str and value_str != "-":
# Color code values
if value_str.startswith("$") and self.value and isinstance(self.value, (int, float)):
value_style = "red" if self.value < 0 else "green"
else:
value_style = ""
label_with_value = Text.assemble(
label,
" ",
(value_str, value_style)
)
else:
label_with_value = label
# Create tree with this item as root
tree = Tree(label_with_value)
# Add children
for child in self.children:
tree.add(child.__rich__())
return tree
def __repr__(self) -> str:
"""String representation using rich formatting."""
return repr_rich(self.__rich__())
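# Illustrative example of the display formatting above:
#   StatementItem(concept="us-gaap:Revenues", label="Revenue", value=2_500_000_000.0,
#                 depth=0, parent_concept=None).get_display_value()   # -> "$2.5B"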
@dataclass
class StructuredStatement:
"""A complete structured financial statement."""
statement_type: str
fiscal_year: Optional[int]
fiscal_period: Optional[str]
period_end: Optional[date]
items: List[StatementItem]
# Metadata
company_name: Optional[str] = None
cik: Optional[str] = None
canonical_coverage: float = 0.0
facts_used: int = 0
facts_total: int = 0
def to_dict(self) -> Dict[str, Any]:
"""Convert to dictionary representation."""
return {
'statement_type': self.statement_type,
'fiscal_year': self.fiscal_year,
'fiscal_period': self.fiscal_period,
'period_end': self.period_end.isoformat() if self.period_end else None,
'company_name': self.company_name,
'cik': self.cik,
'canonical_coverage': self.canonical_coverage,
'facts_used': self.facts_used,
'facts_total': self.facts_total,
'items': [item.to_dict() for item in self.items]
}
def get_hierarchical_display(self, max_depth: int = 3) -> str:
"""Get hierarchical text representation."""
lines = []
def add_item(item: StatementItem, indent: int = 0):
if indent > max_depth:
return
indent_str = " " * indent
value_str = item.get_display_value()
if item.is_abstract:
lines.append(f"{indent_str}{item.label}")
elif item.is_total:
lines.append(f"{indent_str}{item.label:<40} {value_str:>15}")
lines.append(f"{indent_str}{'-' * 55}")
else:
confidence_marker = "" if item.confidence > 0.8 else " *"
lines.append(f"{indent_str}{item.label:<40} {value_str:>15}{confidence_marker}")
for child in item.children:
add_item(child, indent + 1)
for item in self.items:
add_item(item)
return "\n".join(lines)
def __rich__(self):
"""Create a rich representation of the structured statement."""
# Statement type mapping for better display
statement_names = {
'IncomeStatement': 'Income Statement',
'BalanceSheet': 'Balance Sheet',
'CashFlow': 'Cash Flow Statement',
'StatementsOfComprehensiveIncome': 'Comprehensive Income',
'StatementsOfShareholdersEquity': 'Shareholders Equity'
}
# Title with company name and period
title_parts = []
if self.company_name:
title_parts.append((self.company_name, "bold green"))
else:
title_parts.append(("Financial Statement", "bold"))
title = Text.assemble(*title_parts)
# Subtitle with statement type and period
statement_display = statement_names.get(self.statement_type, self.statement_type)
if self.fiscal_period and self.fiscal_year:
subtitle = f"{statement_display}{self.fiscal_period} {self.fiscal_year}"
elif self.period_end:
subtitle = f"{statement_display} • As of {self.period_end}"
else:
subtitle = statement_display
# Main statement table
stmt_table = Table(
box=box.SIMPLE,
show_header=False,
padding=(0, 1),
expand=True
)
stmt_table.add_column("Item", style="", ratio=3)
stmt_table.add_column("Value", justify="right", style="bold", ratio=1)
def add_item_to_table(item: StatementItem, depth: int = 0):
"""Add an item to the table with proper indentation."""
indent = " " * depth
if item.is_abstract:
# Abstract items are headers
stmt_table.add_row(
Text(f"{indent}{item.label}", style="bold cyan"),
""
)
elif item.is_total:
# Total items with underline
value_text = Text(item.get_display_value(), style="bold yellow")
stmt_table.add_row(
Text(f"{indent}{item.label}", style="bold"),
value_text
)
# Add a separator line after totals
if depth == 0:
stmt_table.add_row("", "")
stmt_table.add_row(
Text("" * 40, style="dim"),
Text("" * 15, style="dim")
)
else:
# Regular items
style = "dim" if item.confidence < 0.8 else ""
                confidence_marker = " *" if item.confidence < 0.8 else ""
label_text = f"{indent}{item.label}{confidence_marker}"
# Color code positive/negative values
value_str = item.get_display_value()
if value_str and value_str.startswith("$"):
try:
# Extract numeric value for coloring
if item.value and isinstance(item.value, (int, float)):
if item.value < 0:
value_style = "red"
else:
value_style = "green"
else:
value_style = ""
except Exception:
value_style = ""
else:
value_style = ""
stmt_table.add_row(
Text(label_text, style=style),
Text(value_str, style=value_style) if value_str else ""
)
# Add children recursively
for child in item.children:
if depth < 3: # Limit depth for display
add_item_to_table(child, depth + 1)
# Add all items to the table
for item in self.items:
add_item_to_table(item)
# Metadata summary
metadata = Table(box=box.SIMPLE, show_header=False, padding=(0, 1))
metadata.add_column("Metric", style="dim")
metadata.add_column("Value", style="bold")
metadata.add_row("Facts Used", f"{self.facts_used:,}")
if self.facts_total > 0:
metadata.add_row("Total Facts", f"{self.facts_total:,}")
if self.canonical_coverage > 0:
coverage_pct = self.canonical_coverage * 100
coverage_style = "green" if coverage_pct >= 50 else "yellow" if coverage_pct >= 25 else "red"
metadata.add_row(
"Canonical Coverage",
Text(f"{coverage_pct:.1f}%", style=coverage_style)
)
if self.cik:
metadata.add_row("CIK", self.cik)
# Data quality indicators
quality_notes = []
# Count items by confidence
low_confidence_count = sum(
1 for item in self._flatten_items()
if not item.is_abstract and item.confidence < 0.8
)
if low_confidence_count > 0:
quality_notes.append(
Text(f"{low_confidence_count} items with lower confidence", style="dim yellow")
)
# Count calculated vs actual values
calculated_count = sum(
1 for item in self._flatten_items()
if item.source == 'calculated'
)
if calculated_count > 0:
quality_notes.append(
Text(f"{calculated_count} calculated values", style="dim cyan")
)
# Combine metadata and quality notes
metadata_panel = Panel(
metadata,
title="📊 Statement Metadata",
border_style="bright_black"
)
# Create the main content group
content_parts = [
Padding("", (1, 0, 0, 0)),
stmt_table
]
# Add metadata in a column layout
if self.facts_used > 0:
bottom_content = [metadata_panel]
if quality_notes:
quality_panel = Panel(
Group(*quality_notes),
title="📝 Data Quality Notes",
border_style="bright_black"
)
bottom_content.append(quality_panel)
content_parts.append(Padding("", (1, 0)))
content_parts.append(Columns(bottom_content, equal=True, expand=True))
content = Group(*content_parts)
# Create the main panel
return Panel(
content,
title=title,
subtitle=subtitle,
border_style="blue",
expand=True
)
def _flatten_items(self) -> List[StatementItem]:
"""Flatten the hierarchical items into a flat list."""
flat_items = []
def flatten(item: StatementItem):
flat_items.append(item)
for child in item.children:
flatten(child)
for item in self.items:
flatten(item)
return flat_items
def __repr__(self) -> str:
"""String representation using rich formatting."""
return repr_rich(self.__rich__())
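# Serialization sketch (illustrative, not part of the original module): a
# StructuredStatement round-trips to JSON via to_dict(); period_end is
# ISO-formatted and nested items are serialized recursively. `statement`
# is assumed to have been produced by the StatementBuilder defined below.
#
#     import json
#     payload = json.dumps(statement.to_dict(), indent=2)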
class StatementBuilder:
"""
Builds structured financial statements using canonical templates.
This class reconstructs complete financial statements by combining
actual facts with canonical structures, filling in missing concepts
and maintaining proper hierarchy.
"""
def __init__(self, cik: Optional[str] = None):
"""
Initialize the statement builder.
Args:
cik: Company CIK for context
"""
self.cik = cik
self.canonical_structures = load_canonical_structures()
self.virtual_trees = load_virtual_trees()
def build_statement(self,
facts: List[FinancialFact],
statement_type: str,
fiscal_year: Optional[int] = None,
fiscal_period: Optional[str] = None,
use_canonical: bool = True,
include_missing: bool = False) -> StructuredStatement:
"""
Build a structured financial statement from facts.
Args:
facts: List of financial facts
statement_type: Type of statement (BalanceSheet, IncomeStatement, etc.)
fiscal_year: Fiscal year to filter for
fiscal_period: Fiscal period (FY, Q1, Q2, Q3, Q4)
use_canonical: Whether to use canonical structure for organization
include_missing: Whether to include placeholder for missing concepts
Returns:
StructuredStatement with hierarchical organization
"""
# Filter facts for this statement and period
filtered_facts = self._filter_facts(facts, statement_type, fiscal_year, fiscal_period)
# Create fact lookup
fact_map = self._create_fact_map(filtered_facts)
# Get period end date
period_end = self._get_period_end(filtered_facts)
if use_canonical and statement_type in self.virtual_trees:
# Build using canonical structure
items = self._build_with_canonical(
fact_map,
self.virtual_trees[statement_type],
include_missing
)
# Add unmatched facts
unmatched = self._find_unmatched_facts(fact_map, self.virtual_trees[statement_type])
items.extend(self._create_items_from_facts(unmatched))
else:
# Build from facts only
items = self._build_from_facts(fact_map)
# Calculate metadata
facts_used = len(fact_map)
canonical_coverage = self._calculate_coverage(fact_map, statement_type) if use_canonical else 0.0
return StructuredStatement(
statement_type=statement_type,
fiscal_year=fiscal_year,
fiscal_period=fiscal_period,
period_end=period_end,
items=items,
cik=self.cik,
canonical_coverage=canonical_coverage,
facts_used=facts_used,
facts_total=len(facts)
)
def _filter_facts(self, facts: List[FinancialFact],
statement_type: str,
fiscal_year: Optional[int],
fiscal_period: Optional[str]) -> List[FinancialFact]:
"""Filter facts for the requested statement and period."""
filtered = []
for fact in facts:
# Check statement type
if fact.statement_type != statement_type:
continue
# Check fiscal year
if fiscal_year and fact.fiscal_year != fiscal_year:
continue
# Check fiscal period
if fiscal_period and fact.fiscal_period != fiscal_period:
continue
filtered.append(fact)
return filtered
def _create_fact_map(self, facts: List[FinancialFact]) -> Dict[str, FinancialFact]:
"""Create a map of concept to fact."""
fact_map = {}
for fact in facts:
# Extract clean concept name
concept = fact.concept
if ':' in concept:
concept = concept.split(':', 1)[1]
# Use most recent fact for duplicates
if concept not in fact_map or fact.filing_date > fact_map[concept].filing_date:
fact_map[concept] = fact
return fact_map
def _get_period_end(self, facts: List[FinancialFact]) -> Optional[date]:
"""Get the period end date from facts."""
for fact in facts:
if fact.period_end:
return fact.period_end
return None
def _build_with_canonical(self, fact_map: Dict[str, FinancialFact],
virtual_tree: Dict[str, Any],
include_missing: bool) -> List[StatementItem]:
"""Build statement using canonical structure."""
items = []
processed = set()
# Process root nodes
for root_concept in virtual_tree.get('roots', []):
item = self._build_canonical_item(
root_concept,
virtual_tree['nodes'],
fact_map,
processed,
include_missing,
depth=0
)
if item:
items.append(item)
return items
def _build_canonical_item(self, concept: str,
nodes: Dict[str, Any],
fact_map: Dict[str, FinancialFact],
processed: Set[str],
include_missing: bool,
depth: int = 0,
parent: Optional[str] = None) -> Optional[StatementItem]:
"""Build a single canonical item with children."""
if concept in processed:
return None
processed.add(concept)
# Get node info
node = nodes.get(concept, {})
# Check if we have a fact for this concept
fact = fact_map.get(concept)
# Determine if we should include this item
if not fact and not include_missing and not node.get('is_abstract'):
# Skip missing concrete concepts unless required
if node.get('occurrence_rate', 0) < 0.8: # Not a core concept
return None
# Create the item
item = StatementItem(
concept=concept,
label=fact.label if fact else node.get('label', concept),
value=fact.numeric_value if fact else None,
depth=depth,
parent_concept=parent,
is_abstract=node.get('is_abstract', False),
is_total=node.get('is_total', False),
section=node.get('section'),
confidence=node.get('occurrence_rate', 1.0) if not fact else 1.0,
source='fact' if fact else ('canonical' if not include_missing else 'placeholder'),
fact=fact
)
# Process children
for child_concept in node.get('children', []):
child_item = self._build_canonical_item(
child_concept,
nodes,
fact_map,
processed,
include_missing,
depth + 1,
concept
)
if child_item:
item.children.append(child_item)
# Try to calculate total if missing
if item.is_total and item.value is None and item.children:
calculated_value = self._calculate_total(item.children)
if calculated_value is not None:
item.value = calculated_value
item.source = 'calculated'
return item
def _calculate_total(self, children: List[StatementItem]) -> Optional[float]:
"""Calculate total from children values."""
total = 0
has_values = False
for child in children:
if not child.is_abstract and child.value is not None:
total += child.value
has_values = True
return total if has_values else None
def _find_unmatched_facts(self, fact_map: Dict[str, FinancialFact],
virtual_tree: Dict[str, Any]) -> Dict[str, FinancialFact]:
"""Find facts that don't match canonical concepts."""
canonical_concepts = set(virtual_tree.get('nodes', {}).keys())
unmatched = {}
for concept, fact in fact_map.items():
if concept not in canonical_concepts:
unmatched[concept] = fact
return unmatched
def _create_items_from_facts(self, facts: Dict[str, FinancialFact]) -> List[StatementItem]:
"""Create statement items from unmatched facts."""
items = []
for concept, fact in facts.items():
item = StatementItem(
concept=concept,
label=fact.label,
value=fact.numeric_value,
depth=1, # Default depth
parent_concept=None,
is_abstract=fact.is_abstract,
is_total=fact.is_total,
section=fact.section,
confidence=0.7, # Lower confidence for unmatched
source='fact',
fact=fact
)
items.append(item)
return items
def _build_from_facts(self, fact_map: Dict[str, FinancialFact]) -> List[StatementItem]:
"""Build statement directly from facts without canonical structure."""
# Group facts by parent
hierarchy = defaultdict(list)
roots = []
for concept, fact in fact_map.items():
if fact.parent_concept:
hierarchy[fact.parent_concept].append(concept)
else:
roots.append(concept)
# Build items recursively
items = []
for root_concept in roots:
item = self._build_fact_item(root_concept, fact_map, hierarchy)
if item:
items.append(item)
# Add orphaned facts: children whose parent concept is absent from the
# fact map would otherwise never be reached from a root
for concept, fact in fact_map.items():
if fact.parent_concept and fact.parent_concept not in fact_map:
item = StatementItem(
concept=concept,
label=fact.label,
value=fact.numeric_value,
depth=0,
parent_concept=None,
is_abstract=fact.is_abstract,
is_total=fact.is_total,
section=fact.section,
confidence=1.0,
source='fact',
fact=fact
)
items.append(item)
return items
def _build_fact_item(self, concept: str,
fact_map: Dict[str, FinancialFact],
hierarchy: Dict[str, List[str]],
depth: int = 0) -> Optional[StatementItem]:
"""Build item from fact with children."""
if concept not in fact_map:
return None
fact = fact_map[concept]
item = StatementItem(
concept=concept,
label=fact.label,
value=fact.numeric_value,
depth=depth,
parent_concept=fact.parent_concept,
is_abstract=fact.is_abstract,
is_total=fact.is_total,
section=fact.section,
confidence=1.0,
source='fact',
fact=fact
)
# Add children
for child_concept in hierarchy.get(concept, []):
child_item = self._build_fact_item(child_concept, fact_map, hierarchy, depth + 1)
if child_item:
item.children.append(child_item)
return item
def _calculate_coverage(self, fact_map: Dict[str, FinancialFact],
statement_type: str) -> float:
"""Calculate canonical coverage percentage."""
if statement_type not in self.virtual_trees:
return 0.0
canonical_concepts = set(self.virtual_trees[statement_type].get('nodes', {}).keys())
if not canonical_concepts:
return 0.0
matched = len(set(fact_map.keys()) & canonical_concepts)
return matched / len(canonical_concepts)
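# Usage sketch (illustrative only): drive StatementBuilder end to end. The
# empty fact list is a stand-in; real callers would pass FinancialFact
# objects sourced from an entity's parsed XBRL data.
if __name__ == "__main__":
    sample_facts: List[FinancialFact] = []  # assumed to be populated elsewhere
    builder = StatementBuilder(cik="0000320193")
    stmt = builder.build_statement(
        facts=sample_facts,
        statement_type="IncomeStatement",
        fiscal_year=2023,
        fiscal_period="FY",
    )
    print(stmt.get_hierarchical_display(max_depth=2))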
View File

@@ -0,0 +1,216 @@
"""
Functions for retrieving entity submission data from the SEC.
"""
import json
from typing import Any, Dict, Optional
import httpx
from edgar.core import log
from edgar.entity.data import parse_entity_submissions
from edgar.httprequests import download_json
from edgar.storage import get_edgar_data_directory, is_using_local_storage
__all__ = [
'get_entity_submissions',
'download_entity_submissions_from_sec',
'load_company_submissions_from_local',
'create_entity_from_submissions_json',
'create_entity_from_file',
'create_company_from_file'
]
def load_company_submissions_from_local(cik: int) -> Optional[Dict[str, Any]]:
"""
Load company submissions from local data.
If the cached file is corrupted or empty, it will be re-downloaded automatically.
"""
submissions_dir = get_edgar_data_directory() / "submissions"
if not submissions_dir.exists():
return None
submissions_file = submissions_dir / f"CIK{cik:010}.json"
# If file doesn't exist, download it
if not submissions_file.exists():
submissions_json = download_entity_submissions_from_sec(cik)
if submissions_json:
with open(submissions_file, "w", encoding='utf-8') as f:
json.dump(submissions_json, f)
return submissions_json
# File exists, try to parse it
try:
return json.loads(submissions_file.read_text())
except (json.JSONDecodeError, UnicodeDecodeError) as e:
# File is corrupted, log warning and re-download
log.warning(f"Corrupted submissions cache file for CIK {cik}: {e}. Re-downloading...")
try:
submissions_json = download_entity_submissions_from_sec(cik)
if submissions_json:
# Write the fresh data to cache
with open(submissions_file, "w", encoding='utf-8') as f:
json.dump(submissions_json, f)
return submissions_json
else:
# If download failed, remove the corrupted file
submissions_file.unlink(missing_ok=True)
return None
except Exception as download_error:
log.error(f"Failed to re-download submissions for CIK {cik}: {download_error}")
# Remove the corrupted file so it can be retried later
submissions_file.unlink(missing_ok=True)
return None
def download_entity_submissions_from_sec(cik: int) -> Optional[Dict[str, Any]]:
"""
Download the entity submissions JSON for a given CIK from the SEC.
Note: This function no longer uses @lru_cache (removed in Issue #471 fix) to allow
HttpxThrottleCache to control freshness. The HTTP cache now has a 30-second TTL
for submissions, providing a balance between freshness and performance.
Args:
cik: The company CIK
Returns:
Optional[Dict[str, Any]]: The entity submissions JSON data, or None if not found
"""
try:
submission_json = download_json(f"https://data.sec.gov/submissions/CIK{cik:010}.json")
except httpx.HTTPStatusError as e:
# Handle the case where the cik is invalid and not found on Edgar
if e.response.status_code == 404:
return None
else:
raise
return submission_json
def get_entity_submissions(cik: int) -> Optional[Any]:
"""
Get the entity data from the SEC submissions endpoint.
Note: This function no longer uses @lru_cache (removed in Issue #471 fix) to allow
HttpxThrottleCache to control freshness with a 30-second TTL.
Args:
cik: The company CIK
Returns:
Optional[EntityData]: The entity data, or None if not found
"""
# Check the environment var EDGAR_USE_LOCAL_DATA
if is_using_local_storage():
submissions_json = load_company_submissions_from_local(cik)
if not submissions_json:
submissions_json = download_entity_submissions_from_sec(cik)
else:
submissions_json = download_entity_submissions_from_sec(cik)
if submissions_json:
return parse_entity_submissions(submissions_json)
def create_entity_from_submissions_json(
submissions_json: Dict[str, Any],
entity_type: str = 'auto'
) -> Any:
"""
Create an Entity object from a submissions JSON dictionary.
This is particularly useful for testing, as it allows creating
Entity objects from local JSON files or mock data, without
making any API calls.
Args:
submissions_json: The submissions JSON dictionary (either from a file or API)
entity_type: The type of entity to create ('company', 'fund', or 'auto' to detect)
Returns:
An Entity, Company, or Fund object, depending on the entity_type parameter.
If entity_type is 'auto', it tries to detect the entity type from the data.
"""
# Import locally to avoid circular imports
from edgar.entity.core import Company, Entity
from edgar.entity.data import parse_entity_submissions
from edgar.funds import FundCompany
# First, parse the submissions JSON to get the entity data
entity_data = parse_entity_submissions(submissions_json)
# Create the appropriate entity object based on the entity_type parameter
if entity_type == 'auto':
# Try to detect the entity type - if it has tickers or exchanges, it's likely a company
if entity_data.tickers or (hasattr(entity_data, 'exchanges') and entity_data.exchanges):
entity_type = 'company'
# More detection logic could be added here
else:
# Default to generic entity if we can't detect the type
entity_type = 'entity'
# Create and return the appropriate entity type
if entity_type.lower() == 'company':
entity = Company(entity_data.cik)
elif entity_type.lower() == 'fund':
entity = FundCompany(entity_data.cik)
else:
entity = Entity(entity_data.cik)
# Set the data directly to avoid making API calls
entity._data = entity_data
entity._data._not_found = False
# Mark the entity as having already loaded all filings to prevent fetching more
entity._data._loaded_all_filings = True
return entity
def create_entity_from_file(
file_path: str,
entity_type: str = 'auto'
) -> Any:
"""
Create an Entity object from a local submissions JSON file.
This is a convenience function that loads a JSON file and creates
an Entity object from it, without making any API calls.
Args:
file_path: Path to a submissions JSON file
entity_type: The type of entity to create ('company', 'fund', or 'auto' to detect)
Returns:
An Entity, Company, or Fund object, depending on the entity_type parameter.
"""
import json
from pathlib import Path
# Load the JSON file
try:
with open(Path(file_path).expanduser(), 'r') as f:
submissions_json = json.load(f)
except (FileNotFoundError, json.JSONDecodeError) as e:
log.error(f"Error loading submissions JSON file: {e}")
return None
# Create the entity from the loaded JSON
return create_entity_from_submissions_json(submissions_json, entity_type)
def create_company_from_file(file_path: str) -> Any:
"""
Create a Company object from a local submissions JSON file.
This is a convenience function specifically for creating companies,
which is the most common use case.
Args:
file_path: Path to a submissions JSON file
Returns:
A Company object
"""
return create_entity_from_file(file_path, entity_type='company')
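# Usage sketch (illustrative only; the path below is a placeholder): build a
# Company offline from a cached submissions file, with no API calls.
if __name__ == "__main__":
    company = create_company_from_file("~/.edgar/submissions/CIK0000320193.json")
    if company is not None:
        print(company)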
View File

@@ -0,0 +1,138 @@
"""
Terminal-friendly color schemes for financial statement display.
Provides better contrast and readability in various terminal environments.
"""
import os
from typing import Dict
# Default scheme - the current implementation
DEFAULT_SCHEME = {
"abstract_item": "bold cyan",
"total_item": "bold",
"regular_item": "",
"low_confidence_item": "dim",
"positive_value": "green",
"negative_value": "red",
"total_value_prefix": "bold yellow",
"separator": "dim",
"company_name": "bold green",
"statement_type": "bold blue",
"panel_border": "blue",
"empty_value": "dim",
}
# High contrast scheme - better for terminals with poor dim text support
HIGH_CONTRAST_SCHEME = {
"abstract_item": "bold bright_cyan",
"total_item": "bold bright_white",
"regular_item": "white",
"low_confidence_item": "bright_black", # Usually renders as gray
"positive_value": "bright_green",
"negative_value": "bright_red",
"total_value_prefix": "bold bright_yellow",
"separator": "bright_black",
"company_name": "bold bright_green",
"statement_type": "bold bright_blue",
"panel_border": "bright_blue",
"empty_value": "bright_black",
}
# Professional scheme - emphasizes important items without dim text
PROFESSIONAL_SCHEME = {
"abstract_item": "bold blue",
"total_item": "bold bright_white",
"regular_item": "",
"low_confidence_item": "italic", # Use italic instead of dim
"positive_value": "green",
"negative_value": "red",
"total_value_prefix": "bold",
"separator": "blue",
"company_name": "bold bright_white",
"statement_type": "bold blue",
"panel_border": "white",
"empty_value": "bright_black",
}
# Minimal scheme - focuses on structure over color
MINIMAL_SCHEME = {
"abstract_item": "bold",
"total_item": "bold bright_white",
"regular_item": "",
"low_confidence_item": "italic",
"positive_value": "",
"negative_value": "red", # Keep red for negative values
"total_value_prefix": "bold",
"separator": "white",
"company_name": "bold",
"statement_type": "bold",
"panel_border": "white",
"empty_value": "bright_black",
}
# Color-blind friendly scheme
ACCESSIBLE_SCHEME = {
"abstract_item": "bold blue",
"total_item": "bold bright_white underline", # Use underline for emphasis
"regular_item": "",
"low_confidence_item": "italic",
"positive_value": "blue", # Avoid green/red
"negative_value": "magenta", # Avoid green/red
"total_value_prefix": "bold underline",
"separator": "white",
"company_name": "bold bright_white",
"statement_type": "bold blue",
"panel_border": "white",
"empty_value": "bright_black",
}
# SEC filing style - mimics actual printed filings
FILING_SCHEME = {
"abstract_item": "bold", # Major sections (ASSETS, LIABILITIES) - just bold
"total_item": "bold", # Subtotals - bold only
"regular_item": "", # Regular items - no styling
"low_confidence_item": "dim", # Low confidence items - dimmed
"positive_value": "", # Positive values - no color (like printed filings)
"negative_value": "", # Negative values - no color (parentheses show negative)
"total_value_prefix": "bold", # Total values - bold only
"separator": "dim", # Table separators - dimmed
"company_name": "bold", # Company name - just bold
"statement_type": "bold", # Statement title - just bold
"panel_border": "white", # Panel borders - white
"empty_value": "dim", # Empty values - dimmed
}
# Available schemes
SCHEMES: Dict[str, Dict[str, str]] = {
"default": DEFAULT_SCHEME,
"high_contrast": HIGH_CONTRAST_SCHEME,
"professional": PROFESSIONAL_SCHEME,
"minimal": MINIMAL_SCHEME,
"accessible": ACCESSIBLE_SCHEME,
"filing": FILING_SCHEME,
}
def get_color_scheme(scheme_name: str = "professional") -> Dict[str, str]:
"""
Get a color scheme by name.
Args:
scheme_name: Name of the scheme (default, high_contrast, professional, minimal, accessible, filing)
Returns:
Dictionary of style mappings
"""
return SCHEMES.get(scheme_name, PROFESSIONAL_SCHEME)
# Environment variable support
def get_current_scheme() -> Dict[str, str]:
"""
Get the current color scheme based on environment variable or default.
Environment variable: EDGAR_FINANCIALS_COLOR_SCHEME
Values: default, high_contrast, professional, minimal, accessible, filing
"""
scheme_name = os.environ.get("EDGAR_FINANCIALS_COLOR_SCHEME", "professional")
return get_color_scheme(scheme_name)
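# Usage sketch (illustrative): resolve the active scheme, honoring the
# EDGAR_FINANCIALS_COLOR_SCHEME environment variable when it is set.
if __name__ == "__main__":
    scheme = get_current_scheme()
    print(scheme["negative_value"])  # e.g. 'red' under the professional scheme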
View File

@@ -0,0 +1,56 @@
"""
Ticker-related functionality for the entity package.
This module re-exports ticker-related functions from edgar.reference.tickers.
"""
# Local implementations below supplement the functions re-exported from edgar.reference.tickers
from functools import lru_cache
import pandas as pd
from edgar.httprequests import download_text
from edgar.reference.tickers import find_cik, find_ticker, get_company_tickers, get_icon_from_ticker
@lru_cache(maxsize=1)
def get_ticker_to_cik_lookup():
"""
Create a dictionary that maps from ticker symbol to CIK.
"""
df = get_company_tickers()
ticker_to_cik = {}
for _, row in df.iterrows():
ticker_to_cik[row['ticker']] = row['cik']
return ticker_to_cik
def _parse_cik_lookup_data(content):
"""Parse CIK lookup data from content."""
return [
{
# for companies with : in the name
'name': ":".join(line.split(':')[:-2]),
'cik': int(line.split(':')[-2])
} for line in content.split("\n") if line != '']
@lru_cache(maxsize=1)
def get_cik_lookup_data() -> pd.DataFrame:
"""
Get a DataFrame of company/entity names and their CIKs, parsed from the
SEC's cik-lookup-data.txt file, whose lines look like:
DECADE CAPITAL MANAGEMENT LLC:0001426822:
DECADE COMPANIES INCOME PROPERTIES:0000775840:
"""
content = download_text("https://www.sec.gov/Archives/edgar/cik-lookup-data.txt")
cik_lookup_df = pd.DataFrame(_parse_cik_lookup_data(content))
return cik_lookup_df
__all__ = [
'get_icon_from_ticker',
'get_company_tickers',
'get_ticker_to_cik_lookup',
'get_cik_lookup_data',
'find_cik',
'find_ticker'
]
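# Usage sketch (illustrative): map a ticker to its CIK. The first call
# fetches the SEC company-tickers table; results are cached afterwards.
if __name__ == "__main__":
    lookup = get_ticker_to_cik_lookup()
    print(lookup.get("AAPL"))  # e.g. 320193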
View File

@@ -0,0 +1,17 @@
from edgar import Company
def income_statement(ticker: str, annual: bool = True, periods: int = 4):
company = Company(ticker)
if company:
return company.income_statement(annual=annual, periods=periods)
def balance_sheet(ticker: str, annual: bool = True, periods: int = 4):
company = Company(ticker)
if company:
return company.balance_sheet(annual=annual, periods=periods)
def cash_flow_statement(ticker: str, annual: bool = True, periods: int = 4):
company = Company(ticker)
if company:
return company.cash_flow_statement(annual=annual, periods=periods)
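# Usage sketch (illustrative): fetch four annual income statements by ticker.
if __name__ == "__main__":
    print(income_statement("AAPL", annual=True, periods=4))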
View File

@@ -0,0 +1,419 @@
"""
Unit handling and normalization for financial facts.
This module provides comprehensive unit normalization and conversion capabilities
to address unit inconsistencies across different companies' SEC filings.
Key features:
- Currency unit normalization (USD, EUR, GBP, etc.)
- Share-based unit standardization
- Scale-aware unit matching
- Unit compatibility checking
- Error reporting with unit mismatch details
Usage:
from edgar.entity.unit_handling import UnitNormalizer, UnitResult
# Normalize a unit
normalized = UnitNormalizer.normalize_unit("US DOLLAR") # Returns "USD"
# Check unit compatibility
compatible = UnitNormalizer.are_compatible("USD", "DOLLARS") # Returns True
# Get unit with error details
result = UnitNormalizer.get_normalized_value(fact, target_unit="USD")
"""
import logging
from dataclasses import dataclass
from enum import Enum
from typing import Dict, List, Optional
from edgar.entity.models import FinancialFact
logger = logging.getLogger(__name__)
class UnitType(Enum):
"""Types of financial units."""
CURRENCY = "currency"
SHARES = "shares"
RATIO = "ratio"
BUSINESS = "business"
TIME = "time"
AREA = "area"
OTHER = "other"
@dataclass
class UnitResult:
"""Result of unit normalization with error details."""
value: Optional[float]
normalized_unit: Optional[str]
original_unit: str
success: bool
error_reason: Optional[str] = None
scale_applied: Optional[int] = None
unit_type: Optional[UnitType] = None
suggestions: Optional[List[str]] = None
def __post_init__(self):
if self.suggestions is None:
self.suggestions = []
class UnitNormalizer:
"""Comprehensive unit normalization for financial facts."""
# Currency unit mappings
CURRENCY_MAPPINGS = {
'USD': ['USD', 'US DOLLAR', 'DOLLARS', 'usd', 'US$', 'DOLLAR'],
'EUR': ['EUR', 'EURO', 'EUROS', 'eur', '€', 'EUROPEAN UNION EURO'],
'GBP': ['GBP', 'POUND', 'POUNDS', 'gbp', '£', 'BRITISH POUND', 'POUND STERLING'],
'JPY': ['JPY', 'YEN', 'yen', 'jpy', '¥', 'JAPANESE YEN'],
'CAD': ['CAD', 'CANADIAN DOLLAR', 'CANADIAN DOLLARS', 'cad'],
'CHF': ['CHF', 'SWISS FRANC', 'SWISS FRANCS', 'chf'],
'AUD': ['AUD', 'AUSTRALIAN DOLLAR', 'AUSTRALIAN DOLLARS', 'aud'],
'CNY': ['CNY', 'YUAN', 'CHINESE YUAN', 'cny', '¥'],  # note: '¥' also appears under JPY; the reverse map resolves it to the later entry (CNY)
}
# Share unit mappings
SHARE_MAPPINGS = {
'shares': ['shares', 'share', 'SHARES', 'SHARE', 'STOCK', 'EQUITY'],
'shares_unit': ['shares_unit', 'share_unit', 'SHARES_UNIT'],
'partnership_unit': ['USD/PartnershipUnit', 'PartnershipUnit', 'partnership_unit']
}
# Ratio/dimensionless unit mappings
RATIO_MAPPINGS = {
'pure': ['pure', 'number', 'ratio', 'percent', '%', 'PURE', 'NUMBER'],
'basis_points': ['bp', 'bps', 'basis_points', 'BASIS_POINTS']
}
# Per-share combinations
PER_SHARE_MAPPINGS = {
'USD_per_share': ['USD/shares', 'USD per share', 'USD/share', 'usd/shares'],
'USD_per_share_unit': ['USD/shares_unit', 'USD per share unit', 'USD/share_unit']
}
# Business/operational unit mappings
BUSINESS_MAPPINGS = {
'customer': ['Customer', 'customer', 'CUSTOMER'],
'store': ['Store', 'store', 'STORE'],
'entity': ['Entity', 'entity', 'ENTITY'],
'segment': ['Segment', 'segment', 'SEGMENT', 'reportable_segment'],
'instrument': ['instrument', 'INSTRUMENT', 'financial_instrument'],
'contract': ['USD/Contract', 'contract', 'CONTRACT'],
'investment': ['USD/Investment', 'investment', 'INVESTMENT']
}
# Time-based unit mappings
TIME_MAPPINGS = {
'years': ['Year', 'years', 'YEAR', 'YEARS'],
'months': ['Month', 'months', 'MONTH', 'MONTHS'],
'days': ['Day', 'days', 'DAY', 'DAYS']
}
# Area unit mappings
AREA_MAPPINGS = {
'sqft': ['sqft', 'square_feet', 'SQFT', 'sq_ft'],
'sqm': ['sqm', 'square_meters', 'SQMETER', 'sq_m']
}
# Comprehensive mapping combining all categories
ALL_MAPPINGS = {
**CURRENCY_MAPPINGS,
**SHARE_MAPPINGS,
**RATIO_MAPPINGS,
**PER_SHARE_MAPPINGS,
**BUSINESS_MAPPINGS,
**TIME_MAPPINGS,
**AREA_MAPPINGS
}
# Reverse mapping for faster lookups
_REVERSE_MAPPING = None
@classmethod
def _build_reverse_mapping(cls) -> Dict[str, str]:
"""Build reverse mapping from variant to normalized unit."""
if cls._REVERSE_MAPPING is not None:
return cls._REVERSE_MAPPING
reverse_map = {}
for normalized_unit, variants in cls.ALL_MAPPINGS.items():
for variant in variants:
reverse_map[variant.upper()] = normalized_unit
cls._REVERSE_MAPPING = reverse_map
return reverse_map
@classmethod
def normalize_unit(cls, unit: str) -> str:
"""
Normalize a unit string to its canonical form.
Args:
unit: Raw unit string from SEC filing
Returns:
Normalized unit string
Example:
>>> UnitNormalizer.normalize_unit("US DOLLAR")
'USD'
>>> UnitNormalizer.normalize_unit("shares_unit")
'shares_unit'
"""
if not unit:
return ""
reverse_map = cls._build_reverse_mapping()
normalized = reverse_map.get(unit.upper())
return normalized if normalized else unit
@classmethod
def get_unit_type(cls, unit: str) -> UnitType:
"""
Determine the type of a unit.
Args:
unit: Unit string (normalized or raw)
Returns:
UnitType enum value
"""
normalized = cls.normalize_unit(unit)
if normalized in cls.CURRENCY_MAPPINGS:
return UnitType.CURRENCY
elif normalized in cls.PER_SHARE_MAPPINGS:
# Per-share units are a special currency-like type (amount per share)
return UnitType.CURRENCY # Treat per-share as currency-derived
elif normalized in cls.SHARE_MAPPINGS:
return UnitType.SHARES
elif normalized in cls.RATIO_MAPPINGS:
return UnitType.RATIO
elif normalized in cls.BUSINESS_MAPPINGS:
return UnitType.BUSINESS
elif normalized in cls.TIME_MAPPINGS:
return UnitType.TIME
elif normalized in cls.AREA_MAPPINGS:
return UnitType.AREA
else:
return UnitType.OTHER
@classmethod
def are_compatible(cls, unit1: str, unit2: str) -> bool:
"""
Check if two units are compatible for calculations.
Args:
unit1: First unit
unit2: Second unit
Returns:
True if units are compatible
"""
norm1 = cls.normalize_unit(unit1)
norm2 = cls.normalize_unit(unit2)
# Exact match
if norm1 == norm2:
return True
# Same unit type
type1 = cls.get_unit_type(norm1)
type2 = cls.get_unit_type(norm2)
if type1 == type2:
# Special cases for compatible unit types
if type1 == UnitType.CURRENCY:
# Regular currencies are compatible, but per-share must match exactly
if norm1 in cls.PER_SHARE_MAPPINGS or norm2 in cls.PER_SHARE_MAPPINGS:
# Per-share units must match exactly (USD_per_share != USD_per_share_unit)
return norm1 == norm2
return True # Regular currencies could be converted
elif type1 == UnitType.SHARES:
# shares and shares_unit are compatible for some calculations
return norm1 in ['shares', 'shares_unit'] and norm2 in ['shares', 'shares_unit']
return False
@classmethod
def get_normalized_value(
cls,
fact: FinancialFact,
target_unit: Optional[str] = None,
apply_scale: bool = True,
strict_unit_match: bool = False
) -> UnitResult:
"""
Get a normalized value from a financial fact with detailed error reporting.
Args:
fact: FinancialFact to normalize
target_unit: Desired unit (if None, just normalize existing unit)
apply_scale: Whether to apply scale factor
strict_unit_match: If True, require exact unit match. If False, allow compatible units.
Returns:
UnitResult with value and metadata
"""
if fact.numeric_value is None:
return UnitResult(
value=None,
normalized_unit=None,
original_unit=fact.unit,
success=False,
error_reason="No numeric value available"
)
original_unit = fact.unit or ""
normalized_unit = cls.normalize_unit(original_unit)
unit_type = cls.get_unit_type(normalized_unit)
# Apply scale factor if requested
value = fact.numeric_value
scale_applied = None
if apply_scale and fact.scale:
value *= fact.scale
scale_applied = fact.scale
# If no target unit specified, return normalized value
if target_unit is None:
return UnitResult(
value=value,
normalized_unit=normalized_unit,
original_unit=original_unit,
success=True,
scale_applied=scale_applied,
unit_type=unit_type
)
# Check compatibility with target unit
target_normalized = cls.normalize_unit(target_unit)
if normalized_unit == target_normalized:
# Exact match
return UnitResult(
value=value,
normalized_unit=target_normalized,
original_unit=original_unit,
success=True,
scale_applied=scale_applied,
unit_type=unit_type
)
elif not strict_unit_match and cls.are_compatible(normalized_unit, target_normalized):
# Compatible units - could potentially convert (only if not in strict mode)
suggestions = []
if cls.get_unit_type(normalized_unit) == UnitType.CURRENCY:
suggestions.append(f"Consider currency conversion from {normalized_unit} to {target_normalized}")
return UnitResult(
value=value,
normalized_unit=normalized_unit, # Keep original, mark as compatible
original_unit=original_unit,
success=True,
scale_applied=scale_applied,
unit_type=unit_type,
suggestions=suggestions
)
else:
# Incompatible units
suggestions = cls._get_unit_suggestions(normalized_unit, target_normalized)
return UnitResult(
value=None,
normalized_unit=normalized_unit,
original_unit=original_unit,
success=False,
error_reason=f"Unit mismatch: {normalized_unit} is not compatible with {target_normalized}",
unit_type=unit_type,
suggestions=suggestions
)
@classmethod
def _get_unit_suggestions(cls, actual_unit: str, target_unit: str) -> List[str]:
"""Generate helpful suggestions for unit mismatches."""
suggestions = []
actual_type = cls.get_unit_type(actual_unit)
target_type = cls.get_unit_type(target_unit)
if actual_type != target_type:
suggestions.append(f"Unit type mismatch: {actual_unit} is {actual_type.value}, "
f"but {target_unit} is {target_type.value}")
# Specific suggestions based on unit types
if target_type == UnitType.CURRENCY and actual_type != UnitType.CURRENCY:
suggestions.append("Consider using a financial amount concept instead of a ratio/count")
elif target_type == UnitType.SHARES and actual_type != UnitType.SHARES:
suggestions.append("Consider using a share-based concept instead of a monetary amount")
# Alternative units in the same category
if actual_type == target_type:
if actual_type == UnitType.CURRENCY:
suggestions.append("Use currency conversion or specify the correct currency unit")
elif actual_type == UnitType.SHARES:
suggestions.append("Try using 'shares' instead of 'shares_unit' or vice versa")
return suggestions
def apply_scale_factor(value: float, scale: Optional[int]) -> float:
"""
Apply scale factor to a value.
Args:
value: Numeric value
scale: Scale factor (e.g., 1000 for thousands)
Returns:
Scaled value
"""
if scale and scale != 1:
return value * scale
return value
def format_unit_error(unit_result: UnitResult) -> str:
"""
Format a unit error message for user display.
Args:
unit_result: UnitResult with error details
Returns:
Formatted error message
"""
if unit_result.success:
return "No error"
message = f"Unit handling error: {unit_result.error_reason}"
if unit_result.suggestions:
message += "\n Suggestions:\n"
for suggestion in unit_result.suggestions:
message += f" - {suggestion}\n"
message += f" Original unit: '{unit_result.original_unit}'"
if unit_result.normalized_unit != unit_result.original_unit:
message += f" Normalized to: '{unit_result.normalized_unit}'"
return message
# Legacy support - maintain compatibility with existing code
def normalize_unit_legacy(unit: str) -> str:
"""Legacy unit normalization for backward compatibility."""
return UnitNormalizer.normalize_unit(unit)
def are_units_compatible_legacy(unit1: str, unit2: str) -> bool:
"""Legacy unit compatibility check for backward compatibility."""
return UnitNormalizer.are_compatible(unit1, unit2)
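# Usage sketch (illustrative): pure lookups, no I/O involved.
if __name__ == "__main__":
    print(UnitNormalizer.normalize_unit("US DOLLAR"))       # 'USD'
    print(UnitNormalizer.are_compatible("USD", "DOLLARS"))  # True
    print(UnitNormalizer.are_compatible("USD", "shares"))   # False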
View File

@@ -0,0 +1,132 @@
"""
Utility functions for entity processing.
This module contains utility functions used throughout the entity package
for data processing, normalization, and validation.
"""
from typing import TYPE_CHECKING, Union
if TYPE_CHECKING:
import pyarrow
from edgar.entity.constants import COMPANY_FORMS
def has_company_filings(filings_form_array: 'pyarrow.ChunkedArray', max_filings: int = 50) -> bool:
"""
Efficiently check if any form in the PyArrow ChunkedArray matches company-only forms.
Limited to checking the first max_filings entries for performance.
Args:
filings_form_array: PyArrow ChunkedArray containing form values
max_filings: Maximum number of filings to check
Returns:
True if any form matches a company form, False otherwise
"""
# Early exit for empty arrays
if filings_form_array.null_count == filings_form_array.length():
return False
# Handle case with fewer than max_filings
total_filings = filings_form_array.length()
filings_to_check = min(total_filings, max_filings)
# Track how many we've checked so far
checked_count = 0
# Process chunks in the ChunkedArray until we hit our limit
for chunk in filings_form_array.chunks:
chunk_size = len(chunk)
# If this chunk would exceed our limit, slice it
if checked_count + chunk_size > filings_to_check:
# Only check remaining forms needed to reach filings_to_check
remaining = filings_to_check - checked_count
sliced_chunk = chunk.slice(0, remaining)
# Iterate over the sliced values, skipping nulls
for val in sliced_chunk.to_pylist():
if val is not None and val in COMPANY_FORMS:
return True
else:
# Process full chunk safely
for val in chunk.to_pylist():
if val is not None and val in COMPANY_FORMS:
return True
# Update count of checked filings, capped at the limit
checked_count = min(checked_count + chunk_size, filings_to_check)
# Stop if we've checked enough
if checked_count >= filings_to_check:
break
return False
def normalize_cik(cik_or_identifier: Union[str, int]) -> int:
"""
Normalize a CIK to an integer by removing leading zeros.
Args:
cik_or_identifier: CIK as string or integer
Returns:
Normalized CIK as integer
Raises:
ValueError: If the identifier cannot be converted to a valid CIK
"""
if isinstance(cik_or_identifier, int):
return cik_or_identifier
if isinstance(cik_or_identifier, str):
# Remove leading zeros and convert to int
try:
return int(cik_or_identifier.lstrip('0') or '0')
except ValueError:
raise ValueError(f"Invalid CIK format: {cik_or_identifier}")
raise ValueError(f"CIK must be string or integer, got {type(cik_or_identifier)}")
def validate_cik(cik: int) -> bool:
"""
Validate that a CIK is within the expected range.
Args:
cik: CIK to validate
Returns:
True if CIK is valid, False otherwise
"""
# CIKs are typically 1-10 digits, with valid range roughly 1 to 2,000,000,000
return isinstance(cik, int) and 1 <= cik <= 2_000_000_000
def format_cik(cik: Union[str, int], zero_pad: int = 10) -> str:
"""
Format a CIK with zero padding for display or API calls.
Args:
cik: CIK to format
zero_pad: Number of digits to pad to (default 10)
Returns:
Zero-padded CIK string
Example:
>>> format_cik(320193)
'0000320193'
>>> format_cik('320193', zero_pad=6)
'320193'
"""
normalized_cik = normalize_cik(cik)
return str(normalized_cik).zfill(zero_pad)
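# Usage sketch (illustrative): round-trip a CIK through normalization,
# validation, and zero-padded formatting.
if __name__ == "__main__":
    cik = normalize_cik("0000320193")
    assert validate_cik(cik)
    print(format_cik(cik))  # '0000320193'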