Initial commit
This commit is contained in:
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,45 @@
|
||||
from edgar import Company
|
||||
from collections import defaultdict
|
||||
|
||||
# Get Apple facts
|
||||
aapl = Company("AAPL")
|
||||
facts = aapl.facts
|
||||
raw_facts = facts._facts
|
||||
|
||||
print("Analyzing period durations for FY facts:\n")
|
||||
|
||||
# Group facts by (fiscal_year, fiscal_period, period_end)
|
||||
fact_groups = defaultdict(list)
|
||||
for fact in raw_facts:
|
||||
if fact.statement_type == 'IncomeStatement' and fact.fiscal_period == 'FY':
|
||||
if fact.fiscal_year and fact.fiscal_year >= 2019 and fact.fiscal_year <= 2021:
|
||||
if 'RevenueFromContract' in str(fact.concept) and 'Liability' not in str(fact.concept):
|
||||
key = (fact.fiscal_year, fact.fiscal_period, fact.period_end)
|
||||
fact_groups[key].append(fact)
|
||||
|
||||
# Analyze each group
|
||||
for key in sorted(fact_groups.keys()):
|
||||
year, period, end_date = key
|
||||
facts_in_group = fact_groups[key]
|
||||
|
||||
if len(facts_in_group) > 1:
|
||||
print(f"\nFY {year} ending {end_date}: {len(facts_in_group)} facts")
|
||||
for fact in facts_in_group:
|
||||
duration = None
|
||||
if fact.period_start and fact.period_end:
|
||||
duration = (fact.period_end - fact.period_start).days
|
||||
|
||||
period_type = "Annual" if duration and duration > 300 else "Quarterly" if duration else "Unknown"
|
||||
print(f" ${fact.value:,.0f} - Duration: {duration} days ({period_type})")
|
||||
print(f" Period: {fact.period_start} to {fact.period_end}")
|
||||
print(f" Filed: {fact.filing_date}")
|
||||
if hasattr(fact, 'form'):
|
||||
print(f" Form: {fact.form}")
|
||||
if hasattr(fact, 'accession'):
|
||||
print(f" Accession: {fact.accession}")
|
||||
|
||||
print("\n\nSummary:")
|
||||
print("The issue: Both annual and quarterly revenue are marked as 'FY'")
|
||||
print("Solution: Use period duration to distinguish:")
|
||||
print(" - Annual: period_start to period_end > 300 days")
|
||||
print(" - Quarterly: period_start to period_end < 100 days")
|
||||
@@ -0,0 +1,57 @@
|
||||
from edgar import Company
|
||||
from collections import defaultdict
|
||||
|
||||
# Get Apple facts
|
||||
aapl = Company("AAPL")
|
||||
facts = aapl.facts
|
||||
raw_facts = facts._facts
|
||||
|
||||
# Check all FY income statement facts for 2019-2024
|
||||
print("Checking FY facts and their period_end dates:\n")
|
||||
print("fiscal_year | fiscal_period | period_end | period_end.year | Match?")
|
||||
print("-" * 70)
|
||||
|
||||
fy_facts = defaultdict(list)
|
||||
for fact in raw_facts:
|
||||
if fact.statement_type == 'IncomeStatement' and fact.fiscal_period == 'FY':
|
||||
if fact.fiscal_year and fact.fiscal_year >= 2019:
|
||||
fy_facts[fact.fiscal_year].append(fact)
|
||||
|
||||
# Show all FY entries grouped by fiscal_year
|
||||
for year in sorted(fy_facts.keys(), reverse=True):
|
||||
facts_for_year = fy_facts[year]
|
||||
# Get unique period_end dates for this fiscal year
|
||||
unique_ends = set()
|
||||
for fact in facts_for_year:
|
||||
if fact.period_end:
|
||||
unique_ends.add(fact.period_end)
|
||||
|
||||
print(f"\nFY {year} has {len(unique_ends)} unique period_end dates:")
|
||||
for end_date in sorted(unique_ends):
|
||||
if end_date:
|
||||
match = "✓" if end_date.year == year else "✗"
|
||||
print(f" {year:4d} | FY | {end_date} | {end_date.year} | {match}")
|
||||
|
||||
# Now check if we have the correct matches
|
||||
print("\n\nChecking if we have correct year matches:")
|
||||
correct_matches = defaultdict(set)
|
||||
for fact in raw_facts:
|
||||
if fact.statement_type == 'IncomeStatement' and fact.fiscal_period == 'FY':
|
||||
if fact.period_end and fact.fiscal_year:
|
||||
if fact.period_end.year == fact.fiscal_year:
|
||||
correct_matches[fact.fiscal_year].add(fact.period_end)
|
||||
|
||||
print("\nFiscal years with matching period_end.year:")
|
||||
for year in sorted(correct_matches.keys(), reverse=True)[:6]:
|
||||
for end_date in correct_matches[year]:
|
||||
print(f" FY {year} -> {end_date} ✓")
|
||||
|
||||
# Check revenue values for correct matches
|
||||
print("\n\nRevenue values for CORRECT year matches:")
|
||||
for fact in raw_facts:
|
||||
if fact.statement_type == 'IncomeStatement' and fact.fiscal_period == 'FY':
|
||||
if fact.period_end and fact.fiscal_year:
|
||||
if fact.period_end.year == fact.fiscal_year:
|
||||
if 'RevenueFromContract' in str(fact.concept) and 'Liability' not in str(fact.concept):
|
||||
if fact.fiscal_year >= 2019 and fact.fiscal_year <= 2024:
|
||||
print(f" FY {fact.fiscal_year} (ends {fact.period_end}): ${fact.value:,.0f}")
|
||||
@@ -0,0 +1,172 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Check which renderer is actually being used in the MSFT table.
|
||||
"""
|
||||
|
||||
import sys
|
||||
sys.path.insert(0, '/Users/dwight/PycharmProjects/edgartools')
|
||||
|
||||
from edgar.documents.parser import HTMLParser
|
||||
from edgar.documents.config import ParserConfig
|
||||
from edgar.documents.table_nodes import TableNode
|
||||
|
||||
def check_renderer_usage():
|
||||
print("🔍 CHECKING WHICH RENDERER IS ACTUALLY BEING USED")
|
||||
print("=" * 60)
|
||||
|
||||
try:
|
||||
# Parse with default config
|
||||
with open('/Users/dwight/PycharmProjects/edgartools/data/html/MSFT.10-K.html', 'r') as f:
|
||||
html_content = f.read()
|
||||
|
||||
# Check what the default config actually has
|
||||
config = ParserConfig()
|
||||
print(f"Default ParserConfig.fast_table_rendering: {config.fast_table_rendering}")
|
||||
|
||||
parser = HTMLParser(config)
|
||||
document = parser.parse(html_content)
|
||||
|
||||
# Find target table
|
||||
target_table = None
|
||||
def find_target(node):
|
||||
nonlocal target_table
|
||||
if isinstance(node, TableNode):
|
||||
try:
|
||||
if "Weighted average outstanding shares" in node.text():
|
||||
target_table = node
|
||||
return
|
||||
except:
|
||||
pass
|
||||
if hasattr(node, 'children'):
|
||||
for child in node.children:
|
||||
find_target(child)
|
||||
|
||||
find_target(document.root)
|
||||
|
||||
if not target_table:
|
||||
print("❌ Target table not found")
|
||||
return
|
||||
|
||||
print(f"✅ Found target table")
|
||||
print(f"Table has _config: {'✅' if hasattr(target_table, '_config') else '❌'}")
|
||||
|
||||
if hasattr(target_table, '_config'):
|
||||
print(f"Table config fast_table_rendering: {target_table._config.fast_table_rendering}")
|
||||
|
||||
# Test the decision logic in TableNode.text()
|
||||
print(f"\n🔍 TRACING TableNode.text() DECISION LOGIC:")
|
||||
|
||||
# Check if cache exists
|
||||
has_cache = hasattr(target_table, '_text_cache') and target_table._text_cache is not None
|
||||
print(f"Has cached text: {has_cache}")
|
||||
|
||||
if has_cache:
|
||||
print(f"❗ Using cached result - clearing cache to test renderer...")
|
||||
target_table._text_cache = None
|
||||
|
||||
# Check the config decision
|
||||
config_obj = getattr(target_table, '_config', None)
|
||||
should_use_fast = config_obj and getattr(config_obj, 'fast_table_rendering', False)
|
||||
print(f"Config object exists: {'✅' if config_obj else '❌'}")
|
||||
print(f"Should use fast rendering: {'✅' if should_use_fast else '❌'}")
|
||||
|
||||
# Test both renderers directly
|
||||
print(f"\n🧪 TESTING BOTH RENDERERS DIRECTLY:")
|
||||
|
||||
# Test Rich renderer
|
||||
try:
|
||||
print("Rich renderer test:")
|
||||
rich_table = target_table.render(width=195)
|
||||
from edgar.richtools import rich_to_text
|
||||
rich_text = rich_to_text(rich_table)
|
||||
rich_has_pipes = '|' in rich_text
|
||||
print(f" Rich output has pipes: {'✅' if rich_has_pipes else '❌'}")
|
||||
print(f" Rich output length: {len(rich_text)} chars")
|
||||
print(f" Rich preview: {rich_text[:80]}...")
|
||||
except Exception as e:
|
||||
print(f" Rich renderer error: {e}")
|
||||
|
||||
# Test Fast renderer
|
||||
try:
|
||||
print("Fast renderer test:")
|
||||
fast_text = target_table._fast_text_rendering()
|
||||
fast_has_pipes = '|' in fast_text
|
||||
print(f" Fast output has pipes: {'✅' if fast_has_pipes else '❌'}")
|
||||
print(f" Fast output length: {len(fast_text)} chars")
|
||||
print(f" Fast preview: {fast_text[:80]}...")
|
||||
except Exception as e:
|
||||
print(f" Fast renderer error: {e}")
|
||||
|
||||
# Test current text() method
|
||||
print("Current text() method:")
|
||||
current_text = target_table.text()
|
||||
current_has_pipes = '|' in current_text
|
||||
print(f" Current output has pipes: {'✅' if current_has_pipes else '❌'}")
|
||||
print(f" Current output length: {len(current_text)} chars")
|
||||
print(f" Current preview: {current_text[:80]}...")
|
||||
|
||||
# Determine which renderer is actually being used
|
||||
if current_has_pipes and len(current_text) < 2000:
|
||||
print(f"\n🎯 CONCLUSION: Currently using FAST RENDERER ✅")
|
||||
elif not current_has_pipes and len(current_text) > 1500:
|
||||
print(f"\n🎯 CONCLUSION: Currently using RICH RENDERER ❌")
|
||||
else:
|
||||
print(f"\n🤔 CONCLUSION: Unclear which renderer is being used")
|
||||
|
||||
except Exception as e:
|
||||
print(f"❌ Error: {e}")
|
||||
import traceback
|
||||
traceback.print_exc()
|
||||
|
||||
def test_explicit_configurations():
|
||||
"""Test with explicit fast and rich configurations."""
|
||||
print(f"\n🧪 TESTING EXPLICIT CONFIGURATIONS")
|
||||
print("=" * 60)
|
||||
|
||||
configs = [
|
||||
("Explicit Fast", ParserConfig(fast_table_rendering=True)),
|
||||
("Explicit Rich", ParserConfig(fast_table_rendering=False)),
|
||||
]
|
||||
|
||||
try:
|
||||
with open('/Users/dwight/PycharmProjects/edgartools/data/html/MSFT.10-K.html', 'r') as f:
|
||||
html_content = f.read()
|
||||
|
||||
for config_name, config in configs:
|
||||
print(f"\n🔧 {config_name} (fast_table_rendering={config.fast_table_rendering}):")
|
||||
|
||||
parser = HTMLParser(config)
|
||||
document = parser.parse(html_content)
|
||||
|
||||
# Find table
|
||||
target_table = None
|
||||
def find_target(node):
|
||||
nonlocal target_table
|
||||
if isinstance(node, TableNode):
|
||||
try:
|
||||
if "Weighted average outstanding shares" in node.text():
|
||||
target_table = node
|
||||
return
|
||||
except:
|
||||
pass
|
||||
if hasattr(node, 'children'):
|
||||
for child in node.children:
|
||||
find_target(child)
|
||||
|
||||
find_target(document.root)
|
||||
|
||||
if target_table:
|
||||
table_text = target_table.text()
|
||||
has_pipes = '|' in table_text
|
||||
print(f" Output has pipes: {'✅' if has_pipes else '❌'}")
|
||||
print(f" Output length: {len(table_text)} chars")
|
||||
print(f" Preview: {table_text[:60]}...")
|
||||
else:
|
||||
print(f" ❌ Table not found")
|
||||
|
||||
except Exception as e:
|
||||
print(f"❌ Error: {e}")
|
||||
|
||||
if __name__ == "__main__":
|
||||
check_renderer_usage()
|
||||
test_explicit_configurations()
|
||||
@@ -0,0 +1,46 @@
|
||||
from edgar import Company
|
||||
from collections import defaultdict
|
||||
import json
|
||||
|
||||
# Get Apple facts
|
||||
aapl = Company("AAPL")
|
||||
facts = aapl.facts
|
||||
|
||||
# Get raw facts data - access internal facts list
|
||||
raw_facts = facts._facts # Access internal facts list
|
||||
|
||||
# Look for Revenue facts in 2020 and 2019
|
||||
revenue_facts = []
|
||||
for fact in raw_facts:
|
||||
if fact.concept and 'Revenue' in fact.concept:
|
||||
if fact.fiscal_year in [2019, 2020]:
|
||||
revenue_facts.append({
|
||||
'concept': fact.concept,
|
||||
'value': fact.value,
|
||||
'fy': fact.fiscal_year,
|
||||
'fp': fact.fiscal_period,
|
||||
'period_end': str(fact.period_end) if fact.period_end else None,
|
||||
'period_duration': getattr(fact, 'period_duration', None),
|
||||
'statement': fact.statement_type,
|
||||
'filing_date': str(fact.filing_date) if fact.filing_date else None
|
||||
})
|
||||
|
||||
print("Revenue facts for 2019-2020:")
|
||||
print(json.dumps(revenue_facts, indent=2, default=str))
|
||||
|
||||
# Group by fiscal year and period
|
||||
by_year_period = defaultdict(list)
|
||||
for fact in revenue_facts:
|
||||
key = f"{fact['fy']}-{fact['fp']}"
|
||||
by_year_period[key].append(fact)
|
||||
|
||||
print("\n\nGrouped by fiscal year and period:")
|
||||
for key in sorted(by_year_period.keys()):
|
||||
print(f"\n{key}:")
|
||||
for fact in by_year_period[key]:
|
||||
print(f" {fact['concept']}: ${fact['value']:,} (duration: {fact['period_duration']} days)")
|
||||
|
||||
# Now check what the income statement method returns
|
||||
print("\n\nIncome statement for 2019-2020 (annual=True):")
|
||||
income = facts.income_statement(annual=True, periods=6)
|
||||
print(income)
|
||||
@@ -0,0 +1,89 @@
|
||||
from edgar import Company
|
||||
from collections import defaultdict
|
||||
|
||||
# Get Apple facts
|
||||
aapl = Company("AAPL")
|
||||
facts = aapl.facts
|
||||
|
||||
# Get raw facts data - access internal facts list
|
||||
raw_facts = facts._facts # Access internal facts list
|
||||
|
||||
# Look for all facts in Income Statement for 2019-2020
|
||||
income_facts = defaultdict(lambda: defaultdict(list))
|
||||
for fact in raw_facts:
|
||||
if fact.statement_type == 'IncomeStatement':
|
||||
if fact.fiscal_year in [2019, 2020]:
|
||||
key = f"{fact.fiscal_year}-{fact.fiscal_period}"
|
||||
income_facts[fact.concept][key].append({
|
||||
'value': fact.value,
|
||||
'period_end': fact.period_end,
|
||||
'filing_date': fact.filing_date
|
||||
})
|
||||
|
||||
# Find Revenue/Revenues concept
|
||||
revenue_concepts = []
|
||||
for concept in income_facts.keys():
|
||||
if 'Revenue' in concept and 'Contract' not in concept:
|
||||
revenue_concepts.append(concept)
|
||||
|
||||
print("Revenue concepts found:", revenue_concepts)
|
||||
print("\nRevenue values by year-period:")
|
||||
|
||||
for concept in revenue_concepts:
|
||||
print(f"\n{concept}:")
|
||||
for period in sorted(income_facts[concept].keys()):
|
||||
facts_list = income_facts[concept][period]
|
||||
for f in facts_list:
|
||||
print(f" {period}: ${f['value']:,}")
|
||||
|
||||
# Check what periods are actually marked as FY
|
||||
print("\n\nAll FY periods in Income Statement:")
|
||||
fy_periods = set()
|
||||
for fact in raw_facts:
|
||||
if fact.statement_type == 'IncomeStatement' and fact.fiscal_period == 'FY':
|
||||
fy_periods.add((fact.fiscal_year, fact.fiscal_period, fact.period_end))
|
||||
|
||||
for year, period, end_date in sorted(fy_periods):
|
||||
print(f" {year} {period} (ends {end_date})")
|
||||
|
||||
# Now check what exact facts are selected for 2019 and 2020
|
||||
print("\n\nChecking what's selected for income statement:")
|
||||
from edgar.entity.enhanced_statement import EnhancedStatementBuilder
|
||||
|
||||
builder = EnhancedStatementBuilder()
|
||||
stmt_facts = [f for f in raw_facts if f.statement_type == 'IncomeStatement']
|
||||
|
||||
# Build period info like the builder does
|
||||
period_info = {}
|
||||
period_facts_map = defaultdict(list)
|
||||
|
||||
for fact in stmt_facts:
|
||||
period_key = (fact.fiscal_year, fact.fiscal_period)
|
||||
period_label = f"{fact.fiscal_period} {fact.fiscal_year}"
|
||||
|
||||
period_facts_map[period_label].append(fact)
|
||||
|
||||
if period_key not in period_info:
|
||||
period_info[period_key] = {
|
||||
'label': period_label,
|
||||
'end_date': fact.period_end,
|
||||
'is_annual': fact.fiscal_period == 'FY',
|
||||
'filing_date': fact.filing_date,
|
||||
'fiscal_year': fact.fiscal_year,
|
||||
'fiscal_period': fact.fiscal_period
|
||||
}
|
||||
|
||||
# Get annual periods
|
||||
annual_periods = [(pk, info) for pk, info in period_info.items() if info['is_annual']]
|
||||
annual_periods.sort(key=lambda x: x[0][0] if x[0][0] else 0, reverse=True)
|
||||
|
||||
print("\nAnnual periods found (sorted newest first):")
|
||||
for (year, period), info in annual_periods[:10]:
|
||||
print(f" {info['label']} - ends {info['end_date']}")
|
||||
|
||||
# Check if there are any revenue facts for FY 2019 and FY 2020
|
||||
print("\n\nRevenue facts for FY periods:")
|
||||
for fact in raw_facts:
|
||||
if fact.statement_type == 'IncomeStatement' and fact.fiscal_period == 'FY':
|
||||
if fact.fiscal_year in [2019, 2020] and 'Revenue' in str(fact.concept):
|
||||
print(f" {fact.fiscal_year} {fact.fiscal_period}: {fact.concept} = ${fact.value:,}")
|
||||
@@ -0,0 +1,37 @@
|
||||
from edgar import Company
|
||||
from collections import defaultdict
|
||||
|
||||
# Get Apple facts
|
||||
aapl = Company("AAPL")
|
||||
facts = aapl.facts
|
||||
raw_facts = facts._facts
|
||||
|
||||
# Check how period_info is built
|
||||
stmt_facts = [f for f in raw_facts if f.statement_type == 'IncomeStatement']
|
||||
|
||||
# Track all unique combinations
|
||||
all_combos = set()
|
||||
period_end_by_key = defaultdict(set)
|
||||
|
||||
for fact in stmt_facts:
|
||||
if fact.fiscal_period == 'FY' and fact.fiscal_year and fact.fiscal_year >= 2019:
|
||||
period_key = (fact.fiscal_year, fact.fiscal_period)
|
||||
all_combos.add((fact.fiscal_year, fact.fiscal_period, fact.period_end))
|
||||
period_end_by_key[period_key].add(fact.period_end)
|
||||
|
||||
print("Period keys and their different period_end dates:")
|
||||
for key in sorted(period_end_by_key.keys(), reverse=True):
|
||||
year, period = key
|
||||
if year >= 2019 and year <= 2024:
|
||||
ends = period_end_by_key[key]
|
||||
print(f"\n({year}, '{period}'): {len(ends)} different period_ends")
|
||||
for end in sorted(ends):
|
||||
match = "✓" if end and end.year == year else "✗"
|
||||
print(f" {end} {match}")
|
||||
|
||||
# The problem: period_info dict only keeps ONE per key
|
||||
print("\n\nProblem: The current code builds period_info as a dict,")
|
||||
print("so it only keeps ONE fact per (fiscal_year, fiscal_period) key!")
|
||||
print("We lose all the other period_end variations when we do:")
|
||||
print(" if period_key not in period_info:")
|
||||
print(" period_info[period_key] = {...} # Only first one is kept!")
|
||||
@@ -0,0 +1,83 @@
|
||||
from edgar import Company
|
||||
from edgar.entity.enhanced_statement import EnhancedStatementBuilder
|
||||
from collections import defaultdict
|
||||
|
||||
# Get Apple facts
|
||||
aapl = Company("AAPL")
|
||||
facts = aapl.facts
|
||||
raw_facts = facts._facts
|
||||
|
||||
# Build statement manually to debug
|
||||
builder = EnhancedStatementBuilder()
|
||||
stmt_facts = [f for f in raw_facts if f.statement_type == 'IncomeStatement']
|
||||
|
||||
# Build period info with new key structure
|
||||
period_info = {}
|
||||
period_facts = defaultdict(list)
|
||||
|
||||
for fact in stmt_facts:
|
||||
period_key = (fact.fiscal_year, fact.fiscal_period, fact.period_end)
|
||||
|
||||
if period_key not in period_info:
|
||||
period_info[period_key] = {
|
||||
'label': f"{fact.fiscal_period} {fact.fiscal_year}",
|
||||
'end_date': fact.period_end,
|
||||
'is_annual': fact.fiscal_period == 'FY',
|
||||
'filing_date': fact.filing_date,
|
||||
'fiscal_year': fact.fiscal_year,
|
||||
'fiscal_period': fact.fiscal_period
|
||||
}
|
||||
|
||||
period_facts[period_key].append(fact)
|
||||
|
||||
# Apply the annual filtering logic
|
||||
period_list = [(pk, info) for pk, info in period_info.items()]
|
||||
|
||||
true_annual_periods = []
|
||||
for pk, info in period_list:
|
||||
if not info['is_annual']:
|
||||
continue
|
||||
|
||||
fiscal_year = pk[0]
|
||||
period_end_date = pk[2]
|
||||
|
||||
# Check if fiscal_year matches period_end.year
|
||||
if not (period_end_date and period_end_date.year == fiscal_year):
|
||||
continue
|
||||
|
||||
# Check duration
|
||||
period_fact_list = period_facts.get(pk, [])
|
||||
if period_fact_list:
|
||||
sample_fact = period_fact_list[0]
|
||||
if sample_fact.period_start and sample_fact.period_end:
|
||||
duration = (sample_fact.period_end - sample_fact.period_start).days
|
||||
if duration > 300:
|
||||
true_annual_periods.append((pk, info))
|
||||
# Find revenue for this period
|
||||
for fact in period_fact_list:
|
||||
if 'RevenueFromContract' in str(fact.concept) and 'Liability' not in str(fact.concept):
|
||||
print(f"Selected: FY {fiscal_year} ends {period_end_date}: ${fact.value:,.0f} (duration: {duration} days)")
|
||||
break
|
||||
|
||||
print(f"\nTotal true annual periods found: {len(true_annual_periods)}")
|
||||
|
||||
# Check what's in the final selection
|
||||
annual_by_year = {}
|
||||
for pk, info in true_annual_periods:
|
||||
fiscal_year = pk[0]
|
||||
period_end_date = pk[2]
|
||||
if fiscal_year not in annual_by_year or period_end_date > annual_by_year[fiscal_year][0][2]:
|
||||
annual_by_year[fiscal_year] = (pk, info)
|
||||
|
||||
sorted_periods = sorted(annual_by_year.items(), key=lambda x: x[0], reverse=True)
|
||||
selected = [period_info for year, period_info in sorted_periods[:6]]
|
||||
|
||||
print(f"\nFinal selected periods:")
|
||||
for (year, period, end), info in selected:
|
||||
print(f" FY {year} ends {end}")
|
||||
# Find revenue for this period
|
||||
for fact in period_facts[(year, period, end)]:
|
||||
if 'RevenueFromContract' in str(fact.concept) and 'Liability' not in str(fact.concept):
|
||||
duration = (fact.period_end - fact.period_start).days if fact.period_start else None
|
||||
print(f" Revenue: ${fact.value:,.0f} (duration: {duration} days)")
|
||||
break
|
||||
@@ -0,0 +1,33 @@
|
||||
from edgar import Company
|
||||
|
||||
# Get Apple facts and display income statement
|
||||
aapl = Company("AAPL")
|
||||
facts = aapl.facts
|
||||
|
||||
print("Testing with annual=True, periods=6:")
|
||||
income = facts.income_statement(annual=True, periods=6)
|
||||
|
||||
# Get the internal data
|
||||
items = income.items
|
||||
|
||||
# Find the Total Revenue item
|
||||
for item in items:
|
||||
if "Revenue" in item.label and "Total" in item.label:
|
||||
print(f"\n{item.label}:")
|
||||
print(f" Values: {item.values}")
|
||||
print(f" Periods: {income.periods}")
|
||||
|
||||
# Show what values we have
|
||||
for i, (period, value) in enumerate(zip(income.periods, item.values)):
|
||||
if value:
|
||||
print(f" {period}: {value}")
|
||||
|
||||
# Let's also check what raw facts we have
|
||||
print("\n\nChecking raw facts for FY 2019 and FY 2020:")
|
||||
raw_facts = facts._facts
|
||||
for fact in raw_facts:
|
||||
if fact.statement_type == 'IncomeStatement' and fact.fiscal_period == 'FY':
|
||||
if fact.fiscal_year in [2019, 2020]:
|
||||
if 'RevenueFromContract' in str(fact.concept) and 'Liability' not in str(fact.concept):
|
||||
match = "✓" if fact.period_end and fact.period_end.year == fact.fiscal_year else "✗"
|
||||
print(f" FY {fact.fiscal_year} ends {fact.period_end}: ${fact.value:,.0f} {match}")
|
||||
@@ -0,0 +1,71 @@
|
||||
from edgar import Company
|
||||
from edgar.entity.enhanced_statement import EnhancedStatementBuilder
|
||||
|
||||
# Get Apple facts
|
||||
aapl = Company("AAPL")
|
||||
facts = aapl.facts
|
||||
raw_facts = facts._facts
|
||||
|
||||
# Build statement manually to debug
|
||||
builder = EnhancedStatementBuilder()
|
||||
stmt_facts = [f for f in raw_facts if f.statement_type == 'IncomeStatement']
|
||||
|
||||
# Build period info
|
||||
from collections import defaultdict
|
||||
period_info = {}
|
||||
period_facts_map = defaultdict(list)
|
||||
|
||||
for fact in stmt_facts:
|
||||
period_key = (fact.fiscal_year, fact.fiscal_period)
|
||||
period_label = f"{fact.fiscal_period} {fact.fiscal_year}"
|
||||
|
||||
period_facts_map[period_label].append(fact)
|
||||
|
||||
if period_key not in period_info:
|
||||
period_info[period_key] = {
|
||||
'label': period_label,
|
||||
'end_date': fact.period_end,
|
||||
'is_annual': fact.fiscal_period == 'FY',
|
||||
'filing_date': fact.filing_date,
|
||||
'fiscal_year': fact.fiscal_year,
|
||||
'fiscal_period': fact.fiscal_period
|
||||
}
|
||||
|
||||
# Create list of periods
|
||||
period_list = [(pk, info) for pk, info in period_info.items()]
|
||||
|
||||
# Filter for annual
|
||||
annual_periods = [(pk, info) for pk, info in period_list if info['is_annual']]
|
||||
print(f"Total annual periods before sort: {len(annual_periods)}")
|
||||
|
||||
# Sort by end_date
|
||||
annual_periods.sort(key=lambda x: x[1]['end_date'], reverse=True)
|
||||
|
||||
print("\nFirst 10 annual periods after sorting by end_date:")
|
||||
for i, ((year, period), info) in enumerate(annual_periods[:10]):
|
||||
print(f" {i}: FY {year} - ends {info['end_date']}")
|
||||
|
||||
# Deduplicate by fiscal year
|
||||
seen_years = set()
|
||||
unique_annual_periods = []
|
||||
for pk, info in annual_periods:
|
||||
fiscal_year = pk[0]
|
||||
if fiscal_year not in seen_years:
|
||||
seen_years.add(fiscal_year)
|
||||
unique_annual_periods.append((pk, info))
|
||||
print(f" Keeping: FY {fiscal_year} ending {info['end_date']}")
|
||||
|
||||
print(f"\nUnique annual periods: {len(unique_annual_periods)}")
|
||||
print("\nFirst 6 unique periods:")
|
||||
for (year, period), info in unique_annual_periods[:6]:
|
||||
print(f" FY {year} - ends {info['end_date']}")
|
||||
|
||||
# Check what revenue value we have for those periods
|
||||
print("\nRevenue values for selected periods:")
|
||||
for (year, fp), info in unique_annual_periods[:6]:
|
||||
period_label = info['label']
|
||||
# Find revenue fact for this period
|
||||
for fact in period_facts_map[period_label]:
|
||||
if 'RevenueFromContract' in str(fact.concept) and 'Liability' not in str(fact.concept):
|
||||
print(f" {period_label}: {fact.concept} = ${fact.value:,}")
|
||||
break
|
||||
@@ -0,0 +1,71 @@
|
||||
from edgar import Company
|
||||
from edgar.entity.enhanced_statement import EnhancedStatementBuilder
|
||||
from collections import defaultdict
|
||||
|
||||
# Get Apple facts
|
||||
aapl = Company("AAPL")
|
||||
facts = aapl.facts
|
||||
raw_facts = facts._facts
|
||||
|
||||
# Build statement manually to debug
|
||||
builder = EnhancedStatementBuilder()
|
||||
stmt_facts = [f for f in raw_facts if f.statement_type == 'IncomeStatement']
|
||||
|
||||
# Build period info
|
||||
period_info = {}
|
||||
period_facts_map = defaultdict(list)
|
||||
|
||||
for fact in stmt_facts:
|
||||
period_key = (fact.fiscal_year, fact.fiscal_period)
|
||||
period_label = f"{fact.fiscal_period} {fact.fiscal_year}"
|
||||
|
||||
period_facts_map[period_label].append(fact)
|
||||
|
||||
if period_key not in period_info:
|
||||
period_info[period_key] = {
|
||||
'label': period_label,
|
||||
'end_date': fact.period_end,
|
||||
'is_annual': fact.fiscal_period == 'FY',
|
||||
'filing_date': fact.filing_date,
|
||||
'fiscal_year': fact.fiscal_year,
|
||||
'fiscal_period': fact.fiscal_period
|
||||
}
|
||||
|
||||
# Apply the fix logic
|
||||
period_list = [(pk, info) for pk, info in period_info.items()]
|
||||
annual_periods = [(pk, info) for pk, info in period_list if info['is_annual']]
|
||||
|
||||
print(f"Total annual periods: {len(annual_periods)}")
|
||||
|
||||
# Apply the matching logic
|
||||
correct_annual_periods = {}
|
||||
for pk, info in annual_periods:
|
||||
fiscal_year = pk[0]
|
||||
if info['end_date'] and info['end_date'].year == fiscal_year:
|
||||
if fiscal_year not in correct_annual_periods or \
|
||||
info['end_date'] > correct_annual_periods[fiscal_year][1]['end_date']:
|
||||
correct_annual_periods[fiscal_year] = (pk, info)
|
||||
print(f" Selected FY {fiscal_year}: ends {info['end_date']}")
|
||||
|
||||
print(f"\nCorrect annual periods found: {len(correct_annual_periods)}")
|
||||
|
||||
# Sort and select
|
||||
sorted_periods = sorted(correct_annual_periods.items(), key=lambda x: x[0], reverse=True)
|
||||
selected_period_info = [period_info for year, period_info in sorted_periods[:6]]
|
||||
|
||||
print(f"\nSelected {len(selected_period_info)} periods:")
|
||||
for (year, period), info in selected_period_info:
|
||||
print(f" {info['label']}")
|
||||
|
||||
# Check what revenue facts we have for these periods
|
||||
print("\nRevenue facts for selected periods:")
|
||||
for (year, fp), info in selected_period_info:
|
||||
period_label = info['label']
|
||||
revenue_found = False
|
||||
for fact in period_facts_map[period_label]:
|
||||
if 'RevenueFromContract' in str(fact.concept) and 'Liability' not in str(fact.concept):
|
||||
print(f" {period_label}: ${fact.value:,.0f}")
|
||||
revenue_found = True
|
||||
break
|
||||
if not revenue_found:
|
||||
print(f" {period_label}: No revenue found")
|
||||
@@ -0,0 +1,262 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Debug script to investigate table parsing/rendering issues in MSFT 10-K.
|
||||
Focus on the "Weighted average outstanding shares of common stock (B)" table.
|
||||
"""
|
||||
|
||||
import sys
|
||||
sys.path.insert(0, '/Users/dwight/PycharmProjects/edgartools')
|
||||
|
||||
from edgar.documents.parser import HTMLParser
|
||||
from edgar.documents.config import ParserConfig
|
||||
from edgar.documents.table_nodes import TableNode
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
def find_table_in_html():
|
||||
"""Find and examine the table HTML structure around the target text."""
|
||||
print("🔍 EXAMINING TABLE HTML STRUCTURE")
|
||||
print("=" * 50)
|
||||
|
||||
try:
|
||||
# Read the MSFT file
|
||||
with open('/Users/dwight/PycharmProjects/edgartools/data/html/MSFT.10-K.html', 'r') as f:
|
||||
html_content = f.read()
|
||||
|
||||
print(f"File size: {len(html_content)} characters")
|
||||
|
||||
# Find the table containing our target text
|
||||
soup = BeautifulSoup(html_content, 'html.parser')
|
||||
|
||||
# Search for the specific text
|
||||
target_elements = soup.find_all(text=lambda text: text and "Weighted average outstanding shares of common stock" in text)
|
||||
|
||||
print(f"\nFound {len(target_elements)} elements with target text")
|
||||
|
||||
for i, element in enumerate(target_elements):
|
||||
print(f"\n📍 Element {i+1}:")
|
||||
print(f" Text: {element.strip()[:80]}...")
|
||||
|
||||
# Find the containing table
|
||||
parent = element.parent
|
||||
while parent and parent.name != 'table':
|
||||
parent = parent.parent
|
||||
|
||||
if parent and parent.name == 'table':
|
||||
print(f" Found containing table!")
|
||||
|
||||
# Analyze the table structure
|
||||
rows = parent.find_all('tr')
|
||||
print(f" Table has {len(rows)} rows")
|
||||
|
||||
# Look at first few rows
|
||||
for j, row in enumerate(rows[:5]):
|
||||
cells = row.find_all(['td', 'th'])
|
||||
print(f" Row {j+1}: {len(cells)} cells")
|
||||
for k, cell in enumerate(cells[:3]): # First 3 cells
|
||||
cell_text = cell.get_text().strip()[:30].replace('\n', ' ')
|
||||
print(f" Cell {k+1}: '{cell_text}...'")
|
||||
|
||||
return parent
|
||||
else:
|
||||
print(f" No containing table found")
|
||||
|
||||
return None
|
||||
|
||||
except Exception as e:
|
||||
print(f"❌ Error examining HTML: {e}")
|
||||
import traceback
|
||||
traceback.print_exc()
|
||||
return None
|
||||
|
||||
def test_parser_on_msft():
    """Test the document parser on the MSFT file."""
    print("\n🚀 TESTING DOCUMENT PARSER")
    print("=" * 50)

    try:
        # Load the MSFT 10-K fixture from disk.
        with open('/Users/dwight/PycharmProjects/edgartools/data/html/MSFT.10-K.html', 'r') as f:
            html_content = f.read()

        # Exercise the parser under each preset configuration.
        configurations = [
            ("Default", ParserConfig()),
            ("Performance", ParserConfig.for_performance()),
            ("Accuracy", ParserConfig.for_accuracy()),
        ]

        for config_name, config in configurations:
            print(f"\n🧪 Testing with {config_name} config...")

            document = HTMLParser(config).parse(html_content)

            print(f" Document parsed successfully")
            print(f" Root children: {len(document.root.children)}")

            # Depth-first search for tables containing the target phrase.
            matching_tables = []

            def find_target_tables(node):
                if isinstance(node, TableNode):
                    if "Weighted average outstanding shares of common stock" in node.text():
                        matching_tables.append(node)
                for child in node.children:
                    find_target_tables(child)

            find_target_tables(document.root)

            print(f" Found {len(matching_tables)} table(s) with target text")

            for i, table in enumerate(matching_tables):
                print(f"\n 📋 Table {i+1}:")
                print(f" Headers: {len(table.headers)} row(s)")
                print(f" Data rows: {len(table.rows)}")
                print(f" Table type: {table.table_type}")

                # Dump the detected header layout, if any.
                if table.headers:
                    print(f" Header structure:")
                    for j, header_row in enumerate(table.headers):
                        print(f" Row {j+1}: {len(header_row)} cells")
                        for k, cell in enumerate(header_row[:3]):
                            cell_text = cell.text().strip()[:20].replace('\n', ' ')
                            print(f" Cell {k+1}: '{cell_text}...'")

                print(f" First few data rows:")
                for j, row in enumerate(table.rows[:3]):
                    print(f" Row {j+1}: {len(row.cells)} cells")
                    for k, cell in enumerate(row.cells[:3]):
                        cell_text = cell.text().strip()[:20].replace('\n', ' ')
                        print(f" Cell {k+1}: '{cell_text}...'")

                # Render the table to text and preview the first lines.
                table_text = table.text()
                print(f"\n Text output ({len(table_text)} chars):")
                print(" " + "-" * 40)

                lines = table_text.split('\n')
                for line_num, line in enumerate(lines[:10]):
                    print(f" {line_num+1:2d}: {line}")
                if len(lines) > 10:
                    print(f" ... ({len(lines)-10} more lines)")
                print(" " + "-" * 40)

                # Heuristic sanity checks on the rendered output.
                issues = []
                if len(table_text.strip()) == 0:
                    issues.append("Empty text output")
                if "Weighted average outstanding shares" not in table_text:
                    issues.append("Missing target text in output")
                if table_text.count('|') < 5:  # Should have multiple columns
                    issues.append("Possibly missing column separators")
                if len(lines) < 3:
                    issues.append("Very few output lines")

                if issues:
                    print(f" ⚠️ Issues detected: {', '.join(issues)}")
                    return table  # Return problematic table for further analysis
                else:
                    print(f" ✅ Table appears to render correctly")

        return None

    except Exception as e:
        print(f"❌ Parser test failed: {e}")
        import traceback
        traceback.print_exc()
        return None
|
||||
|
||||
def analyze_table_structure(table):
    """Deep analysis of a problematic table."""
    print("\n🔬 DEEP TABLE ANALYSIS")
    print("=" * 50)

    if not table:
        print("No table to analyze")
        return

    print(f"Table type: {table.table_type}")
    print(f"Caption: {table.caption}")
    print(f"Summary: {table.summary}")

    # Per-cell dump of every header row.
    print(f"\n📋 HEADERS ({len(table.headers)} rows):")
    for row_idx, header_row in enumerate(table.headers):
        print(f" Row {row_idx+1} ({len(header_row)} cells):")
        for cell_idx, cell in enumerate(header_row):
            print(f" Cell {cell_idx+1}: colspan={cell.colspan}, rowspan={cell.rowspan}")
            print(f" text='{cell.text()[:40]}...'")
            print(f" is_header={cell.is_header}")

    # Per-cell dump of the first five data rows.
    print(f"\n📊 DATA ROWS ({len(table.rows)} rows):")
    for row_idx, row in enumerate(table.rows[:5]):
        print(f" Row {row_idx+1} ({len(row.cells)} cells):")
        for cell_idx, cell in enumerate(row.cells):
            print(f" Cell {cell_idx+1}: colspan={cell.colspan}, rowspan={cell.rowspan}")
            print(f" text='{cell.text()[:40]}...'")
            print(f" is_numeric={cell.is_numeric}")

    if len(table.rows) > 5:
        print(f" ... and {len(table.rows)-5} more rows")

    print(f"\n🖼️ TESTING DIFFERENT RENDERERS:")

    # Rich renderer path.
    try:
        rich_table = table.render(width=120)
        from edgar.richtools import rich_to_text
        rich_text = rich_to_text(rich_table)
        print(f" Rich renderer: {len(rich_text)} chars")
        print(f" Preview: {rich_text[:100]}...")
    except Exception as e:
        print(f" Rich renderer failed: {e}")

    # Fast text renderer path (private API, used for comparison only).
    try:
        fast_text = table._fast_text_rendering()
        print(f" Fast renderer: {len(fast_text)} chars")
        print(f" Preview: {fast_text[:100]}...")
    except Exception as e:
        print(f" Fast renderer failed: {e}")

    # Whatever text() currently dispatches to.
    try:
        current_text = table.text()
        print(f" Current text() method: {len(current_text)} chars")
        if "Weighted average outstanding shares" in current_text:
            print(f" ✅ Contains target text")
        else:
            print(f" ❌ Missing target text")
    except Exception as e:
        print(f" Current text() method failed: {e}")
|
||||
|
||||
if __name__ == "__main__":
    print("🎯 DEBUGGING MSFT TABLE PARSING ISSUE")
    print("Target: 'Weighted average outstanding shares of common stock (B)' table")
    print()

    # Step 1: inspect the raw HTML structure.
    table_element = find_table_in_html()

    # Step 2: run the parser under each configuration.
    problematic_table = test_parser_on_msft()

    # Step 3: drill into any table that failed the heuristics.
    if problematic_table:
        analyze_table_structure(problematic_table)
        print(f"\n🎯 CONCLUSION:")
        print("A problematic table was identified. Check the analysis above")
        print("for specific issues with parsing or rendering.")
    else:
        print(f"\n✅ CONCLUSION:")
        print("No obvious parsing issues were detected. The table appears to")
        print("be parsing and rendering correctly with the current parser.")
        print("If there are still issues, they may be subtle formatting problems.")
|
||||
@@ -0,0 +1,159 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Debug why Rich table rendering is still producing poor structure even with headers detected.
|
||||
"""
|
||||
|
||||
import sys
|
||||
sys.path.insert(0, '/Users/dwight/PycharmProjects/edgartools')
|
||||
|
||||
from edgar.documents.parser import HTMLParser
|
||||
from edgar.documents.config import ParserConfig
|
||||
from edgar.documents.table_nodes import TableNode
|
||||
|
||||
def debug_rich_rendering_issue():
    """Diagnose why Rich rendering of the target MSFT table is poor.

    Locates the table containing 'Weighted average outstanding shares' in
    the parsed MSFT 10-K, reports its header/data structure, dimensions and
    empty-cell ratio, then exercises Rich rendering and text conversion.

    Returns:
        The matching TableNode, or None if not found / on error.
    """
    print("🔍 DEBUGGING RICH RENDERING WITH DETECTED HEADERS")
    print("=" * 60)

    try:
        with open('/Users/dwight/PycharmProjects/edgartools/data/html/MSFT.10-K.html', 'r') as f:
            html_content = f.read()

        config = ParserConfig()
        parser = HTMLParser(config)
        document = parser.parse(html_content)

        # Find target table via depth-first search.
        target_table = None

        def find_target(node):
            nonlocal target_table
            if isinstance(node, TableNode):
                try:
                    if "Weighted average outstanding shares" in node.text():
                        target_table = node
                        return
                # FIX: was a bare `except:`, which also swallowed
                # SystemExit/KeyboardInterrupt during the tree walk.
                except Exception:
                    pass
            if hasattr(node, 'children'):
                for child in node.children:
                    find_target(child)

        find_target(document.root)

        if not target_table:
            print("❌ Target table not found")
            return

        print("✅ Found target table")
        print(f"Headers: {len(target_table.headers)}")
        print(f"Data rows: {len(target_table.rows)}")

        print(f"\n🔍 DETAILED TABLE STRUCTURE ANALYSIS:")

        # Check headers
        if target_table.headers:
            for i, header_row in enumerate(target_table.headers):
                print(f"\nHeader row {i+1}: {len(header_row)} cells")
                for j, cell in enumerate(header_row[:8]):  # First 8 cells
                    print(f" Cell {j+1}: '{cell.text()}' (colspan={cell.colspan}, rowspan={cell.rowspan})")

        # Check data row structure
        print(f"\n📊 DATA ROW ANALYSIS:")
        for i, row in enumerate(target_table.rows[:5]):  # First 5 data rows
            content_cells = [j for j, cell in enumerate(row.cells) if cell.text().strip()]
            print(f"Row {i+1}: {len(row.cells)} total cells, content in positions {content_cells}")

            # Show first few cells with content
            for j in content_cells[:3]:
                if j < len(row.cells):
                    cell = row.cells[j]
                    print(f" Cell {j+1}: '{cell.text()[:30]}...' (align={cell.align})")

        # Check table dimensions
        max_cols = max(len(row.cells) for row in target_table.rows) if target_table.rows else 0
        header_cols = len(target_table.headers[0]) if target_table.headers else 0
        print(f"\n📏 TABLE DIMENSIONS:")
        print(f" Header columns: {header_cols}")
        print(f" Max data columns: {max_cols}")
        print(f" Dimension mismatch: {'YES' if header_cols != max_cols else 'NO'}")

        # Count empty vs content cells
        total_cells = sum(len(row.cells) for row in target_table.rows)
        empty_cells = sum(1 for row in target_table.rows for cell in row.cells if not cell.text().strip())
        print(f" Total data cells: {total_cells}")
        # FIX: guard against ZeroDivisionError when the table has no data cells.
        if total_cells:
            print(f" Empty data cells: {empty_cells} ({empty_cells/total_cells*100:.1f}%)")
        else:
            print(f" Empty data cells: {empty_cells}")

        # Test Rich table creation manually
        print(f"\n🎨 TESTING RICH TABLE CREATION:")
        try:
            rich_table = target_table.render(width=120)
            print(f"✅ Rich table created successfully")
            print(f"Rich table type: {type(rich_table)}")

            # Check Rich table properties
            if hasattr(rich_table, 'columns'):
                print(f"Rich columns: {len(rich_table.columns)}")
            if hasattr(rich_table, 'rows'):
                print(f"Rich rows: {len(rich_table.rows)}")

        except Exception as e:
            print(f"❌ Rich table creation failed: {e}")
            import traceback
            traceback.print_exc()
            return

        # Test text conversion
        print(f"\n📝 TESTING TEXT CONVERSION:")
        try:
            from edgar.richtools import rich_to_text
            rich_text = rich_to_text(rich_table)

            lines = rich_text.split('\n')
            print(f"Text output: {len(lines)} lines, {len(rich_text)} chars")

            # Classify each line as empty / border / content.
            empty_lines = sum(1 for line in lines if not line.strip())
            border_lines = sum(1 for line in lines if any(c in line for c in '┌┐└┘├┤│─'))
            content_lines = sum(1 for line in lines if line.strip() and not all(c in '┌┐└┘├┤│─ ' for c in line))

            print(f" Empty lines: {empty_lines}")
            print(f" Border lines: {border_lines}")
            print(f" Content lines: {content_lines}")

            # Show actual structure
            print(f"\nFirst 10 lines of output:")
            for i, line in enumerate(lines[:10]):
                line_type = "EMPTY" if not line.strip() else "BORDER" if any(c in line for c in '┌┐└┘├┤│─') else "CONTENT"
                print(f" {i+1:2d} [{line_type:7}]: {line[:60]}{'...' if len(line) > 60 else ''}")

            # Very few border lines indicates Rich produced a degenerate layout.
            if border_lines < 3:
                print(f"\n❌ DIAGNOSIS: Very few border lines - Rich table structure is poor")
                print("This suggests the table has structural issues that prevent proper rendering.")
                print("Possible causes:")
                print("1. Column count mismatch between headers and data")
                print("2. Too many empty cells causing poor layout")
                print("3. Cell spanning issues")
                print("4. Table too wide for rendering width")
            else:
                print(f"\n✅ Rich table structure appears normal")

        except Exception as e:
            print(f"❌ Text conversion failed: {e}")
            return

        return target_table

    except Exception as e:
        print(f"❌ Error: {e}")
        import traceback
        traceback.print_exc()
        return None
|
||||
|
||||
if __name__ == "__main__":
    # Run the diagnosis, then point the reader at the findings.
    debug_rich_rendering_issue()

    print(f"\n🎯 NEXT STEPS:")
    print("Based on the analysis above, we can identify specific issues preventing")
    print("proper Rich table rendering and address them systematically.")
|
||||
@@ -0,0 +1,61 @@
|
||||
from edgar import Company
from edgar.entity.enhanced_statement import EnhancedStatementBuilder

# Get Apple facts
aapl = Company("AAPL")
facts = aapl.facts

# Build a six-period annual income statement.
builder = EnhancedStatementBuilder()
stmt = builder.build_multi_period_statement(
    facts=facts._facts,
    statement_type='IncomeStatement',
    periods=6,
    annual=True
)

print(f"Selected periods: {stmt.periods}")
print("\nChecking Revenue item values:")

# Locate the total-revenue line item and dump its per-period values.
for item in stmt.items:
    if item.label and 'Revenue' in item.label and 'Total' in item.label:
        print(f"\n{item.label}:")
        for period, value in zip(stmt.periods, item.values):
            print(f" {period}: {value}")

        # Check what concept this maps to
        if hasattr(item, 'concept'):
            print(f" Concept: {item.concept}")

# Now let's check what facts are in period_facts_by_label
print("\n\nChecking what facts are in the FY 2020 period:")
from collections import defaultdict

# Recreate the grouping the builder performs internally.
raw_facts = facts._facts
stmt_facts = [f for f in raw_facts if f.statement_type == 'IncomeStatement']

# Group by the builder's composite key: (fiscal_year, fiscal_period, period_end).
period_facts = defaultdict(list)
for fact in stmt_facts:
    period_facts[(fact.fiscal_year, fact.fiscal_period, fact.period_end)].append(fact)

# Inspect FY 2020 whose period actually ends in calendar 2020.
for key in period_facts.keys():
    if key[0] == 2020 and key[1] == 'FY':
        if key[2] and key[2].year == 2020:  # Correct match
            print(f"\nKey: {key}")
            # Check revenue facts in this period
            for fact in period_facts[key]:
                if 'RevenueFromContract' in str(fact.concept) and 'Liability' not in str(fact.concept):
                    duration = None
                    if fact.period_start:
                        duration = (fact.period_end - fact.period_start).days
                    print(f" Revenue: ${fact.value:,.0f} (duration: {duration})")

# The issue might be in how period_facts_by_label is built
print("\n\nChecking period_facts_by_label mapping:")
# This is what happens in the builder after selection
# It remaps from period_key to label, but multiple keys can have the same label!
|
||||
@@ -0,0 +1,190 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Debug the table structure to understand why we're getting so many empty columns.
|
||||
"""
|
||||
|
||||
import sys
|
||||
sys.path.insert(0, '/Users/dwight/PycharmProjects/edgartools')
|
||||
|
||||
from edgar.documents.parser import HTMLParser
|
||||
from edgar.documents.config import ParserConfig
|
||||
from edgar.documents.table_nodes import TableNode
|
||||
|
||||
def analyze_table_structure():
    """Analyze the cell layout of the target MSFT table.

    Parses the MSFT 10-K fixture, finds the table containing
    'Weighted average outstanding shares', prints per-cell detail for the
    headers and first data rows, and reports empty-cell statistics.

    Returns:
        The matching TableNode, or None if not found / on error.
    """
    print("🔍 ANALYZING TABLE STRUCTURE")
    print("=" * 50)

    try:
        with open('/Users/dwight/PycharmProjects/edgartools/data/html/MSFT.10-K.html', 'r') as f:
            html_content = f.read()

        config = ParserConfig(fast_table_rendering=True)
        parser = HTMLParser(config)
        document = parser.parse(html_content)

        # Find target table via depth-first search.
        target_table = None

        def find_target(node):
            nonlocal target_table
            if isinstance(node, TableNode):
                try:
                    if "Weighted average outstanding shares" in node.text():
                        target_table = node
                        return
                # FIX: was a bare `except:` which also swallowed
                # SystemExit/KeyboardInterrupt during the walk.
                except Exception:
                    pass
            if hasattr(node, 'children'):
                for child in node.children:
                    find_target(child)

        find_target(document.root)

        if not target_table:
            print("❌ Target table not found")
            return

        print("✅ Found target table")

        # Analyze the structure
        print(f"\nTable structure:")
        print(f" Headers: {len(target_table.headers)} rows")
        print(f" Data rows: {len(target_table.rows)}")

        # Analyze header structure
        print(f"\n📋 HEADER ANALYSIS:")
        for i, header_row in enumerate(target_table.headers):
            print(f" Header row {i+1}: {len(header_row)} cells")
            for j, cell in enumerate(header_row[:10]):  # First 10 cells
                text = cell.text().strip()
                display_text = text[:20] if text else "[EMPTY]"
                print(f" Cell {j+1}: '{display_text}' (colspan={cell.colspan})")

        # Analyze data rows
        print(f"\n📊 DATA ROW ANALYSIS:")
        for i, row in enumerate(target_table.rows[:5]):  # First 5 rows
            print(f" Row {i+1}: {len(row.cells)} cells")
            for j, cell in enumerate(row.cells[:10]):  # First 10 cells
                text = cell.text().strip()
                display_text = text[:20] if text else "[EMPTY]"
                print(f" Cell {j+1}: '{display_text}' (colspan={cell.colspan})")

        # Count empty vs filled cells across headers and data.
        total_cells = 0
        empty_cells = 0

        for header_row in target_table.headers:
            for cell in header_row:
                total_cells += 1
                if not cell.text().strip():
                    empty_cells += 1

        for row in target_table.rows:
            for cell in row.cells:
                total_cells += 1
                if not cell.text().strip():
                    empty_cells += 1

        print(f"\n📊 CELL STATISTICS:")
        print(f" Total cells: {total_cells}")
        print(f" Empty cells: {empty_cells}")
        print(f" Filled cells: {total_cells - empty_cells}")
        # FIX: guard against ZeroDivisionError for a table with no cells at all.
        if total_cells:
            print(f" Empty percentage: {empty_cells/total_cells*100:.1f}%")

        # FIX: the original re-scanned each row in a nested loop with a
        # redundant `row.cells[:len(row.cells)]` slice; counting the
        # non-empty cells once per row yields the same result.
        max_meaningful_cols = 0
        for row in target_table.rows:
            meaningful_cols = sum(1 for c in row.cells if c.text().strip())
            max_meaningful_cols = max(max_meaningful_cols, meaningful_cols)

        print(f" Maximum meaningful columns in any row: {max_meaningful_cols}")

        return target_table

    except Exception as e:
        print(f"❌ Error: {e}")
        import traceback
        traceback.print_exc()
        return None
|
||||
|
||||
def test_column_filtering():
    """Test filtering out empty columns."""
    print(f"\n🔧 TESTING COLUMN FILTERING")
    print("=" * 50)

    target_table = analyze_table_structure()
    if not target_table:
        return

    if not target_table.rows:
        print("No data rows to analyze")
        return

    max_cols = max(len(row.cells) for row in target_table.rows)
    print(f"Maximum columns: {max_cols}")

    # A column is "meaningful" when any header or data cell in it has text.
    def column_has_content(col_idx):
        for header_row in target_table.headers:
            if col_idx < len(header_row) and header_row[col_idx].text().strip():
                return True
        for row in target_table.rows:
            if col_idx < len(row.cells) and row.cells[col_idx].text().strip():
                return True
        return False

    meaningful_columns = [c for c in range(max_cols) if column_has_content(c)]

    print(f"Meaningful columns: {meaningful_columns} ({len(meaningful_columns)} total)")

    # Preview the first data row restricted to the meaningful columns.
    print(f"\n📊 FILTERED TABLE PREVIEW:")
    if target_table.rows:
        first_row = target_table.rows[0]
        filtered_cells = []
        for col_idx in meaningful_columns:
            if col_idx < len(first_row.cells):
                cell_text = first_row.cells[col_idx].text().strip()
                filtered_cells.append(cell_text if cell_text else "[EMPTY]")
            else:
                filtered_cells.append("[MISSING]")
        print("First row filtered:", " | ".join(filtered_cells))

    return meaningful_columns
|
||||
|
||||
if __name__ == "__main__":
    print("🎯 DEBUGGING TABLE STRUCTURE ISSUE")
    print("Focus: Understanding why we get so many empty columns")
    print()

    meaningful_cols = test_column_filtering()

    if meaningful_cols:
        print(f"\n🎯 FINDINGS:")
        print(f"The table has many empty spacing columns.")
        print(f"Only {len(meaningful_cols)} out of many columns have actual content.")
        print(f"The FastTableRenderer should filter out empty columns.")

        print(f"\n🔧 SOLUTION:")
        print("Update FastTableRenderer to:")
        print("1. Identify columns with meaningful content")
        print("2. Filter out purely empty/spacing columns")
        print("3. Only render the meaningful columns")
    else:
        print("❌ Could not analyze column structure")
|
||||
@@ -0,0 +1,225 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Debug why tables are losing their structure during parsing.
|
||||
"""
|
||||
|
||||
import sys
|
||||
sys.path.insert(0, '/Users/dwight/PycharmProjects/edgartools')
|
||||
|
||||
from edgar.documents.parser import HTMLParser
|
||||
from edgar.documents.config import ParserConfig
|
||||
from edgar.documents.table_nodes import TableNode
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
def examine_raw_html_table():
    """Examine the raw HTML structure of the problematic table."""
    print("🔍 EXAMINING RAW HTML TABLE STRUCTURE")
    print("=" * 55)

    try:
        with open('/Users/dwight/PycharmProjects/edgartools/data/html/MSFT.10-K.html', 'r') as f:
            html_content = f.read()

        soup = BeautifulSoup(html_content, 'html.parser')

        # Locate any text node containing the target phrase.
        target_elements = soup.find_all(string=lambda text: text and "Weighted average outstanding shares" in text)
        if not target_elements:
            print("❌ Target text not found in HTML")
            return None

        # Walk up from the text node to its enclosing <table>.
        table_element = target_elements[0]
        while table_element and table_element.name != 'table':
            table_element = table_element.parent

        if not table_element:
            print("❌ No containing table found")
            return None

        print("✅ Found containing HTML table")

        rows = table_element.find_all('tr')
        print(f"HTML table has {len(rows)} rows")

        # Does the table declare explicit section tags?
        thead = table_element.find('thead')
        tbody = table_element.find('tbody')
        print(f"Has <thead>: {'✅' if thead else '❌'}")
        print(f"Has <tbody>: {'✅' if tbody else '❌'}")

        # Summarize the first rows: tag, colspan and text of each cell.
        print(f"\nFirst few rows analysis:")
        for i, row in enumerate(rows[:10]):
            cells = row.find_all(['td', 'th'])
            cell_info = []
            for cell in cells[:5]:  # First 5 cells
                text = cell.get_text().strip()[:20]
                tag = cell.name
                colspan = cell.get('colspan', '1')
                cell_info.append(f"{tag}({colspan}):'{text}'")

            print(f" Row {i+1}: {len(cells)} cells - {', '.join(cell_info)}")
            if len(cells) > 5:
                print(f" ... and {len(cells)-5} more cells")

        # Count explicit header cells anywhere in the table.
        th_cells = table_element.find_all('th')
        print(f"\nTotal <th> header cells: {len(th_cells)}")

        # Heuristic scan of the first rows for header-like keywords.
        header_candidates = []
        for i, row in enumerate(rows[:5]):
            cells = row.find_all(['td', 'th'])
            row_text = ' '.join(cell.get_text().strip() for cell in cells).strip()
            if any(keyword in row_text.lower() for keyword in ['year', 'ended', '2025', '2024', '2023']):
                header_candidates.append(i)
                print(f" Potential header row {i+1}: {row_text[:80]}...")

        return table_element

    except Exception as e:
        print(f"❌ Error: {e}")
        import traceback
        traceback.print_exc()
        return None
|
||||
|
||||
def debug_table_parsing_pipeline():
    """Debug how the table gets processed through the parsing pipeline.

    Parses the MSFT 10-K fixture, locates the target table, reports whether
    headers were detected, and scores the first data rows as potential
    headers using keyword heuristics.

    Returns:
        The matching TableNode, or None if not found / on error.
    """
    print(f"\n🔧 DEBUGGING TABLE PARSING PIPELINE")
    print("=" * 55)

    try:
        with open('/Users/dwight/PycharmProjects/edgartools/data/html/MSFT.10-K.html', 'r') as f:
            html_content = f.read()

        config = ParserConfig(fast_table_rendering=False)
        parser = HTMLParser(config)
        document = parser.parse(html_content)

        # Find target table via depth-first search.
        target_table = None

        def find_target(node):
            nonlocal target_table
            if isinstance(node, TableNode):
                try:
                    if "Weighted average outstanding shares" in node.text():
                        target_table = node
                        return
                # FIX: was a bare `except:` which also swallowed
                # SystemExit/KeyboardInterrupt during the walk.
                except Exception:
                    pass
            if hasattr(node, 'children'):
                for child in node.children:
                    find_target(child)

        find_target(document.root)

        if not target_table:
            print("❌ Target table not found in parsed document")
            return

        print("✅ Found target table in parsed document")

        # Analyze how the table was parsed
        print(f"\nParsed table analysis:")
        print(f" Table type: {target_table.table_type}")
        print(f" Has headers: {'✅' if target_table.headers else '❌'}")
        print(f" Header rows: {len(target_table.headers)}")
        print(f" Data rows: {len(target_table.rows)}")
        print(f" Caption: {target_table.caption}")

        # Check if headers were detected
        if target_table.headers:
            print(f"\n Header structure:")
            for i, header_row in enumerate(target_table.headers):
                header_texts = [cell.text().strip()[:20] for cell in header_row]
                print(f" Header row {i+1}: {header_texts}")
        else:
            print(f"\n ❌ NO HEADERS DETECTED - This is likely the problem!")
            print(f" The parser failed to identify header rows in the HTML table.")

            # Check if any of the first few data rows look like headers
            print(f"\n First few data rows (might be misclassified headers):")
            for i, row in enumerate(target_table.rows[:5]):
                row_texts = [cell.text().strip()[:20] for cell in row.cells[:5]]
                print(f" Data row {i+1}: {row_texts}")

                row_text = ' '.join(cell.text().strip() for cell in row.cells)
                if any(keyword in row_text.lower() for keyword in ['year', 'ended', '2025', '2024', '2023', 'millions']):
                    print(f" ⚠️ This looks like it should be a header row!")

        # Test manual header detection
        print(f"\n🔍 MANUAL HEADER DETECTION TEST:")
        potential_headers = []

        for i, row in enumerate(target_table.rows[:5]):
            row_text = ' '.join(cell.text().strip() for cell in row.cells).strip()

            # Score this row as a potential header
            header_score = 0

            # Check for typical header keywords
            header_keywords = ['millions', 'year ended', 'june 30', '2025', '2024', '2023']
            for keyword in header_keywords:
                if keyword in row_text.lower():
                    header_score += 1

            # Penalize mostly-empty rows (common header spacing rows).
            # FIX: guard against ZeroDivisionError for a row with zero cells.
            if row.cells:
                empty_cells = sum(1 for cell in row.cells if not cell.text().strip())
                if empty_cells / len(row.cells) > 0.7:  # More than 70% empty
                    header_score -= 1

            # Reward rows with at least 2 cells of meaningful content.
            meaningful_cells = sum(1 for cell in row.cells if len(cell.text().strip()) > 2)
            if meaningful_cells >= 2:
                header_score += 1

            potential_headers.append((i, row, header_score, row_text))
            print(f" Row {i+1}: score={header_score}, text='{row_text[:60]}...'")

        # Find the best header candidate.
        # FIX: max() raised ValueError when the table had no data rows.
        if potential_headers:
            best_header = max(potential_headers, key=lambda x: x[2])
        else:
            best_header = None

        if best_header and best_header[2] > 0:
            print(f"\n ✅ Best header candidate: Row {best_header[0]+1} (score={best_header[2]})")
            print(f" Text: {best_header[3]}")
        else:
            print(f"\n ❌ No good header candidates found")

        return target_table

    except Exception as e:
        print(f"❌ Error: {e}")
        import traceback
        traceback.print_exc()
        return None
|
||||
|
||||
if __name__ == "__main__":
    print("🎯 DEBUGGING TABLE STRUCTURE PARSING")
    print("Focus: Why tables lose structure during parsing")
    print()

    # Step 1: inspect the raw HTML; Step 2: inspect the parsed result.
    html_table = examine_raw_html_table()
    parsed_table = debug_table_parsing_pipeline()

    print(f"\n🎯 DIAGNOSIS:")
    if html_table and parsed_table:
        print("The table exists in HTML and is being parsed into a TableNode.")
        print("The issue is likely in header detection - the parser isn't")
        print("properly identifying which rows should be headers vs data.")

        print(f"\n🔧 SOLUTION:")
        print("1. Improve header detection logic in table parsing")
        print("2. Look for rows with year indicators (2025, 2024, 2023) as headers")
        print("3. Handle tables without explicit <th> tags better")
        print("4. Keep Rich rendering as default for beautiful output")
    else:
        print("Basic table parsing is failing - need to investigate further.")
|
||||
@@ -0,0 +1,209 @@
|
||||
"""
|
||||
Check specific edge cases in our solution
|
||||
"""
|
||||
|
||||
from edgar import Company
|
||||
|
||||
def check_instant_facts():
    """Check how we handle instant facts (balance sheet items)"""
    print("\n1. INSTANT FACTS (Balance Sheet Items)")
    print("-" * 50)

    aapl = Company("AAPL")
    facts = aapl.facts._facts

    # Tally FY-2023 balance-sheet facts by whether a period_start exists.
    instant_count = 0
    duration_count = 0
    for fact in facts:
        if fact.statement_type == 'BalanceSheet' and fact.fiscal_period == 'FY':
            if fact.fiscal_year == 2023:
                if fact.period_start:
                    duration_count += 1
                else:
                    instant_count += 1

    print(f" Balance Sheet FY 2023 facts:")
    print(f" - With duration (period_start exists): {duration_count}")
    print(f" - Instant (no period_start): {instant_count}")
    print(f" ✓ Our solution handles instant facts correctly (no duration check)")
|
||||
|
||||
def check_fiscal_year_boundaries():
    """Check companies with different fiscal year ends.

    Uses Microsoft (June year-end) and Walmart (January year-end) as the
    two boundary cases and reports whether each company's annual FY-2023
    Revenue period ends in the same calendar year as its stated fiscal year.
    """
    print("\n2. FISCAL YEAR BOUNDARY ISSUES")
    print("-" * 50)

    # FIX: the MSFT and WMT loop bodies were duplicated verbatim;
    # extracted into a single helper.
    print(" Microsoft (June year-end):")
    _report_fy_revenue_period(Company("MSFT"), 2023)

    print("\n Walmart (January year-end):")
    _report_fy_revenue_period(Company("WMT"), 2023)


def _report_fy_revenue_period(company, fiscal_year):
    """Print the first annual (>300-day) FY Revenue period for *company*.

    Reports the period range, the period-end calendar year, the stated
    fiscal year, and whether the two years match. Stops after the first
    matching fact, mirroring the original per-company loops.
    """
    facts = company.facts._facts
    for fact in facts:
        if fact.statement_type == 'IncomeStatement' and fact.fiscal_period == 'FY':
            if fact.fiscal_year == fiscal_year and 'Revenue' in str(fact.concept):
                if fact.period_start and fact.period_end:
                    duration = (fact.period_end - fact.period_start).days
                    if duration > 300:
                        print(f" FY {fiscal_year}: {fact.period_start} to {fact.period_end}")
                        print(f" Period end year: {fact.period_end.year}")
                        print(f" Fiscal year: {fact.fiscal_year}")
                        match = "✓" if fact.period_end.year == fact.fiscal_year else "✗"
                        print(f" Year match: {match}")
                        return
|
||||
|
||||
def check_duration_edge_cases():
    """Check edge cases around our 300-day threshold.

    Collects FY Revenue period durations (>200 days) for several tickers and
    prints how each distinct duration would be classified by the >300-day
    annual rule, warning about any 250-300 day periods that might be annual.
    """
    print("\n3. DURATION EDGE CASES")
    print("-" * 50)

    # Collect all annual durations across companies
    test_tickers = ['AAPL', 'MSFT', 'WMT', 'JNJ', 'TSLA']
    all_durations = []

    for ticker in test_tickers:
        try:
            company = Company(ticker)
            facts = company.facts._facts

            for fact in facts:
                if fact.statement_type == 'IncomeStatement' and fact.fiscal_period == 'FY':
                    # BUG FIX: guard fiscal_year against None before comparing
                    # (the original `fact.fiscal_year >= 2020` could raise
                    # TypeError; sibling functions already use this guard).
                    if fact.fiscal_year and fact.fiscal_year >= 2020 and 'Revenue' in str(fact.concept):
                        if fact.period_start and fact.period_end:
                            duration = (fact.period_end - fact.period_start).days
                            if duration > 200:  # Collect all potentially annual
                                all_durations.append((ticker, duration))
        except Exception:
            # Best-effort per ticker; narrowed from a bare `except:` so
            # KeyboardInterrupt/SystemExit still propagate.
            pass

    # Analyze distribution
    from collections import Counter
    duration_counts = Counter(d for _, d in all_durations)

    print("   Duration distribution for FY Revenue facts:")
    for duration in sorted(duration_counts):
        count = duration_counts[duration]
        if duration < 300:
            status = "❌ Would be filtered out"
        elif duration > 400:
            status = "⚠️ Unusually long"
        else:
            status = "✓ Accepted as annual"
        print(f"      {duration} days: {count} facts - {status}")

    # Check if any annual facts are < 300 days
    short_annuals = [d for _, d in all_durations if 250 <= d < 300]
    if short_annuals:
        print(f"\n   ⚠️ WARNING: Found {len(short_annuals)} facts between 250-300 days")
        print("   These might be annual but would be filtered out")
def check_leap_year_impact():
    """Check if leap years affect our logic"""
    print("\n4. LEAP YEAR IMPACT")
    print("-" * 50)

    # 2020 was a leap year
    aapl = Company("AAPL")
    facts = aapl.facts._facts

    leap_year_durations = []
    regular_year_durations = []

    # Bucket annual (>300 day) FY revenue durations by whether the fiscal
    # year was 2020 (leap) or an adjacent regular year.
    for fact in facts:
        if fact.statement_type != 'IncomeStatement' or fact.fiscal_period != 'FY':
            continue
        if 'Revenue' not in str(fact.concept):
            continue
        if not (fact.period_start and fact.period_end):
            continue
        duration = (fact.period_end - fact.period_start).days
        if duration <= 300:
            continue
        if fact.fiscal_year == 2020:
            leap_year_durations.append(duration)
        elif fact.fiscal_year in [2019, 2021]:
            regular_year_durations.append(duration)

    if leap_year_durations and regular_year_durations:
        print(f"   Leap year (2020) durations: {set(leap_year_durations)}")
        print(f"   Regular year durations: {set(regular_year_durations)}")
        print(f"   ✓ Difference is minimal, 300-day threshold handles both")
def check_amended_filings():
    """Check how amended filings affect our logic"""
    print("\n5. AMENDED FILINGS")
    print("-" * 50)

    # Look for duplicate facts from amendments
    aapl = Company("AAPL")
    facts = aapl.facts._facts

    # Track facts by fiscal year and duration
    from collections import defaultdict
    facts_by_year_duration = defaultdict(list)

    # Collect annual (>300 day) FY-2023 revenue facts keyed by
    # (fiscal_year, duration, period_end).
    for fact in facts:
        if fact.statement_type != 'IncomeStatement' or fact.fiscal_period != 'FY':
            continue
        if fact.fiscal_year != 2023 or 'Revenue' not in str(fact.concept):
            continue
        if not (fact.period_start and fact.period_end):
            continue
        duration = (fact.period_end - fact.period_start).days
        if duration <= 300:
            continue
        key = (fact.fiscal_year, duration, fact.period_end)
        facts_by_year_duration[key].append({
            'value': fact.value,
            'filing_date': fact.filing_date,
            'accession': fact.accession if hasattr(fact, 'accession') else None,
        })

    # Check for duplicates
    for key, facts_list in facts_by_year_duration.items():
        if len(facts_list) <= 1:
            continue
        year, duration, end_date = key
        print(f"   Found {len(facts_list)} facts for FY {year} ({duration} days, ends {end_date}):")
        for f in facts_list:
            print(f"      Value: ${f['value']:,.0f}, Filed: {f['filing_date']}")
        print("   ⚠️ Multiple facts for same period - might need to pick latest filing")
# Run all checks
if __name__ == "__main__":
    banner = "=" * 60
    print(banner)
    print("EDGE CASE ANALYSIS FOR DURATION-BASED SOLUTION")
    print(banner)

    # Execute each edge-case check in order.
    for check in (check_instant_facts,
                  check_fiscal_year_boundaries,
                  check_duration_edge_cases,
                  check_leap_year_impact,
                  check_amended_filings):
        check()

    print("\n" + banner)
    print("SUMMARY OF FINDINGS")
    print(banner)
    print("\n✓ STRENGTHS:")
    print("   1. 300-day threshold works well for standard annual periods (363-365 days)")
    print("   2. Instant facts (balance sheet) handled correctly")
    print("   3. Leap years don't cause issues")
    print("\n⚠️ POTENTIAL ISSUES:")
    print("   1. Fiscal year boundary: Some companies' FY doesn't match calendar year")
    print("      - WMT FY 2023 ends in Jan 2023 (year mismatch)")
    print("   2. Amended filings might create duplicates")
    print("   3. No handling for multi-year aggregates (>400 days)")
    print("\nRECOMMENDED IMPROVEMENTS:")
    print("   1. For fiscal year matching, be more flexible:")
    print("      - Allow FY to match period_end.year OR period_end.year + 1")
    print("   2. When duplicates exist, prefer latest filing_date")
    print("   3. Add upper bound check (duration < 400) to exclude multi-year")
@@ -0,0 +1,170 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Test that the table parsing issue is actually fixed with proper config propagation.
|
||||
"""
|
||||
|
||||
import sys
|
||||
sys.path.insert(0, '/Users/dwight/PycharmProjects/edgartools')
|
||||
|
||||
from edgar.documents.parser import HTMLParser
|
||||
from edgar.documents.config import ParserConfig
|
||||
from edgar.documents.table_nodes import TableNode
|
||||
|
||||
def test_msft_table_with_proper_config():
    """Test MSFT table with proper config propagation.

    Parses the local MSFT 10-K HTML with fast_table_rendering enabled,
    locates the table containing "Weighted average outstanding shares",
    and verifies it renders with pipe-separated columns.

    Returns:
        bool: True when the table is found and properly formatted.
    """
    print("🧪 TESTING MSFT TABLE WITH PROPER CONFIG")
    print("=" * 60)

    try:
        # Parse the document with explicit config
        with open('/Users/dwight/PycharmProjects/edgartools/data/html/MSFT.10-K.html', 'r') as f:
            html_content = f.read()

        # Test with explicit fast rendering config
        config = ParserConfig(fast_table_rendering=True)
        parser = HTMLParser(config)
        document = parser.parse(html_content)

        print(f"Config fast_table_rendering: {config.fast_table_rendering}")

        # Find the target table via depth-first traversal.
        target_table = None

        def find_target(node):
            nonlocal target_table
            if isinstance(node, TableNode):
                try:
                    if "Weighted average outstanding shares" in node.text():
                        target_table = node
                        return
                except Exception:
                    # BUG FIX: narrowed from a bare `except:` so Ctrl-C /
                    # SystemExit still propagate; some tables fail to render
                    # text and are simply skipped.
                    pass
            if hasattr(node, 'children'):
                for child in node.children:
                    find_target(child)

        find_target(document.root)

        if not target_table:
            print("❌ Target table not found")
            return False

        print("✅ Found target table!")

        # Ensure config is set on the table
        target_table._config = config

        # Test the output
        table_text = target_table.text()

        print(f"\nTable output ({len(table_text)} characters):")
        print("-" * 40)
        print(table_text)
        print("-" * 40)

        # Check for proper formatting
        lines = table_text.split('\n')
        pipe_lines = [line for line in lines if '|' in line and line.strip()]

        print(f"\nFormatting analysis:")
        print(f"   Total lines: {len(lines)}")
        print(f"   Lines with pipes: {len(pipe_lines)}")
        print(f"   Contains target text: {'✅' if 'Weighted average outstanding shares' in table_text else '❌'}")

        if len(pipe_lines) > 5 and 'Weighted average outstanding shares' in table_text:
            print("✅ TABLE IS PROPERLY FORMATTED!")
            return True
        else:
            print("❌ Table formatting issues persist")
            return False

    except Exception as e:
        print(f"❌ Error: {e}")
        import traceback
        traceback.print_exc()
        return False
def verify_config_propagation():
    """Verify that table nodes receive the config during parsing.

    Parses a tiny HTML document with fast_table_rendering enabled and checks
    whether TableNode instances carry a matching `_config` attribute.

    Returns:
        True when a table is found and its config propagated, a falsy value
        otherwise.
    """
    print(f"\n🔧 VERIFYING CONFIG PROPAGATION")
    print("=" * 60)

    # We need to check if the HTMLParser properly sets config on table nodes.
    # This might require modifications to ensure config propagation.
    print("Checking if TableNodes receive config during parsing...")

    # Create a simple test HTML
    simple_html = """
    <html>
    <body>
        <table>
            <tr><td>Header 1</td><td>Header 2</td></tr>
            <tr><td>Data 1</td><td>Data 2</td></tr>
        </table>
    </body>
    </html>
    """

    config = ParserConfig(fast_table_rendering=True)
    parser = HTMLParser(config)
    document = parser.parse(simple_html)

    # Find table and check config
    table_found = False

    def check_table_config(node):
        nonlocal table_found
        if isinstance(node, TableNode):
            table_found = True
            has_config = hasattr(node, '_config')
            # (dropped the redundant `== True` comparison)
            config_matches = has_config and node._config.fast_table_rendering
            print(f"   Table found: ✅")
            print(f"   Has _config attribute: {'✅' if has_config else '❌'}")
            print(f"   Config fast_table_rendering: {'✅' if config_matches else '❌'}")

            if not has_config:
                print("   🔧 Setting config manually...")
                node._config = config
                test_text = node.text()
                print(f"   Manual config test: {'✅' if '|' in test_text else '❌'}")
                print(f"   Test output preview: {test_text[:50]}...")

            return has_config and config_matches

        # BUG FIX: the original recursion discarded the children's return
        # values, so calling this on a non-table root always yielded None and
        # `config_working` was never meaningful. Propagate the first
        # conclusive (non-None) result found in the subtree.
        result = None
        if hasattr(node, 'children'):
            for child in node.children:
                child_result = check_table_config(child)
                if result is None and child_result is not None:
                    result = child_result
        return result

    config_working = check_table_config(document.root)

    if not table_found:
        print("   ❌ No table found in simple test")
        return False

    return config_working
if __name__ == "__main__":
    print("🎯 FINAL TEST: MSFT TABLE PARSING FIX")
    print()

    # Run both checks; order matters because their console output interleaves.
    config_ok = verify_config_propagation()
    table_ok = test_msft_table_with_proper_config()

    def _mark(ok):
        # Render a truthy/falsy result as a check or cross mark.
        return '✅' if ok else '❌'

    print(f"\n🏁 FINAL RESULTS:")
    print(f"   Config propagation: {_mark(config_ok)}")
    print(f"   MSFT table formatting: {_mark(table_ok)}")

    if table_ok:
        for line in ("\n🎉 SUCCESS!",
                     "The MSFT table parsing issue has been resolved!",
                     "Tables now render with proper pipe formatting."):
            print(line)
    else:
        print("\n🔧 NEEDS WORK:")
        if not config_ok:
            print("- Config propagation to TableNodes needs to be implemented")
        print("- Table formatting still has issues")

    print("\nRecommended fix: Ensure HTMLParser sets _config on all TableNode instances")
@@ -0,0 +1,196 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Test the improved header detection logic.
|
||||
"""
|
||||
|
||||
import sys
|
||||
sys.path.insert(0, '/Users/dwight/PycharmProjects/edgartools')
|
||||
|
||||
from edgar.documents.parser import HTMLParser
|
||||
from edgar.documents.config import ParserConfig
|
||||
from edgar.documents.table_nodes import TableNode
|
||||
|
||||
def test_header_detection_improvement():
    """Exercise the improved header detection on the MSFT EPS table.

    Parses the local MSFT 10-K with the default (Rich) config, locates the
    table containing "Weighted average outstanding shares", reports detected
    header rows, and checks that Rich rendering emits box-drawing structure.

    Returns:
        bool: True when headers are detected and the Rich output is structured.
    """
    print("🔧 TESTING IMPROVED HEADER DETECTION")
    print("=" * 50)

    try:
        with open('/Users/dwight/PycharmProjects/edgartools/data/html/MSFT.10-K.html', 'r') as f:
            html_content = f.read()

        # Use default config (Rich rendering)
        config = ParserConfig()
        parser = HTMLParser(config)
        document = parser.parse(html_content)

        # Find target table
        target_table = None

        def find_target(node):
            nonlocal target_table
            if isinstance(node, TableNode):
                try:
                    if "Weighted average outstanding shares" in node.text():
                        target_table = node
                        return
                except Exception:
                    # BUG FIX: narrowed from a bare `except:`; skip tables
                    # whose text() raises without swallowing KeyboardInterrupt.
                    pass
            if hasattr(node, 'children'):
                for child in node.children:
                    find_target(child)

        find_target(document.root)

        if not target_table:
            print("❌ Target table not found")
            return False

        print("✅ Found target table")

        # Check the results
        print(f"\nImproved parsing results:")
        print(f"   Headers detected: {len(target_table.headers)} rows")
        print(f"   Data rows: {len(target_table.rows)}")

        if target_table.headers:
            print(f"\n📋 DETECTED HEADERS:")
            for i, header_row in enumerate(target_table.headers):
                header_texts = [cell.text().strip() for cell in header_row if cell.text().strip()]
                print(f"   Header row {i+1}: {header_texts}")
        else:
            print(f"\n❌ Still no headers detected")
            return False

        # Test Rich rendering with proper headers
        print(f"\n🎨 TESTING RICH RENDERING:")
        rich_table = target_table.render(width=120)
        from edgar.richtools import rich_to_text
        rich_text = rich_to_text(rich_table)

        # Check if Rich now produces structured output
        lines = rich_text.split('\n')
        structured_lines = [line for line in lines if any(c in line for c in '┌┐└┘├┤│─')]

        print(f"   Rich output length: {len(rich_text)} chars")
        print(f"   Total lines: {len(lines)}")
        print(f"   Structured lines: {len(structured_lines)}")

        if len(structured_lines) > 5:
            print(f"   ✅ Rich output is now properly structured!")

            # Show a sample of the structured output
            print(f"\n📊 RICH TABLE SAMPLE:")
            for line in lines[:10]:  # (unused enumerate index removed)
                if line.strip():
                    print(f"   {line}")

            return True
        else:
            print(f"   ❌ Rich output still lacks proper structure")
            print(f"   Sample lines:")
            for i, line in enumerate(lines[:5]):
                print(f"      {i+1}: {line[:60]}{'...' if len(line) > 60 else ''}")

            return False

    except Exception as e:
        print(f"❌ Error: {e}")
        import traceback
        traceback.print_exc()
        return False
def compare_before_after():
    """Compare table quality across all tables after the fix.

    Parses the MSFT 10-K, counts tables with detected headers and tables
    whose Rich rendering contains box-drawing structure, and prints a
    summary.

    Returns:
        bool: True when both counts are positive.
    """
    print(f"\n📊 COMPARING TABLE QUALITY ACROSS ALL TABLES")
    print("=" * 50)

    try:
        with open('/Users/dwight/PycharmProjects/edgartools/data/html/MSFT.10-K.html', 'r') as f:
            html_content = f.read()

        config = ParserConfig()
        parser = HTMLParser(config)
        document = parser.parse(html_content)

        # Collect all tables
        all_tables = []

        def collect_tables(node):
            if isinstance(node, TableNode):
                all_tables.append(node)
            if hasattr(node, 'children'):
                for child in node.children:
                    collect_tables(child)

        collect_tables(document.root)

        print(f"Found {len(all_tables)} total tables")

        # BUG FIX: guard against an empty document, which previously caused a
        # ZeroDivisionError in the percentage calculations below.
        if not all_tables:
            print("   ❌ No tables found - nothing to compare")
            return False

        # Analyze table quality
        good_tables = 0
        tables_with_headers = 0

        from edgar.richtools import rich_to_text

        for table in all_tables:
            try:
                # Count tables with headers
                if table.headers:
                    tables_with_headers += 1

                # Test Rich rendering quality
                rich_text = rich_to_text(table.render(width=120))
                structured_lines = [line for line in rich_text.split('\n')
                                    if any(c in line for c in '┌┐└┘├┤│─')]
                if len(structured_lines) > 3:
                    good_tables += 1
            except Exception:
                pass  # Skip problematic tables

        total = len(all_tables)
        print(f"\nTable quality summary:")
        print(f"   Tables with headers: {tables_with_headers}/{total} ({tables_with_headers/total*100:.1f}%)")
        print(f"   Well-structured tables: {good_tables}/{total} ({good_tables/total*100:.1f}%)")

        if tables_with_headers > 0:
            print(f"   ✅ Header detection is working!")
        else:
            print(f"   ❌ Header detection still needs work")

        if good_tables > 0:
            print(f"   ✅ Some tables now render with proper structure!")
        else:
            print(f"   ❌ Rich rendering still needs improvement")

        return tables_with_headers > 0 and good_tables > 0

    except Exception as e:
        print(f"❌ Error: {e}")
        return False
if __name__ == "__main__":
    print("🎯 TESTING IMPROVED TABLE PARSING")
    print("Focus: Better header detection for Rich table rendering")
    print()

    # Run the targeted check first, then the whole-document comparison.
    target_success = test_header_detection_improvement()
    overall_success = compare_before_after()

    def _mark(ok):
        # Render a truthy/falsy result as a check or cross mark.
        return '✅' if ok else '❌'

    print(f"\n🏁 FINAL RESULTS:")
    print(f"   Target table fixed: {_mark(target_success)}")
    print(f"   Overall improvement: {_mark(overall_success)}")

    if target_success and overall_success:
        print(f"\n🎉 SUCCESS!")
        print("The table parsing issue has been resolved!")
        print("Tables now render with beautiful Rich formatting!")
    elif target_success:
        print(f"\n🎯 PARTIAL SUCCESS!")
        print("The target table is fixed, but more work needed on other tables.")
    else:
        print(f"\n🔧 MORE WORK NEEDED")
        print("Header detection improvements aren't sufficient yet.")
@@ -0,0 +1,194 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Test the improved FastTableRenderer with column filtering.
|
||||
"""
|
||||
|
||||
import sys
|
||||
sys.path.insert(0, '/Users/dwight/PycharmProjects/edgartools')
|
||||
|
||||
from edgar.documents.parser import HTMLParser
|
||||
from edgar.documents.config import ParserConfig
|
||||
from edgar.documents.table_nodes import TableNode
|
||||
|
||||
def test_improved_rendering():
    """Check the improved FastTableRenderer output for the MSFT EPS table.

    Renders the target table with fast_table_rendering enabled, prints the
    output, and scores it on column count, pipe separators, empty columns and
    compactness.

    Returns:
        bool: True when no quality issues remain.
    """
    print("🧪 TESTING IMPROVED FAST TABLE RENDERER")
    print("=" * 55)

    try:
        with open('/Users/dwight/PycharmProjects/edgartools/data/html/MSFT.10-K.html', 'r') as f:
            html_content = f.read()

        config = ParserConfig(fast_table_rendering=True)
        parser = HTMLParser(config)
        document = parser.parse(html_content)

        # Find target table
        target_table = None

        def find_target(node):
            nonlocal target_table
            if isinstance(node, TableNode):
                try:
                    if "Weighted average outstanding shares" in node.text():
                        target_table = node
                        return
                except Exception:
                    # BUG FIX: narrowed from a bare `except:`; some tables
                    # cannot render text and are skipped.
                    pass
            if hasattr(node, 'children'):
                for child in node.children:
                    find_target(child)

        find_target(document.root)

        if not target_table:
            print("❌ Target table not found")
            return False

        print("✅ Found target table")

        # Clear cache to get fresh rendering
        if hasattr(target_table, '_text_cache'):
            target_table._text_cache = None

        # Get new table text
        table_text = target_table.text()

        print(f"\nImproved table output ({len(table_text)} characters):")
        print("-" * 60)
        print(table_text)
        print("-" * 60)

        # Analyze the improvement
        lines = [line for line in table_text.split('\n') if line.strip()]
        pipe_lines = [line for line in lines if '|' in line]

        # BUG FIX: the original defined `improvements`/`issues` only inside
        # the `if pipe_lines:` branch, so with no pipe-formatted output the
        # function silently fell through returning None. Bail out explicitly.
        if not pipe_lines:
            print("❌ No pipe-formatted lines in output")
            return False

        # Count columns in the first content line
        first_content_line = pipe_lines[0]
        column_count = first_content_line.count('|') - 1  # Subtract 1 for border
        print(f"\nTable structure analysis:")
        print(f"   Total lines: {len(lines)}")
        print(f"   Lines with pipes: {len(pipe_lines)}")
        print(f"   Columns: {column_count}")

        # Check if it looks reasonable (should be ~4 columns: Description, 2025, 2024, 2023)
        if 3 <= column_count <= 6:
            print(f"   ✅ Column count looks reasonable ({column_count} columns)")
        else:
            print(f"   ⚠️ Column count still seems high ({column_count} columns)")

        # Check for specific improvements
        improvements = []
        issues = []

        if "Weighted average outstanding shares" in table_text:
            improvements.append("Contains target text")
        else:
            issues.append("Missing target text")

        if "|" in table_text:
            improvements.append("Has pipe separators")
        else:
            issues.append("No pipe separators")

        # Count empty columns (sequences of | | | with only spaces between)
        import re
        empty_sequences = len(re.findall(r'\|\s*\|\s*\|', table_text))
        if empty_sequences < 5:  # Much fewer than before
            improvements.append("Reduced empty columns")
        else:
            issues.append("Still many empty columns")

        if len(table_text) < 2000:  # Should be more compact
            improvements.append("More compact output")
        else:
            issues.append("Still verbose output")

        print(f"\nQuality assessment:")
        if improvements:
            print("   ✅ Improvements:")
            for improvement in improvements:
                print(f"      - {improvement}")

        if issues:
            print("   ⚠️ Remaining issues:")
            for issue in issues:
                print(f"      - {issue}")

        # Show sample of first few lines for readability
        print(f"\nFirst few lines preview:")
        for i, line in enumerate(pipe_lines[:5]):
            print(f"   {i+1}: {line}")

        return len(issues) == 0

    except Exception as e:
        print(f"❌ Error: {e}")
        import traceback
        traceback.print_exc()
        return False
def compare_with_rich():
    """Compare the improved fast renderer with Rich renderer.

    Parses the MSFT 10-K once per renderer config and prints size/structure
    stats for the target EPS table under each.
    """
    print(f"\n🔄 COMPARING WITH RICH RENDERER")
    print("=" * 55)

    def _find_target(node):
        # Depth-first search for the table containing the target phrase.
        # Hoisted out of the config loop: the original re-defined an
        # identical closure on every iteration. Like the original traversal,
        # the last matching table in document order wins.
        if isinstance(node, TableNode):
            try:
                if "Weighted average outstanding shares" in node.text():
                    return node
            except Exception:
                # BUG FIX: narrowed from a bare `except:`.
                pass
        found = None
        if hasattr(node, 'children'):
            for child in node.children:
                child_found = _find_target(child)
                if child_found is not None:
                    found = child_found
        return found

    try:
        with open('/Users/dwight/PycharmProjects/edgartools/data/html/MSFT.10-K.html', 'r') as f:
            html_content = f.read()

        # Test both renderers
        configs = [
            ("Fast Renderer", ParserConfig(fast_table_rendering=True)),
            ("Rich Renderer", ParserConfig(fast_table_rendering=False)),
        ]

        for config_name, config in configs:
            print(f"\n🔧 {config_name}:")

            parser = HTMLParser(config)
            document = parser.parse(html_content)

            target_table = _find_target(document.root)

            if target_table:
                table_text = target_table.text()
                lines = table_text.split('\n')
                pipe_lines = [line for line in lines if '|' in line and line.strip()]

                print(f"   Length: {len(table_text)} chars")
                print(f"   Lines: {len(lines)}")
                print(f"   Pipe lines: {len(pipe_lines)}")
                print(f"   Contains target: {'✅' if 'Weighted average outstanding shares' in table_text else '❌'}")
                print(f"   First line: {lines[0][:60]}..." if lines else "   No lines")
            else:
                print("   ❌ Table not found")

    except Exception as e:
        print(f"❌ Comparison failed: {e}")
if __name__ == "__main__":
    # Run the improvement test first, then the renderer comparison
    # (its result is informational only).
    success = test_improved_rendering()
    compare_with_rich()

    if success:
        final_lines = ("\n🎉 SUCCESS!",
                       "The improved FastTableRenderer is working well!")
    else:
        final_lines = ("\n🔧 NEEDS MORE WORK",
                       "The renderer still needs improvements.")
    for line in final_lines:
        print(line)
@@ -0,0 +1,134 @@
|
||||
"""
|
||||
Test our duration-based solution across different companies to identify edge cases
|
||||
"""
|
||||
|
||||
from edgar import Company
|
||||
from collections import defaultdict
|
||||
import sys
|
||||
|
||||
def analyze_company_periods(ticker, company_name):
    """Analyze period durations for a company"""
    print(f"\n{'='*60}")
    print(f"Analyzing {company_name} ({ticker})")
    print('='*60)

    def _bucket_label(duration):
        # Map a period duration in days (possibly None or 0) to a bucket
        # label; matches the original if/elif classification exactly.
        if not duration:
            return "No duration"
        if duration < 100:
            return f"Quarterly (~{duration} days)"
        if 300 < duration < 400:
            return f"Annual (~{duration} days)"
        if 180 < duration < 200:
            return f"Semi-annual (~{duration} days)"
        if duration > 700:
            return f"Multi-year (~{duration} days)"
        return f"Other ({duration} days)"

    try:
        company = Company(ticker)
        raw_facts = company.facts._facts

        # Find FY facts with different durations
        fy_facts_by_duration = defaultdict(list)

        for fact in raw_facts:
            if fact.statement_type != 'IncomeStatement' or fact.fiscal_period != 'FY':
                continue
            if not (fact.fiscal_year and fact.fiscal_year >= 2019):
                continue
            # Check for revenue facts
            if 'Revenue' not in str(fact.concept):
                continue
            duration = None
            if fact.period_start and fact.period_end:
                duration = (fact.period_end - fact.period_start).days
            fy_facts_by_duration[_bucket_label(duration)].append({
                'year': fact.fiscal_year,
                'value': fact.value,
                'duration': duration,
                'period_end': fact.period_end,
            })

        # Report findings
        for bucket in sorted(fy_facts_by_duration):
            facts_list = fy_facts_by_duration[bucket]
            print(f"\n{bucket}: {len(facts_list)} facts")
            # Show a few examples
            for entry in facts_list[:3]:
                print(f"   FY {entry['year']}: ${entry['value']:,.0f}")

        return fy_facts_by_duration

    except Exception as e:
        print(f"   Error: {e}")
        return None
# Test various types of companies
test_companies = [
    ('AAPL', 'Apple - Tech Giant'),
    ('MSFT', 'Microsoft - Different fiscal year end'),
    ('WMT', 'Walmart - Retail with Jan year end'),
    ('BAC', 'Bank of America - Financial institution'),
    ('JNJ', 'Johnson & Johnson - Healthcare'),
    ('TSLA', 'Tesla - Newer company'),
    ('AMZN', 'Amazon - E-commerce'),
    ('XOM', 'Exxon - Energy sector'),
]

# Analyze each company; keep only non-empty results.
results = {}
for ticker, name in test_companies:
    result = analyze_company_periods(ticker, name)
    if result:
        results[ticker] = result

# Summary of potential issues
print("\n" + "=" * 60)
print("POTENTIAL ISSUES WITH OUR SOLUTION")
print("=" * 60)

print("\n1. DURATION THRESHOLD (>300 days):")
print("   Our fix assumes annual = >300 days")
print("   Potential issues:")

# Check for edge cases around 300 days (bucket keys embed the duration,
# so substring matching is required here).
for ticker, buckets in results.items():
    for bucket in buckets:
        if "Other" in bucket or "Semi-annual" in bucket:
            print(f"   - {ticker} has unusual duration: {bucket}")

print("\n2. NO DURATION DATA:")
print("   Some facts might not have period_start")
for ticker, buckets in results.items():
    # "No duration" is an exact bucket key (no duration suffix).
    if "No duration" in buckets:
        print(f"   - {ticker}: {len(buckets['No duration'])} facts without duration")

print("\n3. FISCAL YEAR VARIATIONS:")
print("   Companies have different fiscal year ends:")
fiscal_year_ends = {
    'AAPL': 'September',
    'MSFT': 'June',
    'WMT': 'January',
    'BAC': 'December',
    'JNJ': 'December',
    'TSLA': 'December',
    'AMZN': 'December',
    'XOM': 'December',
}
for ticker, month in fiscal_year_ends.items():
    print(f"   - {ticker}: Fiscal year ends in {month}")

print("\n4. MULTI-YEAR FACTS:")
print("   Some companies might report multi-year aggregates")
for ticker, buckets in results.items():
    # BUG FIX: multi-year bucket keys look like "Multi-year (~730 days)",
    # so the original exact-key test `"Multi-year" in results[ticker]` never
    # matched. Match by prefix and sum across all multi-year buckets.
    multi_year_count = sum(len(v) for b, v in buckets.items()
                           if b.startswith("Multi-year"))
    if multi_year_count:
        print(f"   - {ticker}: {multi_year_count} multi-year facts found")

print("\nRECOMMENDATIONS:")
print("1. The 300-day threshold works for most companies")
print("2. Consider 350-380 days as 'normal' annual range")
print("3. Handle edge cases:")
print("   - No duration: Could check fiscal_period or use other heuristics")
print("   - Multi-year: Filter out (duration > 400)")
print("   - Semi-annual: Rare but should be filtered for annual=True")
@@ -0,0 +1,145 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Test specific header detection logic on the target table rows.
|
||||
"""
|
||||
|
||||
import sys
|
||||
sys.path.insert(0, '/Users/dwight/PycharmProjects/edgartools')
|
||||
|
||||
import re
|
||||
from edgar.documents.parser import HTMLParser
|
||||
from edgar.documents.config import ParserConfig
|
||||
from edgar.documents.table_nodes import TableNode
|
||||
|
||||
def test_header_detection_logic():
    """Score the first few rows of the MSFT EPS table as header candidates.

    Re-applies the header-detection heuristics (multi-year patterns,
    per-cell year/date checks, financial header phrases, period keywords) to
    the first 7 rows and prints each row's score, flagging whether row 5 —
    the expected header — would be detected (score >= 3).

    Returns:
        The target TableNode, or None when not found / on error.
    """
    print("🔍 TESTING SPECIFIC HEADER DETECTION LOGIC")
    print("=" * 50)

    try:
        with open('/Users/dwight/PycharmProjects/edgartools/data/html/MSFT.10-K.html', 'r') as f:
            html_content = f.read()

        # Parse document
        config = ParserConfig()
        parser = HTMLParser(config)
        document = parser.parse(html_content)

        # Find target table
        target_table = None

        def find_target(node):
            nonlocal target_table
            if isinstance(node, TableNode):
                try:
                    if "Weighted average outstanding shares" in node.text():
                        target_table = node
                        return
                except Exception:
                    # BUG FIX: narrowed from a bare `except:` so Ctrl-C and
                    # SystemExit are not swallowed.
                    pass
            if hasattr(node, 'children'):
                for child in node.children:
                    find_target(child)

        find_target(document.root)

        if not target_table:
            print("❌ Target table not found")
            return

        print("✅ Found target table")
        print(f"Current status: {len(target_table.headers)} headers, {len(target_table.rows)} data rows")

        # Test our header detection logic on each of the first few rows
        print(f"\n🔧 TESTING HEADER DETECTION ON FIRST 7 ROWS:")

        for i, row in enumerate(target_table.rows[:7]):
            print(f"\n--- ROW {i+1} ---")

            # Get the row text
            row_text = ' '.join(cell.text().strip() for cell in row.cells)
            print(f"Row text: '{row_text}'")

            # Test each part of our header detection logic
            score = 0
            reasons = []

            # 1. Check for year patterns in the combined text
            year_pattern = r'\b(19\d{2}|20\d{2})\b'
            years_found = re.findall(year_pattern, row_text)
            if len(years_found) >= 2:
                if 'total' not in row_text.lower()[:20]:
                    score += 3
                    reasons.append(f"Multiple years found: {years_found}")

            # 2. Enhanced year detection - check individual cells
            year_cells = 0
            date_phrases = 0
            cell_contents = []
            for cell in row.cells:
                cell_text = cell.text().strip()
                cell_contents.append(f"'{cell_text}'")
                if cell_text:
                    # Check for individual years
                    if re.match(r'^\s*(19\d{2}|20\d{2})\s*$', cell_text):
                        year_cells += 1
                    # Check for date phrases
                    elif 'june 30' in cell_text.lower() or 'december 31' in cell_text.lower():
                        date_phrases += 1

            print(f"Cell contents: {cell_contents[:5]}{'...' if len(cell_contents) > 5 else ''}")
            print(f"Year cells: {year_cells}, Date phrases: {date_phrases}")

            if year_cells >= 2 or (year_cells >= 1 and date_phrases >= 1):
                if 'total' not in row_text.lower()[:20]:
                    score += 4
                    reasons.append(f"Enhanced year detection: {year_cells} year cells, {date_phrases} date phrases")

            # 3. Check for financial header patterns
            row_text_lower = row_text.lower()
            financial_patterns = [
                r'year\s+ended\s+(june|december|march|september)',
                r'(three|six|nine|twelve)\s+months?\s+ended',
                r'\(in\s+(millions|thousands|billions)\)',
                r'fiscal\s+year\s+ended'
            ]

            for pattern in financial_patterns:
                if re.search(pattern, row_text_lower):
                    score += 2
                    reasons.append(f"Financial pattern: {pattern}")

            # 4. Check for period indicators
            period_keywords = ['quarter', 'q1', 'q2', 'q3', 'q4', 'month',
                               'january', 'february', 'march', 'april', 'may', 'june',
                               'july', 'august', 'september', 'october', 'november', 'december',
                               'ended', 'three months', 'six months', 'nine months']

            matching_keywords = [kw for kw in period_keywords if kw in row_text_lower]
            if matching_keywords:
                score += 1
                reasons.append(f"Period keywords: {matching_keywords}")

            print(f"HEADER SCORE: {score}")
            if reasons:
                print(f"Reasons: {', '.join(reasons)}")

            # Determine if this should be considered a header
            should_be_header = score >= 3
            print(f"SHOULD BE HEADER: {'YES' if should_be_header else 'NO'}")

            if should_be_header and i == 4:  # Row 5 (index 4) is our expected header
                print("🎯 This matches our expected header row!")
            elif should_be_header:
                print("⚠️ This would be detected as a header but wasn't expected")
            elif i == 4:
                print("❌ This should be the header row but isn't being detected!")

        return target_table

    except Exception as e:
        print(f"❌ Error: {e}")
        import traceback
        traceback.print_exc()
        return None
if __name__ == "__main__":
|
||||
test_header_detection_logic()
|
||||
@@ -0,0 +1,98 @@
|
||||
"""
|
||||
Verify the fiscal year pattern across companies
|
||||
"""
|
||||
|
||||
from edgar import Company
|
||||
|
||||
def check_fiscal_year_pattern(ticker, name):
    """Check the relationship between fiscal_year and period_end.year"""
    print(f"\n{name} ({ticker}):")
    print("-" * 40)

    try:
        raw_facts = Company(ticker).facts._facts

        # Keep only annual-duration revenue facts for fiscal years 2019-2024.
        fy_facts = []
        for fact in raw_facts:
            if fact.statement_type != 'IncomeStatement' or fact.fiscal_period != 'FY':
                continue
            if not (fact.fiscal_year and 2019 <= fact.fiscal_year <= 2024):
                continue
            if 'Revenue' not in str(fact.concept):
                continue
            if not (fact.period_start and fact.period_end):
                continue
            duration = (fact.period_end - fact.period_start).days
            if 300 < duration < 400:  # Annual only
                fy_facts.append({
                    'fiscal_year': fact.fiscal_year,
                    'period_end': fact.period_end,
                    'period_end_year': fact.period_end.year,
                    'difference': fact.fiscal_year - fact.period_end.year,
                })

        # Deduplicate on (fiscal_year, period_end); the last occurrence wins,
        # matching the original accumulation order.
        unique_facts = {(f['fiscal_year'], f['period_end']): f for f in fy_facts}

        # Distinct fiscal_year - period_end.year offsets observed.
        differences = {f['difference'] for f in unique_facts.values()}

        print(f"  Fiscal Year vs Period End Year differences: {sorted(differences)}")

        # Show the five most recent fiscal years as examples.
        print("\n  Examples:")
        latest_first = sorted(unique_facts.values(), key=lambda x: x['fiscal_year'], reverse=True)
        for f in latest_first[:5]:
            print(f"    FY {f['fiscal_year']} → ends {f['period_end']} (diff: {f['difference']} years)")

        # Report whether a single consistent offset was found.
        if len(differences) == 1:
            diff = list(differences)[0]
            print(f"\n  ✓ Consistent pattern: fiscal_year = period_end.year + {diff}")
        else:
            print(f"\n  ⚠️ Multiple patterns found: {differences}")

        return differences

    except Exception as e:
        # Best-effort exploratory script: report and return an empty result.
        print(f"  Error: {e}")
        return set()
||||
|
||||
# Test various companies
companies = [
    ('AAPL', 'Apple (Sept year-end)'),
    ('MSFT', 'Microsoft (June year-end)'),
    ('WMT', 'Walmart (Jan year-end)'),
    ('AMZN', 'Amazon (Dec year-end)'),
    ('JNJ', 'J&J (Dec year-end)'),
    ('TSLA', 'Tesla (Dec year-end)'),
]

# Union of all fiscal_year - period_end.year offsets seen across companies.
all_differences = set()
for ticker, name in companies:
    all_differences |= check_fiscal_year_pattern(ticker, name)

banner = "=" * 60
print("\n" + banner)
print("CONCLUSION")
print(banner)

if len(all_differences) == 1:
    # Exactly one offset across every company: the pattern is universal.
    (diff,) = all_differences
    print(f"\n✓ ALL companies show the same pattern:")
    print(f"  fiscal_year = period_end.year + {diff}")
    print("\nThis appears to be how the SEC Facts API structures the data!")
    print("The 'fiscal_year' field indicates when the data was filed/reported,")
    print("not the actual year of the fiscal period.")
else:
    print(f"\n⚠️ Different companies show different patterns: {all_differences}")
    print("The most common pattern seems to be a 2-year difference.")

print("\nIMPLICATION FOR OUR FIX:")
print("We should NOT require fiscal_year == period_end.year")
print("Instead, we should:")
print("1. Use duration (>300 days) as the primary filter")
print("2. Match facts where fiscal_year is within 0-3 years of period_end.year")
print("3. Deduplicate by keeping the latest period_end for each actual year")
||||
99
venv/lib/python3.10/site-packages/edgar/entity/__init__.py
Normal file
99
venv/lib/python3.10/site-packages/edgar/entity/__init__.py
Normal file
@@ -0,0 +1,99 @@
|
||||
"""
|
||||
Entity module for the EdgarTools library.
|
||||
|
||||
This module provides the Entity, Company, Fund, and related classes
|
||||
for working with SEC filers.
|
||||
"""
|
||||
# Import for backward compatibility
|
||||
from edgar.entity.constants import COMPANY_FORMS
|
||||
from edgar.entity.core import (
|
||||
Company,
|
||||
Entity,
|
||||
SecFiler,
|
||||
get_company,
|
||||
get_entity,
|
||||
public_companies,
|
||||
)
|
||||
from edgar.entity.utils import has_company_filings, normalize_cik
|
||||
from edgar.entity.data import Address, CompanyData, EntityData
|
||||
from edgar.entity.entity_facts import (
|
||||
EntityFacts,
|
||||
NoCompanyFactsFound,
|
||||
get_company_facts,
|
||||
)
|
||||
from edgar.entity.filings import EntityFiling, EntityFilings
|
||||
from edgar.entity.search import CompanySearchIndex, CompanySearchResults, find_company
|
||||
from edgar.entity.submissions import (
|
||||
create_company_from_file,
|
||||
create_entity_from_file,
|
||||
create_entity_from_submissions_json,
|
||||
download_entity_submissions_from_sec,
|
||||
get_entity_submissions,
|
||||
)
|
||||
from edgar.entity.tickers import find_cik, find_ticker, get_cik_lookup_data, get_company_tickers, get_icon_from_ticker, get_ticker_to_cik_lookup
|
||||
|
||||
# Import from the funds package instead of entity.funds
|
||||
from edgar.funds import FundData, FundSeries
|
||||
|
||||
# Aliases for backward compatibility
|
||||
CompanyFiling = EntityFiling
|
||||
CompanyFilings = EntityFilings
|
||||
|
||||
__all__ = [
|
||||
# Core classes
|
||||
'SecFiler',
|
||||
'Entity',
|
||||
'Company',
|
||||
'FundSeries',
|
||||
|
||||
# Data classes
|
||||
'EntityData',
|
||||
'CompanyData',
|
||||
'FundData',
|
||||
'Address',
|
||||
|
||||
# Filing classes
|
||||
'EntityFiling',
|
||||
'EntityFilings',
|
||||
'EntityFacts',
|
||||
|
||||
# Factory functions
|
||||
'get_entity',
|
||||
'get_company',
|
||||
'public_companies',
|
||||
|
||||
# Search functions
|
||||
'find_company',
|
||||
'CompanySearchResults',
|
||||
'CompanySearchIndex',
|
||||
|
||||
# Ticker functions
|
||||
'get_icon_from_ticker',
|
||||
'get_company_tickers',
|
||||
'get_ticker_to_cik_lookup',
|
||||
'get_cik_lookup_data',
|
||||
'find_cik',
|
||||
'find_ticker',
|
||||
|
||||
# Submission functions
|
||||
'get_entity_submissions',
|
||||
'download_entity_submissions_from_sec',
|
||||
'create_entity_from_submissions_json',
|
||||
'create_entity_from_file',
|
||||
'create_company_from_file',
|
||||
|
||||
# Fact functions
|
||||
'get_company_facts',
|
||||
|
||||
# Exceptions
|
||||
'NoCompanyFactsFound',
|
||||
|
||||
# Constants and utilities
|
||||
'COMPANY_FORMS',
|
||||
'has_company_filings',
|
||||
'normalize_cik',
|
||||
|
||||
# Backwards compatibility
|
||||
'CompanyFiling',
|
||||
'CompanyFilings',
|
||||
]
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
80
venv/lib/python3.10/site-packages/edgar/entity/constants.py
Normal file
80
venv/lib/python3.10/site-packages/edgar/entity/constants.py
Normal file
@@ -0,0 +1,80 @@
|
||||
"""
|
||||
Constants for entity classification and form types.
|
||||
|
||||
This module contains constants used throughout the entity package for
|
||||
determining entity types and form classifications.
|
||||
"""
|
||||
|
||||
# Performance optimization: use set for O(1) lookups
|
||||
COMPANY_FORMS = {
|
||||
# Registration statements
|
||||
"S-1", "S-3", "S-4", "S-8", "S-11",
|
||||
# Foreign issuers registration forms
|
||||
"F-1", "F-3", "F-4", "F-6", "F-7", "F-8", "F-9", "F-10", "F-80",
|
||||
# Foreign form amendments and effectiveness
|
||||
"F-6EF", "F-6 POS", "F-3ASR", "F-4MEF", "F-10EF", "F-3D", "F-3MEF",
|
||||
# Exchange Act registration
|
||||
"10-12B", "10-12G",
|
||||
# Periodic reports
|
||||
"10-K", "10-Q", "10-K/A", "10-Q/A",
|
||||
"20-F", "40-F", # Foreign issuers
|
||||
"11-K", # Employee benefit plans
|
||||
# Current reports
|
||||
"8-K", "6-K",
|
||||
# Proxy materials
|
||||
"DEF 14A", "PRE 14A", "DEFA14A", "DEFM14A",
|
||||
# Other corporate filings
|
||||
"424B1", "424B2", "424B3", "424B4", "424B5",
|
||||
"ARS", "NT 10-K", "NT 10-Q",
|
||||
"SC 13D", "SC 13G", "SC TO-I", "SC TO-T",
|
||||
"SD", "PX14A6G",
|
||||
# Specialized corporate filings
|
||||
"N-CSR", "N-Q", "N-MFP", "N-CEN",
|
||||
"X-17A-5", "17-H",
|
||||
"TA-1", "TA-2",
|
||||
"ATS-N",
|
||||
# Corporate disclosures
|
||||
"EFFECT", "FWP", "425", "CB",
|
||||
"POS AM", "CORRESP", "UPLOAD"
|
||||
}
|
||||
|
||||
# Fund-specific form types
|
||||
FUND_FORMS = {
|
||||
# Investment company registration
|
||||
"N-1A", "N-2", "N-3", "N-4", "N-5", "N-6",
|
||||
# Investment company periodic reports
|
||||
"N-CSR", "N-Q", "N-CEN", "N-MFP",
|
||||
# Investment adviser forms
|
||||
"ADV", "ADV-E", "ADV-H", "ADV-NR", "ADV-W",
|
||||
# Private fund forms
|
||||
"PF", "CPO-PQR", "CTA-PR",
|
||||
# Municipal advisor forms
|
||||
"MA", "MA-I", "MA-NR", "MA-W",
|
||||
# Investment company shareholder reports
|
||||
"N-30B-2", "N-30D", "485APOS", "485BPOS",
|
||||
# Variable insurance products
|
||||
"N-3/A", "N-4/A", "N-6/A",
|
||||
# Closed-end funds
|
||||
"N-2/A", "N-5/A",
|
||||
# Business development companies
|
||||
"N-6F", "N-54A", "N-54C",
|
||||
# Exchange-traded funds
|
||||
"N-1A/A",
|
||||
# Portfolio holdings
|
||||
"NPORT-P", "NPORT-EX", "N-PORT", "N-PORT/A"
|
||||
}
|
||||
|
||||
# Individual/insider forms
|
||||
INDIVIDUAL_FORMS = {
|
||||
# Ownership reports
|
||||
"3", "4", "5", "3/A", "4/A", "5/A",
|
||||
# Beneficial ownership
|
||||
"SC 13D", "SC 13G", "SC 13D/A", "SC 13G/A",
|
||||
# Tender offer schedules
|
||||
"SC TO-I", "SC TO-C", "SC TO-T",
|
||||
# Investment adviser representatives
|
||||
"ADV-E", "DRS"
|
||||
}
|
||||
|
||||
# All known form types for validation
|
||||
ALL_FORM_TYPES = COMPANY_FORMS | FUND_FORMS | INDIVIDUAL_FORMS
|
||||
923
venv/lib/python3.10/site-packages/edgar/entity/core.py
Normal file
923
venv/lib/python3.10/site-packages/edgar/entity/core.py
Normal file
@@ -0,0 +1,923 @@
|
||||
"""
|
||||
Core entity classes for working with SEC filings.
|
||||
|
||||
This module provides the main classes for interacting with SEC entities,
|
||||
including companies, funds, and individuals.
|
||||
"""
|
||||
from abc import ABC, abstractmethod
|
||||
from functools import cached_property
|
||||
from typing import TYPE_CHECKING, Iterable, List, Optional, Tuple, TypeVar, Union
|
||||
|
||||
if TYPE_CHECKING:
|
||||
import pyarrow
|
||||
|
||||
from edgar.entity.enhanced_statement import StructuredStatement
|
||||
from edgar.entity.filings import EntityFilings
|
||||
from edgar.enums import FormType, PeriodType
|
||||
|
||||
from rich import box
|
||||
from rich.columns import Columns
|
||||
from rich.console import Group
|
||||
from rich.padding import Padding
|
||||
from rich.panel import Panel
|
||||
from rich.table import Table
|
||||
from rich.text import Text
|
||||
|
||||
from edgar._filings import Filings
|
||||
from edgar.company_reports import TenK, TenQ
|
||||
from edgar.entity.data import Address, CompanyData, EntityData
|
||||
from edgar.entity.entity_facts import EntityFacts, NoCompanyFactsFound, get_company_facts
|
||||
from edgar.entity.tickers import get_icon_from_ticker
|
||||
from edgar.financials import Financials
|
||||
from edgar.formatting import datefmt, reverse_name
|
||||
from edgar.reference.tickers import find_cik
|
||||
from edgar.richtools import Docs, repr_rich
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from edgar.enums import FormType
|
||||
|
||||
# Import constants and utilities from separate modules
|
||||
from edgar.entity.constants import COMPANY_FORMS
|
||||
from edgar.entity.utils import has_company_filings, normalize_cik
|
||||
|
||||
# Type variables for better type annotations
|
||||
T = TypeVar('T')
|
||||
|
||||
__all__ = [
|
||||
'SecFiler',
|
||||
'Entity',
|
||||
'Company',
|
||||
'EntityData',
|
||||
'CompanyData',
|
||||
'get_entity',
|
||||
'get_company',
|
||||
'NoCompanyFactsFound',
|
||||
'has_company_filings',
|
||||
'COMPANY_FORMS',
|
||||
]
|
||||
|
||||
class SecFiler(ABC):
    """
    Abstract base class for all SEC filing entities.

    This is the root of the entity hierarchy and defines the common interface
    that all entity types must implement.
    """

    @abstractmethod
    def get_filings(self, **kwargs) -> Filings:
        """Get filings for this entity."""
        pass

    @abstractmethod
    def get_facts(self) -> Optional[EntityFacts]:
        """Get structured facts about this entity."""
        pass

    # NOTE: @property must be stacked above @abstractmethod so the
    # abstractness is recorded on the underlying function.
    @property
    @abstractmethod
    def cik(self) -> int:
        """Get the CIK number for this entity."""
        pass

    @property
    @abstractmethod
    def data(self) -> 'EntityData':
        """Get detailed data for this entity."""
        pass
||||
|
||||
class Entity(SecFiler):
    """
    Represents any entity that files with the SEC.

    This is the base concrete implementation that can be used directly
    or specialized for specific entity types.
    """

    def __init__(self, cik_or_identifier: Union[str, int]):
        # If it's a ticker, convert to CIK first
        if isinstance(cik_or_identifier, str) and not cik_or_identifier.isdigit():
            cik = find_cik(cik_or_identifier)
            if cik is None:
                # Sentinel CIK marking an unresolvable identifier;
                # __bool__ checks for this value so `if entity:` is False.
                self._cik = -999999999
            else:
                self._cik = cik
        else:
            self._cik = normalize_cik(cik_or_identifier)

        # Lazily populated by the `data` property on first access.
        self._data = None

    @property
    def cik(self) -> int:
        """Get the CIK number for this entity."""
        return self._cik

    @property
    def name(self):
        """Get the name of the company, or None if the data has no name."""
        if hasattr(self.data, 'name'):
            return self.data.name
        return None

    @cached_property
    def display_name(self) -> str:
        """Return the name as-is for companies; for individuals, reverse
        the stored name (which is in last-name-first order)."""
        if self.is_company:
            return self.name
        return reverse_name(self.name)

    @cached_property
    def data(self) -> 'EntityData':
        """Get detailed data for this entity.

        Fetched lazily from SEC submissions on first access; the result is
        cached both via cached_property and in self._data. When no data is
        found, a placeholder EntityData is returned with _not_found=True
        rather than raising.
        """
        if self._data is None:
            # Import locally to avoid circular imports
            from edgar.entity.submissions import get_entity_submissions

            # get_entity_submissions returns the EntityData directly
            entity_data = get_entity_submissions(self.cik)

            if entity_data:
                self._data = entity_data
                self._data._not_found = False
            else:
                # Instead of raising an error, create a default EntityData
                #log.warning(f"Could not find entity data for CIK {self.cik}, using placeholder data")
                from edgar.entity.data import create_default_entity_data
                self._data = create_default_entity_data(self.cik)
                self._data._not_found = True
        return self._data

    def mailing_address(self) -> Optional[Address]:
        """Get the mailing address of the entity, or None if absent."""
        if hasattr(self.data, 'mailing_address') and self.data.mailing_address:
            return self.data.mailing_address

    def business_address(self) -> Optional[Address]:
        """Get the business address of the entity, or None if absent."""
        if hasattr(self.data, 'business_address') and self.data.business_address:
            return self.data.business_address


    @property
    def not_found(self) -> bool:
        """
        Check if the entity data was not found.

        Returns:
            True if the entity data could not be found, False otherwise
        """
        if not hasattr(self, '_data') or self._data is None:
            # We haven't loaded the data yet, so we don't know if it's not found
            # Loading the data will set the not_found flag
            _ = self.data

        return getattr(self._data, '_not_found', False)

    @property
    def is_company(self) -> bool:
        """
        Check if this entity is a company.

        Returns:
            True if the entity is a company, False otherwise
        """
        return self.data.is_company

    @property
    def is_individual(self) -> bool:
        """
        Check if this entity is an individual.

        Returns:
            True if the entity is an individual, False otherwise
        """
        return not self.is_company


    def get_filings(self,
                    *,
                    year: Union[int, List[int]] = None,
                    quarter: Union[int, List[int]] = None,
                    form: Union[str, 'FormType', List[Union[str, 'FormType']]] = None,
                    accession_number: Union[str, List] = None,
                    file_number: Union[str, List] = None,
                    filing_date: Union[str, Tuple[str, str]] = None,
                    date: Union[str, Tuple[str, str]] = None,
                    amendments: bool = True,
                    is_xbrl: bool = None,
                    is_inline_xbrl: bool = None,
                    sort_by: Union[str, List[Tuple[str, str]]] = None,
                    trigger_full_load: bool = True) -> 'EntityFilings':
        """
        Get the entity's filings and optionally filter by multiple criteria.

        This method has a special behavior for loading filings. When first called,
        it only loads the most recent filings. If trigger_full_load=True, it will
        automatically fetch all historical filings from the SEC (potentially making
        multiple API calls) as needed.

        Args:
            year: The year or list of years to filter by (e.g. 2023, [2022, 2023])
            quarter: The quarter or list of quarters to filter by (1-4, e.g. 4, [3, 4])
            form: The form type (e.g. FormType.ANNUAL_REPORT, '10-K', or ['10-Q', '10-K'])
            accession_number: The accession number that identifies a filing
            file_number: The file number e.g. 001-39504
            filing_date: Filter by filing date (YYYY-MM-DD or range)
            date: Alias for filing_date
            amendments: Whether to include amendments (default: True)
            is_xbrl: Whether the filing is XBRL
            is_inline_xbrl: Whether the filing is Inline XBRL
            sort_by: Sort criteria
            trigger_full_load: Whether to load all historical filings if not already loaded

        Returns:
            Filtered filings matching the criteria
        """
        # Simply delegate to the EntityData implementation
        # This preserves the lazy-loading behavior while keeping the API clean
        # NOTE: `date` is only honored when `filing_date` is falsy.
        return self.data.get_filings(
            year=year,
            quarter=quarter,
            form=form,
            accession_number=accession_number,
            file_number=file_number,
            filing_date=filing_date or date,
            amendments=amendments,
            is_xbrl=is_xbrl,
            is_inline_xbrl=is_inline_xbrl,
            sort_by=sort_by,
            trigger_full_load=trigger_full_load
        )

    def get_facts(self, period_type: Optional[Union[str, 'PeriodType']] = None) -> Optional[EntityFacts]:
        """
        Get structured facts about this entity.

        Args:
            period_type: Optional filter by period type. Can be PeriodType enum
                or string ('annual', 'quarterly', 'monthly').

        Returns:
            EntityFacts object, optionally filtered by period type,
            or None when no facts exist for this CIK.
        """
        try:
            facts = get_company_facts(self.cik)
            if facts and period_type:
                # Apply period type filtering to the facts
                return facts.filter_by_period_type(period_type)
            return facts
        except NoCompanyFactsFound:
            return None

    def get_structured_statement(self,
                                 statement_type: str,
                                 fiscal_year: Optional[int] = None,
                                 fiscal_period: Optional[str] = None,
                                 use_canonical: bool = True,
                                 include_missing: bool = False) -> Optional['StructuredStatement']:
        """
        Get a hierarchically structured financial statement.

        This method uses learned canonical structures to build complete financial
        statements with proper hierarchy and relationships, filling in missing
        concepts when requested.

        Args:
            statement_type: Type of statement ('BalanceSheet', 'IncomeStatement', 'CashFlow')
            fiscal_year: Fiscal year to retrieve (defaults to latest)
            fiscal_period: Fiscal period ('FY', 'Q1', 'Q2', 'Q3', 'Q4')
            use_canonical: Use canonical structure for organization (recommended)
            include_missing: Include placeholders for missing canonical concepts

        Returns:
            StructuredStatement with hierarchical organization or None if no data

        Example:
            >>> company = Company('AAPL')
            >>> stmt = company.get_structured_statement('IncomeStatement', 2024, 'Q4')
            >>> print(stmt.get_hierarchical_display())
        """
        # Local import to avoid a circular dependency at module load time.
        from edgar.entity.statement_builder import StatementBuilder

        facts_data = self.get_facts()
        if not facts_data:
            return None

        # Get all facts
        all_facts = facts_data.get_all_facts()
        if not all_facts:
            return None

        # Build the statement
        builder = StatementBuilder(cik=str(self.cik))
        structured_stmt = builder.build_statement(
            facts=all_facts,
            statement_type=statement_type,
            fiscal_year=fiscal_year,
            fiscal_period=fiscal_period,
            use_canonical=use_canonical,
            include_missing=include_missing
        )

        # Add company metadata
        structured_stmt.company_name = self.name

        return structured_stmt

    def latest(self, form: str, n=1):
        """Get the latest filing(s) for a given form.

        Uses trigger_full_load=False, so only the most recently loaded
        filings are searched.
        """
        return self.get_filings(form=form, trigger_full_load=False).latest(n)

    def __str__(self):
        if hasattr(self, 'data'):
            return f"Entity({self.data.name} [{self.cik}])"
        return f"Entity(CIK={self.cik})"

    def __rich__(self):
        # Delegate rich rendering to the underlying EntityData.
        return self.data.__rich__()

    def __repr__(self):
        return repr_rich(self.__rich__())

    def __bool__(self):
        """
        Allow truthiness check for entities.

        Returns False if the entity doesn't exist (has a sentinel CIK value or not_found is True).
        This enables code patterns like: `if company: do_something()`
        """
        # Check for sentinel CIK value (-999999999) or not_found flag
        return self.cik != -999999999 and not self.not_found
|
||||
class Company(Entity):
|
||||
"""
|
||||
Represents a public company that files with the SEC.
|
||||
|
||||
Provides company-specific functionality like financial statements,
|
||||
ticker lookup, etc.
|
||||
"""
|
||||
|
||||
def __init__(self, cik_or_ticker: Union[str, int]):
|
||||
|
||||
|
||||
super().__init__(cik_or_ticker)
|
||||
|
||||
@property
|
||||
def data(self) -> 'EntityData': # We'll return the base type to simplify
|
||||
"""Get detailed data for this company."""
|
||||
# For simplicity, return the base EntityData
|
||||
# Type checkers will still see this as a CompanyData due to the annotation
|
||||
return super().data
|
||||
|
||||
@property
|
||||
def tickers(self):
|
||||
"""Get all ticker symbols for this company."""
|
||||
if hasattr(self.data, 'tickers'):
|
||||
return self.data.tickers
|
||||
return []
|
||||
|
||||
def get_ticker(self) -> Optional[str]:
|
||||
"""Get the primary ticker symbol for this company."""
|
||||
if self.data and self.data.tickers and len(self.data.tickers) > 0:
|
||||
return self.data.tickers[0]
|
||||
return None
|
||||
|
||||
def get_exchanges(self ):
|
||||
"""Get all exchanges for this company."""
|
||||
if hasattr(self.data, 'exchanges'):
|
||||
return self.data.exchanges
|
||||
return []
|
||||
|
||||
def get_financials(self) -> Optional[Financials]:
|
||||
"""Get financial statements for this company."""
|
||||
tenk_filing = self.latest_tenk
|
||||
if tenk_filing is not None:
|
||||
return tenk_filing.financials
|
||||
return None
|
||||
|
||||
def get_quarterly_financials(self) -> Optional[Financials]:
|
||||
"""Get quarterly financial statements for this company."""
|
||||
tenq_filing = self.latest_tenq
|
||||
if tenq_filing is not None:
|
||||
return tenq_filing.financials
|
||||
return None
|
||||
|
||||
@property
|
||||
def fiscal_year_end(self):
|
||||
"""Get the fiscal year end date for this company."""
|
||||
if hasattr(self.data, 'fiscal_year_end'):
|
||||
return self.data.fiscal_year_end
|
||||
return None
|
||||
|
||||
@property
|
||||
def sic(self):
|
||||
"""Get the SIC code for this company."""
|
||||
if hasattr(self.data, 'sic'):
|
||||
return self.data.sic
|
||||
return None
|
||||
|
||||
@property
|
||||
def industry(self):
|
||||
"""Get the industry description for this company."""
|
||||
if hasattr(self.data, 'sic_description'):
|
||||
return self.data.sic_description
|
||||
return None
|
||||
|
||||
@property
|
||||
def latest_tenk(self) -> Optional[TenK]:
|
||||
"""Get the latest 10-K filing for this company."""
|
||||
latest_10k = self.get_filings(form='10-K', trigger_full_load=False).latest()
|
||||
if latest_10k is not None:
|
||||
return latest_10k.obj()
|
||||
return None
|
||||
|
||||
@property
|
||||
def latest_tenq(self) -> Optional[TenQ]:
|
||||
"""Get the latest 10-Q filing for this company."""
|
||||
latest_10q = self.get_filings(form='10-Q', trigger_full_load=False).latest()
|
||||
if latest_10q is not None:
|
||||
return latest_10q.obj()
|
||||
return None
|
||||
|
||||
def get_icon(self):
|
||||
return get_icon_from_ticker(self.tickers[0])
|
||||
|
||||
# Enhanced financial data properties and methods
|
||||
@property
|
||||
def facts(self) -> Optional[EntityFacts]:
|
||||
"""Get enhanced structured facts about this company."""
|
||||
return self.get_facts()
|
||||
|
||||
@property
|
||||
def docs(self):
|
||||
"""Access comprehensive Company API documentation."""
|
||||
return Docs(self)
|
||||
|
||||
@property
|
||||
def public_float(self) -> Optional[float]:
|
||||
"""Get the public float value for this company."""
|
||||
facts = self.facts
|
||||
if facts:
|
||||
return facts.public_float
|
||||
return None
|
||||
|
||||
@property
|
||||
def shares_outstanding(self) -> Optional[float]:
|
||||
"""Get the shares outstanding for this company."""
|
||||
facts = self.facts
|
||||
if facts:
|
||||
return facts.shares_outstanding
|
||||
return None
|
||||
|
||||
def income_statement(self, periods: int = 4, annual: bool = True, as_dataframe: bool = False, concise_format: bool = False):
|
||||
"""
|
||||
Get income statement data for this company.
|
||||
|
||||
Args:
|
||||
periods: Number of periods to retrieve
|
||||
annual: If True, prefer annual periods; if False, get quarterly
|
||||
as_dataframe: If True, return DataFrame; if False, return MultiPeriodStatement
|
||||
concise_format: If True, display values as $1.0B, if False display as $1,000,000,000
|
||||
|
||||
Returns:
|
||||
MultiPeriodStatement or DataFrame with income statement data, or None if not available
|
||||
"""
|
||||
facts = self.facts
|
||||
if facts:
|
||||
try:
|
||||
return facts.income_statement(periods=periods, annual=annual, as_dataframe=as_dataframe, concise_format=concise_format)
|
||||
except Exception as e:
|
||||
from edgar.core import log
|
||||
log.debug(f"Error getting income statement for {self.name}: {e}")
|
||||
return None
|
||||
|
||||
def balance_sheet(self, periods: int = 4, annual: bool = True, as_dataframe: bool = False, concise_format: bool = False):
|
||||
"""
|
||||
Get balance sheet data for this company.
|
||||
|
||||
Args:
|
||||
periods: Number of periods to retrieve
|
||||
annual: If True, prefer annual periods; if False, get quarterly
|
||||
as_dataframe: If True, return DataFrame; if False, return MultiPeriodStatement
|
||||
concise_format: If True, display values as $1.0B, if False display as $1,000,000,000
|
||||
|
||||
Returns:
|
||||
MultiPeriodStatement or DataFrame with balance sheet data, or None if not available
|
||||
"""
|
||||
facts = self.facts
|
||||
if facts:
|
||||
try:
|
||||
return facts.balance_sheet(periods=periods, annual=annual, as_dataframe=as_dataframe, concise_format=concise_format)
|
||||
except Exception as e:
|
||||
from edgar.core import log
|
||||
log.debug(f"Error getting balance sheet for {self.name}: {e}")
|
||||
return None
|
||||
|
||||
def cash_flow(self, periods: int = 4, annual: bool = True, as_dataframe: bool = False, concise_format: bool = False):
|
||||
"""
|
||||
Get cash flow statement data for this company.
|
||||
|
||||
Args:
|
||||
periods: Number of periods to retrieve
|
||||
annual: If True, prefer annual periods; if False, get quarterly
|
||||
as_dataframe: If True, return DataFrame; if False, return MultiPeriodStatement
|
||||
concise_format: If True, display values as $1.0B, if False display as $1,000,000,000
|
||||
|
||||
Returns:
|
||||
MultiPeriodStatement or DataFrame with cash flow data, or None if not available
|
||||
"""
|
||||
facts = self.facts
|
||||
if facts:
|
||||
try:
|
||||
return facts.cash_flow(periods=periods, annual=annual, as_dataframe=as_dataframe, concise_format=concise_format)
|
||||
except Exception as e:
|
||||
from edgar.core import log
|
||||
log.debug(f"Error getting cash flow for {self.name}: {e}")
|
||||
return None
|
||||
|
||||
def __str__(self):
|
||||
ticker = self.get_ticker()
|
||||
ticker_str = f" - {ticker}" if ticker else ""
|
||||
if hasattr(self, 'data'):
|
||||
return f"Company({self.data.name} [{self.cik}]{ticker_str})"
|
||||
return f"Company(CIK={self.cik}{ticker_str})"
|
||||
|
||||
def __repr__(self):
|
||||
# Delegate to the rich representation for consistency with the old implementation
|
||||
return repr_rich(self.__rich__())
|
||||
|
||||
def text(self, max_tokens: int = 2000) -> str:
|
||||
"""
|
||||
Get AI-optimized plain text representation.
|
||||
|
||||
Uses Markdown-KV format (60.7% accuracy, 25% fewer tokens than JSON) optimized
|
||||
for LLM consumption. For terminal display, use print(company) instead.
|
||||
|
||||
Research basis: improvingagents.com/blog/best-input-data-format-for-llms
|
||||
|
||||
Args:
|
||||
max_tokens: Approximate token limit using 4 chars/token heuristic (default: 2000)
|
||||
|
||||
Returns:
|
||||
Markdown-formatted key-value representation optimized for LLMs
|
||||
|
||||
Example:
|
||||
>>> from edgar import Company
|
||||
>>> company = Company("AAPL")
|
||||
>>> text = company.text()
|
||||
>>> print(text)
|
||||
**Company:** Apple Inc.
|
||||
**CIK:** 0000320193
|
||||
**Ticker:** AAPL
|
||||
**Exchange:** NASDAQ
|
||||
...
|
||||
"""
|
||||
lines = []
|
||||
|
||||
# Basic identification
|
||||
lines.append(f"**Company:** {self.data.name}")
|
||||
lines.append(f"**CIK:** {str(self.cik).zfill(10)}")
|
||||
|
||||
# Ticker and exchange
|
||||
ticker = self.get_ticker()
|
||||
if ticker:
|
||||
lines.append(f"**Ticker:** {ticker}")
|
||||
|
||||
if hasattr(self.data, 'exchanges') and self.data.exchanges:
|
||||
exchanges_str = ", ".join(self.data.exchanges) if isinstance(self.data.exchanges, (list, tuple)) else str(self.data.exchanges)
|
||||
lines.append(f"**Exchange:** {exchanges_str}")
|
||||
|
||||
# Industry classification
|
||||
if hasattr(self.data, 'sic') and self.data.sic:
|
||||
sic_desc = getattr(self.data, 'sic_description', '')
|
||||
if sic_desc:
|
||||
lines.append(f"**Industry:** {sic_desc} (SIC {self.data.sic})")
|
||||
else:
|
||||
lines.append(f"**SIC Code:** {self.data.sic}")
|
||||
|
||||
# Entity type
|
||||
if hasattr(self.data, 'entity_type') and self.data.entity_type:
|
||||
lines.append(f"**Entity Type:** {self.data.entity_type.title()}")
|
||||
|
||||
# Category
|
||||
if hasattr(self.data, 'category') and self.data.category:
|
||||
lines.append(f"**Category:** {self.data.category}")
|
||||
|
||||
# Fiscal year end
|
||||
if hasattr(self.data, 'fiscal_year_end') and self.data.fiscal_year_end:
|
||||
lines.append(f"**Fiscal Year End:** {self._format_fiscal_year_date(self.data.fiscal_year_end)}")
|
||||
|
||||
# Business address
|
||||
if hasattr(self.data, 'business_address') and self.data.business_address:
|
||||
addr = self.data.business_address
|
||||
lines.append("")
|
||||
lines.append("**Business Address:**")
|
||||
if hasattr(addr, 'street1') and addr.street1:
|
||||
lines.append(f"{addr.street1}")
|
||||
if hasattr(addr, 'street2') and addr.street2:
|
||||
lines.append(f"{addr.street2}")
|
||||
if hasattr(addr, 'city') and hasattr(addr, 'state_or_country') and addr.city and addr.state_or_country:
|
||||
zip_code = f" {addr.zip_code}" if hasattr(addr, 'zip_code') and addr.zip_code else ""
|
||||
lines.append(f"{addr.city}, {addr.state_or_country}{zip_code}")
|
||||
|
||||
# Contact information
|
||||
if hasattr(self.data, 'phone') and self.data.phone:
|
||||
lines.append(f"**Phone:** {self.data.phone}")
|
||||
|
||||
# Mailing address (if different from business address)
|
||||
if hasattr(self.data, 'mailing_address') and self.data.mailing_address:
|
||||
mail_addr = self.data.mailing_address
|
||||
if hasattr(self.data, 'business_address'):
|
||||
# Only include if different
|
||||
business_addr = self.data.business_address
|
||||
if (not hasattr(business_addr, 'street1') or
|
||||
mail_addr.street1 != business_addr.street1):
|
||||
lines.append("")
|
||||
lines.append("**Mailing Address:**")
|
||||
if hasattr(mail_addr, 'street1') and mail_addr.street1:
|
||||
lines.append(f"{mail_addr.street1}")
|
||||
if hasattr(mail_addr, 'city') and hasattr(mail_addr, 'state_or_country'):
|
||||
zip_code = f" {mail_addr.zip_code}" if hasattr(mail_addr, 'zip_code') and mail_addr.zip_code else ""
|
||||
lines.append(f"{mail_addr.city}, {mail_addr.state_or_country}{zip_code}")
|
||||
|
||||
text = "\n".join(lines)
|
||||
|
||||
# Token limiting (4 chars/token heuristic)
|
||||
max_chars = max_tokens * 4
|
||||
if len(text) > max_chars:
|
||||
text = text[:max_chars] + "\n\n[Truncated for token limit]"
|
||||
|
||||
return text
|
||||
|
||||
def __rich__(self):
    """Creates a rich representation of the company with detailed information.

    Builds a nested layout of rich renderables: an identity panel
    (CIK + entity type), a details table (category/industry/fiscal year end),
    a trading panel (exchange/ticker pairs), address and contact panels,
    and an optional former-names panel, all wrapped in one outer Panel.

    NOTE(review): relies on module-level rich imports (Text, Table, Panel,
    Group, Columns, Padding, box) and datefmt being in scope — not visible
    in this chunk; confirm against the file header.
    """

    # The title of the panel: company name plus ticker (companies) or just name (individuals)
    ticker = self.get_ticker()
    if self.data.is_company:
        entity_title = Text.assemble("🏢 ",
                                     (self.data.name, "bold green"),
                                     " ",
                                     (ticker if ticker else "", "bold yellow")
                                     )
    else:
        entity_title = Text.assemble("👤", (self.data.name, "bold green"))

    # Primary Information Table — a single-row, single-column table so the
    # CIK/Type fragments flow on one line
    main_info = Table(box=box.SIMPLE_HEAVY, show_header=False, padding=(0, 1))
    main_info.add_column("Row", style="")  # Single column for the entire row

    row_parts = []
    row_parts.extend([Text("CIK", style="grey60"), Text(str(self.cik), style="bold deep_sky_blue3")])
    if hasattr(self.data, 'entity_type') and self.data.entity_type:
        if self.data.is_individual:
            row_parts.extend([Text("Type", style="grey60"),
                              Text("Individual", style="bold yellow")])
        else:
            # Companies also get a single-width symbol for the entity type
            row_parts.extend([Text("Type", style="grey60"),
                              Text(self.data.entity_type.title(), style="bold yellow"),
                              Text(self._get_operating_type_emoticon(self.data.entity_type), style="bold yellow")])
    main_info.add_row(*row_parts)

    # Detailed Information Table — every cell falls back to "-" when the
    # underlying attribute is missing or falsy
    details = Table(box=box.SIMPLE, show_header=True, padding=(0, 1))
    details.add_column("Category")
    details.add_column("Industry")
    details.add_column("Fiscal Year End")

    details.add_row(
        getattr(self.data, 'category', '-') or "-",
        f"{getattr(self.data, 'sic', '')}: {getattr(self.data, 'sic_description', '')}" if hasattr(self.data, 'sic') and self.data.sic else "-",
        self._format_fiscal_year_date(getattr(self.data, 'fiscal_year_end', '')) if hasattr(self.data, 'fiscal_year_end') and self.data.fiscal_year_end else "-"
    )

    # Combine main_info and details in a single panel; individuals have no
    # category/industry details worth showing
    if self.data.is_company:
        basic_info_renderables = [main_info, details]
    else:
        basic_info_renderables = [main_info]
    basic_info_panel = Panel(
        Group(*basic_info_renderables),
        title="📋 Entity",
        border_style="grey50"
    )

    # Trading Information — one row per (exchange, ticker) pair.
    # strict=False: silently stops at the shorter list if the two differ in length.
    if hasattr(self.data, 'tickers') and hasattr(self.data, 'exchanges') and self.data.tickers and self.data.exchanges:
        trading_info = Table(box=box.SIMPLE, show_header=True, padding=(0, 1))
        trading_info.add_column("Exchange")
        trading_info.add_column("Symbol", style="bold yellow")

        for exchange, ticker in zip(self.data.exchanges, self.data.tickers, strict=False):
            trading_info.add_row(exchange, ticker)

        trading_panel = Panel(
            trading_info,
            title="📈 Exchanges",
            border_style="grey50"
        )
    else:
        # Placeholder panel so the layout keeps its shape with no listings
        trading_panel = Panel(
            Text("No trading information available", style="grey58"),
            title="📈 Trading Information",
            border_style="grey50"
        )

    # Contact Information
    contact_info = Table(box=box.SIMPLE, show_header=False, padding=(0, 1))
    contact_info.add_column("Label", style="bold grey70")
    contact_info.add_column("Value")

    # Computed up-front so the panel is only rendered when at least one
    # contact field is present
    has_contact_info = any([
        hasattr(self.data, 'phone') and self.data.phone,
        hasattr(self.data, 'website') and self.data.website,
        hasattr(self.data, 'investor_website') and self.data.investor_website
    ])

    if hasattr(self.data, 'website') and self.data.website:
        contact_info.add_row("Website", self.data.website)
    if hasattr(self.data, 'investor_website') and self.data.investor_website:
        contact_info.add_row("Investor Relations", self.data.investor_website)
    if hasattr(self.data, 'phone') and self.data.phone:
        contact_info.add_row("Phone", self.data.phone)

    # Three-column layout for addresses and contact info.
    # NOTE(review): assumes business_address/mailing_address expose an
    # `.empty` property (see Address in edgar.entity.data) — confirm.
    contact_renderables = []
    if hasattr(self.data, 'business_address') and not self.data.business_address.empty:
        contact_renderables.append(Panel(
            Text(str(self.data.business_address)),
            title="🏢 Business Address",
            border_style="grey50"
        ))
    if hasattr(self.data, 'mailing_address') and not self.data.mailing_address.empty:
        contact_renderables.append(Panel(
            Text(str(self.data.mailing_address)),
            title="📫 Mailing Address",
            border_style="grey50"
        ))
    if has_contact_info:
        contact_renderables.append(Panel(
            contact_info,
            title="📞 Contact Information",
            border_style="grey50"
        ))

    # Former Names Table (if any exist)
    former_names_panel = None
    if hasattr(self.data, 'former_names') and self.data.former_names:

        former_names_table = Table(box=box.SIMPLE, show_header=False, padding=(0, 1))
        former_names_table.add_column("Previous Company Names")
        former_names_table.add_column("")  # Empty column for better spacing

        # Each entry is a dict with 'name', 'from' and 'to' keys
        # (dates pre-truncated to YYYY-MM-DD by parse_entity_submissions)
        for former_name in self.data.former_names:
            from_date = datefmt(former_name['from'], '%B %Y')
            to_date = datefmt(former_name['to'], '%B %Y')
            former_names_table.add_row(Text(former_name['name'], style="italic"), f"{from_date} to {to_date}")

        former_names_panel = Panel(
            former_names_table,
            title="📜 Former Names",
            border_style="grey50"
        )

    # Combine all sections using Group; the leading Padding adds a blank
    # line above the first panel
    if self.data.is_company:
        content_renderables = [Padding("", (1, 0, 0, 0)), basic_info_panel, trading_panel]
        if len(contact_renderables):
            contact_and_addresses = Columns(contact_renderables, equal=True, expand=True)
            content_renderables.append(contact_and_addresses)
        if former_names_panel:
            content_renderables.append(former_names_panel)
    else:
        content_renderables = [Padding("", (1, 0, 0, 0)), basic_info_panel]
        if len(contact_renderables):
            contact_and_addresses = Columns(contact_renderables, equal=True, expand=True)
            content_renderables.append(contact_and_addresses)

    content = Group(*content_renderables)

    # Create the main panel
    return Panel(
        content,
        title=entity_title,
        subtitle=Text.assemble(
            ("SEC Entity Data", "dim"),
            " • ",
            ("company.docs", "cyan dim"),
            (" for usage guide", "dim")
        ),
        border_style="grey50"
    )
|
||||
|
||||
@staticmethod
|
||||
def _get_operating_type_emoticon(entity_type: str) -> str:
|
||||
"""
|
||||
Generate a meaningful single-width symbol based on the SEC entity type.
|
||||
All symbols are chosen to be single-width to work well with rich borders.
|
||||
|
||||
Args:
|
||||
entity_type (str): The SEC entity type (case-insensitive)
|
||||
|
||||
Returns:
|
||||
str: A single-width symbol representing the entity type
|
||||
"""
|
||||
symbols = {
|
||||
"operating": "○", # Circle for active operations
|
||||
"subsidiary": "→", # Arrow showing connection to parent
|
||||
"inactive": "×", # Cross for inactive
|
||||
"holding company": "■", # Square for solid corporate structure
|
||||
"investment company": "$", # Dollar for investment focus
|
||||
"investment trust": "$", # Dollar for investment focus
|
||||
"shell": "□", # Empty square for shell
|
||||
"development stage": "∆", # Triangle for growth/development
|
||||
"financial services": "¢", # Cent sign for financial services
|
||||
"reit": "⌂", # House symbol
|
||||
"spv": "◊", # Diamond for special purpose
|
||||
"joint venture": "∞" # Infinity for partnership
|
||||
}
|
||||
|
||||
# Clean input: convert to lowercase and strip whitespace
|
||||
cleaned_type = entity_type.lower().strip()
|
||||
|
||||
# Handle some common variations
|
||||
if "investment" in cleaned_type:
|
||||
return symbols["investment company"]
|
||||
if "real estate" in cleaned_type or "reit" in cleaned_type:
|
||||
return symbols["reit"]
|
||||
|
||||
# Return default question mark if type not found
|
||||
return symbols.get(cleaned_type, "")
|
||||
|
||||
@staticmethod
|
||||
def _format_fiscal_year_date(date_str):
|
||||
"""Format fiscal year end date in a human-readable format."""
|
||||
if not date_str:
|
||||
return "-"
|
||||
|
||||
# Dictionary of months
|
||||
months = {
|
||||
"01": "Jan", "02": "Feb", "03": "Mar",
|
||||
"04": "Apr", "05": "May", "06": "Jun",
|
||||
"07": "Jul", "08": "Aug", "09": "Sep",
|
||||
"10": "Oct", "11": "Nov", "12": "Dec"
|
||||
}
|
||||
|
||||
# Extract month and day
|
||||
month = date_str[:2]
|
||||
if month not in months:
|
||||
return date_str
|
||||
|
||||
try:
|
||||
day = str(int(date_str[2:])) # Remove leading zero
|
||||
return f"{months[month]} {day}"
|
||||
except (ValueError, IndexError):
|
||||
return date_str
|
||||
|
||||
|
||||
# Factory functions for backward compatibility
|
||||
|
||||
def get_entity(cik_or_identifier: Union[str, int]) -> Entity:
    """
    Look up any SEC filing entity by its identifier.

    Args:
        cik_or_identifier: CIK number (as int or str) or other identifier

    Returns:
        An Entity for the given identifier
    """
    return Entity(cik_or_identifier)
|
||||
|
||||
|
||||
def get_company(cik_or_ticker: Union[str, int]) -> Company:
    """
    Look up a public company by CIK or ticker symbol.

    Args:
        cik_or_ticker: CIK number or ticker symbol

    Returns:
        A Company for the given identifier
    """
    return Company(cik_or_ticker)
|
||||
|
||||
|
||||
def public_companies() -> Iterable[Company]:
    """
    Iterate over all known public companies.

    Yields:
        One Company per CIK found in the SEC ticker reference data
    """
    # Imported locally to avoid a module-level circular import
    from edgar.reference.tickers import get_cik_tickers

    ticker_frame = get_cik_tickers()
    for _, record in ticker_frame.iterrows():
        yield Company(record.cik)
|
||||
|
||||
|
||||
854
venv/lib/python3.10/site-packages/edgar/entity/data.py
Normal file
854
venv/lib/python3.10/site-packages/edgar/entity/data.py
Normal file
@@ -0,0 +1,854 @@
|
||||
"""
|
||||
Data classes for the Entity package.
|
||||
|
||||
This module contains classes for working with entity data, including
|
||||
addresses, facts, and other structured data from SEC filings.
|
||||
"""
|
||||
import re
|
||||
from functools import cached_property
|
||||
from typing import Any, Dict, List, Optional, Tuple, Union
|
||||
|
||||
import pyarrow as pa
|
||||
import pyarrow.compute as pc
|
||||
|
||||
from edgar.core import listify, log
|
||||
from edgar.dates import InvalidDateException
|
||||
from edgar.entity.filings import EntityFilings
|
||||
from edgar.filtering import filter_by_date, filter_by_form, filter_by_year_quarter
|
||||
from edgar.formatting import reverse_name
|
||||
from edgar.storage import is_using_local_storage
|
||||
|
||||
# Module-level import cache for lazy imports
_IMPORT_CACHE = {}


def lazy_import(module_path):
    """
    Import a module or a module attribute on first use, caching the result.

    Args:
        module_path: Dotted path to a module (e.g. 'json') or to an
            attribute of a module (e.g. 'os.path.join')

    Returns:
        The cached module or attribute
    """
    if module_path not in _IMPORT_CACHE:
        prefix, _, leaf = module_path.rpartition('.')
        if not prefix:
            # Plain module name: import and cache the module itself
            _IMPORT_CACHE[module_path] = __import__(module_path)
        else:
            # Dotted path: import the parent module, then pull out the
            # leaf — fromlist makes __import__ return the parent rather
            # than the top-level package
            parent = __import__(prefix, fromlist=[leaf])
            _IMPORT_CACHE[module_path] = getattr(parent, leaf)

    return _IMPORT_CACHE[module_path]
|
||||
|
||||
|
||||
__all__ = [
|
||||
'Address',
|
||||
'EntityData',
|
||||
'CompanyData',
|
||||
'preprocess_company',
|
||||
'parse_entity_submissions',
|
||||
'extract_company_filings_table',
|
||||
'create_company_filings',
|
||||
'create_default_entity_data'
|
||||
]
|
||||
|
||||
|
||||
def extract_company_filings_table(filings_json: Dict[str, Any]) -> pa.Table:
    """
    Extract company filings from the json response.

    The SEC submissions API returns filings in columnar form (one list per
    field, all the same length). This converts that into a PyArrow table,
    casting filing_date to date32 and acceptanceDateTime to timestamp('us');
    all other columns are kept as strings.

    Args:
        filings_json: The JSON data containing filings (columnar dict)

    Returns:
        A PyArrow Table containing the filings data (possibly empty, but
        always with the 13-column schema below)
    """
    # Import this here to avoid circular imports
    from edgar.core import parse_acceptance_datetime

    # Handle case of no data: an absent/empty accessionNumber column means
    # there are no filings at all
    if not filings_json.get('accessionNumber'):
        # Create an empty table with the right schema (13 columns)
        schema = pa.schema([
            ('accession_number', pa.string()),
            ('filing_date', pa.date32()),
            ('reportDate', pa.string()),
            ('acceptanceDateTime', pa.timestamp('us')),
            ('act', pa.string()),
            ('form', pa.string()),
            ('fileNumber', pa.string()),
            ('items', pa.string()),
            ('size', pa.string()),
            ('isXBRL', pa.string()),
            ('isInlineXBRL', pa.string()),
            ('primaryDocument', pa.string()),
            ('primaryDocDescription', pa.string())
        ])
        return pa.Table.from_arrays([[] for _ in range(13)], schema=schema)
    else:
        # Convert acceptanceDateTime string to datetime
        acceptance_datetimes = [
            parse_acceptance_datetime(dt) for dt in filings_json['acceptanceDateTime']
        ]

        # Column name -> values; filing_date is parsed/cast here, the rest
        # are passed through from the JSON as-is
        fields = {
            'accession_number': filings_json['accessionNumber'],
            'filing_date': pc.cast(pc.strptime(pa.array(filings_json['filingDate']), '%Y-%m-%d', 'us'), pa.date32()),
            'reportDate': filings_json['reportDate'],
            'acceptanceDateTime': acceptance_datetimes,
            'act': filings_json['act'],
            'form': filings_json['form'],
            'fileNumber': filings_json['fileNumber'],
            'items': filings_json['items'],
            'size': filings_json['size'],
            'isXBRL': filings_json['isXBRL'],
            'isInlineXBRL': filings_json['isInlineXBRL'],
            'primaryDocument': filings_json['primaryDocument'],
            'primaryDocDescription': filings_json['primaryDocDescription']
        }

        # Create table using dictionary. filing_date/acceptanceDateTime are
        # already Arrow arrays (or datetime lists); everything else is
        # wrapped with pa.array() here.
        return pa.Table.from_arrays(
            arrays=[pa.array(v) if k not in ['filing_date', 'acceptanceDateTime']
                    else v for k, v in fields.items()],
            names=list(fields.keys())
        )
|
||||
|
||||
|
||||
def create_company_filings(filings_json: Dict[str, Any], cik: int, company_name: str) -> EntityFilings:
    """
    Build an EntityFilings object from the submissions JSON.

    Only the 'recent' section of the payload is parsed here; older filing
    pages are loaded lazily elsewhere.

    Args:
        filings_json: The JSON data containing filings
        cik: The company CIK
        company_name: The company name

    Returns:
        An EntityFilings wrapping the parsed filings table
    """
    filings_table = extract_company_filings_table(filings_json['recent'])
    return EntityFilings(filings_table, cik=cik, company_name=company_name)
|
||||
|
||||
|
||||
def parse_entity_submissions(cjson: Dict[str, Any]) -> 'CompanyData':
    """
    Parse entity submissions from the SEC API.

    Maps the raw submissions JSON (camelCase keys) into a CompanyData with
    snake_case attributes, builds both Address objects, parses the recent
    filings table, and normalizes former-name date ranges.

    Note: required keys (addresses, cik, name, tickers, ...) are accessed
    directly, so a malformed payload raises KeyError.

    Args:
        cjson: The JSON data from the SEC submissions API

    Returns:
        A CompanyData object representing the entity
    """
    mailing_addr = cjson['addresses']['mailing']
    business_addr = cjson['addresses']['business']
    cik = cjson['cik']
    company_name = cjson["name"]
    former_names = cjson.get('formerNames', [])

    # Truncate former-name timestamps to YYYY-MM-DD (first 10 chars),
    # preserving None/empty values as-is. Mutates the dicts in place.
    for former_name in former_names:
        former_name['from'] = former_name['from'][:10] if former_name['from'] else former_name['from']
        former_name['to'] = former_name['to'][:10] if former_name['to'] else former_name['to']

    return CompanyData(
        cik=int(cik),
        name=company_name,
        tickers=cjson['tickers'],
        exchanges=cjson['exchanges'],
        sic=cjson['sic'],
        sic_description=cjson['sicDescription'],
        # SEC embeds literal "<br>" separators in the category string
        category=cjson['category'].replace("<br>", " | ") if cjson['category'] else None,
        fiscal_year_end=cjson['fiscalYearEnd'],
        entity_type=cjson['entityType'],
        phone=cjson['phone'],
        flags=cjson['flags'],
        mailing_address=Address(
            street1=mailing_addr['street1'],
            street2=mailing_addr['street2'],
            city=mailing_addr['city'],
            state_or_country_desc=mailing_addr['stateOrCountryDescription'],
            state_or_country=mailing_addr['stateOrCountry'],
            zipcode=mailing_addr['zipCode'],
        ),
        business_address=Address(
            street1=business_addr['street1'],
            street2=business_addr['street2'],
            city=business_addr['city'],
            state_or_country_desc=business_addr['stateOrCountryDescription'],
            state_or_country=business_addr['stateOrCountry'],
            zipcode=business_addr['zipCode'],
        ),
        filings=create_company_filings(cjson['filings'], cik=cik, company_name=company_name),
        insider_transaction_for_owner_exists=bool(cjson['insiderTransactionForOwnerExists']),
        insider_transaction_for_issuer_exists=bool(cjson['insiderTransactionForIssuerExists']),
        ein=cjson['ein'],
        description=cjson['description'],
        website=cjson['website'],
        investor_website=cjson['investorWebsite'],
        state_of_incorporation=cjson['stateOfIncorporation'],
        state_of_incorporation_description=cjson['stateOfIncorporationDescription'],
        former_names=former_names,
        # 'files' lists the older filing pages used for lazy loading
        files=cjson['filings']['files']
    )
|
||||
|
||||
|
||||
class Address:
    """
    Represents a physical address as reported in SEC submissions.

    This class is optimized for memory usage (``__slots__``) and
    performance (the formatted string form is computed once and cached).
    """
    __slots__ = ('street1', 'street2', 'city', 'state_or_country', 'zipcode', 'state_or_country_desc', '_str_cache')

    def __init__(self,
                 street1: str,
                 street2: Optional[str],
                 city: str,
                 state_or_country: str,
                 zipcode: str,
                 state_or_country_desc: str
                 ):
        """
        Initialize an Address object.

        Args:
            street1: First line of street address
            street2: Second line of street address (optional)
            city: City name
            state_or_country: State or country code
            zipcode: Postal/ZIP code
            state_or_country_desc: Human-readable state or country name
        """
        # Store empty strings instead of None to avoid type checks later
        self.street1: str = street1 or ""
        self.street2: Optional[str] = street2 or ""
        self.city: str = city or ""
        self.state_or_country: str = state_or_country or ""
        self.zipcode: str = zipcode or ""
        self.state_or_country_desc: str = state_or_country_desc or ""
        self._str_cache = None  # lazily-built formatted string

    @property
    def empty(self) -> bool:
        """
        Check if the address is empty (no component set).

        Bug fix: the previous short-circuit returned True when street1,
        city and zipcode were all empty even if street2 or
        state_or_country held a value, contradicting the full check.
        A single consistent expression is used instead.
        """
        return not (self.street1 or self.street2 or self.city or self.state_or_country or self.zipcode)

    def __str__(self):
        """
        Generate a formatted multi-line representation of the address.
        Caches the result for repeated calls.
        """
        if self._str_cache is not None:
            return self._str_cache

        # An address without a first street line renders as empty
        if not self.street1:
            self._str_cache = ""
            return ""

        # Build string only once and cache it
        parts = [self.street1]
        if self.street2:
            parts.append(self.street2)
        parts.append(f"{self.city}, {self.state_or_country_desc} {self.zipcode}")

        self._str_cache = "\n".join(parts)
        return self._str_cache

    def __repr__(self):
        """Generate a string representation suitable for debugging."""
        # Simplified representation that avoids unnecessary string operations
        return f'Address(street1="{self.street1}", street2="{self.street2}", city="{self.city}", zipcode="{self.zipcode}")'

    def to_json(self) -> Dict[str, str]:
        """Convert the address to a JSON-serializable dict."""
        # Direct dictionary creation is faster than multiple assignments
        return {
            'street1': self.street1,
            'street2': self.street2,
            'city': self.city,
            'state_or_country': self.state_or_country,
            'zipcode': self.zipcode,
            'state_or_country_desc': self.state_or_country_desc
        }
|
||||
|
||||
|
||||
class EntityData:
|
||||
"""
|
||||
Container for entity data loaded from SEC submissions API.
|
||||
|
||||
This class provides access to entity metadata and filings.
|
||||
"""
|
||||
|
||||
def __init__(self,
             cik: int,
             name: str,
             tickers: List[str],
             exchanges: List[str],
             sic: str,
             sic_description: str,
             ein: str,
             entity_type: str,
             fiscal_year_end: str,
             filings: EntityFilings,
             business_address: Address,
             mailing_address: Address,
             state_of_incorporation: str,
             **kwargs):
    """
    Initialize a new EntityData instance.

    Args:
        cik: The CIK number
        name: The entity name
        tickers: List of ticker symbols
        exchanges: List of exchanges
        sic: The Standard Industrial Classification code
        sic_description: Description of the SIC code
        ein: The Employer Identification Number
        entity_type: The entity type
        fiscal_year_end: The fiscal year end date (MMDD string)
        filings: The entity's filings
        business_address: The business address
        mailing_address: The mailing address
        state_of_incorporation: The state of incorporation
        **kwargs: Additional attributes, set on the instance verbatim
            (e.g. category, phone, website, former_names, files)
    """
    self.cik: int = cik
    self.name: str = name
    self.sic = sic
    self.sic_description: str = sic_description
    self.ein: str = ein
    self.fiscal_year_end: str = fiscal_year_end
    self.tickers: List[str] = tickers
    self.exchanges: List[str] = exchanges
    self.filings: EntityFilings = filings
    self.entity_type = entity_type
    self.business_address: Address = business_address
    self.mailing_address: Address = mailing_address
    self.state_of_incorporation: str = state_of_incorporation

    # Store all other attributes (makes any extra submissions field
    # available as an instance attribute)
    for key, value in kwargs.items():
        setattr(self, key, value)

    # Initialize lazy loading flag: older filing pages are fetched only
    # when get_filings() needs them
    self._loaded_all_filings: bool = False
    # References to the older filing-page files (also set as self.files
    # by the kwargs loop above when present)
    self._files = kwargs.get('files', [])
|
||||
|
||||
def _load_older_filings(self):
    """
    Load older filings that were not included in the initial data.

    This method implements the lazy loading behavior of filings.
    When first creating an entity, only the most recent filings are loaded
    to keep API response times fast. When more filings are needed, this
    method downloads each additional page listed in self._files from the
    SEC submissions endpoint and concatenates them with the current table.

    Performs network I/O (one request per file in self._files).
    """
    # If we have no files to load, we're done
    if not self._files:
        return

    # Import locally to avoid circular imports using the lazy import cache
    download_json = lazy_import('edgar.httprequests.download_json')

    # Load additional filings from the SEC, starting from the table we
    # already have
    filing_tables = [self.filings.data]
    for file in self._files:
        submissions = download_json("https://data.sec.gov/submissions/" + file['name'])
        filing_table = extract_company_filings_table(submissions)
        filing_tables.append(filing_table)

    # Combine all filing tables
    combined_tables = pa.concat_tables(filing_tables)

    # Update filings.
    # NOTE(review): this local name shadows the module-level EntityFilings
    # import — harmless here, but easy to misread.
    EntityFilings = lazy_import('edgar.entity.filings.EntityFilings')
    self.filings = EntityFilings(combined_tables, cik=self.cik, company_name=self.name)
|
||||
|
||||
def get_filings(self,
                year: Union[int, List[int]] = None,
                quarter: Union[int, List[int]] = None,
                form: Union[str, List] = None,
                accession_number: Union[str, List] = None,
                file_number: Union[str, List] = None,
                filing_date: Union[str, Tuple[str, str]] = None,
                date: Union[str, Tuple[str, str]] = None,
                amendments: bool = True,
                is_xbrl: bool = None,
                is_inline_xbrl: bool = None,
                sort_by: Union[str, List[Tuple[str, str]]] = None,
                trigger_full_load: bool = True
                ) -> EntityFilings:
    """
    Get entity filings with lazy loading behavior.

    Filters are applied in sequence to the PyArrow filings table; a match
    on accession_number short-circuits and returns immediately, skipping
    all later filters.

    Args:
        year: Filter by year(s) (e.g. 2023, [2022, 2023])
        quarter: Filter by quarter(s) (1-4, e.g. 4, [3, 4]); only applied
            together with year
        form: Filter by form type(s)
        accession_number: Filter by accession number(s)
        file_number: Filter by file number(s)
        filing_date: Filter by filing date (YYYY-MM-DD or range)
        date: Alias for filing_date (filing_date wins when both given)
        amendments: Whether to include amendments (default: True)
        is_xbrl: Filter by XBRL status
        is_inline_xbrl: Filter by inline XBRL status
        sort_by: Sort criteria (passed to pyarrow Table.sort_by)
        trigger_full_load: Whether to load all historical filings if not
            already loaded

    Returns:
        Filtered filings, or None when filing_date/date is invalid
        (logged; note the inconsistency with the declared return type —
        callers should handle None)
    """

    # Lazy loading behavior: pull in older filing pages once, unless we
    # are serving from local storage
    if not self._loaded_all_filings and not is_using_local_storage() and trigger_full_load:
        self._load_older_filings()
        self._loaded_all_filings = True

    # Get filings data
    company_filings = self.filings.data

    # Filter by year/quarter first (most selective)
    if year is not None:
        company_filings = filter_by_year_quarter(company_filings, year, quarter)

    # Filter by accession number
    if accession_number:
        company_filings = company_filings.filter(
            pc.is_in(company_filings['accession_number'], pa.array(listify(accession_number))))
        if len(company_filings) >= 1:
            # We found the filing(s) — return immediately, ignoring any
            # remaining filter arguments
            return EntityFilings(company_filings, cik=self.cik, company_name=self.name)

    # Filter by form (with amendments support)
    if form:
        company_filings = filter_by_form(company_filings, form, amendments)

    # Filter by file number
    if file_number:
        company_filings = company_filings.filter(
            pc.is_in(company_filings['fileNumber'], pa.array(listify(file_number))))

    # Filter by XBRL status (column compared against 0/1)
    if is_xbrl is not None:
        company_filings = company_filings.filter(pc.equal(company_filings['isXBRL'], int(is_xbrl)))

    # Filter by inline XBRL status
    if is_inline_xbrl is not None:
        company_filings = company_filings.filter(pc.equal(company_filings['isInlineXBRL'], int(is_inline_xbrl)))

    # Filter by filing date ('date' is the documented alias)
    filing_date = filing_date or date
    if filing_date:
        try:
            company_filings = filter_by_date(company_filings, filing_date, 'filing_date')
        except InvalidDateException as e:
            # Invalid date input: log and return None rather than raise
            log.error(e)
            return None

    # Sort filings
    if sort_by:
        company_filings = company_filings.sort_by(sort_by)

    # Return filtered filings
    return EntityFilings(company_filings, cik=self.cik, company_name=self.name)
|
||||
|
||||
@property
|
||||
def is_company(self) -> bool:
|
||||
"""Determine if this entity is a company."""
|
||||
return not self.is_individual
|
||||
|
||||
@cached_property
def is_individual(self) -> bool:
    """
    Determine if this entity is an individual.

    Tricky logic to detect if a company is an individual or a company.
    Companies have an ein, individuals do not. Oddly Warren Buffet has an EIN but not a state of incorporation
    There may be other edge cases.
    If you have a ticker or exchange you are a company.

    Heuristic order (first match wins): tickers/exchanges -> state of
    incorporation -> entity_type -> company-style filings -> missing EIN.
    Two hard-coded CIK exceptions override the middle branches.
    Cached after the first evaluation (cached_property).
    """
    # Import locally using the lazy import cache
    has_company_filings = lazy_import('edgar.entity.core.has_company_filings')

    # Anything listed on an exchange is a company
    if len(self.tickers) > 0 or len(self.exchanges) > 0:
        return False
    # A state of incorporation implies a company...
    elif hasattr(self,
                 'state_of_incorporation') and self.state_of_incorporation is not None and self.state_of_incorporation != '':
        if self.cik == 1033331:  # Reed Hastings exception
            return True
        return False
    # An explicit entity type (other than ''/'other') implies a company
    elif hasattr(self, 'entity_type') and self.entity_type not in ['', 'other']:
        return False
    # Company-style form types in the filing history imply a company...
    elif has_company_filings(self.filings.data['form']):
        if self.cik == 315090:  # The Warren Buffett exception
            return True
        return False
    # No EIN (or the all-zeros placeholder) implies an individual
    elif not hasattr(self, 'ein') or self.ein is None or self.ein == "000000000":
        return True
    else:
        return False
|
||||
|
||||
def __str__(self):
|
||||
return f"EntityData({self.name} [{self.cik}])"
|
||||
|
||||
def __repr__(self):
    """Debug representation: renders the rich layout to plain text."""
    render = lazy_import('edgar.richtools.repr_rich')
    return render(self.__rich__())
|
||||
|
||||
def __rich__(self):
|
||||
"""Creates a rich representation of the entity with clear information hierarchy."""
|
||||
# Use lazy imports for rich components
|
||||
box = lazy_import('rich.box')
|
||||
Group = lazy_import('rich.console.Group')
|
||||
Columns = lazy_import('rich.columns.Columns')
|
||||
Padding = lazy_import('rich.padding.Padding')
|
||||
Panel = lazy_import('rich.panel.Panel')
|
||||
Table = lazy_import('rich.table.Table')
|
||||
Text = lazy_import('rich.text.Text')
|
||||
find_ticker = lazy_import('edgar.reference.tickers.find_ticker')
|
||||
zip_longest = lazy_import('itertools.zip_longest')
|
||||
datefmt = lazy_import('edgar.formatting.datefmt')
|
||||
|
||||
# Primary entity identification section
|
||||
if self.is_company:
|
||||
ticker = find_ticker(self.cik)
|
||||
ticker = f"{ticker}" if ticker else ""
|
||||
|
||||
# The title of the panel
|
||||
entity_title = Text.assemble("🏢 ",
|
||||
(self.display_name, "bold green"),
|
||||
" ",
|
||||
(f"[{self.cik}] ", "dim"),
|
||||
(ticker, "bold yellow")
|
||||
)
|
||||
else:
|
||||
entity_title = Text.assemble("👤", (self.display_name, "bold green"))
|
||||
|
||||
# Primary Information Table
|
||||
main_info = Table(box=box.SIMPLE_HEAVY, show_header=False, padding=(0, 1))
|
||||
main_info.add_column("Row", style="") # Single column for the entire row
|
||||
|
||||
row_parts = []
|
||||
row_parts.extend([Text("CIK", style="grey60"), Text(str(self.cik), style="bold deep_sky_blue3")])
|
||||
if hasattr(self, 'entity_type') and self.entity_type:
|
||||
if self.is_individual:
|
||||
row_parts.extend([Text("Type", style="grey60"),
|
||||
Text("Individual", style="bold yellow")])
|
||||
else:
|
||||
row_parts.extend([Text("Type", style="grey60"),
|
||||
Text(self.entity_type.title(), style="bold yellow"),
|
||||
Text(self._get_operating_type_emoticon(self.entity_type), style="bold yellow")])
|
||||
main_info.add_row(*row_parts)
|
||||
|
||||
# Detailed Information Table
|
||||
details = Table(box=box.SIMPLE, show_header=True, padding=(0, 1))
|
||||
details.add_column("Category")
|
||||
details.add_column("Industry")
|
||||
details.add_column("Fiscal Year End")
|
||||
|
||||
details.add_row(
|
||||
getattr(self, 'category', '-') or "-",
|
||||
f"{getattr(self, 'sic', '')}: {getattr(self, 'sic_description', '')}" if hasattr(self,
|
||||
'sic') and self.sic else "-",
|
||||
self._format_fiscal_year_date(getattr(self, 'fiscal_year_end', '')) if hasattr(self,
|
||||
'fiscal_year_end') and self.fiscal_year_end else "-"
|
||||
)
|
||||
|
||||
# Combine main_info and details in a single panel
|
||||
if self.is_company:
|
||||
basic_info_renderables = [main_info, details]
|
||||
else:
|
||||
basic_info_renderables = [main_info]
|
||||
basic_info_panel = Panel(
|
||||
Group(*basic_info_renderables),
|
||||
title="📋 Entity",
|
||||
border_style="grey50"
|
||||
)
|
||||
|
||||
# Trading Information
|
||||
if self.tickers and self.exchanges:
|
||||
trading_info = Table(box=box.SIMPLE, show_header=True, padding=(0, 1))
|
||||
trading_info.add_column("Exchange")
|
||||
trading_info.add_column("Symbol", style="bold yellow")
|
||||
|
||||
for exchange, ticker in zip_longest(self.exchanges, self.tickers, fillvalue="-"):
|
||||
trading_info.add_row(exchange, ticker)
|
||||
|
||||
trading_panel = Panel(
|
||||
trading_info,
|
||||
title="📈 Exchanges",
|
||||
border_style="grey50"
|
||||
)
|
||||
else:
|
||||
trading_panel = Panel(
|
||||
Text("No trading information available", style="grey58"),
|
||||
title="📈 Trading Information",
|
||||
border_style="grey50"
|
||||
)
|
||||
|
||||
# Contact Information
|
||||
contact_info = Table(box=box.SIMPLE, show_header=False, padding=(0, 1))
|
||||
contact_info.add_column("Label", style="bold grey70")
|
||||
contact_info.add_column("Value")
|
||||
|
||||
has_contact_info = any([
|
||||
hasattr(self, 'phone') and self.phone,
|
||||
hasattr(self, 'website') and self.website,
|
||||
hasattr(self, 'investor_website') and self.investor_website
|
||||
])
|
||||
|
||||
if hasattr(self, 'website') and self.website:
|
||||
contact_info.add_row("Website", self.website)
|
||||
if hasattr(self, 'investor_website') and self.investor_website:
|
||||
contact_info.add_row("Investor Relations", self.investor_website)
|
||||
if hasattr(self, 'phone') and self.phone:
|
||||
contact_info.add_row("Phone", self.phone)
|
||||
|
||||
# Three-column layout for addresses and contact info
|
||||
contact_renderables = []
|
||||
if hasattr(self, 'business_address') and not self.business_address.empty:
|
||||
contact_renderables.append(Panel(
|
||||
Text(str(self.business_address)),
|
||||
title="🏢 Business Address",
|
||||
border_style="grey50"
|
||||
))
|
||||
if hasattr(self, 'mailing_address') and not self.mailing_address.empty:
|
||||
contact_renderables.append(Panel(
|
||||
Text(str(self.mailing_address)),
|
||||
title="📫 Mailing Address",
|
||||
border_style="grey50"
|
||||
))
|
||||
if has_contact_info:
|
||||
contact_renderables.append(Panel(
|
||||
contact_info,
|
||||
title="📞 Contact Information",
|
||||
border_style="grey50"
|
||||
))
|
||||
|
||||
# Former Names Table (if any exist)
|
||||
former_names_panel = None
|
||||
if hasattr(self, 'former_names') and self.former_names:
|
||||
former_names_table = Table(box=box.SIMPLE, show_header=False, padding=(0, 1))
|
||||
former_names_table.add_column("Previous Company Names")
|
||||
former_names_table.add_column("") # Empty column for better spacing
|
||||
|
||||
for former_name in self.former_names:
|
||||
from_date = datefmt(former_name['from'], '%B %Y')
|
||||
to_date = datefmt(former_name['to'], '%B %Y')
|
||||
former_names_table.add_row(Text(former_name['name'], style="italic"), f"{from_date} to {to_date}")
|
||||
|
||||
former_names_panel = Panel(
|
||||
former_names_table,
|
||||
title="📜 Former Names",
|
||||
border_style="grey50"
|
||||
)
|
||||
|
||||
# Combine all sections using Group
|
||||
if self.is_company:
|
||||
content_renderables = [Padding("", (1, 0, 0, 0)), basic_info_panel, trading_panel]
|
||||
if len(contact_renderables):
|
||||
contact_and_addresses = Columns(contact_renderables, equal=True, expand=True)
|
||||
content_renderables.append(contact_and_addresses)
|
||||
if former_names_panel:
|
||||
content_renderables.append(former_names_panel)
|
||||
else:
|
||||
content_renderables = [Padding("", (1, 0, 0, 0)), basic_info_panel]
|
||||
if len(contact_renderables):
|
||||
contact_and_addresses = Columns(contact_renderables, equal=True, expand=True)
|
||||
content_renderables.append(contact_and_addresses)
|
||||
|
||||
content = Group(*content_renderables)
|
||||
|
||||
# Create the main panel
|
||||
return Panel(
|
||||
content,
|
||||
title=entity_title,
|
||||
subtitle="SEC Entity Data",
|
||||
border_style="grey50"
|
||||
)
|
||||
|
||||
@property
|
||||
def display_name(self) -> str:
|
||||
"""Reverse the name if it is a company"""
|
||||
if self.is_company:
|
||||
return self.name
|
||||
|
||||
return reverse_name(self.name)
|
||||
|
||||
@staticmethod
|
||||
def _get_operating_type_emoticon(entity_type: str) -> str:
|
||||
"""
|
||||
Generate a meaningful single-width symbol based on the SEC entity type.
|
||||
All symbols are chosen to be single-width to work well with rich borders.
|
||||
|
||||
Args:
|
||||
entity_type (str): The SEC entity type (case-insensitive)
|
||||
|
||||
Returns:
|
||||
str: A single-width symbol representing the entity type
|
||||
"""
|
||||
symbols = {
|
||||
"operating": "○", # Circle for active operations
|
||||
"subsidiary": "→", # Arrow showing connection to parent
|
||||
"inactive": "×", # Cross for inactive
|
||||
"holding company": "■", # Square for solid corporate structure
|
||||
"investment company": "$", # Dollar for investment focus
|
||||
"investment trust": "$", # Dollar for investment focus
|
||||
"shell": "□", # Empty square for shell
|
||||
"development stage": "∆", # Triangle for growth/development
|
||||
"financial services": "¢", # Cent sign for financial services
|
||||
"reit": "⌂", # House symbol
|
||||
"spv": "◊", # Diamond for special purpose
|
||||
"joint venture": "∞" # Infinity for partnership
|
||||
}
|
||||
|
||||
# Clean input: convert to lowercase and strip whitespace
|
||||
cleaned_type = entity_type.lower().strip()
|
||||
|
||||
# Handle some common variations
|
||||
if "investment" in cleaned_type:
|
||||
return symbols["investment company"]
|
||||
if "real estate" in cleaned_type or "reit" in cleaned_type:
|
||||
return symbols["reit"]
|
||||
|
||||
# Return default question mark if type not found
|
||||
return symbols.get(cleaned_type, "")
|
||||
|
||||
@staticmethod
|
||||
def _format_fiscal_year_date(date_str):
|
||||
"""Format fiscal year end date in a human-readable format."""
|
||||
if not date_str:
|
||||
return "-"
|
||||
|
||||
# Dictionary of months
|
||||
months = {
|
||||
"01": "Jan", "02": "Feb", "03": "Mar",
|
||||
"04": "Apr", "05": "May", "06": "Jun",
|
||||
"07": "Jul", "08": "Aug", "09": "Sep",
|
||||
"10": "Oct", "11": "Nov", "12": "Dec"
|
||||
}
|
||||
|
||||
# Extract month and day
|
||||
month = date_str[:2]
|
||||
if month not in months:
|
||||
return date_str
|
||||
|
||||
try:
|
||||
day = str(int(date_str[2:])) # Remove leading zero
|
||||
return f"{months[month]} {day}"
|
||||
except (ValueError, IndexError):
|
||||
return date_str
|
||||
|
||||
|
||||
class CompanyData(EntityData):
|
||||
"""
|
||||
Specialized container for company data loaded from SEC submissions API.
|
||||
|
||||
This is a specialized version of EntityData specifically for companies.
|
||||
It adds company-specific methods and properties.
|
||||
"""
|
||||
|
||||
def __init__(self, **kwargs):
|
||||
"""Construct a new CompanyData object."""
|
||||
super().__init__(**kwargs)
|
||||
|
||||
@property
|
||||
def industry(self) -> str:
|
||||
"""Get the industry description for this company."""
|
||||
return getattr(self, 'sic_description', '')
|
||||
|
||||
def get_ticker(self) -> Optional[str]:
|
||||
"""Get the primary ticker for this company."""
|
||||
if self.tickers and len(self.tickers) > 0:
|
||||
return self.tickers[0]
|
||||
return None
|
||||
|
||||
def __str__(self):
|
||||
ticker = self.get_ticker()
|
||||
ticker_str = f" - {ticker}" if ticker else ""
|
||||
return f"CompanyData({self.name} [{self.cik}]{ticker_str})"
|
||||
|
||||
|
||||
# Compile regex patterns for better performance
|
||||
_COMPANY_TYPES_PATTERN = re.compile(r"(L\.?L\.?C\.?|Inc\.?|Ltd\.?|L\.?P\.?|/[A-Za-z]{2,3}/?| CORP(ORATION)?|PLC| AG)$",
|
||||
re.IGNORECASE)
|
||||
_PUNCTUATION_PATTERN = re.compile(r"\.|,")
|
||||
|
||||
|
||||
def preprocess_company(company: str) -> str:
|
||||
"""preprocess the company name for storing in the search index"""
|
||||
comp = _COMPANY_TYPES_PATTERN.sub("", company.lower())
|
||||
comp = _PUNCTUATION_PATTERN.sub("", comp)
|
||||
return comp.strip()
|
||||
|
||||
|
||||
def create_default_entity_data(cik: int) -> 'EntityData':
|
||||
"""
|
||||
Create a default EntityData instance for when entity data cannot be found.
|
||||
|
||||
Args:
|
||||
cik: The CIK number to use for the entity
|
||||
|
||||
Returns:
|
||||
A minimal EntityData instance with default values
|
||||
"""
|
||||
# Create a minimal EntityData with blank/empty values
|
||||
empty_address = Address(
|
||||
street1="",
|
||||
street2="",
|
||||
city="",
|
||||
state_or_country="",
|
||||
zipcode="",
|
||||
state_or_country_desc=""
|
||||
)
|
||||
|
||||
# Import using lazy import cache
|
||||
empty_company_filings = lazy_import('edgar.entity.filings.empty_company_filings')
|
||||
|
||||
# Use the CIK as the name since we don't know the real name
|
||||
name = f"Entity {cik}"
|
||||
|
||||
# Create a minimal entity data
|
||||
return EntityData(
|
||||
cik=cik,
|
||||
name=name,
|
||||
tickers=[],
|
||||
exchanges=[],
|
||||
filings=empty_company_filings(cik, name),
|
||||
business_address=empty_address,
|
||||
mailing_address=empty_address,
|
||||
category="",
|
||||
sic=None,
|
||||
sic_description="",
|
||||
fiscal_year_end="",
|
||||
entity_type="",
|
||||
phone="",
|
||||
flags="",
|
||||
insider_transaction_for_owner_exists=False,
|
||||
insider_transaction_for_issuer_exists=False,
|
||||
ein="",
|
||||
description="",
|
||||
website="",
|
||||
investor_website="",
|
||||
state_of_incorporation="",
|
||||
state_of_incorporation_description="",
|
||||
former_names=[],
|
||||
files=[]
|
||||
)
|
||||
Binary file not shown.
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,65 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Process the learned canonical structures into a simplified mappings file
|
||||
optimized for the Facts API.
|
||||
"""
|
||||
|
||||
import json
|
||||
|
||||
|
||||
def process_mappings():
|
||||
"""Convert canonical structures to simple concept->statement mappings."""
|
||||
|
||||
# Load canonical structures
|
||||
with open('learned_mappings.json', 'r') as f:
|
||||
canonical = json.load(f)
|
||||
|
||||
# Create simplified mappings
|
||||
mappings = {}
|
||||
metadata = {
|
||||
'version': '1.0.0',
|
||||
'generated': '2025-08-13',
|
||||
'companies_analyzed': 133,
|
||||
'source': 'structural_learning_production_run'
|
||||
}
|
||||
|
||||
# Process each statement type
|
||||
for statement_type, concepts in canonical.items():
|
||||
for concept_data in concepts:
|
||||
concept = concept_data['concept']
|
||||
|
||||
# Only include high-confidence mappings
|
||||
if concept_data['occurrence_rate'] >= 0.3: # 30% threshold
|
||||
mappings[concept] = {
|
||||
'statement_type': statement_type,
|
||||
'confidence': concept_data['occurrence_rate'],
|
||||
'label': concept_data['label'],
|
||||
'parent': concept_data.get('parent'),
|
||||
'is_abstract': concept_data.get('is_abstract', False),
|
||||
'is_total': concept_data.get('is_total', False),
|
||||
'section': concept_data.get('section'),
|
||||
'avg_depth': concept_data.get('avg_depth', 0)
|
||||
}
|
||||
|
||||
# Save processed mappings
|
||||
output = {
|
||||
'metadata': metadata,
|
||||
'mappings': mappings
|
||||
}
|
||||
|
||||
with open('statement_mappings_v1.json', 'w') as f:
|
||||
json.dump(output, f, indent=2)
|
||||
|
||||
print(f"Processed {len(mappings)} concept mappings")
|
||||
print("Statement distribution:")
|
||||
|
||||
stmt_counts = {}
|
||||
for concept, data in mappings.items():
|
||||
stmt = data['statement_type']
|
||||
stmt_counts[stmt] = stmt_counts.get(stmt, 0) + 1
|
||||
|
||||
for stmt, count in sorted(stmt_counts.items()):
|
||||
print(f" {stmt}: {count}")
|
||||
|
||||
if __name__ == "__main__":
|
||||
process_mappings()
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
1070
venv/lib/python3.10/site-packages/edgar/entity/docs/Company.md
Normal file
1070
venv/lib/python3.10/site-packages/edgar/entity/docs/Company.md
Normal file
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,557 @@
|
||||
# EntityFiling Class Documentation
|
||||
|
||||
## Overview
|
||||
|
||||
The `EntityFiling` class extends the base `Filing` class with additional entity-specific metadata and functionality. When you access filings through a `Company` object, you get `EntityFiling` instances that include enriched information from the SEC's company submissions API.
|
||||
|
||||
**Key Differences from Base Filing:**
|
||||
- Additional metadata (items, acceptance datetime, file number, etc.)
|
||||
- `related_filings()` method to find filings by file number
|
||||
- XBRL format indicators (is_xbrl, is_inline_xbrl)
|
||||
- Report date separate from filing date
|
||||
- Access to entity context
|
||||
|
||||
## Getting EntityFilings
|
||||
|
||||
### From Company
|
||||
|
||||
```python
|
||||
from edgar import Company
|
||||
|
||||
# Get company
|
||||
company = Company("AAPL")
|
||||
|
||||
# Get filings - returns EntityFiling instances
|
||||
filings = company.get_filings(form="10-K")
|
||||
filing = filings.latest()
|
||||
|
||||
# filing is now an EntityFiling, not base Filing
|
||||
print(type(filing)) # <class 'edgar.entity.filings.EntityFiling'>
|
||||
```
|
||||
|
||||
### Automatic Enhancement
|
||||
|
||||
When you call `company.get_filings()`, the filings are automatically EntityFiling instances with additional metadata.
|
||||
|
||||
## Common Actions
|
||||
|
||||
Quick reference for the most frequently used EntityFiling methods:
|
||||
|
||||
### Access Filing Content
|
||||
```python
|
||||
# Get HTML content
|
||||
html = filing.html()
|
||||
|
||||
# Get plain text
|
||||
text = filing.text()
|
||||
|
||||
# Get markdown formatted content
|
||||
markdown = filing.markdown()
|
||||
```
|
||||
|
||||
### Get Structured Data
|
||||
```python
|
||||
# Get form-specific object (10-K, 10-Q, 8-K, etc.)
|
||||
report = filing.obj()
|
||||
|
||||
# Get XBRL financial data
|
||||
xbrl = filing.xbrl()
|
||||
```
|
||||
|
||||
### Entity-Specific Features
|
||||
```python
|
||||
# Find related filings (amendments, etc.)
|
||||
related = filing.related_filings()
|
||||
|
||||
# Check XBRL availability
|
||||
if filing.is_xbrl:
|
||||
xbrl = filing.xbrl()
|
||||
|
||||
# Access entity-specific metadata
|
||||
print(filing.report_date) # Period end date
|
||||
print(filing.items) # 8-K items
|
||||
print(filing.file_number) # SEC file number
|
||||
```
|
||||
|
||||
### View in Browser
|
||||
```python
|
||||
# Open filing in web browser
|
||||
filing.open()
|
||||
```
|
||||
|
||||
### Get Attachments
|
||||
```python
|
||||
# Access all filing attachments
|
||||
attachments = filing.attachments
|
||||
```
|
||||
|
||||
## EntityFiling-Specific Attributes
|
||||
|
||||
### Additional Metadata
|
||||
|
||||
| Attribute | Type | Description |
|
||||
|-----------|------|-------------|
|
||||
| `report_date` | str | Period end date for the report (YYYY-MM-DD) |
|
||||
| `acceptance_datetime` | str | SEC acceptance timestamp |
|
||||
| `file_number` | str | SEC file number for tracking related filings |
|
||||
| `items` | str | 8-K items (e.g., "2.02,9.01") |
|
||||
| `size` | int | Filing size in bytes |
|
||||
| `primary_document` | str | Primary document filename |
|
||||
| `primary_doc_description` | str | Description of primary document |
|
||||
| `is_xbrl` | bool | Whether filing has XBRL data |
|
||||
| `is_inline_xbrl` | bool | Whether filing uses inline XBRL |
|
||||
|
||||
### Accessing Additional Metadata
|
||||
|
||||
```python
|
||||
filing = company.get_filings(form="10-K").latest()
|
||||
|
||||
# Entity-specific attributes
|
||||
print(f"Report Date: {filing.report_date}")
|
||||
print(f"Accepted: {filing.acceptance_datetime}")
|
||||
print(f"File Number: {filing.file_number}")
|
||||
print(f"Has XBRL: {filing.is_xbrl}")
|
||||
print(f"Inline XBRL: {filing.is_inline_xbrl}")
|
||||
print(f"Size: {filing.size:,} bytes")
|
||||
```
|
||||
|
||||
## Working with 8-K Items
|
||||
|
||||
The `items` attribute is especially useful for 8-K current reports, which can cover multiple topics.
|
||||
|
||||
### Understanding 8-K Items
|
||||
|
||||
8-K items indicate what events or information the filing reports:
|
||||
- **2.02** - Results of Operations and Financial Condition
|
||||
- **5.02** - Departure/Election of Directors or Officers
|
||||
- **8.01** - Other Events
|
||||
- **9.01** - Financial Statements and Exhibits
|
||||
|
||||
```python
|
||||
# Get 8-K filings
|
||||
filings_8k = company.get_filings(form="8-K")
|
||||
|
||||
# Filter by items
|
||||
for filing in filings_8k:
|
||||
if filing.items and "2.02" in filing.items:
|
||||
print(f"Earnings 8-K: {filing.filing_date}")
|
||||
print(f" Items: {filing.items}")
|
||||
```
|
||||
|
||||
### Important Note on Legacy Filings
|
||||
|
||||
**Data Source Limitation**: The `items` value comes from SEC metadata, not from parsing the filing document.
|
||||
|
||||
**For Legacy SGML Filings (1999-2001)**: The SEC's historical metadata may be incorrect or incomplete. Modern XML filings (2005+) have accurate metadata.
|
||||
|
||||
**Workaround**: For accurate item extraction from legacy SGML 8-K filings, parse the filing text directly:
|
||||
|
||||
```python
|
||||
# For legacy filings, parse the document
|
||||
filing_text = filing.text()
|
||||
|
||||
# Use regex to find items (adjust pattern as needed)
|
||||
import re
|
||||
items_pattern = r'Item\s+(\d+\.\d+)'
|
||||
found_items = re.findall(items_pattern, filing_text, re.IGNORECASE)
|
||||
```
|
||||
|
||||
## Related Filings
|
||||
|
||||
### Finding Related Filings by File Number
|
||||
|
||||
Use the `file_number` to find amendments, related documents, or filings from the same series:
|
||||
|
||||
```python
|
||||
# Get original filing
|
||||
filing = company.get_filings(form="10-K").latest()
|
||||
|
||||
# Find all related filings (amendments, etc.)
|
||||
related = filing.related_filings()
|
||||
|
||||
print(f"Original filing: {filing.accession_no}")
|
||||
print(f"Related filings: {len(related)}")
|
||||
|
||||
for f in related:
|
||||
print(f" {f.form} - {f.filing_date}")
|
||||
```
|
||||
|
||||
### Use Cases for Related Filings
|
||||
|
||||
**1. Find Amendments:**
|
||||
```python
|
||||
# Get original 10-K
|
||||
filing_10k = company.get_filings(form="10-K").latest()
|
||||
|
||||
# Find any amendments
|
||||
related = filing_10k.related_filings()
|
||||
amendments = related.filter(form="10-K/A")
|
||||
|
||||
if len(amendments) > 0:
|
||||
print("Filing was amended:")
|
||||
for amendment in amendments:
|
||||
print(f" {amendment.filing_date}: {amendment.accession_no}")
|
||||
```
|
||||
|
||||
**2. Track Filing Series:**
|
||||
```python
|
||||
# Get S-1 registration
|
||||
s1 = company.get_filings(form="S-1").latest()
|
||||
|
||||
# Find all related S-1 amendments
|
||||
series = s1.related_filings()
|
||||
print(f"Registration series: {len(series)} filings")
|
||||
```
|
||||
|
||||
## XBRL Indicators
|
||||
|
||||
The `is_xbrl` and `is_inline_xbrl` attributes help determine if structured financial data is available.
|
||||
|
||||
### Checking XBRL Availability
|
||||
|
||||
```python
|
||||
filing = company.get_filings(form="10-K").latest()
|
||||
|
||||
if filing.is_xbrl:
|
||||
print("Filing has XBRL data")
|
||||
|
||||
if filing.is_inline_xbrl:
|
||||
print(" Uses inline XBRL format")
|
||||
xbrl = filing.xbrl() # Parse XBRL data
|
||||
else:
|
||||
print(" Uses traditional XBRL format")
|
||||
else:
|
||||
print("No XBRL data available")
|
||||
```
|
||||
|
||||
### Filtering by XBRL
|
||||
|
||||
```python
|
||||
# Get only filings with XBRL data
|
||||
filings = company.get_filings(form="10-Q")
|
||||
|
||||
xbrl_filings = [f for f in filings if f.is_xbrl]
|
||||
print(f"{len(xbrl_filings)} of {len(filings)} have XBRL")
|
||||
|
||||
# Check inline XBRL adoption
|
||||
inline_count = sum(1 for f in xbrl_filings if f.is_inline_xbrl)
|
||||
print(f"{inline_count} use inline XBRL format")
|
||||
```
|
||||
|
||||
## Report Date vs Filing Date
|
||||
|
||||
EntityFiling provides both `report_date` and `filing_date`:
|
||||
|
||||
- **`report_date`**: Period end date (what the filing reports on)
|
||||
- **`filing_date`**: When the filing was submitted to SEC
|
||||
|
||||
```python
|
||||
filing = company.get_filings(form="10-Q").latest()
|
||||
|
||||
print(f"Period Ended: {filing.report_date}")
|
||||
print(f"Filed On: {filing.filing_date}")
|
||||
|
||||
# Calculate filing lag
|
||||
from datetime import datetime
|
||||
report_dt = datetime.strptime(filing.report_date, '%Y-%m-%d')
|
||||
filing_dt = datetime.strptime(filing.filing_date, '%Y-%m-%d')
|
||||
lag_days = (filing_dt - report_dt).days
|
||||
|
||||
print(f"Filing lag: {lag_days} days")
|
||||
```
|
||||
|
||||
## Common Workflows
|
||||
|
||||
### Analyzing 8-K Patterns
|
||||
|
||||
```python
|
||||
# Get all 8-K filings
|
||||
filings_8k = company.get_filings(form="8-K")
|
||||
|
||||
# Categorize by item
|
||||
from collections import Counter
|
||||
item_counts = Counter()
|
||||
|
||||
for filing in filings_8k:
|
||||
if filing.items:
|
||||
for item in filing.items.split(','):
|
||||
item_counts[item.strip()] += 1
|
||||
|
||||
# Show most common 8-K topics
|
||||
print("Most common 8-K items:")
|
||||
for item, count in item_counts.most_common(5):
|
||||
print(f" Item {item}: {count} filings")
|
||||
```
|
||||
|
||||
### Track Amendment Activity
|
||||
|
||||
```python
|
||||
# Get all 10-K filings including amendments
|
||||
all_10k = company.get_filings(form=["10-K", "10-K/A"])
|
||||
|
||||
# Group by year
|
||||
from collections import defaultdict
|
||||
by_year = defaultdict(list)
|
||||
|
||||
for filing in all_10k:
|
||||
year = filing.report_date[:4]
|
||||
by_year[year].append(filing)
|
||||
|
||||
# Check which years had amendments
|
||||
for year in sorted(by_year.keys(), reverse=True):
|
||||
filings = by_year[year]
|
||||
has_amendment = any('/A' in f.form for f in filings)
|
||||
status = "amended" if has_amendment else "original"
|
||||
print(f"{year}: {len(filings)} filing(s) - {status}")
|
||||
```
|
||||
|
||||
### Find Earnings Announcements
|
||||
|
||||
```python
|
||||
# Find 8-K filings with earnings (Item 2.02)
|
||||
earnings_8k = []
|
||||
|
||||
for filing in company.get_filings(form="8-K"):
|
||||
if filing.items and "2.02" in filing.items:
|
||||
earnings_8k.append(filing)
|
||||
|
||||
print(f"Found {len(earnings_8k)} earnings 8-K filings")
|
||||
|
||||
# Show filing timeline
|
||||
for filing in earnings_8k[-5:]: # Last 5
|
||||
print(f"{filing.report_date}: {filing.filing_date}")
|
||||
```
|
||||
|
||||
### Check XBRL Adoption Timeline
|
||||
|
||||
```python
|
||||
# Track when company started using XBRL
|
||||
filings = company.get_filings(form="10-K")
|
||||
|
||||
for filing in filings:
|
||||
xbrl_status = "inline XBRL" if filing.is_inline_xbrl else "XBRL" if filing.is_xbrl else "no XBRL"
|
||||
print(f"{filing.filing_date}: {xbrl_status}")
|
||||
```
|
||||
|
||||
## Integration with Base Filing Features
|
||||
|
||||
EntityFiling inherits all methods from the base Filing class:
|
||||
|
||||
```python
|
||||
filing = company.get_filings(form="10-K").latest()
|
||||
|
||||
# All base Filing methods work
|
||||
html = filing.html()
|
||||
text = filing.text()
|
||||
markdown = filing.markdown()
|
||||
xbrl = filing.xbrl()
|
||||
filing.open()
|
||||
|
||||
# PLUS entity-specific features
|
||||
related = filing.related_filings()
|
||||
print(f"8-K items: {filing.items}")
|
||||
print(f"Has XBRL: {filing.is_xbrl}")
|
||||
```
|
||||
|
||||
## Comparison: EntityFiling vs Base Filing
|
||||
|
||||
### When You Get Each Type
|
||||
|
||||
**EntityFiling** - From Company context:
|
||||
```python
|
||||
company = Company("AAPL")
|
||||
filing = company.get_filings(form="10-K").latest()
|
||||
# Type: EntityFiling (with extra metadata)
|
||||
```
|
||||
|
||||
**Base Filing** - From general search:
|
||||
```python
|
||||
from edgar import get_filings
|
||||
filings = get_filings(2024, 3, form="10-K")
|
||||
filing = filings[0]
|
||||
# Type: Filing (base class)
|
||||
```
|
||||
|
||||
### Feature Comparison
|
||||
|
||||
| Feature | Base Filing | EntityFiling |
|
||||
|---------|-------------|--------------|
|
||||
| Basic metadata | ✅ | ✅ |
|
||||
| Content access (html, text) | ✅ | ✅ |
|
||||
| XBRL parsing | ✅ | ✅ |
|
||||
| Report date | ❌ | ✅ |
|
||||
| Acceptance datetime | ❌ | ✅ |
|
||||
| File number | ❌ | ✅ |
|
||||
| 8-K items | ❌ | ✅ |
|
||||
| XBRL indicators | ❌ | ✅ |
|
||||
| related_filings() | ❌ | ✅ |
|
||||
|
||||
## Best Practices
|
||||
|
||||
### 1. Use EntityFiling for Company Analysis
|
||||
|
||||
When working with a specific company, always access filings through the Company object to get EntityFiling benefits:
|
||||
|
||||
```python
|
||||
# Good - get EntityFiling with metadata
|
||||
company = Company("AAPL")
|
||||
filing = company.get_filings(form="10-K").latest()
|
||||
|
||||
# Less ideal - get base Filing without metadata
|
||||
filings = get_filings(2024, 3, form="10-K").filter(ticker="AAPL")
|
||||
filing = filings[0]
|
||||
```
|
||||
|
||||
### 2. Check XBRL Availability Before Parsing
|
||||
|
||||
```python
|
||||
filing = company.get_filings(form="10-K").latest()
|
||||
|
||||
if filing.is_xbrl:
|
||||
xbrl = filing.xbrl()
|
||||
statements = xbrl.statements
|
||||
else:
|
||||
print("No structured financial data available")
|
||||
```
|
||||
|
||||
### 3. Handle Missing Items Gracefully
|
||||
|
||||
```python
|
||||
# Items may be None or empty string
|
||||
if filing.items:
|
||||
items_list = filing.items.split(',')
|
||||
else:
|
||||
items_list = []
|
||||
```
|
||||
|
||||
### 4. Use Related Filings to Track Changes
|
||||
|
||||
```python
|
||||
# Find if filing was amended
|
||||
filing = company.get_filings(form="10-K").latest()
|
||||
related = filing.related_filings()
|
||||
|
||||
amendments = [f for f in related if '/A' in f.form]
|
||||
if amendments:
|
||||
print(f"This filing has {len(amendments)} amendment(s)")
|
||||
latest_amendment = amendments[-1]
|
||||
print(f"Most recent: {latest_amendment.filing_date}")
|
||||
```
|
||||
|
||||
## Error Handling
|
||||
|
||||
### Missing Attributes
|
||||
|
||||
Not all filings have all attributes populated:
|
||||
|
||||
```python
|
||||
filing = company.get_filings(form="8-K").latest()
|
||||
|
||||
# Some filings may not have items
|
||||
items = filing.items if filing.items else "Not specified"
|
||||
|
||||
# File number should always be present for EntityFiling
|
||||
if filing.file_number:
|
||||
print(f"File number: {filing.file_number}")
|
||||
```
|
||||
|
||||
### XBRL Parsing Failures
|
||||
|
||||
Even if `is_xbrl` is True, parsing can fail:
|
||||
|
||||
```python
|
||||
if filing.is_xbrl:
|
||||
try:
|
||||
xbrl = filing.xbrl()
|
||||
statements = xbrl.statements
|
||||
except Exception as e:
|
||||
print(f"XBRL parsing failed: {e}")
|
||||
# Fall back to text parsing
|
||||
text = filing.text()
|
||||
```
|
||||
|
||||
## Performance Considerations
|
||||
|
||||
### Efficient Filtering
|
||||
|
||||
Use EntityFiling metadata to filter before expensive operations:
|
||||
|
||||
```python
|
||||
# Filter by XBRL availability first
|
||||
filings = company.get_filings(form="10-Q")
|
||||
xbrl_filings = [f for f in filings if f.is_xbrl]
|
||||
|
||||
# Then parse only those with XBRL
|
||||
for filing in xbrl_filings:
|
||||
xbrl = filing.xbrl()
|
||||
# Process XBRL data...
|
||||
```
|
||||
|
||||
### Batch Operations
|
||||
|
||||
When processing many filings, check size first:
|
||||
|
||||
```python
|
||||
filings = company.get_filings()
|
||||
|
||||
# Process smaller filings first
|
||||
sorted_filings = sorted(filings, key=lambda f: f.size)
|
||||
|
||||
for filing in sorted_filings[:10]: # Process 10 smallest
|
||||
html = filing.html()
|
||||
# Process content...
|
||||
```
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### "EntityFiling has no attribute 'X'"
|
||||
|
||||
You're trying to use EntityFiling-specific features on a base Filing object:
|
||||
|
||||
```python
|
||||
# Problem: Base filing doesn't have entity attributes
|
||||
filings = get_filings(2024, 3)
|
||||
filing = filings[0]
|
||||
# filing.report_date # AttributeError!
|
||||
|
||||
# Solution: Get from company for EntityFiling
|
||||
company = Company(filing.cik)
|
||||
entity_filing = company.get_filings(
|
||||
accession_number=filing.accession_no
|
||||
)[0]
|
||||
# entity_filing.report_date # Works!
|
||||
```
|
||||
|
||||
### Related Filings Returns Empty
|
||||
|
||||
The file number might not link to other filings:
|
||||
|
||||
```python
|
||||
related = filing.related_filings()
|
||||
|
||||
if len(related) == 0:
|
||||
print("No related filings found")
|
||||
# This is normal for standalone filings
|
||||
else:
|
||||
print(f"Found {len(related)} related filing(s)")
|
||||
```
|
||||
|
||||
### Items Not Showing for 8-K
|
||||
|
||||
Check if it's a legacy filing:
|
||||
|
||||
```python
|
||||
filing = company.get_filings(form="8-K")[0]
|
||||
|
||||
if not filing.items or filing.items == "":
|
||||
# Check filing year
|
||||
filing_year = int(filing.filing_date[:4])
|
||||
|
||||
if filing_year < 2005:
|
||||
print("Legacy SGML filing - items may be missing from metadata")
|
||||
print("Parse filing text for accurate item identification")
|
||||
else:
|
||||
print("Modern filing with no items specified")
|
||||
```
|
||||
|
||||
This comprehensive guide covers the unique features and workflows available when working with EntityFiling objects in edgartools.
|
||||
@@ -0,0 +1,671 @@
|
||||
# EntityFilings Class Documentation
|
||||
|
||||
## Overview
|
||||
|
||||
The `EntityFilings` class extends the base `Filings` class with entity-specific functionality. When you access filings through a `Company` object, you get an `EntityFilings` collection that maintains entity context (CIK and company name) and returns `EntityFiling` instances with enriched metadata.
|
||||
|
||||
**Key Differences from Base Filings:**
|
||||
- Maintains entity context (CIK, company name)
|
||||
- Returns `EntityFiling` instances (not base `Filing`)
|
||||
- All filtering/selection methods preserve `EntityFilings` type
|
||||
- Additional metadata from SEC company submissions API
|
||||
- Direct access to entity-specific features
|
||||
|
||||
## Getting EntityFilings
|
||||
|
||||
### From Company
|
||||
|
||||
```python
|
||||
from edgar import Company
|
||||
|
||||
# Get company
|
||||
company = Company("AAPL")
|
||||
|
||||
# Get filings - returns EntityFilings collection
|
||||
filings = company.get_filings()
|
||||
|
||||
# filings is EntityFilings, not base Filings
|
||||
print(type(filings)) # <class 'edgar.entity.filings.EntityFilings'>
|
||||
|
||||
# Each filing in the collection is EntityFiling
|
||||
filing = filings[0]
|
||||
print(type(filing)) # <class 'edgar.entity.filings.EntityFiling'>
|
||||
```
|
||||
|
||||
### With Form Filters
|
||||
|
||||
```python
|
||||
# Get specific form types
|
||||
filings_10k = company.get_filings(form="10-K")
|
||||
filings_8k = company.get_filings(form="8-K")
|
||||
filings_multi = company.get_filings(form=["10-K", "10-Q"])
|
||||
```
|
||||
|
||||
## Common Actions
|
||||
|
||||
Quick reference for the most frequently used EntityFilings methods:
|
||||
|
||||
### Get Individual Filings
|
||||
```python
|
||||
# Get most recent filing
|
||||
latest = filings.latest()
|
||||
|
||||
# Get multiple recent filings
|
||||
latest_5 = filings.latest(5)
|
||||
|
||||
# Get filing by index
|
||||
filing = filings[0]
|
||||
filing = filings.get_filing_at(5)
|
||||
```
|
||||
|
||||
### Filter the Collection
|
||||
```python
|
||||
# Filter by form type
|
||||
annual_reports = filings.filter(form="10-K")
|
||||
|
||||
# Filter by date
|
||||
recent = filings.filter(filing_date="2024-01-01:")
|
||||
|
||||
# Exclude amendments
|
||||
originals_only = filings.filter(amendments=False)
|
||||
|
||||
# Combined filters
|
||||
filtered = filings.filter(
|
||||
form=["10-K", "10-Q"],
|
||||
filing_date="2023-01-01:2023-12-31",
|
||||
amendments=False
|
||||
)
|
||||
```
|
||||
|
||||
### Navigate Pages
|
||||
```python
|
||||
# For large collections (multiple pages)
|
||||
next_page = filings.next()
|
||||
prev_page = filings.previous()
|
||||
```
|
||||
|
||||
### Convert to DataFrame
|
||||
```python
|
||||
# Export to pandas
|
||||
df = filings.to_pandas()
|
||||
|
||||
# Select specific columns
|
||||
df = filings.to_pandas('form', 'filing_date', 'accession_number')
|
||||
```
|
||||
|
||||
### Select Subsets
|
||||
```python
|
||||
# Get first/last n filings
|
||||
first_10 = filings.head(10)
|
||||
last_10 = filings.tail(10)
|
||||
|
||||
# Random sample
|
||||
sample = filings.sample(20)
|
||||
```
|
||||
|
||||
## EntityFilings-Specific Features
|
||||
|
||||
### Entity Context
|
||||
|
||||
EntityFilings maintains the entity context throughout operations:
|
||||
|
||||
```python
|
||||
filings = company.get_filings()
|
||||
|
||||
# Access entity information
|
||||
print(filings.cik) # Company CIK
|
||||
print(filings.company_name) # Company name
|
||||
|
||||
# Context preserved through operations
|
||||
filtered = filings.filter(form="10-K")
|
||||
print(filtered.cik) # Same CIK
|
||||
print(filtered.company_name) # Same company name
|
||||
```
|
||||
|
||||
### Returns EntityFiling Instances
|
||||
|
||||
All methods that return individual filings return `EntityFiling` (not base `Filing`):
|
||||
|
||||
```python
|
||||
# Get latest returns EntityFiling
|
||||
filing = filings.latest()
|
||||
print(type(filing)) # EntityFiling
|
||||
|
||||
# Indexing returns EntityFiling
|
||||
filing = filings[0]
|
||||
print(type(filing)) # EntityFiling
|
||||
|
||||
# Access EntityFiling-specific attributes
|
||||
print(filing.report_date) # Period end date
|
||||
print(filing.items) # 8-K items
|
||||
print(filing.is_xbrl) # XBRL indicator
|
||||
```
|
||||
|
||||
### Type Preservation
|
||||
|
||||
All collection methods preserve the `EntityFilings` type:
|
||||
|
||||
```python
|
||||
# filter() returns EntityFilings
|
||||
filtered = filings.filter(form="10-K")
|
||||
print(type(filtered)) # EntityFilings
|
||||
|
||||
# head() returns EntityFilings
|
||||
first_10 = filings.head(10)
|
||||
print(type(first_10)) # EntityFilings
|
||||
|
||||
# latest(n) with n>1 returns EntityFilings
|
||||
latest_5 = filings.latest(5)
|
||||
print(type(latest_5)) # EntityFilings
|
||||
```
|
||||
|
||||
## Core Methods
|
||||
|
||||
### latest(n=1)
|
||||
|
||||
Get the most recent filing(s):
|
||||
|
||||
```python
|
||||
# Get single latest filing (returns EntityFiling)
|
||||
latest = filings.latest()
|
||||
print(f"Most recent: {latest.form} on {latest.filing_date}")
|
||||
|
||||
# Get multiple latest filings (returns EntityFilings)
|
||||
latest_5 = filings.latest(5)
|
||||
for filing in latest_5:
|
||||
print(f"{filing.form}: {filing.filing_date}")
|
||||
```
|
||||
|
||||
### filter()
|
||||
|
||||
Filter filings by various criteria:
|
||||
|
||||
```python
|
||||
# Filter by form type
|
||||
filings_10k = filings.filter(form="10-K")
|
||||
filings_8k = filings.filter(form="8-K")
|
||||
filings_annual = filings.filter(form=["10-K", "10-K/A"])
|
||||
|
||||
# Filter by date
|
||||
recent = filings.filter(filing_date="2024-01-01:")
|
||||
date_range = filings.filter(filing_date="2023-01-01:2023-12-31")
|
||||
specific_date = filings.filter(filing_date="2024-03-15")
|
||||
|
||||
# Exclude amendments
|
||||
no_amendments = filings.filter(amendments=False)
|
||||
|
||||
# Filter by accession number
|
||||
specific = filings.filter(accession_number="0000320193-24-000123")
|
||||
|
||||
# Combined filters
|
||||
filtered = filings.filter(
|
||||
form="10-Q",
|
||||
filing_date="2024-01-01:",
|
||||
amendments=False
|
||||
)
|
||||
```
|
||||
|
||||
**Note**: Unlike base `Filings.filter()`, `EntityFilings.filter()` doesn't support `cik` or `ticker` parameters since the collection is already scoped to a single entity.
|
||||
|
||||
### head(n) / tail(n)
|
||||
|
||||
Get first or last n filings:
|
||||
|
||||
```python
|
||||
# Get first 10 filings
|
||||
first_10 = filings.head(10)
|
||||
|
||||
# Get last 10 filings
|
||||
last_10 = filings.tail(10)
|
||||
|
||||
# Both return EntityFilings collections
|
||||
print(type(first_10)) # EntityFilings
|
||||
print(type(last_10)) # EntityFilings
|
||||
```
|
||||
|
||||
### sample(n)
|
||||
|
||||
Get random sample of filings:
|
||||
|
||||
```python
|
||||
# Get random sample of 20 filings
|
||||
sample = filings.sample(20)
|
||||
|
||||
# Returns EntityFilings collection
|
||||
print(type(sample)) # EntityFilings
|
||||
```
|
||||
|
||||
### Access by Index
|
||||
|
||||
```python
|
||||
# Direct indexing
|
||||
first_filing = filings[0]
|
||||
tenth_filing = filings[9]
|
||||
|
||||
# Explicit method
|
||||
filing = filings.get_filing_at(5)
|
||||
|
||||
# All return EntityFiling instances
|
||||
```
|
||||
|
||||
## Pagination
|
||||
|
||||
For large filing collections, EntityFilings supports pagination:
|
||||
|
||||
### next() / previous()
|
||||
|
||||
```python
|
||||
# Display shows page info if multiple pages
|
||||
print(filings)
|
||||
# Shows: "Showing 1 to 50 of 250 filings. Page using ← prev() and next() →"
|
||||
|
||||
# Navigate to next page
|
||||
next_page = filings.next()
|
||||
|
||||
# Navigate to previous page
|
||||
prev_page = filings.previous()
|
||||
|
||||
# Both return EntityFilings with new page of data
|
||||
```
|
||||
|
||||
### Page Navigation Example
|
||||
|
||||
```python
|
||||
# Start with first page
|
||||
current_page = company.get_filings()
|
||||
print(current_page)
|
||||
|
||||
# Move through pages
|
||||
page_2 = current_page.next()
|
||||
page_3 = page_2.next()
|
||||
|
||||
# Go back
|
||||
page_2_again = page_3.previous()
|
||||
|
||||
# At end of pages
|
||||
last_page = current_page
|
||||
while True:
|
||||
next_page = last_page.next()
|
||||
if next_page is None:
|
||||
break
|
||||
last_page = next_page
|
||||
```
|
||||
|
||||
## Data Conversion & Export
|
||||
|
||||
### to_pandas()
|
||||
|
||||
Convert to pandas DataFrame:
|
||||
|
||||
```python
|
||||
# All columns
|
||||
df = filings.to_pandas()
|
||||
|
||||
# Specific columns
|
||||
df = filings.to_pandas('form', 'filing_date', 'accession_number')
|
||||
|
||||
# Shows entity-specific columns:
|
||||
# form, filing_date, reportDate, acceptanceDateTime, fileNumber,
|
||||
# items, size, primaryDocument, isXBRL, isInlineXBRL, etc.
|
||||
```
|
||||
|
||||
### to_dict()
|
||||
|
||||
Convert to dictionary:
|
||||
|
||||
```python
|
||||
# Convert to dict
|
||||
data = filings.to_dict()
|
||||
|
||||
# Limit rows
|
||||
data = filings.to_dict(max_rows=100)
|
||||
```
|
||||
|
||||
### save() / save_parquet()
|
||||
|
||||
Save to Parquet file:
|
||||
|
||||
```python
|
||||
# Save as Parquet
|
||||
filings.save_parquet("company_filings.parquet")
|
||||
|
||||
# Alternative
|
||||
filings.save("company_filings.parquet")
|
||||
```
|
||||
|
||||
## Common Workflows
|
||||
|
||||
### Get Most Recent Annual Report
|
||||
|
||||
```python
|
||||
company = Company("AAPL")
|
||||
|
||||
# Get all 10-K filings
|
||||
filings_10k = company.get_filings(form="10-K")
|
||||
|
||||
# Get most recent
|
||||
latest_10k = filings_10k.latest()
|
||||
|
||||
print(f"Latest 10-K: {latest_10k.filing_date}")
|
||||
print(f"Period: {latest_10k.report_date}")
|
||||
|
||||
# Access XBRL if available
|
||||
if latest_10k.is_xbrl:
|
||||
xbrl = latest_10k.xbrl()
|
||||
```
|
||||
|
||||
### Analyze Quarterly Reports
|
||||
|
||||
```python
|
||||
# Get all 10-Q filings
|
||||
filings_10q = company.get_filings(form="10-Q")
|
||||
|
||||
# Get last 4 quarters
|
||||
last_4_quarters = filings_10q.latest(4)
|
||||
|
||||
# Analyze each quarter
|
||||
for filing in last_4_quarters:
|
||||
print(f"Quarter ending {filing.report_date}:")
|
||||
print(f" Filed: {filing.filing_date}")
|
||||
print(f" XBRL: {filing.is_xbrl}")
|
||||
```
|
||||
|
||||
### Find 8-K Earnings Announcements
|
||||
|
||||
```python
|
||||
# Get all 8-K filings
|
||||
filings_8k = company.get_filings(form="8-K")
|
||||
|
||||
# Filter for earnings-related items
|
||||
earnings_filings = []
|
||||
for filing in filings_8k:
|
||||
if filing.items and "2.02" in filing.items:
|
||||
earnings_filings.append(filing)
|
||||
|
||||
print(f"Found {len(earnings_filings)} earnings 8-Ks")
|
||||
|
||||
# Show recent earnings dates
|
||||
for filing in earnings_filings[:5]:
|
||||
print(f"{filing.filing_date}: Items {filing.items}")
|
||||
```
|
||||
|
||||
### Track Amendment Activity
|
||||
|
||||
```python
|
||||
# Get all 10-K filings including amendments
|
||||
all_10k = company.get_filings(form=["10-K", "10-K/A"])
|
||||
|
||||
# Separate originals from amendments
|
||||
originals = all_10k.filter(amendments=False)
|
||||
amendments = all_10k.filter(form="10-K/A")
|
||||
|
||||
print(f"Original 10-Ks: {len(originals)}")
|
||||
print(f"Amended 10-Ks: {len(amendments)}")
|
||||
|
||||
# Show amendment details
|
||||
for amendment in amendments:
|
||||
print(f"{amendment.filing_date}: {amendment.accession_no}")
|
||||
```
|
||||
|
||||
### Export Filings to DataFrame
|
||||
|
||||
```python
|
||||
# Get recent filings
|
||||
filings = company.get_filings(form=["10-K", "10-Q"])
|
||||
|
||||
# Filter to recent year
|
||||
recent = filings.filter(filing_date="2024-01-01:")
|
||||
|
||||
# Convert to DataFrame
|
||||
df = recent.to_pandas()
|
||||
|
||||
# Analyze
|
||||
print(f"Total filings: {len(df)}")
|
||||
print(f"Forms: {df['form'].value_counts()}")
|
||||
print(f"XBRL filings: {df['isXBRL'].sum()}")
|
||||
|
||||
# Export
|
||||
df.to_csv("aapl_recent_filings.csv", index=False)
|
||||
```
|
||||
|
||||
### Compare XBRL Adoption
|
||||
|
||||
```python
|
||||
# Get all annual reports
|
||||
filings_10k = company.get_filings(form="10-K")
|
||||
|
||||
# Convert to DataFrame
|
||||
df = filings_10k.to_pandas()
|
||||
|
||||
# Group by year
|
||||
import pandas as pd

df['year'] = pd.to_datetime(df['filing_date']).dt.year
|
||||
|
||||
# Check XBRL adoption by year
|
||||
xbrl_by_year = df.groupby('year').agg({
|
||||
'isXBRL': 'sum',
|
||||
'isInlineXBRL': 'sum',
|
||||
'form': 'count'
|
||||
}).rename(columns={'form': 'total'})
|
||||
|
||||
print(xbrl_by_year)
|
||||
```
|
||||
|
||||
## Display & Representation
|
||||
|
||||
### Rich Display
|
||||
|
||||
EntityFilings displays as a rich table with pagination info:
|
||||
|
||||
```python
|
||||
print(filings)
|
||||
```
|
||||
|
||||
Shows:
|
||||
- Table of filings with: #, Form, Description, Filing Date, Accession Number
|
||||
- Pagination info (if multiple pages): "Showing 1 to 50 of 250 filings"
|
||||
- Panel title: "Filings for [Company Name] [CIK]"
|
||||
- Panel subtitle: Date range of filings
|
||||
|
||||
### Properties
|
||||
|
||||
```python
|
||||
# Check if empty
|
||||
if filings.empty:
|
||||
print("No filings found")
|
||||
|
||||
# Get date range
|
||||
start, end = filings.date_range
|
||||
print(f"Filings from {start} to {end}")
|
||||
|
||||
# Get summary
|
||||
print(filings.summary)
|
||||
```
|
||||
|
||||
## Comparison: EntityFilings vs Base Filings
|
||||
|
||||
### When You Get Each Type
|
||||
|
||||
**EntityFilings** - From Company context:
|
||||
```python
|
||||
company = Company("AAPL")
|
||||
filings = company.get_filings()
|
||||
# Type: EntityFilings (with entity context)
|
||||
```
|
||||
|
||||
**Base Filings** - From general search:
|
||||
```python
|
||||
from edgar import get_filings
|
||||
filings = get_filings(2024, 1, form="10-K")
|
||||
# Type: Filings (base class)
|
||||
```
|
||||
|
||||
### Feature Comparison
|
||||
|
||||
| Feature | Base Filings | EntityFilings |
|
||||
|---------|-------------|---------------|
|
||||
| Filter by form | ✅ | ✅ |
|
||||
| Filter by date | ✅ | ✅ |
|
||||
| Filter by CIK/ticker | ✅ | ❌ (already scoped to entity) |
|
||||
| Returns EntityFiling | ❌ | ✅ |
|
||||
| Entity context (CIK, name) | ❌ | ✅ |
|
||||
| Type preserved in operations | Filings | EntityFilings |
|
||||
| From Company.get_filings() | ❌ | ✅ |
|
||||
|
||||
## Best Practices
|
||||
|
||||
### 1. Use EntityFilings for Company Analysis
|
||||
|
||||
When working with a specific company, always use `Company.get_filings()`:
|
||||
|
||||
```python
|
||||
# Good - get EntityFilings with context
|
||||
company = Company("AAPL")
|
||||
filings = company.get_filings(form="10-K")
|
||||
|
||||
# Less ideal - get base Filings, requires filtering
|
||||
from edgar import get_filings
|
||||
all_filings = get_filings(2024, 1, form="10-K")
|
||||
apple_filings = all_filings.filter(ticker="AAPL")
|
||||
```
|
||||
|
||||
### 2. Check Empty Collections
|
||||
|
||||
```python
|
||||
filings = company.get_filings(form="RARE-FORM")
|
||||
|
||||
if filings.empty:
|
||||
print("No filings found")
|
||||
else:
|
||||
latest = filings.latest()
|
||||
```
|
||||
|
||||
### 3. Use latest() for Single Most Recent
|
||||
|
||||
```python
|
||||
# Get single filing
|
||||
filing = filings.latest()
|
||||
|
||||
# Not this (gets collection of 1)
|
||||
filings_one = filings.head(1)
|
||||
filing = filings_one[0]
|
||||
```
|
||||
|
||||
### 4. Preserve Type Through Operations
|
||||
|
||||
```python
|
||||
# All these return EntityFilings
|
||||
filtered = filings.filter(form="10-K")
|
||||
recent = filtered.filter(filing_date="2024-01-01:")
|
||||
sample = recent.sample(10)
|
||||
|
||||
# All maintain entity context
|
||||
print(sample.cik) # Still accessible
|
||||
print(sample.company_name) # Still accessible
|
||||
```
|
||||
|
||||
## Error Handling
|
||||
|
||||
### Empty Collections
|
||||
|
||||
```python
|
||||
filings = company.get_filings(form="NONEXISTENT")
|
||||
|
||||
if filings.empty:
|
||||
print("No filings found")
|
||||
else:
|
||||
# Safe to access
|
||||
latest = filings.latest()
|
||||
```
|
||||
|
||||
### Pagination at Boundaries
|
||||
|
||||
```python
|
||||
# At end of pages
|
||||
last_page = filings
|
||||
while True:
|
||||
next_page = last_page.next()
|
||||
if next_page is None:
|
||||
print("Reached end of filings")
|
||||
break
|
||||
last_page = next_page
|
||||
```
|
||||
|
||||
### Invalid Index
|
||||
|
||||
```python
|
||||
# Check length first
|
||||
if len(filings) > 5:
|
||||
filing = filings[5]
|
||||
else:
|
||||
print("Collection has fewer than 6 filings")
|
||||
```
|
||||
|
||||
## Performance Considerations
|
||||
|
||||
### Efficient Filtering
|
||||
|
||||
Filter early to reduce data size:
|
||||
|
||||
```python
|
||||
# Good: filter first, then process
|
||||
recent_10k = company.get_filings(form="10-K", filing_date="2023-01-01:")
|
||||
for filing in recent_10k:
|
||||
process(filing)
|
||||
|
||||
# Less efficient: get all, then filter in Python
|
||||
all_filings = company.get_filings()
|
||||
for filing in all_filings:
|
||||
if filing.form == "10-K" and filing.filing_date >= "2023-01-01":
|
||||
process(filing)
|
||||
```
|
||||
|
||||
### Use Pagination
|
||||
|
||||
For very large collections, use pagination:
|
||||
|
||||
```python
|
||||
# Process page by page
|
||||
current_page = company.get_filings()
|
||||
while current_page:
|
||||
# Process current page
|
||||
for filing in current_page:
|
||||
process(filing)
|
||||
|
||||
# Move to next page
|
||||
current_page = current_page.next()
|
||||
```
|
||||
|
||||
### DataFrame Conversion
|
||||
|
||||
Only convert to pandas when needed:
|
||||
|
||||
```python
|
||||
# Good: operate on EntityFilings directly
|
||||
filings_10k = filings.filter(form="10-K")
|
||||
latest = filings_10k.latest()
|
||||
|
||||
# Less efficient: convert to DataFrame first
|
||||
df = filings.to_pandas()
|
||||
df_10k = df[df['form'] == '10-K']
|
||||
# Now you've lost EntityFiling functionality
|
||||
```
|
||||
|
||||
## Integration with Company
|
||||
|
||||
EntityFilings is the primary interface between Company and Filing objects:
|
||||
|
||||
```python
|
||||
company = Company("AAPL")
|
||||
|
||||
# Company.get_filings() returns EntityFilings
|
||||
filings = company.get_filings()
|
||||
|
||||
# EntityFilings contains EntityFiling instances
|
||||
filing = filings[0]
|
||||
|
||||
# EntityFiling knows its entity
|
||||
entity = filing.get_entity()
|
||||
# entity is the same Company object
|
||||
```
|
||||
|
||||
This creates a seamless workflow for entity-focused analysis while maintaining proper type separation and functionality at each level.
|
||||
2403
venv/lib/python3.10/site-packages/edgar/entity/enhanced_statement.py
Normal file
2403
venv/lib/python3.10/site-packages/edgar/entity/enhanced_statement.py
Normal file
File diff suppressed because it is too large
Load Diff
1732
venv/lib/python3.10/site-packages/edgar/entity/entity_facts.py
Normal file
1732
venv/lib/python3.10/site-packages/edgar/entity/entity_facts.py
Normal file
File diff suppressed because it is too large
Load Diff
454
venv/lib/python3.10/site-packages/edgar/entity/filings.py
Normal file
454
venv/lib/python3.10/site-packages/edgar/entity/filings.py
Normal file
@@ -0,0 +1,454 @@
|
||||
"""
|
||||
Filings-related classes for the Entity package.
|
||||
|
||||
This module contains classes related to SEC filings for entities, including
|
||||
collections of filings and filing facts.
|
||||
"""
|
||||
from typing import List, Union
|
||||
|
||||
import pandas as pd
|
||||
import pyarrow as pa
|
||||
import pyarrow.compute as pc
|
||||
from rich.box import SIMPLE
|
||||
from rich.console import Group
|
||||
from rich.panel import Panel
|
||||
from rich.table import Table
|
||||
from rich.text import Text
|
||||
|
||||
from edgar._filings import Filing, Filings, PagingState
|
||||
from edgar.core import IntString, log
|
||||
from edgar.formatting import accession_number_text, display_size
|
||||
from edgar.reference.forms import describe_form
|
||||
from edgar.richtools import Docs, df_to_rich_table, repr_rich
|
||||
|
||||
__all__ = [
|
||||
'EntityFiling',
|
||||
'EntityFilings',
|
||||
'EntityFacts',
|
||||
'empty_company_filings'
|
||||
]
|
||||
|
||||
|
||||
class EntityFiling(Filing):
    """
    A single SEC filing enriched with entity-level metadata.

    Extends the base ``Filing`` with the extra fields provided by the SEC
    company submissions API: report date, acceptance time, file number,
    items, document size, primary document info and XBRL indicators.

    Notes on ``items``:
        The value comes from SEC filing metadata, not from parsing the
        filing document itself. For 8-K filings it lists the included
        items (e.g. "2.02,9.01"). Legacy SGML filings (1999-2001) may
        carry incorrect or incomplete metadata; modern XML filings
        (2005+) are accurate. For accurate item extraction from legacy
        SGML 8-Ks, parse the filing text directly with regex patterns
        (see GitHub Issue #462 for example code).
    """

    def __init__(self,
                 cik: int,
                 company: str,
                 form: str,
                 filing_date: str,
                 report_date: str,
                 acceptance_datetime: str,
                 accession_no: str,
                 file_number: str,
                 items: str,
                 size: int,
                 primary_document: str,
                 primary_doc_description: str,
                 is_xbrl: bool,
                 is_inline_xbrl: bool):
        # Core identity fields are handled by the base Filing class.
        super().__init__(cik=cik,
                         company=company,
                         form=form,
                         filing_date=filing_date,
                         accession_no=accession_no)
        # Entity-specific metadata from the SEC company submissions API.
        self.report_date = report_date
        self.acceptance_datetime = acceptance_datetime
        self.file_number: str = file_number
        self.items: str = items  # see class docstring for data-source caveats
        self.size: int = size
        self.primary_document: str = primary_document
        self.primary_doc_description: str = primary_doc_description
        self.is_xbrl: bool = is_xbrl
        self.is_inline_xbrl: bool = is_inline_xbrl

    def related_filings(self):
        """Get all the filings related to this one by file number."""
        entity = self.get_entity()
        return entity.get_filings(file_number=self.file_number,
                                  sort_by="filing_date")

    def __str__(self):
        return (
            f"Filing(company='{self.company}', cik={self.cik}, "
            f"form='{self.form}', filing_date='{self.filing_date}', "
            f"accession_no='{self.accession_no}')"
        )
|
||||
|
||||
|
||||
class EntityFilings(Filings):
    """
    Collection of SEC filings for an entity.

    This extends the base Filings class with additional methods and properties
    specific to entity filings. Every selection method (filter/head/tail/
    sample/latest/next/previous) returns an EntityFilings so the entity
    context (cik, company_name) is preserved, and item access returns
    EntityFiling instances rather than base Filing.
    """

    def __init__(self,
                 data: pa.Table,
                 cik: int,
                 company_name: str,
                 original_state: PagingState = None):
        # data: pyarrow table of filing rows (columns per COMPANY_FILINGS_SCHEMA)
        # original_state: paging position carried across next()/previous()
        super().__init__(data, original_state=original_state)
        self.cik = cik
        self.company_name = company_name

    @property
    def docs(self):
        # Usage guide object (surfaced in the rich display subtitle).
        return Docs(self)

    def __getitem__(self, item):
        # Index access returns an EntityFiling (not a base Filing).
        return self.get_filing_at(item)

    @property
    def empty(self):
        # True when the underlying pyarrow table has no rows.
        return len(self.data) == 0

    def get_filing_at(self, item: int):
        """Get the filing at the specified index."""
        # Materialize a single row of the pyarrow table into an EntityFiling,
        # converting each arrow scalar to a Python value with as_py().
        return EntityFiling(
            cik=self.cik,
            company=self.company_name,
            form=self.data['form'][item].as_py(),
            filing_date=self.data['filing_date'][item].as_py(),
            report_date=self.data['reportDate'][item].as_py(),
            acceptance_datetime=self.data['acceptanceDateTime'][item].as_py(),
            accession_no=self.data['accession_number'][item].as_py(),
            file_number=self.data['fileNumber'][item].as_py(),
            items=self.data['items'][item].as_py(),
            size=self.data['size'][item].as_py(),
            primary_document=self.data['primaryDocument'][item].as_py(),
            primary_doc_description=self.data['primaryDocDescription'][item].as_py(),
            is_xbrl=self.data['isXBRL'][item].as_py(),
            is_inline_xbrl=self.data['isInlineXBRL'][item].as_py()
        )

    def filter(self,
               form: Union[str, List[str]] = None,
               amendments: bool = None,
               filing_date: str = None,
               date: str = None,
               cik: Union[int, str, List[Union[int, str]]] = None,
               ticker: Union[str, List[str]] = None,
               accession_number: Union[str, List[str]] = None):
        """
        Filter the filings based on various criteria.

        Args:
            form: Filter by form type
            amendments: Include amendments
            filing_date: Filter by filing date (single date or "start:end" range)
            date: Alias for filing_date
            cik: Filter by CIK (accepted for signature compatibility with the
                base class; the collection is already scoped to one entity)
            ticker: Filter by ticker (same note as cik)
            accession_number: Filter by accession number

        Returns:
            Filtered EntityFilings
        """
        # The super filter returns Filings. We want EntityFilings
        res = super().filter(form=form,
                             amendments=amendments,
                             filing_date=filing_date,
                             date=date,
                             cik=cik,
                             ticker=ticker,
                             accession_number=accession_number)
        # Re-wrap the filtered table so the entity context is preserved.
        return EntityFilings(data=res.data, cik=self.cik, company_name=self.company_name)

    def latest(self, n: int = 1):
        """
        Get the latest n filings.

        Args:
            n: Number of filings to return

        Returns:
            None if the collection is empty; a single EntityFiling when
            exactly one filing is selected (including when n == 1);
            otherwise an EntityFilings of the n most recent filings.
        """
        # Sort newest-first by filing_date, then take the top min(n, len) rows.
        sort_indices = pc.sort_indices(self.data, sort_keys=[("filing_date", "descending")])
        sort_indices_top = sort_indices[:min(n, len(sort_indices))]
        # NOTE(review): despite the name, this is the pyarrow table of the
        # selected rows, not an index.
        latest_filing_index = pc.take(data=self.data, indices=sort_indices_top)
        filings = EntityFilings(latest_filing_index,
                                cik=self.cik,
                                company_name=self.company_name)
        if filings.empty:
            return None
        if len(filings) == 1:
            return filings[0]
        else:
            return filings

    def head(self, n: int):
        """
        Get the first n filings.

        Args:
            n: Number of filings to return

        Returns:
            EntityFilings containing the first n filings
        """
        selection = self._head(n)
        return EntityFilings(data=selection, cik=self.cik, company_name=self.company_name)

    def tail(self, n: int):
        """
        Get the last n filings.

        Args:
            n: Number of filings to return

        Returns:
            EntityFilings containing the last n filings
        """
        selection = self._tail(n)
        return EntityFilings(data=selection, cik=self.cik, company_name=self.company_name)

    def sample(self, n: int):
        """
        Get a random sample of n filings.

        Args:
            n: Number of filings to sample

        Returns:
            EntityFilings containing n random filings
        """
        selection = self._sample(n)
        return EntityFilings(data=selection, cik=self.cik, company_name=self.company_name)

    @staticmethod
    def summarize(data) -> pd.DataFrame:
        """
        Summarize filing data as a pandas DataFrame.

        Args:
            data: Filing data to summarize (a DataFrame with at least the
                'size', 'isXBRL', 'form', 'filing_date' and
                'accession_number' columns)

        Returns:
            DataFrame with human-readable size, a check-mark xbrl column,
            and columns renamed for display
        """
        return (data
                # Map both string '1' and int 1 to a check mark, since the
                # isXBRL column may arrive either way; everything else blank.
                .assign(size=lambda df: df['size'].apply(display_size),
                        isXBRL=lambda df: df.isXBRL.map({'1': "\u2713", 1: "\u2713"}).fillna(""),
                        )
                .filter(["form", "filing_date", "accession_number", "isXBRL"])
                .rename(columns={"filing_date": "filed", "isXBRL": "xbrl"})
                )

    def next(self):
        """
        Show the next page of filings.

        Returns:
            EntityFilings with the next page of data, or None if at the end
        """
        data_page = self.data_pager.next()
        if data_page is None:
            log.warning("End of data .. use prev() \u2190 ")
            return None
        # Carry the absolute start index so row numbering in the rich
        # display stays continuous across pages.
        start_index, _ = self.data_pager._current_range
        filings_state = PagingState(page_start=start_index, num_records=len(self))
        return EntityFilings(data_page,
                             cik=self.cik,
                             company_name=self.company_name,
                             original_state=filings_state)

    def previous(self):
        """
        Show the previous page of filings.

        Returns:
            EntityFilings with the previous page of data, or None if at the beginning
        """
        data_page = self.data_pager.previous()
        if data_page is None:
            log.warning(" No previous data .. use next() \u2192 ")
            return None
        start_index, _ = self.data_pager._current_range
        filings_state = PagingState(page_start=start_index, num_records=len(self))
        return EntityFilings(data_page,
                             cik=self.cik,
                             company_name=self.company_name,
                             original_state=filings_state)

    def __repr__(self):
        # Render the rich panel to plain text for repr().
        return repr_rich(self.__rich__())

    def __rich__(self):
        # Create table with appropriate columns and styling
        table = Table(
            show_header=True,
            header_style="bold",
            show_edge=True,
            expand=False,
            padding=(0, 1),
            box=SIMPLE,
            row_styles=["", "bold"]
        )

        # Add columns with specific styling and alignment
        table.add_column("#", style="dim", justify="right")
        table.add_column("Form", width=10, style="bold yellow")
        # NOTE(review): the trailing comma below makes this statement a
        # 1-tuple expression — harmless, but looks unintended.
        table.add_column("Description", width=60, style="bold blue"),
        table.add_column("Filing Date", width=11)
        table.add_column("Accession Number", width=20)

        # Get current page from data pager
        current_page = self.data_pager.current()

        # Calculate start index for proper indexing
        # (prefer the carried-over paging state, if any, so numbering is
        # continuous after next()/previous()).
        start_idx = self._original_state.page_start if self._original_state else self.data_pager.start_index

        # Iterate through rows in current page
        for i in range(len(current_page)):
            form = current_page['form'][i].as_py()
            description = describe_form(current_page['form'][i].as_py(), prepend_form=False)

            row = [
                str(start_idx + i),
                form,
                description,
                str(current_page['filing_date'][i].as_py()),
                accession_number_text(current_page['accession_number'][i].as_py())
            ]
            table.add_row(*row)

        # Show paging information only if there are multiple pages
        elements = [table]

        if self.data_pager.total_pages > 1:
            # NOTE(review): assumes _original_state is set whenever there is
            # more than one page — TODO confirm for freshly-built collections.
            total_filings = self._original_state.num_records
            current_count = len(current_page)
            start_num = start_idx + 1
            end_num = start_idx + current_count

            page_info = Text.assemble(
                ("Showing ", "dim"),
                (f"{start_num:,}", "bold red"),
                (" to ", "dim"),
                (f"{end_num:,}", "bold red"),
                (" of ", "dim"),
                (f"{total_filings:,}", "bold"),
                (" filings.", "dim"),
                (" Page using ", "dim"),
                ("← prev()", "bold gray54"),
                (" and ", "dim"),
                ("next() →", "bold gray54")
            )

            elements.extend([Text("\n"), page_info])

        # Get the title
        title = Text.assemble(
            ("Filings for ", "bold"),
            (f"{self.company_name}", "bold green"),
            (" [", "dim"),
            (f"{self.cik}", "bold yellow"),
            ("]", "dim")
        )

        # Get the subtitle
        start_date, end_date = self.date_range
        date_range_text = f"Company filings between {start_date:%Y-%m-%d} and {end_date:%Y-%m-%d}" if start_date else "Company filings"
        subtitle = Text.assemble(
            (date_range_text, "dim"),
            " • ",
            ("filings.docs", "cyan dim"),
            (" for usage guide", "dim")
        )
        return Panel(
            Group(*elements),
            title=title,
            subtitle=subtitle,
            border_style="bold grey54",
            expand=False
        )
|
||||
|
||||
|
||||
class EntityFacts:
    """
    Contains structured facts data about an entity from XBRL filings.
    """

    def __init__(self,
                 cik: int,
                 name: str,
                 facts: pa.Table,
                 fact_meta: pd.DataFrame):
        # Identity of the entity plus the raw fact table and its metadata.
        self.cik: int = cik
        self.name: str = name
        self.facts: pa.Table = facts
        self.fact_meta: pd.DataFrame = fact_meta

    def to_pandas(self) -> pd.DataFrame:
        """Convert facts to a pandas DataFrame."""
        frame = self.facts.to_pandas()
        return frame

    def num_facts(self) -> int:
        """Get the number of facts."""
        return len(self.fact_meta)

    def __len__(self):
        # Length is the number of rows in the underlying fact table.
        return len(self.facts)

    def __rich__(self):
        # Render the fact table inside a titled panel for console display.
        body = Group(df_to_rich_table(self.facts))
        heading = f"Company Facts({self.name} [{self.cik}] {len(self.facts):,} total facts)"
        return Panel(body, title=heading)

    def __repr__(self):
        return repr_rich(self.__rich__())
|
||||
|
||||
# PyArrow schema for a company's filing index. Flag/size columns are kept as
# strings, mirroring the raw SEC submissions feed.
# NOTE(review): this statement also binds the module-level name `schema` —
# looks accidental; confirm nothing imports `schema` before removing the alias.
COMPANY_FILINGS_SCHEMA = schema = pa.schema([
    ('accession_number', pa.string()),
    ('filing_date', pa.date32()),
    ('reportDate', pa.string()),
    ('acceptanceDateTime', pa.timestamp('us')),  # Changed to timestamp
    ('act', pa.string()),
    ('form', pa.string()),
    ('fileNumber', pa.string()),
    ('items', pa.string()),
    ('size', pa.string()),
    ('isXBRL', pa.string()),
    ('isInlineXBRL', pa.string()),
    ('primaryDocument', pa.string()),
    ('primaryDocDescription', pa.string())
])
|
||||
|
||||
def empty_company_filings(cik: IntString, company_name: str):
    """
    Create an empty filings container.

    Args:
        cik: The CIK number
        company_name: The company name

    Returns:
        EntityFilings: An empty filings container
    """
    # Derive the column count from the schema itself so the two can never
    # drift apart (the previous hard-coded 13 would silently misalign if a
    # field were ever added to or removed from COMPANY_FILINGS_SCHEMA).
    table = pa.Table.from_arrays([[] for _ in range(len(COMPANY_FILINGS_SCHEMA))],
                                 schema=COMPANY_FILINGS_SCHEMA)
    return EntityFilings(table, cik=cik, company_name=company_name)
|
||||
|
||||
|
||||
# For backward compatibility
# Older code imported these classes under the Company* names; keep the
# aliases so that code keeps working against the Entity* implementations.
CompanyFiling = EntityFiling
CompanyFilings = EntityFilings
CompanyFacts = EntityFacts
|
||||
|
||||
@@ -0,0 +1,137 @@
|
||||
"""
|
||||
Loader for learned statement mappings and canonical structures.
|
||||
|
||||
This module handles loading and caching of learned mappings from the
|
||||
structural learning process.
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
from functools import lru_cache
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, Optional
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@lru_cache(maxsize=1)
def load_learned_mappings() -> Dict[str, Dict[str, Any]]:
    """
    Load learned statement mappings from package data.

    The result is cached for the lifetime of the process (lru_cache) and any
    failure degrades to an empty mapping rather than raising, so callers can
    always treat the return value as "best available data".

    Returns:
        Dictionary of concept -> mapping info
    """
    try:
        # Get the data file path
        data_dir = Path(__file__).parent / 'data'
        mappings_file = data_dir / 'statement_mappings_v1.json'

        if not mappings_file.exists():
            log.warning("Learned mappings file not found: %s", mappings_file)
            return {}

        # JSON is defined as UTF-8; don't rely on the platform default encoding.
        with open(mappings_file, 'r', encoding='utf-8') as f:
            data = json.load(f)

        mappings = data.get('mappings', {})
        metadata = data.get('metadata', {})

        log.info("Loaded %d learned concept mappings (version: %s)", len(mappings), metadata.get('version', 'unknown'))

        return mappings

    except Exception as e:
        # Best-effort loader: log and fall back to an empty mapping.
        log.error("Error loading learned mappings: %s", e)
        return {}
|
||||
|
||||
|
||||
@lru_cache(maxsize=1)
def load_canonical_structures() -> Dict[str, Any]:
    """
    Load canonical statement structures.

    Cached for the process lifetime; any failure degrades to an empty dict
    rather than raising.

    Returns:
        Dictionary of statement_type -> canonical structure
    """
    try:
        data_dir = Path(__file__).parent / 'data'
        structures_file = data_dir / 'learned_mappings.json'

        if not structures_file.exists():
            log.warning("Canonical structures file not found: %s", structures_file)
            return {}

        # JSON is defined as UTF-8; don't rely on the platform default encoding.
        with open(structures_file, 'r', encoding='utf-8') as f:
            structures = json.load(f)

        log.info("Loaded canonical structures for %d statement types", len(structures))
        return structures

    except Exception as e:
        # Best-effort loader: log and fall back to an empty dict.
        log.error("Error loading canonical structures: %s", e)
        return {}
|
||||
|
||||
|
||||
@lru_cache(maxsize=1)
def load_virtual_trees() -> Dict[str, Any]:
    """
    Load virtual presentation trees.

    Cached for the process lifetime; any failure degrades to an empty dict
    rather than raising.

    Returns:
        Dictionary of statement_type -> virtual tree
    """
    try:
        data_dir = Path(__file__).parent / 'data'
        trees_file = data_dir / 'virtual_trees.json'

        if not trees_file.exists():
            log.warning("Virtual trees file not found: %s", trees_file)
            return {}

        # JSON is defined as UTF-8; don't rely on the platform default encoding.
        with open(trees_file, 'r', encoding='utf-8') as f:
            trees = json.load(f)

        log.info("Loaded virtual trees for %d statement types", len(trees))
        return trees

    except Exception as e:
        # Best-effort loader: log and fall back to an empty dict.
        log.error("Error loading virtual trees: %s", e)
        return {}
|
||||
|
||||
|
||||
def get_concept_mapping(concept: str) -> Optional[Dict[str, Any]]:
    """
    Get mapping information for a specific concept.

    Args:
        concept: Concept name (without namespace)

    Returns:
        Mapping info dict or None if not found
    """
    # Delegate to the cached loader; unknown concepts yield None.
    return load_learned_mappings().get(concept)
|
||||
|
||||
|
||||
def get_statement_concepts(statement_type: str,
                           min_confidence: float = 0.5) -> Dict[str, Dict[str, Any]]:
    """
    Get all concepts for a specific statement type.

    Args:
        statement_type: Type of statement (BalanceSheet, IncomeStatement, etc.)
        min_confidence: Minimum confidence threshold

    Returns:
        Dictionary of concept -> mapping info
    """
    # Keep only concepts assigned to the requested statement whose
    # confidence clears the threshold.
    return {
        name: info
        for name, info in load_learned_mappings().items()
        if info.get('statement_type') == statement_type
        and info.get('confidence', 0) >= min_confidence
    }
|
||||
262
venv/lib/python3.10/site-packages/edgar/entity/models.py
Normal file
262
venv/lib/python3.10/site-packages/edgar/entity/models.py
Normal file
@@ -0,0 +1,262 @@
|
||||
"""
|
||||
Data models for the enhanced Entity Facts API.
|
||||
|
||||
This module provides the unified data models for financial facts,
|
||||
optimized for both traditional analysis and AI consumption.
|
||||
"""
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import date
|
||||
from enum import Enum
|
||||
from typing import Any, Dict, List, Literal, Optional, Union
|
||||
|
||||
|
||||
class DataQuality(Enum):
    """Data quality indicators for facts"""

    # Taken directly from XBRL and validated.
    HIGH = "high"
    # Derived or calculated from other reported values.
    MEDIUM = "medium"
    # Estimated or inferred.
    LOW = "low"
|
||||
|
||||
|
||||
@dataclass
class FinancialFact:
    """
    Unified fact representation optimized for both traditional analysis and AI consumption.

    This class represents a single financial fact with rich contextual information,
    quality indicators, and AI-ready metadata.
    """

    # Core identification
    concept: str  # Standardized concept (e.g., 'us-gaap:Revenue')
    taxonomy: str  # Taxonomy namespace (us-gaap, ifrs, etc.)
    label: str  # Human-readable label

    # Values with proper typing
    value: Union[float, int, str]  # The actual value
    numeric_value: Optional[float]  # Numeric representation for calculations
    unit: str  # Unit of measure (USD, shares, etc.)
    scale: Optional[int] = None  # Scale factor (thousands=1000, millions=1000000)

    # Temporal context
    period_start: Optional[date] = None
    period_end: Optional[date] = None  # Optional: fixed annotation, default was already None
    period_type: Literal['instant', 'duration'] = 'instant'
    fiscal_year: int = 0
    fiscal_period: str = ''  # FY, Q1, Q2, Q3, Q4

    # Filing context
    filing_date: Optional[date] = None  # Optional: fixed annotation, default was already None
    form_type: str = ''  # 10-K, 10-Q, 8-K, etc.
    accession: str = ''  # SEC accession number

    # Quality and provenance
    data_quality: DataQuality = DataQuality.MEDIUM
    is_audited: bool = False
    is_restated: bool = False
    is_estimated: bool = False
    confidence_score: float = 0.8  # 0.0 to 1.0

    # AI-ready context
    semantic_tags: List[str] = field(default_factory=list)  # ['revenue', 'recurring', 'operating']
    business_context: str = ''  # "Product revenue from iPhone sales"
    calculation_context: Optional[str] = None  # "Derived from segment data"

    # Optional XBRL specifics
    context_ref: Optional[str] = None
    dimensions: Dict[str, str] = field(default_factory=dict)
    statement_type: Optional[str] = None
    line_item_sequence: Optional[int] = None

    # Structural metadata (from learned mappings)
    depth: Optional[int] = None  # Hierarchy depth in statement
    parent_concept: Optional[str] = None  # Parent concept in hierarchy
    section: Optional[str] = None  # Statement section (e.g., "Current Assets")
    is_abstract: bool = False  # Abstract/header item
    is_total: bool = False  # Total/sum item
    presentation_order: Optional[float] = None  # Order in presentation

    def to_llm_context(self) -> Dict[str, Any]:
        """
        Generate rich context for LLM consumption.

        Returns a dictionary with formatted values and contextual information
        optimized for language model understanding.
        """
        # Format the value appropriately
        if self.numeric_value is not None:
            if self.unit.upper() in ['USD', 'EUR', 'GBP', 'JPY']:
                # Currency formatting: whole units plus a spelled-out scale suffix
                formatted_value = f"{self.numeric_value:,.0f}"
                if self.scale:
                    if self.scale == 1000:
                        formatted_value += " thousand"
                    elif self.scale == 1000000:
                        formatted_value += " million"
                    elif self.scale == 1000000000:
                        formatted_value += " billion"
            else:
                formatted_value = f"{self.numeric_value:,.2f}"
        else:
            formatted_value = str(self.value)

        # Format the period: instants are point-in-time, durations name the fiscal span
        if self.period_type == 'instant':
            period_desc = f"as of {self.period_end}"
        else:
            period_desc = f"for {self.fiscal_period} {self.fiscal_year}"
            if self.period_start and self.period_end:
                period_desc += f" ({self.period_start} to {self.period_end})"

        return {
            "concept": self.label,
            "value": formatted_value,
            "unit": self.unit,
            "period": period_desc,
            "context": self.business_context,
            "quality": self.data_quality.value,
            "confidence": self.confidence_score,
            "tags": self.semantic_tags,
            "source": f"{self.form_type} filed {self.filing_date}" if self.filing_date else "Unknown source",
            "is_audited": self.is_audited,
            "is_estimated": self.is_estimated,
            "dimensions": self.dimensions if self.dimensions else None
        }

    def get_display_period_key(self) -> str:
        """
        Generate a display-friendly period key based on actual period dates.

        This method creates period keys like "Q1 2024" based on the actual period
        covered by the data, not the filing year. It uses the period_end date to
        determine the calendar year and quarter.

        Returns:
            A period key in format like "Q1 2024", "FY 2023", etc.
        """
        if not self.period_end:
            # Fallback to fiscal year/period if no period_end
            return f"{self.fiscal_period} {self.fiscal_year}"

        # Extract calendar year from period_end
        calendar_year = self.period_end.year

        # For fiscal years, use "FY" prefix
        if self.fiscal_period == 'FY':
            return f"FY {calendar_year}"

        # For quarters, determine the calendar quarter from the end date
        if self.fiscal_period in ['Q1', 'Q2', 'Q3', 'Q4']:
            end_month = self.period_end.month

            # Map end month to calendar quarter
            if end_month in [1, 2, 3]:
                quarter = 'Q1'
            elif end_month in [4, 5, 6]:
                quarter = 'Q2'
            elif end_month in [7, 8, 9]:
                quarter = 'Q3'
            else:  # 10, 11, 12
                quarter = 'Q4'

            return f"{quarter} {calendar_year}"

        # For other periods, use the fiscal period with calendar year
        return f"{self.fiscal_period} {calendar_year}"

    def get_formatted_value(self) -> str:
        """
        Format the numeric value for display, avoiding scientific notation.

        Returns:
            Formatted string representation of the value
        """
        if self.numeric_value is None:
            return str(self.value)

        # For currency values
        if self.unit.upper() in ['USD', 'EUR', 'GBP', 'JPY', 'CAD', 'CHF']:
            # Round to nearest whole number for large values
            if abs(self.numeric_value) >= 1000:
                return f"{self.numeric_value:,.0f}"
            else:
                return f"{self.numeric_value:,.2f}"

        # For share counts
        elif self.unit.lower() in ['shares', 'share']:
            return f"{self.numeric_value:,.0f}"

        # For percentages and ratios
        elif self.unit.lower() in ['pure', 'percent', '%']:
            return f"{self.numeric_value:.2f}"

        # Default formatting
        else:
            if abs(self.numeric_value) >= 1000:
                return f"{self.numeric_value:,.0f}"
            else:
                return f"{self.numeric_value:,.2f}"

    def __repr__(self) -> str:
        """String representation focusing on key information"""
        # `is not None` (not truthiness) so a legitimate value of 0 is still
        # rendered through the numeric formatter.
        value_str = f"{self.numeric_value:,.0f}" if self.numeric_value is not None else str(self.value)
        return f"FinancialFact({self.concept}={value_str} {self.unit}, {self.fiscal_period} {self.fiscal_year})"
|
||||
|
||||
|
||||
@dataclass
class ConceptMetadata:
    """
    Metadata about a financial concept.

    This provides additional context about what a concept represents,
    how it's calculated, and how it relates to other concepts.
    """
    concept: str  # The concept identifier
    label: str  # Primary display label
    definition: str  # Detailed definition

    # Concept relationships
    parent_concepts: List[str] = field(default_factory=list)  # Broader concepts this rolls up into
    child_concepts: List[str] = field(default_factory=list)  # Narrower concepts beneath this one
    calculation_components: List[str] = field(default_factory=list)  # Concepts combined to produce this value

    # Classification
    statement_type: Optional[str] = None  # BalanceSheet, IncomeStatement, etc.
    is_monetary: bool = True  # Whether the value is expressed in a currency unit
    is_duration: bool = True  # True for flow concepts, False for stock concepts
    normal_balance: Optional[Literal['debit', 'credit']] = None  # Accounting normal-balance side

    # Usage guidance
    common_names: List[str] = field(default_factory=list)  # Alternative labels
    usage_notes: str = ''  # Special considerations
    typical_scale: Optional[int] = None  # Common scale factor
|
||||
|
||||
|
||||
@dataclass
class FactCollection:
    """
    A collection of related facts, typically for a specific time period or statement.

    This is used internally to group facts for efficient processing and analysis.
    """
    facts: List[FinancialFact]
    period_key: str  # e.g., "2024-Q4", "2024-FY"
    statement_type: Optional[str] = None

    def get_fact(self, concept: str) -> Optional[FinancialFact]:
        """Get a specific fact by concept"""
        # A fact matches on either its concept id or its display label;
        # the first match wins, None when nothing matches.
        return next(
            (candidate for candidate in self.facts
             if concept in (candidate.concept, candidate.label)),
            None,
        )

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary keyed by concept"""
        summary: Dict[str, Any] = {}
        for item in self.facts:
            # Prefer the numeric representation when one exists.
            summary[item.concept] = {
                'value': item.numeric_value or item.value,
                'label': item.label,
                'unit': item.unit,
            }
        return summary
|
||||
382
venv/lib/python3.10/site-packages/edgar/entity/parser.py
Normal file
382
venv/lib/python3.10/site-packages/edgar/entity/parser.py
Normal file
@@ -0,0 +1,382 @@
|
||||
"""
|
||||
Parser for converting SEC API data to the new Entity Facts format.
|
||||
|
||||
This module handles the conversion of raw SEC company facts JSON data
|
||||
into the new unified FinancialFact model.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from datetime import date, datetime
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from edgar.entity.entity_facts import EntityFacts
|
||||
from edgar.entity.mappings_loader import load_learned_mappings
|
||||
from edgar.entity.models import DataQuality, FinancialFact
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class EntityFactsParser:
|
||||
"""
|
||||
Parser for converting SEC company facts to EntityFacts.
|
||||
|
||||
This class handles the transformation of raw SEC API data into
|
||||
the new unified fact model with proper typing and AI-ready metadata.
|
||||
"""
|
||||
|
||||
# Concept mapping for common financial statement items
|
||||
STATEMENT_MAPPING = {
|
||||
# Income Statement
|
||||
'Revenue': 'IncomeStatement',
|
||||
'Revenues': 'IncomeStatement', # Fix for Issue #438 - ensure us-gaap:Revenues maps properly
|
||||
'RevenueFromContractWithCustomerExcludingAssessedTax': 'IncomeStatement',
|
||||
'SalesRevenueNet': 'IncomeStatement',
|
||||
'CostOfRevenue': 'IncomeStatement',
|
||||
'GrossProfit': 'IncomeStatement',
|
||||
'OperatingExpenses': 'IncomeStatement',
|
||||
'OperatingIncomeLoss': 'IncomeStatement',
|
||||
'NetIncomeLoss': 'IncomeStatement',
|
||||
'EarningsPerShareDiluted': 'IncomeStatement',
|
||||
|
||||
# Balance Sheet
|
||||
'Assets': 'BalanceSheet',
|
||||
'AssetsCurrent': 'BalanceSheet',
|
||||
'CurrentAssets': 'BalanceSheet',
|
||||
'AssetsNoncurrent': 'BalanceSheet',
|
||||
'Liabilities': 'BalanceSheet',
|
||||
'LiabilitiesCurrent': 'BalanceSheet',
|
||||
'CurrentLiabilities': 'BalanceSheet',
|
||||
'LiabilitiesNoncurrent': 'BalanceSheet',
|
||||
'StockholdersEquity': 'BalanceSheet',
|
||||
'CashAndCashEquivalentsAtCarryingValue': 'BalanceSheet',
|
||||
|
||||
# Cash Flow
|
||||
'NetCashProvidedByUsedInOperatingActivities': 'CashFlow',
|
||||
'NetCashProvidedByUsedInInvestingActivities': 'CashFlow',
|
||||
'NetCashProvidedByUsedInFinancingActivities': 'CashFlow',
|
||||
'CashAndCashEquivalentsPeriodIncreaseDecrease': 'CashFlow'
|
||||
}
|
||||
|
||||
# Semantic tags for concepts
|
||||
SEMANTIC_TAGS = {
|
||||
'Revenue': ['revenue', 'sales', 'operating'],
|
||||
'NetIncomeLoss': ['profit', 'earnings', 'bottom_line'],
|
||||
'Assets': ['assets', 'resources', 'balance_sheet'],
|
||||
'CashAndCashEquivalentsAtCarryingValue': ['cash', 'liquidity', 'current_assets']
|
||||
}
|
||||
|
||||
@classmethod
|
||||
def parse_company_facts(cls, json_data: Dict[str, Any]) -> Optional[EntityFacts]:
|
||||
"""
|
||||
Parse SEC company facts JSON into EntityFacts.
|
||||
|
||||
Args:
|
||||
json_data: Raw JSON from SEC API
|
||||
|
||||
Returns:
|
||||
EntityFacts object or None if parsing fails
|
||||
"""
|
||||
try:
|
||||
cik = int(json_data.get('cik', 0))
|
||||
entity_name = json_data.get('entityName', 'Unknown')
|
||||
|
||||
facts = []
|
||||
|
||||
# Process facts from different taxonomies
|
||||
facts_data = json_data.get('facts', {})
|
||||
|
||||
for taxonomy, taxonomy_facts in facts_data.items():
|
||||
for concept, concept_data in taxonomy_facts.items():
|
||||
# Process units for this concept
|
||||
units = concept_data.get('units', {})
|
||||
label = concept_data.get('label', concept)
|
||||
description = concept_data.get('description', '')
|
||||
|
||||
for unit, unit_facts in units.items():
|
||||
for fact_data in unit_facts:
|
||||
fact = cls._parse_single_fact(
|
||||
concept=concept,
|
||||
taxonomy=taxonomy,
|
||||
label=label,
|
||||
description=description,
|
||||
unit=unit,
|
||||
fact_data=fact_data
|
||||
)
|
||||
if fact:
|
||||
facts.append(fact)
|
||||
|
||||
if not facts:
|
||||
log.warning("No facts found for CIK %s", cik)
|
||||
return None
|
||||
|
||||
return EntityFacts(cik=cik, name=entity_name, facts=facts)
|
||||
|
||||
except Exception as e:
|
||||
log.error("Error parsing company facts: %s", e)
|
||||
return None
|
||||
|
||||
@classmethod
|
||||
def _parse_single_fact(cls,
|
||||
concept: str,
|
||||
taxonomy: str,
|
||||
label: str,
|
||||
description: str,
|
||||
unit: str,
|
||||
fact_data: Dict[str, Any]) -> Optional[FinancialFact]:
|
||||
"""
|
||||
Parse a single fact from SEC data.
|
||||
|
||||
Args:
|
||||
concept: Concept identifier
|
||||
taxonomy: Taxonomy namespace
|
||||
label: Human-readable label
|
||||
description: Concept description
|
||||
unit: Unit of measure
|
||||
fact_data: Raw fact data
|
||||
|
||||
Returns:
|
||||
FinancialFact or None if parsing fails
|
||||
"""
|
||||
|
||||
# Extract core values
|
||||
value = fact_data.get('val')
|
||||
if value is None:
|
||||
return None
|
||||
|
||||
# Parse dates
|
||||
period_end = cls._parse_date(fact_data.get('end'))
|
||||
period_start = cls._parse_date(fact_data.get('start'))
|
||||
filing_date = cls._parse_date(fact_data.get('filed'))
|
||||
|
||||
# Determine period type
|
||||
if period_start:
|
||||
period_type = 'duration'
|
||||
else:
|
||||
period_type = 'instant'
|
||||
|
||||
# Parse fiscal period info
|
||||
fiscal_year = cls._parse_fiscal_year(fact_data.get('fy'))
|
||||
fiscal_period = fact_data.get('fp', '')
|
||||
|
||||
# Determine numeric value
|
||||
numeric_value = None
|
||||
if isinstance(value, (int, float)):
|
||||
numeric_value = float(value)
|
||||
elif isinstance(value, str) and value.replace('-', '').replace('.', '').isdigit():
|
||||
try:
|
||||
numeric_value = float(value)
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
# Determine statement type
|
||||
statement_type = cls._determine_statement_type(concept)
|
||||
|
||||
# Get semantic tags
|
||||
semantic_tags = cls._get_semantic_tags(concept)
|
||||
|
||||
# Get structural metadata from learned mappings
|
||||
structural_info = cls._get_structural_info(concept)
|
||||
|
||||
# Determine data quality
|
||||
data_quality = cls._assess_data_quality(fact_data, fiscal_period)
|
||||
|
||||
# Create business context
|
||||
business_context = cls._generate_business_context(label, description, unit)
|
||||
|
||||
# Clean unit representation
|
||||
clean_unit = cls._clean_unit(unit)
|
||||
|
||||
# Determine scale
|
||||
scale = cls._determine_scale(unit)
|
||||
|
||||
return FinancialFact(
|
||||
concept=f"{taxonomy}:{concept}",
|
||||
taxonomy=taxonomy,
|
||||
label=label,
|
||||
value=value,
|
||||
numeric_value=numeric_value,
|
||||
unit=clean_unit,
|
||||
scale=scale,
|
||||
period_start=period_start,
|
||||
period_end=period_end,
|
||||
period_type=period_type,
|
||||
fiscal_year=fiscal_year,
|
||||
fiscal_period=fiscal_period,
|
||||
filing_date=filing_date,
|
||||
form_type=fact_data.get('form', ''),
|
||||
accession=fact_data.get('accn', ''),
|
||||
data_quality=data_quality,
|
||||
is_audited=fiscal_period == 'FY', # Annual reports are typically audited
|
||||
is_restated=False, # Would need additional logic to detect
|
||||
is_estimated=False, # Would need additional logic to detect
|
||||
confidence_score=0.9 if data_quality == DataQuality.HIGH else 0.7,
|
||||
semantic_tags=semantic_tags,
|
||||
business_context=business_context,
|
||||
statement_type=statement_type,
|
||||
# Add structural metadata
|
||||
depth=structural_info.get('depth'),
|
||||
parent_concept=structural_info.get('parent'),
|
||||
section=structural_info.get('section'),
|
||||
is_abstract=structural_info.get('is_abstract', False),
|
||||
is_total=structural_info.get('is_total', False),
|
||||
presentation_order=structural_info.get('avg_depth')
|
||||
)
|
||||
|
||||
|
||||
|
||||
@staticmethod
|
||||
def _parse_date(date_str: Optional[str]) -> Optional[date]:
|
||||
"""Parse date string to date object"""
|
||||
if not date_str:
|
||||
return None
|
||||
|
||||
try:
|
||||
# Try common date formats
|
||||
for fmt in ['%Y-%m-%d', '%Y%m%d', '%m/%d/%Y']:
|
||||
try:
|
||||
return datetime.strptime(date_str, fmt).date()
|
||||
except ValueError:
|
||||
continue
|
||||
|
||||
# If all formats fail, try to parse as ISO format
|
||||
return datetime.fromisoformat(date_str).date()
|
||||
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
@staticmethod
|
||||
def _parse_fiscal_year(fy_value: Any) -> int:
|
||||
"""Parse fiscal year value"""
|
||||
if not fy_value:
|
||||
return 0
|
||||
|
||||
try:
|
||||
return int(fy_value)
|
||||
except (ValueError, TypeError):
|
||||
return 0
|
||||
|
||||
@classmethod
|
||||
def _determine_statement_type(cls, concept: str) -> Optional[str]:
|
||||
"""
|
||||
Determine which financial statement a concept belongs to.
|
||||
|
||||
First checks static mappings, then falls back to learned mappings
|
||||
with confidence threshold.
|
||||
"""
|
||||
# Remove namespace if present
|
||||
if ':' in concept:
|
||||
concept = concept.split(':')[-1]
|
||||
|
||||
# Check static mappings first (highest confidence)
|
||||
if concept in cls.STATEMENT_MAPPING:
|
||||
return cls.STATEMENT_MAPPING[concept]
|
||||
|
||||
# Check learned mappings
|
||||
try:
|
||||
learned_mappings = load_learned_mappings()
|
||||
if concept in learned_mappings:
|
||||
mapping = learned_mappings[concept]
|
||||
# Only use high-confidence learned mappings
|
||||
if mapping.get('confidence', 0) >= 0.5: # 50% threshold
|
||||
return mapping['statement_type']
|
||||
except Exception as e:
|
||||
log.debug("Error loading learned mappings: %s", e)
|
||||
|
||||
return None
|
||||
|
||||
@classmethod
|
||||
def _get_semantic_tags(cls, concept: str) -> List[str]:
|
||||
"""Get semantic tags for a concept"""
|
||||
# Remove namespace if present
|
||||
if ':' in concept:
|
||||
concept = concept.split(':')[-1]
|
||||
|
||||
return cls.SEMANTIC_TAGS.get(concept, [])
|
||||
|
||||
@classmethod
|
||||
def _get_structural_info(cls, concept: str) -> Dict[str, Any]:
|
||||
"""
|
||||
Get structural metadata for a concept from learned mappings.
|
||||
|
||||
Returns dict with depth, parent, section, is_abstract, is_total
|
||||
"""
|
||||
# Remove namespace if present
|
||||
if ':' in concept:
|
||||
concept = concept.split(':')[-1]
|
||||
|
||||
try:
|
||||
learned_mappings = load_learned_mappings()
|
||||
if concept in learned_mappings:
|
||||
mapping = learned_mappings[concept]
|
||||
return {
|
||||
'depth': int(mapping.get('avg_depth', 0)) if mapping.get('avg_depth') else None,
|
||||
'parent': mapping.get('parent'),
|
||||
'section': mapping.get('section'),
|
||||
'is_abstract': mapping.get('is_abstract', False),
|
||||
'is_total': mapping.get('is_total', False)
|
||||
}
|
||||
except Exception as e:
|
||||
log.debug("Error getting structural info: %s", e)
|
||||
|
||||
return {}
|
||||
|
||||
@staticmethod
|
||||
def _assess_data_quality(fact_data: Dict[str, Any], fiscal_period: str) -> DataQuality:
|
||||
"""Assess the quality of a fact"""
|
||||
# Annual data is typically higher quality
|
||||
if fiscal_period == 'FY':
|
||||
return DataQuality.HIGH
|
||||
|
||||
# Quarterly data
|
||||
if fiscal_period in ['Q1', 'Q2', 'Q3', 'Q4']:
|
||||
return DataQuality.HIGH
|
||||
|
||||
# Other data
|
||||
return DataQuality.MEDIUM
|
||||
|
||||
@staticmethod
|
||||
def _generate_business_context(label: str, description: str, unit: str) -> str:
|
||||
"""Generate business context for a fact"""
|
||||
# Handle null/None values
|
||||
if not label:
|
||||
label = ""
|
||||
if not description:
|
||||
description = ""
|
||||
|
||||
# Return description if it's longer and more informative than label
|
||||
if description and len(description) > len(label):
|
||||
return description
|
||||
|
||||
# Generate context based on label and unit
|
||||
if label and 'Revenue' in label:
|
||||
return "Total revenue generated from operations"
|
||||
elif label and 'Income' in label:
|
||||
return "Net earnings after all expenses and taxes"
|
||||
elif label and 'Assets' in label:
|
||||
return "Total resources owned by the company"
|
||||
|
||||
# Return label if available, otherwise empty string
|
||||
return label if label else ""
|
||||
|
||||
@staticmethod
|
||||
def _clean_unit(unit: str) -> str:
|
||||
"""Clean and standardize unit representation"""
|
||||
if not unit:
|
||||
return ""
|
||||
|
||||
unit_mapping = {
|
||||
'USD': 'USD',
|
||||
'usd': 'USD',
|
||||
'pure': 'number',
|
||||
'shares': 'shares',
|
||||
'USD/shares': 'USD per share'
|
||||
}
|
||||
|
||||
return unit_mapping.get(unit, unit)
|
||||
|
||||
@staticmethod
|
||||
def _determine_scale(unit: str) -> Optional[int]:
|
||||
"""Determine scale factor from unit"""
|
||||
# SEC data is typically already scaled
|
||||
# This would need more sophisticated logic based on the actual data
|
||||
return None
|
||||
1215
venv/lib/python3.10/site-packages/edgar/entity/query.py
Normal file
1215
venv/lib/python3.10/site-packages/edgar/entity/query.py
Normal file
File diff suppressed because it is too large
Load Diff
116
venv/lib/python3.10/site-packages/edgar/entity/search.py
Normal file
116
venv/lib/python3.10/site-packages/edgar/entity/search.py
Normal file
@@ -0,0 +1,116 @@
|
||||
"""
|
||||
Search functionality for SEC entities.
|
||||
This module provides functions and classes for searching for SEC entities.
|
||||
"""
|
||||
from functools import lru_cache
|
||||
from typing import Any, Dict, List
|
||||
|
||||
import pandas as pd
|
||||
from rich import box
|
||||
from rich.table import Column, Table
|
||||
|
||||
from edgar.entity import Company
|
||||
from edgar.entity.tickers import get_company_tickers
|
||||
from edgar.richtools import repr_rich
|
||||
from edgar.search.datasearch import FastSearch, company_ticker_preprocess, company_ticker_score
|
||||
|
||||
__all__ = [
|
||||
'find_company',
|
||||
'CompanySearchResults',
|
||||
'CompanySearchIndex'
|
||||
]
|
||||
|
||||
|
||||
class CompanySearchResults:
    """
    Results from a company search.

    Wraps the raw search hits in a DataFrame with columns
    cik/ticker/company/score and provides list-style access to the matches.
    """
    def __init__(self, query: str,
                 search_results: List[Dict[str, Any]]):
        self.query: str = query
        self.results: pd.DataFrame = pd.DataFrame(search_results, columns=['cik', 'ticker', 'company', 'score'])

    @property
    def tickers(self):
        """Tickers of the matched companies, best match first."""
        return self.results.ticker.tolist()

    @property
    def ciks(self):
        """CIK numbers of the matched companies, best match first."""
        return self.results.cik.tolist()

    @property
    def empty(self):
        """True when the search produced no matches."""
        return self.results.empty

    def __len__(self):
        return len(self.results)

    def __getitem__(self, item):
        """Return the Company at position `item`; raises IndexError when out of range."""
        if 0 <= item < len(self):
            row = self.results.iloc[item]
            cik: int = int(row.cik)
            return Company(cik)
        # Raise instead of silently returning None: the sequence iteration
        # protocol relies on IndexError to terminate, so returning None here
        # would make `for company in results` loop forever.
        raise IndexError(item)

    def __rich__(self):
        table = Table(Column(""),
                      Column("Ticker", justify="left"),
                      Column("Name", justify="left"),
                      Column("Score", justify="left"),
                      title=f"Search results for '{self.query}'",
                      box=box.SIMPLE)
        for index, row in enumerate(self.results.itertuples()):
            table.add_row(str(index), row.ticker.rjust(6), row.company, f"{int(row.score)}%")
        return table

    def __repr__(self):
        return repr_rich(self.__rich__())
|
||||
|
||||
|
||||
class CompanySearchIndex(FastSearch):
    """
    Search index for companies.
    """
    def __init__(self):
        # Build the index over the full company/ticker dataset.
        records = get_company_tickers(as_dataframe=False)
        super().__init__(records, ['company', 'ticker'],
                         preprocess_func=company_ticker_preprocess,
                         score_func=company_ticker_score)

    def search(self, query: str, top_n: int = 10, threshold: float = 60) -> CompanySearchResults:
        # Wrap the raw hits from the base class in a rich result object.
        hits = super().search(query, top_n, threshold)
        return CompanySearchResults(query=query, search_results=hits)

    def __len__(self):
        return len(self.data)

    def __hash__(self):
        # Combine column names and last 10 values in the 'company' column to create a hash
        column_names = tuple(self.data[0].keys())
        recent_companies = tuple(record['company'] for record in self.data[-10:])
        return hash((column_names, recent_companies))

    def __eq__(self, other):
        if not isinstance(other, CompanySearchIndex):
            return False
        mine = (self.data[-10:], tuple(self.data[0].keys()))
        theirs = (other.data[-10:], tuple(other.data[0].keys()))
        return mine == theirs
|
||||
|
||||
|
||||
@lru_cache(maxsize=1)
def _get_company_search_index():
    """Build (at most once, via lru_cache) and return the shared company search index."""
    index = CompanySearchIndex()
    return index
|
||||
|
||||
|
||||
@lru_cache(maxsize=16)
def find_company(company: str, top_n: int = 10):
    """
    Search for a company by name or ticker.

    Args:
        company: The company name or ticker to search for
        top_n: The maximum number of results to return

    Returns:
        CompanySearchResults: The search results
    """
    index = _get_company_search_index()
    return index.search(company, top_n=top_n)
|
||||
495
venv/lib/python3.10/site-packages/edgar/entity/statement.py
Normal file
495
venv/lib/python3.10/site-packages/edgar/entity/statement.py
Normal file
@@ -0,0 +1,495 @@
|
||||
"""
|
||||
Financial Statement wrapper classes with rich display and concept-aware formatting.
|
||||
|
||||
This module provides Statement classes that wrap pandas DataFrames with:
|
||||
- Intelligent formatting based on financial concept types
|
||||
- Rich display for professional presentation
|
||||
- Access to underlying data for calculations
|
||||
- LLM-ready context generation
|
||||
"""
|
||||
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
import pandas as pd
|
||||
from rich.box import SIMPLE, SIMPLE_HEAVY
|
||||
from rich.console import Group
|
||||
from rich.padding import Padding
|
||||
from rich.panel import Panel
|
||||
from rich.table import Table
|
||||
from rich.text import Text
|
||||
|
||||
from .terminal_styles import get_current_scheme
|
||||
|
||||
|
||||
@dataclass
class ConceptFormatting:
    """Formatting rules for specific financial concepts"""
    decimal_places: int = 2      # number of decimal places to render
    show_currency: bool = True   # prefix values with '$'
    scale_display: bool = True   # Show M, B suffixes (currently unused: formatters below always render full numbers)
    percentage: bool = False     # render with a trailing '%'


class FinancialStatement:
    """
    A wrapper around pandas DataFrame for financial statements with intelligent formatting.

    This class provides:
    - Concept-aware formatting (EPS to 2 decimals, revenue in millions, etc.)
    - Rich display for professional presentation
    - Access to underlying numeric data
    - LLM context generation
    """

    # Formatting rules matched by substring against the lower-cased concept
    # label.  Insertion order matters: the first matching pattern wins.
    CONCEPT_FORMATS = {
        # Earnings per share - always show decimals
        'earningspershare': ConceptFormatting(decimal_places=2, show_currency=False, scale_display=False),
        'earnings per share': ConceptFormatting(decimal_places=2, show_currency=False, scale_display=False),
        'eps': ConceptFormatting(decimal_places=2, show_currency=False, scale_display=False),

        # Ratios and percentages
        'ratio': ConceptFormatting(decimal_places=2, show_currency=False, scale_display=False),
        'margin': ConceptFormatting(decimal_places=1, show_currency=False, scale_display=False, percentage=True),
        'percent': ConceptFormatting(decimal_places=1, show_currency=False, scale_display=False, percentage=True),

        # Per-share values
        'per share': ConceptFormatting(decimal_places=2, show_currency=False, scale_display=False),
        'pershare': ConceptFormatting(decimal_places=2, show_currency=False, scale_display=False),
        'book value': ConceptFormatting(decimal_places=2, show_currency=False, scale_display=False),
        'dividend': ConceptFormatting(decimal_places=2, show_currency=False, scale_display=False),

        # Share counts - show full numbers with commas
        'shares outstanding': ConceptFormatting(decimal_places=0, show_currency=False, scale_display=False),
        'common stock': ConceptFormatting(decimal_places=0, show_currency=False, scale_display=False),
        'weighted average': ConceptFormatting(decimal_places=0, show_currency=False, scale_display=False),

        # Large financial amounts - show full numbers with commas
        'revenue': ConceptFormatting(decimal_places=0, show_currency=True, scale_display=False),
        'income': ConceptFormatting(decimal_places=0, show_currency=True, scale_display=False),
        'assets': ConceptFormatting(decimal_places=0, show_currency=True, scale_display=False),
        'liabilities': ConceptFormatting(decimal_places=0, show_currency=True, scale_display=False),
    }

    def __init__(self,
                 data: pd.DataFrame,
                 statement_type: str,
                 entity_name: str = "",
                 period_lengths: Optional[List[str]] = None,
                 mixed_periods: bool = False):
        """
        Initialize financial statement.

        Args:
            data: DataFrame with financial data
            statement_type: Type of statement (IncomeStatement, BalanceSheet, etc.)
            entity_name: Company name
            period_lengths: List of period lengths in the data
            mixed_periods: Whether data contains mixed period lengths
        """
        self.data = data
        self.statement_type = statement_type
        self.entity_name = entity_name
        self.period_lengths = period_lengths or []
        self.mixed_periods = mixed_periods

        # Store original numeric data
        self._numeric_data = data.copy()

    def get_concept_formatting(self, concept_label: str) -> ConceptFormatting:
        """
        Get formatting rules for a specific concept.

        Args:
            concept_label: Label of the financial concept

        Returns:
            ConceptFormatting rules for this concept
        """
        label_lower = concept_label.lower()

        # First pattern that appears as a substring of the label wins
        for pattern, formatting in self.CONCEPT_FORMATS.items():
            if pattern in label_lower:
                return formatting

        # Default formatting for large amounts - show full numbers with commas
        return ConceptFormatting(decimal_places=0, show_currency=True, scale_display=False)

    def format_value(self, value: float, concept_label: str) -> str:
        """
        Format a single value based on its concept.

        Args:
            value: Numeric value to format
            concept_label: Label of the financial concept

        Returns:
            Formatted string representation ('' for NaN)
        """
        if pd.isna(value):
            return ''

        formatting = self.get_concept_formatting(concept_label)

        # Handle percentage formatting
        if formatting.percentage:
            return f"{value:.{formatting.decimal_places}f}%"

        # Always use full number formatting with commas - no scaling to preserve precision
        if formatting.show_currency:
            return f"${value:,.{formatting.decimal_places}f}"
        else:
            return f"{value:,.{formatting.decimal_places}f}"

    def _format_dataframe(self) -> pd.DataFrame:
        """
        Build a string copy of ``self.data`` with every numeric cell formatted
        according to its concept's rules.

        Shared by `_repr_html_` and `__str__`, which previously duplicated
        this double loop verbatim.
        """
        formatted_data = pd.DataFrame(index=self.data.index, columns=self.data.columns, dtype=str)

        for index in self.data.index:
            concept_label = str(index)
            for column in self.data.columns:
                value = self.data.loc[index, column]
                if pd.notna(value) and isinstance(value, (int, float)):
                    formatted_data.loc[index, column] = self.format_value(value, concept_label)
                else:
                    formatted_data.loc[index, column] = str(value) if pd.notna(value) else ''

        return formatted_data

    def _repr_html_(self) -> str:
        """
        Rich HTML representation for Jupyter notebooks.

        Returns:
            HTML string for rich display
        """
        # Create a formatted copy as string DataFrame
        formatted_data = self._format_dataframe()

        # Create HTML with styling
        html = f"""
        <div style="font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;">
            <h3 style="color: #2c3e50; margin-bottom: 10px;">
                {self.entity_name} - {self.statement_type.replace('Statement', ' Statement')}
            </h3>
        """

        # Add period warning if mixed
        if self.mixed_periods:
            html += """
            <div style="background-color: #fff3cd; border: 1px solid #ffeaa7;
                        padding: 8px; margin-bottom: 10px; border-radius: 4px;">
                <strong>⚠️ Mixed Period Lengths:</strong> This statement contains periods of different lengths
                ({periods}). Consider filtering to comparable periods for accurate analysis.
            </div>
            """.format(periods=', '.join(self.period_lengths))

        # Add the formatted table
        html += formatted_data.to_html(classes='financial-statement',
                                       table_id='fs-table',
                                       escape=False)

        # Add CSS styling
        html += """
        <style>
        .financial-statement {
            border-collapse: collapse;
            width: 100%;
            font-size: 12px;
            margin-top: 10px;
        }
        .financial-statement th {
            background-color: #34495e;
            color: white;
            padding: 8px;
            text-align: right;
            font-weight: bold;
        }
        .financial-statement td {
            padding: 6px 8px;
            text-align: right;
            border-bottom: 1px solid #ecf0f1;
        }
        .financial-statement tr:hover {
            background-color: #f8f9fa;
        }
        .financial-statement tr:nth-child(even) {
            background-color: #fdfdfd;
        }
        .financial-statement td:first-child {
            text-align: left;
            font-weight: 500;
        }
        </style>
        </div>
        """

        return html

    def __str__(self) -> str:
        """
        String representation for console display.

        Returns:
            Formatted string representation
        """
        # Create formatted version as string DataFrame
        formatted_data = self._format_dataframe()

        header = f"\n{self.entity_name} - {self.statement_type.replace('Statement', ' Statement')}\n"
        header += "=" * len(header.strip()) + "\n"

        if self.mixed_periods:
            header += f"⚠️ Mixed period lengths: {', '.join(self.period_lengths)}\n\n"

        return header + str(formatted_data)

    def __rich__(self):
        """Creates a rich representation for professional financial statement display."""

        colors = get_current_scheme()

        if self.data.empty:
            return Panel(
                Text("No data available", style=colors["empty_value"]),
                title=f"📊 {self.statement_type.replace('Statement', ' Statement')}",
                border_style=colors["panel_border"]
            )

        # Statement type icon mapping
        icon_map = {
            'IncomeStatement': '💰',
            'BalanceSheet': '⚖️',
            'CashFlow': '💵',
            'Statement': '📊'
        }
        icon = icon_map.get(self.statement_type, '📊')

        # Title with company name and statement type
        if self.entity_name:
            title = Text.assemble(
                icon + " ",
                (self.entity_name, colors["company_name"]),
                " ",
                (self.statement_type.replace('Statement', ' Statement'), colors["statement_type"])
            )
        else:
            title = Text.assemble(
                icon + " ",
                (self.statement_type.replace('Statement', ' Statement'), colors["statement_type"])
            )

        # Create the main financial statement table
        statement_table = Table(box=SIMPLE, show_header=True, padding=(0, 1))
        statement_table.add_column("Line Item", style=colors["total_item"], no_wrap=True, max_width=30)

        # Add period columns (limit to reasonable number for display)
        periods = list(self.data.columns)
        display_periods = periods[:6]  # Show max 6 periods for readability
        has_more_periods = len(periods) > 6

        for period in display_periods:
            statement_table.add_column(str(period), justify="right", max_width=15)

        # Add rows with formatted values
        for index in self.data.index:
            concept_label = str(index)
            # Truncate long concept names
            display_label = concept_label[:28] + "..." if len(concept_label) > 30 else concept_label

            row_values = [display_label]
            for period in display_periods:
                value = self.data.loc[index, period]
                if pd.notna(value) and isinstance(value, (int, float)):
                    formatted_value = self.format_value(value, concept_label)
                    row_values.append(formatted_value)
                else:
                    row_values.append("-" if pd.isna(value) else str(value)[:12])

            statement_table.add_row(*row_values)

        # Create summary info panel
        info_table = Table(box=SIMPLE_HEAVY, show_header=False, padding=(0, 1))
        info_table.add_column("Metric", style=colors["low_confidence_item"])
        info_table.add_column("Value", style=colors["total_item"])

        info_table.add_row("Line Items", f"{len(self.data.index):,}")
        info_table.add_row("Periods", f"{len(self.data.columns):,}")
        if self.period_lengths:
            info_table.add_row("Period Types", ", ".join(set(self.period_lengths)))

        info_panel = Panel(
            info_table,
            title="📋 Statement Info",
            border_style="bright_black"
        )

        # Create period warning if needed
        warning_panel = None
        if self.mixed_periods:
            warning_text = Text.assemble(
                "⚠️ Mixed period lengths detected: ",
                (", ".join(self.period_lengths), "yellow"),
                "\nConsider filtering to comparable periods for accurate analysis."
            )
            warning_panel = Panel(
                warning_text,
                title="🚨 Period Warning",
                border_style=colors.get("warning", "yellow")
            )

        # Subtitle with additional info
        subtitle_parts = [f"{len(self.data.index):,} line items"]
        if has_more_periods:
            subtitle_parts.append(f"showing first {len(display_periods)} of {len(periods)} periods")
        subtitle = " • ".join(subtitle_parts)

        # Main statement panel
        statement_panel = Panel(
            statement_table,
            title="📊 Financial Data",
            subtitle=subtitle,
            border_style="bright_black"
        )

        # Combine all panels
        content_renderables = [
            Padding("", (1, 0, 0, 0)),
            info_panel
        ]

        if warning_panel:
            content_renderables.append(warning_panel)

        content_renderables.append(statement_panel)

        content = Group(*content_renderables)

        return Panel(
            content,
            title=title,
            border_style=colors["panel_border"]
        )

    def __repr__(self):
        """String representation using rich formatting."""
        from edgar.richtools import repr_rich
        return repr_rich(self.__rich__())

    def to_numeric(self) -> pd.DataFrame:
        """
        Get the underlying numeric DataFrame for calculations.

        Returns:
            DataFrame with original numeric values
        """
        return self._numeric_data.copy()

    def to_llm_context(self) -> Dict[str, Any]:
        """
        Generate LLM-friendly context from the statement.

        Returns:
            Dictionary with structured financial data for LLM consumption
        """
        context = {
            "entity_name": self.entity_name,
            "statement_type": self.statement_type,
            "period_lengths": self.period_lengths,
            "mixed_periods": self.mixed_periods,
            "periods": list(self.data.columns),
            "line_items": {}
        }

        # Convert each line item to LLM-friendly format
        for index in self.data.index:
            concept_label = str(index)
            line_item = {
                "label": concept_label,
                "values": {},
                "formatting": self.get_concept_formatting(concept_label).__dict__
            }

            for column in self.data.columns:
                value = self.data.loc[index, column]
                if pd.notna(value):
                    line_item["values"][str(column)] = {
                        "raw_value": float(value),
                        "formatted_value": self.format_value(value, concept_label)
                    }

            context["line_items"][concept_label] = line_item

        return context

    def get_concept(self, concept_name: str) -> Optional[pd.Series]:
        """
        Get data for a specific concept across all periods.

        Args:
            concept_name: Name of the concept to retrieve

        Returns:
            Series with values across periods, or None if not found
        """
        # Try exact match first
        if concept_name in self.data.index:
            return self.data.loc[concept_name]

        # Try case-insensitive partial match
        concept_lower = concept_name.lower()
        for index in self.data.index:
            if concept_lower in str(index).lower():
                return self.data.loc[index]

        return None

    def calculate_growth(self, concept_name: str, periods: int = 2) -> Optional[pd.Series]:
        """
        Calculate period-over-period growth for a concept.

        Args:
            concept_name: Name of the concept
            periods: Number of periods to calculate growth over

        Returns:
            Series with growth rates (percent), or None if concept not found
        """
        concept_data = self.get_concept(concept_name)
        if concept_data is None:
            return None

        # Calculate percentage change
        return concept_data.pct_change(periods=periods) * 100

    @property
    def shape(self) -> tuple:
        """Get the shape of the underlying data."""
        return self.data.shape

    @property
    def columns(self) -> pd.Index:
        """Get the columns of the underlying data."""
        return self.data.columns

    @property
    def index(self) -> pd.Index:
        """Get the index of the underlying data."""
        return self.data.index

    @property
    def empty(self) -> bool:
        """Check if the underlying DataFrame is empty."""
        return self.data.empty

    def __len__(self) -> int:
        """Get the length of the underlying DataFrame."""
        return len(self.data)
|
||||
@@ -0,0 +1,731 @@
|
||||
"""
|
||||
Statement Builder for reconstructing financial statements using canonical structures.
|
||||
|
||||
This module provides intelligent statement reconstruction using learned canonical
|
||||
structures and virtual presentation trees.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from collections import defaultdict
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import date
|
||||
from typing import Any, Dict, List, Optional, Set
|
||||
|
||||
from rich import box
|
||||
from rich.columns import Columns
|
||||
from rich.console import Group
|
||||
from rich.padding import Padding
|
||||
from rich.panel import Panel
|
||||
from rich.table import Table
|
||||
from rich.text import Text
|
||||
|
||||
from edgar.entity.mappings_loader import load_canonical_structures, load_virtual_trees
|
||||
from edgar.entity.models import FinancialFact
|
||||
from edgar.richtools import repr_rich
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
class StatementItem:
    """A single item in a reconstructed financial statement."""
    concept: str
    label: str
    value: Optional[float]
    depth: int
    parent_concept: Optional[str]
    children: List['StatementItem'] = field(default_factory=list)

    # Metadata
    is_abstract: bool = False
    is_total: bool = False
    section: Optional[str] = None
    confidence: float = 1.0
    source: str = 'fact'  # 'fact', 'calculated', 'canonical', 'placeholder'

    # Original fact if available
    fact: Optional[FinancialFact] = None

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary representation (children converted recursively)."""
        result = {
            'concept': self.concept,
            'label': self.label,
            'value': self.value,
            'depth': self.depth,
            'is_abstract': self.is_abstract,
            'is_total': self.is_total,
            'section': self.section,
            'confidence': self.confidence,
            'source': self.source,
        }
        result['children'] = [child.to_dict() for child in self.children]
        return result

    def get_display_value(self) -> str:
        """Get formatted value for display, scaled with B/M/K suffixes."""
        if self.value is None:
            # No value: headers show nothing, placeholders are flagged,
            # anything else gets a dash.
            if self.is_abstract:
                return ""
            if self.source == 'placeholder':
                return "[Missing]"
            return "-"

        magnitude = abs(self.value)
        if magnitude >= 1_000_000_000:
            return f"${self.value/1_000_000_000:.1f}B"
        if magnitude >= 1_000_000:
            return f"${self.value/1_000_000:.1f}M"
        if magnitude >= 1_000:
            return f"${self.value/1_000:.0f}K"
        return f"${self.value:.0f}"

    def __rich__(self):
        """Render this item (and its children) as a rich Tree."""
        from rich.tree import Tree

        # Choose a label style based on the item's role in the statement
        if self.is_abstract:
            node_label = Text(self.label, style="bold cyan")
        elif self.is_total:
            node_label = Text(self.label, style="bold yellow")
        else:
            uncertain = self.confidence < 0.8
            node_label = Text(f"{self.label}{' ◦' if uncertain else ''}",
                              style="dim" if uncertain else "")

        # Append the formatted value, color-coded red/green for dollar amounts
        value_str = self.get_display_value()
        if value_str and value_str != "-":
            is_money = value_str.startswith("$") and self.value and isinstance(self.value, (int, float))
            value_style = ("red" if self.value < 0 else "green") if is_money else ""
            node_label = Text.assemble(node_label, " ", (value_str, value_style))

        tree = Tree(node_label)
        for child in self.children:
            tree.add(child.__rich__())
        return tree

    def __repr__(self) -> str:
        """String representation using rich formatting."""
        return repr_rich(self.__rich__())
|
||||
|
||||
|
||||
@dataclass
class StructuredStatement:
    """A complete structured financial statement.

    Combines the hierarchical ``items`` produced by StatementBuilder with
    company/period metadata and coverage statistics.
    """
    statement_type: str
    fiscal_year: Optional[int]
    fiscal_period: Optional[str]
    period_end: Optional[date]

    items: List[StatementItem]

    # Metadata
    company_name: Optional[str] = None
    cik: Optional[str] = None
    canonical_coverage: float = 0.0  # fraction of canonical concepts matched (0..1)
    facts_used: int = 0
    facts_total: int = 0

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary representation (items converted recursively)."""
        return {
            'statement_type': self.statement_type,
            'fiscal_year': self.fiscal_year,
            'fiscal_period': self.fiscal_period,
            'period_end': self.period_end.isoformat() if self.period_end else None,
            'company_name': self.company_name,
            'cik': self.cik,
            'canonical_coverage': self.canonical_coverage,
            'facts_used': self.facts_used,
            'facts_total': self.facts_total,
            'items': [item.to_dict() for item in self.items]
        }

    def get_hierarchical_display(self, max_depth: int = 3) -> str:
        """Get hierarchical text representation, indented by tree depth.

        Items deeper than ``max_depth`` are omitted; low-confidence items
        are marked with a trailing '*'.
        """
        lines = []

        def add_item(item: StatementItem, indent: int = 0):
            if indent > max_depth:
                return

            indent_str = "  " * indent
            value_str = item.get_display_value()

            if item.is_abstract:
                lines.append(f"{indent_str}{item.label}")
            elif item.is_total:
                lines.append(f"{indent_str}{item.label:<40} {value_str:>15}")
                lines.append(f"{indent_str}{'-' * 55}")
            else:
                confidence_marker = "" if item.confidence > 0.8 else " *"
                lines.append(f"{indent_str}{item.label:<40} {value_str:>15}{confidence_marker}")

            for child in item.children:
                add_item(child, indent + 1)

        for item in self.items:
            add_item(item)

        return "\n".join(lines)

    def __rich__(self):
        """Create a rich representation of the structured statement."""
        # Statement type mapping for better display
        statement_names = {
            'IncomeStatement': 'Income Statement',
            'BalanceSheet': 'Balance Sheet',
            'CashFlow': 'Cash Flow Statement',
            'StatementsOfComprehensiveIncome': 'Comprehensive Income',
            'StatementsOfShareholdersEquity': 'Shareholders Equity'
        }

        # Title with company name and period
        title_parts = []
        if self.company_name:
            title_parts.append((self.company_name, "bold green"))
        else:
            title_parts.append(("Financial Statement", "bold"))

        title = Text.assemble(*title_parts)

        # Subtitle with statement type and period
        statement_display = statement_names.get(self.statement_type, self.statement_type)
        if self.fiscal_period and self.fiscal_year:
            subtitle = f"{statement_display} • {self.fiscal_period} {self.fiscal_year}"
        elif self.period_end:
            subtitle = f"{statement_display} • As of {self.period_end}"
        else:
            subtitle = statement_display

        # Main statement table
        stmt_table = Table(
            box=box.SIMPLE,
            show_header=False,
            padding=(0, 1),
            expand=True
        )
        stmt_table.add_column("Item", style="", ratio=3)
        stmt_table.add_column("Value", justify="right", style="bold", ratio=1)

        def add_item_to_table(item: StatementItem, depth: int = 0):
            """Add an item to the table with proper indentation."""
            indent = "  " * depth

            if item.is_abstract:
                # Abstract items are headers
                stmt_table.add_row(
                    Text(f"{indent}{item.label}", style="bold cyan"),
                    ""
                )
            elif item.is_total:
                # Total items with underline
                value_text = Text(item.get_display_value(), style="bold yellow")
                stmt_table.add_row(
                    Text(f"{indent}{item.label}", style="bold"),
                    value_text
                )
                # Add a separator line after top-level totals
                if depth == 0:
                    stmt_table.add_row("", "")
                    stmt_table.add_row(
                        Text("─" * 40, style="dim"),
                        Text("─" * 15, style="dim")
                    )
            else:
                # Regular items
                style = "dim" if item.confidence < 0.8 else ""
                confidence_marker = " ◦" if item.confidence < 0.8 else ""
                label_text = f"{indent}{item.label}{confidence_marker}"

                # Color code positive/negative dollar values.
                # NOTE: this used to sit inside a bare `except:` although the
                # checks below cannot raise — the dead try/except was removed.
                value_str = item.get_display_value()
                if (value_str and value_str.startswith("$")
                        and item.value and isinstance(item.value, (int, float))):
                    value_style = "red" if item.value < 0 else "green"
                else:
                    value_style = ""

                stmt_table.add_row(
                    Text(label_text, style=style),
                    Text(value_str, style=value_style) if value_str else ""
                )

            # Add children recursively
            for child in item.children:
                if depth < 3:  # Limit depth for display
                    add_item_to_table(child, depth + 1)

        # Add all items to the table
        for item in self.items:
            add_item_to_table(item)

        # Metadata summary
        metadata = Table(box=box.SIMPLE, show_header=False, padding=(0, 1))
        metadata.add_column("Metric", style="dim")
        metadata.add_column("Value", style="bold")

        metadata.add_row("Facts Used", f"{self.facts_used:,}")
        if self.facts_total > 0:
            metadata.add_row("Total Facts", f"{self.facts_total:,}")

        if self.canonical_coverage > 0:
            coverage_pct = self.canonical_coverage * 100
            coverage_style = "green" if coverage_pct >= 50 else "yellow" if coverage_pct >= 25 else "red"
            metadata.add_row(
                "Canonical Coverage",
                Text(f"{coverage_pct:.1f}%", style=coverage_style)
            )

        if self.cik:
            metadata.add_row("CIK", self.cik)

        # Data quality indicators
        quality_notes = []

        # Count items by confidence
        low_confidence_count = sum(
            1 for item in self._flatten_items()
            if not item.is_abstract and item.confidence < 0.8
        )

        if low_confidence_count > 0:
            quality_notes.append(
                Text(f"◦ {low_confidence_count} items with lower confidence", style="dim yellow")
            )

        # Count calculated vs actual values
        calculated_count = sum(
            1 for item in self._flatten_items()
            if item.source == 'calculated'
        )

        if calculated_count > 0:
            quality_notes.append(
                Text(f"◦ {calculated_count} calculated values", style="dim cyan")
            )

        # Combine metadata and quality notes
        metadata_panel = Panel(
            metadata,
            title="📊 Statement Metadata",
            border_style="bright_black"
        )

        # Create the main content group
        content_parts = [
            Padding("", (1, 0, 0, 0)),
            stmt_table
        ]

        # Add metadata in a column layout
        if self.facts_used > 0:
            bottom_content = [metadata_panel]

            if quality_notes:
                quality_panel = Panel(
                    Group(*quality_notes),
                    title="📝 Data Quality Notes",
                    border_style="bright_black"
                )
                bottom_content.append(quality_panel)

            content_parts.append(Padding("", (1, 0)))
            content_parts.append(Columns(bottom_content, equal=True, expand=True))

        content = Group(*content_parts)

        # Create the main panel
        return Panel(
            content,
            title=title,
            subtitle=subtitle,
            border_style="blue",
            expand=True
        )

    def _flatten_items(self) -> List[StatementItem]:
        """Flatten the hierarchical items into a flat list (depth-first order)."""
        flat_items = []

        def flatten(item: StatementItem):
            flat_items.append(item)
            for child in item.children:
                flatten(child)

        for item in self.items:
            flatten(item)

        return flat_items

    def __repr__(self) -> str:
        """String representation using rich formatting."""
        return repr_rich(self.__rich__())
|
||||
|
||||
|
||||
class StatementBuilder:
    """
    Builds structured financial statements using canonical templates.

    This class reconstructs complete financial statements by combining
    actual facts with canonical structures, filling in missing concepts
    and maintaining proper hierarchy.
    """

    def __init__(self, cik: Optional[str] = None):
        """
        Initialize the statement builder.

        Args:
            cik: Company CIK for context
        """
        self.cik = cik
        self.canonical_structures = load_canonical_structures()
        self.virtual_trees = load_virtual_trees()

    def build_statement(self,
                        facts: List[FinancialFact],
                        statement_type: str,
                        fiscal_year: Optional[int] = None,
                        fiscal_period: Optional[str] = None,
                        use_canonical: bool = True,
                        include_missing: bool = False) -> StructuredStatement:
        """
        Build a structured financial statement from facts.

        Args:
            facts: List of financial facts
            statement_type: Type of statement (BalanceSheet, IncomeStatement, etc.)
            fiscal_year: Fiscal year to filter for
            fiscal_period: Fiscal period (FY, Q1, Q2, Q3, Q4)
            use_canonical: Whether to use canonical structure for organization
            include_missing: Whether to include placeholder for missing concepts

        Returns:
            StructuredStatement with hierarchical organization
        """
        # Filter facts for this statement and period
        filtered_facts = self._filter_facts(facts, statement_type, fiscal_year, fiscal_period)

        # Create fact lookup (one fact per concept; most recent filing wins)
        fact_map = self._create_fact_map(filtered_facts)

        # Get period end date (first fact with one set)
        period_end = self._get_period_end(filtered_facts)

        if use_canonical and statement_type in self.virtual_trees:
            # Build using the canonical structure for this statement type
            items = self._build_with_canonical(
                fact_map,
                self.virtual_trees[statement_type],
                include_missing
            )

            # Append facts that don't map onto any canonical concept
            unmatched = self._find_unmatched_facts(fact_map, self.virtual_trees[statement_type])
            items.extend(self._create_items_from_facts(unmatched))
        else:
            # No canonical tree available (or disabled): build from facts only
            items = self._build_from_facts(fact_map)

        # Calculate metadata
        facts_used = len(fact_map)
        canonical_coverage = self._calculate_coverage(fact_map, statement_type) if use_canonical else 0.0

        return StructuredStatement(
            statement_type=statement_type,
            fiscal_year=fiscal_year,
            fiscal_period=fiscal_period,
            period_end=period_end,
            items=items,
            cik=self.cik,
            canonical_coverage=canonical_coverage,
            facts_used=facts_used,
            facts_total=len(facts)
        )

    def _filter_facts(self, facts: List[FinancialFact],
                      statement_type: str,
                      fiscal_year: Optional[int],
                      fiscal_period: Optional[str]) -> List[FinancialFact]:
        """Filter facts for the requested statement and period.

        A fiscal_year/fiscal_period of None means "don't filter on it".
        """
        filtered = []

        for fact in facts:
            # Check statement type
            if fact.statement_type != statement_type:
                continue

            # Check fiscal year
            if fiscal_year and fact.fiscal_year != fiscal_year:
                continue

            # Check fiscal period
            if fiscal_period and fact.fiscal_period != fiscal_period:
                continue

            filtered.append(fact)

        return filtered

    def _create_fact_map(self, facts: List[FinancialFact]) -> Dict[str, FinancialFact]:
        """Create a map of (namespace-stripped) concept name to fact.

        When several facts report the same concept, the one with the most
        recent filing date wins.
        """
        fact_map = {}

        for fact in facts:
            # Strip the taxonomy namespace prefix (e.g. "us-gaap:Revenue" -> "Revenue")
            concept = fact.concept
            if ':' in concept:
                concept = concept.split(':', 1)[1]

            # Use most recent fact for duplicates
            if concept not in fact_map or fact.filing_date > fact_map[concept].filing_date:
                fact_map[concept] = fact

        return fact_map

    def _get_period_end(self, facts: List[FinancialFact]) -> Optional[date]:
        """Return the period end date of the first fact that has one, else None."""
        for fact in facts:
            if fact.period_end:
                return fact.period_end
        return None

    def _build_with_canonical(self, fact_map: Dict[str, FinancialFact],
                              virtual_tree: Dict[str, Any],
                              include_missing: bool) -> List[StatementItem]:
        """Build statement items by walking the canonical virtual tree roots."""
        items = []
        # Shared across the whole walk so a concept appearing under two
        # parents is only emitted once.
        processed = set()

        for root_concept in virtual_tree.get('roots', []):
            item = self._build_canonical_item(
                root_concept,
                virtual_tree['nodes'],
                fact_map,
                processed,
                include_missing,
                depth=0
            )
            if item:
                items.append(item)

        return items

    def _build_canonical_item(self, concept: str,
                              nodes: Dict[str, Any],
                              fact_map: Dict[str, FinancialFact],
                              processed: Set[str],
                              include_missing: bool,
                              depth: int = 0,
                              parent: Optional[str] = None) -> Optional[StatementItem]:
        """Build a single canonical item (and, recursively, its children).

        Returns None when the concept was already processed, or when it has
        no reported fact and is neither abstract, required (occurrence rate
        >= 0.8), nor explicitly requested via include_missing.
        """
        if concept in processed:
            return None

        processed.add(concept)

        # Canonical node metadata (label, children, occurrence stats, flags)
        node = nodes.get(concept, {})

        # The actual reported fact for this concept, if any
        fact = fact_map.get(concept)

        # Skip missing concrete concepts unless they are core to the statement
        if not fact and not include_missing and not node.get('is_abstract'):
            if node.get('occurrence_rate', 0) < 0.8:  # Not a core concept
                return None

        item = StatementItem(
            concept=concept,
            label=fact.label if fact else node.get('label', concept),
            value=fact.numeric_value if fact else None,
            depth=depth,
            parent_concept=parent,
            is_abstract=node.get('is_abstract', False),
            is_total=node.get('is_total', False),
            section=node.get('section'),
            # A reported fact is fully trusted; otherwise fall back to how
            # often the concept appears in canonical filings.
            confidence=node.get('occurrence_rate', 1.0) if not fact else 1.0,
            source='fact' if fact else ('canonical' if not include_missing else 'placeholder'),
            fact=fact
        )

        # Process children
        for child_concept in node.get('children', []):
            child_item = self._build_canonical_item(
                child_concept,
                nodes,
                fact_map,
                processed,
                include_missing,
                depth + 1,
                concept
            )
            if child_item:
                item.children.append(child_item)

        # A total line with no reported value can be derived from its children
        if item.is_total and item.value is None and item.children:
            calculated_value = self._calculate_total(item.children)
            if calculated_value is not None:
                item.value = calculated_value
                item.source = 'calculated'

        return item

    def _calculate_total(self, children: List[StatementItem]) -> Optional[float]:
        """Sum the values of non-abstract children; None if none have values."""
        total = 0
        has_values = False

        for child in children:
            if not child.is_abstract and child.value is not None:
                total += child.value
                has_values = True

        return total if has_values else None

    def _find_unmatched_facts(self, fact_map: Dict[str, FinancialFact],
                              virtual_tree: Dict[str, Any]) -> Dict[str, FinancialFact]:
        """Return the subset of fact_map whose concepts are not in the canonical tree."""
        canonical_concepts = set(virtual_tree.get('nodes', {}).keys())
        unmatched = {}

        for concept, fact in fact_map.items():
            if concept not in canonical_concepts:
                unmatched[concept] = fact

        return unmatched

    def _create_items_from_facts(self, facts: Dict[str, FinancialFact]) -> List[StatementItem]:
        """Create flat statement items from facts that matched no canonical concept."""
        items = []

        for concept, fact in facts.items():
            item = StatementItem(
                concept=concept,
                label=fact.label,
                value=fact.numeric_value,
                depth=1,  # Default depth
                parent_concept=None,
                is_abstract=fact.is_abstract,
                is_total=fact.is_total,
                section=fact.section,
                confidence=0.7,  # Lower confidence for unmatched
                source='fact',
                fact=fact
            )
            items.append(item)

        return items

    def _build_from_facts(self, fact_map: Dict[str, FinancialFact]) -> List[StatementItem]:
        """Build statement directly from facts without canonical structure."""
        # Group facts by their declared parent concept
        hierarchy = defaultdict(list)
        roots = []

        for concept, fact in fact_map.items():
            if fact.parent_concept:
                hierarchy[fact.parent_concept].append(concept)
            else:
                roots.append(concept)

        # Build items recursively from the roots
        items = []
        for root_concept in roots:
            item = self._build_fact_item(root_concept, fact_map, hierarchy)
            if item:
                items.append(item)

        # Add orphaned facts: facts whose declared parent concept is NOT in
        # fact_map are reachable from no root and would otherwise be dropped
        # silently. Promote them (with their subtrees) to the top level.
        # BUGFIX: the previous condition (`concept not in roots and not
        # fact.parent_concept`) could never be true, because every
        # parent-less concept is appended to `roots` above.
        for concept, fact in fact_map.items():
            if fact.parent_concept and fact.parent_concept not in fact_map:
                item = self._build_fact_item(concept, fact_map, hierarchy)
                if item:
                    items.append(item)

        return items

    def _build_fact_item(self, concept: str,
                         fact_map: Dict[str, FinancialFact],
                         hierarchy: Dict[str, List[str]],
                         depth: int = 0) -> Optional[StatementItem]:
        """Build an item from a fact, recursing into its children via `hierarchy`."""
        if concept not in fact_map:
            return None

        fact = fact_map[concept]

        item = StatementItem(
            concept=concept,
            label=fact.label,
            value=fact.numeric_value,
            depth=depth,
            parent_concept=fact.parent_concept,
            is_abstract=fact.is_abstract,
            is_total=fact.is_total,
            section=fact.section,
            confidence=1.0,
            source='fact',
            fact=fact
        )

        # Add children
        for child_concept in hierarchy.get(concept, []):
            child_item = self._build_fact_item(child_concept, fact_map, hierarchy, depth + 1)
            if child_item:
                item.children.append(child_item)

        return item

    def _calculate_coverage(self, fact_map: Dict[str, FinancialFact],
                            statement_type: str) -> float:
        """Return the fraction of canonical concepts covered by reported facts
        (0.0 when no canonical tree or no concepts exist for the type)."""
        if statement_type not in self.virtual_trees:
            return 0.0

        canonical_concepts = set(self.virtual_trees[statement_type].get('nodes', {}).keys())
        if not canonical_concepts:
            return 0.0

        matched = len(set(fact_map.keys()) & canonical_concepts)
        return matched / len(canonical_concepts)
|
||||
216
venv/lib/python3.10/site-packages/edgar/entity/submissions.py
Normal file
216
venv/lib/python3.10/site-packages/edgar/entity/submissions.py
Normal file
@@ -0,0 +1,216 @@
|
||||
"""
|
||||
Functions for retrieving entity submission data from the SEC.
|
||||
"""
|
||||
import json
|
||||
from typing import Any, Dict, Optional
|
||||
|
||||
import httpx
|
||||
|
||||
from edgar.core import log
|
||||
from edgar.entity.data import parse_entity_submissions
|
||||
from edgar.httprequests import download_json
|
||||
from edgar.storage import get_edgar_data_directory, is_using_local_storage
|
||||
|
||||
__all__ = [
|
||||
'get_entity_submissions',
|
||||
'download_entity_submissions_from_sec',
|
||||
'load_company_submissions_from_local',
|
||||
'create_entity_from_submissions_json',
|
||||
'create_entity_from_file',
|
||||
'create_company_from_file'
|
||||
]
|
||||
|
||||
|
||||
def _download_and_cache_submissions(cik: int, submissions_file) -> Optional[Dict[str, Any]]:
    """Download submissions for *cik* from the SEC; on success, cache to *submissions_file*."""
    submissions_json = download_entity_submissions_from_sec(cik)
    if submissions_json:
        with open(submissions_file, "w", encoding='utf-8') as f:
            json.dump(submissions_json, f)
    return submissions_json


def load_company_submissions_from_local(cik: int) -> Optional[Dict[str, Any]]:
    """
    Load company submissions from local data.

    Returns None when local storage has no submissions directory. On a cache
    miss the data is downloaded and cached. If the cached file is corrupted
    or empty, it will be re-downloaded automatically.
    """
    submissions_dir = get_edgar_data_directory() / "submissions"
    if not submissions_dir.exists():
        return None
    submissions_file = submissions_dir / f"CIK{cik:010}.json"

    # Cache miss: download and populate the cache
    if not submissions_file.exists():
        return _download_and_cache_submissions(cik, submissions_file)

    # Cache hit: try to parse it
    try:
        return json.loads(submissions_file.read_text())
    except (json.JSONDecodeError, UnicodeDecodeError) as e:
        # File is corrupted, log warning and re-download
        log.warning(f"Corrupted submissions cache file for CIK {cik}: {e}. Re-downloading...")
        try:
            submissions_json = _download_and_cache_submissions(cik, submissions_file)
            if submissions_json:
                return submissions_json
            # Download failed: remove the corrupted file so it can be retried later
            submissions_file.unlink(missing_ok=True)
            return None
        except Exception as download_error:
            log.error(f"Failed to re-download submissions for CIK {cik}: {download_error}")
            # Remove the corrupted file so it can be retried later
            submissions_file.unlink(missing_ok=True)
            return None
|
||||
|
||||
|
||||
def download_entity_submissions_from_sec(cik: int) -> Optional[Dict[str, Any]]:
    """
    Fetch the submissions JSON for a company from the SEC data endpoint.

    Note: This function no longer uses @lru_cache (removed in Issue #471 fix)
    so that HttpxThrottleCache controls freshness; the HTTP cache keeps a
    30-second TTL for submissions, balancing freshness and performance.

    Args:
        cik: The company CIK

    Returns:
        Optional[Dict[str, Any]]: The entity submissions JSON data, or None if not found
    """
    try:
        return download_json(f"https://data.sec.gov/submissions/CIK{cik:010}.json")
    except httpx.HTTPStatusError as e:
        # A 404 means the CIK does not exist on EDGAR; anything else is unexpected.
        if e.response.status_code == 404:
            return None
        raise
|
||||
|
||||
|
||||
def get_entity_submissions(cik: int) -> Optional[Any]:
    """
    Get the entity data from the SEC submissions endpoint.

    Note: This function no longer uses @lru_cache (removed in Issue #471 fix)
    to allow HttpxThrottleCache to control freshness with a 30-second TTL.

    Args:
        cik: The company CIK

    Returns:
        Optional[EntityData]: The entity data, or None if not found
    """
    # EDGAR_USE_LOCAL_DATA: prefer the on-disk cache, falling back to the SEC endpoint
    if is_using_local_storage():
        submissions_json = load_company_submissions_from_local(cik) or download_entity_submissions_from_sec(cik)
    else:
        submissions_json = download_entity_submissions_from_sec(cik)

    if not submissions_json:
        return None
    return parse_entity_submissions(submissions_json)
|
||||
|
||||
|
||||
def create_entity_from_submissions_json(
    submissions_json: Dict[str, Any],
    entity_type: str = 'auto'
) -> Any:
    """
    Create an Entity object from a submissions JSON dictionary.

    Particularly useful for testing: Entity objects can be built from local
    JSON files or mock data without making any API calls.

    Args:
        submissions_json: The submissions JSON dictionary (either from a file or API)
        entity_type: The type of entity to create ('company', 'fund', or 'auto' to detect)

    Returns:
        An Entity, Company, or Fund object, depending on the entity_type parameter.
        If entity_type is 'auto', it tries to detect the entity type from the data.
    """
    # Import locally to avoid circular imports
    from edgar.entity.core import Company, Entity
    from edgar.entity.data import parse_entity_submissions
    from edgar.funds import FundCompany

    # Parse the submissions JSON into entity data first
    entity_data = parse_entity_submissions(submissions_json)

    if entity_type == 'auto':
        # Tickers or listed exchanges suggest a public company; otherwise
        # fall back to a generic entity. More detection logic could be added.
        has_exchanges = hasattr(entity_data, 'exchanges') and entity_data.exchanges
        entity_type = 'company' if (entity_data.tickers or has_exchanges) else 'entity'

    # Instantiate the requested entity flavor
    kind = entity_type.lower()
    if kind == 'company':
        entity = Company(entity_data.cik)
    elif kind == 'fund':
        entity = FundCompany(entity_data.cik)
    else:
        entity = Entity(entity_data.cik)

    # Inject the parsed data directly so no API calls are made,
    # and mark all filings as loaded to prevent fetching more.
    entity._data = entity_data
    entity._data._not_found = False
    entity._data._loaded_all_filings = True

    return entity
|
||||
|
||||
|
||||
def create_entity_from_file(
    file_path: str,
    entity_type: str = 'auto'
) -> Any:
    """
    Create an Entity object from a local submissions JSON file.

    Convenience wrapper: loads a JSON file and creates an Entity object
    from it, without making any API calls.

    Args:
        file_path: Path to a submissions JSON file
        entity_type: The type of entity to create ('company', 'fund', or 'auto' to detect)

    Returns:
        An Entity, Company, or Fund object, depending on the entity_type
        parameter, or None if the file cannot be read or parsed.
    """
    # NOTE: `json` is already imported at module level; the previous
    # function-local `import json` was redundant and has been removed.
    from pathlib import Path

    try:
        with open(Path(file_path).expanduser(), 'r') as f:
            submissions_json = json.load(f)
    except (FileNotFoundError, json.JSONDecodeError) as e:
        log.error(f"Error loading submissions JSON file: {e}")
        return None

    # Create the entity from the loaded JSON
    return create_entity_from_submissions_json(submissions_json, entity_type)
|
||||
|
||||
|
||||
def create_company_from_file(file_path: str) -> Any:
    """
    Create a Company object from a local submissions JSON file.

    Convenience wrapper over create_entity_from_file for the most common
    use case: companies.

    Args:
        file_path: Path to a submissions JSON file

    Returns:
        A Company object
    """
    company = create_entity_from_file(file_path, entity_type='company')
    return company
|
||||
@@ -0,0 +1,138 @@
|
||||
"""
|
||||
Terminal-friendly color schemes for financial statement display.
|
||||
Provides better contrast and readability in various terminal environments.
|
||||
"""
|
||||
|
||||
from typing import Dict
|
||||
|
||||
# Each scheme maps a display role to a rich style string. Roles:
#   abstract_item / total_item / regular_item / low_confidence_item - line-item text
#   positive_value / negative_value / total_value_prefix / empty_value - value cells
#   separator / company_name / statement_type / panel_border - chrome around the table

# Default scheme - the current implementation
DEFAULT_SCHEME = {
    "abstract_item": "bold cyan",
    "total_item": "bold",
    "regular_item": "",
    "low_confidence_item": "dim",
    "positive_value": "green",
    "negative_value": "red",
    "total_value_prefix": "bold yellow",
    "separator": "dim",
    "company_name": "bold green",
    "statement_type": "bold blue",
    "panel_border": "blue",
    "empty_value": "dim",
}

# High contrast scheme - better for terminals with poor dim text support
HIGH_CONTRAST_SCHEME = {
    "abstract_item": "bold bright_cyan",
    "total_item": "bold bright_white",
    "regular_item": "white",
    "low_confidence_item": "bright_black",  # Usually renders as gray
    "positive_value": "bright_green",
    "negative_value": "bright_red",
    "total_value_prefix": "bold bright_yellow",
    "separator": "bright_black",
    "company_name": "bold bright_green",
    "statement_type": "bold bright_blue",
    "panel_border": "bright_blue",
    "empty_value": "bright_black",
}

# Professional scheme - emphasizes important items without dim text
PROFESSIONAL_SCHEME = {
    "abstract_item": "bold blue",
    "total_item": "bold bright_white",
    "regular_item": "",
    "low_confidence_item": "italic",  # Use italic instead of dim
    "positive_value": "green",
    "negative_value": "red",
    "total_value_prefix": "bold",
    "separator": "blue",
    "company_name": "bold bright_white",
    "statement_type": "bold blue",
    "panel_border": "white",
    "empty_value": "bright_black",
}

# Minimal scheme - focuses on structure over color
MINIMAL_SCHEME = {
    "abstract_item": "bold",
    "total_item": "bold bright_white",
    "regular_item": "",
    "low_confidence_item": "italic",
    "positive_value": "",
    "negative_value": "red",  # Keep red for negative values
    "total_value_prefix": "bold",
    "separator": "white",
    "company_name": "bold",
    "statement_type": "bold",
    "panel_border": "white",
    "empty_value": "bright_black",
}

# Color-blind friendly scheme
ACCESSIBLE_SCHEME = {
    "abstract_item": "bold blue",
    "total_item": "bold bright_white underline",  # Use underline for emphasis
    "regular_item": "",
    "low_confidence_item": "italic",
    "positive_value": "blue",  # Avoid green/red
    "negative_value": "magenta",  # Avoid green/red
    "total_value_prefix": "bold underline",
    "separator": "white",
    "company_name": "bold bright_white",
    "statement_type": "bold blue",
    "panel_border": "white",
    "empty_value": "bright_black",
}

# SEC filing style - mimics actual printed filings
FILING_SCHEME = {
    "abstract_item": "bold",  # Major sections (ASSETS, LIABILITIES) - just bold
    "total_item": "bold",  # Subtotals - bold only
    "regular_item": "",  # Regular items - no styling
    "low_confidence_item": "dim",  # Low confidence items - dimmed
    "positive_value": "",  # Positive values - no color (like printed filings)
    "negative_value": "",  # Negative values - no color (parentheses show negative)
    "total_value_prefix": "bold",  # Total values - bold only
    "separator": "dim",  # Table separators - dimmed
    "company_name": "bold",  # Company name - just bold
    "statement_type": "bold",  # Statement title - just bold
    "panel_border": "white",  # Panel borders - white
    "empty_value": "dim",  # Empty values - dimmed
}

# Available schemes, keyed by the names accepted by get_color_scheme()
SCHEMES: Dict[str, Dict[str, str]] = {
    "default": DEFAULT_SCHEME,
    "high_contrast": HIGH_CONTRAST_SCHEME,
    "professional": PROFESSIONAL_SCHEME,
    "minimal": MINIMAL_SCHEME,
    "accessible": ACCESSIBLE_SCHEME,
    "filing": FILING_SCHEME,
}
|
||||
|
||||
def get_color_scheme(scheme_name: str = "professional") -> Dict[str, str]:
    """
    Get a color scheme by name.

    Args:
        scheme_name: Name of the scheme (default, high_contrast, professional, minimal, accessible, filing)

    Returns:
        Dictionary of style mappings; unknown names fall back to the
        professional scheme.
    """
    try:
        return SCHEMES[scheme_name]
    except KeyError:
        return PROFESSIONAL_SCHEME
|
||||
|
||||
# Environment variable support
|
||||
import os
|
||||
|
||||
|
||||
def get_current_scheme() -> Dict[str, str]:
    """
    Get the current color scheme based on environment variable or default.

    Environment variable: EDGAR_FINANCIALS_COLOR_SCHEME
    Values: default, high_contrast, professional, minimal, accessible, filing
    """
    chosen = os.environ.get("EDGAR_FINANCIALS_COLOR_SCHEME", "professional")
    return get_color_scheme(chosen)
|
||||
56
venv/lib/python3.10/site-packages/edgar/entity/tickers.py
Normal file
56
venv/lib/python3.10/site-packages/edgar/entity/tickers.py
Normal file
@@ -0,0 +1,56 @@
|
||||
"""
|
||||
Ticker-related functionality for the entity package.
|
||||
This module re-exports ticker-related functions from edgar.reference.tickers.
|
||||
"""
|
||||
|
||||
# We need to create our own implementation of these functions
|
||||
from functools import lru_cache
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from edgar.httprequests import download_text
|
||||
from edgar.reference.tickers import find_cik, find_ticker, get_company_tickers, get_icon_from_ticker
|
||||
|
||||
|
||||
@lru_cache(maxsize=1)
def get_ticker_to_cik_lookup():
    """
    Create a dictionary that maps from ticker symbol to CIK.

    Cached: the company-ticker table is fetched at most once per process.
    """
    df = get_company_tickers()
    # Zipping the two columns directly avoids DataFrame.iterrows, which
    # builds a Series per row (slow) and can upcast dtypes. For duplicate
    # tickers the last row wins, matching the previous loop's behavior.
    return dict(zip(df['ticker'], df['cik']))
|
||||
|
||||
|
||||
def _parse_cik_lookup_data(content):
|
||||
"""Parse CIK lookup data from content."""
|
||||
return [
|
||||
{
|
||||
# for companies with : in the name
|
||||
'name': ":".join(line.split(':')[:-2]),
|
||||
'cik': int(line.split(':')[-2])
|
||||
} for line in content.split("\n") if line != '']
|
||||
|
||||
|
||||
@lru_cache(maxsize=1)
def get_cik_lookup_data() -> pd.DataFrame:
    """
    Get a dataframe of company/entity names and their cik
    or a Dict of int(cik) to str(name)
    DECADE CAPITAL MANAGEMENT LLC:0001426822:
    DECADE COMPANIES INCOME PROPERTIES:0000775840:
    """
    raw_text = download_text("https://www.sec.gov/Archives/edgar/cik-lookup-data.txt")
    return pd.DataFrame(_parse_cik_lookup_data(raw_text))
|
||||
|
||||
__all__ = [
|
||||
'get_icon_from_ticker',
|
||||
'get_company_tickers',
|
||||
'get_ticker_to_cik_lookup',
|
||||
'get_cik_lookup_data',
|
||||
'find_cik',
|
||||
'find_ticker'
|
||||
]
|
||||
17
venv/lib/python3.10/site-packages/edgar/entity/tools.py
Normal file
17
venv/lib/python3.10/site-packages/edgar/entity/tools.py
Normal file
@@ -0,0 +1,17 @@
|
||||
from edgar import Company
|
||||
|
||||
|
||||
def income_statement(ticker: str, annual: bool = True, periods: int = 4):
    """Return the income statement for the company behind *ticker*
    (None when the company lookup yields nothing)."""
    company = Company(ticker)
    if not company:
        return None
    return company.income_statement(annual=annual, periods=periods)
|
||||
|
||||
def balance_sheet(ticker: str, annual: bool = True, periods: int = 4):
    """Return the balance sheet for the company behind *ticker*
    (None when the company lookup yields nothing)."""
    company = Company(ticker)
    if not company:
        return None
    return company.balance_sheet(annual=annual, periods=periods)
|
||||
|
||||
def cash_flow_statement(ticker: str, annual: bool = True, periods: int = 4):
    """Return the cash flow statement for the company behind *ticker*
    (None when the company lookup yields nothing)."""
    company = Company(ticker)
    if not company:
        return None
    return company.cash_flow_statement(annual=annual, periods=periods)
|
||||
419
venv/lib/python3.10/site-packages/edgar/entity/unit_handling.py
Normal file
419
venv/lib/python3.10/site-packages/edgar/entity/unit_handling.py
Normal file
@@ -0,0 +1,419 @@
|
||||
"""
|
||||
Unit handling and normalization for financial facts.
|
||||
|
||||
This module provides comprehensive unit normalization and conversion capabilities
|
||||
to address unit inconsistencies across different companies' SEC filings.
|
||||
|
||||
Key features:
|
||||
- Currency unit normalization (USD, EUR, GBP, etc.)
|
||||
- Share-based unit standardization
|
||||
- Scale-aware unit matching
|
||||
- Unit compatibility checking
|
||||
- Error reporting with unit mismatch details
|
||||
|
||||
Usage:
|
||||
from edgar.entity.unit_handling import UnitNormalizer, UnitResult
|
||||
|
||||
# Normalize a unit
|
||||
normalized = UnitNormalizer.normalize_unit("US DOLLAR") # Returns "USD"
|
||||
|
||||
# Check unit compatibility
|
||||
compatible = UnitNormalizer.are_compatible("USD", "DOLLARS") # Returns True
|
||||
|
||||
# Get unit with error details
|
||||
result = UnitNormalizer.get_normalized_value(fact, target_unit="USD")
|
||||
"""
|
||||
|
||||
import logging
|
||||
from dataclasses import dataclass
|
||||
from enum import Enum
|
||||
from typing import Dict, List, Optional
|
||||
|
||||
from edgar.entity.models import FinancialFact
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class UnitType(Enum):
    """Types of financial units.

    Classifies normalized unit strings so that only compatible units are
    compared or aggregated.
    """
    CURRENCY = "currency"   # monetary units (USD, EUR, ...) incl. per-share amounts
    SHARES = "shares"       # share/equity counts
    RATIO = "ratio"         # dimensionless values (pure numbers, percentages)
    BUSINESS = "business"   # operational counts (customers, stores, segments, ...)
    TIME = "time"           # durations (years, months, days)
    AREA = "area"           # floor space (sqft, sqm)
    OTHER = "other"         # anything unrecognized
|
||||
|
||||
|
||||
@dataclass
class UnitResult:
    """Result of unit normalization with error details."""
    value: Optional[float]                    # normalized numeric value, if successful
    normalized_unit: Optional[str]            # canonical unit string (e.g. "USD")
    original_unit: str                        # unit exactly as it appeared in the filing
    success: bool                             # whether normalization succeeded
    error_reason: Optional[str] = None        # human-readable failure explanation
    scale_applied: Optional[int] = None       # power-of-ten scale applied, if any
    unit_type: Optional[UnitType] = None      # classification of the normalized unit
    suggestions: Optional[List[str]] = None   # candidate units on mismatch; [] after init

    def __post_init__(self):
        # dataclasses cannot take a mutable default ([]), so None is the
        # sentinel and is replaced with a fresh list here.
        if self.suggestions is None:
            self.suggestions = []
|
||||
|
||||
|
||||
class UnitNormalizer:
|
||||
"""Comprehensive unit normalization for financial facts."""
|
||||
|
||||
# Currency unit mappings
|
||||
CURRENCY_MAPPINGS = {
|
||||
'USD': ['USD', 'US DOLLAR', 'DOLLARS', 'usd', 'US$', 'DOLLAR'],
|
||||
'EUR': ['EUR', 'EURO', 'EUROS', 'eur', '€', 'EUROPEAN UNION EURO'],
|
||||
'GBP': ['GBP', 'POUND', 'POUNDS', 'gbp', '£', 'BRITISH POUND', 'POUND STERLING'],
|
||||
'JPY': ['JPY', 'YEN', 'yen', 'jpy', '¥', 'JAPANESE YEN'],
|
||||
'CAD': ['CAD', 'CANADIAN DOLLAR', 'CANADIAN DOLLARS', 'cad'],
|
||||
'CHF': ['CHF', 'SWISS FRANC', 'SWISS FRANCS', 'chf'],
|
||||
'AUD': ['AUD', 'AUSTRALIAN DOLLAR', 'AUSTRALIAN DOLLARS', 'aud'],
|
||||
'CNY': ['CNY', 'YUAN', 'CHINESE YUAN', 'cny', '¥'],
|
||||
}
|
||||
|
||||
# Share unit mappings
|
||||
SHARE_MAPPINGS = {
|
||||
'shares': ['shares', 'share', 'SHARES', 'SHARE', 'STOCK', 'EQUITY'],
|
||||
'shares_unit': ['shares_unit', 'share_unit', 'SHARES_UNIT'],
|
||||
'partnership_unit': ['USD/PartnershipUnit', 'PartnershipUnit', 'partnership_unit']
|
||||
}
|
||||
|
||||
# Ratio/dimensionless unit mappings
|
||||
RATIO_MAPPINGS = {
|
||||
'pure': ['pure', 'number', 'ratio', 'percent', '%', 'PURE', 'NUMBER'],
|
||||
'basis_points': ['bp', 'bps', 'basis_points', 'BASIS_POINTS']
|
||||
}
|
||||
|
||||
# Per-share combinations
|
||||
PER_SHARE_MAPPINGS = {
|
||||
'USD_per_share': ['USD/shares', 'USD per share', 'USD/share', 'usd/shares'],
|
||||
'USD_per_share_unit': ['USD/shares_unit', 'USD per share unit', 'USD/share_unit']
|
||||
}
|
||||
|
||||
# Business/operational unit mappings
|
||||
BUSINESS_MAPPINGS = {
|
||||
'customer': ['Customer', 'customer', 'CUSTOMER'],
|
||||
'store': ['Store', 'store', 'STORE'],
|
||||
'entity': ['Entity', 'entity', 'ENTITY'],
|
||||
'segment': ['Segment', 'segment', 'SEGMENT', 'reportable_segment'],
|
||||
'instrument': ['instrument', 'INSTRUMENT', 'financial_instrument'],
|
||||
'contract': ['USD/Contract', 'contract', 'CONTRACT'],
|
||||
'investment': ['USD/Investment', 'investment', 'INVESTMENT']
|
||||
}
|
||||
|
||||
# Time-based unit mappings
|
||||
TIME_MAPPINGS = {
|
||||
'years': ['Year', 'years', 'YEAR', 'YEARS'],
|
||||
'months': ['Month', 'months', 'MONTH', 'MONTHS'],
|
||||
'days': ['Day', 'days', 'DAY', 'DAYS']
|
||||
}
|
||||
|
||||
# Area unit mappings
|
||||
AREA_MAPPINGS = {
|
||||
'sqft': ['sqft', 'square_feet', 'SQFT', 'sq_ft'],
|
||||
'sqm': ['sqm', 'square_meters', 'SQMETER', 'sq_m']
|
||||
}
|
||||
|
||||
# Comprehensive mapping combining all categories
|
||||
ALL_MAPPINGS = {
|
||||
**CURRENCY_MAPPINGS,
|
||||
**SHARE_MAPPINGS,
|
||||
**RATIO_MAPPINGS,
|
||||
**PER_SHARE_MAPPINGS,
|
||||
**BUSINESS_MAPPINGS,
|
||||
**TIME_MAPPINGS,
|
||||
**AREA_MAPPINGS
|
||||
}
|
||||
|
||||
# Reverse mapping for faster lookups
|
||||
_REVERSE_MAPPING = None
|
||||
|
||||
@classmethod
def _build_reverse_mapping(cls) -> Dict[str, str]:
    """Return the variant -> canonical-unit lookup table, building it lazily.

    Every variant is upper-cased so that lookups (see normalize_unit) are
    case-insensitive.  The finished table is cached on the class in
    _REVERSE_MAPPING, so construction happens at most once.
    """
    if cls._REVERSE_MAPPING is None:
        cls._REVERSE_MAPPING = {
            variant.upper(): canonical
            for canonical, variants in cls.ALL_MAPPINGS.items()
            for variant in variants
        }
    return cls._REVERSE_MAPPING
|
||||
|
||||
@classmethod
def normalize_unit(cls, unit: str) -> str:
    """
    Normalize a unit string to its canonical form.

    Args:
        unit: Raw unit string from SEC filing

    Returns:
        Normalized unit string; an unrecognized unit is returned unchanged,
        and a falsy input yields the empty string.

    Example:
        >>> UnitNormalizer.normalize_unit("US DOLLAR")
        'USD'
        >>> UnitNormalizer.normalize_unit("shares_unit")
        'shares_unit'
    """
    if not unit:
        return ""

    # Case-insensitive lookup in the cached reverse table; fall back to
    # the raw unit when no canonical form is known.
    return cls._build_reverse_mapping().get(unit.upper()) or unit
|
||||
|
||||
@classmethod
def get_unit_type(cls, unit: str) -> UnitType:
    """
    Determine the type of a unit.

    Args:
        unit: Unit string (normalized or raw)

    Returns:
        UnitType enum value; UnitType.OTHER when the unit is unrecognized.
    """
    canonical = cls.normalize_unit(unit)

    # Per-share units (amount per share) are treated as currency-derived,
    # same as plain currencies.
    if canonical in cls.CURRENCY_MAPPINGS or canonical in cls.PER_SHARE_MAPPINGS:
        return UnitType.CURRENCY

    # Remaining categories: first table containing the unit wins.
    category_table = (
        (cls.SHARE_MAPPINGS, UnitType.SHARES),
        (cls.RATIO_MAPPINGS, UnitType.RATIO),
        (cls.BUSINESS_MAPPINGS, UnitType.BUSINESS),
        (cls.TIME_MAPPINGS, UnitType.TIME),
        (cls.AREA_MAPPINGS, UnitType.AREA),
    )
    for mapping, category in category_table:
        if canonical in mapping:
            return category

    return UnitType.OTHER
|
||||
|
||||
@classmethod
def are_compatible(cls, unit1: str, unit2: str) -> bool:
    """
    Check if two units are compatible for calculations.

    Args:
        unit1: First unit
        unit2: Second unit

    Returns:
        True if units are compatible
    """
    first = cls.normalize_unit(unit1)
    second = cls.normalize_unit(unit2)

    # Identical canonical units are trivially compatible.
    if first == second:
        return True

    # Different unit types can never be combined.
    kind = cls.get_unit_type(first)
    if kind != cls.get_unit_type(second):
        return False

    if kind == UnitType.CURRENCY:
        # Per-share units must match exactly
        # (USD_per_share != USD_per_share_unit) ...
        if first in cls.PER_SHARE_MAPPINGS or second in cls.PER_SHARE_MAPPINGS:
            return first == second
        # ... while plain currencies could in principle be converted.
        return True

    if kind == UnitType.SHARES:
        # shares and shares_unit are interchangeable for some calculations.
        interchangeable = ('shares', 'shares_unit')
        return first in interchangeable and second in interchangeable

    # Same type but no special-case rule: treat as incompatible.
    return False
|
||||
|
||||
@classmethod
def get_normalized_value(
    cls,
    fact: FinancialFact,
    target_unit: Optional[str] = None,
    apply_scale: bool = True,
    strict_unit_match: bool = False
) -> UnitResult:
    """
    Get a normalized value from a financial fact with detailed error reporting.

    Args:
        fact: FinancialFact to normalize
        target_unit: Desired unit (if None, just normalize existing unit)
        apply_scale: Whether to apply scale factor
        strict_unit_match: If True, require exact unit match. If False, allow compatible units.

    Returns:
        UnitResult with value and metadata
    """
    # Guard: a fact without a numeric value cannot be normalized at all.
    if fact.numeric_value is None:
        return UnitResult(
            value=None,
            normalized_unit=None,
            original_unit=fact.unit,
            success=False,
            error_reason="No numeric value available"
        )

    original_unit = fact.unit or ""
    normalized_unit = cls.normalize_unit(original_unit)
    unit_type = cls.get_unit_type(normalized_unit)

    # Apply scale factor if requested.
    # NOTE: a falsy fact.scale (None or 0) is treated as "no scaling";
    # scale_applied stays None in that case so callers can tell.
    value = fact.numeric_value
    scale_applied = None
    if apply_scale and fact.scale:
        value *= fact.scale
        scale_applied = fact.scale

    # If no target unit specified, return normalized value
    if target_unit is None:
        return UnitResult(
            value=value,
            normalized_unit=normalized_unit,
            original_unit=original_unit,
            success=True,
            scale_applied=scale_applied,
            unit_type=unit_type
        )

    # Check compatibility with target unit
    target_normalized = cls.normalize_unit(target_unit)

    if normalized_unit == target_normalized:
        # Exact match
        return UnitResult(
            value=value,
            normalized_unit=target_normalized,
            original_unit=original_unit,
            success=True,
            scale_applied=scale_applied,
            unit_type=unit_type
        )

    elif not strict_unit_match and cls.are_compatible(normalized_unit, target_normalized):
        # Compatible units - could potentially convert (only if not in strict mode).
        # The value is returned UNconverted; suggestions flag the possible conversion.
        suggestions = []
        if cls.get_unit_type(normalized_unit) == UnitType.CURRENCY:
            suggestions.append(f"Consider currency conversion from {normalized_unit} to {target_normalized}")

        return UnitResult(
            value=value,
            normalized_unit=normalized_unit,  # Keep original, mark as compatible
            original_unit=original_unit,
            success=True,
            scale_applied=scale_applied,
            unit_type=unit_type,
            suggestions=suggestions
        )

    else:
        # Incompatible units: fail with actionable suggestions, no value.
        suggestions = cls._get_unit_suggestions(normalized_unit, target_normalized)

        return UnitResult(
            value=None,
            normalized_unit=normalized_unit,
            original_unit=original_unit,
            success=False,
            error_reason=f"Unit mismatch: {normalized_unit} is not compatible with {target_normalized}",
            unit_type=unit_type,
            suggestions=suggestions
        )
|
||||
|
||||
@classmethod
def _get_unit_suggestions(cls, actual_unit: str, target_unit: str) -> List[str]:
    """Generate helpful suggestions for unit mismatches."""
    actual_type = cls.get_unit_type(actual_unit)
    target_type = cls.get_unit_type(target_unit)
    tips: List[str] = []

    # Cross-type mismatch gets an explicit explanation first.
    if actual_type != target_type:
        tips.append(f"Unit type mismatch: {actual_unit} is {actual_type.value}, "
                    f"but {target_unit} is {target_type.value}")

    # Specific suggestions based on unit types
    if target_type == UnitType.CURRENCY and actual_type != UnitType.CURRENCY:
        tips.append("Consider using a financial amount concept instead of a ratio/count")
    elif target_type == UnitType.SHARES and actual_type != UnitType.SHARES:
        tips.append("Consider using a share-based concept instead of a monetary amount")

    # Alternative units in the same category
    if actual_type == target_type:
        if actual_type == UnitType.CURRENCY:
            tips.append("Use currency conversion or specify the correct currency unit")
        elif actual_type == UnitType.SHARES:
            tips.append("Try using 'shares' instead of 'shares_unit' or vice versa")

    return tips
|
||||
|
||||
|
||||
def apply_scale_factor(value: float, scale: Optional[int]) -> float:
    """
    Apply scale factor to a value.

    Args:
        value: Numeric value
        scale: Scale factor (e.g., 1000 for thousands); None, 0 and 1 all
            mean "no scaling"

    Returns:
        Scaled value
    """
    # A missing, zero, or unit scale leaves the value untouched.
    if not scale or scale == 1:
        return value
    return value * scale
|
||||
|
||||
|
||||
def format_unit_error(unit_result: UnitResult) -> str:
    """
    Format a unit error message for user display.

    Args:
        unit_result: UnitResult with error details

    Returns:
        Formatted error message ("No error" for a successful result)
    """
    if unit_result.success:
        return "No error"

    # Assemble the message from pieces and join once at the end.
    parts = [f"Unit handling error: {unit_result.error_reason}"]

    if unit_result.suggestions:
        parts.append("\n Suggestions:\n")
        parts.extend(f" - {suggestion}\n" for suggestion in unit_result.suggestions)

    parts.append(f" Original unit: '{unit_result.original_unit}'")
    # Only mention normalization when it actually changed the unit.
    if unit_result.normalized_unit != unit_result.original_unit:
        parts.append(f" Normalized to: '{unit_result.normalized_unit}'")

    return "".join(parts)
|
||||
|
||||
|
||||
# Legacy support - maintain compatibility with existing code
def normalize_unit_legacy(unit: str) -> str:
    """Legacy unit normalization for backward compatibility.

    Thin shim kept for old call sites; delegates to UnitNormalizer.
    """
    canonical = UnitNormalizer.normalize_unit(unit)
    return canonical
|
||||
|
||||
|
||||
def are_units_compatible_legacy(unit1: str, unit2: str) -> bool:
    """Legacy unit compatibility check for backward compatibility.

    Thin shim kept for old call sites; delegates to UnitNormalizer.
    """
    compatible = UnitNormalizer.are_compatible(unit1, unit2)
    return compatible
|
||||
132
venv/lib/python3.10/site-packages/edgar/entity/utils.py
Normal file
132
venv/lib/python3.10/site-packages/edgar/entity/utils.py
Normal file
@@ -0,0 +1,132 @@
|
||||
"""
|
||||
Utility functions for entity processing.
|
||||
|
||||
This module contains utility functions used throughout the entity package
|
||||
for data processing, normalization, and validation.
|
||||
"""
|
||||
from typing import TYPE_CHECKING, Union
|
||||
|
||||
if TYPE_CHECKING:
|
||||
import pyarrow
|
||||
|
||||
from edgar.entity.constants import COMPANY_FORMS
|
||||
|
||||
|
||||
def has_company_filings(filings_form_array: 'pyarrow.ChunkedArray', max_filings: int = 50) -> bool:
    """
    Efficiently check if any form in the PyArrow ChunkedArray matches company-only forms.
    Limited to checking the first max_filings entries for performance.

    Args:
        filings_form_array: PyArrow ChunkedArray containing form values
        max_filings: Maximum number of filings to check

    Returns:
        True if any checked form matches a company form, False otherwise
    """
    total_filings = filings_form_array.length()

    # Early exit for empty or all-null arrays.
    # BUG FIX: the previous check compared null_count (an int property) to
    # `filings_form_array.length` -- the unbound method object, not its
    # result -- so it was always False and the early exit never fired.
    if total_filings == 0 or filings_form_array.null_count == total_filings:
        return False

    # Budget of entries still to inspect (never more than max_filings).
    remaining = min(total_filings, max_filings)

    for chunk in filings_form_array.chunks:
        if remaining <= 0:
            break

        # Slicing keeps us within the budget; pyarrow slices are zero-copy,
        # and a slice longer than the chunk is silently truncated.
        window = chunk.slice(0, remaining)

        # One bulk to_pylist() conversion per window replaces the previous
        # per-element take([i]).to_pylist()[0] calls (O(n) pyarrow calls).
        for value in window.to_pylist():
            if value is not None and value in COMPANY_FORMS:
                return True

        remaining -= len(window)

    return False
|
||||
|
||||
|
||||
def normalize_cik(cik_or_identifier: Union[str, int]) -> int:
    """
    Normalize a CIK to an integer by removing leading zeros.

    Args:
        cik_or_identifier: CIK as string or integer

    Returns:
        Normalized CIK as integer

    Raises:
        ValueError: If the identifier cannot be converted to a valid CIK
    """
    # Integers are already normalized.
    if isinstance(cik_or_identifier, int):
        return cik_or_identifier

    # Anything other than str/int is rejected outright.
    if not isinstance(cik_or_identifier, str):
        raise ValueError(f"CIK must be string or integer, got {type(cik_or_identifier)}")

    # Drop leading zeros; an all-zero string collapses to '0'.
    digits = cik_or_identifier.lstrip('0') or '0'
    try:
        return int(digits)
    except ValueError:
        raise ValueError(f"Invalid CIK format: {cik_or_identifier}")
|
||||
|
||||
|
||||
def validate_cik(cik: int) -> bool:
    """
    Validate that a CIK is within the expected range.

    Args:
        cik: CIK to validate

    Returns:
        True if CIK is valid, False otherwise
    """
    # Non-int inputs are invalid by definition.
    if not isinstance(cik, int):
        return False
    # CIKs are typically 1-10 digits, with valid range roughly 1 to 2,000,000,000
    return 1 <= cik <= 2_000_000_000
|
||||
|
||||
|
||||
def format_cik(cik: Union[str, int], zero_pad: int = 10) -> str:
    """
    Format a CIK with zero padding for display or API calls.

    Args:
        cik: CIK to format
        zero_pad: Number of digits to pad to (default 10)

    Returns:
        Zero-padded CIK string

    Example:
        >>> format_cik(320193)
        '0000320193'
        >>> format_cik('320193', zero_pad=6)
        '320193'
    """
    # Normalize first so string inputs with leading zeros round-trip cleanly;
    # the format spec zero-pads exactly like str.zfill for these values.
    return f"{normalize_cik(cik):0{zero_pad}d}"
|
||||
Reference in New Issue
Block a user