Initial commit

This commit is contained in:
kdusek
2025-12-09 12:13:01 +01:00
commit 8e654ed209
13332 changed files with 2695056 additions and 0 deletions

View File

@@ -0,0 +1,45 @@
from edgar import Company
from collections import defaultdict

# Fetch Apple's company facts and work directly on the raw fact list.
aapl = Company("AAPL")
facts = aapl.facts
raw_facts = facts._facts

print("Analyzing period durations for FY facts:\n")

# Bucket FY income-statement revenue facts for 2019-2021 under the key
# (fiscal_year, fiscal_period, period_end).
fact_groups = defaultdict(list)
for item in raw_facts:
    if item.statement_type != 'IncomeStatement' or item.fiscal_period != 'FY':
        continue
    if not (item.fiscal_year and 2019 <= item.fiscal_year <= 2021):
        continue
    concept_name = str(item.concept)
    if 'RevenueFromContract' not in concept_name or 'Liability' in concept_name:
        continue
    fact_groups[(item.fiscal_year, item.fiscal_period, item.period_end)].append(item)

# Report every bucket that contains more than one fact, classifying each
# fact by its period length (annual vs quarterly).
for key in sorted(fact_groups.keys()):
    year, period, end_date = key
    facts_in_group = fact_groups[key]
    if len(facts_in_group) <= 1:
        continue
    print(f"\nFY {year} ending {end_date}: {len(facts_in_group)} facts")
    for item in facts_in_group:
        duration = None
        if item.period_start and item.period_end:
            duration = (item.period_end - item.period_start).days
        period_type = "Annual" if duration and duration > 300 else "Quarterly" if duration else "Unknown"
        print(f" ${item.value:,.0f} - Duration: {duration} days ({period_type})")
        print(f" Period: {item.period_start} to {item.period_end}")
        print(f" Filed: {item.filing_date}")
        if hasattr(item, 'form'):
            print(f" Form: {item.form}")
        if hasattr(item, 'accession'):
            print(f" Accession: {item.accession}")

print("\n\nSummary:")
print("The issue: Both annual and quarterly revenue are marked as 'FY'")
print("Solution: Use period duration to distinguish:")
print(" - Annual: period_start to period_end > 300 days")
print(" - Quarterly: period_start to period_end < 100 days")

View File

@@ -0,0 +1,57 @@
from edgar import Company
from collections import defaultdict

# Debug script: verify whether a fact's fiscal_year agrees with the
# calendar year of its period_end date for Apple's FY income-statement facts.
aapl = Company("AAPL")
facts = aapl.facts
raw_facts = facts._facts

# Check all FY income statement facts for 2019 onward, grouped by fiscal year.
print("Checking FY facts and their period_end dates:\n")
print("fiscal_year | fiscal_period | period_end | period_end.year | Match?")
print("-" * 70)
fy_facts = defaultdict(list)
for fact in raw_facts:
    if fact.statement_type == 'IncomeStatement' and fact.fiscal_period == 'FY':
        if fact.fiscal_year and fact.fiscal_year >= 2019:
            fy_facts[fact.fiscal_year].append(fact)

# Show all FY entries grouped by fiscal_year.
for year in sorted(fy_facts.keys(), reverse=True):
    facts_for_year = fy_facts[year]
    # Get unique period_end dates for this fiscal year.
    unique_ends = set()
    for fact in facts_for_year:
        if fact.period_end:
            unique_ends.add(fact.period_end)
    print(f"\nFY {year} has {len(unique_ends)} unique period_end dates:")
    for end_date in sorted(unique_ends):
        if end_date:
            # FIX: both branches of this conditional were '' (the check/cross
            # glyphs had been lost), so the Match? column was always blank.
            match = "✓" if end_date.year == year else "✗"
            print(f" {year:4d} | FY | {end_date} | {end_date.year} | {match}")

# Now check if we have the correct matches.
print("\n\nChecking if we have correct year matches:")
correct_matches = defaultdict(set)
for fact in raw_facts:
    if fact.statement_type == 'IncomeStatement' and fact.fiscal_period == 'FY':
        if fact.period_end and fact.fiscal_year:
            if fact.period_end.year == fact.fiscal_year:
                correct_matches[fact.fiscal_year].add(fact.period_end)

print("\nFiscal years with matching period_end.year:")
for year in sorted(correct_matches.keys(), reverse=True)[:6]:
    for end_date in correct_matches[year]:
        print(f" FY {year} -> {end_date}")

# Check revenue values for correct matches.
print("\n\nRevenue values for CORRECT year matches:")
for fact in raw_facts:
    if fact.statement_type == 'IncomeStatement' and fact.fiscal_period == 'FY':
        if fact.period_end and fact.fiscal_year:
            if fact.period_end.year == fact.fiscal_year:
                if 'RevenueFromContract' in str(fact.concept) and 'Liability' not in str(fact.concept):
                    if fact.fiscal_year >= 2019 and fact.fiscal_year <= 2024:
                        print(f" FY {fact.fiscal_year} (ends {fact.period_end}): ${fact.value:,.0f}")

View File

@@ -0,0 +1,172 @@
#!/usr/bin/env python3
"""
Check which renderer is actually being used in the MSFT table.
"""
import sys
sys.path.insert(0, '/Users/dwight/PycharmProjects/edgartools')
from edgar.documents.parser import HTMLParser
from edgar.documents.config import ParserConfig
from edgar.documents.table_nodes import TableNode
def check_renderer_usage():
    """Trace which renderer (fast text vs Rich) TableNode.text() actually
    uses for the MSFT 'Weighted average outstanding shares' table.

    Prints the default config flag, the table's own config, the text-cache
    state, and the output of each renderer so they can be compared.
    """
    print("🔍 CHECKING WHICH RENDERER IS ACTUALLY BEING USED")
    print("=" * 60)
    try:
        # Parse the fixture document with the default parser config.
        with open('/Users/dwight/PycharmProjects/edgartools/data/html/MSFT.10-K.html', 'r') as f:
            html_content = f.read()
        # Check what the default config actually has.
        config = ParserConfig()
        print(f"Default ParserConfig.fast_table_rendering: {config.fast_table_rendering}")
        parser = HTMLParser(config)
        document = parser.parse(html_content)

        # Depth-first search for the table containing the target phrase.
        target_table = None

        def find_target(node):
            nonlocal target_table
            if isinstance(node, TableNode):
                try:
                    if "Weighted average outstanding shares" in node.text():
                        target_table = node
                        return
                except Exception:  # FIX: was a bare except
                    pass
            if hasattr(node, 'children'):
                for child in node.children:
                    find_target(child)

        find_target(document.root)
        if not target_table:
            print("❌ Target table not found")
            return
        print(f"✅ Found target table")
        # FIX: the ✓/✗ markers in the diagnostics below had been lost (both
        # conditional branches were ''), making the output meaningless.
        print(f"Table has _config: {'✓' if hasattr(target_table, '_config') else '✗'}")
        if hasattr(target_table, '_config'):
            print(f"Table config fast_table_rendering: {target_table._config.fast_table_rendering}")

        # Replay the decision logic inside TableNode.text().
        print(f"\n🔍 TRACING TableNode.text() DECISION LOGIC:")
        # Check if cache exists.
        has_cache = hasattr(target_table, '_text_cache') and target_table._text_cache is not None
        print(f"Has cached text: {has_cache}")
        if has_cache:
            print(f"❗ Using cached result - clearing cache to test renderer...")
            target_table._text_cache = None
        # Check the config decision.
        config_obj = getattr(target_table, '_config', None)
        should_use_fast = config_obj and getattr(config_obj, 'fast_table_rendering', False)
        print(f"Config object exists: {'✓' if config_obj else '✗'}")
        print(f"Should use fast rendering: {'✓' if should_use_fast else '✗'}")

        # Exercise both renderers directly for comparison.
        print(f"\n🧪 TESTING BOTH RENDERERS DIRECTLY:")
        # Rich renderer.
        try:
            print("Rich renderer test:")
            rich_table = target_table.render(width=195)
            from edgar.richtools import rich_to_text
            rich_text = rich_to_text(rich_table)
            rich_has_pipes = '|' in rich_text
            print(f" Rich output has pipes: {'✓' if rich_has_pipes else '✗'}")
            print(f" Rich output length: {len(rich_text)} chars")
            print(f" Rich preview: {rich_text[:80]}...")
        except Exception as e:
            print(f" Rich renderer error: {e}")
        # Fast renderer.
        try:
            print("Fast renderer test:")
            fast_text = target_table._fast_text_rendering()
            fast_has_pipes = '|' in fast_text
            print(f" Fast output has pipes: {'✓' if fast_has_pipes else '✗'}")
            print(f" Fast output length: {len(fast_text)} chars")
            print(f" Fast preview: {fast_text[:80]}...")
        except Exception as e:
            print(f" Fast renderer error: {e}")

        # What does the public text() method currently return?
        print("Current text() method:")
        current_text = target_table.text()
        current_has_pipes = '|' in current_text
        print(f" Current output has pipes: {'✓' if current_has_pipes else '✗'}")
        print(f" Current output length: {len(current_text)} chars")
        print(f" Current preview: {current_text[:80]}...")

        # Heuristic: fast output is short and pipe-delimited; Rich is long.
        if current_has_pipes and len(current_text) < 2000:
            print(f"\n🎯 CONCLUSION: Currently using FAST RENDERER ✅")
        elif not current_has_pipes and len(current_text) > 1500:
            print(f"\n🎯 CONCLUSION: Currently using RICH RENDERER ❌")
        else:
            print(f"\n🤔 CONCLUSION: Unclear which renderer is being used")
    except Exception as e:
        print(f"❌ Error: {e}")
        import traceback
        traceback.print_exc()
def test_explicit_configurations():
    """Test with explicit fast and rich configurations.

    Parses the MSFT fixture twice — once with fast_table_rendering=True and
    once with False — and reports the shape of the text output for each.
    """
    print(f"\n🧪 TESTING EXPLICIT CONFIGURATIONS")
    print("=" * 60)
    configs = [
        ("Explicit Fast", ParserConfig(fast_table_rendering=True)),
        ("Explicit Rich", ParserConfig(fast_table_rendering=False)),
    ]
    try:
        with open('/Users/dwight/PycharmProjects/edgartools/data/html/MSFT.10-K.html', 'r') as f:
            html_content = f.read()
        for config_name, config in configs:
            print(f"\n🔧 {config_name} (fast_table_rendering={config.fast_table_rendering}):")
            parser = HTMLParser(config)
            document = parser.parse(html_content)
            # Locate the target table in this freshly parsed document.
            target_table = None

            def find_target(node):
                nonlocal target_table
                if isinstance(node, TableNode):
                    try:
                        if "Weighted average outstanding shares" in node.text():
                            target_table = node
                            return
                    except Exception:  # FIX: was a bare except
                        pass
                if hasattr(node, 'children'):
                    for child in node.children:
                        find_target(child)

            find_target(document.root)
            if target_table:
                table_text = target_table.text()
                has_pipes = '|' in table_text
                # FIX: restore the lost check/cross marker (the conditional
                # previously printed '' in both branches).
                print(f" Output has pipes: {'✓' if has_pipes else '✗'}")
                print(f" Output length: {len(table_text)} chars")
                print(f" Preview: {table_text[:60]}...")
            else:
                print(f" ❌ Table not found")
    except Exception as e:
        print(f"❌ Error: {e}")
# Entry point: trace the renderer decision first, then compare explicit configs.
if __name__ == "__main__":
    check_renderer_usage()
    test_explicit_configurations()

View File

@@ -0,0 +1,46 @@
from edgar import Company
from collections import defaultdict
import json

# Pull Apple's raw facts and inspect every revenue fact for FY2019/FY2020,
# then compare against what the income_statement() API returns.
aapl = Company("AAPL")
facts = aapl.facts
raw_facts = facts._facts  # Access internal facts list

# Flatten each matching fact into a plain dict so it can be JSON-dumped.
revenue_facts = [
    {
        'concept': f.concept,
        'value': f.value,
        'fy': f.fiscal_year,
        'fp': f.fiscal_period,
        'period_end': str(f.period_end) if f.period_end else None,
        'period_duration': getattr(f, 'period_duration', None),
        'statement': f.statement_type,
        'filing_date': str(f.filing_date) if f.filing_date else None,
    }
    for f in raw_facts
    if f.concept and 'Revenue' in f.concept and f.fiscal_year in [2019, 2020]
]

print("Revenue facts for 2019-2020:")
print(json.dumps(revenue_facts, indent=2, default=str))

# Bucket the flattened facts under "<fiscal year>-<fiscal period>" labels.
by_year_period = defaultdict(list)
for entry in revenue_facts:
    by_year_period[f"{entry['fy']}-{entry['fp']}"].append(entry)

print("\n\nGrouped by fiscal year and period:")
for key in sorted(by_year_period.keys()):
    print(f"\n{key}:")
    for entry in by_year_period[key]:
        print(f" {entry['concept']}: ${entry['value']:,} (duration: {entry['period_duration']} days)")

# Finally, show what the high-level income statement method returns.
print("\n\nIncome statement for 2019-2020 (annual=True):")
income = facts.income_statement(annual=True, periods=6)
print(income)

View File

@@ -0,0 +1,89 @@
from edgar import Company
from collections import defaultdict

# Survey Apple's income-statement facts for 2019-2020: which revenue
# concepts exist, which periods are tagged FY, and what the statement
# builder's period bookkeeping looks like.
aapl = Company("AAPL")
facts = aapl.facts
raw_facts = facts._facts  # Access internal facts list

# concept -> "<year>-<period>" -> list of fact summaries
income_facts = defaultdict(lambda: defaultdict(list))
for f in raw_facts:
    if f.statement_type != 'IncomeStatement' or f.fiscal_year not in [2019, 2020]:
        continue
    income_facts[f.concept][f"{f.fiscal_year}-{f.fiscal_period}"].append({
        'value': f.value,
        'period_end': f.period_end,
        'filing_date': f.filing_date,
    })

# Concepts that look like top-line revenue (excluding contract liabilities).
revenue_concepts = [c for c in income_facts if 'Revenue' in c and 'Contract' not in c]

print("Revenue concepts found:", revenue_concepts)
print("\nRevenue values by year-period:")
for concept in revenue_concepts:
    print(f"\n{concept}:")
    for period in sorted(income_facts[concept].keys()):
        for entry in income_facts[concept][period]:
            print(f" {period}: ${entry['value']:,}")

# Every (year, period, end-date) combination flagged as FY.
print("\n\nAll FY periods in Income Statement:")
fy_periods = {
    (f.fiscal_year, f.fiscal_period, f.period_end)
    for f in raw_facts
    if f.statement_type == 'IncomeStatement' and f.fiscal_period == 'FY'
}
for year, period, end_date in sorted(fy_periods):
    print(f" {year} {period} (ends {end_date})")

# Re-create the period bookkeeping the way EnhancedStatementBuilder does.
print("\n\nChecking what's selected for income statement:")
from edgar.entity.enhanced_statement import EnhancedStatementBuilder
builder = EnhancedStatementBuilder()
stmt_facts = [f for f in raw_facts if f.statement_type == 'IncomeStatement']
period_info = {}
period_facts_map = defaultdict(list)
for f in stmt_facts:
    period_key = (f.fiscal_year, f.fiscal_period)
    period_label = f"{f.fiscal_period} {f.fiscal_year}"
    period_facts_map[period_label].append(f)
    if period_key not in period_info:
        period_info[period_key] = {
            'label': period_label,
            'end_date': f.period_end,
            'is_annual': f.fiscal_period == 'FY',
            'filing_date': f.filing_date,
            'fiscal_year': f.fiscal_year,
            'fiscal_period': f.fiscal_period,
        }

# Newest-first list of annual periods.
annual_periods = [(pk, info) for pk, info in period_info.items() if info['is_annual']]
annual_periods.sort(key=lambda item: item[0][0] if item[0][0] else 0, reverse=True)
print("\nAnnual periods found (sorted newest first):")
for (year, period), info in annual_periods[:10]:
    print(f" {info['label']} - ends {info['end_date']}")

# Finally, dump the raw FY revenue facts for 2019/2020.
print("\n\nRevenue facts for FY periods:")
for f in raw_facts:
    if f.statement_type == 'IncomeStatement' and f.fiscal_period == 'FY':
        if f.fiscal_year in [2019, 2020] and 'Revenue' in str(f.concept):
            print(f" {f.fiscal_year} {f.fiscal_period}: {f.concept} = ${f.value:,}")

View File

@@ -0,0 +1,37 @@
from edgar import Company
from collections import defaultdict

# Demonstrate that one (fiscal_year, fiscal_period) key maps to MULTIPLE
# period_end dates, which the statement builder's dict keying loses.
aapl = Company("AAPL")
facts = aapl.facts
raw_facts = facts._facts

# Check how period_info is built.
stmt_facts = [f for f in raw_facts if f.statement_type == 'IncomeStatement']

# Track all unique combinations, and the set of end dates per key.
all_combos = set()
period_end_by_key = defaultdict(set)
for fact in stmt_facts:
    if fact.fiscal_period == 'FY' and fact.fiscal_year and fact.fiscal_year >= 2019:
        period_key = (fact.fiscal_year, fact.fiscal_period)
        all_combos.add((fact.fiscal_year, fact.fiscal_period, fact.period_end))
        period_end_by_key[period_key].add(fact.period_end)

print("Period keys and their different period_end dates:")
for key in sorted(period_end_by_key.keys(), reverse=True):
    year, period = key
    if year >= 2019 and year <= 2024:
        ends = period_end_by_key[key]
        print(f"\n({year}, '{period}'): {len(ends)} different period_ends")
        for end in sorted(ends):
            # FIX: both branches of this conditional were '' (the check/cross
            # glyphs had been lost), so the marker never showed anything.
            match = "✓" if end and end.year == year else "✗"
            print(f" {end} {match}")

# The problem: period_info dict only keeps ONE per key.
print("\n\nProblem: The current code builds period_info as a dict,")
print("so it only keeps ONE fact per (fiscal_year, fiscal_period) key!")
print("We lose all the other period_end variations when we do:")
print(" if period_key not in period_info:")
print(" period_info[period_key] = {...} # Only first one is kept!")

View File

@@ -0,0 +1,83 @@
from edgar import Company
from edgar.entity.enhanced_statement import EnhancedStatementBuilder
from collections import defaultdict

# Rebuild the statement-period selection by hand, keying periods on the
# full (fiscal_year, fiscal_period, period_end) triple so that distinct
# end dates for the same fiscal year are all preserved.
aapl = Company("AAPL")
facts = aapl.facts
raw_facts = facts._facts

builder = EnhancedStatementBuilder()
stmt_facts = [f for f in raw_facts if f.statement_type == 'IncomeStatement']

# Period bookkeeping keyed on the three-part key.
period_info = {}
period_facts = defaultdict(list)
for f in stmt_facts:
    key = (f.fiscal_year, f.fiscal_period, f.period_end)
    if key not in period_info:
        period_info[key] = {
            'label': f"{f.fiscal_period} {f.fiscal_year}",
            'end_date': f.period_end,
            'is_annual': f.fiscal_period == 'FY',
            'filing_date': f.filing_date,
            'fiscal_year': f.fiscal_year,
            'fiscal_period': f.fiscal_period,
        }
    period_facts[key].append(f)

# Keep only FY periods whose end-date year matches the fiscal year AND
# whose duration looks annual (> 300 days).
true_annual_periods = []
for pk, info in period_info.items():
    if not info['is_annual']:
        continue
    fiscal_year = pk[0]
    period_end_date = pk[2]
    # Fiscal year must match the calendar year of the period end.
    if not (period_end_date and period_end_date.year == fiscal_year):
        continue
    period_fact_list = period_facts.get(pk, [])
    if not period_fact_list:
        continue
    sample_fact = period_fact_list[0]
    if not (sample_fact.period_start and sample_fact.period_end):
        continue
    duration = (sample_fact.period_end - sample_fact.period_start).days
    if duration > 300:
        true_annual_periods.append((pk, info))
        # Show the revenue fact attached to this period, if any.
        for f in period_fact_list:
            if 'RevenueFromContract' in str(f.concept) and 'Liability' not in str(f.concept):
                print(f"Selected: FY {fiscal_year} ends {period_end_date}: ${f.value:,.0f} (duration: {duration} days)")
                break

print(f"\nTotal true annual periods found: {len(true_annual_periods)}")

# Per fiscal year, keep the candidate with the latest period_end.
annual_by_year = {}
for pk, info in true_annual_periods:
    fiscal_year = pk[0]
    period_end_date = pk[2]
    if fiscal_year not in annual_by_year or period_end_date > annual_by_year[fiscal_year][0][2]:
        annual_by_year[fiscal_year] = (pk, info)

sorted_periods = sorted(annual_by_year.items(), key=lambda item: item[0], reverse=True)
selected = [entry for _year, entry in sorted_periods[:6]]
print(f"\nFinal selected periods:")
for (year, period, end), info in selected:
    print(f" FY {year} ends {end}")
    # Show the revenue fact for this period.
    for f in period_facts[(year, period, end)]:
        if 'RevenueFromContract' in str(f.concept) and 'Liability' not in str(f.concept):
            duration = (f.period_end - f.period_start).days if f.period_start else None
            print(f" Revenue: ${f.value:,.0f} (duration: {duration} days)")
            break

View File

@@ -0,0 +1,33 @@
from edgar import Company

# Show Apple's annual income statement and cross-check the "Total Revenue"
# line item against the raw FY facts for 2019/2020.
aapl = Company("AAPL")
facts = aapl.facts

print("Testing with annual=True, periods=6:")
income = facts.income_statement(annual=True, periods=6)
# Get the internal data.
items = income.items

# Locate the Total Revenue item and dump its per-period values.
for item in items:
    if "Revenue" in item.label and "Total" in item.label:
        print(f"\n{item.label}:")
        print(f" Values: {item.values}")
        print(f" Periods: {income.periods}")
        for i, (period, value) in enumerate(zip(income.periods, item.values)):
            if value:
                print(f" {period}: {value}")

# Raw FY revenue facts: flag whether period_end.year matches fiscal_year.
print("\n\nChecking raw facts for FY 2019 and FY 2020:")
raw_facts = facts._facts
for fact in raw_facts:
    if fact.statement_type == 'IncomeStatement' and fact.fiscal_period == 'FY':
        if fact.fiscal_year in [2019, 2020]:
            if 'RevenueFromContract' in str(fact.concept) and 'Liability' not in str(fact.concept):
                # FIX: both branches of this conditional were '' (check/cross
                # glyphs lost), so the match marker was always invisible.
                match = "✓" if fact.period_end and fact.period_end.year == fact.fiscal_year else "✗"
                print(f" FY {fact.fiscal_year} ends {fact.period_end}: ${fact.value:,.0f} {match}")

View File

@@ -0,0 +1,71 @@
from edgar import Company
from edgar.entity.enhanced_statement import EnhancedStatementBuilder
from collections import defaultdict

# Walk through the builder's period selection: group income-statement facts
# by (fiscal_year, fiscal_period), sort annual periods by end date, and
# deduplicate so each fiscal year appears once.
aapl = Company("AAPL")
facts = aapl.facts
raw_facts = facts._facts

builder = EnhancedStatementBuilder()
stmt_facts = [f for f in raw_facts if f.statement_type == 'IncomeStatement']

# Period bookkeeping: first fact seen for each key wins.
period_info = {}
period_facts_map = defaultdict(list)
for f in stmt_facts:
    key = (f.fiscal_year, f.fiscal_period)
    label = f"{f.fiscal_period} {f.fiscal_year}"
    period_facts_map[label].append(f)
    if key not in period_info:
        period_info[key] = {
            'label': label,
            'end_date': f.period_end,
            'is_annual': f.fiscal_period == 'FY',
            'filing_date': f.filing_date,
            'fiscal_year': f.fiscal_year,
            'fiscal_period': f.fiscal_period,
        }

# Only annual (FY) periods, newest end-date first.
annual_periods = [(pk, info) for pk, info in period_info.items() if info['is_annual']]
print(f"Total annual periods before sort: {len(annual_periods)}")
annual_periods.sort(key=lambda entry: entry[1]['end_date'], reverse=True)
print("\nFirst 10 annual periods after sorting by end_date:")
for i, ((year, period), info) in enumerate(annual_periods[:10]):
    print(f" {i}: FY {year} - ends {info['end_date']}")

# First occurrence per fiscal year wins (list is already end-date sorted).
seen_years = set()
unique_annual_periods = []
for pk, info in annual_periods:
    fiscal_year = pk[0]
    if fiscal_year in seen_years:
        continue
    seen_years.add(fiscal_year)
    unique_annual_periods.append((pk, info))
    print(f" Keeping: FY {fiscal_year} ending {info['end_date']}")

print(f"\nUnique annual periods: {len(unique_annual_periods)}")
print("\nFirst 6 unique periods:")
for (year, period), info in unique_annual_periods[:6]:
    print(f" FY {year} - ends {info['end_date']}")

# Revenue value attached to each of the first six selected period labels.
print("\nRevenue values for selected periods:")
for (year, fp), info in unique_annual_periods[:6]:
    label = info['label']
    for f in period_facts_map[label]:
        if 'RevenueFromContract' in str(f.concept) and 'Liability' not in str(f.concept):
            print(f" {label}: {f.concept} = ${f.value:,}")
            break

View File

@@ -0,0 +1,71 @@
from edgar import Company
from edgar.entity.enhanced_statement import EnhancedStatementBuilder
from collections import defaultdict

# Replay the "fiscal_year must equal period_end.year" fix on the period
# selection logic and report which annual periods (and revenue values)
# come out of it.
aapl = Company("AAPL")
facts = aapl.facts
raw_facts = facts._facts

builder = EnhancedStatementBuilder()
stmt_facts = [f for f in raw_facts if f.statement_type == 'IncomeStatement']

# Period bookkeeping: first fact seen for each key wins.
period_info = {}
period_facts_map = defaultdict(list)
for f in stmt_facts:
    key = (f.fiscal_year, f.fiscal_period)
    label = f"{f.fiscal_period} {f.fiscal_year}"
    period_facts_map[label].append(f)
    if key not in period_info:
        period_info[key] = {
            'label': label,
            'end_date': f.period_end,
            'is_annual': f.fiscal_period == 'FY',
            'filing_date': f.filing_date,
            'fiscal_year': f.fiscal_year,
            'fiscal_period': f.fiscal_period,
        }

annual_periods = [(pk, info) for pk, info in period_info.items() if info['is_annual']]
print(f"Total annual periods: {len(annual_periods)}")

# Per fiscal year, keep the period whose end-date year matches and whose
# end date is latest.
correct_annual_periods = {}
for pk, info in annual_periods:
    fiscal_year = pk[0]
    end_date = info['end_date']
    if not (end_date and end_date.year == fiscal_year):
        continue
    current = correct_annual_periods.get(fiscal_year)
    if current is None or end_date > current[1]['end_date']:
        correct_annual_periods[fiscal_year] = (pk, info)
        print(f" Selected FY {fiscal_year}: ends {end_date}")

print(f"\nCorrect annual periods found: {len(correct_annual_periods)}")

# Newest six fiscal years.
sorted_periods = sorted(correct_annual_periods.items(), key=lambda item: item[0], reverse=True)
selected_period_info = [entry for _year, entry in sorted_periods[:6]]
print(f"\nSelected {len(selected_period_info)} periods:")
for (year, period), info in selected_period_info:
    print(f" {info['label']}")

# Revenue fact attached to each selected period label, if any.
print("\nRevenue facts for selected periods:")
for (year, fp), info in selected_period_info:
    label = info['label']
    revenue_found = False
    for f in period_facts_map[label]:
        if 'RevenueFromContract' in str(f.concept) and 'Liability' not in str(f.concept):
            print(f" {label}: ${f.value:,.0f}")
            revenue_found = True
            break
    if not revenue_found:
        print(f" {label}: No revenue found")

View File

@@ -0,0 +1,262 @@
#!/usr/bin/env python3
"""
Debug script to investigate table parsing/rendering issues in MSFT 10-K.
Focus on the "Weighted average outstanding shares of common stock (B)" table.
"""
import sys
sys.path.insert(0, '/Users/dwight/PycharmProjects/edgartools')
from edgar.documents.parser import HTMLParser
from edgar.documents.config import ParserConfig
from edgar.documents.table_nodes import TableNode
from bs4 import BeautifulSoup
def find_table_in_html():
    """Find and examine the table HTML structure around the target text.

    Returns the containing <table> Tag for the first matching text node,
    or None if no enclosing table is found (or on error).
    """
    print("🔍 EXAMINING TABLE HTML STRUCTURE")
    print("=" * 50)
    try:
        # Read the MSFT file.
        with open('/Users/dwight/PycharmProjects/edgartools/data/html/MSFT.10-K.html', 'r') as f:
            html_content = f.read()
        print(f"File size: {len(html_content)} characters")
        # Find the table containing our target text.
        soup = BeautifulSoup(html_content, 'html.parser')
        # FIX: find_all(text=...) is deprecated in BeautifulSoup 4;
        # the parameter was renamed to string=.
        target_elements = soup.find_all(string=lambda text: text and "Weighted average outstanding shares of common stock" in text)
        print(f"\nFound {len(target_elements)} elements with target text")
        for i, element in enumerate(target_elements):
            print(f"\n📍 Element {i+1}:")
            print(f" Text: {element.strip()[:80]}...")
            # Walk up the tree to the enclosing <table>, if any.
            parent = element.parent
            while parent and parent.name != 'table':
                parent = parent.parent
            if parent and parent.name == 'table':
                print(f" Found containing table!")
                # Analyze the table structure.
                rows = parent.find_all('tr')
                print(f" Table has {len(rows)} rows")
                # Peek at the first few rows/cells to show the layout.
                for j, row in enumerate(rows[:5]):
                    cells = row.find_all(['td', 'th'])
                    print(f" Row {j+1}: {len(cells)} cells")
                    for k, cell in enumerate(cells[:3]):  # First 3 cells
                        cell_text = cell.get_text().strip()[:30].replace('\n', ' ')
                        print(f" Cell {k+1}: '{cell_text}...'")
                return parent
            else:
                print(f" No containing table found")
                return None
    except Exception as e:
        print(f"❌ Error examining HTML: {e}")
        import traceback
        traceback.print_exc()
        return None
def test_parser_on_msft():
    """Test the document parser on the MSFT file.

    Parses the fixture with Default/Performance/Accuracy configs, locates
    tables containing the target phrase, prints their structure and text
    output, and returns the first table whose output looks problematic
    (or None when the first inspected table renders cleanly or on error).
    """
    print("\n🚀 TESTING DOCUMENT PARSER")
    print("=" * 50)
    try:
        # Read the MSFT file
        with open('/Users/dwight/PycharmProjects/edgartools/data/html/MSFT.10-K.html', 'r') as f:
            html_content = f.read()
        # Parse with different configurations
        configs_to_test = [
            ("Default", ParserConfig()),
            ("Performance", ParserConfig.for_performance()),
            ("Accuracy", ParserConfig.for_accuracy()),
        ]
        for config_name, config in configs_to_test:
            print(f"\n🧪 Testing with {config_name} config...")
            parser = HTMLParser(config)
            document = parser.parse(html_content)
            print(f" Document parsed successfully")
            print(f" Root children: {len(document.root.children)}")
            # Recursively collect tables whose text contains the target phrase.
            matching_tables = []
            def find_target_tables(node):
                if isinstance(node, TableNode):
                    table_text = node.text()
                    if "Weighted average outstanding shares of common stock" in table_text:
                        matching_tables.append(node)
                for child in node.children:
                    find_target_tables(child)
            find_target_tables(document.root)
            print(f" Found {len(matching_tables)} table(s) with target text")
            for i, table in enumerate(matching_tables):
                print(f"\n 📋 Table {i+1}:")
                print(f" Headers: {len(table.headers)} row(s)")
                print(f" Data rows: {len(table.rows)}")
                print(f" Table type: {table.table_type}")
                # Show table structure
                if table.headers:
                    print(f" Header structure:")
                    for j, header_row in enumerate(table.headers):
                        print(f" Row {j+1}: {len(header_row)} cells")
                        for k, cell in enumerate(header_row[:3]):
                            cell_text = cell.text().strip()[:20].replace('\n', ' ')
                            print(f" Cell {k+1}: '{cell_text}...'")
                print(f" First few data rows:")
                for j, row in enumerate(table.rows[:3]):
                    print(f" Row {j+1}: {len(row.cells)} cells")
                    for k, cell in enumerate(row.cells[:3]):
                        cell_text = cell.text().strip()[:20].replace('\n', ' ')
                        print(f" Cell {k+1}: '{cell_text}...'")
                # Get the text output
                table_text = table.text()
                print(f"\n Text output ({len(table_text)} chars):")
                print(" " + "-" * 40)
                # Show first few lines
                lines = table_text.split('\n')
                for line_num, line in enumerate(lines[:10]):
                    print(f" {line_num+1:2d}: {line}")
                if len(lines) > 10:
                    print(f" ... ({len(lines)-10} more lines)")
                print(" " + "-" * 40)
                # Heuristic checks for rendering problems.
                issues = []
                if len(table_text.strip()) == 0:
                    issues.append("Empty text output")
                if "Weighted average outstanding shares" not in table_text:
                    issues.append("Missing target text in output")
                if table_text.count('|') < 5:  # Should have multiple columns
                    issues.append("Possibly missing column separators")
                if len(lines) < 3:
                    issues.append("Very few output lines")
                if issues:
                    print(f" ⚠️ Issues detected: {', '.join(issues)}")
                    return table  # Return problematic table for further analysis
                else:
                    print(f" ✅ Table appears to render correctly")
                    # NOTE(review): this returns after the FIRST matching table of
                    # the FIRST config that finds one — later configs never run.
                    # Verify that is the intended behavior.
                    return None
    except Exception as e:
        print(f"❌ Parser test failed: {e}")
        import traceback
        traceback.print_exc()
        return None
def analyze_table_structure(table):
    """Deep analysis of a problematic table.

    Dumps caption/summary, per-cell header and data-row details (colspan,
    rowspan, header/numeric flags), then runs the Rich renderer, the fast
    renderer, and table.text() side by side for comparison. No return value.
    """
    print("\n🔬 DEEP TABLE ANALYSIS")
    print("=" * 50)
    if not table:
        print("No table to analyze")
        return
    print(f"Table type: {table.table_type}")
    print(f"Caption: {table.caption}")
    print(f"Summary: {table.summary}")
    # Analyze headers
    print(f"\n📋 HEADERS ({len(table.headers)} rows):")
    for i, header_row in enumerate(table.headers):
        print(f" Row {i+1} ({len(header_row)} cells):")
        for j, cell in enumerate(header_row):
            print(f" Cell {j+1}: colspan={cell.colspan}, rowspan={cell.rowspan}")
            print(f" text='{cell.text()[:40]}...'")
            print(f" is_header={cell.is_header}")
    # Analyze data rows
    print(f"\n📊 DATA ROWS ({len(table.rows)} rows):")
    for i, row in enumerate(table.rows[:5]):  # First 5 rows
        print(f" Row {i+1} ({len(row.cells)} cells):")
        for j, cell in enumerate(row.cells):
            print(f" Cell {j+1}: colspan={cell.colspan}, rowspan={cell.rowspan}")
            print(f" text='{cell.text()[:40]}...'")
            print(f" is_numeric={cell.is_numeric}")
    if len(table.rows) > 5:
        print(f" ... and {len(table.rows)-5} more rows")
    # Test different rendering approaches
    print(f"\n🖼️ TESTING DIFFERENT RENDERERS:")
    # Rich renderer
    try:
        rich_table = table.render(width=120)
        from edgar.richtools import rich_to_text
        rich_text = rich_to_text(rich_table)
        print(f" Rich renderer: {len(rich_text)} chars")
        print(f" Preview: {rich_text[:100]}...")
    except Exception as e:
        print(f" Rich renderer failed: {e}")
    # Fast renderer (private API — used deliberately here for comparison)
    try:
        fast_text = table._fast_text_rendering()
        print(f" Fast renderer: {len(fast_text)} chars")
        print(f" Preview: {fast_text[:100]}...")
    except Exception as e:
        print(f" Current text() output is what callers actually see; check it last.
        current_text = table.text()
        print(f" Current text() method: {len(current_text)} chars")
        if "Weighted average outstanding shares" in current_text:
            print(f" ✅ Contains target text")
        else:
            print(f" ❌ Missing target text")
    except Exception as e:
        print(f" Current text() method failed: {e}")
# Entry point: examine raw HTML, run the parser across configs, and if a
# problematic table surfaces, drill into its structure.
if __name__ == "__main__":
    print("🎯 DEBUGGING MSFT TABLE PARSING ISSUE")
    print("Target: 'Weighted average outstanding shares of common stock (B)' table")
    print()
    # Step 1: Examine HTML structure
    table_element = find_table_in_html()
    # Step 2: Test parser with different configurations
    problematic_table = test_parser_on_msft()
    # Step 3: Deep analysis if issues found
    if problematic_table:
        analyze_table_structure(problematic_table)
        print(f"\n🎯 CONCLUSION:")
        print("A problematic table was identified. Check the analysis above")
        print("for specific issues with parsing or rendering.")
    else:
        print(f"\n✅ CONCLUSION:")
        print("No obvious parsing issues were detected. The table appears to")
        print("be parsing and rendering correctly with the current parser.")
        print("If there are still issues, they may be subtle formatting problems.")
View File

@@ -0,0 +1,159 @@
#!/usr/bin/env python3
"""
Debug why Rich table rendering is still producing poor structure even with headers detected.
"""
import sys
sys.path.insert(0, '/Users/dwight/PycharmProjects/edgartools')
from edgar.documents.parser import HTMLParser
from edgar.documents.config import ParserConfig
from edgar.documents.table_nodes import TableNode
def debug_rich_rendering_issue():
    """Diagnose why Rich rendering of the MSFT target table is poorly structured.

    Parses the local MSFT 10-K HTML fixture, locates the table containing
    "Weighted average outstanding shares", prints a detailed structural
    breakdown (headers, data rows, dimensions, empty-cell ratio), then tests
    Rich table creation and text conversion.

    Returns:
        The matching TableNode on success, None on any failure.
    """
    print("🔍 DEBUGGING RICH RENDERING WITH DETECTED HEADERS")
    print("=" * 60)
    try:
        with open('/Users/dwight/PycharmProjects/edgartools/data/html/MSFT.10-K.html', 'r') as f:
            html_content = f.read()
        config = ParserConfig()
        parser = HTMLParser(config)
        document = parser.parse(html_content)
        # Find target table by depth-first search over the parsed node tree.
        target_table = None

        def find_target(node):
            nonlocal target_table
            if isinstance(node, TableNode):
                try:
                    if "Weighted average outstanding shares" in node.text():
                        target_table = node
                        return
                except:
                    # Some nodes may fail to render text; skip them.
                    pass
            if hasattr(node, 'children'):
                for child in node.children:
                    find_target(child)

        find_target(document.root)
        if not target_table:
            print("❌ Target table not found")
            return
        print("✅ Found target table")
        print(f"Headers: {len(target_table.headers)}")
        print(f"Data rows: {len(target_table.rows)}")
        # Examine the table structure in detail
        print(f"\n🔍 DETAILED TABLE STRUCTURE ANALYSIS:")
        # Check headers
        if target_table.headers:
            for i, header_row in enumerate(target_table.headers):
                print(f"\nHeader row {i+1}: {len(header_row)} cells")
                for j, cell in enumerate(header_row[:8]):  # First 8 cells
                    print(f" Cell {j+1}: '{cell.text()}' (colspan={cell.colspan}, rowspan={cell.rowspan})")
        # Check data row structure
        print(f"\n📊 DATA ROW ANALYSIS:")
        for i, row in enumerate(target_table.rows[:5]):  # First 5 data rows
            content_cells = [j for j, cell in enumerate(row.cells) if cell.text().strip()]
            print(f"Row {i+1}: {len(row.cells)} total cells, content in positions {content_cells}")
            # Show first few cells with content
            for j in content_cells[:3]:
                if j < len(row.cells):
                    cell = row.cells[j]
                    print(f" Cell {j+1}: '{cell.text()[:30]}...' (align={cell.align})")
        # Check table dimensions
        max_cols = max(len(row.cells) for row in target_table.rows) if target_table.rows else 0
        header_cols = len(target_table.headers[0]) if target_table.headers else 0
        print(f"\n📏 TABLE DIMENSIONS:")
        print(f" Header columns: {header_cols}")
        print(f" Max data columns: {max_cols}")
        print(f" Dimension mismatch: {'YES' if header_cols != max_cols else 'NO'}")
        # Count empty vs content cells
        total_cells = sum(len(row.cells) for row in target_table.rows)
        empty_cells = sum(1 for row in target_table.rows for cell in row.cells if not cell.text().strip())
        # FIX: guard the percentage against ZeroDivisionError when the table
        # has no data cells (degenerate/empty table).
        empty_pct = (empty_cells / total_cells * 100) if total_cells else 0.0
        print(f" Total data cells: {total_cells}")
        print(f" Empty data cells: {empty_cells} ({empty_pct:.1f}%)")
        # Test Rich table creation manually
        print(f"\n🎨 TESTING RICH TABLE CREATION:")
        try:
            rich_table = target_table.render(width=120)
            print(f"✅ Rich table created successfully")
            print(f"Rich table type: {type(rich_table)}")
            # Check Rich table properties
            if hasattr(rich_table, 'columns'):
                print(f"Rich columns: {len(rich_table.columns)}")
            if hasattr(rich_table, 'rows'):
                print(f"Rich rows: {len(rich_table.rows)}")
        except Exception as e:
            print(f"❌ Rich table creation failed: {e}")
            import traceback
            traceback.print_exc()
            return
        # Test text conversion
        print(f"\n📝 TESTING TEXT CONVERSION:")
        try:
            from edgar.richtools import rich_to_text
            rich_text = rich_to_text(rich_table)
            lines = rich_text.split('\n')
            print(f"Text output: {len(lines)} lines, {len(rich_text)} chars")
            # Analyze line types: blank, box-drawing borders, real content.
            empty_lines = sum(1 for line in lines if not line.strip())
            border_lines = sum(1 for line in lines if any(c in line for c in '┌┐└┘├┤│─'))
            content_lines = sum(1 for line in lines if line.strip() and not all(c in '┌┐└┘├┤│─ ' for c in line))
            print(f" Empty lines: {empty_lines}")
            print(f" Border lines: {border_lines}")
            print(f" Content lines: {content_lines}")
            # Show actual structure
            print(f"\nFirst 10 lines of output:")
            for i, line in enumerate(lines[:10]):
                line_type = "EMPTY" if not line.strip() else "BORDER" if any(c in line for c in '┌┐└┘├┤│─') else "CONTENT"
                print(f" {i+1:2d} [{line_type:7}]: {line[:60]}{'...' if len(line) > 60 else ''}")
            # Very few border lines implies Rich never built a real grid.
            if border_lines < 3:
                print(f"\n❌ DIAGNOSIS: Very few border lines - Rich table structure is poor")
                print("This suggests the table has structural issues that prevent proper rendering.")
                print("Possible causes:")
                print("1. Column count mismatch between headers and data")
                print("2. Too many empty cells causing poor layout")
                print("3. Cell spanning issues")
                print("4. Table too wide for rendering width")
            else:
                print(f"\n✅ Rich table structure appears normal")
        except Exception as e:
            print(f"❌ Text conversion failed: {e}")
            return
        return target_table
    except Exception as e:
        print(f"❌ Error: {e}")
        import traceback
        traceback.print_exc()
        return None
if __name__ == "__main__":
    # Run the diagnostic, then point the reader at the follow-up work.
    debug_rich_rendering_issue()
    next_steps = (
        "\n🎯 NEXT STEPS:\n"
        "Based on the analysis above, we can identify specific issues preventing\n"
        "proper Rich table rendering and address them systematically."
    )
    print(next_steps)

View File

@@ -0,0 +1,61 @@
from edgar import Company
from edgar.entity.enhanced_statement import EnhancedStatementBuilder

# Debug script: verify which Revenue value the EnhancedStatementBuilder picks
# for each fiscal period, and inspect the raw facts behind the FY 2020 period.

# Get Apple facts
aapl = Company("AAPL")
facts = aapl.facts

# Build the income statement
builder = EnhancedStatementBuilder()
stmt = builder.build_multi_period_statement(
    facts=facts._facts,
    statement_type='IncomeStatement',
    periods=6,
    annual=True
)

print(f"Selected periods: {stmt.periods}")
print("\nChecking Revenue item values:")

# Find the revenue item
for item in stmt.items:
    if item.label and 'Revenue' in item.label and 'Total' in item.label:
        print(f"\n{item.label}:")
        # FIX: the original used enumerate() but never used the index;
        # plain zip() is the idiomatic pairing.
        for period, value in zip(stmt.periods, item.values):
            print(f" {period}: {value}")
        # Check what concept this maps to
        if hasattr(item, 'concept'):
            print(f" Concept: {item.concept}")

# Now let's check what facts are in period_facts_by_label
print("\n\nChecking what facts are in the FY 2020 period:")
from collections import defaultdict

# Recreate what the builder does
raw_facts = facts._facts
stmt_facts = [f for f in raw_facts if f.statement_type == 'IncomeStatement']

# Build period_facts with the new key structure
period_facts = defaultdict(list)
for fact in stmt_facts:
    period_key = (fact.fiscal_year, fact.fiscal_period, fact.period_end)
    period_facts[period_key].append(fact)

# Look for FY 2020 periods
for key in period_facts.keys():
    if key[0] == 2020 and key[1] == 'FY':
        if key[2] and key[2].year == 2020:  # Correct match
            print(f"\nKey: {key}")
            # Check revenue facts in this period
            for fact in period_facts[key]:
                if 'RevenueFromContract' in str(fact.concept) and 'Liability' not in str(fact.concept):
                    duration = None
                    # period_end equals key[2] here, so it is known non-None.
                    if fact.period_start:
                        duration = (fact.period_end - fact.period_start).days
                    print(f" Revenue: ${fact.value:,.0f} (duration: {duration})")

# The issue might be in how period_facts_by_label is built
print("\n\nChecking period_facts_by_label mapping:")
# This is what happens in the builder after selection
# It remaps from period_key to label, but multiple keys can have the same label!

View File

@@ -0,0 +1,190 @@
#!/usr/bin/env python3
"""
Debug the table structure to understand why we're getting so many empty columns.
"""
import sys
sys.path.insert(0, '/Users/dwight/PycharmProjects/edgartools')
from edgar.documents.parser import HTMLParser
from edgar.documents.config import ParserConfig
from edgar.documents.table_nodes import TableNode
def analyze_table_structure():
    """Locate the MSFT "Weighted average outstanding shares" table and report
    its header/data layout plus empty-vs-filled cell statistics.

    Returns:
        The matching TableNode, or None when the table cannot be found or
        parsing fails.
    """
    print("🔍 ANALYZING TABLE STRUCTURE")
    print("=" * 50)
    try:
        with open('/Users/dwight/PycharmProjects/edgartools/data/html/MSFT.10-K.html', 'r') as f:
            html_content = f.read()
        config = ParserConfig(fast_table_rendering=True)
        parser = HTMLParser(config)
        document = parser.parse(html_content)
        # Find target table via depth-first search.
        target_table = None

        def find_target(node):
            nonlocal target_table
            if isinstance(node, TableNode):
                try:
                    if "Weighted average outstanding shares" in node.text():
                        target_table = node
                        return
                except:
                    pass
            if hasattr(node, 'children'):
                for child in node.children:
                    find_target(child)

        find_target(document.root)
        if not target_table:
            print("❌ Target table not found")
            return
        print("✅ Found target table")
        # Analyze the structure
        print(f"\nTable structure:")
        print(f" Headers: {len(target_table.headers)} rows")
        print(f" Data rows: {len(target_table.rows)}")
        # Analyze header structure
        print(f"\n📋 HEADER ANALYSIS:")
        for i, header_row in enumerate(target_table.headers):
            print(f" Header row {i+1}: {len(header_row)} cells")
            for j, cell in enumerate(header_row[:10]):  # First 10 cells
                text = cell.text().strip()
                display_text = text[:20] if text else "[EMPTY]"
                print(f" Cell {j+1}: '{display_text}' (colspan={cell.colspan})")
        # Analyze data rows
        print(f"\n📊 DATA ROW ANALYSIS:")
        for i, row in enumerate(target_table.rows[:5]):  # First 5 rows
            print(f" Row {i+1}: {len(row.cells)} cells")
            for j, cell in enumerate(row.cells[:10]):  # First 10 cells
                text = cell.text().strip()
                display_text = text[:20] if text else "[EMPTY]"
                print(f" Cell {j+1}: '{display_text}' (colspan={cell.colspan})")
        # Count empty vs filled cells across headers and data.
        total_cells = 0
        empty_cells = 0
        for header_row in target_table.headers:
            for cell in header_row:
                total_cells += 1
                if not cell.text().strip():
                    empty_cells += 1
        for row in target_table.rows:
            for cell in row.cells:
                total_cells += 1
                if not cell.text().strip():
                    empty_cells += 1
        print(f"\n📊 CELL STATISTICS:")
        print(f" Total cells: {total_cells}")
        print(f" Empty cells: {empty_cells}")
        print(f" Filled cells: {total_cells - empty_cells}")
        # FIX: guard against ZeroDivisionError for a table with no cells.
        empty_pct = (empty_cells / total_cells * 100) if total_cells else 0.0
        print(f" Empty percentage: {empty_pct:.1f}%")
        # FIX: the original inner loop broke after the first non-empty cell and
        # then recounted all non-empty cells through a redundant
        # row.cells[:len(row.cells)] slice; a direct per-row count is
        # equivalent (0 for all-empty rows) and far clearer.
        max_meaningful_cols = 0
        for row in target_table.rows:
            meaningful_cols = sum(1 for c in row.cells if c.text().strip())
            max_meaningful_cols = max(max_meaningful_cols, meaningful_cols)
        print(f" Maximum meaningful columns in any row: {max_meaningful_cols}")
        return target_table
    except Exception as e:
        print(f"❌ Error: {e}")
        import traceback
        traceback.print_exc()
        return None
def test_column_filtering():
    """Test filtering out empty columns.

    Runs analyze_table_structure() to obtain the target TableNode, then
    determines which column indices carry any content (in headers or data)
    and previews the first data row restricted to those columns.

    Returns:
        List of meaningful column indices, or None if no table/rows exist.
    """
    print(f"\n🔧 TESTING COLUMN FILTERING")
    print("=" * 50)
    target_table = analyze_table_structure()
    if not target_table:
        return
    # Analyze which columns actually have content
    if not target_table.rows:
        print("No data rows to analyze")
        return
    max_cols = max(len(row.cells) for row in target_table.rows)
    print(f"Maximum columns: {max_cols}")
    # Check each column for meaningful content
    meaningful_columns = []
    for col_idx in range(max_cols):
        has_content = False
        # Check headers
        for header_row in target_table.headers:
            if col_idx < len(header_row) and header_row[col_idx].text().strip():
                has_content = True
                break
        # Check data rows (only if no header content found for this column)
        if not has_content:
            for row in target_table.rows:
                if col_idx < len(row.cells) and row.cells[col_idx].text().strip():
                    has_content = True
                    break
        if has_content:
            meaningful_columns.append(col_idx)
    print(f"Meaningful columns: {meaningful_columns} ({len(meaningful_columns)} total)")
    # Test rendering with only meaningful columns
    print(f"\n📊 FILTERED TABLE PREVIEW:")
    # Show first data row with only meaningful columns
    if target_table.rows:
        first_row = target_table.rows[0]
        filtered_cells = []
        for col_idx in meaningful_columns:
            if col_idx < len(first_row.cells):
                cell_text = first_row.cells[col_idx].text().strip()
                filtered_cells.append(cell_text if cell_text else "[EMPTY]")
            else:
                # Ragged row: this row is shorter than the widest row.
                filtered_cells.append("[MISSING]")
        print("First row filtered:", " | ".join(filtered_cells))
    return meaningful_columns
if __name__ == "__main__":
    # Entry point: analyze the column layout and report whether empty
    # spacing columns explain the rendering problem.
    print("🎯 DEBUGGING TABLE STRUCTURE ISSUE")
    print("Focus: Understanding why we get so many empty columns")
    print()
    meaningful_cols = test_column_filtering()
    if not meaningful_cols:
        print("❌ Could not analyze column structure")
    else:
        print(f"\n🎯 FINDINGS:")
        print(f"The table has many empty spacing columns.")
        print(f"Only {len(meaningful_cols)} out of many columns have actual content.")
        print(f"The FastTableRenderer should filter out empty columns.")
        print(f"\n🔧 SOLUTION:")
        print("Update FastTableRenderer to:")
        print("1. Identify columns with meaningful content")
        print("2. Filter out purely empty/spacing columns")
        print("3. Only render the meaningful columns")

View File

@@ -0,0 +1,225 @@
#!/usr/bin/env python3
"""
Debug why tables are losing their structure during parsing.
"""
import sys
sys.path.insert(0, '/Users/dwight/PycharmProjects/edgartools')
from edgar.documents.parser import HTMLParser
from edgar.documents.config import ParserConfig
from edgar.documents.table_nodes import TableNode
from bs4 import BeautifulSoup
def examine_raw_html_table():
    """Examine the raw HTML structure of the problematic table.

    Uses BeautifulSoup to locate the text node containing the target string,
    climbs to the enclosing <table>, and prints its row/cell layout, whether
    it uses <thead>/<tbody>/<th>, and which early rows look like headers.

    Returns:
        The BeautifulSoup <table> element, or None on failure.
    """
    print("🔍 EXAMINING RAW HTML TABLE STRUCTURE")
    print("=" * 55)
    try:
        with open('/Users/dwight/PycharmProjects/edgartools/data/html/MSFT.10-K.html', 'r') as f:
            html_content = f.read()
        # Find the table HTML
        soup = BeautifulSoup(html_content, 'html.parser')
        # Look for table containing our target text
        target_elements = soup.find_all(string=lambda text: text and "Weighted average outstanding shares" in text)
        if not target_elements:
            print("❌ Target text not found in HTML")
            return None
        target_element = target_elements[0]
        # Find the containing table by walking up the parent chain.
        table_element = target_element
        while table_element and table_element.name != 'table':
            table_element = table_element.parent
        if not table_element:
            print("❌ No containing table found")
            return None
        print("✅ Found containing HTML table")
        # Analyze the HTML table structure
        rows = table_element.find_all('tr')
        print(f"HTML table has {len(rows)} rows")
        # Look for thead, tbody structure
        thead = table_element.find('thead')
        tbody = table_element.find('tbody')
        # NOTE(review): both branches of these conditionals are empty strings —
        # the original ✓/✗ glyphs appear to have been lost; verify against VCS.
        print(f"Has <thead>: {'' if thead else ''}")
        print(f"Has <tbody>: {'' if tbody else ''}")
        # Analyze first few rows
        print(f"\nFirst few rows analysis:")
        for i, row in enumerate(rows[:10]):
            cells = row.find_all(['td', 'th'])
            cell_info = []
            for cell in cells[:5]:  # First 5 cells
                text = cell.get_text().strip()[:20]
                tag = cell.name
                colspan = cell.get('colspan', '1')
                cell_info.append(f"{tag}({colspan}):'{text}'")
            print(f" Row {i+1}: {len(cells)} cells - {', '.join(cell_info)}")
            if len(cells) > 5:
                print(f" ... and {len(cells)-5} more cells")
        # Check if there are any TH (header) cells
        th_cells = table_element.find_all('th')
        print(f"\nTotal <th> header cells: {len(th_cells)}")
        # Look for potential header patterns (year keywords in early rows)
        header_candidates = []
        for i, row in enumerate(rows[:5]):  # Check first 5 rows for headers
            cells = row.find_all(['td', 'th'])
            row_text = ' '.join(cell.get_text().strip() for cell in cells).strip()
            if any(keyword in row_text.lower() for keyword in ['year', 'ended', '2025', '2024', '2023']):
                header_candidates.append(i)
                print(f" Potential header row {i+1}: {row_text[:80]}...")
        return table_element
    except Exception as e:
        print(f"❌ Error: {e}")
        import traceback
        traceback.print_exc()
        return None
def debug_table_parsing_pipeline():
    """Debug how the table gets processed through the parsing pipeline.

    Parses the MSFT 10-K fixture, finds the target TableNode, reports whether
    headers were detected, and runs a manual header-scoring heuristic over the
    first few data rows.

    Returns:
        The matching TableNode, or None on failure.
    """
    print(f"\n🔧 DEBUGGING TABLE PARSING PIPELINE")
    print("=" * 55)
    try:
        with open('/Users/dwight/PycharmProjects/edgartools/data/html/MSFT.10-K.html', 'r') as f:
            html_content = f.read()
        config = ParserConfig(fast_table_rendering=False)
        parser = HTMLParser(config)
        document = parser.parse(html_content)
        # Find target table
        target_table = None

        def find_target(node):
            nonlocal target_table
            if isinstance(node, TableNode):
                try:
                    if "Weighted average outstanding shares" in node.text():
                        target_table = node
                        return
                except:
                    pass
            if hasattr(node, 'children'):
                for child in node.children:
                    find_target(child)

        find_target(document.root)
        if not target_table:
            print("❌ Target table not found in parsed document")
            return
        print("✅ Found target table in parsed document")
        # Analyze how the table was parsed
        print(f"\nParsed table analysis:")
        print(f" Table type: {target_table.table_type}")
        # NOTE(review): both branches below are empty strings — the original
        # ✓/✗ glyphs appear lost in extraction; confirm against VCS.
        print(f" Has headers: {'' if target_table.headers else ''}")
        print(f" Header rows: {len(target_table.headers)}")
        print(f" Data rows: {len(target_table.rows)}")
        print(f" Caption: {target_table.caption}")
        # Check if headers were detected
        if target_table.headers:
            print(f"\n Header structure:")
            for i, header_row in enumerate(target_table.headers):
                header_texts = [cell.text().strip()[:20] for cell in header_row]
                print(f" Header row {i+1}: {header_texts}")
        else:
            print(f"\n ❌ NO HEADERS DETECTED - This is likely the problem!")
            print(f" The parser failed to identify header rows in the HTML table.")
            # Check if any of the first few data rows look like headers
            print(f"\n First few data rows (might be misclassified headers):")
            for i, row in enumerate(target_table.rows[:5]):
                row_texts = [cell.text().strip()[:20] for cell in row.cells[:5]]
                print(f" Data row {i+1}: {row_texts}")
                # Check if this row looks like a header
                row_text = ' '.join(cell.text().strip() for cell in row.cells)
                if any(keyword in row_text.lower() for keyword in ['year', 'ended', '2025', '2024', '2023', 'millions']):
                    print(f" ⚠️ This looks like it should be a header row!")
        # Test manual header detection
        print(f"\n🔍 MANUAL HEADER DETECTION TEST:")
        potential_headers = []
        for i, row in enumerate(target_table.rows[:5]):
            row_text = ' '.join(cell.text().strip() for cell in row.cells).strip()
            # Score this row as a potential header
            header_score = 0
            # Check for typical header keywords
            header_keywords = ['millions', 'year ended', 'june 30', '2025', '2024', '2023']
            for keyword in header_keywords:
                if keyword in row_text.lower():
                    header_score += 1
            # Check for mostly empty cells (common in header spacing rows).
            # FIX: guard against ZeroDivisionError for a row with no cells.
            empty_cells = sum(1 for cell in row.cells if not cell.text().strip())
            if row.cells and empty_cells / len(row.cells) > 0.7:  # More than 70% empty
                header_score -= 1
            # Check for meaningful content vs pure spacing
            meaningful_cells = sum(1 for cell in row.cells if len(cell.text().strip()) > 2)
            if meaningful_cells >= 2:  # At least 2 cells with meaningful content
                header_score += 1
            potential_headers.append((i, row, header_score, row_text))
            print(f" Row {i+1}: score={header_score}, text='{row_text[:60]}...'")
        # Find the best header candidate.
        # FIX: max() on an empty list raises ValueError when the table has no
        # data rows; fall through to the "no candidates" message instead.
        if potential_headers:
            best_header = max(potential_headers, key=lambda x: x[2])
        else:
            best_header = None
        if best_header and best_header[2] > 0:
            print(f"\n ✅ Best header candidate: Row {best_header[0]+1} (score={best_header[2]})")
            print(f" Text: {best_header[3]}")
        else:
            print(f"\n ❌ No good header candidates found")
        return target_table
    except Exception as e:
        print(f"❌ Error: {e}")
        import traceback
        traceback.print_exc()
        return None
if __name__ == "__main__":
    # Entry point: compare the raw HTML view of the table with the parsed
    # TableNode to pinpoint where structure is lost.
    print("🎯 DEBUGGING TABLE STRUCTURE PARSING")
    print("Focus: Why tables lose structure during parsing")
    print()
    # Step 1: Examine raw HTML
    html_table = examine_raw_html_table()
    # Step 2: Debug parsing pipeline
    parsed_table = debug_table_parsing_pipeline()
    print(f"\n🎯 DIAGNOSIS:")
    if html_table and parsed_table:
        print("The table exists in HTML and is being parsed into a TableNode.")
        print("The issue is likely in header detection - the parser isn't")
        print("properly identifying which rows should be headers vs data.")
        print(f"\n🔧 SOLUTION:")
        print("1. Improve header detection logic in table parsing")
        print("2. Look for rows with year indicators (2025, 2024, 2023) as headers")
        print("3. Handle tables without explicit <th> tags better")
        print("4. Keep Rich rendering as default for beautiful output")
    else:
        print("Basic table parsing is failing - need to investigate further.")

View File

@@ -0,0 +1,209 @@
"""
Check specific edge cases in our solution
"""
from edgar import Company
def check_instant_facts():
    """Check how we handle instant facts (balance sheet items)"""
    print("\n1. INSTANT FACTS (Balance Sheet Items)")
    print("-" * 50)
    all_facts = Company("AAPL").facts._facts
    # Restrict to FY 2023 balance-sheet facts, then bucket by whether a
    # period_start exists (duration fact) or not (instant fact).
    fy2023_balance = (
        f for f in all_facts
        if f.statement_type == 'BalanceSheet'
        and f.fiscal_period == 'FY'
        and f.fiscal_year == 2023
    )
    instant_count = 0
    duration_count = 0
    for fact in fy2023_balance:
        if fact.period_start:
            duration_count += 1
        else:
            instant_count += 1
    print(f" Balance Sheet FY 2023 facts:")
    print(f" - With duration (period_start exists): {duration_count}")
    print(f" - Instant (no period_start): {instant_count}")
    print(f" ✓ Our solution handles instant facts correctly (no duration check)")
def check_fiscal_year_boundaries():
    """Check companies with different fiscal year ends.

    Microsoft (June year-end) and Walmart (January year-end) exercise the two
    boundary cases; the per-company logic is shared via _report_fy_revenue.
    """
    print("\n2. FISCAL YEAR BOUNDARY ISSUES")
    print("-" * 50)
    # Microsoft has June year-end
    _report_fy_revenue(" Microsoft (June year-end):", "MSFT", 2023)
    # Walmart has January year-end
    _report_fy_revenue("\n Walmart (January year-end):", "WMT", 2023)


def _report_fy_revenue(header, ticker, fiscal_year):
    """Print the first annual (>300-day) FY Revenue fact for *ticker*.

    Reports the fact's period span and whether period_end.year matches the
    reported fiscal year. Extracted to remove the duplicated per-company
    loop in check_fiscal_year_boundaries.
    """
    print(header)
    facts = Company(ticker).facts._facts
    for fact in facts:
        if fact.statement_type == 'IncomeStatement' and fact.fiscal_period == 'FY':
            if fact.fiscal_year == fiscal_year and 'Revenue' in str(fact.concept):
                if fact.period_start and fact.period_end:
                    duration = (fact.period_end - fact.period_start).days
                    if duration > 300:
                        print(f" FY {fiscal_year}: {fact.period_start} to {fact.period_end}")
                        print(f" Period end year: {fact.period_end.year}")
                        print(f" Fiscal year: {fact.fiscal_year}")
                        # NOTE(review): both branches are empty strings — the
                        # original ✓/✗ glyphs appear lost; confirm against VCS.
                        match = "" if fact.period_end.year == fact.fiscal_year else ""
                        print(f" Year match: {match}")
                        return
def check_duration_edge_cases():
    """Check edge cases around our 300-day threshold.

    Collects durations of FY Revenue facts across several tickers and prints
    the distribution, flagging values that the 300-day annual threshold would
    reject or that look like multi-year aggregates.
    """
    print("\n3. DURATION EDGE CASES")
    print("-" * 50)
    # Collect all annual durations across companies
    test_tickers = ['AAPL', 'MSFT', 'WMT', 'JNJ', 'TSLA']
    all_durations = []
    for ticker in test_tickers:
        try:
            company = Company(ticker)
            facts = company.facts._facts
            for fact in facts:
                if fact.statement_type == 'IncomeStatement' and fact.fiscal_period == 'FY':
                    # FIX: fiscal_year can be None; the original `>= 2020`
                    # comparison then raised TypeError, and the blanket except
                    # below silently dropped the entire ticker's data.
                    if fact.fiscal_year and fact.fiscal_year >= 2020 and 'Revenue' in str(fact.concept):
                        if fact.period_start and fact.period_end:
                            duration = (fact.period_end - fact.period_start).days
                            if duration > 200:  # Collect all potentially annual
                                all_durations.append((ticker, duration))
        except Exception:
            # Best-effort per ticker: skip companies whose facts fail to load.
            # (FIX: narrowed from bare except, which also caught
            # KeyboardInterrupt/SystemExit.)
            pass
    # Analyze distribution
    from collections import Counter
    duration_counts = Counter([d for _, d in all_durations])
    print(" Duration distribution for FY Revenue facts:")
    for duration in sorted(set([d for _, d in all_durations])):
        count = duration_counts[duration]
        if duration < 300:
            status = "❌ Would be filtered out"
        elif duration > 400:
            status = "⚠️ Unusually long"
        else:
            status = "✓ Accepted as annual"
        print(f" {duration} days: {count} facts - {status}")
    # Check if any annual facts are < 300 days
    short_annuals = [d for _, d in all_durations if d >= 250 and d < 300]
    if short_annuals:
        print(f"\n ⚠️ WARNING: Found {len(short_annuals)} facts between 250-300 days")
        print(f" These might be annual but would be filtered out")
def check_leap_year_impact():
    """Check if leap years affect our logic"""
    print("\n4. LEAP YEAR IMPACT")
    print("-" * 50)
    # 2020 was a leap year; compare its annual Revenue durations with the
    # surrounding regular years (2019, 2021).
    company_facts = Company("AAPL").facts._facts
    leap_year_durations = []
    regular_year_durations = []
    for fact in company_facts:
        # Guard clauses: only annual (>300 day) FY income-statement Revenue
        # facts with a complete period are of interest.
        if fact.statement_type != 'IncomeStatement' or fact.fiscal_period != 'FY':
            continue
        if 'Revenue' not in str(fact.concept):
            continue
        if not (fact.period_start and fact.period_end):
            continue
        duration = (fact.period_end - fact.period_start).days
        if duration <= 300:
            continue
        if fact.fiscal_year == 2020:
            leap_year_durations.append(duration)
        elif fact.fiscal_year in (2019, 2021):
            regular_year_durations.append(duration)
    if leap_year_durations and regular_year_durations:
        print(f" Leap year (2020) durations: {set(leap_year_durations)}")
        print(f" Regular year durations: {set(regular_year_durations)}")
        print(f" ✓ Difference is minimal, 300-day threshold handles both")
def check_amended_filings():
    """Check how amended filings affect our logic.

    Groups AAPL's annual (>300-day) FY 2023 Revenue facts by
    (fiscal_year, duration, period_end) and reports any key with more than
    one fact — a sign that an amended filing duplicated the period.
    """
    print("\n5. AMENDED FILINGS")
    print("-" * 50)
    # Look for duplicate facts from amendments
    aapl = Company("AAPL")
    facts = aapl.facts._facts
    # Track facts by fiscal year and duration
    from collections import defaultdict
    facts_by_year_duration = defaultdict(list)
    for fact in facts:
        if fact.statement_type == 'IncomeStatement' and fact.fiscal_period == 'FY':
            if fact.fiscal_year == 2023 and 'Revenue' in str(fact.concept):
                if fact.period_start and fact.period_end:
                    duration = (fact.period_end - fact.period_start).days
                    if duration > 300:  # annual-length periods only
                        key = (fact.fiscal_year, duration, fact.period_end)
                        facts_by_year_duration[key].append({
                            'value': fact.value,
                            'filing_date': fact.filing_date,
                            # accession may be absent on older fact objects
                            'accession': fact.accession if hasattr(fact, 'accession') else None
                        })
    # Check for duplicates: more than one fact for the exact same period
    for key, facts_list in facts_by_year_duration.items():
        if len(facts_list) > 1:
            year, duration, end_date = key
            print(f" Found {len(facts_list)} facts for FY {year} ({duration} days, ends {end_date}):")
            for f in facts_list:
                print(f" Value: ${f['value']:,.0f}, Filed: {f['filing_date']}")
            print(" ⚠️ Multiple facts for same period - might need to pick latest filing")
# Run all checks
if __name__ == "__main__":
    banner = "=" * 60
    print(banner)
    print("EDGE CASE ANALYSIS FOR DURATION-BASED SOLUTION")
    print(banner)
    # Execute each edge-case check in order.
    for check in (
        check_instant_facts,
        check_fiscal_year_boundaries,
        check_duration_edge_cases,
        check_leap_year_impact,
        check_amended_filings,
    ):
        check()
    print("\n" + banner)
    print("SUMMARY OF FINDINGS")
    print(banner)
    print("\n✓ STRENGTHS:")
    print(" 1. 300-day threshold works well for standard annual periods (363-365 days)")
    print(" 2. Instant facts (balance sheet) handled correctly")
    print(" 3. Leap years don't cause issues")
    print("\n⚠️ POTENTIAL ISSUES:")
    print(" 1. Fiscal year boundary: Some companies' FY doesn't match calendar year")
    print(" - WMT FY 2023 ends in Jan 2023 (year mismatch)")
    print(" 2. Amended filings might create duplicates")
    print(" 3. No handling for multi-year aggregates (>400 days)")
    print("\nRECOMMENDED IMPROVEMENTS:")
    print(" 1. For fiscal year matching, be more flexible:")
    print(" - Allow FY to match period_end.year OR period_end.year + 1")
    print(" 2. When duplicates exist, prefer latest filing_date")
    print(" 3. Add upper bound check (duration < 400) to exclude multi-year")

View File

@@ -0,0 +1,170 @@
#!/usr/bin/env python3
"""
Test that the table parsing issue is actually fixed with proper config propagation.
"""
import sys
sys.path.insert(0, '/Users/dwight/PycharmProjects/edgartools')
from edgar.documents.parser import HTMLParser
from edgar.documents.config import ParserConfig
from edgar.documents.table_nodes import TableNode
def test_msft_table_with_proper_config():
    """Test MSFT table with proper config propagation.

    Parses the MSFT 10-K fixture with fast_table_rendering enabled, forces
    the config onto the target TableNode, and checks that text() output is
    pipe-formatted and contains the target text.

    Returns:
        True when the table renders with pipe formatting, False otherwise.
    """
    print("🧪 TESTING MSFT TABLE WITH PROPER CONFIG")
    print("=" * 60)
    try:
        # Parse the document with explicit config
        with open('/Users/dwight/PycharmProjects/edgartools/data/html/MSFT.10-K.html', 'r') as f:
            html_content = f.read()
        # Test with explicit fast rendering config
        config = ParserConfig(fast_table_rendering=True)
        parser = HTMLParser(config)
        document = parser.parse(html_content)
        print(f"Config fast_table_rendering: {config.fast_table_rendering}")
        # Find the target table
        target_table = None

        def find_target(node):
            nonlocal target_table
            if isinstance(node, TableNode):
                try:
                    if "Weighted average outstanding shares" in node.text():
                        target_table = node
                        return
                except:
                    pass
            if hasattr(node, 'children'):
                for child in node.children:
                    find_target(child)

        find_target(document.root)
        if not target_table:
            print("❌ Target table not found")
            return False
        print("✅ Found target table!")
        # Ensure config is set on the table (works around missing propagation)
        target_table._config = config
        # Test the output
        table_text = target_table.text()
        print(f"\nTable output ({len(table_text)} characters):")
        print("-" * 40)
        print(table_text)
        print("-" * 40)
        # Check for proper formatting: pipe characters indicate the fast
        # renderer's column separators.
        lines = table_text.split('\n')
        pipe_lines = [line for line in lines if '|' in line and line.strip()]
        print(f"\nFormatting analysis:")
        print(f" Total lines: {len(lines)}")
        print(f" Lines with pipes: {len(pipe_lines)}")
        # NOTE(review): both branches are empty strings — the original ✓/✗
        # glyphs appear lost in extraction; confirm against VCS.
        print(f" Contains target text: {'' if 'Weighted average outstanding shares' in table_text else ''}")
        if len(pipe_lines) > 5 and 'Weighted average outstanding shares' in table_text:
            print("✅ TABLE IS PROPERLY FORMATTED!")
            return True
        else:
            print("❌ Table formatting issues persist")
            return False
    except Exception as e:
        print(f"❌ Error: {e}")
        import traceback
        traceback.print_exc()
        return False
def verify_config_propagation():
    """Verify that table nodes receive the config during parsing.

    Parses a minimal two-row HTML table and walks the node tree checking that
    each TableNode carries a _config with fast_table_rendering enabled.

    Returns:
        True when a table was found and its config propagated, False otherwise.
    """
    print(f"\n🔧 VERIFYING CONFIG PROPAGATION")
    print("=" * 60)
    # We need to check if the HTMLParser properly sets config on table nodes
    print("Checking if TableNodes receive config during parsing...")
    # Create a simple test HTML
    simple_html = """
<html>
<body>
<table>
<tr><td>Header 1</td><td>Header 2</td></tr>
<tr><td>Data 1</td><td>Data 2</td></tr>
</table>
</body>
</html>
"""
    config = ParserConfig(fast_table_rendering=True)
    parser = HTMLParser(config)
    document = parser.parse(simple_html)
    # Find table and check config.
    # FIX: the original returned the check result from the TableNode branch
    # but discarded the recursive calls' return values, so the caller always
    # received None and the check always reported failure. Accumulate the
    # outcome in a nonlocal flag instead.
    table_found = False
    config_working = False

    def check_table_config(node):
        nonlocal table_found, config_working
        if isinstance(node, TableNode):
            table_found = True
            has_config = hasattr(node, '_config')
            config_matches = has_config and node._config.fast_table_rendering == True
            # NOTE(review): several branches below are empty strings — the
            # original ✓/✗ glyphs appear lost in extraction; confirm via VCS.
            print(f" Table found: ✅")
            print(f" Has _config attribute: {'' if has_config else ''}")
            print(f" Config fast_table_rendering: {'' if config_matches else ''}")
            if not has_config:
                print(" 🔧 Setting config manually...")
                node._config = config
                test_text = node.text()
                print(f" Manual config test: {'' if '|' in test_text else ''}")
                print(f" Test output preview: {test_text[:50]}...")
            if has_config and config_matches:
                config_working = True
            return
        if hasattr(node, 'children'):
            for child in node.children:
                check_table_config(child)

    check_table_config(document.root)
    if not table_found:
        print(" ❌ No table found in simple test")
        return False
    return config_working
if __name__ == "__main__":
    # Entry point: run both verification steps and summarize the outcome.
    print("🎯 FINAL TEST: MSFT TABLE PARSING FIX")
    print()
    # Test config propagation
    propagation_ok = verify_config_propagation()
    # Test MSFT table
    formatting_ok = test_msft_table_with_proper_config()
    print(f"\n🏁 FINAL RESULTS:")
    # NOTE(review): both branches are empty strings — original ✓/✗ glyphs
    # appear lost in extraction.
    print(f" Config propagation: {'' if propagation_ok else ''}")
    print(f" MSFT table formatting: {'' if formatting_ok else ''}")
    if formatting_ok:
        print(f"\n🎉 SUCCESS!")
        print("The MSFT table parsing issue has been resolved!")
        print("Tables now render with proper pipe formatting.")
    else:
        print(f"\n🔧 NEEDS WORK:")
        if not propagation_ok:
            print("- Config propagation to TableNodes needs to be implemented")
        if not formatting_ok:
            print("- Table formatting still has issues")
        print("\nRecommended fix: Ensure HTMLParser sets _config on all TableNode instances")

View File

@@ -0,0 +1,196 @@
#!/usr/bin/env python3
"""
Test the improved header detection logic.
"""
import sys
sys.path.insert(0, '/Users/dwight/PycharmProjects/edgartools')
from edgar.documents.parser import HTMLParser
from edgar.documents.config import ParserConfig
from edgar.documents.table_nodes import TableNode
def test_header_detection_improvement():
    """Verify that improved header detection yields structured Rich output.

    Parses the MSFT 10-K fixture with the default config, locates the target
    table, prints the detected headers, and checks that Rich rendering now
    produces box-drawing (border) lines.

    Returns:
        True when headers are detected and Rich output is structured,
        False otherwise.
    """
    print("🔧 TESTING IMPROVED HEADER DETECTION")
    print("=" * 50)
    try:
        with open('/Users/dwight/PycharmProjects/edgartools/data/html/MSFT.10-K.html', 'r') as f:
            html_content = f.read()
        # Use default config (Rich rendering)
        config = ParserConfig()
        parser = HTMLParser(config)
        document = parser.parse(html_content)
        # Find target table
        target_table = None

        def find_target(node):
            nonlocal target_table
            if isinstance(node, TableNode):
                try:
                    if "Weighted average outstanding shares" in node.text():
                        target_table = node
                        return
                except:
                    pass
            if hasattr(node, 'children'):
                for child in node.children:
                    find_target(child)

        find_target(document.root)
        if not target_table:
            print("❌ Target table not found")
            return False
        print("✅ Found target table")
        # Check the results
        print(f"\nImproved parsing results:")
        print(f" Headers detected: {len(target_table.headers)} rows")
        print(f" Data rows: {len(target_table.rows)}")
        if target_table.headers:
            print(f"\n📋 DETECTED HEADERS:")
            for i, header_row in enumerate(target_table.headers):
                header_texts = [cell.text().strip() for cell in header_row if cell.text().strip()]
                print(f" Header row {i+1}: {header_texts}")
        else:
            print(f"\n❌ Still no headers detected")
            return False
        # Test Rich rendering with proper headers
        print(f"\n🎨 TESTING RICH RENDERING:")
        rich_table = target_table.render(width=120)
        from edgar.richtools import rich_to_text
        rich_text = rich_to_text(rich_table)
        # Box-drawing characters indicate a properly structured Rich grid.
        lines = rich_text.split('\n')
        structured_lines = [line for line in lines if any(c in line for c in '┌┐└┘├┤│─')]
        print(f" Rich output length: {len(rich_text)} chars")
        print(f" Total lines: {len(lines)}")
        print(f" Structured lines: {len(structured_lines)}")
        if len(structured_lines) > 5:
            print(f" ✅ Rich output is now properly structured!")
            # Show a sample of the structured output
            print(f"\n📊 RICH TABLE SAMPLE:")
            for i, line in enumerate(lines[:10]):
                if line.strip():
                    print(f" {line}")
            return True
        else:
            print(f" ❌ Rich output still lacks proper structure")
            print(f" Sample lines:")
            for i, line in enumerate(lines[:5]):
                print(f" {i+1}: {line[:60]}{'...' if len(line) > 60 else ''}")
            return False
    except Exception as e:
        print(f"❌ Error: {e}")
        import traceback
        traceback.print_exc()
        return False
def compare_before_after():
    """Compare table quality across all tables after the fix.

    Parses the MSFT 10-K, collects every TableNode in the document tree,
    and reports how many tables have detected headers and how many render
    with structured Rich output.

    Returns:
        True when at least one table has headers AND at least one renders
        with proper structure; False otherwise (including parse failures).
    """
    print(f"\n📊 COMPARING TABLE QUALITY ACROSS ALL TABLES")
    print("=" * 50)
    try:
        with open('/Users/dwight/PycharmProjects/edgartools/data/html/MSFT.10-K.html', 'r') as f:
            html_content = f.read()

        config = ParserConfig()
        parser = HTMLParser(config)
        document = parser.parse(html_content)

        # Collect all tables
        all_tables = []

        def collect_tables(node):
            if isinstance(node, TableNode):
                all_tables.append(node)
            if hasattr(node, 'children'):
                for child in node.children:
                    collect_tables(child)

        collect_tables(document.root)
        print(f"Found {len(all_tables)} total tables")

        # Guard against an empty document: the percentage summary below
        # divides by len(all_tables) and would raise ZeroDivisionError.
        if not all_tables:
            print("❌ No tables found in document")
            return False

        # Analyze table quality
        good_tables = 0
        tables_with_headers = 0
        from edgar.richtools import rich_to_text
        for table in all_tables:
            try:
                # Count tables with headers
                if table.headers:
                    tables_with_headers += 1
                # Test Rich rendering quality
                rich_table = table.render(width=120)
                rich_text = rich_to_text(rich_table)
                lines = rich_text.split('\n')
                structured_lines = [line for line in lines if any(c in line for c in '┌┐└┘├┤│─')]
                if len(structured_lines) > 3:
                    good_tables += 1
            except Exception:
                pass  # Skip problematic tables

        print(f"\nTable quality summary:")
        print(f" Tables with headers: {tables_with_headers}/{len(all_tables)} ({tables_with_headers/len(all_tables)*100:.1f}%)")
        print(f" Well-structured tables: {good_tables}/{len(all_tables)} ({good_tables/len(all_tables)*100:.1f}%)")

        if tables_with_headers > 0:
            print(f" ✅ Header detection is working!")
        else:
            print(f" ❌ Header detection still needs work")
        if good_tables > 0:
            print(f" ✅ Some tables now render with proper structure!")
        else:
            print(f" ❌ Rich rendering still needs improvement")
        return tables_with_headers > 0 and good_tables > 0
    except Exception as e:
        print(f"❌ Error: {e}")
        return False
if __name__ == "__main__":
    print("🎯 TESTING IMPROVED TABLE PARSING")
    print("Focus: Better header detection for Rich table rendering")
    print()

    # Run the focused single-table test first, then the document-wide scan.
    target_success = test_header_detection_improvement()
    overall_success = compare_before_after()

    print(f"\n🏁 FINAL RESULTS:")
    print(f" Target table fixed: {'' if target_success else ''}")
    print(f" Overall improvement: {'' if overall_success else ''}")

    # Pick the closing message for the observed combination of outcomes.
    if target_success and overall_success:
        closing = [
            "\n🎉 SUCCESS!",
            "The table parsing issue has been resolved!",
            "Tables now render with beautiful Rich formatting!",
        ]
    elif target_success:
        closing = [
            "\n🎯 PARTIAL SUCCESS!",
            "The target table is fixed, but more work needed on other tables.",
        ]
    else:
        closing = [
            "\n🔧 MORE WORK NEEDED",
            "Header detection improvements aren't sufficient yet.",
        ]
    for message in closing:
        print(message)

View File

@@ -0,0 +1,194 @@
#!/usr/bin/env python3
"""
Test the improved FastTableRenderer with column filtering.
"""
import sys
sys.path.insert(0, '/Users/dwight/PycharmProjects/edgartools')
from edgar.documents.parser import HTMLParser
from edgar.documents.config import ParserConfig
from edgar.documents.table_nodes import TableNode
def test_improved_rendering():
    """Exercise the FastTableRenderer on the MSFT 10-K target table.

    Parses with ``fast_table_rendering=True``, renders the table containing
    "Weighted average outstanding shares" as pipe-separated text, and scores
    the output (column count, empty columns, compactness).

    Returns:
        True when no quality issues remain; False otherwise.
    """
    import re  # hoisted: used below for the empty-column pattern scan

    print("🧪 TESTING IMPROVED FAST TABLE RENDERER")
    print("=" * 55)
    try:
        with open('/Users/dwight/PycharmProjects/edgartools/data/html/MSFT.10-K.html', 'r') as f:
            html_content = f.read()

        config = ParserConfig(fast_table_rendering=True)
        parser = HTMLParser(config)
        document = parser.parse(html_content)

        # Find target table
        target_table = None

        def find_target(node):
            nonlocal target_table
            if isinstance(node, TableNode):
                try:
                    if "Weighted average outstanding shares" in node.text():
                        target_table = node
                        return
                except Exception:
                    # Tolerate tables whose text() raises; keep searching.
                    pass
            if hasattr(node, 'children'):
                for child in node.children:
                    find_target(child)

        find_target(document.root)

        if not target_table:
            print("❌ Target table not found")
            return False

        print("✅ Found target table")

        # Clear cache to get fresh rendering
        if hasattr(target_table, '_text_cache'):
            target_table._text_cache = None

        # Get new table text
        table_text = target_table.text()
        print(f"\nImproved table output ({len(table_text)} characters):")
        print("-" * 60)
        print(table_text)
        print("-" * 60)

        # Analyze the improvement
        lines = [line for line in table_text.split('\n') if line.strip()]
        pipe_lines = [line for line in lines if '|' in line]

        if pipe_lines:
            # Count columns in the first content line
            first_content_line = pipe_lines[0]
            column_count = first_content_line.count('|') - 1  # Subtract 1 for border
            print(f"\nTable structure analysis:")
            print(f" Total lines: {len(lines)}")
            print(f" Lines with pipes: {len(pipe_lines)}")
            print(f" Columns: {column_count}")
            # Should be ~4 columns: Description, 2025, 2024, 2023
            if 3 <= column_count <= 6:
                print(f" ✅ Column count looks reasonable ({column_count} columns)")
            else:
                print(f" ⚠️ Column count still seems high ({column_count} columns)")

        # Check for specific improvements
        improvements = []
        issues = []

        if "Weighted average outstanding shares" in table_text:
            improvements.append("Contains target text")
        else:
            issues.append("Missing target text")

        if "|" in table_text:
            improvements.append("Has pipe separators")
        else:
            issues.append("No pipe separators")

        # Count empty columns (sequences of | | | with only spaces between)
        empty_column_pattern = r'\|\s*\|\s*\|'
        empty_sequences = len(re.findall(empty_column_pattern, table_text))
        if empty_sequences < 5:  # Much fewer than before
            improvements.append("Reduced empty columns")
        else:
            issues.append("Still many empty columns")

        if len(table_text) < 2000:  # Should be more compact
            improvements.append("More compact output")
        else:
            issues.append("Still verbose output")

        print(f"\nQuality assessment:")
        if improvements:
            print(" ✅ Improvements:")
            for improvement in improvements:
                print(f" - {improvement}")
        if issues:
            print(" ⚠️ Remaining issues:")
            for issue in issues:
                print(f" - {issue}")

        # Show sample of first few lines for readability
        print(f"\nFirst few lines preview:")
        for i, line in enumerate(pipe_lines[:5]):
            print(f" {i+1}: {line}")

        return len(issues) == 0
    except Exception as e:
        print(f"❌ Error: {e}")
        import traceback
        traceback.print_exc()
        return False
def compare_with_rich():
    """Compare the improved fast renderer with Rich renderer.

    Parses the same MSFT 10-K under both renderer configurations and prints
    size/structure statistics for the target table under each. Purely
    informational; returns None.
    """
    print(f"\n🔄 COMPARING WITH RICH RENDERER")
    print("=" * 55)
    try:
        with open('/Users/dwight/PycharmProjects/edgartools/data/html/MSFT.10-K.html', 'r') as f:
            html_content = f.read()

        # Test both renderers
        configs = [
            ("Fast Renderer", ParserConfig(fast_table_rendering=True)),
            ("Rich Renderer", ParserConfig(fast_table_rendering=False)),
        ]
        for config_name, config in configs:
            print(f"\n🔧 {config_name}:")
            parser = HTMLParser(config)
            document = parser.parse(html_content)

            # Find target table
            target_table = None

            def find_target(node):
                nonlocal target_table
                if isinstance(node, TableNode):
                    try:
                        if "Weighted average outstanding shares" in node.text():
                            target_table = node
                            return
                    except Exception:
                        # Tolerate tables whose text() raises; keep searching.
                        pass
                if hasattr(node, 'children'):
                    for child in node.children:
                        find_target(child)

            find_target(document.root)

            if target_table:
                table_text = target_table.text()
                lines = table_text.split('\n')
                pipe_lines = [line for line in lines if '|' in line and line.strip()]
                print(f" Length: {len(table_text)} chars")
                print(f" Lines: {len(lines)}")
                print(f" Pipe lines: {len(pipe_lines)}")
                print(f" Contains target: {'' if 'Weighted average outstanding shares' in table_text else ''}")
                print(f" First line: {lines[0][:60]}..." if lines else " No lines")
            else:
                print(" ❌ Table not found")
    except Exception as e:
        print(f"❌ Comparison failed: {e}")
if __name__ == "__main__":
    # Run the main rendering test, then the side-by-side comparison.
    renderer_ok = test_improved_rendering()
    compare_with_rich()

    # Choose and emit the closing verdict.
    if renderer_ok:
        verdict = ("\n🎉 SUCCESS!", "The improved FastTableRenderer is working well!")
    else:
        verdict = ("\n🔧 NEEDS MORE WORK", "The renderer still needs improvements.")
    for message in verdict:
        print(message)

View File

@@ -0,0 +1,134 @@
"""
Test our duration-based solution across different companies to identify edge cases
"""
from edgar import Company
from collections import defaultdict
import sys
def analyze_company_periods(ticker, company_name):
    """Analyze period durations for a company's FY income-statement revenue facts.

    Groups revenue facts reported with fiscal_period == 'FY' (fiscal year
    2019 onward) into duration buckets (quarterly / annual / semi-annual /
    multi-year / other / no-duration) and prints a short per-bucket report.

    Args:
        ticker: Stock ticker symbol, e.g. 'AAPL'.
        company_name: Human-readable label used in the report header.

    Returns:
        defaultdict mapping bucket label -> list of fact summary dicts, or
        None when the lookup/analysis fails.
    """
    print(f"\n{'='*60}")
    print(f"Analyzing {company_name} ({ticker})")
    print('='*60)
    try:
        company = Company(ticker)
        facts = company.facts
        raw_facts = facts._facts

        # Find FY facts with different durations
        fy_facts_by_duration = defaultdict(list)
        for fact in raw_facts:
            if fact.statement_type == 'IncomeStatement' and fact.fiscal_period == 'FY':
                if fact.fiscal_year and fact.fiscal_year >= 2019:
                    # Check for revenue facts
                    if 'Revenue' in str(fact.concept):
                        duration = None
                        if fact.period_start and fact.period_end:
                            duration = (fact.period_end - fact.period_start).days
                        duration_bucket = "No duration"
                        # Compare against None explicitly: a 0-day duration is
                        # falsy and would otherwise be misfiled as "No duration".
                        if duration is not None:
                            if duration < 100:
                                duration_bucket = f"Quarterly (~{duration} days)"
                            elif 300 < duration < 400:
                                duration_bucket = f"Annual (~{duration} days)"
                            elif 180 < duration < 200:
                                duration_bucket = f"Semi-annual (~{duration} days)"
                            elif duration > 700:
                                duration_bucket = f"Multi-year (~{duration} days)"
                            else:
                                duration_bucket = f"Other ({duration} days)"
                        fy_facts_by_duration[duration_bucket].append({
                            'year': fact.fiscal_year,
                            'value': fact.value,
                            'duration': duration,
                            'period_end': fact.period_end
                        })

        # Report findings
        for bucket in sorted(fy_facts_by_duration.keys()):
            facts_list = fy_facts_by_duration[bucket]
            print(f"\n{bucket}: {len(facts_list)} facts")
            # Show a few examples
            for fact in facts_list[:3]:
                print(f" FY {fact['year']}: ${fact['value']:,.0f}")
        return fy_facts_by_duration
    except Exception as e:
        print(f" Error: {e}")
        return None
# Test various types of companies
test_companies = [
    ('AAPL', 'Apple - Tech Giant'),
    ('MSFT', 'Microsoft - Different fiscal year end'),
    ('WMT', 'Walmart - Retail with Jan year end'),
    ('BAC', 'Bank of America - Financial institution'),
    ('JNJ', 'Johnson & Johnson - Healthcare'),
    ('TSLA', 'Tesla - Newer company'),
    ('AMZN', 'Amazon - E-commerce'),
    ('XOM', 'Exxon - Energy sector'),
]

# Analyze each company; keep only those that returned data.
results = {}
for ticker, name in test_companies:
    result = analyze_company_periods(ticker, name)
    if result:
        results[ticker] = result

# Summary of potential issues
print("\n" + "="*60)
print("POTENTIAL ISSUES WITH OUR SOLUTION")
print("="*60)

print("\n1. DURATION THRESHOLD (>300 days):")
print(" Our fix assumes annual = >300 days")
print(" Potential issues:")
# Check for edge cases around 300 days (bucket labels carry the duration)
for ticker in results:
    for bucket in results[ticker]:
        if "Other" in bucket or "Semi-annual" in bucket:
            print(f" - {ticker} has unusual duration: {bucket}")

print("\n2. NO DURATION DATA:")
print(" Some facts might not have period_start")
# "No duration" is an exact bucket key (no duration suffix), so a plain
# membership test works here.
for ticker in results:
    if "No duration" in results[ticker]:
        count = len(results[ticker]["No duration"])
        print(f" - {ticker}: {count} facts without duration")

print("\n3. FISCAL YEAR VARIATIONS:")
print(" Companies have different fiscal year ends:")
fiscal_year_ends = {
    'AAPL': 'September',
    'MSFT': 'June',
    'WMT': 'January',
    'BAC': 'December',
    'JNJ': 'December',
    'TSLA': 'December',
    'AMZN': 'December',
    'XOM': 'December'
}
for ticker, month in fiscal_year_ends.items():
    print(f" - {ticker}: Fiscal year ends in {month}")

print("\n4. MULTI-YEAR FACTS:")
print(" Some companies might report multi-year aggregates")
# BUG FIX: multi-year bucket labels are f"Multi-year (~{d} days)", so the
# previous exact-key test ("Multi-year" in results[ticker]) never matched
# and the follow-up indexing would have raised KeyError. Match by prefix.
for ticker in results:
    multi_year_buckets = [b for b in results[ticker] if b.startswith("Multi-year")]
    if multi_year_buckets:
        count = sum(len(results[ticker][b]) for b in multi_year_buckets)
        print(f" - {ticker}: {count} multi-year facts found")

print("\nRECOMMENDATIONS:")
print("1. The 300-day threshold works for most companies")
print("2. Consider 350-380 days as 'normal' annual range")
print("3. Handle edge cases:")
print(" - No duration: Could check fiscal_period or use other heuristics")
print(" - Multi-year: Filter out (duration > 400)")
print(" - Semi-annual: Rare but should be filtered for annual=True")

View File

@@ -0,0 +1,145 @@
#!/usr/bin/env python3
"""
Test specific header detection logic on the target table rows.
"""
import sys
sys.path.insert(0, '/Users/dwight/PycharmProjects/edgartools')
import re
from edgar.documents.parser import HTMLParser
from edgar.documents.config import ParserConfig
from edgar.documents.table_nodes import TableNode
def test_header_detection_logic():
    """Score the first rows of the MSFT target table with the header heuristic.

    Re-implements the header-detection scoring (year patterns, per-cell year
    and date-phrase counts, financial header phrases, period keywords) and
    prints the score and verdict for each of the first 7 rows. Row 5
    (index 4) is the expected header row.

    Returns:
        The target TableNode on success, None on failure.
    """
    print("🔍 TESTING SPECIFIC HEADER DETECTION LOGIC")
    print("=" * 50)
    try:
        with open('/Users/dwight/PycharmProjects/edgartools/data/html/MSFT.10-K.html', 'r') as f:
            html_content = f.read()

        # Parse document
        config = ParserConfig()
        parser = HTMLParser(config)
        document = parser.parse(html_content)

        # Find target table
        target_table = None

        def find_target(node):
            nonlocal target_table
            if isinstance(node, TableNode):
                try:
                    if "Weighted average outstanding shares" in node.text():
                        target_table = node
                        return
                except Exception:
                    # Tolerate tables whose text() raises; keep searching.
                    pass
            if hasattr(node, 'children'):
                for child in node.children:
                    find_target(child)

        find_target(document.root)

        if not target_table:
            print("❌ Target table not found")
            return

        print("✅ Found target table")
        print(f"Current status: {len(target_table.headers)} headers, {len(target_table.rows)} data rows")

        # Test our header detection logic on each of the first few rows
        print(f"\n🔧 TESTING HEADER DETECTION ON FIRST 7 ROWS:")
        for i, row in enumerate(target_table.rows[:7]):
            print(f"\n--- ROW {i+1} ---")

            # Get the row text
            row_text = ' '.join(cell.text().strip() for cell in row.cells)
            print(f"Row text: '{row_text}'")

            # Accumulate a score with the reasons that contributed to it
            score = 0
            reasons = []

            # 1. Check for year patterns in the combined text
            year_pattern = r'\b(19\d{2}|20\d{2})\b'
            years_found = re.findall(year_pattern, row_text)
            if len(years_found) >= 2:
                # Skip "Total ..." summary rows even when they contain years
                if 'total' not in row_text.lower()[:20]:
                    score += 3
                    reasons.append(f"Multiple years found: {years_found}")

            # 2. Enhanced year detection - check individual cells
            year_cells = 0
            date_phrases = 0
            cell_contents = []
            for cell in row.cells:
                cell_text = cell.text().strip()
                cell_contents.append(f"'{cell_text}'")
                if cell_text:
                    # Check for individual years
                    if re.match(r'^\s*(19\d{2}|20\d{2})\s*$', cell_text):
                        year_cells += 1
                    # Check for date phrases
                    elif 'june 30' in cell_text.lower() or 'december 31' in cell_text.lower():
                        date_phrases += 1
            print(f"Cell contents: {cell_contents[:5]}{'...' if len(cell_contents) > 5 else ''}")
            print(f"Year cells: {year_cells}, Date phrases: {date_phrases}")
            if year_cells >= 2 or (year_cells >= 1 and date_phrases >= 1):
                if 'total' not in row_text.lower()[:20]:
                    score += 4
                    reasons.append(f"Enhanced year detection: {year_cells} year cells, {date_phrases} date phrases")

            # 3. Check for financial header patterns
            row_text_lower = row_text.lower()
            financial_patterns = [
                r'year\s+ended\s+(june|december|march|september)',
                r'(three|six|nine|twelve)\s+months?\s+ended',
                r'\(in\s+(millions|thousands|billions)\)',
                r'fiscal\s+year\s+ended'
            ]
            for pattern in financial_patterns:
                if re.search(pattern, row_text_lower):
                    score += 2
                    reasons.append(f"Financial pattern: {pattern}")

            # 4. Check for period indicators
            period_keywords = ['quarter', 'q1', 'q2', 'q3', 'q4', 'month',
                               'january', 'february', 'march', 'april', 'may', 'june',
                               'july', 'august', 'september', 'october', 'november', 'december',
                               'ended', 'three months', 'six months', 'nine months']
            matching_keywords = [kw for kw in period_keywords if kw in row_text_lower]
            if matching_keywords:
                score += 1
                reasons.append(f"Period keywords: {matching_keywords}")

            print(f"HEADER SCORE: {score}")
            if reasons:
                print(f"Reasons: {', '.join(reasons)}")

            # Determine if this should be considered a header
            should_be_header = score >= 3
            print(f"SHOULD BE HEADER: {'YES' if should_be_header else 'NO'}")

            if should_be_header and i == 4:  # Row 5 (index 4) is our expected header
                print("🎯 This matches our expected header row!")
            elif should_be_header:
                print("⚠️ This would be detected as a header but wasn't expected")
            elif i == 4:
                print("❌ This should be the header row but isn't being detected!")

        return target_table
    except Exception as e:
        print(f"❌ Error: {e}")
        import traceback
        traceback.print_exc()
        return None
if __name__ == "__main__":
    # Entry point: run the header-scoring diagnostic (return value unused).
    test_header_detection_logic()

View File

@@ -0,0 +1,98 @@
"""
Verify the fiscal year pattern across companies
"""
from edgar import Company
def check_fiscal_year_pattern(ticker, name):
    """Check the relationship between fiscal_year and period_end.year.

    Pulls annual-duration FY revenue facts for 2019-2024, deduplicates
    them, and reports the set of (fiscal_year - period_end.year) offsets
    observed. Returns that set (empty on error).
    """
    print(f"\n{name} ({ticker}):")
    print("-" * 40)
    try:
        company = Company(ticker)
        facts = company.facts._facts

        # Keep only annual (300-400 day) FY revenue facts, with guard clauses.
        annual_facts = []
        for fact in facts:
            if fact.statement_type != 'IncomeStatement' or fact.fiscal_period != 'FY':
                continue
            if not fact.fiscal_year or not (2019 <= fact.fiscal_year <= 2024):
                continue
            if 'Revenue' not in str(fact.concept):
                continue
            if not (fact.period_start and fact.period_end):
                continue
            span_days = (fact.period_end - fact.period_start).days
            if 300 < span_days < 400:  # Annual only
                annual_facts.append({
                    'fiscal_year': fact.fiscal_year,
                    'period_end': fact.period_end,
                    'period_end_year': fact.period_end.year,
                    'difference': fact.fiscal_year - fact.period_end.year
                })

        # Deduplicate on (fiscal_year, period_end) — last occurrence wins.
        unique_facts = {(f['fiscal_year'], f['period_end']): f for f in annual_facts}

        # Collect the distinct year offsets.
        differences = {f['difference'] for f in unique_facts.values()}
        print(f" Fiscal Year vs Period End Year differences: {sorted(differences)}")

        # Show examples, most recent fiscal years first.
        print("\n Examples:")
        recent = sorted(unique_facts.values(), key=lambda x: x['fiscal_year'], reverse=True)
        for f in recent[:5]:
            print(f" FY {f['fiscal_year']} → ends {f['period_end']} (diff: {f['difference']} years)")

        # What's the consistent pattern?
        if len(differences) == 1:
            diff = next(iter(differences))
            print(f"\n ✓ Consistent pattern: fiscal_year = period_end.year + {diff}")
        else:
            print(f"\n ⚠️ Multiple patterns found: {differences}")
        return differences
    except Exception as e:
        print(f" Error: {e}")
        return set()
# Test various companies
companies = [
    ('AAPL', 'Apple (Sept year-end)'),
    ('MSFT', 'Microsoft (June year-end)'),
    ('WMT', 'Walmart (Jan year-end)'),
    ('AMZN', 'Amazon (Dec year-end)'),
    ('JNJ', 'J&J (Dec year-end)'),
    ('TSLA', 'Tesla (Dec year-end)'),
]

# Union of every offset pattern observed across all companies.
all_differences = set()
for symbol, label in companies:
    all_differences |= check_fiscal_year_pattern(symbol, label)

print("\n" + "="*60)
print("CONCLUSION")
print("="*60)

if len(all_differences) == 1:
    diff = next(iter(all_differences))
    print(f"\n✓ ALL companies show the same pattern:")
    print(f" fiscal_year = period_end.year + {diff}")
    print("\nThis appears to be how the SEC Facts API structures the data!")
    print("The 'fiscal_year' field indicates when the data was filed/reported,")
    print("not the actual year of the fiscal period.")
else:
    print(f"\n⚠️ Different companies show different patterns: {all_differences}")
    print("The most common pattern seems to be a 2-year difference.")

print("\nIMPLICATION FOR OUR FIX:")
print("We should NOT require fiscal_year == period_end.year")
print("Instead, we should:")
print("1. Use duration (>300 days) as the primary filter")
print("2. Match facts where fiscal_year is within 0-3 years of period_end.year")
print("3. Deduplicate by keeping the latest period_end for each actual year")