# edgartools/chart_generator.py

#!/usr/bin/env python3

import os
import sys
import subprocess
import matplotlib.pyplot as plt

# Refuse to run outside a virtual environment: VIRTUAL_ENV is unset (or empty)
# when no venv has been activated in the calling shell.
if not os.environ.get('VIRTUAL_ENV'):
    print("Virtual environment is not activated.")
    print("To activate: . venv/bin/activate")
    print("Then run: python chart_generator.py <TICKER>")
    # sys.exit() is always available; the bare exit() helper is injected by the
    # site module for interactive use and is missing under `python -S`.
    sys.exit(1)

import pandas as pd
from edgar import Company, set_identity, set_local_storage_path, use_local_storage, XBRL
from bs4 import BeautifulSoup
import re

# Set your identity (required by SEC). A value supplied through the
# EDGAR_IDENTITY environment variable takes precedence; the placeholder address
# is only the fallback so existing invocations keep working unchanged.
set_identity(os.environ.get("EDGAR_IDENTITY") or "your.email@example.com")

# Enable local storage for caching filings.
# The cache directory is resolved relative to the current working directory.
LOCAL_STORAGE_PATH = os.path.abspath("./edgar_cache")
os.makedirs(LOCAL_STORAGE_PATH, exist_ok=True)
use_local_storage(LOCAL_STORAGE_PATH)

def show_image(image_path):
    """Open *image_path* with the first working image viewer found on PATH.

    Each candidate viewer is looked up with ``shutil.which`` and, if present,
    run synchronously (the call returns once the viewer process exits). A
    viewer that cannot be started or that exits with a non-zero status is
    reported and the next candidate is tried. If none succeeds, a message is
    printed and the function returns normally.

    Args:
        image_path: Path of the image file handed to the viewer.

    Returns:
        None.
    """
    # Local import keeps this function self-contained; shutil.which avoids
    # depending on an external `which` binary, whose absence used to raise
    # FileNotFoundError outside of any try block.
    import shutil

    viewers = ['eog', 'feh', 'gthumb', 'gwenview', 'shotwell', 'display']  # Common Linux image viewers
    for viewer in viewers:
        if shutil.which(viewer) is None:
            continue
        try:
            result = subprocess.run([viewer, image_path])
        except (OSError, ValueError) as e:
            print(f"Failed to open with {viewer}: {e}")
            continue
        if result.returncode == 0:
            print(f"Displayed chart with {viewer}")
            return
        # A non-zero exit (e.g. no display available) means nothing was shown.
        print(f"Failed to open with {viewer}: exit code {result.returncode}")
    print("No suitable image viewer found. Chart saved but not displayed.")

def parse_20f_html(html_content, year):
    """Heuristically pull revenue, gross profit and net income out of filing HTML.

    The HTML is reduced to lower-cased plain text and, for each label
    ("net revenue", "gross profit", "net income"), the first amount that
    follows the label on the same line is taken.

    An amount is either a comma-grouped number (``1,234,567`` or ``12,345.6``)
    or a run of at least four digits (``123456``). The previous pattern
    required four or more *leading* digits, so comma-grouped figures were
    skipped and a later number (for instance a year) could be captured instead.

    Args:
        html_content: HTML markup of the filing.
        year: Unused; kept so existing callers keep working.

    Returns:
        Tuple ``(rev, gp, ni)`` of floats; an entry is ``None`` when no amount
        was found after the corresponding label.
    """
    text = BeautifulSoup(html_content, 'html.parser').get_text().lower()

    # Comma-grouped alternative first so "12,345" is taken whole; the second
    # alternative keeps accepting everything the old pattern matched.
    amount = r'(\d{1,3}(?:,\d{3})+(?:\.\d+)?|\d{4,}(?:,\d{3})*(?:\.\d+)?)'

    def first_amount_after(label):
        # Lazy ".*?" (no DOTALL) picks the nearest amount on the label's line.
        match = re.search(re.escape(label) + r'.*?' + amount, text)
        if match is None:
            return None
        return float(match.group(1).replace(',', ''))

    rev = first_amount_after('net revenue')
    gp = first_amount_after('gross profit')
    ni = first_amount_after('net income')
    return rev, gp, ni

def extract_number(text):
    """Parse a financial figure embedded in *text* into a float.

    Everything except digits, commas, dots, minus signs and parentheses is
    discarded. A value wrapped in parentheses is treated as negative
    (accounting notation) and thousands separators are removed.

    Args:
        text: String containing the number, e.g. ``"$1,234.56"`` or
            ``"(1,234)"``. ``None`` is accepted and yields ``None``.

    Returns:
        The parsed float, or ``None`` when no valid number remains.
    """
    if text is None:
        return None
    cleaned = re.sub(r'[^\d,.\-\(\)]', '', text)
    if '(' in cleaned and ')' in cleaned:
        # Accounting style: (123) means -123.
        cleaned = '-' + cleaned.replace('(', '').replace(')', '')
    cleaned = cleaned.replace(',', '')
    try:
        return float(cleaned)
    except ValueError:
        # Only a failed conversion is expected here; anything else should surface.
        return None

# XBRL concepts tried, in priority order, when looking for revenue facts.
_REVENUE_ELEMENTS = (
    'us-gaap:RevenueFromContractWithCustomerExcludingAssessedTax',
    'us-gaap:Revenues',
    'us-gaap:SalesRevenueNet',
)
# Gross-profit concepts; only the local name (after the colon) is matched, as a substring.
_GROSS_PROFIT_ELEMENTS = ('us-gaap:GrossProfit', 'us-gaap:GrossMargin')


def _quarter_key(period_end):
    """Return a label such as ``2023-Q4`` for the quarter containing *period_end*."""
    return pd.to_datetime(period_end).to_period('Q').strftime('%Y-Q%q')


def _year_key(period_end):
    """Return the year of *period_end* as a string, e.g. ``"2023"``."""
    return str(pd.to_datetime(period_end).year)


def _format_billions(val):
    """Format a value expressed in billions as ``$1.2B`` or, below one billion, ``$340M``."""
    if abs(val) >= 1:
        return f'${val:.1f}B'
    return f'${val*1000:.0f}M'


def _print_element_overview(df):
    """Print diagnostic listings of the fact columns and metric-related element ids."""
    print(f"df columns: {list(df.columns)}")
    element_ids = df['element_id'].unique()
    related = sorted(
        e for e in element_ids
        if 'profit' in e.lower() or 'revenue' in e.lower() or 'income' in e.lower()
    )
    print(f"  Available elements: {related}")
    print(f"  Gross elements: {sorted(e for e in element_ids if 'Gross' in e)}")
    print(f"  Cost elements: {sorted(e for e in element_ids if 'Cost' in e)}")


def _extract_filing_metrics(df, make_period_key):
    """Extract revenue, gross profit and net income from one filing's facts.

    Args:
        df: Facts DataFrame; the ``element_id``, ``period_end`` and ``value``
            columns are read.
        make_period_key: Callable turning the latest revenue ``period_end``
            into the label used as dictionary key (quarter or year).

    Returns:
        Tuple ``(period_key, rev_val, gp_val, ni_val)``. ``period_key`` is
        ``None`` when no revenue fact was found; each value is ``None`` when
        the corresponding metric was not found.
    """
    period_end = period_key = rev_val = gp_val = ni_val = None

    revenues = pd.DataFrame()
    for elem in _REVENUE_ELEMENTS:
        candidate = df[df['element_id'] == elem]
        if not candidate.empty:
            revenues = candidate
            print(f"    Found revenue element: {elem}")
            break

    if not revenues.empty:
        period_end = revenues['period_end'].max()
        period_key = make_period_key(period_end)
        # NOTE(review): max() runs over every fact of the element, whatever its
        # period or dimensions; presumably a year-to-date or prior-period figure
        # can win over the current-period one - confirm against real filings.
        rev_val = pd.to_numeric(revenues['value'], errors='coerce').max()
        print(f"    Revenue for {period_key}: {rev_val}")

    # Restrict the other metrics to the revenue period when one is known.
    in_period = None if period_end is None else (df['period_end'] == period_end)

    gp_found = False
    for elem in _GROSS_PROFIT_ELEMENTS:
        # na=False keeps the mask strictly boolean even if element_id has gaps.
        mask = df['element_id'].str.contains(elem.split(':')[1], regex=False, na=False)
        if in_period is not None:
            candidate = df[mask & in_period]
            print(f"    Checking {elem} for period {period_end}: {candidate.shape}")
        else:
            candidate = df[mask]
            print(f"    Checking {elem}: {candidate.shape}")
        if not candidate.empty:
            print(f"    Found gross profit element: {elem}")
            # NOTE(review): sum() here vs max() for revenue/net income is kept
            # from the original; it may double count duplicate facts - confirm.
            gp_val = pd.to_numeric(candidate['value'], errors='coerce').sum()
            gp_found = True
            break

    # If no direct GP found, try calculating from Revenue - COGS
    if not gp_found and in_period is not None and pd.notna(rev_val):
        cogs = df[(df['element_id'] == 'us-gaap:CostOfGoodsAndServicesSold') & in_period]
        if not cogs.empty:
            cogs_val = pd.to_numeric(cogs['value'], errors='coerce').max()
            if pd.notna(cogs_val):
                gp_val = rev_val - cogs_val
                gp_found = True
                print(f"    Calculated Gross Profit from Revenue - COGS: {gp_val}")

    ni_mask = df['element_id'] == 'us-gaap:NetIncomeLoss'
    if in_period is not None:
        ni_mask = ni_mask & in_period
    net_incomes = df[ni_mask]

    print(f"    Revenues found: {not revenues.empty}, Gross Profit: {gp_found}, Net Income: {not net_incomes.empty}")

    if gp_found:
        print(f"    Gross Profit for {period_key}: {gp_val}")
    else:
        print(f"    Gross Profit not found for {period_key}")
    if not net_incomes.empty:
        ni_val = pd.to_numeric(net_incomes['value'], errors='coerce').max()
        print(f"    Net Income for {period_key}: {ni_val}")

    return period_key, rev_val, gp_val, ni_val


def _record_metrics(period_key, values, stores):
    """Store each non-``None`` value under *period_key* in its matching dict."""
    if not period_key:
        return
    for value, store in zip(values, stores):
        if value is not None:
            store[period_key] = value


def _save_metrics_chart(title, xlabel, period_labels, rev, gp, ni, chart_file):
    """Draw the grouped-bar / margin-line chart and save it to *chart_file*.

    Args:
        title: Chart title.
        xlabel: Label of the x axis.
        period_labels: Tick labels, one per period, in plotting order.
        rev, gp, ni: Series of revenue, gross profit and net income indexed by
            ``period_labels`` with missing values already filled.
        chart_file: Output image path; its directory is created if needed.
    """
    # Zero revenue is treated as missing so the margin becomes 0 instead of +/-inf.
    rev_nonzero = rev.where(rev != 0)
    gross_margin = (gp / rev_nonzero * 100).fillna(0)
    net_margin = (ni / rev_nonzero * 100).fillna(0)

    rev_b = rev.values / 1e9
    gp_b = gp.values / 1e9
    ni_b = ni.values / 1e9

    fig, ax1 = plt.subplots(figsize=(12, 8))

    # Bars for Revenue, Gross Profit, Net Income
    x = range(len(period_labels))
    width = 0.25
    bar_groups = [
        ax1.bar([i - width for i in x], rev_b, width, label='Revenue', color='blue'),
        ax1.bar(x, gp_b, width, label='Gross Profit', color='orange'),
        ax1.bar([i + width for i in x], ni_b, width, label='Net Income', color='green'),
    ]

    # Add value labels on bars, lifted by 1% of the largest revenue bar.
    label_offset = max(rev_b) * 0.01
    for bars in bar_groups:
        for bar in bars:
            height = bar.get_height()
            ax1.text(bar.get_x() + bar.get_width() / 2., height + label_offset,
                     _format_billions(height), ha='center', va='bottom', fontsize=8)

    ax1.set_xlabel(xlabel)
    ax1.set_ylabel('Billions USD')
    ax1.set_title(title)
    ax1.set_xticks(x)
    ax1.set_xticklabels(period_labels)
    ax1.legend(loc='upper left')
    ax1.grid(axis='y')

    # Secondary Y-axis for margins
    ax2 = ax1.twinx()
    ax2.plot(x, gross_margin.values, label='Gross Margin %', color='red', marker='o', linestyle='-')
    ax2.plot(x, net_margin.values, label='Net Margin %', color='purple', marker='s', linestyle='-')
    ax2.set_ylabel('Profit Margin (%)')
    ax2.legend(loc='upper right')

    plt.tight_layout()
    # savefig does not create missing directories.
    os.makedirs(os.path.dirname(chart_file) or '.', exist_ok=True)
    fig.savefig(chart_file)
    plt.close(fig)  # release the figure; nothing draws on it afterwards


def generate_charts(ticker):
    """Build quarterly and yearly financial charts for *ticker*.

    Revenue, gross profit and net income are collected from the XBRL facts of
    the company's latest 10-Q filings (up to 20) and 10-K filings (up to 5).
    Two PNG files are written under ``charts/`` and handed to ``show_image``:
    ``<ticker>_chart.png`` (quarterly) and ``<ticker>_yearly_chart.png``.

    The function prints progress and diagnostics and returns early, without
    raising, when the company or its filings are missing or when no quarterly
    revenue / net income could be extracted. The yearly chart is only produced
    when yearly revenue or net income is available.

    Args:
        ticker: Ticker symbol passed to ``Company``.

    Returns:
        None.
    """
    print(f"Generating charts for {ticker}...")

    company = Company(ticker)
    if company.not_found:
        print(f"Company {ticker} not found.")
        return

    # Get last 20 10-Q filings for quarterly data
    filings_10q = company.get_filings(form="10-Q", amendments=False).head(20)
    # Get last 5 10-K filings for yearly data
    filings_10k = company.get_filings(form="10-K", amendments=False).head(5)

    if not filings_10q:
        print("No 10-Q filings found.")
        return
    if not filings_10k:
        print("No 10-K filings found.")
        return

    # Collect data from each filing
    rev_dict_quarterly = {}
    gp_dict_quarterly = {}
    ni_dict_quarterly = {}
    rev_dict_yearly = {}
    gp_dict_yearly = {}
    ni_dict_yearly = {}

    for filing in filings_10q:
        print(f"Processing filing {filing.accession_number} from {filing.filing_date}")
        try:
            xbrl = XBRL.from_filing(filing)
            if not xbrl:
                print("  No XBRL found")
                continue
            df = xbrl.to_pandas()['facts']
            _print_element_overview(df)

            quarter_key, rev_val, gp_val, ni_val = _extract_filing_metrics(df, _quarter_key)

            if ni_val is None:
                # Best-effort fallback: parse the filing text.
                # NOTE(review): the "year - 1" guess and the 20-F parser are
                # applied to a 10-Q here - confirm this is intended.
                html_content = filing.text()
                year = pd.to_datetime(filing.filing_date).year - 1  # Filing in next year
                parsed_rev, parsed_gp, parsed_ni = parse_20f_html(html_content, year)
                print(f"    Parsed 20-F: Rev {parsed_rev}, GP {parsed_gp}, NI {parsed_ni} for {year}")
                # Only fill gaps: values already taken from XBRL must not be
                # replaced by (or lost to) the regex heuristics.
                if rev_val is None:
                    rev_val = parsed_rev
                if gp_val is None:
                    gp_val = parsed_gp
                ni_val = parsed_ni

            _record_metrics(
                quarter_key,
                (rev_val, gp_val, ni_val),
                (rev_dict_quarterly, gp_dict_quarterly, ni_dict_quarterly),
            )
        except Exception as e:
            # One broken filing must not abort the whole run.
            print(f"Error processing filing {filing.accession_number}: {e}")
            continue

    # Process 10-K filings for yearly data
    for filing in filings_10k:
        print(f"Processing filing {filing.accession_number} from {filing.filing_date}")
        try:
            xbrl = XBRL.from_filing(filing)
            if not xbrl:
                print("  No XBRL found")
                continue
            df = xbrl.to_pandas()['facts']
            _print_element_overview(df)

            year_key, rev_val, gp_val, ni_val = _extract_filing_metrics(df, _year_key)
            _record_metrics(
                year_key,
                (rev_val, gp_val, ni_val),
                (rev_dict_yearly, gp_dict_yearly, ni_dict_yearly),
            )
        except Exception as e:
            print(f"Error processing filing {filing.accession_number}: {e}")
            continue

    if not rev_dict_quarterly or not ni_dict_quarterly:
        print("Insufficient quarterly financial data across filings.")
        return

    print(f"Quarterly Rev dict: {rev_dict_quarterly}")
    print(f"Quarterly GP dict: {gp_dict_quarterly}")
    print(f"Quarterly NI dict: {ni_dict_quarterly}")

    # Convert to Series (explicit dtype so an empty dict still yields floats)
    rev_data = pd.Series(rev_dict_quarterly, dtype=float)
    gp_data = pd.Series(gp_dict_quarterly, dtype=float)
    ni_data = pd.Series(ni_dict_quarterly, dtype=float)

    # Get last 20 quarters
    all_quarters = sorted(
        set(rev_data.index) | set(gp_data.index) | set(ni_data.index),
        key=lambda label: pd.Period(label, 'Q'),
    )
    quarters = all_quarters[-20:]
    rev_data = rev_data.reindex(quarters).fillna(0)
    gp_data = gp_data.reindex(quarters).fillna(0)
    ni_data = ni_data.reindex(quarters).fillna(0)

    chart_file = f"charts/{ticker}_chart.png"
    _save_metrics_chart(
        f'{company.name} - Financial Metrics (Last 20 Quarters)',
        'Quarter', quarters, rev_data, gp_data, ni_data, chart_file,
    )
    print(f"Quarterly chart saved to {chart_file}")
    show_image(chart_file)

    # Yearly chart
    if not (rev_dict_yearly or ni_dict_yearly):
        # Nothing to plot; previously the quarterly figure was re-saved here.
        print("Insufficient yearly financial data; yearly chart skipped.")
        return

    print(f"Yearly Rev dict: {rev_dict_yearly}")
    print(f"Yearly GP dict: {gp_dict_yearly}")
    print(f"Yearly NI dict: {ni_dict_yearly}")

    rev_data_yearly = pd.Series(rev_dict_yearly, dtype=float)
    gp_data_yearly = pd.Series(gp_dict_yearly, dtype=float)
    ni_data_yearly = pd.Series(ni_dict_yearly, dtype=float)

    # Get last 5 years; keys are year strings, so order and pad numerically.
    all_years = sorted(
        set(rev_data_yearly.index) | set(gp_data_yearly.index) | set(ni_data_yearly.index),
        key=int,
    )
    if len(all_years) < 5:
        # Pad with the preceding years so the chart always shows five slots.
        first_year = int(all_years[0])
        missing = 5 - len(all_years)
        years = [str(y) for y in range(first_year - missing, first_year)] + all_years
    else:
        years = all_years[-5:]
    rev_data_yearly = rev_data_yearly.reindex(years).fillna(0)
    gp_data_yearly = gp_data_yearly.reindex(years).fillna(0)
    ni_data_yearly = ni_data_yearly.reindex(years).fillna(0)

    chart_file_yearly = f"charts/{ticker}_yearly_chart.png"
    _save_metrics_chart(
        f'{company.name} - Financial Metrics (Last 5 Years)',
        'Fiscal Year', years, rev_data_yearly, gp_data_yearly, ni_data_yearly,
        chart_file_yearly,
    )
    print(f"Yearly chart saved to {chart_file_yearly}")
    show_image(chart_file_yearly)

if __name__ == "__main__":
    # Ticker comes from the first command-line argument when given,
    # otherwise the user is prompted for it; either way it is upper-cased.
    cli_args = sys.argv[1:]
    ticker = cli_args[0].upper() if cli_args else input("Enter ticker: ").strip().upper()

    generate_charts(ticker)