Initial commit

This commit is contained in:
kdusek
2025-12-09 12:13:01 +01:00
commit 8e654ed209
13332 changed files with 2695056 additions and 0 deletions

View File

@@ -0,0 +1,4 @@
# SPDX-FileCopyrightText: 2022-present Dwight Gunning <dgunning@gmail.com>
#
# SPDX-License-Identifier: MIT
# Version string of the package.
__version__ = '4.25.0'

View File

@@ -0,0 +1,188 @@
# SPDX-FileCopyrightText: 2022-present Dwight Gunning <dgunning@gmail.com>
#
# SPDX-License-Identifier: MIT
import re
from functools import lru_cache, partial
from typing import List, Optional, Union
from edgar._filings import Attachment, Attachments, Filing, FilingHeader, FilingHomepage, Filings, get_by_accession_number, get_by_accession_number_enriched, get_filings
from edgar.core import CAUTION, CRAWL, NORMAL, edgar_mode, get_identity, listify, set_identity
from edgar.current_filings import CurrentFilings, get_all_current_filings, get_current_filings, iter_current_filings_pages
from edgar.entity import (
Company,
CompanyData,
CompanyFiling,
CompanyFilings,
CompanySearchResults,
Entity,
EntityData,
find_company,
get_cik_lookup_data,
get_company_facts,
get_company_tickers,
get_entity,
get_entity_submissions,
get_icon_from_ticker,
get_ticker_to_cik_lookup,
)
from edgar.files import detect_page_breaks, mark_page_breaks
from edgar.files.html import Document
from edgar.financials import Financials, MultiFinancials
from edgar.funds import FundClass, FundCompany, FundSeries, find_fund
from edgar.funds.reports import NPORT_FORMS, FundReport
from edgar.storage import download_edgar_data, download_filings, is_using_local_storage, set_local_storage_path, use_local_storage
from edgar.storage_management import (
StorageAnalysis,
StorageInfo,
analyze_storage,
availability_summary,
check_filing,
check_filings_batch,
cleanup_storage,
clear_cache,
optimize_storage,
storage_info,
)
from edgar.thirteenf import THIRTEENF_FORMS, ThirteenF
from edgar.xbrl import XBRL
# Fix for Issue #457: Clear locale-corrupted cache files on first import
# This is a one-time operation that only runs if the marker file doesn't exist
try:
    # The import lives inside the try block so that a failure to import the
    # helper is swallowed together with any failure of the cleanup itself.
    from edgar.httpclient import clear_locale_corrupted_cache
    clear_locale_corrupted_cache()
except Exception:
    # Silently continue if cache clearing fails - it's not critical
    pass
# Alternative names for get_current_filings; all three are bound to the same function object
get_latest_filings = get_current_filings
latest_filings = get_current_filings
current_filings = get_current_filings
# The helpers below are get_filings with the `form` keyword pre-bound via functools.partial.
# Fund portfolio report filings
get_fund_portfolio_filings = partial(get_filings, form=NPORT_FORMS)
# Restricted stock sales
get_restricted_stock_filings = partial(get_filings, form=[144])
# Insider transaction filings
get_insider_transaction_filings = partial(get_filings, form=[3, 4, 5])
# 13F filings - portfolio holdings
get_portfolio_holding_filings = partial(get_filings, form=THIRTEENF_FORMS)
@lru_cache(maxsize=16)
def find(search_id: Union[str, int]) -> Optional[Union[Filing, Entity, CompanySearchResults, FundCompany, FundClass, FundSeries]]:
    """Resolve an identifier to the matching object.

    The identifier is checked against the rules below in order and the first
    rule that applies decides the result:

    - an ``int``                                   -> ``Entity(search_id)``
    - dashed accession number (10-2-6 digits)      -> enriched filing lookup
    - 18 digits (accession number without dashes)  -> dashes are inserted, then enriched filing lookup
    - 4 to 10 digits, or 1-5 letters A-W/Y/Z with an optional
      ``.X`` / ``-X`` suffix                       -> ``Entity(search_id)``
    - four letters followed by ``X``, or ``C``/``S`` followed
      by digits                                    -> ``find_fund(search_id)``
    - six or more digits followed by a dash        -> ``None`` (probably an invalid accession number)
    - anything else                                -> ``find_company(search_id)``

    Results are memoised by ``lru_cache`` (16 entries).

    :param search_id: the identifier to resolve
    :return: the object selected by the first matching rule, or ``None``
    """
    # Integers never go through the string patterns
    if isinstance(search_id, int):
        return Entity(search_id)

    # Accession number written with dashes
    if re.match(r"\d{10}-\d{2}-\d{6}", search_id):
        return get_by_accession_number_enriched(search_id)

    # Accession number with no dashes: rebuild the 10-2-6 form first
    if re.match(r"^\d{18}$", search_id):
        dashed = "-".join((search_id[:10], search_id[10:12], search_id[12:]))
        return get_by_accession_number_enriched(dashed)

    # Digit-only identifier or ticker (including dot or hyphenated)
    if re.match(r"\d{4,10}$", search_id) or re.match(r"^[A-WYZ]{1,5}([.-][A-Z])?$", search_id):
        return Entity(search_id)

    # Mutual fund ticker, or class/contract/series style id
    if re.match(r"^[A-Z]{4}X$", search_id) or re.match(r"^[CS]\d+$", search_id):
        return find_fund(search_id)

    # Probably an invalid accession number
    if re.match(r"^\d{6,}-", search_id):
        return None

    return find_company(search_id)
def matches_form(sec_filing: Filing,
                 form: Union[str, List[str]]) -> bool:
    """Tell whether a filing's form is one of the given forms or their ``/A`` variants.

    :param sec_filing: the filing whose ``form`` attribute is tested
    :param form: a single form or a list of forms; normalised with ``listify``
    :return: True when ``sec_filing.form`` equals one of the forms or that form followed by ``/A``
    """
    accepted = listify(form)
    amended = [f"{name}/A" for name in accepted]
    return sec_filing.form in accepted + amended
class DataObjectException(Exception):
    """Error carrying a message that names the form and accession number of a filing."""

    def __init__(self, filing: Filing):
        # Build the text once; it is stored on the instance and also passed to Exception
        message = f"Could not create a data object for Form {filing.form} filing: {filing.accession_no}"
        self.message = message
        super().__init__(message)
def obj(sec_filing: Filing) -> Optional[object]:
    """
    Build the data object that corresponds to the form of a filing.

    Forms are tested in a fixed order with ``matches_form`` (so ``/A``
    variants are accepted too). Forms whose object is built from the
    filing's XML only return that object when ``sec_filing.xml()`` is
    truthy; otherwise, and for every form not listed, the result of
    ``sec_filing.xbrl()`` is returned when it is truthy.

    :param sec_filing: The filing
    :return: the form-specific data object, the filing's XBRL, or None
    """
    from edgar.company_reports import CurrentReport, EightK, TenK, TenQ, TwentyF
    from edgar.effect import Effect
    from edgar.form144 import Form144
    from edgar.muniadvisors import MunicipalAdvisorForm
    from edgar.offerings import FormC, FormD
    from edgar.ownership import Form3, Form4, Form5, Ownership

    # Forms whose data object is built straight from the filing
    if matches_form(sec_filing, "6-K"):
        return CurrentReport(sec_filing)
    if matches_form(sec_filing, "8-K"):
        return EightK(sec_filing)
    if matches_form(sec_filing, "10-Q"):
        return TenQ(sec_filing)
    if matches_form(sec_filing, "10-K"):
        return TenK(sec_filing)
    if matches_form(sec_filing, "20-F"):
        return TwentyF(sec_filing)
    if matches_form(sec_filing, THIRTEENF_FORMS):
        # ThirteenF can work with either XML (2013+) or TXT (2012 and earlier) format
        return ThirteenF(sec_filing)
    if matches_form(sec_filing, "144"):
        return Form144.from_filing(sec_filing)
    if matches_form(sec_filing, "MA-I"):
        return MunicipalAdvisorForm.from_filing(sec_filing)

    # Forms whose data object is parsed from the filing's XML, in test order
    xml_backed = (
        ("3", lambda doc: Form3(**Ownership.parse_xml(doc))),
        ("4", lambda doc: Form4(**Ownership.parse_xml(doc))),
        ("5", lambda doc: Form5(**Ownership.parse_xml(doc))),
        ("EFFECT", lambda doc: Effect.from_xml(doc)),
        ("D", lambda doc: FormD.from_xml(doc)),
        (["C", "C-U", "C-AR", "C-TR"], lambda doc: FormC.from_xml(doc, form=sec_filing.form)),
    )
    for form_spec, build in xml_backed:
        if matches_form(sec_filing, form_spec):
            xml = sec_filing.xml()
            if xml:
                return build(xml)
            # Form matched but there is no XML: stop testing forms and fall back to XBRL
            break
    else:
        # Only reached when none of the XML-backed forms matched
        if matches_form(sec_filing, ["NPORT-P", "NPORT-EX"]):
            return FundReport.from_filing(sec_filing)

    # Fallback for everything that did not return above
    filing_xbrl = sec_filing.xbrl()
    return filing_xbrl if filing_xbrl else None

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,127 @@
import re
from rich import box
from rich.console import Console, Group
from rich.markdown import Markdown
from rich.panel import Panel
from rich.table import Table
from edgar.files.html_documents import HtmlDocument
from edgar.richtools import repr_rich
# Names exported by `from <module> import *`.
__all__ = [
    'convert_table',
    'MarkdownContent',
    'markdown_to_rich',
    'html_to_markdown',
    "fix_markdown",
    "text_to_markdown",
]
def _empty(row):
    """Return True when a markdown table row carries no cell content.

    A row counts as empty when it is falsy, consists of whitespace only, or
    - once all whitespace is removed - contains nothing but ``|`` characters
    or nothing but ``|`` and ``-`` characters (a separator row).

    :param row: one line of table markdown (may be ``None`` or ``""``)
    :return: True for empty/separator rows, False otherwise
    """
    if not row:
        return True
    chars = set(re.sub(r"\s", "", row))
    # An empty set means the row held only whitespace; previously such a row
    # was reported as non-empty and ended up as a blank table row.
    return not chars or chars == {'|'} or chars == {'-', '|'}
def convert_table(table_markdown: str):
    """Build a rich ``Table`` from pipe-delimited table markdown.

    Every ``"| |"`` in the input is first replaced by ``"|\\n|"`` and the
    text is then split on newlines. Rows for which ``_empty`` is true are
    skipped; for the rest the first and last character are dropped, the
    remainder is split on ``|`` and each stripped cell is added to the table.

    :param table_markdown: the markdown of a single table
    :return: a ``Table`` using ``box.SIMPLE``
    """
    lines = table_markdown.replace("| |", "|\n|").split("\n")
    # Just output a simple table with no headers: the single header passed is
    # a run of spaces as long as the number of pipes in the first line
    blank_header = " " * lines[0].count("|")
    table = Table(blank_header, box=box.SIMPLE)
    for line in lines:
        if _empty(line):
            continue
        cells = [cell.strip() for cell in line[1:-1].strip().split("|")]
        table.add_row(*cells)
    return table
# Literal tag strings. NOTE(review): not referenced anywhere in the visible part of this module - confirm it is still used.
skip_tags = ["<DOCUMENT>", "<TYPE>", "<SEQUENCE>", "<FILENAME>", "<DESCRIPTION>", "<TEXT>"]
def markdown_to_rich(md: str, title: str = "") -> Panel:
    """Convert the markdown to rich .. handling tables better than rich

    The text is scanned line by line. A line containing ``"| |"`` starts a
    table; following lines belong to that table until a blank line is seen.
    Text outside tables is rendered with ``Markdown`` and each table with
    ``convert_table``.

    A table that runs up to the end of the input (no trailing blank line) is
    rendered as well; previously it was silently dropped.

    :param md: the markdown text
    :param title: used as both the title and the subtitle of the panel
    :return: a ``Panel`` grouping the rendered pieces in document order
    """
    content = []
    text_lines = []
    table_lines = []
    in_table = False

    def _joined(lines):
        # Same shape as the original buffers: every line followed by a newline
        return "".join(line + "\n" for line in lines)

    for line in md.split("\n"):
        if in_table:
            if line.strip():
                table_lines.append(line)
            else:
                # A blank line closes the table
                content.append(convert_table(_joined(table_lines)))
                table_lines = []
                in_table = False
        elif "| |" in line:
            # Flush the text gathered so far, then start collecting the table
            content.append(Markdown(_joined(text_lines)))
            text_lines = []
            table_lines = [line]
            in_table = True
        else:
            text_lines.append(line)

    # Input ended while still inside a table: render what was collected
    if in_table and table_lines:
        content.append(convert_table(_joined(table_lines)))
    if text_lines:
        content.append(Markdown(_joined(text_lines)))
    return Panel(Group(*content), title=title, subtitle=title, box=box.ROUNDED)
def fix_markdown(md: str):
    """Apply a fixed sequence of regex clean-ups to markdown text.

    The substitutions run in the order listed; each one works on the output
    of the previous one.

    :param md: the markdown text
    :return: the cleaned-up text
    """
    rewrites = (
        # Clean up issues with no spaces between sentences like "Condition.On"
        (r"([a-z]\.)([A-Z])", r"\1 \2", 0),
        # Remove asterisks inside Items
        (r"\*\*(Item)\*\*\xa0\*\*(\d)", r"\1 \2", re.IGNORECASE),
        # And fix split Item numbers e.g. "Item\n5.02"
        (r"(Item)[\n\xa0]\s?(\d)", r"\1 \2", re.IGNORECASE),
        # Fix items not on newlines e.g. ". Item 5.02"
        (r"\. (Item)\s?(\d.\d{,2})", r".\n \1 \2", re.IGNORECASE),
        # Fix items with no space before Item e.g. "ReservedItem 7"
        (r"(\S)(Item)\s?(\d.\d{,2})", r"\1\n\n \2 \3", re.IGNORECASE),
    )
    for pattern, replacement, flags in rewrites:
        md = re.sub(pattern, replacement, md, flags=flags)
    return md
def html_to_markdown(html: str) -> str:
    """Return the ``markdown`` attribute of the ``HtmlDocument`` built from ``html``.

    :param html: the html text
    :return: the document's markdown
    """
    return HtmlDocument.from_html(html).markdown
def text_to_markdown(text: str) -> str:
    """Wrap the text in a ``<pre>`` element, with a newline before and after.

    :param text: the text to wrap; inserted as-is
    :return: ``"\\n<pre>" + text + "</pre>\\n"``
    """
    return f"\n<pre>{text}</pre>\n"
class MarkdownContent:
    """Markdown text plus a title, renderable through rich."""

    def __init__(self, markdown: str, title: str = ""):
        self.md = markdown
        self.title = title

    @classmethod
    def from_html(cls, html: str, title: str = ""):
        """Alternate constructor: convert ``html`` with ``html_to_markdown`` first."""
        return cls(markdown=html_to_markdown(html), title=title)

    def view(self):
        """Print the rendered content on a new ``Console``."""
        Console().print(self.__rich__())

    def __rich__(self):
        # Rendering is delegated to markdown_to_rich
        return markdown_to_rich(self.md, title=self.title)

    def __repr__(self):
        return repr_rich(self.__rich__())

View File

@@ -0,0 +1,257 @@
from typing import Any, Dict, List, Optional
from bs4 import Tag
from pydantic import BaseModel
from rich.columns import Columns
from rich.console import Group
from rich.panel import Panel
from rich.table import Table
from rich.text import Text
from edgar.core import IntString
from edgar.richtools import repr_rich
from edgar.xmltools import child_text, child_value
# Names exported by `from <module> import *`.
# NOTE(review): the Contact class defined further down in this module is not listed - confirm that is intentional.
__all__ = [
    'Address',
    'Issuer',
    'Person',
    'Name',
    'Filer',
    'get_addresses_as_columns'
]
class Address(BaseModel):
    """A postal address; every field is optional."""

    street1: Optional[str] = None
    street2: Optional[str] = None
    city: Optional[str] = None
    state_or_country: Optional[str] = None
    state_or_country_description: Optional[str] = None
    zipcode: Optional[str] = None

    @property
    def empty(self):
        """True when street1, street2, city, state_or_country and zipcode are all falsy."""
        return not self.street1 and not self.street2 and not self.city and not self.state_or_country and not self.zipcode

    @classmethod
    def from_dict(cls, address_dict: Dict[str, Any]):
        """Build an address from a dict keyed by STREET1, STREET2, CITY, STATE and ZIP.

        Missing keys become ``None``. Uses ``cls`` so that subclasses get
        instances of their own type.
        """
        return cls(
            street1=address_dict.get('STREET1'),
            street2=address_dict.get('STREET2'),
            city=address_dict.get('CITY'),
            state_or_country=address_dict.get('STATE'),
            zipcode=address_dict.get('ZIP')
        )

    def __str__(self):
        # Without a first street line there is nothing to print
        if not self.street1:
            return ""
        address_format = "{street1}\n"
        if self.street2:
            address_format += "{street2}\n"
        address_format += "{city}, {state_or_country} {zipcode}"
        return address_format.format(
            street1=self.street1,
            street2=self.street2,
            city=self.city,
            # Prefer the description over the code when both are available
            state_or_country=self.state_or_country_description or self.state_or_country,
            zipcode=self.zipcode or ""
        )

    def __repr__(self):
        # The `or ""` for state_or_country now sits inside the braces; it used
        # to be outside, which printed the literal text ` or "` in the result.
        return (f'Address(street1="{self.street1 or ""}", street2="{self.street2 or ""}", city="{self.city or ""}",'
                f'zipcode="{self.zipcode or ""}", state_or_country="{self.state_or_country or ""}")'
                )
def get_addresses_as_columns(*,
                             mailing_address: Optional[Address],
                             business_address: Optional[Address]) -> Columns:
    """
    Lay out the mailing and business addresses as side-by-side panels.

    An address is left out when it is falsy or when its ``empty`` property
    is true. The mailing address, when shown, comes first.

    :param mailing_address: the mailing address, or None
    :param business_address: the business address, or None
    :return: a rich ``Columns`` holding zero, one or two panels
    """
    labelled = (
        ('\U00002709 Mailing Address', mailing_address),
        ('\U0001F3E2 Business Address', business_address),
    )
    panels = [
        Panel(Text(str(address)), title=heading, width=40)
        for heading, address in labelled
        if address and not address.empty
    ]
    return Columns(panels, equal=True, expand=True)
class Issuer:
    """
    The issuer described by a ``primaryIssuer`` element, for example:

    <primaryIssuer>
        <cik>0001961089</cik>
        <entityName>1685 38th REIT, L.L.C.</entityName>
        <issuerAddress>
            <street1>2029 CENTURY PARK EAST</street1>
            <street2>SUITE 1370</street2>
            <city>LOS ANGELES</city>
            <stateOrCountry>CA</stateOrCountry>
            <stateOrCountryDescription>CALIFORNIA</stateOrCountryDescription>
            <zipCode>90067</zipCode>
        </issuerAddress>
        <issuerPhoneNumber>424-313-1550</issuerPhoneNumber>
        <jurisdictionOfInc>DELAWARE</jurisdictionOfInc>
        <issuerPreviousNameList>
            <value>None</value>
        </issuerPreviousNameList>
        <edgarPreviousNameList>
            <value>None</value>
        </edgarPreviousNameList>
        <entityType>Limited Liability Company</entityType>
        <yearOfInc>
            <withinFiveYears>true</withinFiveYears>
            <value>2022</value>
        </yearOfInc>
    </primaryIssuer>
    """

    def __init__(self,
                 cik: IntString,
                 entity_name: str,
                 entity_type: str,
                 primary_address: Address,
                 phone_number: str,
                 jurisdiction: str,
                 issuer_previous_names: List[str],
                 edgar_previous_names: List[str],
                 year_of_incorporation: IntString,
                 incorporated_within_5_years: bool):
        self.cik = cik
        self.entity_name: str = entity_name
        self.entity_type = entity_type
        self.primary_address: Address = primary_address
        self.phone_number: str = phone_number
        self.issuer_previous_names = issuer_previous_names
        self.edgar_previous_names = edgar_previous_names
        self.jurisdiction: str = jurisdiction
        self.year_of_incorporation = year_of_incorporation
        self.incorporated_within_5_years: bool = incorporated_within_5_years

    @staticmethod
    def _previous_names(issuer_el: Tag, list_tag: str) -> List[str]:
        """Collect the ``value`` texts under ``list_tag``, skipping the placeholder text 'None'."""
        list_el = issuer_el.find(list_tag)
        if not list_el:
            return []
        return [el.text for el in list_el.find_all("value") if el.text != 'None']

    @classmethod
    def from_xml(cls, issuer_el: Tag):
        """Build an ``Issuer`` from a ``primaryIssuer`` element.

        :param issuer_el: the element holding the issuer's children
        :return: a new instance of ``cls``
        """
        edgar_previous_names = cls._previous_names(issuer_el, "edgarPreviousNameList")
        issuer_previous_names = cls._previous_names(issuer_el, "issuerPreviousNameList")

        year_of_inc_el = issuer_el.find("yearOfInc")

        # Address
        issuer_address_el = issuer_el.find("issuerAddress")
        address: Address = Address(
            street1=child_text(issuer_address_el, "street1"),
            street2=child_text(issuer_address_el, "street2"),
            city=child_text(issuer_address_el, "city"),
            state_or_country=child_text(issuer_address_el, "stateOrCountry"),
            state_or_country_description=child_text(issuer_address_el, "stateOrCountryDescription"),
            zipcode=child_text(issuer_address_el, "zipCode")
        )

        # Always a real bool: previously a missing <yearOfInc> leaked None through `and`
        within_five_years = (
            year_of_inc_el is not None
            and child_text(year_of_inc_el, "withinFiveYears") == "true"
        )

        return cls(
            cik=child_text(issuer_el, "cik"),
            entity_name=child_text(issuer_el, "entityName"),
            phone_number=child_text(issuer_el, "issuerPhoneNumber"),
            jurisdiction=child_text(issuer_el, "jurisdictionOfInc"),
            entity_type=child_text(issuer_el, "entityType"),
            edgar_previous_names=edgar_previous_names,
            primary_address=address,
            issuer_previous_names=issuer_previous_names,
            year_of_incorporation=child_value(issuer_el, "yearOfInc"),
            incorporated_within_5_years=within_five_years
        )

    def __rich__(self):
        table = Table("issuer", "entity type", "incorporated")
        # NOTE(review): add_row expects renderable cells - confirm child_value yields a str here
        table.add_row(self.entity_name, self.entity_type, self.year_of_incorporation)
        return Group(table)

    def __repr__(self):
        return repr_rich(self.__rich__())
class Person:
    """A person identified by first and last name, with an optional address."""

    def __init__(self,
                 first_name: str,
                 last_name: str,
                 address: "Optional[Address]" = None):
        self.first_name = first_name
        self.last_name = last_name
        self.address: Optional[Address] = address

    def __str__(self):
        # Was "{first_name} {first_name}", which repeated the first name
        return f"{self.first_name} {self.last_name}"

    def __repr__(self):
        return f"{self.first_name} {self.last_name}"
class Name:
    """A personal name made of first, middle and last name plus an optional suffix."""

    def __init__(self,
                 first_name: str,
                 middle_name: str,
                 last_name: str,
                 suffix: Optional[str] = None):
        self.first_name = first_name
        self.middle_name = middle_name
        self.last_name = last_name
        self.suffix = suffix

    @property
    def full_name(self):
        """The non-empty name parts joined by single spaces.

        Parts that are ``None`` or empty are skipped. The earlier expression
        ``' ' + self.middle_name or ''`` raised ``TypeError`` for a ``None``
        middle name and left a double space for an empty one.
        """
        parts = (self.first_name, self.middle_name, self.last_name, self.suffix)
        return " ".join(str(part) for part in parts if part).rstrip()

    def __str__(self):
        return self.full_name

    def __repr__(self):
        return self.full_name
class Filer:
    """A filer identified by CIK, entity name and file number."""

    def __init__(self, cik: str, entity_name: str, file_number: str):
        self.cik: str = cik
        self.entity_name: str = entity_name
        self.file_number: str = file_number

    def __str__(self):
        # Entity name followed by the CIK in parentheses
        return "{} ({})".format(self.entity_name, self.cik)

    # repr and str are the same text
    __repr__ = __str__
class Contact:
    """Contact details: a name, a phone number and an email address."""

    def __init__(self, name: str, phone_number: str, email: str):
        self.name: str = name
        self.phone_number: str = phone_number
        self.email: str = email

    def __str__(self):
        # Name, then the phone number in parentheses, then the email
        return "{} ({}) {}".format(self.name, self.phone_number, self.email)

    # repr and str are the same text
    __repr__ = __str__

View File

@@ -0,0 +1,263 @@
"""
EdgarTools AI: AI and LLM integration for SEC financial data analysis.
This package provides AI capabilities for EdgarTools including:
- AI Skills: Portable documentation packages for Claude Desktop and other AI tools
- AI-optimized text methods (.text()) with research-backed formats (Markdown-KV, TSV)
- LLM context generation with token optimization
- Model Context Protocol (MCP) server for Claude Desktop integration
- Semantic enrichment of financial data
- Token counting and optimization
Installation:
pip install edgartools[ai]
Dependencies included:
- mcp: Model Context Protocol server support
- tiktoken: Token counting and optimization
Skills API:
>>> from edgar.ai import install_skill, package_skill
>>>
>>> # Install skill to ~/.claude/skills/
>>> install_skill()
PosixPath('/Users/username/.claude/skills/edgartools')
>>>
>>> # Create ZIP for Claude Desktop upload
>>> package_skill()
PosixPath('edgartools.zip')
>>> # List available skills
>>> from edgar.ai import list_skills
>>> skills = list_skills()
AI-Optimized Objects:
>>> from edgar import Company
>>> company = Company("AAPL")
>>>
>>> # Get AI-optimized text representation (Markdown-KV format)
>>> text = company.text(max_tokens=2000)
>>> print(text)
**Company:** Apple Inc.
**CIK:** 0000320193
**Ticker:** AAPL
Context Generation:
>>> from edgar.ai import enhance_financial_fact_llm_context
>>> context = enhance_financial_fact_llm_context(fact, detail_level='detailed')
"""
# Check for AI dependencies
# MISSING_DEPS collects the name of every optional package whose import failed below
MISSING_DEPS = []
try:
    # Imported only to find out whether the package is installed
    import mcp
    MCP_AVAILABLE = True
except ImportError:
    MCP_AVAILABLE = False
    MISSING_DEPS.append("mcp")
try:
    # Imported only to find out whether the package is installed
    import tiktoken
    TIKTOKEN_AVAILABLE = True
except ImportError:
    TIKTOKEN_AVAILABLE = False
    MISSING_DEPS.append("tiktoken")
# AI is available if we have at least some key dependencies
AI_AVAILABLE = MCP_AVAILABLE or TIKTOKEN_AVAILABLE
# Core functionality (always available)
from edgar.ai.core import AIEnabled, SemanticEnricher, TokenOptimizer, check_ai_capabilities, enhance_financial_fact_llm_context
# Skills infrastructure (always available)
from edgar.ai.skills.base import BaseSkill
from edgar.ai.skills import list_skills, get_skill
from edgar.ai.skills.core import edgartools_skill
from edgar.ai.exporters import export_skill
# Convenience functions for common workflows
def install_skill(skill=None, to=None, quiet=False):
    """
    Export a skill in the "claude-skills" format and return where it landed.

    The work is delegated to ``export_skill``. When ``to`` is None the
    ``install`` flag is passed as True; with a custom directory it is False.
    Unless ``quiet`` is set, a banner is printed before the export and the
    resulting location afterwards.

    Args:
        skill: Skill to install (defaults to edgartools_skill)
        to: Custom installation directory, passed on as ``output_dir``
        quiet: If True, suppress output messages (default: False)

    Returns:
        Whatever ``export_skill`` returns (documented upstream as the path
        of the installed skill directory)

    Examples:
        >>> from edgar.ai import install_skill
        >>> install_skill(quiet=True)
        >>> install_skill(to="~/my-skills", quiet=True)
    """
    chosen = edgartools_skill if skill is None else skill
    verbose = not quiet
    rule = "=" * 60

    # Show delightful message
    if verbose:
        print("\n" + rule)
        print("""
___ _ _____ _
| __|__| |__ _ __ _ _ _ |_ _|__ ___ | |___
| _|/ _` / _` / _` | '_| | |/ _ \\/ _ \\| (_-<
|___\\__,_\\__, \\__,_|_| |_|\\___/\\___/|_/__/
|___/
""")
        print(rule)
        print(f"✨ Installing {chosen.name} skill...")
        print()

    installed_to = export_skill(
        chosen,
        format="claude-skills",
        output_dir=to,
        # Only use install flag if no custom dir
        install=to is None,
    )

    if verbose:
        print(f"📁 Installed to: {installed_to}")
        print("✅ Ready to use in Claude Desktop and Claude Code!")
        print(rule + "\n")
    return installed_to
def package_skill(skill=None, output=None, quiet=False):
    """
    Export a skill in the "claude-desktop" format as a ZIP and return its path.

    The work is delegated to ``export_skill`` with ``create_zip=True``.
    Unless ``quiet`` is set, a banner is printed before the export, and the
    name and parent directory of the result afterwards.

    Args:
        skill: Skill to package (defaults to edgartools_skill)
        output: Output directory, passed on as ``output_dir``
        quiet: If True, suppress output messages (default: False)

    Returns:
        Whatever ``export_skill`` returns; its ``name`` and ``parent``
        attributes are read when messages are printed

    Examples:
        >>> from edgar.ai import package_skill
        >>> package_skill(quiet=True)
        >>> package_skill(output="~/Desktop", quiet=True)
    """
    chosen = edgartools_skill if skill is None else skill
    verbose = not quiet
    rule = "=" * 60

    # Show delightful message
    if verbose:
        print("\n" + rule)
        print("""
___ _ _____ _
| __|__| |__ _ __ _ _ _ |_ _|__ ___ | |___
| _|/ _` / _` / _` | '_| | |/ _ \\/ _ \\| (_-<
|___\\__,_\\__, \\__,_|_| |_|\\___/\\___/|_/__/
|___/
""")
        print(rule)
        print(f"📦 Packaging {chosen.name} skill as ZIP...")
        print()

    archive = export_skill(
        chosen,
        format="claude-desktop",
        output_dir=output,
        create_zip=True,
    )

    if verbose:
        print(f"✅ Created: {archive.name}")
        print(f"📍 Location: {archive.parent}")
        print("💡 Ready to upload via Claude Desktop's skill upload interface!")
        print(rule + "\n")
    return archive
# Optional MCP functionality
# Note: The class-based MCPServer and EdgarToolsServer are deprecated.
# Use the function-based API instead: from edgar.ai.mcp import main, test_server
if MCP_AVAILABLE:
    # Provide stub classes for backward compatibility
    # Instantiating either stub raises DeprecationWarning as an exception
    # (it is raised, not emitted through the warnings module).
    class MCPServer:
        def __init__(self, *args, **kwargs):
            raise DeprecationWarning(
                "MCPServer class is deprecated. "
                "Use function-based API: from edgar.ai.mcp import main, test_server"
            )

    class EdgarToolsServer:
        def __init__(self, *args, **kwargs):
            raise DeprecationWarning(
                "EdgarToolsServer class is deprecated. "
                "Use function-based API: from edgar.ai.mcp import main, test_server"
            )
else:
    # Without mcp installed the name is a plain function that raises ImportError when called
    def MCPServer(*args, **kwargs):
        raise ImportError(
            "MCP support requires additional dependencies. "
            "Install with: pip install edgartools[ai]"
        )
    # Both names refer to the same function in this branch
    EdgarToolsServer = MCPServer
# Public API
# NOTE(review): get_ai_info, defined right after this list, is not exported here - confirm that is intentional.
__all__ = [
    # Core
    "AIEnabled",
    "TokenOptimizer",
    "SemanticEnricher",
    "enhance_financial_fact_llm_context",
    "check_ai_capabilities",
    # Skills
    "BaseSkill",
    "list_skills",
    "get_skill",
    "edgartools_skill",
    "export_skill",
    # Convenience functions (delightful API)
    "install_skill",
    "package_skill",
    # MCP
    "MCPServer",
    "EdgarToolsServer",
    # Status flags
    "AI_AVAILABLE",
    "MCP_AVAILABLE",
    "TIKTOKEN_AVAILABLE",
    "MISSING_DEPS"
]
def get_ai_info():
    """Report the module's AI status flags as a dictionary.

    Returns:
        dict with the three availability flags, the ``MISSING_DEPS`` list
        itself (not a copy) and an install command that is ``None`` when
        nothing is missing.
    """
    info = {
        "ai_available": AI_AVAILABLE,
        "mcp_available": MCP_AVAILABLE,
        "tiktoken_available": TIKTOKEN_AVAILABLE,
        "missing_dependencies": MISSING_DEPS,
    }
    # Only suggest an install command when at least one dependency is missing
    if MISSING_DEPS:
        info["install_command"] = "pip install edgartools[ai]"
    else:
        info["install_command"] = None
    return info

View File

@@ -0,0 +1,16 @@
#!/usr/bin/env python3
"""
EdgarTools MCP Server Entry Point
Enables running the server via: python -m edgar.ai
"""
if __name__ == "__main__":
    import sys
    # Imported here so the import only happens when the module is run as a script
    from edgar.ai.mcp import main, test_server
    # Check for --test flag before starting server
    if "--test" in sys.argv or "-t" in sys.argv:
        # Exit status 0 when test_server() returns a truthy value, 1 otherwise
        sys.exit(0 if test_server() else 1)
    else:
        main()

View File

@@ -0,0 +1,391 @@
"""
AI enhancements for EdgarTools entity models.
This module provides enhanced AI capabilities building on the existing
to_llm_context() implementation, adding token optimization, semantic
enrichment, and MCP compatibility.
"""
import json
from abc import ABC, abstractmethod
from datetime import date
from typing import Any, Dict, List, Optional, Union
class TokenOptimizer:
    """Utilities for optimizing content for LLM token limits."""

    @staticmethod
    def estimate_tokens(content: Union[str, dict]) -> int:
        """
        Estimate token count for content.

        Rough estimation: ~4 characters per token for English text.
        Dictionaries are measured by the length of their JSON dump.

        Args:
            content: a string, or a dict to be serialized first

        Returns:
            The character count divided by 4, rounded down
        """
        if isinstance(content, dict):
            # default=str keeps the estimate working for values json cannot
            # serialize natively (e.g. dates); only the length matters here.
            content = json.dumps(content, default=str)
        return len(content) // 4

    @staticmethod
    def optimize_for_tokens(content: Dict[str, Any], max_tokens: int) -> Dict[str, Any]:
        """
        Optimize content to fit within token limit.

        Content already within the limit is returned unchanged (the same
        object). Otherwise a new dict is filled with the priority keys, in
        order, until adding one would exceed the limit; keys outside the
        priority list are never copied. ``_truncated`` is set to True when
        anything was left out.

        Args:
            content: the context dictionary
            max_tokens: the token budget, measured with ``estimate_tokens``

        Returns:
            ``content`` itself or the reduced dictionary
        """
        if TokenOptimizer.estimate_tokens(content) <= max_tokens:
            return content

        # Define priority order for content retention
        priority_keys = (
            'concept', 'value', 'period', 'context',
            'quality', 'confidence', 'source',
        )

        # Start with high-priority content
        optimized = {}
        for key in priority_keys:
            if key not in content:
                continue
            optimized[key] = content[key]
            if TokenOptimizer.estimate_tokens(optimized) > max_tokens:
                # Remove last added item if we exceed limit
                optimized.pop(key)
                break

        # Add truncation indicator
        if len(optimized) < len(content):
            optimized['_truncated'] = True
        return optimized
class SemanticEnricher:
    """Add semantic context and interpretations to financial data."""

    # Concept definitions for common financial terms
    CONCEPT_DEFINITIONS = {
        "Revenue": "Total income generated from normal business operations",
        "Revenues": "Total income generated from normal business operations",
        "NetIncome": "Company's total earnings after all expenses and taxes",
        "NetIncomeLoss": "Company's total earnings or losses after all expenses",
        "Assets": "Resources owned by the company with economic value",
        "Liabilities": "Company's financial debts or obligations",
        "StockholdersEquity": "Residual interest in assets after deducting liabilities",
        "CashAndCashEquivalents": "Highly liquid assets readily convertible to cash",
        "OperatingIncome": "Profit from core business operations before interest and taxes",
        "EarningsPerShare": "Company's profit divided by outstanding shares",
        "CurrentAssets": "Assets expected to be converted to cash within one year",
        "CurrentLiabilities": "Obligations due within one year",
    }

    # Relationships between concepts
    CONCEPT_RELATIONSHIPS = {
        "Revenue": ["GrossProfit", "OperatingIncome", "NetIncome"],
        "Assets": ["CurrentAssets", "NonCurrentAssets", "CashAndCashEquivalents"],
        "Liabilities": ["CurrentLiabilities", "LongTermDebt"],
        "NetIncome": ["Revenue", "OperatingExpenses", "TaxExpense"],
        "StockholdersEquity": ["Assets", "Liabilities", "RetainedEarnings"],
    }

    @classmethod
    def get_concept_definition(cls, concept: str) -> Optional[str]:
        """Get human-readable definition for a concept, or None when unknown."""
        # Remove namespace prefix if present
        concept_key = concept.split(':')[-1]
        return cls.CONCEPT_DEFINITIONS.get(concept_key)

    @classmethod
    def get_related_concepts(cls, concept: str) -> List[str]:
        """Get semantically related concepts.

        A new list is returned on every call so that callers cannot modify
        the class-level table by accident.
        """
        concept_key = concept.split(':')[-1]
        return list(cls.CONCEPT_RELATIONSHIPS.get(concept_key, []))

    @classmethod
    def interpret_value(cls, concept: str, value: Union[int, float],
                        unit: str, period_type: Optional[str] = None) -> str:
        """
        Generate business interpretation of a financial value.

        Args:
            concept: The financial concept (e.g., "Revenue"); a namespace prefix is ignored
            value: The numeric value
            unit: The unit of measurement (e.g., "USD"); not used by the current rules
            period_type: 'instant' or 'duration'; not used by the current rules

        Returns:
            Human-readable interpretation, or "" when no rule applies or
            the value is missing / not numeric
        """
        # A missing or textual value cannot be compared against the numeric
        # thresholds below; treat it as "no interpretation" instead of failing.
        if value is None or isinstance(value, str):
            return ""

        concept_key = concept.split(':')[-1]

        # Revenue interpretations
        if concept_key in ["Revenue", "Revenues"]:
            if value > 1_000_000_000:
                scale = "billion-dollar"
            elif value > 100_000_000:
                scale = "multi-million dollar"
            else:
                scale = "smaller-scale"
            return f"The company is a {scale} business based on revenue"

        # Profitability interpretations
        if concept_key in ["NetIncome", "NetIncomeLoss"]:
            if value > 0:
                return "The company is profitable"
            if value == 0:
                return "The company broke even"
            return "The company reported a net loss"

        # Asset interpretations
        if concept_key == "CashAndCashEquivalents":
            if value > 10_000_000_000:
                return "Very strong cash position providing significant financial flexibility"
            if value > 1_000_000_000:
                return "Healthy cash reserves for operations and investments"
            if value > 100_000_000:
                return "Adequate cash position for normal operations"
            return "Limited cash reserves may constrain growth opportunities"

        return ""
class AIEnabled(ABC):
    """
    Abstract mixin describing the AI-facing methods of EdgarTools classes.

    Subclasses must implement ``to_llm_context`` and
    ``get_semantic_description``; ``to_agent_tool`` is provided.
    """

    @abstractmethod
    def to_llm_context(self, detail_level: str = 'standard',
                       max_tokens: Optional[int] = None) -> Dict[str, Any]:
        """
        Convert object to LLM-optimized context.

        Args:
            detail_level: Level of detail ('minimal', 'standard', 'detailed')
            max_tokens: Optional token limit for response optimization

        Returns:
            Dictionary optimized for LLM consumption
        """
        pass

    def to_agent_tool(self) -> Dict[str, Any]:
        """
        Bundle the object's data, context and metadata into one dictionary.

        ``data`` comes from ``self.to_dict()`` when the object has such a
        method and is an empty dict otherwise. The metadata ``timestamp``
        holds today's date in ISO format.

        Returns:
            Dictionary with the keys ``data``, ``context`` and ``metadata``
        """
        if hasattr(self, 'to_dict'):
            data = self.to_dict()
        else:
            data = {}
        context = self.to_llm_context()
        metadata = {
            "source": "SEC EDGAR",
            "object_type": self.__class__.__name__,
            "timestamp": date.today().isoformat(),
        }
        return {"data": data, "context": context, "metadata": metadata}

    @abstractmethod
    def get_semantic_description(self) -> str:
        """
        Get natural language description of the object.

        Returns:
            Human-readable description with key insights
        """
        pass
def enhance_financial_fact_llm_context(fact, detail_level='standard', max_tokens=None):
    """
    Enrich the dictionary returned by ``fact.to_llm_context()``.

    For 'standard' and 'detailed' a concept definition and a value
    interpretation are added when available. 'detailed' additionally adds
    related concepts, a metadata dict and, when present, the fact's
    calculation context. Any other level leaves the context as returned by
    the fact. With a truthy ``max_tokens`` the result is passed through
    ``TokenOptimizer.optimize_for_tokens``.

    Args:
        fact: FinancialFact instance
        detail_level: 'minimal', 'standard', or 'detailed'
        max_tokens: Optional token limit

    Returns:
        Enhanced LLM context dictionary
    """
    # Start with the existing implementation
    context = fact.to_llm_context()

    # Add semantic enrichment based on detail level
    if detail_level in ('standard', 'detailed'):
        # Add concept definition
        definition = SemanticEnricher.get_concept_definition(fact.concept)
        if definition:
            context['definition'] = definition

        # Prefer the numeric value whenever there is one; `numeric_value or value`
        # wrongly discarded a legitimate numeric value of 0.
        numeric = fact.numeric_value
        interpreted_value = numeric if numeric is not None else fact.value

        # Add value interpretation
        interpretation = SemanticEnricher.interpret_value(
            fact.concept,
            interpreted_value,
            fact.unit,
            fact.period_type
        )
        if interpretation:
            context['interpretation'] = interpretation

    if detail_level == 'detailed':
        # Add related concepts
        related = SemanticEnricher.get_related_concepts(fact.concept)
        if related:
            context['related_concepts'] = related

        # Add additional metadata
        context['metadata'] = {
            'taxonomy': fact.taxonomy,
            'scale': fact.scale,
            'decimals': getattr(fact, 'decimals', None),
            'statement_type': fact.statement_type
        }

        # Add calculation context if available
        calculation_context = getattr(fact, 'calculation_context', None)
        if calculation_context:
            context['calculation_context'] = calculation_context

    # Optimize for token limit if specified
    if max_tokens:
        context = TokenOptimizer.optimize_for_tokens(context, max_tokens)
    return context
class FinancialFactAIWrapper:
    """
    Adapter that puts AI-oriented methods around an existing fact object.

    The wrapped fact is kept on ``self.fact`` and is never modified.
    """

    def __init__(self, fact):
        self.fact = fact

    def to_llm_context(self, detail_level='standard', max_tokens=None):
        """Return the enhanced LLM context of the wrapped fact."""
        return enhance_financial_fact_llm_context(self.fact, detail_level, max_tokens)

    def to_agent_tool(self):
        """Bundle data, context and metadata of the wrapped fact into one dictionary."""
        fact = self.fact
        # Dates are optional; None is kept instead of calling isoformat()
        period_end = fact.period_end.isoformat() if fact.period_end else None
        data = {
            "concept": fact.concept,
            "value": fact.value,
            "numeric_value": fact.numeric_value,
            "unit": fact.unit,
            "period_end": period_end,
            "fiscal_period": fact.fiscal_period,
            "fiscal_year": fact.fiscal_year,
        }
        context = self.to_llm_context()
        metadata = {
            "source": f"SEC {fact.form_type}",
            "filed": fact.filing_date.isoformat() if fact.filing_date else None,
            "quality": fact.data_quality.value,
            "confidence": fact.confidence_score,
        }
        return {"data": data, "context": context, "metadata": metadata}

    def get_semantic_description(self):
        """One-line description assembled from the fact's own LLM context."""
        ctx = self.fact.to_llm_context()
        head = f"{ctx['concept']} of {ctx['value']} {ctx['unit']} "
        tail = f"{ctx['period']} from {ctx['source']}"
        return head + tail
def check_ai_capabilities():
    """
    Report which AI features can be used with the packages currently installed.

    Each optional capability is switched on only when its backing module
    imports without raising ImportError.

    Returns:
        Dictionary mapping capability name to a boolean flag
    """
    capabilities = {
        'basic': True,  # Always available
        'mcp': False,
        'token_optimization': False,
        'semantic_enrichment': True,  # Works without external deps
    }
    # (capability flag, module whose successful import enables it)
    optional_backends = (
        ('mcp', 'mcp'),
        ('token_optimization', 'tiktoken'),
    )
    for flag, module_name in optional_backends:
        try:
            __import__(module_name)
        except ImportError:
            continue
        capabilities[flag] = True
    return capabilities
# Example usage demonstrating the enhanced capabilities
if __name__ == "__main__":
    # This would be imported from edgar.entity.models
    from dataclasses import dataclass
    from enum import Enum

    class DataQuality(Enum):
        # Minimal stand-in: only the single member the demo needs.
        HIGH = "high"

    @dataclass
    class MockFinancialFact:
        """Mock class for demonstration.

        Provides the attributes that FinancialFactAIWrapper reads from a fact,
        plus a basic to_llm_context() returning a fixed dictionary.
        """
        concept: str = "us-gaap:Revenue"
        taxonomy: str = "us-gaap"
        value: float = 125_000_000_000
        numeric_value: float = 125_000_000_000
        unit: str = "USD"
        scale: int = 1
        # NOTE(review): `date` is not imported in this block; presumably it is
        # imported at module level - confirm.
        period_end: date = date(2024, 3, 31)
        period_type: str = "duration"
        fiscal_period: str = "Q1"
        fiscal_year: int = 2024
        form_type: str = "10-Q"
        filing_date: date = date(2024, 4, 30)
        data_quality: DataQuality = DataQuality.HIGH
        confidence_score: float = 0.95
        statement_type: str = "IncomeStatement"

        def to_llm_context(self):
            # Simulate existing implementation
            # The values are hard-coded and do not depend on the fields above.
            return {
                "concept": "Revenue",
                "value": "125,000 million",
                "unit": "USD",
                "period": "for Q1 2024",
                "context": "",
                "quality": "high",
                "confidence": 0.95,
                "source": "10-Q filed 2024-04-30"
            }

    # Create a mock fact
    fact = MockFinancialFact()
    # Wrap it with AI enhancements
    ai_fact = FinancialFactAIWrapper(fact)
    # Test different detail levels

View File

@@ -0,0 +1,5 @@
"""
EdgarTools AI examples.
This package contains example scripts demonstrating AI capabilities.
"""

View File

@@ -0,0 +1,187 @@
#!/usr/bin/env python3
"""
Basic usage examples for EdgarTools AI features.
This script demonstrates how to use the AI capabilities including
LLM context generation and MCP server functionality.
"""
import json
from datetime import date
# Check if AI features are available
try:
    from edgar.ai import (
        AI_AVAILABLE,
        MCP_AVAILABLE,
        get_ai_info,
        enhance_financial_fact_llm_context,
        check_ai_capabilities
    )
except ImportError:
    print("EdgarTools AI features not available.")
    print("Install with: pip install edgartools[llm]")
    # exit() is a convenience injected by the `site` module for interactive
    # use and is missing under `python -S` or in frozen apps; raising
    # SystemExit terminates with the same status code everywhere.
    raise SystemExit(1)
def demonstrate_ai_capabilities():
    """Print which AI features are usable in the current environment.

    Shows the summary returned by get_ai_info() (availability flags, missing
    dependencies and the install command when something is missing), followed
    by one line per capability reported by check_ai_capabilities().
    """
    print("=== AI Capabilities ===")
    info = get_ai_info()
    print(f"AI Available: {info['ai_available']}")
    print(f"MCP Available: {info['mcp_available']}")
    print(f"Token Optimization: {info['tiktoken_available']}")
    if info['missing_dependencies']:
        print(f"\nMissing dependencies: {', '.join(info['missing_dependencies'])}")
        print(f"Install with: {info['install_command']}")
    print("\nDetailed capabilities:")
    capabilities = check_ai_capabilities()
    for capability, available in capabilities.items():
        # Both branches used to yield an empty string, so available and
        # unavailable capabilities were printed identically. ASCII markers
        # are used so the output is safe on any console encoding.
        status = "[x]" if available else "[ ]"
        print(f" {status} {capability}")
def demonstrate_financial_fact_enhancement():
    """Print the enriched LLM context of a mock fact at several settings.

    A locally defined dataclass stands in for a real FinancialFact; it is
    passed to enhance_financial_fact_llm_context three times (minimal,
    standard, and detailed with a 100-token limit) and each result is
    printed as indented JSON.
    """
    print("\n=== Financial Fact Enhancement ===")
    # Create a mock financial fact (in real usage, this would come from EdgarTools)
    from dataclasses import dataclass
    from enum import Enum

    class DataQuality(Enum):
        HIGH = "high"

    @dataclass
    class MockFinancialFact:
        concept: str = "us-gaap:Revenue"
        taxonomy: str = "us-gaap"
        label: str = "Revenue"
        value: float = 125_000_000_000
        numeric_value: float = 125_000_000_000
        unit: str = "USD"
        scale: int = 1
        period_end: date = date(2024, 3, 31)
        period_type: str = "duration"
        fiscal_period: str = "Q1"
        fiscal_year: int = 2024
        filing_date: date = date(2024, 4, 30)
        form_type: str = "10-Q"
        data_quality: DataQuality = DataQuality.HIGH
        confidence_score: float = 0.95
        statement_type: str = "IncomeStatement"

        def to_llm_context(self):
            """Basic LLM context (existing in EdgarTools)."""
            return {
                "concept": self.label,
                "value": f"{self.value:,.0f}",
                "unit": self.unit,
                "period": f"for {self.fiscal_period} {self.fiscal_year}",
                "quality": self.data_quality.value,
                "confidence": self.confidence_score,
                "source": f"{self.form_type} filed {self.filing_date}"
            }

    fact = MockFinancialFact()

    # (heading to print, keyword arguments for the enhancer), in display order
    scenarios = (
        ("\nMinimal context:", {'detail_level': 'minimal'}),
        ("\nStandard context (with semantic enrichment):", {'detail_level': 'standard'}),
        ("\nToken-limited context (100 tokens):", {'detail_level': 'detailed', 'max_tokens': 100}),
    )
    for heading, options in scenarios:
        print(heading)
        enriched = enhance_financial_fact_llm_context(fact, **options)
        print(json.dumps(enriched, indent=2))
def demonstrate_mcp_server():
    """Demonstrate MCP server setup.

    Returns early with an install hint when MCP_AVAILABLE is false. Otherwise
    it creates a server via edgar.ai.mcp.get_simple_server(), prints its name,
    and prints instructions for running it and a sample configuration snippet.
    An ImportError raised while doing so is reported rather than propagated.
    """
    print("\n=== MCP Server Setup ===")
    if not MCP_AVAILABLE:
        print("MCP not available. Install with: pip install edgartools[llm]")
        return
    try:
        # Imported lazily so the rest of the script works without the MCP extras.
        from edgar.ai.mcp import get_simple_server
        server = get_simple_server()
        print("MCP Server created successfully!")
        print(f"Server name: {server.name}")
        print("\nTo run the server:")
        print(" python edgar/ai/run_mcp_server.py")
        print("\nOr use in Claude Desktop config:")
        # NOTE(review): confirm this snippet matches the configuration schema
        # Claude Desktop currently expects before relying on it.
        print(""" {
"tools": [
{
"type": "mcp",
"name": "edgartools",
"config": {
"command": "python",
"args": ["edgar/ai/run_mcp_server.py"]
}
}
]
}""")
    except ImportError as e:
        print(f"Error creating MCP server: {e}")
def demonstrate_usage_with_company():
    """Show the AI helpers applied to a real EdgarTools Company object.

    Looks up the company for ticker "AAPL" and prints its LLM context when
    the object offers a to_llm_context() method; otherwise prints a note.
    Any exception (import failure, network error, ...) is reported on stdout
    instead of being raised, so the demo script keeps running.
    """
    print("\n=== Usage with EdgarTools Company ===")
    try:
        from edgar import Company

        # Get a company
        apple = Company("AAPL")
        print(f"Company: {apple.name} ({apple.get_ticker()})")

        if not hasattr(apple, 'to_llm_context'):
            print("\nNote: Company.to_llm_context() will be available in future versions")
            print("For now, use the AI wrapper functions to enhance EdgarTools objects")
        else:
            # The company has a to_llm_context method (future enhancement)
            llm_context = apple.to_llm_context()
            print("\nLLM Context:")
            print(json.dumps(llm_context, indent=2))
    except Exception as e:
        print(f"Error demonstrating company usage: {e}")
        print("This example requires a working internet connection and valid SEC API access")
def main():
    """Run every demonstration in order, framed by a banner and a footer."""
    rule = "=" * 50
    print("EdgarTools AI Features Demonstration")
    print(rule)

    # Capabilities, fact enhancement, MCP server setup, then real objects.
    demonstrations = (
        demonstrate_ai_capabilities,
        demonstrate_financial_fact_enhancement,
        demonstrate_mcp_server,
        demonstrate_usage_with_company,
    )
    for run_demo in demonstrations:
        run_demo()

    print("\n" + rule)
    print("For more examples, see the documentation in edgar/ai/docs/")


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,53 @@
"""
EdgarTools AI skill exporters.
Provides functions to export skills in various formats for AI tool integration.
"""
from edgar.ai.exporters.claude_desktop import export_claude_desktop
from edgar.ai.exporters.claude_skills import export_claude_skills
__all__ = ['export_claude_desktop', 'export_claude_skills', 'export_skill']
def export_skill(skill, format: str = "claude-skills", output_dir=None, **kwargs):
    """
    Export a skill in the specified format.

    Dispatches to the exporter for ``format`` and returns its result.

    Args:
        skill: BaseSkill instance to export
        format: Export format:
            - "claude-skills": Official Claude Skills format (default, ~/.claude/skills/)
            - "claude-desktop": Portable format (current directory)
        output_dir: Optional output directory (format-specific defaults)
        **kwargs: Additional format-specific parameters, forwarded unchanged:
            - claude-skills: install (bool, default True)
            - claude-desktop: create_zip (bool, default True)

    Returns:
        Path: Path to exported skill directory or archive

    Raises:
        ValueError: If ``format`` is not one of the supported formats

    Examples:
        >>> from edgar.ai.skills import edgartools_skill
        >>> # Export to ~/.claude/skills/ (default)
        >>> export_skill(edgartools_skill, format="claude-skills")
        PosixPath('/Users/username/.claude/skills/edgartools')
        >>> # Export a zip archive into the current directory (claude-desktop default)
        >>> export_skill(edgartools_skill, format="claude-desktop")
        PosixPath('/current/dir/edgartools.zip')
        >>> # Export as a plain directory instead of a zip archive
        >>> export_skill(edgartools_skill, format="claude-desktop", create_zip=False)
        PosixPath('/current/dir/edgartools')
    """
    if format == "claude-skills":
        return export_claude_skills(skill, output_dir=output_dir, **kwargs)
    if format == "claude-desktop":
        return export_claude_desktop(skill, output_dir=output_dir, **kwargs)
    raise ValueError(
        f"Unknown export format: {format}. "
        f"Supported formats: 'claude-skills', 'claude-desktop'"
    )

View File

@@ -0,0 +1,173 @@
"""
Claude Desktop skill exporter.
Exports EdgarTools skills for Claude Desktop upload:
- Creates ZIP file whose top-level folder is the skill directory, with SKILL.md directly inside it (layout required by Claude Desktop)
- Validates YAML frontmatter structure
- Includes all supporting markdown files and API reference
"""
import shutil
import zipfile
from pathlib import Path
from typing import Optional
import re
def export_claude_desktop(skill, output_dir: Optional[Path] = None, create_zip: bool = True) -> Path:
    """
    Export a skill for Claude Desktop upload.

    Builds ``<output_dir>/<skill-dir>/`` containing the skill's markdown files
    and, when the skill provides object docs, an ``api-reference/``
    sub-directory. With ``create_zip=True`` the directory is zipped to
    ``<output_dir>/<skill-dir>.zip`` and then removed; every archive entry is
    stored under the ``<skill-dir>/`` folder.

    Any existing ``<output_dir>/<skill-dir>/`` directory is replaced. The
    source content is checked first, so a skill without markdown files no
    longer destroys a previous export.

    Args:
        skill: BaseSkill instance to export
        output_dir: Optional output directory (defaults to current directory)
        create_zip: If True (default), create a zip archive; if False, create directory

    Returns:
        Path: Path to exported ZIP file (or directory if create_zip=False)

    Raises:
        TypeError: If ``skill`` is not a BaseSkill instance
        ValueError: If the skill name yields an unusable directory name, if
            the content directory has no markdown files, or if frontmatter
            validation of a markdown file fails

    Examples:
        >>> from edgar.ai.skills import edgartools_skill
        >>> # Create ZIP for Claude Desktop upload (default)
        >>> export_claude_desktop(edgartools_skill)
        PosixPath('/current/dir/edgartools.zip')
        >>> # Create directory for manual installation
        >>> export_claude_desktop(edgartools_skill, create_zip=False)
        PosixPath('/current/dir/edgartools')
    """
    from edgar.ai.skills.base import BaseSkill

    if not isinstance(skill, BaseSkill):
        raise TypeError(f"Expected BaseSkill instance, got {type(skill)}")

    # Determine output directory
    output_dir = Path.cwd() if output_dir is None else Path(output_dir)

    # Create skill-specific directory name (kebab-case from skill name)
    skill_dir_name = skill.name.lower().replace(' ', '-')
    if skill_dir_name in ('', '.', '..'):
        # These names resolve to output_dir itself (or its parent); the
        # rmtree below would then delete that whole directory.
        raise ValueError(f"Cannot derive an export directory name from skill name {skill.name!r}")
    skill_output_dir = output_dir / skill_dir_name

    # Check the source content BEFORE touching the filesystem, so a failed
    # export cannot wipe a previously exported copy.
    content_dir = skill.content_dir
    markdown_files = list(content_dir.glob("*.md"))
    if not markdown_files:
        raise ValueError(f"No markdown files found in {content_dir}")

    # Remove existing directory if present
    if skill_output_dir.exists():
        shutil.rmtree(skill_output_dir)
    skill_output_dir.mkdir(parents=True, exist_ok=True)

    # Copy and validate each markdown file (SKILL.md must carry frontmatter)
    for md_file in markdown_files:
        _copy_and_validate_markdown(md_file, skill_output_dir)

    # Copy centralized object documentation (API reference)
    object_docs = skill.get_object_docs()
    if object_docs:
        api_ref_dir = skill_output_dir / "api-reference"
        api_ref_dir.mkdir(exist_ok=True)
        for doc_path in object_docs:
            # Silently skip missing docs (allows for optional docs)
            if doc_path.exists():
                shutil.copy2(doc_path, api_ref_dir / doc_path.name)

    if not create_zip:
        return skill_output_dir

    # Create the zip archive, then clean up the staging directory
    zip_path = output_dir / f"{skill_dir_name}.zip"
    _create_zip_archive(skill_output_dir, zip_path)
    shutil.rmtree(skill_output_dir)
    return zip_path
def _copy_and_validate_markdown(source: Path, destination_dir: Path) -> None:
"""
Copy markdown file and validate YAML frontmatter.
Args:
source: Source markdown file path
destination_dir: Destination directory
Raises:
ValueError: If YAML frontmatter is invalid or missing in SKILL.md
"""
dest_file = destination_dir / source.name
# Read and validate
content = source.read_text(encoding='utf-8')
# Only require frontmatter for SKILL.md
if source.name == 'SKILL.md':
# Check for YAML frontmatter
if not content.startswith('---'):
raise ValueError(f"Missing YAML frontmatter in {source.name}")
# Extract frontmatter
parts = content.split('---', 2)
if len(parts) < 3:
raise ValueError(f"Invalid YAML frontmatter structure in {source.name}")
frontmatter = parts[1].strip()
# Validate required frontmatter fields
_validate_skill_frontmatter(frontmatter, source.name)
else:
# Optional: validate frontmatter if present in supporting files
if content.startswith('---'):
parts = content.split('---', 2)
if len(parts) < 3:
raise ValueError(f"Invalid YAML frontmatter structure in {source.name}")
# Copy file
shutil.copy2(source, dest_file)
def _validate_skill_frontmatter(frontmatter: str, filename: str) -> None:
"""
Validate required fields in skill.md frontmatter.
Args:
frontmatter: YAML frontmatter content
filename: Source filename (for error messages)
Raises:
ValueError: If required fields are missing
"""
# Only require essential fields (name and description)
# version and author are optional
required_fields = ['name', 'description']
for field in required_fields:
# Simple regex check (not full YAML parsing to avoid dependencies)
if not re.search(rf'^{field}:', frontmatter, re.MULTILINE):
raise ValueError(f"Missing required field '{field}' in {filename} frontmatter")
def _create_zip_archive(source_dir: Path, zip_path: Path) -> None:
"""
Create a zip archive of the skill directory.
Args:
source_dir: Source directory to zip
zip_path: Output zip file path
"""
with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
for file_path in source_dir.rglob('*'):
if file_path.is_file():
arcname = file_path.relative_to(source_dir.parent)
zipf.write(file_path, arcname)

View File

@@ -0,0 +1,163 @@
"""
Claude Skills exporter.
Exports EdgarTools skills in official Anthropic Claude Skills format:
- Installs to ~/.claude/skills/ by default
- Main file: SKILL.md (uppercase, per Anthropic spec)
- Keeps all supporting markdown files
- Validates YAML frontmatter structure
"""
import shutil
from pathlib import Path
from typing import Optional
import re
def export_claude_skills(skill, output_dir: Optional[Path] = None, install: bool = True) -> Path:
    """
    Export a skill in official Claude Skills format.

    Writes ``<output_dir>/<skill-dir>/`` containing SKILL.md (validated),
    all supporting markdown files and, when the skill provides object docs,
    an ``api-reference/`` sub-directory.

    Any existing ``<output_dir>/<skill-dir>/`` directory is replaced. The
    presence of markdown files and of SKILL.md is checked first, so an
    incomplete skill no longer destroys a previously installed copy.

    Args:
        skill: BaseSkill instance to export
        output_dir: Optional output directory. When given it is always used,
            regardless of ``install``.
        install: Only consulted when ``output_dir`` is None: if True
            (default) export to ~/.claude/skills/; if False export to the
            current directory

    Returns:
        Path: Path to exported skill directory

    Raises:
        TypeError: If ``skill`` is not a BaseSkill instance
        ValueError: If the skill name yields an unusable directory name, if
            the content directory has no markdown files or no SKILL.md, or if
            SKILL.md frontmatter validation fails

    Examples:
        >>> from edgar.ai.skills import edgartools_skill
        >>> export_claude_skills(edgartools_skill)
        PosixPath('/Users/username/.claude/skills/edgartools')
        >>> # Export to custom location
        >>> export_claude_skills(edgartools_skill,
        ...                      output_dir="./my-skills",
        ...                      install=False)
        PosixPath('my-skills/edgartools')
    """
    from edgar.ai.skills.base import BaseSkill

    if not isinstance(skill, BaseSkill):
        raise TypeError(f"Expected BaseSkill instance, got {type(skill)}")

    # Determine output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
    elif install:
        # Default: Install to ~/.claude/skills/
        output_dir = Path.home() / ".claude" / "skills"
    else:
        # No install flag, no output_dir: use current directory
        output_dir = Path.cwd()

    # Create skill-specific directory name (kebab-case from skill name)
    skill_dir_name = skill.name.lower().replace(' ', '-')
    if skill_dir_name in ('', '.', '..'):
        # These names resolve to output_dir itself (or its parent); the
        # rmtree below would then delete e.g. every skill in ~/.claude/skills.
        raise ValueError(f"Cannot derive an export directory name from skill name {skill.name!r}")
    skill_output_dir = output_dir / skill_dir_name

    # Check the source content BEFORE touching the filesystem, so a failed
    # export cannot wipe a previously installed copy or leave a partial one.
    content_dir = skill.content_dir
    markdown_files = list(content_dir.glob("*.md"))
    if not markdown_files:
        raise ValueError(f"No markdown files found in {content_dir}")
    if not any(md_file.name == 'SKILL.md' for md_file in markdown_files):
        raise ValueError("No SKILL.md found in skill content directory")

    # Remove existing directory if present
    if skill_output_dir.exists():
        shutil.rmtree(skill_output_dir)
    skill_output_dir.mkdir(parents=True, exist_ok=True)

    # Copy markdown files
    for md_file in markdown_files:
        if md_file.name == 'SKILL.md':
            # Validate and copy SKILL.md
            _copy_and_validate_skill_md(md_file, skill_output_dir)
        else:
            # Copy supporting markdown files as-is
            shutil.copy2(md_file, skill_output_dir / md_file.name)

    # Copy centralized object documentation (API reference)
    object_docs = skill.get_object_docs()
    if object_docs:
        api_ref_dir = skill_output_dir / "api-reference"
        api_ref_dir.mkdir(exist_ok=True)
        for doc_path in object_docs:
            # Silently skip missing docs (allows for optional docs)
            if doc_path.exists():
                shutil.copy2(doc_path, api_ref_dir / doc_path.name)

    return skill_output_dir
def _copy_and_validate_skill_md(source: Path, destination_dir: Path) -> None:
"""
Copy SKILL.md and validate YAML frontmatter.
Args:
source: Source SKILL.md file path
destination_dir: Destination directory
Raises:
ValueError: If YAML frontmatter is invalid or missing
"""
dest_file = destination_dir / source.name
# Read and validate
content = source.read_text(encoding='utf-8')
# Check for YAML frontmatter
if not content.startswith('---'):
raise ValueError(f"Missing YAML frontmatter in {source.name}")
# Extract frontmatter
parts = content.split('---', 2)
if len(parts) < 3:
raise ValueError(f"Invalid YAML frontmatter structure in {source.name}")
frontmatter = parts[1].strip()
# Validate required frontmatter fields
_validate_skill_frontmatter(frontmatter, source.name)
# Copy file
dest_file.write_text(content, encoding='utf-8')
def _validate_skill_frontmatter(frontmatter: str, filename: str) -> None:
"""
Validate required fields in SKILL.md frontmatter.
Per Anthropic spec, SKILL.md must have:
- name: skill identifier (lowercase with hyphens)
- description: clear description of what skill does
Args:
frontmatter: YAML frontmatter content
filename: Source filename (for error messages)
Raises:
ValueError: If required fields are missing
"""
required_fields = ['name', 'description']
for field in required_fields:
# Simple regex check (not full YAML parsing to avoid dependencies)
if not re.search(rf'^{field}:', frontmatter, re.MULTILINE):
raise ValueError(
f"Missing required field '{field}' in {filename} frontmatter. "
f"Claude Skills require both 'name' and 'description' fields."
)

View File

@@ -0,0 +1,101 @@
"""
AI-optimized text formatting utilities for EdgarTools.
Provides research-backed text formats optimized for LLM accuracy and token efficiency:
- Markdown-KV: Best accuracy (60.7%) for metadata
- TSV: Most efficient for tabular data
Based on research from improvingagents.com/blog/best-input-data-format-for-llms
"""
from typing import List, Dict
__all__ = ['to_markdown_kv', 'to_tsv']
def to_markdown_kv(data: dict, max_tokens: int = 2000) -> str:
    """
    Render a dict as Markdown key-value lines for LLM consumption.

    Each entry becomes ``**Key:** value``; keys are shown in title case with
    underscores replaced by spaces, and entries whose value is None are
    dropped. The format choice is motivated by the research cited in the
    module docstring (improvingagents.com/blog/best-input-data-format-for-llms).

    Args:
        data: Dictionary with string keys and simple values
        max_tokens: Approximate token limit (4 chars/token heuristic)

    Returns:
        Markdown-formatted key-value text. When the text exceeds the
        character budget it is cut at the budget and a truncation notice is
        appended.

    Example:
        >>> to_markdown_kv({"name": "Apple Inc.", "cik": "320193"})
        '**Name:** Apple Inc.\\n**Cik:** 320193'
    """
    rendered = "\n".join(
        f"**{name.replace('_', ' ').title()}:** {val}"
        for name, val in data.items()
        if val is not None
    )

    # Token limiting (4 chars/token heuristic)
    char_budget = max_tokens * 4
    if len(rendered) <= char_budget:
        return rendered
    return rendered[:char_budget] + "\n\n[Truncated for token limit]"
def to_tsv(rows: List[Dict], headers: List[str], max_tokens: int = 2000, limit: int = 10) -> str:
    """
    Convert list of dicts to TSV (tab-separated values) format.

    The first line is the header row; each of the first ``limit`` rows
    follows as one line, with "N/A" for keys a row does not have. Tab,
    carriage-return and newline characters inside a cell value are replaced
    by spaces so a value can never shift columns or split a record.

    Args:
        rows: List of dicts with consistent keys
        headers: Column headers to include
        max_tokens: Approximate token limit (4 chars/token heuristic)
        limit: Maximum rows to include (default: 10)

    Returns:
        Tab-separated values with header row. A "[Showing N of M rows]" note
        is appended when rows were dropped by ``limit``; when the text still
        exceeds the character budget, only the leading lines estimated to
        fit are kept (at least the header) followed by a truncation notice.

    Example:
        >>> rows = [{"form": "10-K", "cik": "320193"}, {"form": "10-Q", "cik": "789019"}]
        >>> to_tsv(rows, ["form", "cik"], limit=2)
        'form\\tcik\\n10-K\\t320193\\n10-Q\\t789019'
    """
    # Tabs and line breaks are the TSV delimiters; neutralise them in cells.
    delimiter_to_space = str.maketrans({"\t": " ", "\n": " ", "\r": " "})

    # Header row followed by the data rows
    lines = ["\t".join(headers)]
    lines.extend(
        "\t".join(str(row.get(h, "N/A")).translate(delimiter_to_space) for h in headers)
        for row in rows[:limit]
    )
    text = "\n".join(lines)

    # Add summary if truncated
    if len(rows) > limit:
        text += f"\n\n[Showing {limit} of {len(rows)} rows]"

    # Token limiting
    max_chars = max_tokens * 4
    if len(text) > max_chars:
        # Estimate rows that fit. The average is clamped to 1 because it can
        # round down to 0 (e.g. empty headers), which used to raise
        # ZeroDivisionError in the division below.
        avg_row_size = max(1, len(text) // len(lines))
        rows_that_fit = max(1, max_chars // avg_row_size)
        text = "\n".join(lines[:rows_that_fit]) + "\n\n[Truncated for token limit]"
    return text

View File

@@ -0,0 +1,667 @@
"""
Helper functions for common SEC filing analysis tasks.
These convenience wrappers provide simple, high-level access to EdgarTools functionality
for common SEC filing analysis patterns.
"""
from typing import Optional, List, Dict, Union
import pandas as pd
from edgar import get_filings, get_current_filings, Company
__all__ = [
# Filing retrieval
'get_filings_by_period',
'get_today_filings',
# Financial analysis
'get_revenue_trend',
'get_filing_statement',
'compare_companies_revenue',
# Industry and company subset filtering
'filter_by_industry',
'filter_by_company_subset',
# Company subset convenience functions
'get_companies_by_state',
'get_pharmaceutical_companies',
'get_biotechnology_companies',
'get_software_companies',
'get_semiconductor_companies',
'get_banking_companies',
'get_investment_companies',
'get_insurance_companies',
'get_real_estate_companies',
'get_oil_gas_companies',
'get_retail_companies',
]
def get_filings_by_period(
    year: int,
    quarter: int,
    form: Optional[str] = None,
    filing_date: Optional[str] = None
):
    """
    Fetch published filings for one year/quarter from the SEC quarterly indexes.

    Thin convenience wrapper: the arguments are handed to get_filings()
    as-is (year and quarter positionally, form and filing_date by keyword).

    Args:
        year: Year (e.g., 2023)
        quarter: Quarter 1-4 (1=Jan-Mar, 2=Apr-Jun, 3=Jul-Sep, 4=Oct-Dec)
        form: Optional form type filter (e.g., "10-K", "10-Q", "S-1")
        filing_date: Optional date or range filter (e.g., "2023-02-01:2023-02-28")

    Returns:
        Filings collection that can be further filtered or iterated

    Raises:
        Whatever get_filings() raises propagates unchanged; no validation is
        performed in this wrapper.

    Examples:
        >>> # Get all filings from Q1 2023
        >>> filings = get_filings_by_period(2023, 1)
        >>> # Get only 10-K filings from Q1 2023
        >>> filings = get_filings_by_period(2023, 1, form="10-K")
        >>> # Get S-1 filings from February 2023
        >>> filings = get_filings_by_period(
        ...     2023, 1,
        ...     form="S-1",
        ...     filing_date="2023-02-01:2023-02-28"
        ... )

    See Also:
        - get_filings() - The underlying raw API function
        - get_today_filings() - For real-time filings (last 24h)
        - Company.get_filings() - For company-specific filings
    """
    period_filings = get_filings(year, quarter, form=form, filing_date=filing_date)
    return period_filings
def get_today_filings():
    """
    Fetch the SEC's current filings (roughly the last 24 hours).

    Thin convenience wrapper: calls get_current_filings() with no arguments
    and returns its result unchanged, under a simpler name.

    Returns:
        CurrentFilings collection with recent submissions

    Raises:
        Whatever get_current_filings() raises propagates unchanged.

    Examples:
        >>> # Get all recent filings
        >>> current = get_today_filings()
        >>> print(f"Found {len(current)} filings in last 24 hours")
        >>> # Filter for specific forms
        >>> reports = current.filter(form=["10-K", "10-Q"])
        >>> # Filter for specific companies
        >>> tech_filings = current.filter(ticker=["AAPL", "MSFT", "GOOGL"])

    See Also:
        - get_current_filings() - The underlying raw API function
        - get_filings_by_period() - For historical filings by quarter
    """
    recent_filings = get_current_filings()
    return recent_filings
def get_revenue_trend(
    ticker: str,
    periods: int = 3,
    quarterly: bool = False
):
    """
    Return a company's multi-period income statement for revenue analysis.

    Builds a Company from ``ticker`` and calls its income_statement() with
    ``periods`` and ``annual`` set to the opposite of ``quarterly``.

    Args:
        ticker: Company ticker symbol (e.g., "AAPL", "MSFT", "GOOGL")
        periods: Number of periods to retrieve (default: 3)
            - For annual: Gets last N fiscal years
            - For quarterly: Gets last N quarters
        quarterly: If True, get quarterly data; if False, get annual data
            (default: False for annual)

    Returns:
        The object returned by Company.income_statement() (documented
        upstream as a MultiPeriodStatement that can be printed directly or
        inspected via its .periods attribute).

    Raises:
        Whatever Company() or Company.income_statement() raise propagates
        unchanged.

    Examples:
        >>> # Get 3 fiscal years of revenue data (default)
        >>> income = get_revenue_trend("AAPL")
        >>> print(income)  # Shows 3-year revenue trend
        >>> # Get 4 quarters of revenue data
        >>> quarterly = get_revenue_trend("TSLA", periods=4, quarterly=True)
        >>> print(quarterly)  # Shows 4-quarter trend
        >>> # Get 5 years for long-term analysis
        >>> long_term = get_revenue_trend("MSFT", periods=5)

    See Also:
        - Company.income_statement() - The underlying raw API method
        - get_filing_statement() - For statement from specific filing
        - compare_companies_revenue() - For multi-company comparison
    """
    # income_statement() takes an "annual" flag, the inverse of "quarterly".
    use_annual = not quarterly
    return Company(ticker).income_statement(periods=periods, annual=use_annual)
def get_filing_statement(
    ticker: str,
    year: int,
    form: str,
    statement_type: str = "income"
):
    """
    Get a specific financial statement from a company's filing using XBRL.

    ``statement_type`` is validated before anything else, so an unsupported
    value fails immediately instead of after the company lookup, filing
    retrieval and XBRL parsing have been performed. The first filing
    returned by Company.get_filings(year=..., form=...) is then parsed and
    the requested statement is returned.

    Args:
        ticker: Company ticker symbol (e.g., "AAPL", "MSFT")
        year: Filing year (e.g., 2023)
        form: Form type (e.g., "10-K" for annual, "10-Q" for quarterly)
        statement_type: Type of statement to retrieve (default: "income")
            - "income" - Income statement
            - "balance" - Balance sheet
            - "cash_flow" - Cash flow statement

    Returns:
        Statement object with detailed line items from the filing.
        Can be printed directly or accessed programmatically.

    Raises:
        ValueError: If statement_type is not recognized
        Other exceptions raised while looking up the company, selecting the
        filing or parsing its XBRL propagate unchanged.

    Examples:
        >>> # Get income statement from Apple's 2023 10-K
        >>> income = get_filing_statement("AAPL", 2023, "10-K", "income")
        >>> print(income)
        >>> # Get balance sheet from quarterly filing
        >>> balance = get_filing_statement("AAPL", 2023, "10-Q", "balance")
        >>> # Get cash flow statement
        >>> cash_flow = get_filing_statement("MSFT", 2023, "10-K", "cash_flow")

    See Also:
        - Filing.xbrl() - The underlying XBRL parsing method
        - get_revenue_trend() - More efficient for multi-period data
        - Company.get_filings() - For accessing filings directly
    """
    # Fail fast: reject a bad statement_type before any remote work is done.
    if statement_type not in ("income", "balance", "cash_flow"):
        raise ValueError(
            f"Unknown statement type: {statement_type}. "
            f"Must be 'income', 'balance', or 'cash_flow'"
        )

    company = Company(ticker)
    filing = company.get_filings(year=year, form=form)[0]
    xbrl = filing.xbrl()

    if statement_type == "income":
        return xbrl.statements.income_statement()
    if statement_type == "balance":
        return xbrl.statements.balance_sheet()
    return xbrl.statements.cash_flow_statement()
def compare_companies_revenue(
    tickers: Union[List[str], tuple],
    periods: int = 3
) -> Dict[str, 'MultiPeriodStatement']:
    """
    Collect the multi-period income statement of several companies.

    For every ticker, in the order given, a Company is created and its
    income_statement(periods=...) result is stored under that ticker. A
    ticker that appears more than once keeps the result of its last lookup.

    Args:
        tickers: List or tuple of ticker symbols (e.g., ["AAPL", "MSFT", "GOOGL"])
        periods: Number of periods to compare (default: 3 fiscal years)

    Returns:
        Dictionary mapping ticker symbol to MultiPeriodStatement.
        Access individual company data via results["TICKER"].

    Raises:
        Any exception raised for one ticker aborts the whole comparison; no
        partial result is returned. Loop over get_revenue_trend() yourself
        to handle failures per company.

    Examples:
        >>> # Compare three tech companies
        >>> results = compare_companies_revenue(["AAPL", "MSFT", "GOOGL"], periods=3)
        >>> print(results["AAPL"])
        >>> # Compare with tuple of tickers
        >>> results = compare_companies_revenue(("AAPL", "MSFT"), periods=5)
        >>> # Iterate through all results
        >>> for ticker, statement in results.items():
        ...     print(ticker)
        ...     print(statement)

    See Also:
        - get_revenue_trend() - For single company analysis
        - Company.income_statement() - The underlying method used
    """
    return {
        symbol: Company(symbol).income_statement(periods=periods)
        for symbol in tickers
    }
def filter_by_industry(
    filings: 'Filings',
    sic: Optional[Union[int, List[int]]] = None,
    sic_range: Optional[tuple[int, int]] = None,
    sic_description_contains: Optional[str] = None,
) -> 'Filings':
    """
    Narrow a Filings collection to companies in a given industry.

    The industry criteria are resolved to a company table through
    edgar.reference.get_companies_by_industry(); its 'cik' column is then
    used to filter the filings. An empty ``filings`` collection is returned
    unchanged without performing the lookup.

    Args:
        filings: Filings collection to filter (from get_filings() or similar)
        sic: Single SIC code or list (e.g., 2834 or [2834, 2835, 2836])
        sic_range: SIC range tuple (e.g., (7300, 7400) for tech)
            Note: Use EXCLUSIVE upper bound (7400 means up to 7399)
        sic_description_contains: Search SIC description (e.g., "software")

    Returns:
        Filtered Filings collection containing only filings from companies
        in the specified industry

    Raises:
        No validation is done here; exceptions from
        get_companies_by_industry() or Filings.filter() propagate unchanged.

    Examples:
        >>> from edgar import get_filings
        >>> from edgar.ai.helpers import filter_by_industry
        >>>
        >>> # Filter filings to pharmaceutical companies
        >>> filings = get_filings(2023, 4, form="10-K")
        >>> pharma_10ks = filter_by_industry(filings, sic=2834)
        >>>
        >>> # Filter to technology companies (SIC 7300-7399)
        >>> filings = get_filings(2023, 4, form="8-K")
        >>> tech_8ks = filter_by_industry(filings, sic_range=(7300, 7400))
        >>>
        >>> # Filter using description search
        >>> filings = get_filings(2023, 4)
        >>> software = filter_by_industry(filings, sic_description_contains="software")

    See Also:
        - filter_by_company_subset() - Filter using CompanySubset fluent interface
        - get_companies_by_industry() - Get company list directly (from edgar.reference)
        - Filings.filter() - The underlying filter method
    """
    from edgar.reference import get_companies_by_industry

    # Nothing to filter: hand the empty collection straight back.
    if len(filings) == 0:
        return filings

    # Resolve the industry criteria to a company table and take its CIKs.
    industry_companies = get_companies_by_industry(
        sic=sic,
        sic_range=sic_range,
        sic_description_contains=sic_description_contains
    )
    industry_ciks = industry_companies['cik'].tolist()

    if industry_ciks:
        # Filter filings using the matching CIKs
        return filings.filter(cik=industry_ciks)

    # No company matched: ask for an empty Filings collection with the same
    # structure.
    # NOTE(review): confirm Filings.filter(cik=[]) really yields an empty
    # collection rather than treating the empty list as "no filter".
    return filings.filter(cik=[])
def filter_by_company_subset(
    filings: 'Filings',
    companies: Union['CompanySubset', pd.DataFrame]
) -> 'Filings':
    """
    Keep only the filings whose CIK appears in a given set of companies.

    The company set can come from the CompanySubset fluent interface
    (industry + state + sampling + etc) or from any DataFrame that carries a
    'cik' column.

    Args:
        filings: Filings collection to narrow down
        companies: CompanySubset object, or pandas DataFrame with a 'cik' column

    Returns:
        Result of ``filings.filter(cik=...)`` for the CIKs in ``companies``

    Raises:
        ValueError: If the resolved DataFrame has no 'cik' column

    Examples:
        >>> from edgar import get_filings
        >>> from edgar.reference import CompanySubset
        >>> from edgar.ai.helpers import filter_by_company_subset
        >>>
        >>> filings = get_filings(2023, 4, form="10-K")
        >>>
        >>> # Delaware pharmaceutical companies, sample of 10
        >>> companies = (CompanySubset()
        ...     .from_industry(sic=2834)
        ...     .from_state('DE')
        ...     .sample(10, random_state=42))
        >>> pharma_de_filings = filter_by_company_subset(filings, companies)
        >>>
        >>> # A plain DataFrame works as well
        >>> from edgar.reference import get_pharmaceutical_companies
        >>> pharma = get_pharmaceutical_companies()
        >>> pharma_filings = filter_by_company_subset(filings, pharma)

    See Also:
        - filter_by_industry() - Simpler industry-only filtering
        - CompanySubset - Fluent interface for complex filtering (from edgar.reference)
    """
    from edgar.reference import CompanySubset

    # A CompanySubset is resolved to its DataFrame via .get(); anything else
    # is used as-is and is expected to expose .columns.
    company_frame = companies.get() if isinstance(companies, CompanySubset) else companies

    if 'cik' not in company_frame.columns:
        raise ValueError("companies DataFrame must have 'cik' column")

    # An empty subset simply forwards an empty CIK list to the filter.
    # NOTE(review): confirm Filings.filter treats cik=[] as "match nothing"
    # rather than "no CIK filter applied".
    return filings.filter(cik=company_frame['cik'].tolist())
# ============================================================================
# Company Subset Convenience Functions
# ============================================================================
def get_companies_by_state(states: Union[str, List[str]]) -> pd.DataFrame:
    """
    Look up companies by their state of incorporation.

    Thin convenience wrapper that delegates to
    ``edgar.reference.get_companies_by_state``.

    Args:
        states: A single state code or a list of them (e.g., 'DE' or ['DE', 'NV'])

    Returns:
        DataFrame with companies incorporated in specified state(s).
        Columns: cik, ticker, name, exchange, sic, sic_description,
        state_of_incorporation, state_of_incorporation_description,
        fiscal_year_end, entity_type, ein

    Examples:
        >>> # Delaware companies (most common)
        >>> de_companies = get_companies_by_state('DE')
        >>> print(f"Found {len(de_companies)} Delaware companies")
        >>>
        >>> # Several states at once
        >>> tech_hubs = get_companies_by_state(['DE', 'CA', 'NV'])
        >>> print(tech_hubs[['ticker', 'name', 'state_of_incorporation']].head())

    See Also:
        - filter_by_company_subset() - Filter filings by company subset
        - CompanySubset.from_state() - Fluent interface (from edgar.reference)
    """
    # Imported lazily under an alias so it does not clash with this wrapper's name
    from edgar.reference import get_companies_by_state as _reference_lookup

    matching_companies = _reference_lookup(states)
    return matching_companies
def get_pharmaceutical_companies() -> pd.DataFrame:
    """
    Return every pharmaceutical company (SIC 2834 - Pharmaceutical Preparations).

    Thin convenience wrapper that delegates to
    ``edgar.reference.get_pharmaceutical_companies``.

    Returns:
        DataFrame with pharmaceutical companies and comprehensive metadata.

    Examples:
        >>> pharma = get_pharmaceutical_companies()
        >>> print(f"Found {len(pharma)} pharmaceutical companies")
        >>> print(pharma[['ticker', 'name']].head())

    See Also:
        - get_biotechnology_companies() - Broader biotech category
        - filter_by_industry() - Filter filings by industry
    """
    # Imported lazily under an alias so it does not clash with this wrapper's name
    from edgar.reference import get_pharmaceutical_companies as _reference_lookup

    pharma_companies = _reference_lookup()
    return pharma_companies
def get_biotechnology_companies() -> pd.DataFrame:
    """
    Return every biotechnology company (SIC 2833-2836).

    Thin convenience wrapper that delegates to
    ``edgar.reference.get_biotechnology_companies``.

    Returns:
        DataFrame with biotechnology companies and comprehensive metadata.

    Examples:
        >>> biotech = get_biotechnology_companies()
        >>> print(f"Found {len(biotech)} biotechnology companies")

    See Also:
        - get_pharmaceutical_companies() - Narrower pharma category
        - filter_by_industry() - Filter filings by industry
    """
    # Imported lazily under an alias so it does not clash with this wrapper's name
    from edgar.reference import get_biotechnology_companies as _reference_lookup

    biotech_companies = _reference_lookup()
    return biotech_companies
def get_software_companies() -> pd.DataFrame:
    """
    Return every software company (SIC 7371-7379 - Computer Programming and Software).

    Thin convenience wrapper that delegates to
    ``edgar.reference.get_software_companies``.

    Returns:
        DataFrame with software companies and comprehensive metadata.

    Examples:
        >>> software = get_software_companies()
        >>> print(f"Found {len(software)} software companies")
        >>> # Narrow recent 10-K filings down to software companies
        >>> from edgar import get_filings
        >>> filings = get_filings(2023, 4, form="10-K")
        >>> software_10ks = filter_by_company_subset(filings, software)

    See Also:
        - get_semiconductor_companies() - Hardware tech companies
        - filter_by_industry() - Filter filings by industry
    """
    # Imported lazily under an alias so it does not clash with this wrapper's name
    from edgar.reference import get_software_companies as _reference_lookup

    software_companies = _reference_lookup()
    return software_companies
def get_semiconductor_companies() -> pd.DataFrame:
    """
    Return every semiconductor company (SIC 3674 - Semiconductors and Related Devices).

    Thin convenience wrapper that delegates to
    ``edgar.reference.get_semiconductor_companies``.

    Returns:
        DataFrame with semiconductor companies and comprehensive metadata.

    Examples:
        >>> semis = get_semiconductor_companies()
        >>> print(f"Found {len(semis)} semiconductor companies")

    See Also:
        - get_software_companies() - Software tech companies
        - filter_by_industry() - Filter filings by industry
    """
    # Imported lazily under an alias so it does not clash with this wrapper's name
    from edgar.reference import get_semiconductor_companies as _reference_lookup

    semiconductor_companies = _reference_lookup()
    return semiconductor_companies
def get_banking_companies() -> pd.DataFrame:
    """
    Return every banking company (SIC 6020-6029 - Commercial Banks).

    Thin convenience wrapper that delegates to
    ``edgar.reference.get_banking_companies``.

    Returns:
        DataFrame with banking companies and comprehensive metadata.

    Examples:
        >>> banks = get_banking_companies()
        >>> print(f"Found {len(banks)} banking companies")

    See Also:
        - get_investment_companies() - Investment/securities firms
        - get_insurance_companies() - Insurance companies
        - filter_by_industry() - Filter filings by industry
    """
    # Imported lazily under an alias so it does not clash with this wrapper's name
    from edgar.reference import get_banking_companies as _reference_lookup

    banking_companies = _reference_lookup()
    return banking_companies
def get_investment_companies() -> pd.DataFrame:
    """
    Return every investment company (SIC 6200-6299 - Security and Commodity Brokers).

    Thin convenience wrapper that delegates to
    ``edgar.reference.get_investment_companies``.

    Returns:
        DataFrame with investment companies and comprehensive metadata.

    Examples:
        >>> investments = get_investment_companies()
        >>> print(f"Found {len(investments)} investment companies")

    See Also:
        - get_banking_companies() - Commercial banks
        - get_insurance_companies() - Insurance companies
        - filter_by_industry() - Filter filings by industry
    """
    # Imported lazily under an alias so it does not clash with this wrapper's name
    from edgar.reference import get_investment_companies as _reference_lookup

    investment_companies = _reference_lookup()
    return investment_companies
def get_insurance_companies() -> pd.DataFrame:
    """
    Return every insurance company (SIC 6300-6399 - Insurance Carriers).

    Thin convenience wrapper that delegates to
    ``edgar.reference.get_insurance_companies``.

    Returns:
        DataFrame with insurance companies and comprehensive metadata.

    Examples:
        >>> insurance = get_insurance_companies()
        >>> print(f"Found {len(insurance)} insurance companies")

    See Also:
        - get_banking_companies() - Commercial banks
        - get_investment_companies() - Investment firms
        - filter_by_industry() - Filter filings by industry
    """
    # Imported lazily under an alias so it does not clash with this wrapper's name
    from edgar.reference import get_insurance_companies as _reference_lookup

    insurance_companies = _reference_lookup()
    return insurance_companies
def get_real_estate_companies() -> pd.DataFrame:
    """
    Return every real estate company (SIC 6500-6599 - Real Estate).

    Thin convenience wrapper that delegates to
    ``edgar.reference.get_real_estate_companies``.

    Returns:
        DataFrame with real estate companies and comprehensive metadata.

    Examples:
        >>> real_estate = get_real_estate_companies()
        >>> print(f"Found {len(real_estate)} real estate companies")

    See Also:
        - filter_by_industry() - Filter filings by industry
    """
    # Imported lazily under an alias so it does not clash with this wrapper's name
    from edgar.reference import get_real_estate_companies as _reference_lookup

    real_estate_companies = _reference_lookup()
    return real_estate_companies
def get_oil_gas_companies() -> pd.DataFrame:
    """
    Return every oil and gas company (SIC 1300-1399 - Oil and Gas Extraction).

    Thin convenience wrapper that delegates to
    ``edgar.reference.get_oil_gas_companies``.

    Returns:
        DataFrame with oil and gas companies and comprehensive metadata.

    Examples:
        >>> oil_gas = get_oil_gas_companies()
        >>> print(f"Found {len(oil_gas)} oil and gas companies")

    See Also:
        - filter_by_industry() - Filter filings by industry
    """
    # Imported lazily under an alias so it does not clash with this wrapper's name
    from edgar.reference import get_oil_gas_companies as _reference_lookup

    oil_gas_companies = _reference_lookup()
    return oil_gas_companies
def get_retail_companies() -> pd.DataFrame:
    """
    Return every retail company (SIC 5200-5999 - Retail Trade).

    Thin convenience wrapper that delegates to
    ``edgar.reference.get_retail_companies``.

    Returns:
        DataFrame with retail companies and comprehensive metadata.

    Examples:
        >>> retail = get_retail_companies()
        >>> print(f"Found {len(retail)} retail companies")

    See Also:
        - filter_by_industry() - Filter filings by industry
    """
    # Imported lazily under an alias so it does not clash with this wrapper's name
    from edgar.reference import get_retail_companies as _reference_lookup

    retail_companies = _reference_lookup()
    return retail_companies

View File

@@ -0,0 +1,27 @@
"""
Model Context Protocol (MCP) server for EdgarTools.
This module provides MCP server functionality to expose EdgarTools
capabilities to AI agents and assistants like Claude Desktop.
Usage:
# Start the server
python -m edgar.ai
# Or via console script
edgartools-mcp
# Test the server configuration
python -m edgar.ai --test
For configuration and setup instructions, see:
edgar/ai/mcp/docs/MCP_QUICKSTART.md
"""
from edgar.ai.mcp.server import main, test_server
# Public API of this package: the two entry points re-exported from
# edgar.ai.mcp.server (server start-up and the configuration self-check).
__all__ = [
    "main",
    "test_server",
]

View File

@@ -0,0 +1,467 @@
# EdgarTools MCP Quickstart Guide
This guide helps you get started with EdgarTools MCP server in under 5 minutes.
## Installation
```bash
# Install EdgarTools with AI features
pip install edgartools[ai]
```
## Starting the Server
EdgarTools provides two ways to start the MCP server:
### Option 1: Python Module (Recommended)
```bash
python -m edgar.ai
```
### Option 2: Console Script
```bash
edgartools-mcp
```
Both methods work identically and will start the MCP server listening on stdin/stdout.
## Client Configuration
### Claude Desktop
**Step 1: Install Claude Desktop**
- Download from https://claude.ai/download (macOS or Windows)
**Step 2: Configure the Server**
You can configure EdgarTools MCP in two ways:
**Option A: Using Claude Desktop Settings (Easier)**
1. Open Claude Desktop
2. Go to Settings (macOS: `Cmd+,` / Windows: `Ctrl+,`)
3. Navigate to **Developer** tab
4. Click **Edit Config** button
5. This will open `claude_desktop_config.json` in your default editor
**Option B: Edit Configuration File Directly**
Configuration file location:
- **macOS**: `~/Library/Application Support/Claude/claude_desktop_config.json`
- **Windows**: `%APPDATA%\Claude\claude_desktop_config.json`
**Configuration (macOS):**
```json
{
"mcpServers": {
"edgartools": {
"command": "python3",
"args": ["-m", "edgar.ai"],
"env": {
"EDGAR_IDENTITY": "Your Name your.email@example.com"
}
}
}
}
```
**Configuration (Windows):**
```json
{
"mcpServers": {
"edgartools": {
"command": "python",
"args": ["-m", "edgar.ai"],
"env": {
"EDGAR_IDENTITY": "Your Name your.email@example.com"
}
}
}
}
```
**Important:** On macOS, use `python3` (not `python`) as the command. On Windows, use `python`.
**Important Notes:**
- Replace `"Your Name your.email@example.com"` with your actual name and email
- The `EDGAR_IDENTITY` is required by the SEC for API requests
- Use forward slashes in paths, even on Windows
**Step 3: Restart and Verify**
1. Save the configuration file
2. Restart Claude Desktop
3. Look for the MCP server indicator (🔨) in the bottom-right corner of the chat input
4. Try asking: "Research Apple Inc with financials"
### Cline (VS Code Extension)
**Configuration File:** `.vscode/cline_mcp_settings.json` in your project
```json
{
"mcpServers": {
"edgartools": {
"command": "python3",
"args": ["-m", "edgar.ai"],
"env": {
"EDGAR_IDENTITY": "Your Name your.email@example.com"
}
}
}
}
```
**Note:** Use `python3` on macOS/Linux, or `python` on Windows.
### Continue.dev
**Configuration File:** `~/.continue/config.json`
```json
{
"mcpServers": {
"edgartools": {
"command": "python3",
"args": ["-m", "edgar.ai"],
"env": {
"EDGAR_IDENTITY": "Your Name your.email@example.com"
}
}
}
}
```
**Note:** Use `python3` on macOS/Linux, or `python` on Windows.
## Available Tools
Once connected, AI agents have access to workflow-oriented tools designed for real-world research tasks:
### Workflow Tools (Recommended)
#### 1. edgar_company_research
Comprehensive company intelligence combining profile, financials, recent activity, and ownership in a single workflow.
**Example prompts:**
- "Research Tesla including financials and recent filings"
- "Give me a detailed analysis of Apple Inc"
- "Show me Microsoft's company profile with ownership data"
**Parameters:**
- `identifier` (required): Company ticker, CIK, or name
- `include_financials` (default: true): Include latest financial statements
- `include_filings` (default: true): Include recent filing activity summary
- `include_ownership` (default: false): Include insider/institutional ownership highlights
- `detail_level` (default: "standard"): Response detail - "minimal", "standard", or "detailed"
**What it provides:**
- Company profile (name, CIK, ticker, industry)
- Latest financial metrics and statements
- Recent filing activity summary
- Ownership highlights (when requested)
#### 2. edgar_analyze_financials
Multi-period financial statement analysis for trend analysis and comparisons.
**Example prompts:**
- "Analyze Apple's income statement for the last 4 years"
- "Show me Tesla's quarterly cash flow for the last 8 quarters"
- "Compare Microsoft's income, balance sheet, and cash flow statements"
**Parameters:**
- `company` (required): Company ticker, CIK, or name
- `periods` (default: 4): Number of periods to analyze
- `annual` (default: true): Annual (true) or quarterly (false) periods
- `statement_types` (default: ["income"]): Statements to include - "income", "balance", "cash_flow"
**What it provides:**
- Multi-period income statements
- Multi-period balance sheets
- Multi-period cash flow statements
- Formatted for AI analysis and comparison
### Basic Tools (Backward Compatibility)
#### 3. edgar_get_company
Get basic company information from SEC filings.
**Example prompts:**
- "Get information about Tesla"
- "Show me Apple's company details"
**Parameters:**
- `identifier` (required): Company ticker, CIK, or name
- `include_financials` (optional): Include latest financial statements
#### 4. edgar_current_filings
Get the most recent SEC filings across all companies.
**Example prompts:**
- "Show me the latest SEC filings"
- "What are the most recent 10-K filings?"
- "Get current 8-K filings"
**Parameters:**
- `limit` (optional): Number of filings to return (default: 20)
- `form_type` (optional): Filter by form type (e.g., "10-K", "10-Q", "8-K")
## Environment Variables
### EDGAR_IDENTITY (Recommended)
The SEC requires proper identification for all API requests. You can configure this in two ways:
**Option 1: In MCP Client Configuration (Recommended)**
Set it in your MCP client config as shown in the examples above:
```json
"env": {
"EDGAR_IDENTITY": "Your Name your.email@example.com"
}
```
**Option 2: Shell Environment Variable**
Add to your `~/.bashrc` or `~/.zshrc`:
```bash
export EDGAR_IDENTITY="Your Name your.email@example.com"
```
**What happens if not set:**
- Server starts with a warning message
- SEC API may rate-limit or return errors
- The server will log helpful instructions for configuring it
**SEC Requirements:**
- Format: "Full Name email@domain.com"
- Must be a valid email you monitor
- Used by SEC to contact you if issues arise with your API usage
## Troubleshooting
### Finding Logs
Claude Desktop logs MCP server activity to help diagnose issues:
**Log Locations:**
- **macOS**: `~/Library/Logs/Claude/`
- Main log: `mcp.log`
- Server-specific: `mcp-server-edgartools.log`
- **Windows**: `%APPDATA%\Claude\logs\`
**Viewing logs:**
```bash
# macOS - watch logs in real-time
tail -f ~/Library/Logs/Claude/mcp-server-edgartools.log
# macOS - view recent errors
tail -50 ~/Library/Logs/Claude/mcp-server-edgartools.log | grep error
```
### "spawn python ENOENT" Error
**Issue:** Claude Desktop logs show `spawn python ENOENT` error
**Where to check:** View logs at `~/Library/Logs/Claude/mcp-server-edgartools.log`
**Cause:** The `python` command is not found in your system PATH. This is the most common issue on macOS.
**Solution:**
1. **Use `python3` instead of `python` (macOS/Linux):**
```json
{
"mcpServers": {
"edgartools": {
"command": "python3",
"args": ["-m", "edgar.ai"]
}
}
}
```
2. **Or specify the full Python path:**
Find your Python path:
```bash
which python3
```
Then use the full path in your configuration:
```json
{
"mcpServers": {
"edgartools": {
"command": "/opt/homebrew/bin/python3",
"args": ["-m", "edgar.ai"]
}
}
}
```
3. **Verify Python is accessible:**
```bash
python3 --version
# Should show: Python 3.10 or higher (the minimum version MCP requires)
```
### Server won't start
**Issue:** `ModuleNotFoundError: No module named 'mcp'`
**Solution:** Install AI dependencies
```bash
pip install edgartools[ai]
# or with pip3
pip3 install edgartools[ai]
```
### Client can't find server
**Issue:** Claude Desktop shows connection error
**Solution:** Verify the command works from terminal first
```bash
python3 -m edgar.ai
# Should show: Starting EdgarTools MCP Server v...
# Press Ctrl+C to stop
```
### Wrong Python version
**Issue:** Server starts but tools don't work
**Solution:** MCP requires Python 3.10+. Check your version:
```bash
python --version
```
If using Python 3.9 or earlier, upgrade Python:
```bash
# macOS with Homebrew
brew install python@3.11
```
Then update your config to use the specific version:
```json
{
  "mcpServers": {
    "edgartools": {
      "command": "/opt/homebrew/bin/python3.11",
      "args": ["-m", "edgar.ai"]
    }
  }
}
```
## Verification
### Quick Test
Before configuring your MCP client, verify the server is working:
```bash
python -m edgar.ai --test
```
**Expected output:**
```
Testing EdgarTools MCP Server Configuration...
✓ EdgarTools v4.18.0 imports successfully
✓ MCP framework available
✓ EDGAR_IDENTITY configured: Your Name your@email.com
✓ Core EdgarTools functionality available
✓ All checks passed - MCP server is ready to run
```
If any checks fail, the test will show specific error messages and installation instructions.
### Full Integration Test
1. **Start the server manually:**
```bash
python -m edgar.ai
```
You should see: `Starting EdgarTools MCP Server v4.18.0` (the version shown will match your installed EdgarTools release)
2. **Configure your MCP client** (see configurations above)
3. **Test in your MCP client:**
Try these example prompts:
- "Research Apple Inc with financials and recent filings"
- "Analyze Tesla's quarterly income statement for the last 4 quarters"
- "Get the latest 10-K filings"
4. **Check server logs:**
The server logs to stderr. Check your MCP client's developer console for any errors.
5. **Verify tool availability:**
In Claude Desktop, look for the MCP indicator (🔨) in the bottom-right corner of the chat input. Clicking it should show available EdgarTools tools.
## Migration from Legacy Setup
If you're currently using the old `run_mcp_server.py` entry point, here's how to migrate:
### Old Configuration (Deprecated):
```json
{
"mcpServers": {
"edgartools": {
"command": "python",
"args": ["/absolute/path/to/edgartools/edgar/ai/run_mcp_server.py"]
}
}
}
```
### New Configuration (macOS):
```json
{
"mcpServers": {
"edgartools": {
"command": "python3",
"args": ["-m", "edgar.ai"],
"env": {
"EDGAR_IDENTITY": "Your Name your@email.com"
}
}
}
}
```
### New Configuration (Windows):
```json
{
"mcpServers": {
"edgartools": {
"command": "python",
"args": ["-m", "edgar.ai"],
"env": {
"EDGAR_IDENTITY": "Your Name your@email.com"
}
}
}
}
```
### Benefits of Migrating:
- ✅ No absolute file paths required
- ✅ Works from any directory
- ✅ Proper SEC identity configuration
- ✅ Simpler configuration
- ✅ Better error messages
- ✅ Verification tool support (`--test` flag)
**Note:** The old entry point still works but shows a deprecation warning. It will be removed in a future version.
## Next Steps
- Read the [full MCP documentation](../../../docs-internal/features/edgartools-mcp-ai-support.md) for advanced features
- See [AI package structure](../../../docs-internal/features/ai-mcp-package-structure-plan.md) for architecture details
- Explore example notebooks showing MCP workflows
## Support
- **Issues:** https://github.com/dgunning/edgartools/issues
- **Discussions:** https://github.com/dgunning/edgartools/discussions
- **Documentation:** https://dgunning.github.io/edgartools/

View File

@@ -0,0 +1,394 @@
#!/usr/bin/env python3
"""
EdgarTools MCP Server
MCP (Model Context Protocol) server providing AI agents access to SEC filing data.
This module provides the main entry point for the MCP server.
Usage:
    python -m edgar.ai # Via module
edgartools-mcp # Via console script
"""
import asyncio
import logging
import os
from typing import Any
from mcp import Resource, Tool
from mcp.server import NotificationOptions, Server
from mcp.server.models import InitializationOptions
from mcp.server.stdio import stdio_server
from mcp.types import TextContent
# Set up logging. No stream is passed, so basicConfig attaches its default
# stderr handler; stdout stays free for the stdio MCP transport used in main().
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
# Module-wide logger shared by the setup, handler and entry-point functions.
logger = logging.getLogger("edgartools-mcp")
def setup_edgar_identity():
    """Apply the SEC identity found in the EDGAR_IDENTITY environment variable.

    When the variable holds a value it is handed to ``edgar.set_identity`` and
    the configured identity is logged. When it is missing or empty, a warning
    with setup instructions is logged and nothing else happens. Any exception
    raised along the way (including a failing ``edgar`` import) is logged as
    an error and swallowed, so this function never raises.
    """
    try:
        # Import first: a broken edgar install is reported even when the
        # environment variable is absent.
        from edgar import set_identity

        configured_identity = os.environ.get('EDGAR_IDENTITY')
        if configured_identity:
            set_identity(configured_identity)
            logger.info(f"SEC identity configured: {configured_identity}")
        else:
            logger.warning(
                "EDGAR_IDENTITY environment variable not set. "
                "The SEC requires proper identification for API requests.\n"
                "Add to your MCP client configuration:\n"
                '  "env": {"EDGAR_IDENTITY": "Your Name your.email@example.com"}\n'
                "Or set in your shell: export EDGAR_IDENTITY=\"Your Name your.email@example.com\""
            )
    except Exception as e:
        # Best-effort by design: the server should still start without an identity.
        logger.error(f"Error setting up EDGAR identity: {e}")
# Create the MCP server instance named "edgartools". The handler functions
# in this module register themselves on it through its decorators, and
# main() runs it over the stdio transport.
app = Server("edgartools")
@app.list_tools()
async def list_tools() -> list[Tool]:
    """Describe every tool this server exposes, with its JSON input schema.

    Returns:
        Four Tool definitions, in this order: edgar_company_research,
        edgar_analyze_financials, edgar_industry_overview and
        edgar_compare_industry_companies.
    """

    def _flag(description: str, default: bool) -> dict[str, Any]:
        # Boolean property; key order matches the hand-written schemas.
        return {"type": "boolean", "description": description, "default": default}

    def _count(description: str, default: int) -> dict[str, Any]:
        # Integer property with a default.
        return {"type": "integer", "description": description, "default": default}

    def _industry() -> dict[str, Any]:
        # Fresh dict per call so the two industry tools never share a schema object.
        return {
            "type": "string",
            "enum": [
                "pharmaceuticals", "biotechnology", "software",
                "semiconductors", "banking", "investment",
                "insurance", "real_estate", "oil_gas", "retail"
            ],
            "description": "Industry sector to analyze"
        }

    company_research = Tool(
        name="edgar_company_research",
        description="Get company overview and background. Returns profile, 3-year financial trends, and recent filing activity. Use this for initial company research or to get a snapshot of recent performance.",
        inputSchema={
            "type": "object",
            "properties": {
                "identifier": {
                    "type": "string",
                    "description": "Company ticker (AAPL), CIK (0000320193), or name (Apple Inc)"
                },
                "include_financials": _flag(
                    "Include 3-year income statement showing revenue and profit trends", True
                ),
                "include_filings": _flag("Include summary of last 5 SEC filings", True),
                "include_ownership": _flag(
                    "Include insider and institutional ownership data (currently not implemented)", False
                ),
                "detail_level": {
                    "type": "string",
                    "enum": ["minimal", "standard", "detailed"],
                    "description": "Response detail: 'minimal' (key metrics only), 'standard' (balanced), 'detailed' (comprehensive data)",
                    "default": "standard"
                }
            },
            "required": ["identifier"]
        }
    )

    analyze_financials = Tool(
        name="edgar_analyze_financials",
        description="Detailed financial statement analysis across multiple periods. Use this for trend analysis, growth calculations, or comparing financial performance over time.",
        inputSchema={
            "type": "object",
            "properties": {
                "company": {
                    "type": "string",
                    "description": "Company ticker (TSLA), CIK (0001318605), or name (Tesla Inc)"
                },
                "periods": _count(
                    "Number of periods: 4-5 for trends, 8-10 for patterns (max 10)", 4
                ),
                "annual": _flag(
                    "Use annual periods (true) for long-term trends and year-over-year comparisons, or quarterly periods (false) for recent performance and current earnings. Quarterly provides more recent data but may show seasonal volatility.",
                    True
                ),
                "statement_types": {
                    "type": "array",
                    "items": {"type": "string", "enum": ["income", "balance", "cash_flow"]},
                    "description": "Statements to include: 'income' (revenue, profit, growth), 'balance' (assets, liabilities, equity), 'cash_flow' (operating, investing, financing cash flows)",
                    "default": ["income"]
                }
            },
            "required": ["company"]
        }
    )

    industry_overview = Tool(
        name="edgar_industry_overview",
        description="Get overview of an industry sector including company count, major players, and aggregate metrics. Use this to understand industry landscape before diving into specific companies.",
        inputSchema={
            "type": "object",
            "properties": {
                "industry": _industry(),
                "include_top_companies": _flag(
                    "Include list of major companies in the sector", True
                ),
                "limit": _count("Number of top companies to show (by filing activity)", 10)
            },
            "required": ["industry"]
        }
    )

    compare_industry = Tool(
        name="edgar_compare_industry_companies",
        description="Compare financial performance of companies within an industry sector. Automatically selects top companies or accepts custom company list for side-by-side financial comparison.",
        inputSchema={
            "type": "object",
            "properties": {
                "industry": _industry(),
                "companies": {
                    "type": "array",
                    "items": {"type": "string"},
                    "description": "Optional: Specific tickers to compare (e.g., ['AAPL', 'MSFT', 'GOOGL']). If omitted, uses top companies by market presence.",
                    # NOTE(review): a None default on an "array" property -
                    # confirm MCP clients accept a null default here.
                    "default": None
                },
                "limit": _count(
                    "Number of companies to compare if not specified (default 5, max 10)", 5
                ),
                "periods": _count("Number of periods for comparison (default 3)", 3),
                "annual": _flag("Annual (true) or quarterly (false) comparison", True)
            },
            "required": ["industry"]
        }
    )

    return [company_research, analyze_financials, industry_overview, compare_industry]
@app.call_tool()
async def call_tool(name: str, arguments: dict[str, Any] | None) -> list[TextContent]:
    """Route a tool invocation to its handler.

    Args:
        name: Tool name as advertised by list_tools().
        arguments: Tool arguments; None is treated as an empty dict.

    Returns:
        The handler's TextContent list. If the tool name is unknown or the
        handler raises, a single TextContent whose text is "Error: <message>"
        is returned instead of propagating the exception.
    """
    tool_args = {} if arguments is None else arguments

    try:
        # Handler modules are imported on demand, only for the tool requested.
        if name == "edgar_company_research":
            from edgar.ai.mcp.tools.company_research import handle_company_research
            return await handle_company_research(tool_args)

        if name == "edgar_analyze_financials":
            from edgar.ai.mcp.tools.financial_analysis import handle_analyze_financials
            return await handle_analyze_financials(tool_args)

        if name == "edgar_industry_overview":
            from edgar.ai.mcp.tools.industry_analysis import handle_industry_overview
            return await handle_industry_overview(tool_args)

        if name == "edgar_compare_industry_companies":
            from edgar.ai.mcp.tools.industry_analysis import handle_compare_industry_companies
            return await handle_compare_industry_companies(tool_args)

        # Raised inside the try so an unknown name is reported like any other failure.
        raise ValueError(f"Unknown tool: {name}")
    except Exception as e:
        logger.error("Error in tool %s: %s", name, e)
        failure = TextContent(
            type="text",
            text=f"Error: {str(e)}"
        )
        return [failure]
@app.list_resources()
async def list_resources() -> list[Resource]:
    """Advertise the static resources this server can serve.

    Returns:
        A one-element list holding the quickstart guide resource, whose URI
        is the one read_resource() answers.
    """
    quickstart_guide = Resource(
        uri="edgartools://docs/quickstart",
        name="EdgarTools Quickstart Guide",
        description="Quick start guide for using EdgarTools",
        mimeType="text/markdown"
    )
    return [quickstart_guide]
@app.read_resource()
async def read_resource(uri: str) -> str:
    """Return the content of a resource advertised by list_resources().

    Args:
        uri: Resource URI. Only "edgartools://docs/quickstart" is served.

    Returns:
        Markdown text of the quickstart guide.

    Raises:
        ValueError: If the URI does not name a known resource.
    """
    # Normalize before comparing. NOTE(review): the MCP framework may pass a
    # URL object rather than a plain str, in which case a direct == against a
    # string literal would not match - confirm against the SDK version in use.
    requested = str(uri)

    if requested != "edgartools://docs/quickstart":
        raise ValueError(f"Unknown resource: {uri}")

    # The tool list below mirrors the names registered in list_tools();
    # call_tool() rejects any name that is not one of them.
    return """# EdgarTools Quickstart

## Basic Usage

```python
from edgar import Company, get_current_filings

# Get company information
company = Company("AAPL")
print(f"{company.name} - CIK: {company.cik}")

# Get filings
filings = company.get_filings(form="10-K", limit=5)
for filing in filings:
    print(f"{filing.form} - {filing.filing_date}")

# Get current filings across all companies
current = get_current_filings(limit=20)
for filing in current.data.to_pylist():
    print(f"{filing['company']} - {filing['form']}")
```

## Available Tools

- **edgar_company_research**: Company overview with profile, financial trends, and recent filing activity
- **edgar_analyze_financials**: Financial statement analysis across multiple periods
- **edgar_industry_overview**: Overview of an industry sector, including its major companies
- **edgar_compare_industry_companies**: Side-by-side financial comparison of companies within an industry sector

## Example Queries

- "Research Apple Inc including financials and recent filings"
- "Analyze Tesla's income statement for the last 4 years"
- "Give me an overview of the semiconductors industry"
- "Compare the top software companies over the last 3 years"
"""
def main():
    """Start the EdgarTools MCP server on the stdio transport.

    Configures the SEC identity from the environment, then runs the server
    until the transport closes. A KeyboardInterrupt is logged and swallowed;
    any other exception is logged with its traceback and re-raised.
    """
    try:
        # The package version doubles as the advertised server version.
        from edgar.__about__ import __version__

        setup_edgar_identity()

        async def _serve():
            """Open the stdio streams and hand them to the MCP app."""
            logger.info(f"Starting EdgarTools MCP Server v{__version__}")

            async with stdio_server() as (incoming, outgoing):
                init_options = InitializationOptions(
                    server_name="edgartools",
                    server_version=__version__,
                    capabilities=app.get_capabilities(
                        notification_options=NotificationOptions(),
                        experimental_capabilities={}
                    )
                )
                await app.run(incoming, outgoing, init_options)

        asyncio.run(_serve())
    except KeyboardInterrupt:
        logger.info("Server stopped by user")
    except Exception as e:
        logger.error(f"Server error: {e}", exc_info=True)
        raise
def test_server():
    """Check that the MCP server's prerequisites are in place and print a report.

    Four checks are run and reported on stdout:
      1. the ``edgar`` package and its version can be imported,
      2. the MCP framework (``mcp.server``) can be imported,
      3. the EDGAR_IDENTITY environment variable is set (advisory only -
         a missing value prints a hint but does not fail the run),
      4. ``get_current_filings`` can be imported from ``edgar``.

    Returns:
        bool: True if checks 1, 2 and 4 all pass, False otherwise
    """
    print("Testing EdgarTools MCP Server Configuration...\n")
    all_passed = True

    # Test 1: EdgarTools import check
    try:
        from edgar import Company  # imported only to prove the package loads
        from edgar.__about__ import __version__
        print(f"✓ EdgarTools v{__version__} imports successfully")
    except ImportError as e:
        print(f"✗ EdgarTools import error: {e}")
        print("  Install with: pip install edgartools")
        all_passed = False

    # Test 2: MCP framework check
    try:
        from mcp.server import Server  # imported only to prove the extra is installed
        print("✓ MCP framework available")
    except ImportError as e:
        print(f"✗ MCP framework not installed: {e}")
        print("  Install with: pip install edgartools[ai]")
        all_passed = False

    # Test 3: Identity configuration check (a warning, never a failure)
    identity = os.environ.get('EDGAR_IDENTITY')
    if identity:
        print(f"✓ EDGAR_IDENTITY configured: {identity}")
    else:
        print("⚠ EDGAR_IDENTITY not set (recommended)")
        print("  Set with: export EDGAR_IDENTITY=\"Your Name your@email.com\"")
        print("  Or configure in MCP client's env settings")

    # Test 4: Quick functionality test
    try:
        from edgar import get_current_filings
        print("✓ Core EdgarTools functionality available")
    except Exception as e:
        print(f"✗ EdgarTools functionality check failed: {e}")
        all_passed = False

    # Summary
    print()
    if all_passed:
        print("✓ All checks passed - MCP server is ready to run")
        print("\nTo start the server:")
        print("  python -m edgar.ai")
        print("  or")
        print("  edgartools-mcp")
        return True

    print("✗ Some checks failed - please fix the issues above")
    return False
if __name__ == "__main__":
    import sys

    # --test / -t runs the configuration self-check and exits with its status
    # (0 on success, 1 on failure); otherwise the server is started.
    if any(flag in sys.argv for flag in ("--test", "-t")):
        sys.exit(0 if test_server() else 1)

    main()

View File

@@ -0,0 +1,15 @@
"""
EdgarTools MCP Tool Handlers
This module contains workflow-oriented tool handlers for the MCP server.
"""
from edgar.ai.mcp.tools.utils import (
check_output_size,
format_error_with_suggestions,
)
# Shared helpers re-exported from edgar.ai.mcp.tools.utils as this
# package's public API.
__all__ = [
    "check_output_size",
    "format_error_with_suggestions",
]

View File

@@ -0,0 +1,192 @@
"""
Company Research Tool Handler
Provides comprehensive company intelligence including profile,
financials, recent activity, and ownership information.
"""
import logging
from typing import Any
from mcp.types import TextContent
from edgar import Company
from edgar.ai.mcp.tools.utils import (
build_company_profile,
check_output_size,
format_error_with_suggestions,
)
logger = logging.getLogger(__name__)
async def handle_company_research(args: dict[str, Any]) -> list[TextContent]:
    """
    Answer a company research tool request.

    Always emits the company profile, then appends each optional section
    that the caller enabled:
    - Latest financial information (on by default)
    - Recent filing activity (on by default)
    - Ownership highlights (off by default)

    A failure inside one optional section is logged and reported inline as
    "Not available" so the remaining sections are still returned.

    Args:
        args: Tool arguments containing:
            - identifier (required): Company ticker, CIK, or name
            - include_financials (default True): Include latest financials
            - include_filings (default True): Include recent filing summary
            - include_ownership (default False): Include ownership highlights
            - detail_level (default "standard"): minimal/standard/detailed

    Returns:
        Single-element list with a TextContent holding the research text,
        or an error message when the request cannot be served.
    """
    identifier = args.get("identifier")
    detail_level = args.get("detail_level", "standard")

    if not identifier:
        return [TextContent(
            type="text",
            text="Error: identifier parameter is required"
        )]

    # One row per optional section:
    # (enabled, fetcher, heading, noun used in the log line, label used in the fallback)
    optional_sections = (
        (
            args.get("include_financials", True),
            lambda co: extract_latest_financials(co, detail_level),
            "\n\nLatest Financials:",
            "financials",
            "Financials",
        ),
        (
            args.get("include_filings", True),
            lambda co: recent_filing_summary(co, detail_level),
            "\n\nRecent Filings:",
            "filings",
            "Recent Filings",
        ),
        (
            args.get("include_ownership", False),
            lambda co: ownership_highlights(co),
            "\n\nOwnership Highlights:",
            "ownership",
            "Ownership",
        ),
    )

    try:
        company = Company(identifier)

        # The profile always leads the response
        chunks = [build_company_profile(company, detail_level)]

        for enabled, fetch, heading, noun, label in optional_sections:
            if not enabled:
                continue
            try:
                section_text = fetch(company)
                if section_text:
                    chunks.append(heading)
                    chunks.append(section_text)
            except Exception as e:
                logger.warning(f"Could not retrieve {noun}: {e}")
                chunks.append(f"\n\n{label}: Not available ({str(e)})")

        # Join the pieces and truncate if the result is too large
        combined = check_output_size("\n".join(chunks))
        return [TextContent(type="text", text=combined)]

    except Exception as e:
        logger.error(f"Error in company research: {e}", exc_info=True)
        return [TextContent(
            type="text",
            text=format_error_with_suggestions(e)
        )]
def extract_latest_financials(company: Any, detail_level: str = "standard") -> str:
    """
    Extract latest financial information for a company.

    Requests the three most recent annual periods of the income statement in
    concise format and renders it with ``to_llm_string()``.

    Args:
        company: Company object exposing ``income_statement(...)``
        detail_level: Level of detail to include. Currently every level
            produces the same output (see the TODO below).

    Returns:
        Formatted financial summary, or an empty string when the statement
        could not be retrieved (the failure is logged as a warning).
    """
    try:
        # Three annual periods give enough history for trend analysis
        stmt = company.income_statement(periods=3, annual=True, concise_format=True)
        # TODO: Extract specific key metrics for detail_level == "minimal"
        # once we understand the API better; until then all levels share
        # the full statement rendering.
        return stmt.to_llm_string()
    except Exception as e:
        logger.warning(f"Could not extract financials: {e}")
        return ""
def recent_filing_summary(company: Any, detail_level: str = "standard") -> str:
    """
    Summarize a company's recent filing activity as a bullet list.

    Args:
        company: Company object exposing ``get_filings(limit=...)``
        detail_level: "minimal" gives one compact line per filing; any other
            value adds the filing description on a following line when the
            filing has one.

    Returns:
        Newline-joined summary, "No recent filings found" when the lookup is
        empty, or an empty string when the lookup fails (logged as a warning).
    """
    try:
        # Only the five most recent filings are summarized
        latest = company.get_filings(limit=5)
        if not latest:
            return "No recent filings found"

        compact = detail_level == "minimal"
        lines = []
        for entry in latest:
            if compact:
                lines.append(f"- {entry.form} ({entry.filing_date})")
                continue
            lines.append(f"- {entry.form} - {entry.filing_date}")
            note = getattr(entry, 'description', None)
            if note:
                lines.append(f" {note}")
        return "\n".join(lines)
    except Exception as e:
        logger.warning(f"Could not retrieve filings: {e}")
        return ""
def ownership_highlights(company: Any) -> str:
    """
    Placeholder for ownership highlights (insider/institutional activity).

    Args:
        company: Company object (currently unused)

    Returns:
        A fixed notice that the feature is not implemented yet
    """
    # TODO: Implement once we understand ownership data access
    # This might require analyzing Form 4 (insider) and 13F (institutional) filings
    placeholder = "Ownership data: Feature not yet implemented"
    logger.info("Ownership highlights not yet implemented")
    return placeholder

View File

@@ -0,0 +1,106 @@
"""
Financial Analysis Tool Handler
Provides multi-period financial statement analysis.
"""
import logging
from typing import Any
from mcp.types import TextContent
from edgar import Company
from edgar.ai.mcp.tools.utils import (
check_output_size,
format_error_with_suggestions,
)
logger = logging.getLogger(__name__)
async def handle_analyze_financials(args: dict[str, Any]) -> list[TextContent]:
    """
    Answer a financial analysis tool request.

    Renders the requested statements through the Company convenience
    methods (income_statement, balance_sheet, cash_flow), always in the
    order income, balance, cash flow. A statement that cannot be produced
    is logged and reported inline as "Not available" without aborting the
    others.

    Args:
        args: Tool arguments containing:
            - company (required): Company ticker, CIK, or name
            - periods (default 4): Number of periods to analyze
            - annual (default True): Annual (true) or quarterly (false)
            - statement_types (default ["income"]): Statements to include

    Returns:
        Single-element list with a TextContent holding the analysis text,
        or an error message when the request cannot be served.
    """
    company_id = args.get("company")
    periods = args.get("periods", 4)
    annual = args.get("annual", True)
    statement_types = args.get("statement_types", ["income"])

    if not company_id:
        return [TextContent(
            type="text",
            text="Error: company parameter is required"
        )]

    # (request key, Company method, section banner, noun for the log line, fallback label)
    statement_specs = (
        ("income", "income_statement", "=== Income Statement ===", "income statement", "Income Statement"),
        ("balance", "balance_sheet", "=== Balance Sheet ===", "balance sheet", "Balance Sheet"),
        ("cash_flow", "cash_flow", "=== Cash Flow Statement ===", "cash flow", "Cash Flow"),
    )

    try:
        company = Company(company_id)

        period_kind = 'Annual' if annual else 'Quarterly'
        lines = [
            f"Financial Analysis: {company.name}",
            f"Periods: {periods} {period_kind}",
            "",
        ]

        for key, method_name, banner, noun, label in statement_specs:
            if key not in statement_types:
                continue
            try:
                fetch = getattr(company, method_name)
                stmt = fetch(periods=periods, annual=annual, concise_format=True)
                lines.append(banner)
                lines.append(stmt.to_llm_string())
                lines.append("")
            except Exception as e:
                logger.warning(f"Could not retrieve {noun}: {e}")
                lines.append(f"{label}: Not available ({str(e)})")
                lines.append("")

        # Financial output gets a larger token budget than the default
        combined = check_output_size("\n".join(lines), max_tokens=3000)
        return [TextContent(type="text", text=combined)]

    except Exception as e:
        logger.error(f"Error in financial analysis: {e}", exc_info=True)
        return [TextContent(
            type="text",
            text=format_error_with_suggestions(e)
        )]

View File

@@ -0,0 +1,238 @@
"""
Industry Analysis Tool Handlers
Provides industry sector analysis and competitive benchmarking capabilities.
"""
import logging
from typing import Any
from mcp.types import TextContent
from edgar import Company
from edgar.ai.mcp.tools.utils import (
check_output_size,
format_error_with_suggestions,
)
logger = logging.getLogger(__name__)
# Industry function mapping.
# Keys are the industry names accepted by the tool handlers in this module;
# values are the names of the functions the handlers look up with getattr()
# on the edgar.ai.helpers module to load that industry's companies.
INDUSTRY_FUNCTIONS = {
    "pharmaceuticals": "get_pharmaceutical_companies",
    "biotechnology": "get_biotechnology_companies",
    "software": "get_software_companies",
    "semiconductors": "get_semiconductor_companies",
    "banking": "get_banking_companies",
    "investment": "get_investment_companies",
    "insurance": "get_insurance_companies",
    "real_estate": "get_real_estate_companies",
    "oil_gas": "get_oil_gas_companies",
    "retail": "get_retail_companies",
}
async def handle_industry_overview(args: dict[str, Any]) -> list[TextContent]:
    """
    Handle industry overview tool requests.

    Provides overview of an industry sector including:
    - Total company count
    - SIC code(s)
    - Major public companies
    - Industry description

    Args:
        args: Tool arguments containing:
            - industry (required): Industry sector name; must be a key of
              INDUSTRY_FUNCTIONS
            - include_top_companies (default True): Include major companies
            - limit (default 10): Number of top companies to show

    Returns:
        List containing TextContent with industry overview, or an error
        message when the industry is missing/unknown or the lookup fails
    """
    industry = args.get("industry")
    include_top = args.get("include_top_companies", True)
    limit = args.get("limit", 10)

    if not industry:
        return [TextContent(
            type="text",
            text="Error: industry parameter is required"
        )]

    if industry not in INDUSTRY_FUNCTIONS:
        return [TextContent(
            type="text",
            text=f"Error: Unknown industry '{industry}'. Must be one of: {', '.join(INDUSTRY_FUNCTIONS.keys())}"
        )]

    try:
        import pandas as pd

        # Import and call the appropriate industry function
        from edgar.ai import helpers

        function_name = INDUSTRY_FUNCTIONS[industry]
        get_companies = getattr(helpers, function_name)
        companies = get_companies()

        def _or_na(value: Any) -> Any:
            # A missing cell may be None, "" or NaN. NaN is truthy in Python,
            # so a plain truthiness test would print "nan" instead of "N/A".
            return value if pd.notna(value) and value else 'N/A'

        # Build response
        response_parts = [
            f"# {industry.replace('_', ' ').title()} Industry Overview",
            "",
            f"**Total Companies**: {len(companies):,}",
        ]

        # Get unique SIC codes
        sic_codes = sorted(companies['sic'].unique().tolist())
        if len(sic_codes) == 1:
            response_parts.append(f"**SIC Code**: {sic_codes[0]}")
        else:
            response_parts.append(f"**SIC Codes**: {', '.join(map(str, sic_codes))}")

        # Get primary description (from first company)
        if len(companies) > 0 and 'sic_description' in companies.columns:
            primary_desc = companies['sic_description'].iloc[0]
            response_parts.append(f"**Description**: {primary_desc}")

        response_parts.append("")

        # Add major companies if requested
        if include_top and len(companies) > 0:
            # Filter to companies with tickers (publicly traded)
            public = companies[companies['ticker'].notna()].copy()

            if len(public) > 0:
                response_parts.append("## Major Public Companies")
                response_parts.append("")

                # Show top N companies
                top_companies = public.head(limit)
                for _, row in top_companies.iterrows():
                    ticker = _or_na(row['ticker'])
                    exchange = _or_na(row['exchange'])
                    response_parts.append(
                        f"- **{ticker}** - {row['name']} ({exchange})"
                    )
            else:
                response_parts.append("*No public companies found in this sector*")

        # Combine response
        response_text = "\n".join(response_parts)

        # Check output size
        response_text = check_output_size(response_text)

        return [TextContent(type="text", text=response_text)]

    except Exception as e:
        logger.error(f"Error in industry overview: {e}", exc_info=True)
        return [TextContent(
            type="text",
            text=format_error_with_suggestions(e)
        )]
async def handle_compare_industry_companies(args: dict[str, Any]) -> list[TextContent]:
    """
    Handle industry company comparison tool requests.

    Compares financial performance of companies within an industry sector by
    rendering each selected company's income statement.

    Args:
        args: Tool arguments containing:
            - industry (required): Industry sector name; must be a key of
              INDUSTRY_FUNCTIONS
            - companies (optional): Specific tickers to compare; a single
              ticker string is accepted as a one-element list
            - limit (default 5): Number of companies if not specified
            - periods (default 3): Number of periods for comparison
            - annual (default True): Annual (true) or quarterly (false)

    Returns:
        List containing TextContent with comparative analysis, or an error
        message when the request cannot be served
    """
    industry = args.get("industry")
    company_tickers = args.get("companies")
    limit = args.get("limit", 5)
    periods = args.get("periods", 3)
    annual = args.get("annual", True)

    if not industry:
        return [TextContent(
            type="text",
            text="Error: industry parameter is required"
        )]

    if industry not in INDUSTRY_FUNCTIONS:
        return [TextContent(
            type="text",
            text=f"Error: Unknown industry '{industry}'. Must be one of: {', '.join(INDUSTRY_FUNCTIONS.keys())}"
        )]

    # Series.isin() rejects a bare string, so treat one ticker as a list of one
    if isinstance(company_tickers, str):
        company_tickers = [company_tickers]

    try:
        # Import and call the appropriate industry function
        from edgar.ai import helpers

        function_name = INDUSTRY_FUNCTIONS[industry]
        get_companies = getattr(helpers, function_name)
        companies = get_companies()

        # Select companies
        if company_tickers:
            # Filter to specified tickers
            selected = companies[companies['ticker'].isin(company_tickers)].copy()
            if len(selected) == 0:
                return [TextContent(
                    type="text",
                    text=f"Error: None of the specified tickers found in {industry} industry"
                )]
        else:
            # Use top N companies with tickers
            public = companies[companies['ticker'].notna()].copy()
            if len(public) == 0:
                return [TextContent(
                    type="text",
                    text=f"Error: No public companies found in {industry} industry"
                )]
            selected = public.head(limit)

        # Compare financials
        response_parts = [
            f"# {industry.replace('_', ' ').title()} Industry Comparison",
            "",
            f"Comparing {len(selected)} companies over {periods} {'annual' if annual else 'quarterly'} periods",
            "",
        ]

        for _, row in selected.iterrows():
            ticker = row['ticker']
            heading = f"## {ticker} - {row['name']}"
            try:
                company = Company(ticker)
                stmt = company.income_statement(
                    periods=periods,
                    annual=annual,
                    concise_format=True
                )
                # Render before emitting anything so a rendering failure
                # cannot leave a second heading for the same company.
                rendered = stmt.to_llm_string()
            except Exception as e:
                logger.warning(f"Could not get financials for {ticker}: {e}")
                response_parts.append(heading)
                response_parts.append(f"*Financial data not available: {str(e)}*")
                response_parts.append("")
            else:
                response_parts.append(heading)
                response_parts.append("")
                response_parts.append(rendered)
                response_parts.append("")

        # Combine response
        response_text = "\n".join(response_parts)

        # Check output size (larger limit for comparative data)
        response_text = check_output_size(response_text, max_tokens=5000)

        return [TextContent(type="text", text=response_text)]

    except Exception as e:
        logger.error(f"Error in industry comparison: {e}", exc_info=True)
        return [TextContent(
            type="text",
            text=format_error_with_suggestions(e)
        )]

View File

@@ -0,0 +1,137 @@
"""
Utility functions for MCP tool handlers.
Provides helper functions for output management, error handling,
and data formatting for MCP responses.
"""
import logging
from typing import Any
logger = logging.getLogger(__name__)
def check_output_size(data: str, max_tokens: int = 2000) -> str:
    """
    Keep tool output within a token budget by truncating oversized text.

    The token count is only estimated (one token per four characters). Text
    whose estimate exceeds ``max_tokens`` is cut to 90% of the character
    budget and a truncation notice is appended.

    Args:
        data: The text data to check
        max_tokens: Maximum allowed tokens (default: 2000)

    Returns:
        ``data`` unchanged when within the limit, otherwise the truncated
        text followed by a notice
    """
    # Rough estimation: 1 token ≈ 4 characters
    approx_tokens = len(data) / 4
    if approx_tokens <= max_tokens:
        return data

    # Plain cut-off for now
    # TODO: Implement smarter summarization in future
    keep_chars = int(max_tokens * 4 * 0.9)  # 90% of limit to be safe
    head = data[:keep_chars]
    logger.warning(f"Output truncated: {int(approx_tokens)} tokens -> {max_tokens} tokens")
    return f"{head}\n\n... (output truncated to stay within token limit)"
def format_error_with_suggestions(error: Exception) -> str:
    """
    Render an exception as an AI-friendly message with next steps.

    Suggestions are chosen by the exact class name of ``error``; class names
    without a dedicated entry receive three generic suggestions.

    Args:
        error: The exception that occurred

    Returns:
        Multi-line text: the error message, the error type, a blank line,
        and a numbered "Suggestions:" list
    """
    kind = type(error).__name__

    # Tailored advice for the error classes we expect to see
    advice_by_kind = {
        "CompanyNotFound": [
            "Try searching by CIK instead of ticker",
            "Use the full company name",
            "Check spelling of ticker symbol"
        ],
        "NoFinancialsAvailable": [
            "Company may not have filed recent 10-K/10-Q",
            "Try include_financials=False for basic info",
            "Check filing history with edgar_market_monitor tool"
        ],
        "FileNotFoundError": [
            "The requested filing may not be available",
            "Try a different form type or date range",
            "Verify the company has filed this type of document"
        ],
        "HTTPError": [
            "SEC EDGAR website may be temporarily unavailable",
            "Check your internet connection",
            "Try again in a few moments"
        ],
        "ValueError": [
            "Check that all required parameters are provided",
            "Verify parameter formats (e.g., valid ticker symbols)",
            "Review the tool's parameter documentation"
        ]
    }
    generic_advice = [
        "Try rephrasing your request",
        "Check parameter values",
        "Consult the tool documentation"
    ]
    advice = advice_by_kind.get(kind, generic_advice)

    # Header lines first, then the numbered suggestions
    lines = [
        f"Error: {str(error)}",
        f"Error Type: {kind}",
        "",
        "Suggestions:"
    ]
    lines.extend(f"{number}. {tip}" for number, tip in enumerate(advice, 1))
    return "\n".join(lines)
def build_company_profile(company: Any, detail_level: str = "standard") -> str:
    """
    Build a company profile summary.

    The name and CIK are always included. Every other field is optional and
    is emitted only when the attribute exists and holds a non-empty value,
    so a missing industry never shows up as "Industry: None".

    Args:
        company: Company object; must expose ``name`` and ``cik``, and may
            expose ``tickers``, ``sic_description`` and ``description``
        detail_level: Level of detail (minimal/standard/detailed)

    Returns:
        Formatted company profile text, one field per line
    """
    parts = [f"Company: {company.name}", f"CIK: {company.cik}"]

    # Only the first ticker is shown
    tickers = getattr(company, 'tickers', None)
    if tickers:
        parts.append(f"Ticker: {tickers[0]}")

    # Industry/sector is shown at standard detail and above
    if detail_level in ("standard", "detailed"):
        industry = getattr(company, 'sic_description', None)
        if industry:
            parts.append(f"Industry: {industry}")

    # The free-text description is reserved for the detailed level
    if detail_level == "detailed":
        description = getattr(company, 'description', None)
        if description:
            parts.append(f"\nDescription: {description}")

    return "\n".join(parts)

View File

@@ -0,0 +1,63 @@
"""
EdgarTools AI Skills - Skill discovery and management.
Skills are self-contained packages of documentation and helper functions
that enable AI agents to perform domain-specific tasks with EdgarTools.
"""
from edgar.ai.skills.base import BaseSkill
from edgar.ai.skills.core import edgartools_skill, EdgarToolsSkill
__all__ = [
'BaseSkill',
'EdgarToolsSkill',
'edgartools_skill',
'list_skills',
'get_skill',
]
def list_skills() -> list:
    """
    Return every skill known to this package.

    Returns:
        List of BaseSkill instances

    Example:
        >>> from edgar.ai.skills import list_skills
        >>> skills = list_skills()
        >>> for skill in skills:
        ...     print(f"{skill.name}: {skill.description}")
    """
    # The core EdgarTools skill is the only built-in entry today.
    # External packages can register additional skills here.
    available = [edgartools_skill]
    return available
def get_skill(name: str) -> BaseSkill:
    """
    Get skill by name.

    Args:
        name: Skill name (e.g., "EdgarTools"); matched exactly against each
            skill's ``name``

    Returns:
        BaseSkill instance

    Raises:
        ValueError: If skill not found; the message lists the available names

    Example:
        >>> from edgar.ai.skills import get_skill
        >>> skill = get_skill("EdgarTools")
        >>> docs = skill.get_documents()
    """
    # Resolve the list once so the error message reports exactly the skills
    # that were searched.
    skills = list_skills()
    for skill in skills:
        if skill.name == name:
            return skill

    available = ', '.join(s.name for s in skills)
    raise ValueError(
        f"Skill '{name}' not found. Available skills: {available}"
    )

View File

@@ -0,0 +1,213 @@
"""
Base class for EdgarTools AI skills.
Provides the foundation for creating AI skills that integrate with
edgar.ai infrastructure. External packages can subclass BaseSkill to
create specialized skills (e.g., insider trading detection, fraud analysis).
"""
from abc import ABC, abstractmethod
from pathlib import Path
from typing import Dict, List, Optional, Callable
__all__ = ['BaseSkill']
class BaseSkill(ABC):
    """
    Abstract base class for EdgarTools AI skills.

    A skill packages:
    - Documentation (markdown files with YAML frontmatter)
    - Helper functions (workflow wrappers)
    - Examples and patterns

    External packages can subclass this to create specialized skills
    that integrate seamlessly with edgar.ai infrastructure.

    Example:
        >>> from edgar.ai.skills.base import BaseSkill
        >>> from pathlib import Path
        >>>
        >>> class InsiderTradingSkill(BaseSkill):
        ...     @property
        ...     def name(self) -> str:
        ...         return "Insider Trading Detection"
        ...
        ...     @property
        ...     def description(self) -> str:
        ...         return "Analyze Form 4 filings for insider trading patterns"
        ...
        ...     @property
        ...     def content_dir(self) -> Path:
        ...         return Path(__file__).parent / "content"
        ...
        ...     def get_helpers(self) -> Dict[str, Callable]:
        ...         return {
        ...             'detect_unusual_trades': self.detect_unusual_trades,
        ...         }
    """

    @property
    @abstractmethod
    def name(self) -> str:
        """
        Skill name for display and identification.

        Should be descriptive and unique. Example: "SEC Filing Analysis"

        Returns:
            Human-readable skill name
        """
        pass

    @property
    @abstractmethod
    def description(self) -> str:
        """
        Brief description of skill capabilities.

        Used by AI agents to determine when to activate the skill.
        Should clearly describe what problems the skill solves.

        Returns:
            One-sentence skill description
        """
        pass

    @property
    @abstractmethod
    def content_dir(self) -> Path:
        """
        Directory containing skill documentation (markdown files).

        This directory should contain:
        - skill.md: Main skill documentation with YAML frontmatter
        - objects.md: Object reference (optional)
        - workflows.md: Workflow patterns (optional)
        - readme.md: Installation/overview (optional)

        Returns:
            Path to skill content directory
        """
        pass

    @abstractmethod
    def get_helpers(self) -> Dict[str, Callable]:
        """
        Return dictionary of helper functions this skill provides.

        Helper functions are convenience wrappers that simplify
        common workflows for the skill's domain.

        Returns:
            Dict mapping function names to callable objects

        Example:
            >>> {
            ...     'get_revenue_trend': helpers.get_revenue_trend,
            ...     'compare_companies': helpers.compare_companies,
            ... }
        """
        pass

    # Non-abstract methods with default implementations

    def get_object_docs(self) -> List[Path]:
        """
        Return paths to centralized object documentation files to include in exports.

        Override this method to specify which centralized API reference docs
        should be included when exporting the skill. These docs are copied to
        an 'api-reference/' subdirectory in the exported skill package.

        Returns:
            List of Path objects pointing to markdown documentation files

        Example:
            >>> def get_object_docs(self) -> List[Path]:
            ...     from pathlib import Path
            ...     root = Path(__file__).parent.parent.parent
            ...     return [
            ...         root / "entity/docs/Company.md",
            ...         root / "xbrl/docs/XBRL.md",
            ...     ]
        """
        return []  # Default: no object docs

    def get_documents(self) -> List[str]:
        """
        List of markdown documents in this skill.

        Returns:
            Document names (without .md extension) in alphabetical order,
            or an empty list when the content directory does not exist
        """
        if not self.content_dir.exists():
            return []
        # glob() yields entries in filesystem order; sort so listings and
        # error messages are the same on every platform.
        return sorted(f.stem for f in self.content_dir.glob("*.md"))

    def get_document_content(self, name: str) -> str:
        """
        Get content of a specific markdown document.

        Args:
            name: Document name (with or without .md extension)

        Returns:
            Full markdown content as string

        Raises:
            FileNotFoundError: If document doesn't exist
        """
        doc_name = name if name.endswith('.md') else f"{name}.md"
        doc_path = self.content_dir / doc_name

        if not doc_path.exists():
            available = ", ".join(self.get_documents())
            raise FileNotFoundError(
                f"Document '{name}' not found in skill '{self.name}'. "
                f"Available: {available}"
            )

        # Decode explicitly as UTF-8: the platform default encoding depends on
        # the locale and can garble or reject non-ASCII markdown.
        return doc_path.read_text(encoding="utf-8")

    def export(self, format: str = "claude-desktop", output_dir: Optional[Path] = None, **kwargs) -> Path:
        """
        Export skill in specified format.

        Args:
            format: Export format (default: "claude-desktop")
                - "claude-desktop": Claude Desktop Skills format (ZIP)
                - "claude-skills": Official Claude Skills format (~/.claude/skills/)
            output_dir: Where to create export (default: ./skills_export/)
            **kwargs: Additional format-specific parameters
                - create_zip (bool): For claude-desktop format (default: True)
                - install (bool): For claude-skills format (default: True)

        Returns:
            Path to exported skill directory or archive

        Example:
            >>> skill = EdgarToolsSkill()
            >>> # Export as ZIP for Claude Desktop upload
            >>> path = skill.export(format="claude-desktop")
            >>> # Export to ~/.claude/skills/ for automatic discovery
            >>> path = skill.export(format="claude-skills")
        """
        # Imported lazily so the exporters are only loaded when exporting
        from edgar.ai.exporters import export_skill

        return export_skill(self, format=format, output_dir=output_dir, **kwargs)

    def __repr__(self) -> str:
        """String representation of the skill."""
        return f"{self.__class__.__name__}(name='{self.name}')"

    def __str__(self) -> str:
        """Human-readable skill description."""
        docs_count = len(self.get_documents())
        helpers_count = len(self.get_helpers())
        return (
            f"Skill: {self.name}\n"
            f"Description: {self.description}\n"
            f"Documents: {docs_count}\n"
            f"Helper Functions: {helpers_count}"
        )

View File

@@ -0,0 +1,119 @@
"""
EdgarTools Skill - Core EdgarTools AI skill.
Provides comprehensive documentation and helper functions for analyzing
SEC filings and financial statements using EdgarTools.
"""
from pathlib import Path
from typing import Dict, Callable
from edgar.ai.skills.base import BaseSkill
__all__ = ['EdgarToolsSkill', 'edgartools_skill']
class EdgarToolsSkill(BaseSkill):
    """
    EdgarTools - AI skill for SEC filing analysis.

    This skill provides:
    - Comprehensive API documentation for SEC filing analysis
    - Helper functions for common workflows
    - Object reference with token estimates
    - Workflow patterns for multi-step analysis

    The skill covers:
    - Getting filings (3 approaches: Published, Current, Company-specific)
    - Getting financials (2 approaches: Entity Facts, Filing XBRL)
    - Multi-company analysis
    - Object representations optimized for AI

    Example:
        >>> from edgar.ai.skills.core import edgartools_skill
        >>>
        >>> # List available documentation
        >>> print(edgartools_skill.get_documents())
        ['skill', 'objects', 'workflows', 'readme']
        >>>
        >>> # Get main skill documentation
        >>> guide = edgartools_skill.get_document_content("skill")
        >>>
        >>> # Access helper functions
        >>> helpers = edgartools_skill.get_helpers()
        >>> get_revenue_trend = helpers['get_revenue_trend']
        >>> income = get_revenue_trend("AAPL", periods=3)
        >>>
        >>> # Export skill for Claude Desktop
        >>> path = edgartools_skill.export(
        ...     format="claude-desktop",
        ...     output_dir="~/.config/claude/skills"
        ... )
    """

    @property
    def name(self) -> str:
        """Skill name: 'EdgarTools'"""
        return "EdgarTools"

    @property
    def description(self) -> str:
        """Skill description for AI agents."""
        summary = (
            "Query and analyze SEC filings and financial statements using EdgarTools. "
            "Get company data, filings, XBRL financials, and perform multi-company analysis."
        )
        return summary

    @property
    def content_dir(self) -> Path:
        """Directory holding this module, where the skill's markdown files live."""
        module_file = Path(__file__)
        return module_file.parent

    def get_object_docs(self) -> list[Path]:
        """
        Return centralized object documentation to include in skill exports.

        These are the detailed API reference docs that complement the
        skill's tutorial documentation.

        Returns:
            List of Path objects to centralized markdown documentation files
        """
        # Navigate from edgar/ai/skills/core/ to edgar/ root
        edgar_root = Path(__file__).parent.parent.parent.parent
        reference_docs = (
            "entity/docs/Company.md",
            "entity/docs/EntityFiling.md",
            "entity/docs/EntityFilings.md",
            "xbrl/docs/XBRL.md",
            "xbrl/docs/Statement.md",
        )
        return [edgar_root / relative for relative in reference_docs]

    def get_helpers(self) -> Dict[str, Callable]:
        """
        Return helper functions provided by this skill.

        Helper functions simplify common SEC analysis workflows:
        - get_filings_by_period: Get filings for a specific quarter
        - get_today_filings: Get recent filings (last ~24 hours)
        - get_revenue_trend: Get multi-period income statement
        - get_filing_statement: Get statement from specific filing
        - compare_companies_revenue: Compare revenue across companies

        Returns:
            Dict mapping function names to callable objects
        """
        # Import here to avoid circular dependencies
        from edgar.ai import helpers

        exported = (
            'get_filings_by_period',
            'get_today_filings',
            'get_revenue_trend',
            'get_filing_statement',
            'compare_companies_revenue',
        )
        return {helper_name: getattr(helpers, helper_name) for helper_name in exported}


# Create singleton instance for convenience
edgartools_skill = EdgarToolsSkill()

View File

@@ -0,0 +1,950 @@
import http.server
import os
import re
import signal
import socketserver
import tempfile
import time
import webbrowser
import zipfile
from functools import lru_cache
from pathlib import Path
from threading import Thread
from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union
if TYPE_CHECKING:
from edgar.company_reports import Report
from edgar.sgml.sgml_common import FilingSGML, SGMLDocument
import textwrap
from bs4 import BeautifulSoup
from pydantic import BaseModel
from rich import box
from rich.columns import Columns
from rich.console import Group
from rich.panel import Panel
from rich.table import Column, Table
from rich.text import Text
from edgar.core import binary_extensions, has_html_content, sec_dot_gov, text_extensions
from edgar.files.html_documents import get_clean_html
from edgar.files.markdown import to_markdown
from edgar.httpclient import async_http_client
from edgar.httprequests import download_file, download_file_async, get_with_retry
from edgar.richtools import print_rich, print_xml, repr_rich, rich_to_text
xbrl_document_types = ['XBRL INSTANCE DOCUMENT', 'XBRL INSTANCE FILE', 'EXTRACTED XBRL INSTANCE DOCUMENT']
__all__ = ['Attachment', 'Attachments', 'FilingHomepage', 'FilerInfo', 'AttachmentServer', 'sec_document_url', 'get_document_type']
def sec_document_url(attachment_url: str) -> str:
    """
    Build the full document URL for an attachment path.

    Any "ix?doc=/" or "ix.xhtml?doc=/" fragment is stripped from the path
    before it is appended to ``sec_dot_gov``.
    """
    viewer_fragment = r"ix(\.xhtml)?\?doc=/"
    direct_path = re.sub(viewer_fragment, "", attachment_url)
    return f"{sec_dot_gov}{direct_path}"
def sequence_sort_key(x):
    """
    Sort key that orders items by their ``sequence_number`` string.

    Numeric sequence numbers come first in numeric order, then non-numeric
    ones in string order, and blank ones last.
    """
    raw = x.sequence_number
    if not raw.strip():
        # Empty or whitespace-only: infinity pushes these to the end
        return (float('inf'), '')
    try:
        as_number = float(raw)
    except ValueError:
        # Not a number: grouped after the numeric entries, compared as text
        return (1, raw)
    return (0, as_number)
# Mapping of SEC filing file types to Unicode symbols.
# Keys are a mix of document type names (e.g. "10-K", "EX-101.SCH", "XML")
# and dotted file extensions (e.g. ".pdf"); values are single emoji.
FILE_TYPE_SYMBOLS: Dict[str, str] = {
    # Main SEC filing documents
    "10-K": "📄",  # Page for main filing
    "EX-21.1": "📎",  # Paperclip for exhibits
    "EX-23.1": "📎",
    "EX-31.1": "📎",
    "EX-31.2": "📎",
    "EX-32.1": "📎",
    "EX-97.1": "📎",
    # XBRL-related documents
    "EX-101.SCH": "🔰",  # Beginner symbol for schema
    "EX-101.CAL": "📊",  # Bar chart for calculations
    "EX-101.DEF": "📚",  # Books for definitions
    "EX-101.LAB": "📎",  # Paperclip for labels
    "EX-101.PRE": "📈",  # Rising chart for presentation
    # Common file types
    "XML": "🔷",  # Blue diamond for XML files
    "HTML": "🌍",  # Globe for HTML files
    "GRAPHIC": "🎨",  # Artist palette for images
    "EXCEL": "📊",  # Bar chart for Excel
    "JSON": "📝",  # Memo for JSON
    "ZIP": "📦",  # Package for ZIP
    "CSS": "📃",  # Curled page for CSS
    "JS": "📄",  # Page for JavaScript
    ".css": "📃",  # Curled page for CSS extension
    ".js": "📄",  # Page for JS extension
    "PDF": "📕",  # Closed book for PDF
    ".pdf": "📕",  # Closed book for PDF extension
    "INFORMATION TABLE": "📊"  # Bar chart for tables
}
def get_extension(filename: str) -> str:
    """Return everything from the last dot onward, or '' when there is no dot."""
    last_dot = filename.rfind('.')
    if last_dot == -1:
        return ''
    return filename[last_dot:]
def get_document_type(filename: str, declared_document_type:str) -> str:
    """
    Sometimes the SEC gets the document type wrong. This function uses the extension to determine the document type

    Only generic, format-style declared types (XML, HTML, PDF, ...) are
    second-guessed; specific types such as "10-K" or "EX-101.SCH" are
    returned unchanged.

    Args:
        filename: Name of the attachment file
        declared_document_type: Document type as declared in the filing

    Returns:
        The upper-cased file extension (with HTM/HTML normalized to "HTML")
        when the declared type is generic and the filename has an extension;
        otherwise the declared type as given.
    """
    # "XSLX" is kept for backward compatibility alongside the correct "XLSX"
    generic_types = ("XML", "HTML", "PDF", "HTM", "JS", "CSS", "ZIP", "XLS", "XLSX", "XSLX", "JSON")
    if declared_document_type.upper() not in generic_types:
        return declared_document_type

    # Text after the last dot, upper-cased (empty when there is no dot)
    _, dot, suffix = filename.rpartition('.')
    document_type = suffix.upper()
    if not dot or not document_type:
        # Nothing to infer from: keep the declared type rather than returning ""
        return declared_document_type
    if document_type in ("HTM", "HTML"):
        return "HTML"
    return document_type
def get_file_icon(file_type: str, sequence: str = None, filename: str = None) -> str:
    """
    Pick the Unicode symbol used to display a filing attachment.

    Precedence:
        1. sequence "1"            -> "📜" (main filing document)
        2. type starts "EX-101."   -> lookup in FILE_TYPE_SYMBOLS, default "📄"
        3. type starts "EX-"       -> "📋"
        4. extension of filename   -> lookup in FILE_TYPE_SYMBOLS (if filename given)
        5. file_type               -> lookup in FILE_TYPE_SYMBOLS, default "📄"

    Args:
        file_type: The type of the file from SEC filing
        sequence: The sequence number of the file in the filing
        filename: The name of the file, used for the extension lookup

    Returns:
        The symbol; a single-character symbol is followed by one space of padding.
    """
    if sequence == "1":
        symbol = "📜"
    elif file_type.startswith("EX-"):
        # XBRL exhibits have their own symbols; every other exhibit shares one
        if file_type.startswith("EX-101."):
            symbol = FILE_TYPE_SYMBOLS.get(file_type, "📄")
        else:
            symbol = "📋"
    else:
        by_extension = FILE_TYPE_SYMBOLS.get(get_extension(filename)) if filename else None
        symbol = by_extension or FILE_TYPE_SYMBOLS.get(file_type, "📄")
    # Pad single-character symbols with a trailing space
    return symbol if len(symbol) != 1 else f"{symbol} "
class FilerInfo(BaseModel):
    """Filer details shown on a filing homepage: name, CIK, identification text and address blocks."""
    company_name: str
    cik: str
    identification: str
    # Address blocks in page order; may hold fewer than two entries
    addresses: List[str]

    def __rich__(self):
        # Show at most the first two address blocks. Slicing (rather than indexing
        # [0] and [1]) avoids an IndexError when fewer than two are present.
        return Panel(
            Columns([self.identification, Text(" "), *self.addresses[:2]]),
            title=self.company_name
        )

    def __repr__(self):
        return repr_rich(self.__rich__())
class Attachment:
    """
    A class to represent an attachment in an SEC filing
    """

    def __init__(self,
                 sequence_number: str,
                 description: str,
                 document: str,
                 ixbrl: bool,
                 path: str,
                 document_type: str,
                 size: Optional[int],
                 sgml_document: Optional['SGMLDocument'] = None,
                 purpose: Optional[str] = None,
                 filing_sgml: Optional['FilingSGML'] = None):
        self.sequence_number = sequence_number
        self.description = description
        self.document = document
        self.ixbrl = ixbrl
        self.path = path
        self.document_type = document_type
        self.size = size
        self.sgml_document: Optional['SGMLDocument'] = sgml_document
        self.sgml = filing_sgml
        self.purpose = purpose
        # Allows tests to override content via property patching
        self._content_override = None

    @property
    def content(self):
        """
        The attachment content.

        Resolution order: a test override (if set), '' for synthetic "/test/" paths,
        the SGML document's content when one is attached, otherwise a download of ``self.url``.
        """
        # If tests have overridden content using the property's setter, honor it
        override = getattr(self, "_content_override", None)
        if override is not None:
            if isinstance(override, property) and override.fget is not None:
                return override.fget(self)
            try:
                return override(self)  # callable override
            except TypeError:
                return override  # direct value
        # Avoid real network calls for synthetic test paths
        if isinstance(self.path, str) and self.path.startswith("/test/"):
            return ""
        if self.sgml_document:
            return self.sgml_document.content
        else:
            return download_file(self.url)

    @content.setter
    def content(self, value):
        # Enable tests to patch instance property via unittest.mock.patch.object
        self._content_override = value

    @content.deleter
    def content(self):
        self._content_override = None

    @property
    def url(self):
        """The URL built from ``self.path`` by ``sec_document_url``."""
        return sec_document_url(self.path)

    @property
    def extension(self):
        """The actual extension of the filing document
        Usually one of .xml or .html or .pdf or .txt or .paper
        """
        return os.path.splitext(self.document)[1]

    @property
    def display_extension(self) -> str:
        """This is the extension displayed in the html e.g. "es220296680_4-davis.html"
        The actual extension would be "es220296680_4-davis.xml", that displays as html in the browser

        NOTE(review): this currently returns exactly the same value as ``extension``.
        """
        return os.path.splitext(self.document)[1]

    def validate_sequence_number(self, v):
        """Return ``v`` if it is all digits or empty, otherwise raise ValueError."""
        if not v.isdigit() and v != '':
            raise ValueError('sequence_number must be digits or an empty string')
        return v

    def is_text(self) -> bool:
        """Is this a text document (extension compared case-sensitively against ``text_extensions``)"""
        return self.extension in text_extensions

    def is_xml(self):
        """Is the (lower-cased) extension one of .xsd, .xml, .xbrl"""
        return self.extension.lower() in [".xsd", ".xml", ".xbrl"]

    def is_html(self):
        """Is the (lower-cased) extension .htm or .html"""
        return self.extension.lower() in [".htm", ".html"]

    def is_binary(self) -> bool:
        """Is this a binary document (extension compared case-sensitively against ``binary_extensions``)"""
        return self.extension in binary_extensions

    @property
    def empty(self):
        """Some older filings have no document url. So effectively this attachment is empty"""
        return self.document is None or self.document.strip() == ''

    def download(self, path: Optional[Union[str, Path]] = None) -> Optional[Union[str, bytes]]:
        """
        Download the file to a specified path.
        If the path is not provided, return the downloaded content as text or bytes.
        If the path is a directory, the file is saved with its original name in that directory.
        If the path is a file, the file is saved with the given path name.

        Returns:
            The content when ``path`` is None, otherwise the path of the written file as a string.
        """
        # Read the content exactly once: the property may perform a download on every access
        content = self.content
        if path is None:
            return content

        # Ensure path is a Path object
        path = Path(path)

        # An existing directory means "save under the original document name"
        file_path = path / self.document if path.is_dir() else path

        # Save the file
        if isinstance(content, bytes):
            file_path.write_bytes(content)
        else:
            # Explicit utf-8, matching Attachments.download, instead of the platform default
            file_path.write_text(content, encoding='utf-8')

        return str(file_path)

    def view(self):
        """
        Print the attachment to the console.

        A report attachment is rendered through its FilingSummary report when one is found.
        Otherwise text attachments are rendered as a parsed HTML document or as XML;
        anything else prints nothing.
        """
        # Check if this is a report
        if self.is_report() and self.sgml:
            report = self.sgml.filing_summary.reports.get_by_filename(self.document)
            if report:
                report.view()
                return
            # No summary entry for this file: fall through and show the raw
            # content, mirroring what text() does in the same situation.

        if not self.is_text():
            return
        content = self.content
        if self.is_html() or has_html_content(content):
            from edgar import Document
            document = Document.parse(content)
            print_rich(document)
        elif self.is_xml():
            print_xml(content)

    def is_report(self):
        """Truthy (a match object) when the document name starts like ``R<digits>.htm``, else None."""
        return re.match(r"R\d+\.htm", self.document)

    def text(self):
        """
        Return the attachment as plain text.

        Uses the FilingSummary report text for report attachments when available; otherwise
        HTML content is parsed and converted to text and other text content is returned as is.
        Returns None for non-text attachments.
        """
        # Check if this is a report
        if self.is_report() and self.sgml:
            report = self.sgml.filing_summary.reports.get_by_filename(self.document)
            if report:
                return report.text()

        if not self.is_text():
            return None
        content = self.content
        if self.is_html() or has_html_content(content):
            from edgar import Document
            document = Document.parse(content)
            return rich_to_text(document)
        return content

    def markdown(self, include_page_breaks: bool = False, start_page_number: int = 0) -> Optional[str]:
        """
        Convert the attachment to markdown format if it's HTML content.

        Args:
            include_page_breaks: If True, include page break delimiters in the markdown
            start_page_number: Starting page number for page break markers (default: 0)

        Returns:
            The markdown text, or None if the attachment is not HTML or cannot be converted.
        """
        if not self.is_html():
            return None

        content = self.content
        if not content:
            return None

        # Check if content has HTML structure
        if not has_html_content(content):
            return None

        # Use the same approach as Filing.markdown() but with page break support
        clean_html = get_clean_html(content)
        if clean_html:
            return to_markdown(clean_html, include_page_breaks=include_page_breaks, start_page_number=start_page_number)
        return None

    def __rich__(self):
        icon = get_file_icon(self.document_type, self.sequence_number, self.document)
        # The main document (sequence "1") gets a highlighted type label
        type_style = "bold deep_sky_blue1" if self.sequence_number == "1" else ""
        text = Text.assemble(
            (f"{self.sequence_number:<3} ", "dim italic"),
            " ",
            (self.document, "bold"),
            " ",
            (self.purpose or self.description, "grey54"),
            " ",
            (icon, ""),
            " ",
            (self.document_type, type_style),
        )
        return Panel(text, box=box.ROUNDED, width=200, expand=False)

    def __repr__(self):
        return repr_rich(self.__rich__())

    def __str__(self):
        return repr_rich(self.__rich__())
class Attachments:
    """
    A class to represent the attachments of an SEC filing
    """

    def __init__(self,
                 document_files: List[Attachment],
                 data_files: Optional[List[Attachment]],
                 primary_documents: List[Attachment],
                 sgml: Optional['FilingSGML'] = None):
        self.documents = document_files
        self.data_files = data_files
        # Documents first, then data files (if any)
        self._attachments = document_files + (data_files or [])
        self.primary_documents = primary_documents
        self.sgml = sgml
        # Cursor used only by direct __next__ calls
        self.n = 0

    def __getitem__(self, item: Union[int, str]):
        """
        Get the attachment by sequence number as set in the SEC filing SGML file,
        or by document name when ``item`` is a non-numeric string.

        Raises:
            KeyError: if no attachment matches
        """
        if isinstance(item, int) or item.isdigit():
            return self.get_by_sequence(item)
        elif isinstance(item, str):
            for doc in self._attachments:
                if doc.document == item:
                    return doc
        raise KeyError(f"Document not found: {item}")

    def get_by_sequence(self, sequence: Union[str, int]):
        """
        Get the attachment by sequence number starting at 1
        The sequence number is the exact sequence number in the filing

        Raises:
            KeyError: if no attachment has that sequence number
        """
        wanted = str(sequence)
        for doc in self._attachments:
            if doc.sequence_number == wanted:
                return doc
        raise KeyError(f"Document not found: {sequence}")

    def get_by_index(self, index: int):
        """
        Get the attachment by its position in the combined list (documents, then data files).
        This is plain list indexing, so positions start at 0.
        """
        return self._attachments[index]

    def get_report(self, filename: str) -> Optional['Report']:
        """
        Get a report by filename. Returns None when there is no SGML or no reports.
        """
        if self.sgml:
            reports = self.sgml.filing_summary.reports
            if reports:
                return reports.get_by_filename(filename)
        return None

    @property
    def primary_html_document(self) -> Optional[Attachment]:
        """Get the primary html document on the filing"""
        for doc in self.primary_documents:
            if doc.display_extension == ".html" or doc.display_extension == '.htm':
                return doc
        # Most filings have html primary documents. Some don't.
        # E.g. Form's 3,4,5 do when loaded directly from edgar but not when loaded from local files
        # However, there are unusual filings with endings like ".fil" that require a return.
        # So return the first one
        if len(self.primary_documents) > 0:
            return self.primary_documents[0]
        return None

    @property
    def primary_xml_document(self) -> Optional[Attachment]:
        """Get the primary xml document on the filing"""
        for doc in self.primary_documents:
            if doc.display_extension == ".xml":
                return doc
        return None

    @property
    def text_document(self):
        """The last document described as "Complete submission text file", or None."""
        for doc in reversed(self.documents):
            if doc.description == "Complete submission text file":
                return doc
        return None

    @property
    def exhibits(self):
        """
        Get all the exhibits in the filing.
        This is the primary document plus all the documents listed as EX-XX
        """
        primary = self.primary_html_document
        # A filing without a primary document must not put None into the collection
        primary_documents = [primary] if primary is not None else []
        exhibits_documents = self.query("re.match('EX-', document_type)", False).documents
        return Attachments(
            document_files=primary_documents + exhibits_documents,
            data_files=[],
            primary_documents=primary_documents,
            sgml=self.sgml)

    @property
    def graphics(self):
        """All attachments whose document type is GRAPHIC."""
        return self.query("document_type=='GRAPHIC'")

    def query(self, query_str: str, include_data_files: bool = True):
        """
        Query attachments based on a simple query string.
        Supports conditions on 'document', 'description', and 'document_type'.
        Example query: "document.endswith('.htm') and 'RELEASE' in description and document_type in ['EX-99.1', 'EX-99', 'EX-99.01']"

        The names ``document``, ``description`` and ``document_type`` are bound to the
        values of each attachment; ``re`` and ``attachment`` are available as well.

        Args:
            query_str: A Python expression evaluated once per attachment
            include_data_files: If False the result has no data files

        Returns:
            A new Attachments with the attachments for which the expression is truthy.
        """
        # SECURITY: the expression is executed with eval() and full builtins.
        # Only pass query strings from trusted code, never external input.
        # Stripped because compile() - unlike eval() on a string - rejects leading whitespace.
        compiled_query = compile(query_str.strip(), "<attachment-query>", "eval")

        def matches(attachment):
            # Bind the attributes as plain names instead of rewriting the query
            # text, so string literals such as 'document' are left intact.
            namespace = {
                "re": re,
                "attachment": attachment,
                "document": attachment.document,
                "description": attachment.description,
                "document_type": attachment.document_type,
            }
            return eval(compiled_query, namespace)

        # Evaluate the query for documents and data files
        new_documents = [attachment for attachment in self.documents if matches(attachment)]
        if include_data_files:
            new_data_files = ([attachment for attachment in self.data_files if matches(attachment)]
                              if self.data_files else None)
        else:
            new_data_files = []
        return Attachments(document_files=new_documents, data_files=new_data_files,
                           primary_documents=self.primary_documents, sgml=self.sgml)

    @staticmethod
    async def _download_all_attachments(attachments: List[Attachment]):
        """Download every attachment concurrently; results are in the same order as ``attachments``."""
        import asyncio

        async with async_http_client() as client:
            return await asyncio.gather(
                *[download_file_async(client, attachment.url, as_text=attachment.is_text())
                  for attachment in attachments])

    def download(self, path: Union[str, Path], archive: bool = False):
        """
        Download all the attachments to a specified path.
        If archive is False, path must be an existing directory; each file is saved there under its original name.
        If archive is True, path must be a zip file name (not a directory); the attachments are saved in that zip file.
        When the filing SGML is available the download is delegated to it.

        path: str or Path - The path to save the attachments
        archive: bool (default False) - If True, save the attachments in a zip file

        Raises:
            ValueError: if path does not fit the archive mode as described above
        """
        if self.sgml:
            self.sgml.download(path, archive)
            return
        import asyncio

        loop = asyncio.get_event_loop()
        downloaded_files = loop.run_until_complete(Attachments._download_all_attachments(self._attachments))

        # Ensure path is a Path object
        path = Path(path)

        if archive:
            if path.is_dir():
                raise ValueError("Path must be a zip file name to create zipfile")
            with zipfile.ZipFile(path, 'w') as zipf:
                for attachment, downloaded in zip(self._attachments, downloaded_files, strict=False):
                    if isinstance(downloaded, bytes):
                        zipf.writestr(attachment.document, downloaded)
                    else:
                        zipf.writestr(attachment.document, downloaded.encode('utf-8'))
        else:
            if not path.is_dir():
                raise ValueError("Path must be a directory")
            for attachment, downloaded in zip(self._attachments, downloaded_files, strict=False):
                file_path = path / attachment.document
                if isinstance(downloaded, bytes):
                    file_path.write_bytes(downloaded)
                else:
                    file_path.write_text(downloaded, encoding='utf-8')

    def serve(self, port: int = 8000) -> Tuple[Thread, socketserver.TCPServer, str]:
        """
        Serve the attachment on a local server
        The server can be stopped using CTRL-C
        port: int (default 8000) - The port to serve the attachment

        This call blocks until the server thread ends. The files are served from a
        temporary directory that is removed when the call returns.
        """
        with tempfile.TemporaryDirectory() as temp_dir:
            temp_path = Path(temp_dir)
            self.download(temp_path)

            class Handler(http.server.SimpleHTTPRequestHandler):
                def __init__(self, *args, **kwargs):
                    super().__init__(*args, directory=temp_dir, **kwargs)

            primary_html = os.path.basename(self.primary_html_document.path)
            url = f'http://localhost:{port}/{primary_html}'
            httpd = socketserver.TCPServer(("", port), Handler)

            def serve_forever():
                with httpd:
                    httpd.serve_forever()

            thread = Thread(target=serve_forever)
            thread.daemon = True
            thread.start()
            # Wait for the server to start
            time.sleep(1)

            def signal_handler(sig, frame):
                httpd.shutdown()
                thread.join()

            signal.signal(signal.SIGINT, signal_handler)
            webbrowser.open(url)

            # Keep the main thread alive to handle signals
            while thread.is_alive():
                time.sleep(0.1)
        return thread, httpd, url

    def markdown(self, include_page_breaks: bool = False, start_page_number: int = 0) -> Dict[str, str]:
        """
        Convert all HTML attachments to markdown format.

        Args:
            include_page_breaks: If True, include page break delimiters in the markdown
            start_page_number: Starting page number for page break markers (default: 0)

        Returns:
            A dictionary mapping attachment document names to their markdown content.
            Only includes attachments that can be successfully converted to markdown.
        """
        markdown_attachments = {}
        for attachment in self._attachments:
            if attachment.is_html():
                md_content = attachment.markdown(include_page_breaks=include_page_breaks,
                                                 start_page_number=start_page_number)
                if md_content:
                    markdown_attachments[attachment.document] = md_content
        return markdown_attachments

    def __len__(self):
        return len(self._attachments)

    def __iter__(self):
        # Hand out an independent iterator so nested or repeated loops over the
        # same collection do not share (and corrupt) a single cursor.
        self.n = 0
        return iter(self._attachments)

    def __next__(self):
        # Kept for callers that invoke next() on the collection directly
        if self.n < len(self):
            _attachment = self._attachments[self.n]
            assert _attachment is not None
            self.n += 1
            return _attachment
        else:
            raise StopIteration

    def __rich__(self):
        # Document files
        document_table = Table(Column('Seq', header_style="dim"),
                               Column('Document', header_style="dim"),
                               Column('Description', header_style="dim", min_width=60),
                               Column('Type', header_style="dim", min_width=16),
                               title='Attachments',
                               row_styles=["", "bold"],
                               box=box.SIMPLE_HEAD)
        all_attachments = sorted(self.documents + (self.data_files or []), key=sequence_sort_key)
        highlight = "bold deep_sky_blue1"
        for attachment in all_attachments:
            # Get the file icon for each attachment
            icon = get_file_icon(file_type=attachment.document_type,
                                 sequence=attachment.sequence_number,
                                 filename=attachment.document)
            # The main document (sequence "1") is highlighted in every column
            is_main = attachment.sequence_number == "1"
            sequence_number = attachment.sequence_number
            description = "\n".join(textwrap.wrap(attachment.purpose or attachment.description, 100))
            document_table.add_row(
                Text(sequence_number, style=highlight) if is_main else sequence_number,
                Text(attachment.document, style=highlight) if is_main else attachment.document,
                Text(description, style=highlight) if is_main else description,
                Text.assemble((icon, ""), " ", (attachment.document_type, highlight if is_main else "")),
            )
        return document_table

    def __repr__(self):
        return repr_rich(self.__rich__())

    @classmethod
    def load(cls, soup: BeautifulSoup):
        """
        Load the attachments from the SEC filing home page

        The first ``tableFile`` table holds the documents, the second (if present) the data files.
        """
        tables = soup.find_all('table', class_='tableFile')

        def parse_table(table, documents: bool):
            min_seq = None
            # The list of attachments which are primary. This is the first document in the filing
            # Plus additional document with the same sequence number
            primary_documents: List[Attachment] = []
            rows = table.find_all('tr')[1:]  # Skip header row
            attachments = []
            for row in rows:
                cols = row.find_all('td')
                sequence_number = cols[0].text.strip().replace('\xa0', '-')
                description = cols[1].text.strip()
                # The document text is the text of the document link.
                document_text = cols[2].text.strip()
                document = document_text.split(' ')[0].strip()
                iXbrl = 'iXBRL' in document_text
                path = cols[2].a['href'].strip()
                document_type = cols[3].text.strip()
                size = cols[4].text.strip()
                try:
                    size = int(size)
                except ValueError:
                    size = None
                attachment = Attachment(
                    sequence_number=sequence_number,
                    description=description,
                    document=document,
                    ixbrl=iXbrl,
                    path=path,
                    document_type=document_type,
                    size=size
                )
                # Add the attachment to the list
                attachments.append(attachment)
                # If this is the first document, set it as the primary document
                if documents:
                    if min_seq is None:
                        min_seq = sequence_number
                    if sequence_number == min_seq:
                        primary_documents.append(attachment)
            return attachments, primary_documents

        if tables:
            document_files, primary_documents = parse_table(tables[0], documents=True)
        else:
            document_files, primary_documents = [], []
        if len(tables) > 1:
            data_files, _ = parse_table(tables[1], documents=False)
        else:
            data_files = None
        return cls(document_files, data_files, primary_documents)
class AttachmentServer:
    """
    Serve the attachments of a filing from a temporary directory over a local HTTP server.

    The attachments are downloaded and the server socket is bound on construction;
    ``start()`` runs the server and opens the primary html document in the browser.
    """

    def __init__(self, attachments: Attachments, port: int = 8000):
        self.attachments = attachments
        self.port = port
        self.thread = None
        self.httpd = None
        self.url = None
        # Owned temporary directory holding the downloaded files; removed in stop()
        self._temp_dir = None
        self.setup()

    def setup(self):
        """Download the attachments, create the server and its (not yet started) thread, and install the SIGINT handler."""
        self._temp_dir = tempfile.TemporaryDirectory()
        serve_root = self._temp_dir.name
        self.attachments.download(Path(serve_root))

        class Handler(http.server.SimpleHTTPRequestHandler):
            def __init__(self, *args, **kwargs):
                super().__init__(*args, directory=serve_root, **kwargs)

        primary_html = os.path.basename(self.attachments.primary_html_document.path)
        self.url = f'http://localhost:{self.port}/{primary_html}'
        self.httpd = socketserver.TCPServer(("", self.port), Handler)

        def serve_forever():
            # Leaving the with-block closes the server socket
            with self.httpd:
                self.httpd.serve_forever()

        self.thread = Thread(target=serve_forever)
        self.thread.daemon = True
        signal.signal(signal.SIGINT, self.signal_handler)

    def start(self):
        """Start serving, open the browser, and block until the server thread ends."""
        self.thread.start()
        webbrowser.open(self.url)
        # Keep the main thread alive to handle signals
        while self.thread.is_alive():
            time.sleep(0.1)

    def stop(self):
        """Shut the server down, wait for its thread, and remove the temporary directory."""
        self.httpd.shutdown()
        self.thread.join()
        # Release the downloaded files instead of leaving them for interpreter exit
        temp_dir = getattr(self, "_temp_dir", None)
        if temp_dir is not None:
            temp_dir.cleanup()
            self._temp_dir = None

    def signal_handler(self, sig, frame):
        import sys

        self.stop()
        # sys.exit rather than the interactive-only ``exit`` helper injected by ``site``
        sys.exit(0)  # Ensure the program exits
class FilingHomepage:
    """The SEC filing homepage: its url, parsed html and the attachments listed on it."""

    def __init__(self,
                 url: str,
                 soup: BeautifulSoup,
                 attachments: Attachments):
        self.attachments = attachments
        self.url = url
        self._soup = soup

    def open(self):
        """Open the homepage url in the web browser."""
        webbrowser.open(self.url)

    @property
    def documents(self):
        return self.attachments.documents

    @property
    def datafiles(self):
        return self.attachments.data_files

    @property
    def primary_html_document(self) -> Optional[Attachment]:
        """Get the primary html document on the filing"""
        return self.attachments.primary_html_document

    @property
    def primary_xml_document(self) -> Optional[Attachment]:
        """Get the primary xml document on the filing"""
        return self.attachments.primary_xml_document

    @property
    def primary_documents(self):
        return self.attachments.primary_documents

    @property
    def text_document(self):
        return self.attachments.text_document

    @property
    def xbrl_document(self):
        """Find and return the xbrl document, or None if there is none."""
        if self.datafiles is None:
            return None
        for datafile in reversed(self.datafiles):
            if datafile.description in xbrl_document_types:
                return datafile
        return None

    def get_filers(self):
        """
        Parse the filer blocks (``div#filerDiv``) of the page into FilerInfo objects.

        The result is computed once per homepage and then reused.
        """
        # Per-instance memo: a method-level lru_cache would be shared by all
        # instances and keep them (and their soup) alive.
        try:
            return self._filers
        except AttributeError:
            self._filers = self._parse_filers()
            return self._filers

    def _parse_filers(self):
        """Build the FilerInfo list from the page html."""
        filer_infos = []
        for filer_div in self._soup.find_all("div", id="filerDiv"):
            # Get the company name
            company_info_div = filer_div.find("div", class_="companyInfo")
            company_name_span = company_info_div.find("span", class_="companyName")
            if company_name_span:
                full_text = company_name_span.text.strip()
                # Split the text into company name and CIK
                parts = full_text.split('CIK: ')
                company_name = parts[0].strip()
                cik = parts[1].split()[0] if len(parts) > 1 else ""
                # Clean up the company name
                company_name = re.sub("\n", "", company_name).replace("(Filer)", "").strip()
            else:
                company_name = ""
                cik = ""

            # Get the identification information
            ident_info_div = company_info_div.find("p", class_="identInfo")
            # Replace <br> with newlines
            for br in ident_info_div.find_all("br"):
                br.replace_with("\n")
            identification = ident_info_div.text

            # Get the mailing information
            mailer_divs = filer_div.find_all("div", class_="mailer")
            # For each mailer_div.text remove multiple spaces after a newline
            addresses = [re.sub(r'\n\s+', '\n', mailer_div.text.strip())
                         for mailer_div in mailer_divs]

            # Create the filer info
            filer_infos.append(FilerInfo(company_name=company_name, cik=cik,
                                         identification=identification, addresses=addresses))
        return filer_infos

    @property
    def period_of_report(self) -> Optional[str]:
        """Get the period of report, or None when the page has no date information."""
        filing_dates = self.get_filing_dates()
        if filing_dates is None:
            # get_filing_dates() returns None when there is no formGrouping div
            return None
        _, _, period = filing_dates
        return period

    def get_filing_dates(self) -> Optional[Tuple[str, str, Optional[str]]]:
        """
        Return ``(filing_date, accepted_date, period)`` as shown on the page.

        ``period`` is None when the page has no second form grouping; the whole result
        is None when the page has no form grouping at all. Computed once per homepage.
        """
        try:
            return self._filing_dates
        except AttributeError:
            self._filing_dates = self._parse_filing_dates()
            return self._filing_dates

    def _parse_filing_dates(self) -> Optional[Tuple[str, str, Optional[str]]]:
        """Read the dates from the ``formGrouping`` divs of the page html."""
        # Find the form grouping divs
        grouping_divs = self._soup.find_all("div", class_="formGrouping")
        if len(grouping_divs) == 0:
            return None
        date_grouping_div = grouping_divs[0]
        info_divs = date_grouping_div.find_all("div", class_="info")
        filing_date = info_divs[0].text.strip()
        accepted_date = info_divs[1].text.strip()

        if len(grouping_divs) > 1:
            period_grouping_div = grouping_divs[1]
            first_info_div = period_grouping_div.find("div", class_="info")
            if first_info_div:
                period = first_info_div.text.strip()
                return filing_date, accepted_date, period
        return filing_date, accepted_date, None

    @classmethod
    def load(cls, url: str):
        """Fetch the page at ``url``, parse it and load its attachments."""
        response = get_with_retry(url)
        soup = BeautifulSoup(response.text, 'html.parser')
        attachments = Attachments.load(soup)
        return cls(url, soup, attachments)

    def __repr__(self):
        return repr_rich(self.__rich__())

    def __rich__(self):
        return Panel(
            Group(
                self.attachments,
                Group(
                    *[filer_info.__rich__() for filer_info in self.get_filers()]
                )
            ))

View File

@@ -0,0 +1,930 @@
import re
from datetime import datetime
from functools import cached_property, lru_cache, partial
from typing import Dict, List, Optional
from rich import box, print
from rich.console import Group, Text
from rich.padding import Padding
from rich.panel import Panel
from rich.table import Table
from rich.tree import Tree
from edgar._filings import Attachment, Attachments
from edgar._markdown import MarkdownContent
from edgar.files.html import Document
from edgar.files.html_documents import HtmlDocument
from edgar.files.htmltools import ChunkedDocument, adjust_for_empty_items, chunks2df, detect_decimal_items
from edgar.financials import Financials
from edgar.formatting import datefmt
from edgar.richtools import repr_rich, rich_to_text
# Public names of this module: the ones exported by a star-import.
__all__ = [
    'TenK',
    'TenQ',
    'TwentyF',
    'CurrentReport',
    'SixK',
    'EightK',
    'PressRelease',
    'PressReleases',
    'is_valid_item_for_filing'
]
class CompanyReport:
    """
    Base class for report objects built on top of a filing.

    Exposes filing metadata, the extracted financials and item/part access
    into the chunked filing document.
    """

    def __init__(self, filing):
        self._filing = filing

    @property
    def filing_date(self):
        return self._filing.filing_date

    @property
    def form(self):
        return self._filing.form

    @property
    def company(self):
        return self._filing.company

    @property
    def income_statement(self):
        """The income statement from the financials, or None if there are no financials."""
        return self.financials.income_statement() if self.financials else None

    @property
    def balance_sheet(self):
        """The balance sheet from the financials, or None if there are no financials."""
        return self.financials.balance_sheet() if self.financials else None

    @property
    def cash_flow_statement(self):
        """The cash flow statement from the financials, or None if there are no financials."""
        return self.financials.cashflow_statement() if self.financials else None

    @cached_property
    def financials(self):
        """Get the financials for this filing"""
        return Financials.extract(self._filing)

    @property
    def period_of_report(self):
        return self._filing.header.period_of_report

    @cached_property
    def chunked_document(self):
        """The filing html wrapped in a ChunkedDocument (built once per report)."""
        return ChunkedDocument(self._filing.html())

    @property
    def doc(self):
        """Alias for ``chunked_document``."""
        return self.chunked_document

    @property
    def items(self) -> List[str]:
        """The items listed by the chunked document."""
        return self.chunked_document.list_items()

    def __getitem__(self, item_or_part: str):
        # Show the item or part from the filing document. e.g. Item 1 Business from 10-K or Part I from 10-Q
        item_text = self.chunked_document[item_or_part]
        return item_text

    def view(self, item_or_part: str):
        """Get the Item or Part from the filing document. e.g. Item 1 Business from 10-K or Part I from 10-Q"""
        item_text = self[item_or_part]
        if item_text:
            print(item_text)

    def __rich__(self):
        return Panel(
            Group(
                self._filing.__rich__(),
                # ``financials`` is a cached property (a value, possibly None),
                # so it must be used directly, not called.
                self.financials or Text("No financial data available")
            )
        )

    def __repr__(self):
        return repr_rich(self.__rich__())
class FilingStructure:
    """
    Lookup helper over a nested ``{PART: {ITEM: info}}`` dictionary.

    Part and item names are upper-cased before lookup, so the dictionary keys
    are expected in upper case.
    """

    def __init__(self, structure: Dict):
        self.structure = structure

    def get_part(self, part: str):
        """Return the items dictionary of ``part``, or None if the part is unknown."""
        part_key = part.upper()
        return self.structure.get(part_key)

    def get_item(self, item: str, part: str = None):
        """
        Return the info for ``item``, or None when it cannot be found.

        With ``part`` the lookup is limited to that part; without it the first
        part containing the item wins.
        """
        item_key = item.upper()
        if not part:
            # Search every part in definition order
            return next(
                (part_items[item_key] for part_items in self.structure.values() if item_key in part_items),
                None,
            )
        part_items = self.get_part(part)
        return part_items.get(item_key) if part_items else None

    def is_valid_item(self, item: str, part: str = None):
        """True when ``get_item`` finds the item."""
        found = self.get_item(item, part)
        return found is not None
class ItemOnlyFilingStructure(FilingStructure):
    """A FilingStructure whose dictionary maps items directly, with no part level."""

    def get_part(self, part: str):
        """There are no parts in this structure, so this is always None."""
        return None

    def get_item(self, item: str, part: str = None):
        """Look the upper-cased ``item`` up at the top level; ``part`` is ignored."""
        item_key = item.upper()
        return self.structure.get(item_key)
class TenK(CompanyReport):
structure = FilingStructure({
"PART I": {
# special case for 10-K
# Items 1 and 2. Business and Properties
"ITEM 1": {
"Title": "Business",
"Description": "Overview of the company's business operations, products, services, and market environment."
},
"ITEM 1A": {
"Title": "Risk Factors",
"Description": "Discussion of risks and uncertainties that could materially affect the company's " +
"financial condition or results of operations."
},
"ITEM 1B": {
"Title": "Unresolved Staff Comments",
"Description": "Any comments from the SEC staff on the company's previous filings" +
"that remain unresolved."
},
"ITEM 1C": {
"Title": "Cybersecurity",
"Description": "Cybersecurity risk management, strategy, and governance disclosures."
},
"ITEM 2": {
"Title": "Properties",
"Description": "Information about the physical properties owned or leased by the company."
},
"ITEM 3": {
"Title": "Legal Proceedings",
"Description": "Details of significant ongoing legal proceedings."
},
"ITEM 4": {
"Title": "Mine Safety Disclosures",
"Description": "Relevant for mining companies, disclosures about mine safety and regulatory compliance."
}
},
"PART II": {
"ITEM 5": {
"Title": "Market for Registrants Common Equity",
"Description": "Information on the companys equity, including stock performance " +
"and shareholder matters."
},
"ITEM 6": {
"Title": "Selected Financial Data",
"Description": "Financial data summary for the last five fiscal years."
},
"ITEM 7": {
"Title": "Managements Discussion and Analysis (MD&A)",
"Description": "Managements perspective on the financial condition, changes in financial condition, " +
"and results of operations."
},
"ITEM 7A": {
"Title": "Quantitative and Qualitative Disclosures About Market Risk",
"Description": "Information on the company's exposure to market risk, such as interest rate risk, " +
"foreign currency exchange risk, commodity price risk, etc."
},
"ITEM 8": {
"Title": "Financial Statements",
"Description": "Complete audited financial statements, including balance sheet, income statement, " +
"cash flow statement, and notes to the financial statements."
},
"ITEM 9": {
"Title": "Controls and Procedures",
"Description": "Evaluation of the effectiveness of the design and operation of the companys disclosure controls and procedures."
},
"ITEM 9A": {
"Title": "Controls and Procedures",
"Description": "Evaluation of internal controls over financial reporting."
},
"ITEM 9B": {
"Title": "Other Information",
"Description": "Any other relevant information not covered in other sections."
},
"ITEM 9C": {
"Title": "Disclosure Regarding Foreign Jurisdictions That Prevent Inspections",
"Description": "Disclosure Regarding Foreign Jurisdictions That Prevent Inspections."
}
},
"PART III": {
"ITEM 10": {
"Title": "Directors, Executive Officers, and Corporate Governance",
"Description": "Information about the company's directors, executive officers, and governance policies."
},
"ITEM 11": {
"Title": "Executive Compensation",
"Description": "Details of compensation paid to key executives."
},
"ITEM 12": {
"Title": "Security Ownership of Certain Beneficial Owners and Management",
"Description": "Information about stock ownership of major shareholders, directors, and management."
},
"ITEM 13": {
"Title": "Certain Relationships and Related Transactions, and Director Independence",
"Description": "Information on transactions between the company and its directors, officers, " +
"and significant shareholders."
},
"ITEM 14": {
"Title": "Principal Accounting Fees and Services",
"Description": "Fees paid to the principal accountant and services rendered."
}
},
"PART IV": {
"ITEM 15": {
"Title": "Exhibits, Financial Statement Schedules",
"Description": "Legal documents and financial schedules that support the financial statements " +
"and disclosures."
},
"ITEM 16": {
"Title": "Form 10-K Summary",
"Description": "Form 10-K Summary"
}
}
})
def __init__(self, filing):
assert filing.form in ['10-K', '10-K/A'], f"This form should be a 10-K but was {filing.form}"
super().__init__(filing)
@property
def business(self):
return self['Item 1']
@property
def risk_factors(self):
return self['Item 1A']
@property
def management_discussion(self):
return self['Item 7']
@property
def directors_officers_and_governance(self):
return self['Item 10']
@cached_property
def chunked_document(self):
return ChunkedDocument(self._filing.html(), prefix_src=self._filing.base_dir)
@lru_cache(maxsize=1)
def id_parse_document(self, markdown:bool=False):
from edgar.files.html_documents_id_parser import ParsedHtml10K
return ParsedHtml10K().extract_html(self._filing.html(), self.structure, markdown=markdown)
def __str__(self):
return f"""TenK('{self.company}')"""
def __getitem__(self, item_or_part: str):
# Show the item or part from the filing document. e.g. Item 1 Business from 10-K or Part I from 10-Q
item_text = self.chunked_document[item_or_part]
if item_text:
item_text = item_text.rstrip()
last_line = item_text.split("\n")[-1]
if re.match(r'^\b(PART\s+[IVXLC]+)\b', last_line):
item_text = item_text.rstrip(last_line)
return item_text
def get_item_with_part(self, part: str, item: str, markdown:bool=True):
if not part:
return self.id_parse_document(markdown).get(item.lower())
# Show the item or part from the filing document. e.g. Item 1 Business from 10-K or Part I from 10-Q
item_text = self.chunked_document.get_item_with_part(part, item, markdown=markdown)
# remove first line or last line (redundant part information)
if not item_text or not item_text.strip():
return self.id_parse_document(markdown).get(part.lower(), {}).get(item.lower())
return item_text
def get_structure(self):
    """
    Build a rich ``Tree`` of the form's parts and items.

    Items found in this filing are shown in bold/green using the filing's own spelling;
    items that are part of the form structure but absent from the filing are shown dimmed.
    """
    tree = Tree("📄 ")
    # Upper-cased item name -> item name as it appears in the filing
    items_in_filing = {name.upper(): name for name in self.items}
    for part_name, part_items in self.structure.structure.items():
        branch = tree.add(f"[bold blue]{part_name}[/]")
        for structure_key, details in part_items.items():
            if structure_key not in items_in_filing:
                # Not present in the filing: dimmed, using the structure's own key
                label = Text.assemble(
                    (f"{structure_key}: ", "dim"),
                    (f"{details['Title']}", "dim"),
                )
            else:
                filing_name = items_in_filing[structure_key]
                label = Text.assemble(
                    (f"{filing_name:<7} ", "bold green"),
                    (f"{details['Title']}", "bold"),
                )
            branch.add(label)
    return tree
def __rich__(self):
    """Render the report as a rounded panel: period line, structure tree, then financials."""
    heading = Text.assemble(
        (f"{self.company}", "bold deep_sky_blue1"),
        (" ", ""),
        (f"{self.form}", "bold"),
    )
    period_line = Text.assemble(
        ("Period ending ", "grey70"),
        (f"{datefmt(self.period_of_report, '%B %d, %Y')}", "bold"),
        (" filed on ", "grey70"),
        (f"{datefmt(self.filing_date, '%B %d, %Y')}", "bold"),
    )
    structure_tree = self.get_structure()
    # Placeholder text is shown when the report has no financials
    financials = self.financials or Text("No financial data available", style="italic")
    body = Group(
        period_line,
        Padding(" ", (1, 0, 0, 0)),
        structure_tree,
        Padding(" ", (1, 0, 0, 0)),
        financials,
    )
    return Panel(body, title=heading, box=box.ROUNDED)
class TenQ(CompanyReport):
    """
    Report wrapper for Form 10-Q and 10-Q/A filings.

    ``structure`` lists the parts and items of the form; item text is served from the
    chunked filing document, with an id-based HTML parse as a fallback.
    """

    structure = FilingStructure({
        "PART I": {  # Financial Information
            "ITEM 1": {
                "Title": "Financial Statements",
                "Description": "Unaudited financial statements including balance sheets, income statements, " +
                               "and cash flow statements."
            },
            "ITEM 2": {
                "Title": "Managements Discussion and Analysis of Financial Condition and Results of Operations (MD&A)",
                "Description": "Managements perspective on the financial condition and results of operations."
            },
            "ITEM 3": {
                "Title": "Quantitative and Qualitative Disclosures About Market Risk",
                "Description": "Information on the company's exposure to market risk."
            },
            "ITEM 4": {
                "Title": "Controls and Procedures",
                "Description": "Evaluation of the effectiveness of disclosure controls and procedures."
            }
        },
        "PART II": {  # Other Information
            "ITEM 1": {
                "Title": "Legal Proceedings",
                "Description": "Brief description of any significant pending legal proceedings."
            },
            "ITEM 1A": {
                "Title": "Risk Factors",
                "Description": "An update on risk factors that may affect future results."
            },
            "ITEM 2": {
                "Title": "Unregistered Sales of Equity Securities and Use of Proceeds",
                "Description": "Details of unregistered sales of equity securities."
            },
            "ITEM 3": {
                "Title": "Defaults Upon Senior Securities",
                "Description": "Information regarding any defaults on senior securities."
            },
            "ITEM 4": {
                "Title": "Mine Safety Disclosures",
                "Description": "Required for companies with mining operations."
            },
            "ITEM 5": {
                "Title": "Other Information",
                "Description": "Any other information that should be disclosed to investors."
            },
            "ITEM 6": {
                "Title": "Exhibits",
                "Description": "List of exhibits required by Item 601 of Regulation S-K."
            }
        }
    })

    def __init__(self, filing):
        """Wrap ``filing`` as a 10-Q report; any form other than 10-Q or 10-Q/A is rejected."""
        assert filing.form in ['10-Q', '10-Q/A'], f"This form should be a 10-Q but was {filing.form}"
        super().__init__(filing)

    def __str__(self):
        return f"""TenQ('{self.company}')"""

    def __getitem__(self, item_or_part: str):
        """Return the item or part from the filing document, e.g. Part I from 10-Q."""
        return self.chunked_document[item_or_part]

    def get_item_with_part(self, part: str, item: str, markdown:bool=True):
        """
        Return the text of ``item`` within ``part``.

        When a part is given the chunked document is asked first; if that yields nothing
        or only whitespace, the id-parsed document is consulted under the lower-cased part
        and item. A missing part (``None`` or ``""``) is looked up under the empty part key
        instead of raising ``AttributeError`` on ``None``.
        """
        if part:
            item_text = self.chunked_document.get_item_with_part(part, item, markdown=markdown)
            if item_text and item_text.strip():
                return item_text
        # `part or ""` keeps the original lookup key for "" and makes None safe
        part_key = (part or "").lower()
        return self.id_parse_document(markdown).get(part_key, {}).get(item.lower())

    def id_parse_document(self, markdown:bool=True):
        """
        Parse the filing HTML with the id-based 10-Q parser and return its result.

        The result is cached on this instance, one entry per ``markdown`` setting. A
        class-level ``lru_cache(maxsize=1)`` was used previously; its single slot was shared
        by all report objects and kept the most recent one alive.
        """
        cache = getattr(self, '_id_parse_cache', None)
        if cache is None:
            cache = self._id_parse_cache = {}
        cache_key = bool(markdown)
        if cache_key not in cache:
            # Imported lazily, as in the original implementation
            from edgar.files.html_documents_id_parser import ParsedHtml10Q
            cache[cache_key] = ParsedHtml10Q().extract_html(self._filing.html(), self.structure, markdown=markdown)
        return cache[cache_key]

    @cached_property
    def chunked_document(self):
        """Chunked view of the filing's HTML, cached by ``cached_property``."""
        return ChunkedDocument(self._filing.html(), prefix_src=self._filing.base_dir)

    def get_structure(self):
        """
        Build a rich ``Tree`` of the form's parts and items.

        Items found in this filing are highlighted using the filing's own spelling; items
        that are absent from the filing are shown dimmed.
        """
        # Create the main tree
        tree = Tree("📄 ")
        # Create a mapping of uppercase to actual case items
        case_mapping = {item.upper(): item for item in self.items}
        # Process each part in the structure
        for part, items in self.structure.structure.items():
            part_tree = tree.add(f"[bold blue]{part}[/]")
            for item_key, item_data in items.items():
                if item_key in case_mapping:
                    # Use the actual case from the filing
                    actual_item = case_mapping[item_key]
                    item_text = Text.assemble(
                        (f"{actual_item:<7} ", "bold green"),
                        (f"{item_data['Title']}", "bold"),
                    )
                else:
                    # Item doesn't exist - show in grey with original structure case
                    item_text = Text.assemble(
                        (f"{item_key}: ", "dim"),
                        (f"{item_data['Title']}", "dim"),
                    )
                part_tree.add(item_text)
        return tree

    def __rich__(self):
        """Render the report as a rounded panel: period line, structure tree, then financials."""
        title = Text.assemble(
            (f"{self.company}", "bold deep_sky_blue1"),
            (" ", ""),
            (f"{self.form}", "bold"),
        )
        periods = Text.assemble(
            ("Period ending ", "grey70"),
            (f"{datefmt(self.period_of_report, '%B %d, %Y')}", "bold"),
            (" filed on ", "grey70"),
            (f"{datefmt(self.filing_date, '%B %d, %Y')}", "bold"),
        )
        panel = Panel(
            Group(
                periods,
                Padding(" ", (1, 0, 0, 0)),
                self.get_structure(),
                Padding(" ", (1, 0, 0, 0)),
                self.financials or Text("No financial data available", style="italic")
            ),
            title=title,
            box=box.ROUNDED,
        )
        return panel
class TwentyF(CompanyReport):
    """Report wrapper for Form 20-F and 20-F/A filings; ``structure`` lists the form's parts and items."""

    structure = FilingStructure({
        "PART I": {
            "ITEM 1": {
                "Title": "Identity of Directors, Senior Management, and Advisers",
                "Description": "Information about the company's directors, senior management, and advisers.",
            },
            "ITEM 2": {
                "Title": "Offer Statistics and Expected Timetable",
                "Description": "Details on recent and expected offers of securities.",
            },
            "ITEM 3": {
                "Title": "Key Information",
                "Description": "Financial and other key information about the company, including risk factors and ratios.",
            },
            "ITEM 4": {
                "Title": "Information on the Company",
                "Description": "Detailed information about the company's operations and properties.",
            },
            "ITEM 4A": {
                "Title": "Unresolved Staff Comments",
                "Description": ("Any comments from the SEC staff on the companys previous filings that "
                                "remain unresolved."),
            },
        },
        "PART II": {
            "ITEM 5": {
                "Title": "Operating and Financial Review and Prospects",
                "Description": "Managements discussion and analysis of financial condition and results of operations.",
            },
            "ITEM 6": {
                "Title": "Directors, Senior Management, and Employees",
                "Description": "Information about the company's directors, senior management, and employees.",
            },
            "ITEM 7": {
                "Title": "Major Shareholders and Related Party Transactions",
                "Description": "Information about major shareholders and transactions with related parties.",
            },
            "ITEM 8": {
                "Title": "Financial Information",
                "Description": "Audited financial statements and supplementary financial information.",
            },
            "ITEM 9": {
                "Title": "The Offer and Listing",
                "Description": "Details on the company's securities and markets where they are traded.",
            },
        },
        "PART III": {
            "ITEM 10": {
                "Title": "Additional Information",
                "Description": "Additional information such as share capital, memoranda, and articles of association.",
            },
            "ITEM 11": {
                "Title": "Quantitative and Qualitative Disclosures About Market Risk",
                "Description": "Information on the company's exposure to market risk.",
            },
            "ITEM 12": {
                "Title": "Description of Securities Other Than Equity Securities",
                "Description": "Detailed information on securities other than equity.",
            },
        },
        "PART IV": {
            "ITEM 13": {
                "Title": "Defaults, Dividend Arrearages, and Delinquencies",
                "Description": "Information about defaults on payments and arrearages.",
            },
            "ITEM 14": {
                "Title": "Material Modifications to the Rights of Security Holders and Use of Proceeds",
                "Description": "Details on any modifications to the rights of security holders.",
            },
            "ITEM 15": {
                "Title": "Controls and Procedures",
                "Description": "Assessment of the effectiveness of disclosure controls and internal controls over financial reporting.",
            },
            "ITEM 16": {
                "Title": "Various Disclosures",
                "Description": ("Includes disclosures related to audit committee financial experts, code of ethics, "
                                "principal accountant fees and services, and other corporate governance matters."),
            },
        },
        "PART V": {
            "ITEM 17": {
                "Title": "Financial Statements",
                "Description": "Financial statements prepared in accordance with or reconciled to U.S. GAAP or IFRS.",
            },
            "ITEM 18": {
                "Title": "Financial Statements",
                "Description": ("If different from Item 17, financial statements prepared in accordance with "
                                "home country standards."),
            },
            "ITEM 19": {
                "Title": "Exhibits",
                "Description": "Legal and financial documents supporting the information in the report.",
            },
        },
    })

    def __init__(self, filing):
        """Wrap ``filing`` as a 20-F report; any form other than 20-F or 20-F/A is rejected."""
        accepted_forms = ['20-F', '20-F/A']
        assert filing.form in accepted_forms, f"This form should be a 20-F but was {filing.form}"
        super().__init__(filing)

    def __str__(self):
        """Short text form of the report: ``TwentyF('<company>')``."""
        return "TwentyF('{}')".format(self.company)
class CurrentReport(CompanyReport):
    """
    Report wrapper for current-report filings (8-K, 8-K/A, 6-K, 6-K/A).

    ``structure`` lists the decimal-numbered items of the form. Item text comes from the
    chunked filing document; exhibit content is rendered for display and for ``text()``.
    """

    structure = ItemOnlyFilingStructure({
        "ITEM 1.01": {
            "Title": "Entry into a Material Definitive Agreement",
            "Description": "Reports any material agreement not made in the ordinary course of business."
        },
        "ITEM 1.02": {
            "Title": "Termination of a Material Definitive Agreement",
            "Description": "Reports the termination of any material agreement."
        },
        "ITEM 1.03": {
            "Title": "Bankruptcy or Receivership",
            "Description": "Reports any bankruptcy or receivership."
        },
        "ITEM 2.01": {"Title": "Completion of Acquisition or Disposition of Assets",
                      "Description": "Reports the completion of an acquisition or disposition of a significant " +
                                     "amount of assets."},
        "ITEM 2.02": {"Title": "Results of Operations and Financial Condition",
                      "Description": "Reports on the company's results of operations and financial condition."},
        "ITEM 2.03": {
            "Title": "Creation of a Direct Financial Obligation or an Obligation under an Off-Balance Sheet " +
                     "Arrangement of a Registrant",
            "Description": "Reports the creation of a direct financial obligation."},
        "ITEM 2.04": {
            "Title": "Triggering Events That Accelerate or Increase a Direct Financial Obligation or an Obligation "
                     + "under an Off-Balance Sheet Arrangement",
            "Description": "Reports any triggering events."},
        "ITEM 2.05": {"Title": "Costs Associated with Exit or Disposal Activities",
                      "Description": "Reports costs related to exit or disposal activities."},
        "ITEM 2.06": {"Title": "Material Impairments", "Description": "Reports on any material impairments."},
        "ITEM 3.01": {
            "Title": "Notice of Delisting or Failure to Satisfy a Continued Listing Rule or Standard; " +
                     "Transfer of Listing",
            "Description": "Reports on delisting or failure to satisfy listing rules."},
        "ITEM 3.02": {"Title": "Unregistered Sales of Equity Securities",
                      "Description": "Reports on the sale of unregistered equity securities."},
        "ITEM 3.03": {"Title": "Material Modification to Rights of Security Holders",
                      "Description": "Reports on any modifications to the rights of security holders."},
        "ITEM 4.01": {"Title": "Changes in Registrant's Certifying Accountant",
                      "Description": "Reports any change in the company's accountant."},
        "ITEM 4.02": {
            "Title": "Non-Reliance on Previously Issued Financial Statements or a Related Audit Report or " +
                     "Completed Interim Review",
            "Description": "Reports on non-reliance on previously issued financial statements."},
        "ITEM 5.01": {"Title": "Changes in Control of Registrant",
                      "Description": "Reports changes in control of the company."},
        "ITEM 5.02": {
            "Title": "Departure of Directors or Certain Officers; Election of Directors; Appointment of Certain " +
                     "Officers",
            "Description": "Compensatory Arrangements of Certain Officers: Reports any changes in the company's " +
                           "directors or certain officers."},
        "ITEM 5.03": {"Title": "Amendments to Articles of Incorporation or Bylaws; Change in Fiscal Year",
                      "Description": "Reports on amendments to articles of incorporation or bylaws."},
        "ITEM 5.04": {
            "Title": "Temporary Suspension of Trading Under Registrants Employee Benefit Plans",
            "Description": "Reports on the temporary suspension of trading under the companys employee benefit plans."
        },
        "ITEM 5.05": {
            "Title": "Amendment to the Registrants Code of Ethics, or Waiver of a Provision of the Code of Ethics",
            "Description": "Reports on amendments or waivers to the code of ethics."},
        "ITEM 5.06": {"Title": "Change in Shell Company Status",
                      "Description": "Reports a change in the company's shell company status."},
        "ITEM 5.07": {"Title": "Submission of Matters to a Vote of Security Holders",
                      "Description": "Reports on matters submitted to a vote of security holders."},
        "ITEM 5.08": {"Title": "Shareholder Director Nominations",
                      "Description": "Reports on shareholder director nominations."},
        "ITEM 6.01": {"Title": "ABS Informational and Computational Material",
                      "Description": "Reports ABS informational and computational material."},
        "ITEM 6.02": {"Title": "Change of Servicer or Trustee",
                      "Description": "Reports on the change of servicer or trustee."},
        "ITEM 6.03": {"Title": "Change in Credit Enhancement or Other External Support",
                      "Description": "Reports on changes in credit enhancement or external support."},
        "ITEM 6.04": {"Title": "Failure to Make a Required Distribution",
                      "Description": "Reports on the failure to make a required distribution."},
        "ITEM 6.05": {"Title": "Securities Act Updating Disclosure",
                      "Description": "Reports on Securities Act updating disclosure."},
        "ITEM 9.01": {
            "Title": "Financial Statements and Exhibits",
            "Description": "Reports financial statements and other exhibits related to the events reported in the 8-K."
        }
    })

    def __init__(self, filing):
        """Wrap ``filing`` as a current report; only 8-K, 8-K/A, 6-K and 6-K/A forms are accepted."""
        assert filing.form in ['8-K', '8-K/A', '6-K', '6-K/A'], f"This form should be an 8-K but was {filing.form}"
        super().__init__(filing)

    @property
    def has_press_release(self):
        """True when ``press_releases`` found at least one matching attachment."""
        return self.press_releases is not None

    @property
    def press_releases(self):
        """
        Return the attachments that look like press releases, or ``None`` when there are none.

        An attachment qualifies when its document name ends in ``.htm`` and either its
        description matches ``.*RELEASE`` or its type is EX-99, EX-99.1 or EX-99.01.
        """
        attachments: Attachments = self._filing.attachments
        # This query for press release currently includes EX-99, EX-99.1, EX-99.01 but not EX-99.2
        html_document = "document.endswith('.htm')"
        named_release = "re.match('.*RELEASE', description)"
        type_ex_99 = "document_type in ['EX-99.1', 'EX-99', 'EX-99.01']"
        press_release_query = f"{html_document} and ({named_release} or {type_ex_99})"
        press_release_results = attachments.query(press_release_query)
        if press_release_results:
            return PressReleases(press_release_results)
        return None

    @cached_property
    def chunked_document(self):
        """Chunked view of the filing HTML using decimal item detection; ``None`` when there is no HTML."""
        html = self._filing.html()
        if not html:
            return None
        decimal_chunk_fn = partial(chunks2df,
                                   item_detector=detect_decimal_items,
                                   item_adjuster=adjust_for_empty_items,
                                   item_structure=self.structure)
        return ChunkedDocument(html,
                               chunk_fn=decimal_chunk_fn)

    @property
    def doc(self):
        """Alias for ``chunked_document``."""
        return self.chunked_document

    @property
    def items(self) -> List[str]:
        """Items found in the filing document; empty when no chunked document is available."""
        if self.chunked_document:
            return self.chunked_document.list_items()
        return []

    def __getitem__(self, item_or_part: str):
        """
        Return the item from the filing document.

        Returns ``None`` when the filing has no HTML, because ``chunked_document`` is then
        ``None`` and cannot be indexed.
        """
        document = self.chunked_document
        if document is None:
            return None
        return document[item_or_part]

    def view(self, item_or_part: str):
        """Print the item from the filing document, if it has any text."""
        item_text = self[item_or_part]
        if item_text:
            print(item_text)

    @property
    def date_of_report(self):
        """Return the filing header's period of report formatted like "January 05, 2024", or "" when absent."""
        period_of_report_str = self._filing.header.period_of_report
        if period_of_report_str:
            period_of_report = datetime.strptime(period_of_report_str, "%Y-%m-%d")
            return period_of_report.strftime("%B %d, %Y")
        return ""

    def _get_exhibit_content(self, exhibit: Attachment) -> Optional[str]:
        """
        Get the content of the exhibit as text, or ``None`` when nothing could be obtained.
        """
        # For old filings the exhibit might not have a document. So we need to get the full text content
        # from the sgml content
        if exhibit.empty:
            # Download the SGML document
            sgml_document = self._filing.sgml().get_document_by_sequence(exhibit.sequence_number)
            if sgml_document:
                return sgml_document.text()
            return None
        html_content = exhibit.download()
        if html_content:
            document = Document.parse(html_content)
            return repr_rich(document, width=200, force_terminal=False)
        return None

    def _content_renderables(self):
        """Get the content of the exhibits as renderables (one panel per non-binary exhibit with content)."""
        renderables = []
        for exhibit in self._filing.exhibits:
            # Skip binary files
            if exhibit.is_binary():
                continue
            exhibit_content = self._get_exhibit_content(exhibit)
            if exhibit_content:
                # Remove text like [/she] and replace with (she) to prevent it being treated as rich markup
                cleaned_content = re.sub(r'\[(/[^]]*)]', r'(\1)', exhibit_content)
                title = Text.assemble(("Exhibit ", "bold gray54"), (exhibit.document_type, "bold green"))
                renderables.append(Panel(cleaned_content,
                                         title=title,
                                         subtitle=Text(exhibit.description, style="gray54"),
                                         box=box.SIMPLE))
        return Group(*renderables)

    def text(self):
        """Get the text of the EightK filing

        This includes the text content of all the exhibits
        """
        return rich_to_text(self._content_renderables())

    def __rich__(self):
        """Render a panel with a table of exhibits followed by the exhibits' content."""
        # Renderables for the panel.
        renderables = []
        # List the exhibits as a table
        exhibit_table = Table("", "Type", "Description",
                              title="Exhibits", show_header=True, header_style="bold", box=box.ROUNDED)
        renderables.append(exhibit_table)
        for exhibit in self._filing.exhibits:
            exhibit_table.add_row(exhibit.sequence_number, exhibit.document_type, exhibit.description)
        panel_title = Text.assemble(
            (f"{self.company}", "bold deep_sky_blue1"),
            (" ", ""),
            (f"{self.form}", "bold green"),
            (" ", ""),
            (f"{self.date_of_report}", "bold yellow")
        )
        # Add the content of the exhibits
        renderables.append(self._content_renderables())
        return Panel(
            Group(*renderables),
            title=panel_title,
            box=box.SIMPLE
        )

    def __str__(self):
        return f"{self.company} {self.form} {self.date_of_report}"

    def __repr__(self):
        return repr_rich(self.__rich__())
# Aliases for the current report: both names refer to the same CurrentReport class
EightK = CurrentReport
SixK = CurrentReport
class PressReleases:
    """
    Wraps the attachments of an 8-K filing that could be press releases.

    Indexing yields ``PressRelease`` objects; display is delegated to the attachments.
    """

    def __init__(self, attachments: Attachments):
        self.attachments: Attachments = attachments

    def __len__(self):
        """Number of wrapped attachments."""
        return len(self.attachments)

    def __getitem__(self, item):
        """Return the attachment at index ``item`` as a ``PressRelease``, or ``None`` if not found."""
        found = self.attachments.get_by_index(item)
        return PressRelease(found) if found else None

    def __rich__(self):
        renderable = self.attachments.__rich__()
        return renderable

    def __repr__(self):
        renderable = self.__rich__()
        return repr_rich(renderable)
class PressRelease:
    """
    Represents a press release attachment from an 8-K filing
    With the Type EX-99.1

    The downloaded HTML is cached per instance. A class-level ``lru_cache(maxsize=1)`` was
    used before; its single slot was shared by every press release (so two releases evicted
    each other on every call) and kept the most recent instance alive.
    """

    # Sentinel meaning "download() has not been called yet" (a download result may be falsy)
    _NOT_DOWNLOADED = object()
    # Per-instance cache slot; the class-level default also covers instances built without __init__
    _html = _NOT_DOWNLOADED

    def __init__(self, attachment: Attachment):
        self.attachment: Attachment = attachment

    def url(self):
        """URL of the underlying attachment."""
        return self.attachment.url

    @property
    def document(self) -> str:
        """Document name of the underlying attachment."""
        return self.attachment.document

    @property
    def description(self) -> str:
        """Description of the underlying attachment."""
        return self.attachment.description

    def html(self) -> str:
        """Download the attachment content once and return the cached result on later calls."""
        if self._html is PressRelease._NOT_DOWNLOADED:
            self._html = self.attachment.download()
        return self._html

    def text(self) -> Optional[str]:
        """Plain text extracted from the HTML, or ``None`` when there is no HTML content."""
        html = self.html()
        if html:
            return HtmlDocument.from_html(html, extract_data=False).text
        return None

    def open(self):
        """Open the underlying attachment (delegates to ``attachment.open()``)."""
        self.attachment.open()

    def view(self):
        """View the press release via its markdown rendering."""
        return self.to_markdown().view()

    def to_markdown(self):
        """Convert the press release HTML to markdown content titled "8-K Press Release"."""
        html = self.html()
        markdown_content = MarkdownContent.from_html(html, title="8-K Press Release")
        return markdown_content

    def __rich__(self):
        return self.to_markdown()

    def __repr__(self):
        return repr_rich(self.__rich__())
def is_valid_item_for_filing(filing_structure: Dict, item: str, part: str = None):
    """
    Return True if ``item`` is defined in ``filing_structure``.

    Item and part names are compared upper-cased. With a part, only that part is checked
    (an unknown or empty part gives False); without one, every part is searched.
    """
    wanted = item.upper()
    if not part:
        return any(wanted in part_items for part_items in filing_structure.values())
    part_items = filing_structure.get(part.upper())
    if not part_items:
        return False
    return wanted in part_items

View File

@@ -0,0 +1,689 @@
import asyncio
import datetime
import logging.config
import os
import random
import re
import sys
import threading
from _thread import interrupt_main
from concurrent.futures import ThreadPoolExecutor
from dataclasses import dataclass
from datetime import date
from functools import lru_cache, partial, wraps
from pathlib import Path
from typing import Callable, Iterable, List, Optional, Tuple, TypeVar, Union
import httpx
import pandas as pd
import pyarrow as pa
import pytz
from pandas.tseries.offsets import BDay
from rich.logging import RichHandler
from rich.prompt import Prompt
from edgar.datatools import PagingState
# Module-level logger, named after this module
log = logging.getLogger(__name__)
def parse_pandas_version(version: Optional[str] = None) -> Tuple[int, int, int]:
    """
    Parse a pandas version string into ``(major, minor, patch)`` without external dependencies.

    Only the leading digits of each of the first three dot-separated components are used, so
    pre-release, dev and build suffixes ("2.2.0rc1", "2.0.0b1", "3.0.0.dev0+123.gabc") are
    tolerated. Missing or non-numeric components count as 0.

    :param version: version string to parse; defaults to the installed ``pd.__version__``
    :return: tuple of three ints
    """
    if version is None:
        version = pd.__version__
    numbers = []
    for component in version.split('.')[:3]:
        leading_digits = re.match(r'\d+', component)
        numbers.append(int(leading_digits.group()) if leading_digits else 0)
    # Pad short versions such as "2" or "2.1" with zeros
    numbers.extend([0] * (3 - len(numbers)))
    return (numbers[0], numbers[1], numbers[2])


# Version of the installed pandas as a (major, minor, patch) tuple
pandas_version = parse_pandas_version()
# Interpreter version as a (major, minor, micro) tuple of ints.
# Taken from sys.version_info rather than by parsing the sys.version text, which fails
# with ValueError on pre-release or locally built interpreters (e.g. "3.13.0rc1", "3.12.0+").
python_version = tuple(sys.version_info[:3])
# Names exported by `from <this module> import *`
__all__ = [
    'log',
    'Result',
    'get_bool',
    'edgar_mode',
    'NORMAL',
    'CRAWL',
    'CAUTION',
    'sec_edgar',
    'IntString',
    'sec_dot_gov',
    'get_identity',
    'python_version',
    'set_identity',
    'strtobool',
    'listify',
    'decode_content',
    'cache_except_none',
    'text_extensions',
    'binary_extensions',
    'ask_for_identity',
    'is_start_of_quarter',
    'run_async_or_sync',
    'get_edgar_data_directory',
    'is_probably_html',
    'has_html_content',
    'default_page_size',
    'parse_acceptance_datetime',
    'PagingState',
    'Years',
    'Quarters',
    'YearAndQuarter',
    'YearAndQuarters',
    'quarters_in_year',
    'parallel_thread_map',
    'pandas_version'
]
# Type aliases
IntString = Union[str, int]
# The quarter numbers 1 through 4
quarters_in_year: List[int] = list(range(1, 5))
YearAndQuarter = Tuple[int, int]
YearAndQuarters = List[YearAndQuarter]
Years = Union[int, List[int], range]
Quarters = Union[int, List[int], range]
# Date patterns
YYYY_MM_DD = "\\d{4}-\\d{2}-\\d{2}"
DATE_PATTERN = re.compile(YYYY_MM_DD)
# Matches a whole string that is a single date, "start:end", "start:" or ":end"
DATE_RANGE_PATTERN = re.compile(f"^({YYYY_MM_DD}(:({YYYY_MM_DD})?)?|:({YYYY_MM_DD}))$")
# Module-level defaults
default_http_timeout: int = 12
default_page_size = 50
default_max_connections = 10
default_retries = 3
# httpx connection limits built from the default maximum number of connections
limits = httpx.Limits(max_connections=default_max_connections)
def strtobool (val:str):
    """Convert a string representation of truth to a boolean.

    True values are case insensitive 'y', 'yes', 't', 'true', 'on', and '1'.
    Everything else, including the usual false spellings ('n', 'no', 'f', 'false',
    'off', '0'), empty input and unrecognised text, gives False. No exception is
    raised for unrecognised values.
    """
    if not val:
        return False
    truthy_spellings = ('y', 'yes', 't', 'true', 'on', '1')
    return val.lower() in truthy_spellings
@dataclass
class EdgarSettings:
    """
    Settings that control how Edgar is accessed over HTTP.

    Two settings are equal, and hash alike, when their timeout, connection limit and
    retry count all match.
    """
    # Timeout passed as http_timeout when the access modes are defined
    http_timeout: int
    # Maximum number of connections; used to build `limits`
    max_connections: int
    # Number of retries
    retries: int = 3

    @property
    def limits(self):
        """
        httpx connection limits for these settings.

        Built from this instance's ``max_connections``. Previously the module-wide default
        was used for every instance, so modes with a lower ``max_connections`` still got
        the default limit.
        """
        return httpx.Limits(max_connections=self.max_connections)

    def __eq__(self, othr):
        return (isinstance(othr, type(self))
                and (self.http_timeout, self.max_connections, self.retries) ==
                (othr.http_timeout, othr.max_connections, othr.retries))

    def __hash__(self):
        return hash((self.http_timeout, self.max_connections, self.retries))
# Predefined access modes
NORMAL = EdgarSettings(http_timeout=15, max_connections=10)  # the normal mode
CAUTION = EdgarSettings(http_timeout=20, max_connections=5)  # a bit more cautious
# For long-running jobs that want to avoid breaching Edgar limits
CRAWL = EdgarSettings(http_timeout=25, max_connections=2, retries=2)

# The active mode is selected by the EDGAR_ACCESS_MODE environment variable;
# any value other than 'CAUTION' or 'CRAWL' selects NORMAL
edgar_access_mode = os.getenv('EDGAR_ACCESS_MODE', 'NORMAL')
edgar_mode = {'CAUTION': CAUTION, 'CRAWL': CRAWL}.get(edgar_access_mode, NORMAL)

# Name of the environment variable that holds the user identity
edgar_identity = 'EDGAR_IDENTITY'

# SEC urls
sec_dot_gov = "https://www.sec.gov"
sec_edgar = "https://www.sec.gov/Archives/edgar"

# Local storage directory.
edgar_data_dir = os.path.join(os.path.expanduser("~"), ".edgar")
def set_identity(user_identity: str):
    """
    Store the identity used when calling Edgar in the EDGAR_IDENTITY environment variable.

    The identity looks like
    "Sample Company Name AdminContact@<sample company domain>.com"
    See https://www.sec.gov/os/accessing-edgar-data

    Any open http clients are closed afterwards so the new identity takes effect.

    :param user_identity: the identity string to use
    """
    os.environ[edgar_identity] = user_identity
    log.info("Identity of the Edgar REST client set to [%s]", user_identity)

    # Imported here, at call time, as in the original implementation
    from edgar.httpclient import close_clients as reset_http_clients
    reset_http_clients()
# Prompt text (with rich markup) used as the default user_prompt of ask_for_identity
identity_prompt = """
[bold turquoise4]Identify your client to SEC Edgar[/bold turquoise4]
------------------------------------------------------------------------------
Before running [bold]edgartools[/bold] it needs to know the UserAgent string to send to Edgar.
See https://www.sec.gov/os/accessing-edgar-data
This can be set in the environment variable [bold green]EDGAR_IDENTITY[/bold green].
1. Set an OS environment variable
[bold]EDGAR_IDENTITY=[green]Name email@domain.com[/green][/bold]
2. Or a Python environment variable
import os
[bold]os.environ['EDGAR_IDENTITY']=[green]"Name email@domain.com"[/green][/bold]
3. Or use [bold magenta]edgartools.set_identity[/bold magenta]
from edgar import set_identity
[bold]set_identity([green]'Name email@domain.com'[/green])[/bold]
But since you are already using [bold]edgartools[/bold] you can set it here
Enter your [bold green]EDGAR_IDENTITY[/bold green] e.g. [bold italic green]Name email@domain.com[/bold italic green]
"""
def ask_for_identity(user_prompt: str = identity_prompt,
                     timeout: int = 60):
    """
    Prompt the user for their Edgar identity and return the stripped answer.

    A timer calls ``interrupt_main`` after ``timeout`` seconds; the resulting
    KeyboardInterrupt is reported as a TimeoutError.

    :param user_prompt: text shown to the user
    :param timeout: seconds to wait for an answer
    :raises TimeoutError: when the prompt is interrupted
    """
    watchdog = threading.Timer(timeout, interrupt_main)
    watchdog.start()
    try:
        # Ask, then strip surrounding whitespace from the answer
        return Prompt.ask(user_prompt).strip()
    except KeyboardInterrupt:
        message = "You did not enter your Edgar user identity. Try again .. or set environment variable EDGAR_IDENTITY"
        log.warning(message)
        raise TimeoutError(message) from None
    finally:
        # Always cancel the timer so it cannot interrupt the main thread later
        watchdog.cancel()
def get_identity() -> str:
    """
    Return the SEC identity used to set the UserAgent string.

    The value is read from the EDGAR_IDENTITY environment variable. When it is missing or
    empty the user is prompted, and the answer is stored in the environment before being
    returned.
    """
    identity = os.environ.get(edgar_identity)
    if identity:
        return identity
    identity = ask_for_identity()
    os.environ[edgar_identity] = identity
    return identity
def decode_content(content: bytes):
    """Decode ``content`` as UTF-8, falling back to latin-1 when it is not valid UTF-8."""
    try:
        text = content.decode('utf-8')
    except UnicodeDecodeError:
        text = content.decode('latin-1')
    return text
# File extensions treated as text. Every entry carries its leading dot
# (".XML" was previously written as "XML").
text_extensions = (".txt", ".htm", ".html", ".xsd", ".xml", ".XML", ".json", ".idx", ".paper")
# File extensions treated as binary. Every entry carries its leading dot
# (".png" was previously written as "png").
binary_extensions = (".pdf", ".jpg", ".jpeg", ".png", ".gif", ".tif", ".tiff", ".bmp", ".ico", ".svg", ".webp", ".avif",
                     ".apng")
def get_bool(value: str = None) -> Optional[bool]:
    """Return True when ``value`` is one of 1, "1", "Y", "true", "True" or "TRUE"; otherwise False."""
    accepted = (1, "1", "Y", "true", "True", "TRUE")
    return value in accepted
class Result:
    """
    Outcome of an operation that can either succeed or fail.

    Lets callers inspect a failure (``failure``, ``error``) instead of handling an exception.
    Use ``Result.Ok(value)`` and ``Result.Fail(error)`` to create instances.
    """

    def __init__(self,
                 success: bool,
                 error: Optional[str] = None,
                 value: Optional[object] = None):
        self.success = success
        self.error = error
        self.value = value

    @property
    def failure(self) -> bool:
        """:return True if the operation failed"""
        succeeded = self.success
        return not succeeded

    def __str__(self):
        if not self.success:
            return f'[Failure] "{self.error}"'
        return '[Success]'

    def __repr__(self):
        if not self.success:
            return f'Result (success={self.success}, message="{self.error}")'
        return f"Result (success={self.success})"

    @classmethod
    def Fail(cls,
             error: str):
        """Create a Result for a failed operation"""
        return cls(success=False, error=error, value=None)

    @classmethod
    def Ok(cls,
           value: object):
        """Create a Result for a successful operation"""
        return cls(True, error=None, value=value)
def get_resource(file: str):
    """
    Return ``importlib.resources.path(edgar, file)`` for a file packaged inside ``edgar``.

    :param file: name of the resource file within the ``edgar`` package
    """
    # Import the submodule explicitly: a bare `import importlib` does not guarantee that
    # `importlib.resources` is available as an attribute.
    import importlib.resources

    import edgar
    return importlib.resources.path(edgar, file)
def get_edgar_data_directory() -> Path:
    """
    Return the edgar data directory, creating it if it does not exist.

    The location is taken from the EDGAR_LOCAL_DATA_DIR environment variable and defaults
    to ``.edgar`` in the user's home directory.
    """
    configured = os.getenv('EDGAR_LOCAL_DATA_DIR')
    if configured is None:
        data_dir = Path(os.path.expanduser("~")) / ".edgar"
    else:
        data_dir = Path(configured)
    os.makedirs(data_dir, exist_ok=True)
    return data_dir
class TooManyRequestsException(Exception):
    """Exception carrying a single message describing a "too many requests" condition."""

    def __init__(self, message: str):
        Exception.__init__(self, message)
def filing_date_to_year_quarters(filing_date: str) -> List[Tuple[int, int]]:
    """
    Convert a filing date, or a "start:end" date range, into (year, quarter) pairs.

    A single "YYYY-MM-DD" date gives one pair. A range gives every quarter from the start
    date's quarter through the end date's quarter; an omitted start means "1994-06-01" and
    an omitted end means today.
    """
    def quarter_of(month: int) -> int:
        return (month - 1) // 3 + 1

    if ":" not in filing_date:
        year, month, _ = map(int, filing_date.split("-"))
        return [(year, quarter_of(month))]

    start_date, end_date = filing_date.split(":")
    start_date = start_date or "1994-06-01"
    end_date = end_date or date.today().strftime("%Y-%m-%d")
    start_year, start_month, _ = map(int, start_date.split("-"))
    end_year, end_month, _ = map(int, end_date.split("-"))
    first_quarter = quarter_of(start_month)
    last_quarter = quarter_of(end_month)
    # The first year starts at the start quarter, the last year stops at the end quarter,
    # and every other year contributes all four quarters
    return [
        (year, quarter)
        for year in range(start_year, end_year + 1)
        for quarter in range(first_quarter if year == start_year else 1,
                             (last_quarter if year == end_year else 4) + 1)
    ]
def current_year_and_quarter() -> Tuple[int, int]:
    """Return the current (year, quarter) as seen in the America/New_York timezone."""
    now_eastern = datetime.datetime.now(pytz.timezone('America/New_York'))
    quarter = (now_eastern.month - 1) // 3 + 1
    return now_eastern.year, quarter
def filter_by_date(data: pa.Table,
                   date: Union[str, datetime.datetime],
                   date_col: str) -> pa.Table:
    # NOTE(review): only the date normalisation step is visible at this point in the file;
    # no filtering of `data` on `date_col` and no return statement follow it here —
    # confirm the rest of this function against the full source.
    # If datetime convert to string
    if isinstance(date, datetime.date) or isinstance(date, datetime.datetime):
        # date/datetime values are rendered as 'YYYY-MM-DD'
        date = date.strftime('%Y-%m-%d')
def decode_content(content: bytes):
    """Decode ``content`` as UTF-8, falling back to latin-1 when it is not valid UTF-8."""
    try:
        text = content.decode('utf-8')
    except UnicodeDecodeError:
        text = content.decode('latin-1')
    return text
# File extensions treated as text. Every entry carries its leading dot
# (".XML" was previously written as "XML").
text_extensions = (".txt", ".htm", ".html", ".xsd", ".xml", ".XML", ".json", ".idx", ".paper")
# File extensions treated as binary. Every entry carries its leading dot
# (".png" was previously written as "png").
binary_extensions = (".pdf", ".jpg", ".jpeg", ".png", ".gif", ".tif", ".tiff", ".bmp", ".ico", ".svg", ".webp", ".avif",
                     ".apng")
class DataPager:
    """
    Page through a pyarrow Table or pandas DataFrame in fixed-size pages.

    Pages are numbered from 1. ``total_pages`` is the number of pages needed to hold all
    rows (at least 1, so empty data still has a single, empty page).
    """

    def __init__(self,
                 data: Union[pa.Table, pd.DataFrame],
                 page_size=default_page_size):
        self.data: Union[pa.Table, pd.DataFrame] = data
        self.page_size = page_size
        # Ceiling division. The previous `len // page_size + 1` counted one page too many
        # whenever the row count was an exact multiple of the page size, which exposed an
        # empty trailing page through next().
        self.total_pages = max(1, -(-len(self.data) // page_size))
        self.current_page = 1

    def has_next(self):
        """True when there is a page after the current one."""
        return self.current_page < self.total_pages

    def has_previous(self):
        """True when there is a page before the current one."""
        return self.current_page > 1

    def next(self):
        """Get the next page of data, or None when already on the last page"""
        if self.has_next():
            self.current_page += 1
            return self.current()
        return None

    def previous(self):
        """Get the previous page of data, or None when already on the first page"""
        if self.has_previous():
            self.current_page -= 1
            return self.current()
        return None

    @property
    def _current_range(self) -> Tuple[int, int]:
        """Get the current start and end index for the data (end is clamped to the data length)"""
        start_index = (self.current_page - 1) * self.page_size
        end_index = min(len(self.data), start_index + self.page_size)
        return start_index, end_index

    def current(self) -> pa.Table:
        """
        Get the current data page.

        A pyarrow Table is sliced with ``slice``; anything else is sliced with ``iloc``.
        :return: the rows of the current page
        """
        start_index = (self.current_page - 1) * self.page_size
        if isinstance(self.data, pa.Table):
            return self.data.slice(offset=start_index, length=self.page_size)
        end_index = start_index + self.page_size
        return self.data.iloc[start_index:end_index]

    @property
    def start_index(self):
        """Index of the first row of the current page."""
        return (self.current_page - 1) * self.page_size

    @property
    def end_index(self):
        """``start_index`` plus the page size (not clamped to the data length)."""
        return self.start_index + self.page_size
@dataclass
class PagingState:
    """Plain record of a paging position."""
    # presumably the index of the first record on the page — confirm against callers
    page_start: int
    # presumably how many records are involved — confirm against callers
    num_records: int
def parse_acceptance_datetime(acceptance_datetime: str) -> datetime.datetime:
    """
    Parse an ISO-8601 timestamp string into a datetime.

    Every 'Z' in the input is rewritten to '+00:00' before the string is handed
    to ``datetime.fromisoformat``.
    """
    iso_text = acceptance_datetime.replace('Z', '+00:00')
    return datetime.datetime.fromisoformat(iso_text)
def sample_table(table, n=None, frac=None, replace=False, random_state=None):
    """
    Take a random sample of rows from a pyarrow Table.

    Args:
        table: Object supporting ``len()`` and ``take(indices)``.
        n: Number of rows to sample. When both ``n`` and ``frac`` are None, all rows
            are returned in shuffled order.
        frac: Fraction of the rows to sample; when given it overrides ``n``.
        replace: Sample with replacement (the same row may be picked repeatedly).
        random_state: Seed for reproducible sampling. Any value other than None is
            honoured, including 0.

    Returns:
        The result of ``table.take(indices)`` for the sampled row indices.
    """
    # A dedicated generator yields the same sequence as seeding the module-level one,
    # without reseeding the process-wide RNG as a side effect.
    rng = random if random_state is None else random.Random(random_state)
    size = len(table)
    if frac is not None:
        n = int(size * frac)

    if n is None:
        indices = rng.sample(range(size), size)
    elif replace:
        # randint(0, -1) is invalid, so an empty table yields an empty sample
        indices = [rng.randint(0, size - 1) for _ in range(n)] if size else []
    else:
        indices = rng.sample(range(size), min(n, size))
    return table.take(indices)
def run_async_or_sync(coroutine):
    """
    Run a coroutine to completion and return its result.

    Outside IPython (the ``IPython`` module has not been imported) the coroutine is
    run with ``asyncio.run``. When IPython is loaded, the current event loop is used
    instead; if that loop is already running (e.g. in a notebook), ``nest_asyncio``
    is applied so that ``run_until_complete`` can be called re-entrantly.

    Exceptions raised by the coroutine propagate to the caller unchanged.
    """
    # Only the environment detection decides which path is taken. Keeping the coroutine
    # execution out of any try/except means a KeyError/AttributeError raised by the
    # coroutine itself cannot trigger a second run of the already-consumed coroutine.
    if 'IPython' not in sys.modules:
        return asyncio.run(coroutine)

    # try is needed for ipython console
    try:
        loop = asyncio.get_event_loop()
    except RuntimeError:
        import nest_asyncio
        nest_asyncio.apply()
        loop = asyncio.get_event_loop()

    if loop.is_running():
        # We're in a notebook with an active event loop
        import nest_asyncio
        nest_asyncio.apply()
    return loop.run_until_complete(coroutine)
def listify(value):
    """
    Wrap a value in a list unless it already is one.

    Args:
        value: Any object.

    Returns:
        list: ``value`` itself when it is a list, the expanded elements when it is a
        ``range``, otherwise a one-element list containing ``value``.
    """
    if isinstance(value, range):
        return list(value)
    return value if isinstance(value, list) else [value]
def is_start_of_quarter():
    """
    Return True when today is no later than one business day after the first day
    of a calendar quarter.

    Only the first five days of January, April, July and October are considered;
    every other date returns False immediately.
    """
    current_day = datetime.datetime.now().date()
    if current_day.month not in (1, 4, 7, 10) or current_day.day > 5:
        return False

    quarter_start = datetime.datetime(current_day.year, current_day.month, 1).date()
    # One business day after the quarter's first day
    cutoff = (quarter_start + BDay(1)).date()
    return current_day <= cutoff
def cache_except_none(maxsize=128):
    """
    A decorator that caches the result of a function, but only if the result is not None.

    Calls that return None are never stored, so the wrapped function is invoked again
    the next time it is called with the same arguments. Results that are already
    cached are left untouched by a None result.

    Args:
        maxsize: Maximum number of cached results, passed to ``functools.lru_cache``.

    The decorated function exposes ``cache_info()`` and ``cache_clear()``.
    """

    def decorator(func):

        class _NoneResult(Exception):
            """Internal signal that func returned None."""

        @lru_cache(maxsize=maxsize)
        def cached_func(*args, **kwargs):
            result = func(*args, **kwargs)
            if result is None:
                # lru_cache only stores values that are returned; raising keeps the
                # None out of the cache without disturbing the other entries
                raise _NoneResult()
            return result

        @wraps(func)
        def wrapper(*args, **kwargs):
            try:
                return cached_func(*args, **kwargs)
            except _NoneResult:
                return None

        # Preserve cache methods
        wrapper.cache_info = cached_func.cache_info
        wrapper.cache_clear = cached_func.cache_clear
        return wrapper

    return decorator
def is_probably_html(content: str) -> bool:
    """
    Does it have html tags

    Bytes are decoded as UTF-8 (undecodable bytes are dropped) and the check is
    case-insensitive. ``None`` is treated as "not HTML".
    """
    if content is None:
        return False
    if isinstance(content, bytes):
        content = content.decode('utf-8', errors='ignore')

    # Check for common HTML tags; lower-case once rather than once per tag
    lowered = content.lower()
    html_tags = ['<html>', '<body>', '<head>', '<title>', '<div', '<span', '<p>']
    return any(tag in lowered for tag in html_tags)
def has_html_content(content: str) -> bool:
    """
    Check if the content is HTML or inline XBRL HTML

    Bytes are decoded as UTF-8 (undecodable bytes are dropped) and leading whitespace
    is ignored. ``None`` is treated as "not HTML".
    """
    if content is None:
        return False

    if isinstance(content, bytes):
        content = content.decode('utf-8', errors='ignore')

    # Strip only leading whitespace and get first 200 chars for doctype check
    content = content.lstrip()
    first_200_lower = content[:200].lower()

    # Check for XHTML / HTML 4.01 doctype declarations
    doctype_markers = (
        '<!doctype html public "-//w3c//dtd xhtml',
        '<!doctype html system "http://www.w3.org/tr/xhtml1/dtd/',
        '<!doctype html public "-//w3c//dtd html 4.01 transitional//en"',
    )
    if any(marker in first_200_lower for marker in doctype_markers):
        return True

    # Look for common XML/HTML indicators in first 1000 chars
    first_1000 = content[:1000]

    # Check for standard XHTML namespace
    if 'xmlns="http://www.w3.org/1999/xhtml"' in first_1000:
        return True

    # An <html ...> root element is enough, with or without inline XBRL namespaces.
    # This also catches cases like <html style="..."> (this check is case-sensitive).
    if '<html' in first_1000:
        return True

    # Just check for straightforward HTML in any letter case: starts with <html> and
    # ends with </html>. Trailing whitespace after the closing tag is ignored so that a
    # final newline does not defeat the check.
    return first_200_lower.startswith('<html>') and content.rstrip()[-7:].lower() == '</html>'
T = TypeVar('T')
R = TypeVar('R')


def parallel_thread_map(func: Callable[[T], R],
                        items: Iterable[T],
                        **kwargs) -> List[R]:
    """
    Apply ``func`` to every item concurrently using a ThreadPoolExecutor.

    Thread-pool-only stand-in for fastcore's ``parallel``; there is no progress bar.

    Args:
        func: Callable applied to each item
        items: Items to process; consumed into a list up front
        **kwargs: Extra keyword arguments forwarded to ``func`` on every call. The
            special key ``n_workers`` is removed and used as the pool size.

    Returns:
        Results in the same order as ``items``
    """
    worker_count = kwargs.pop('n_workers', None)
    if not worker_count:
        # min(32, cores + 4) is a reasonable pool size for I/O-bound work
        worker_count = min(32, (os.cpu_count() or 1) + 4)

    work_items = list(items)
    # Bind the remaining keyword arguments only when there are any
    task = partial(func, **kwargs) if kwargs else func

    with ThreadPoolExecutor(max_workers=worker_count) as pool:
        return list(pool.map(task, work_items))
def initialize_rich_logging():
    """
    Configure the root logger to emit INFO-level messages through a RichHandler
    (with rich tracebacks) and quieten chatty third-party loggers.
    """
    rich_handler = RichHandler(rich_tracebacks=True)
    logging.basicConfig(level="INFO", format="%(message)s", datefmt="[%X]", handlers=[rich_handler])

    # Turn down 3rd party logging
    for noisy_logger in ("httpx", "httpxthrottlecache"):
        logging.getLogger(noisy_logger).setLevel(logging.WARNING)
    # TODO: Temporary, until next pyrate_limiter update that reduces the spurious "async" message
    logging.getLogger("pyrate_limiter").setLevel(logging.CRITICAL)


# Rich logging is opt-in: it is switched on at import time only when the
# EDGAR_USE_RICH_LOGGING environment variable is exactly '1'
if os.environ.get('EDGAR_USE_RICH_LOGGING', '0') == '1':
    initialize_rich_logging()

View File

@@ -0,0 +1,428 @@
import re
from datetime import datetime
from functools import lru_cache
from typing import Optional
import pyarrow as pa
import pyarrow.compute as pc
from bs4 import BeautifulSoup
from rich import box
from rich.console import Group
from rich.panel import Panel
from rich.status import Status
from rich.table import Table
from rich.text import Text
from edgar._filings import Filings
from edgar.core import IntString
from edgar.formatting import accepted_time_text, accession_number_text
from edgar.httprequests import get_with_retry
from edgar.reference.tickers import find_ticker
from edgar.xmltools import child_text
__all__ = [
'CurrentFilings',
'get_current_filings',
'get_all_current_filings',
'iter_current_filings_pages',
]
# Matches "<b>Label:</b> value" pairs in an entry summary: group 1 is the label text
# between the <b> tags (without the colon), group 2 is the run of characters after the
# following whitespace, up to the next whitespace or '<'.
summary_regex = re.compile(r'<b>([^<]+):</b>\s+([^<\s]+)')
# Splits an entry title shaped like "FORM - NAME (digits) (text)" into four groups:
# the text before the first " - ", the name, the digits in the first parentheses and
# the text in the last parentheses.
title_regex = re.compile(r"(.*?) - (.*) \((\d+)\) \((.*)\)")
"""
Get the current filings from the SEC. Use this to get the filings filed after the 5:30 deadline
"""
# Fixed atom-feed URL with owner=only and count=100.
# NOTE(review): not referenced by the functions visible in this part of the file —
# confirm whether it is still used elsewhere.
GET_CURRENT_URL = "https://www.sec.gov/cgi-bin/browse-edgar?action=getcurrent&output=atom&owner=only&count=100"
def _empty_filing_index():
    """Build a zero-row pyarrow Table that has the filing-index columns and types."""
    columns = [
        ('form', pa.string()),
        ('company', pa.string()),
        ('cik', pa.int32()),
        ('filing_date', pa.date32()),
        ('accession_number', pa.string()),
        ('accepted', pa.timestamp('s')),
    ]
    schema = pa.schema(columns)
    # One empty array per column, typed to match the schema
    empty_arrays = [pa.array([], type=column_type) for _, column_type in columns]
    return pa.Table.from_arrays(empty_arrays, schema=schema)
def parse_title(title: str):
    """
    Split a feed entry title into its four parts.

    For a title such as
        "144 - monday.com Ltd. (0001845338) (Subject)"
    the result is the tuple (form type, company name, CIK, status), all as strings.

    Raises:
        ValueError: if the title does not match ``title_regex``.
    """
    parsed = title_regex.match(title)
    if parsed is None:
        raise ValueError(f"Could not parse title: {title} using regex: {title_regex}")
    return parsed.groups()
def parse_summary(summary: str):
    """
    Extract the filing date and accession number from a feed entry summary.

    The summary is expected to contain ``<b>Label:</b> value`` pairs, for example
        "<b>Filed:</b> 2021-09-30 <b>AccNo:</b> 0001845338-21-000002 <b>Size:</b> 1 MB"

    Returns:
        Tuple of (filing date as a ``datetime.date``, accession number as a string).
        Other fields such as the size are ignored.

    Raises:
        ValueError: if 'Filed' or 'AccNo' is missing, or the date is not YYYY-MM-DD.
    """
    # Keep every value as the text that was matched: coercing digit-only values to int
    # would change the type of (and drop leading zeros from) an all-digit accession number.
    fields = {label.strip(): value for label, value in summary_regex.findall(summary)}

    filed_date = fields.get('Filed')
    if not filed_date:
        raise ValueError(f"Could not find 'Filed' date in summary: {summary}")

    accession_no = fields.get('AccNo')
    if not accession_no:
        raise ValueError(f"Could not find 'AccNo' in summary: {summary}")

    try:
        filing_date = datetime.strptime(filed_date, '%Y-%m-%d').date()
    except ValueError as e:
        raise ValueError(f"Invalid date format in summary: {filed_date}") from e

    return filing_date, accession_no
def get_current_url(atom: bool = True,
                    count: int = 100,
                    start: int = 0,
                    form: str = '',
                    owner: str = 'include'):
    """
    Build the browse-edgar "getcurrent" URL.

    Args:
        atom: Append ``&output=atom`` when true.
        count: Page size; any value other than 10, 20, 40, 80 or 100 is replaced by 40.
        start: Value of the ``start`` query parameter.
        form: Value of the ``type`` query parameter, inserted as given.
        owner: 'include', 'exclude' or 'only'; anything else is replaced by 'include'.
    """
    if count not in (10, 20, 40, 80, 100):
        count = 40
    if owner not in ('include', 'exclude', 'only'):
        owner = 'include'

    query = f"&count={count}&start={start}&type={form}&owner={owner}"
    output_suffix = "&output=atom" if atom else ""
    return "https://www.sec.gov/cgi-bin/browse-edgar?action=getcurrent" + query + output_suffix
@lru_cache(maxsize=32)
def get_current_entries_on_page(count: int, start: int, form: Optional[str] = None, owner: str = 'include'):
    """
    Fetch one page of the current-filings atom feed and return its entries as dicts.

    Each dict has the keys 'form', 'company', 'cik' (int), 'filing_date',
    'accession_number' and 'accepted' (a datetime parsed from the entry's
    ``updated`` element).

    Results are memoised by ``lru_cache``: repeating a call with the same arguments
    returns the previously built list without a new request.
    """
    feed_url = get_current_url(count=count, start=start, form=form or '', owner=owner, atom=True)
    feed = BeautifulSoup(get_with_retry(feed_url).text, features="xml")

    def to_record(entry):
        # The title contains the form type, company name, CIK, and status
        # e.g 4 - WILKS LEWIS (0001076463) (Reporting)
        form_type, company_name, cik, _status = parse_title(child_text(entry, "title"))
        # The summary contains the filing date and the accession number
        filing_date, accession_number = parse_summary(child_text(entry, "summary"))
        accepted = datetime.fromisoformat(child_text(entry, "updated"))
        return {'form': form_type,
                'company': company_name,
                'cik': int(cik),
                'filing_date': filing_date,
                'accession_number': accession_number,
                'accepted': accepted}

    return [to_record(entry) for entry in feed.find_all("entry")]
class CurrentFilings(Filings):
    """
    This version of the Filings class is used to get the current filings from the SEC
    page by page

    ``_start`` is the 1-based feed position of the first filing on the loaded page.
    Pages are requested from the feed with the offset ``position - 1`` (the first
    page is requested with ``start=0``).
    """

    def __init__(self,
                 filing_index: pa.Table,
                 form: str = '',
                 start: int = 1,
                 page_size: int = 40,
                 owner: str = 'include'):
        super().__init__(filing_index, original_state=None)
        self._start = start
        self._page_size = page_size
        self.owner = owner
        self.form = form

    def next(self):
        """
        Load the following page into this object.

        Returns:
            This same object, now holding the next page, or None when the loaded page
            is shorter than the page size or the feed returns no further entries.
        """
        # If the number of entries is less than the page size then we are at the end of the data
        if len(self.data) < self._page_size:
            return None
        start = self._start + len(self.data)
        next_entries = get_current_entries_on_page(start=start - 1, count=self._page_size, form=self.form,
                                                   owner=self.owner)
        if not next_entries:
            # Nothing more to show; returning None lets paging loops terminate
            return None
        # Copy the values to this Filings object and return it
        self.data = pa.Table.from_pylist(next_entries)
        self._start = start
        return self

    def previous(self):
        """
        Load the preceding page into this object.

        Returns:
            This same object, now holding the previous page, or None when already on
            the first page or the feed returns no entries.
        """
        # If start = 1 then there are no previous entries
        if self._start == 1:
            return None
        start = max(1, self._start - self._page_size)
        # _start is 1-based, the feed offset is not: use the same conversion as next()
        previous_entries = get_current_entries_on_page(start=start - 1, count=self._page_size, form=self.form,
                                                       owner=self.owner)
        if not previous_entries:
            return None
        # Copy the values to this Filings object and return it
        self.data = pa.Table.from_pylist(previous_entries)
        self._start = start
        return self

    def __getitem__(self, item):  # type: ignore
        """Like get(), but raise IndexError / KeyError instead of returning None."""
        result = self.get(item)
        if result is None:
            if isinstance(item, int) or item.isdigit():
                raise IndexError(f"Filing index {item} is out of range for current page")
            else:
                raise KeyError(f"Filing with accession number '{item}' not found")
        return result

    def __iter__(self):
        """Override to reset iteration index for current page"""
        self.n = 0
        return self

    def __next__(self):
        """Override to handle pagination properly - use page-relative indices"""
        if self.n < len(self.data):
            filing = super().get_filing_at(self.n)  # Use page-relative index directly
            self.n += 1
            return filing
        else:
            raise StopIteration

    def get(self, index_or_accession_number: IntString):
        """
        Look up a filing by feed index or by accession number.

        An int (or digit-only string) is treated as the index shown in the "#" column
        and is only resolved against the loaded page. Any other string is treated as
        an accession number: the loaded page is searched first, then the feed is
        re-read from the beginning in pages of 100 until a match is found or the feed
        is exhausted.

        Returns:
            The matching filing, or None when nothing matches.
        """
        if isinstance(index_or_accession_number, int) or index_or_accession_number.isdigit():
            idx = int(index_or_accession_number)
            if self._start - 1 <= idx < self._start - 1 + len(self.data):
                # Where on this page is the index
                idx_on_page = idx - (self._start - 1)
                return super().get_filing_at(idx_on_page)
            # Index is out of bounds for current page
            return None
        else:
            accession_number = index_or_accession_number.strip()
            # See if the filing is in this page
            filing = super().get(accession_number)
            if filing:
                return filing

            current_filings = get_current_filings(self.form, self.owner, page_size=100)
            filing = CurrentFilings._get_current_filing_by_accession_number(current_filings.data, accession_number)
            if filing:
                return filing
            with Status(f"[bold deep_sky_blue1]Searching through the most recent filings for {accession_number}...",
                        spinner="dots2"):
                while True:
                    current_filings = current_filings.next()
                    if current_filings is None:
                        return None
                    filing = CurrentFilings._get_current_filing_by_accession_number(current_filings.data,
                                                                                    accession_number)
                    if filing:
                        return filing

    @staticmethod
    def _get_current_filing_by_accession_number(data: pa.Table, accession_number: str):
        """Return a Filing built from the first row of ``data`` with this accession number, or None."""
        from edgar import Filing
        mask = pc.equal(data['accession_number'], accession_number)
        try:
            idx = mask.index(True).as_py()
            if idx > -1:
                return Filing(
                    cik=data['cik'][idx].as_py(),
                    company=data['company'][idx].as_py(),
                    form=data['form'][idx].as_py(),
                    filing_date=data['filing_date'][idx].as_py(),
                    accession_no=data['accession_number'][idx].as_py(),
                )
        except ValueError:
            # Accession number not found in this batch
            pass
        return None

    def __rich__(self):
        """Render the loaded page as a rich Panel containing a table and a paging hint."""
        # Create table with appropriate columns and styling
        table = Table(
            show_header=True,
            header_style="bold",
            show_edge=True,
            expand=False,
            padding=(0, 1),
            box=box.SIMPLE,
        )

        # Add columns with specific styling and alignment
        table.add_column("#", style="dim", justify="right")
        table.add_column("Form", width=14)
        table.add_column("CIK", style="dim", width=10, justify="right")
        table.add_column("Ticker", width=6, style="yellow")
        table.add_column("Company", style="bold green", width=38, no_wrap=True)
        table.add_column("Accepted", width=20)
        table.add_column("Accession Number", width=20)
        table.add_column(" ", width=1, style="cyan dim")  # Group indicator column

        num_rows = len(self.data)
        start_idx = self._start - 1

        # Get accession numbers for grouping
        accession_numbers = self.data.column('accession_number').to_pylist()

        # Identify groups of consecutive filings with same accession number
        # NOTE(review): the three in-group markers are empty strings here — confirm
        # whether distinct glyphs were intended for start / middle / end of a group.
        groups = {}
        last_position = len(accession_numbers) - 1
        for i, acc_no in enumerate(accession_numbers):
            # Check previous and next accession numbers
            prev_acc = accession_numbers[i - 1] if i > 0 else None
            next_acc = accession_numbers[i + 1] if i < last_position else None

            if acc_no != prev_acc and acc_no == next_acc:
                groups[i] = ''  # Start of group
            elif acc_no == prev_acc and acc_no == next_acc:
                groups[i] = ''  # Middle of group
            elif acc_no == prev_acc and acc_no != next_acc:
                groups[i] = ''  # End of group
            else:
                groups[i] = ' '  # Standalone filing

        # One table row per filing on the page
        for idx in range(num_rows):
            cik = self.data['cik'][idx].as_py()
            table.add_row(
                str(start_idx + idx),
                self.data['form'][idx].as_py(),
                str(cik),
                find_ticker(cik),
                self.data['company'][idx].as_py(),
                accepted_time_text(self.data['accepted'][idx].as_py()),
                accession_number_text(self.data['accession_number'][idx].as_py()),
                groups.get(idx, ' '),  # Add group indicator
            )

        # Paging information is always shown below the table
        page_info = Text.assemble(
            ("Showing ", "dim"),
            (f"{start_idx:,}", "bold red"),
            (" to ", "dim"),
            (f"{start_idx + num_rows - 1:,}", "bold red"),
            (" most recent filings.", "dim"),
            (" Page using ", "dim"),
            ("← prev()", "bold gray54"),
            (" and ", "dim"),
            ("next() →", "bold gray54")
        )
        elements = [table, Text("\n"), page_info]

        subtitle = "Most recent filings from the SEC"
        return Panel(
            Group(*elements),
            title="SEC Filings",
            subtitle=subtitle,
            border_style="bold grey54",
            expand=False
        )
def get_all_current_filings(form: str = '',
                            owner: str = 'include',
                            page_size: int = 100) -> 'Filings':
    """
    Collect every current filing by walking through all pages of the feed.

    Args:
        form: Form type to filter by (e.g., "10-K", "8-K")
        owner: Owner filter ('include', 'exclude', 'only')
        page_size: Number of filings per page (10, 20, 40, 80, 100)

    Returns:
        Filings: A regular Filings object (not CurrentFilings) holding the rows of all
        pages; it wraps an empty filing index when there are no filings at all.

    Example:
        >>> all_filings = get_all_current_filings(form="10-K")
        >>> print(f"Found {len(all_filings)} total current 10-K filings")
    """
    from edgar._filings import Filings

    pages = iter_current_filings_pages(form=form, owner=owner, page_size=page_size)
    # Each page's rows are copied out as plain dicts before the next page is requested
    collected = [row for page in pages for row in page.data.to_pylist()]

    filing_index = pa.Table.from_pylist(collected) if collected else _empty_filing_index()
    return Filings(filing_index)
def get_current_filings(form: str = '',
                        owner: str = 'include',
                        page_size: int = 40):
    """
    Get the first page of the current filings from the SEC

    :param form: form type to filter by; passed through unchanged
    :param owner: 'include', 'exclude' or 'only'; any other value becomes 'include'
    :param page_size: 10, 20, 40, 80 or 100; any other value becomes 100
    :return: A CurrentFilings object for the first page (empty when the feed has no entries)
    """
    if owner not in ('include', 'exclude', 'only'):
        owner = 'include'
    if page_size not in (10, 20, 40, 80, 100):
        page_size = 100

    first_page = get_current_entries_on_page(count=page_size, start=0, form=form, owner=owner)
    filing_index = pa.Table.from_pylist(first_page) if first_page else _empty_filing_index()
    return CurrentFilings(filing_index=filing_index, owner=owner, form=form, page_size=page_size)
def iter_current_filings_pages(form: str = '',
                               owner: str = 'include',
                               page_size: int = 100):
    """
    Generator over the pages of current filings, ending when ``next()`` returns None.

    Args:
        form: Form type to filter by (e.g., "10-K", "8-K")
        owner: Owner filter ('include', 'exclude', 'only')
        page_size: Number of filings per page (10, 20, 40, 80, 100)

    Yields:
        CurrentFilings: The pager positioned on each page in turn. ``next()`` updates
        the pager in place and returns that same object, so process (or copy) a page
        before advancing the iterator.

    Example:
        >>> for page in iter_current_filings_pages(form="10-K"):
        ...     print(f"Processing {len(page)} filings")
        ...     # Process each page
    """
    page = get_current_filings(form=form, owner=owner, page_size=page_size)
    while True:
        if page is None:
            return
        yield page
        page = page.next()

View File

@@ -0,0 +1,387 @@
from dataclasses import dataclass
from typing import Union
import numpy as np
import pandas as pd
import pyarrow as pa
from lxml import html as lxml_html
__all__ = [
"compress_dataframe",
"table_html_to_dataframe",
"table_tag_to_dataframe",
"markdown_to_dataframe",
"dataframe_to_text",
"clean_column_text",
'convert_to_numeric',
'describe_dataframe',
'na_value',
'replace_all_na_with_empty',
'convert_to_pyarrow_backend',
'drop_duplicates_pyarrow',
'repr_df',
'DataPager',
'PagingState',
]
def clean_column_text(text: str):
    """Collapse every run of whitespace (including newlines) into a single space
    and drop leading/trailing whitespace.

    ' Per Share ' -> 'Per Share'
    'Per\\nShare' -> 'Per Share'
    'Per  Share' -> 'Per Share'
    """
    # str.split() with no argument already discards leading/trailing whitespace
    words = text.split()
    return ' '.join(words)
def compress_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """Drop the rows and columns of a DataFrame that contain nothing.

    Empty strings count as missing while deciding what to drop; any missing values
    left afterwards are turned back into empty strings.

    Args:
        df: DataFrame to compress

    Returns:
        A new DataFrame without all-empty columns and rows
    """
    blanked = df.replace('', pd.NA)
    without_empty_columns = blanked.dropna(axis=1, how="all")
    compact = without_empty_columns.dropna(axis=0, how="all")
    return compact.fillna('')
def repr_df(df: pd.DataFrame, hide_index: bool = True) -> str:
    """Render a DataFrame as text via ``DataFrame.to_string``.

    Args:
        df: DataFrame to render
        hide_index: When true (the default) the index column is left out

    Returns:
        The rendered text
    """
    return df.to_string(index=not hide_index)
@dataclass
class PagingState:
    """Position within a paginated collection (pages are numbered from 1)."""
    page: int = 1
    page_size: int = 50
    total_items: int = 0

    @property
    def start_idx(self) -> int:
        """Index of the first item on the current page."""
        return self.page_size * (self.page - 1)

    @property
    def end_idx(self) -> int:
        """Index one past the last item on the current page, capped at total_items."""
        unclamped = self.start_idx + self.page_size
        return min(unclamped, self.total_items)

    @property
    def has_more(self) -> bool:
        """True when items remain after the current page."""
        return self.total_items > self.end_idx
class DataPager:
    """Serve slices ("pages") of a DataFrame or pyarrow Table."""

    def __init__(self, data: Union[pd.DataFrame, pa.Table], page_size: int = 50):
        """Remember the data and set up the paging state.

        Args:
            data: Data to paginate through
            page_size: Number of items per page
        """
        self.data = data
        self.state = PagingState(page_size=page_size, total_items=len(data))

    def get_page(self, page: int = 1) -> Union[pd.DataFrame, pa.Table]:
        """Move to ``page`` and return its slice of the data.

        The requested page number is stored on ``self.state`` before slicing.

        Args:
            page: Page number to get (1-based)

        Returns:
            Slice of data for the requested page
        """
        self.state.page = page
        first, last = self.state.start_idx, self.state.end_idx
        return self.data[first:last]
def adjust_column_headers(df: pd.DataFrame):
    """Blank out the column labels when every label is an integer.

    Integer labels are what pandas assigns when a DataFrame is built without column
    names. The DataFrame is modified in place and also returned.
    """
    only_int_labels = all(isinstance(label, int) for label in df.columns)
    if only_int_labels:
        df.columns = [''] * len(df.columns)
    return df
def should_promote_to_header(df: pd.DataFrame) -> bool:
if df.shape[0] > 1:
first_row = df.iloc[0]
# Check for uniformity and non-numeric nature
if all(isinstance(item, str) for item in first_row):
# Pattern matching for typical header keywords
header_keywords = {'title', 'name', 'number', 'description', 'date', 'total', 'id'}
if any(any(keyword in str(cell).lower() for keyword in header_keywords) for cell in first_row):
return True
# Check distinctiveness compared to the second row (simple heuristic)
second_row = df.iloc[1]
difference_count = sum(1 for f, s in zip(first_row, second_row, strict=False) if f != s)
if difference_count > len(first_row) / 2: # Arbitrary threshold: more than half are different
return True
return False
def table_html_to_dataframe(html_str: str) -> pd.DataFrame:
    """Convert the first ``<table>`` found in an HTML string into a DataFrame.

    Every ``<tr>`` under that table becomes a row, built from the cleaned text of its
    ``<td>`` and ``<th>`` cells. Integer column labels are blanked and empty rows and
    columns are removed.

    Raises:
        IndexError: if the HTML contains no ``<table>`` element.
    """
    document = lxml_html.fromstring(html_str)
    first_table = document.xpath("//table")[0]

    def cell_text(cell):
        raw_text = lxml_html.tostring(cell, method='text', encoding='unicode')
        return clean_column_text(raw_text.strip())

    # Handle both 'td' and 'th' if present
    records = [[cell_text(cell) for cell in row.xpath(".//td | .//th")]
               for row in first_table.xpath(".//tr")]

    frame = adjust_column_headers(pd.DataFrame(records))
    return compress_dataframe(frame)
def table_tag_to_dataframe(table_tag):
    """Convert a BeautifulSoup table Tag to a DataFrame.

    Each 'tr' becomes one row made of the cleaned text of its 'td' cells; 'th' cells
    are not collected.
    """
    records = [
        # get_text() gathers the text of nested tags as well
        [clean_column_text(cell.get_text(strip=True)) for cell in row.find_all('td')]
        for row in table_tag.find_all('tr')
    ]
    return pd.DataFrame(records)
def markdown_to_dataframe(markdown_table):
    """Parse a pipe-delimited markdown table into a DataFrame.

    The first line supplies the column names, the second line (the separator row) is
    skipped, and every remaining non-blank line is a data row. When there are no data
    rows, the header cells become the single row of a frame with blank column names.
    Empty rows and columns are removed from the result.
    """
    def split_cells(line):
        return [cell.strip() for cell in line.split('|')]

    lines = markdown_table.split('\n')
    header = split_cells(lines[0])
    body = [split_cells(line) for line in lines[2:] if line.strip()]

    if body:
        df = pd.DataFrame(body, columns=header)
    else:
        df = pd.DataFrame([header], columns=[""] * len(header))
    return compress_dataframe(df)
def dataframe_to_text(df, include_index=False, include_headers=False):
"""
Convert a Pandas DataFrame to a plain text string, with formatting options for including
the index and column headers.
Parameters:
- df (pd.DataFrame): The dataframe to convert
- include_index (bool): Whether to include the index in the text output. Defaults to True.
- include_headers (bool): Whether to include column headers in the text output. Defaults to True.
Returns:
str: The dataframe converted to a text string.
"""
# Getting the maximum width for each column
column_widths = df.apply(lambda col: col.astype(str).str.len().max())
# If including indexes, get the maximum width of the index
index_label = ''
if include_index:
index_label = "Index"
index_width = max(df.index.astype(str).map(len).max(), len(index_label))
else:
index_width = 0
# Initialize an empty string to store the text
text_output = ""
# Include column headers if specified
if include_headers:
# Add index label if specified
if include_index:
text_output += f"{index_label:<{index_width}}\t"
# Create and add the header row
headers = [f"{col:<{width}}" for col, width in zip(df.columns, column_widths, strict=False)]
text_output += '\t'.join(headers) + '\n'
# Loop through each row of the dataframe
for index, row in df.iterrows():
# Include index if specified
if include_index:
text_output += f"{index:<{index_width}}\t"
# Format each value according to the column width and concatenate
row_values = [f"{val:<{width}}" for val, width in zip(row.astype(str), column_widths, strict=False)]
text_output += '\t'.join(row_values) + '\n'
return text_output
def convert_to_numeric(series):
    """Convert a pandas Series to numeric if possible, otherwise return the original series.

    "Not possible" covers both values that cannot be parsed as numbers (ValueError) and
    values of a type that ``pd.to_numeric`` does not accept (TypeError).
    """
    try:
        return pd.to_numeric(series)
    except (ValueError, TypeError):
        return series
def describe_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """Summarise the dtypes and memory footprint of a DataFrame.

    Returns a DataFrame with one row for the index (labelled 'Index'), one row per
    column and a final 'Total' row. The columns are 'Data type' and
    'Memory Usage (KB)' (deep memory usage, rounded to 2 decimal places).
    """
    # Dtype of the index first, followed by the dtype of each column
    index_dtype = pd.Series(df.index.dtype, index=['Index'])
    all_dtypes = pd.concat([index_dtype, df.dtypes])

    # Deep memory usage per column (index included), in kilobytes
    usage_kb = df.memory_usage(deep=True) / 1024
    usage_kb.index = usage_kb.index.astype(str)  # Ensure index labels are string type
    usage_kb = usage_kb.round(2)

    per_item = pd.DataFrame(
        {'Data type': all_dtypes.to_numpy(), 'Memory Usage (KB)': usage_kb.to_numpy()},
        index=all_dtypes.index,
    )
    # The total is the sum of the rounded per-item figures
    total_row = pd.DataFrame(
        {'Data type': [''], 'Memory Usage (KB)': [usage_kb.sum()]},
        index=['Total'],
    )
    return pd.concat([per_item, total_row])
def convert_to_pyarrow_backend(data:pd.DataFrame):
# Convert dtypes carefully
for col in data.columns:
if data[col].dtype == 'object':
# For object columns, convert to string
data[col] = data[col].astype(str)
elif data[col].dtype == 'float64':
# For float columns, use float32 to match PyArrow's default
data[col] = data[col].astype('float32')
# Now convert to PyArrow
return data.convert_dtypes(dtype_backend="pyarrow")
def replace_all_na_with_empty(df_or_series):
    """Replace entirely-missing data with empty strings.

    For a DataFrame, every column whose values are all NA is overwritten in place with
    empty strings and the same DataFrame is returned. For a Series that is all NA, a
    new Series of empty strings with the same index and name is returned; otherwise
    the original Series is returned. Any other input type yields None.
    """
    if isinstance(df_or_series, pd.DataFrame):
        for column in df_or_series.columns:
            # Check if the column is all NA or None
            if df_or_series[column].isna().all():
                # Assign the scalar so it is broadcast over the frame's own index.
                # (Assigning a freshly built Series would align on a 0..n-1 index and
                # leave NaN in any frame that has a different index.)
                df_or_series[column] = ''
        return df_or_series
    elif isinstance(df_or_series, pd.Series):
        # Check if the series is all NA or None
        if df_or_series.isna().all():
            # Create a new Series of empty strings with the same index and name
            return pd.Series('', index=df_or_series.index, name=df_or_series.name)
        # If not all NA, return the original series
        return df_or_series
def na_value(value, default_value: object = ''):
    """Return ``default_value`` when ``pd.isna(value)`` is true, otherwise ``value``."""
    return default_value if pd.isna(value) else value
def drop_duplicates_pyarrow(table, column_name, keep='first'):
    """
    Remove rows of a PyArrow Table that repeat a value in one column.

    Parameters:
    - table (pa.Table): The input PyArrow Table
    - column_name (str): The column to check for duplicates
    - keep (str): 'first' to keep first occurrence, 'last' to keep last occurrence

    Returns:
    - pa.Table: A new table with duplicates removed; surviving rows keep their original order

    Raises:
    - ValueError: if the column does not exist or ``keep`` is not 'first' or 'last'
    """
    if column_name not in table.column_names:
        raise ValueError(f"Column '{column_name}' not found in table")

    if keep not in ['first', 'last']:
        raise ValueError("Parameter 'keep' must be 'first' or 'last'")

    values = table[column_name].to_numpy()

    # np.unique(..., return_index=True) reports the first position of each distinct value
    first_positions = np.unique(values, return_index=True)[1]
    if keep == 'first':
        kept_rows = first_positions
    else:
        # First positions in the reversed array, mapped back, are the last positions
        reversed_first = np.unique(values[::-1], return_index=True)[1]
        kept_rows = len(values) - 1 - reversed_first

    # A boolean mask keeps the surviving rows in table order
    keep_mask = np.zeros(len(table), dtype=bool)
    keep_mask[kept_rows] = True
    return table.filter(pa.array(keep_mask))

View File

@@ -0,0 +1,76 @@
import datetime
from typing import Optional, Tuple
__all__ = [
"extract_dates",
"InvalidDateException"
]
class InvalidDateException(Exception):
    """Raised when a date or date-range string cannot be interpreted."""

    def __init__(self, message: str):
        super(InvalidDateException, self).__init__(message)
def extract_dates(date_str: str) -> Tuple[Optional[datetime.datetime], Optional[datetime.datetime], bool]:
    """
    Parse a single date or a ':'-separated date range.

    Examples:
        extract_dates("2022-03-04") -> 2022-03-04, None, False
        extract_dates("2022-03-04:2022-04-05") -> 2022-03-04, 2022-04-05, True
        extract_dates("2022-03-04:") -> 2022-03-04, <current_date>, True
        extract_dates(":2022-03-04") -> 1994-07-01, 2022-03-04, True

    Args:
        date_str: Date string in YYYY-MM-DD format, optionally with a range separator ':'

    Returns:
        Tuple of (start_date, end_date, is_range). Dates are datetime objects; an open
        range start defaults to 1994-07-01 and an open range end to today at midnight.
        is_range is True whenever the input contains ':'.

    Raises:
        InvalidDateException: If the string is empty, has more than one ':', holds a
            date that cannot be parsed, or the start is after the end
    """
    if not date_str:
        raise InvalidDateException("Empty date string provided")

    is_range = ':' in date_str
    try:
        if not is_range:
            # A single date: there is no end date
            return datetime.datetime.strptime(date_str, "%Y-%m-%d"), None, False

        pieces = date_str.split(':')
        # Handle invalid formats
        if len(pieces) != 2:
            raise InvalidDateException("Invalid date range format")
        start_text, end_text = pieces

        if start_text:
            start_date = datetime.datetime.strptime(start_text, "%Y-%m-%d")
        else:
            start_date = datetime.datetime.strptime('1994-07-01', '%Y-%m-%d')

        if end_text:
            end_date = datetime.datetime.strptime(end_text, "%Y-%m-%d")
        else:
            end_date = datetime.datetime.now().replace(hour=0, minute=0, second=0, microsecond=0)

        # Validate date order
        if start_date > end_date:
            raise InvalidDateException(
                f"Invalid date range: start date ({start_date.date()}) "
                f"cannot be after end date ({end_date.date()})"
            )

        return start_date, end_date, True
    except ValueError as e:
        raise InvalidDateException(f"""
Cannot extract a date or date range from string {date_str}
Provide either
1. A date in the format "YYYY-MM-DD" e.g. "2022-10-27"
2. A date range in the format "YYYY-MM-DD:YYYY-MM-DD" e.g. "2022-10-01:2022-10-27"
3. A partial date range "YYYY-MM-DD:" to specify dates after the value e.g. "2022-10-01:"
4. A partial date range ":YYYY-MM-DD" to specify dates before the value e.g. ":2022-10-27"
""") from e

View File

@@ -0,0 +1,263 @@
# Filing Class Documentation
## Overview
The `Filing` class is the core object in edgartools for working with individual SEC filings. It provides comprehensive access to filing content, metadata, documents, and related functionality, making it easy to analyze and extract data from SEC filings.
## Common Actions
Quick reference for the most frequently used Filing methods:
### Access Filing Content
```python
# Get HTML content
html = filing.html()
# Get plain text
text = filing.text()
# Get markdown formatted content
markdown = filing.markdown()
```
### Get Structured Data
```python
# Get form-specific object (10-K, 10-Q, 8-K, etc.)
report = filing.obj()
# Get XBRL financial data
xbrl = filing.xbrl()
```
### View in Browser
```python
# Open filing in web browser
filing.open()
```
### Get Attachments
```python
# Access all filing attachments
attachments = filing.attachments
```
## Constructor
```python
Filing(cik: int, company: str, form: str, filing_date: str, accession_no: str)
```
**Parameters:**
- `cik`: Company's Central Index Key (integer)
- `company`: Company name (string)
- `form`: SEC form type (e.g., "10-K", "8-K", "DEF 14A")
- `filing_date`: Date of filing (YYYY-MM-DD format)
- `accession_no`: Unique SEC accession number
## Core Properties
| Property | Type | Description |
|----------|------|-------------|
| `cik` | int | Company's Central Index Key |
| `company` | str | Company name |
| `form` | str | SEC form type |
| `filing_date` | str | Filing date |
| `accession_no` | str | SEC accession number |
| `accession_number` | str | Alias for accession_no |
## Document Access
### Primary Documents
- **`document`** - The primary display document (HTML/XHTML)
- **`primary_documents`** - List of all primary documents
- **`attachments`** - All filing attachments
- **`exhibits`** - Filing exhibits
### Content Formats
- **`html()`** - HTML content of the primary document
- **`xml()`** - XML content of the primary document
- **`text()`** - Plain text version of the document
- **`markdown()`** - Markdown formatted version
## Financial Data Access
### XBRL Data
```python
# Access structured financial data
filing.xbrl() # Returns XBRLInstance with financial statements
filing.statements # Direct access to financial statements
```
### SGML Data
```python
# Access SGML filing data
filing.sgml() # Returns SGMLFiling object
```
## Navigation & URLs
| Property/Method | Description |
|----------------|-------------|
| `homepage` | Filing homepage information |
| `homepage_url` | URL to the filing homepage |
| `filing_url` | URL to the main filing document |
| `text_url` | URL to the text version |
| `base_dir` | Base directory URL for the filing |
## Search & Analysis
### Content Search
```python
# Search filing content
results = filing.search("revenue recognition", regex=False)
# Search with regex
results = filing.search(r"\b\d+\.\d+%", regex=True)
```
### Document Structure
- **`sections()`** - Get HTML sections for advanced search
- **`period_of_report`** - Get the reporting period
## Entity Relationships
### Company Integration
```python
# Get the associated Company object
company = filing.get_entity()
# Convert to company filing with additional data
company_filing = filing.as_company_filing()
# Find related filings
related = filing.related_filings()
```
## Display & Interaction
### Console Display
```python
# Rich console display
filing.view() # Display in console with rich formatting
# String representations
str(filing) # Concise string representation
repr(filing) # Detailed representation
```
### Browser Integration
```python
# Open filing in web browser
filing.open() # Open main document
filing.open_homepage() # Open filing homepage
# Serve filing locally
filing.serve(port=8000) # Serve on localhost:8000
```
## Data Export & Persistence
### Export Formats
```python
# Convert to different formats
filing_dict = filing.to_dict() # Dictionary
filing_df = filing.to_pandas() # DataFrame
summary_df = filing.summary() # Summary DataFrame
```
### Save & Load
```python
# Save filing for later use
filing.save("my_filing.pkl") # Save to file
filing.save("/path/to/directory/") # Save to directory
# Load saved filing
loaded_filing = Filing.load("my_filing.pkl")
```
## Class Methods
### Alternative Constructors
```python
# Create from dictionary
filing = Filing.from_dict(data_dict)
# Create from JSON file
filing = Filing.from_json("filing_data.json")
# Create from SGML data
filing = Filing.from_sgml(sgml_source)
```
## Common Usage Patterns
### Basic Filing Analysis
```python
# Get a filing and explore its content
filing = company.get_filings(form="10-K").latest()
# Access financial statements
statements = filing.xbrl()
income_statement = statements.income_statement
# Search for specific content
results = filing.search("risk factors")
# View in browser
filing.open()
```
### Working with Attachments
```python
# Get all attachments
attachments = filing.attachments
# Find specific exhibits
exhibits = filing.exhibits
exhibit_99_1 = [ex for ex in exhibits if "99.1" in ex.description]
# Access exhibit content
if exhibit_99_1:
content = exhibit_99_1[0].html()
```
### Financial Data Extraction
```python
# Get financial statements
xbrl = filing.xbrl()
# Access different statement types
balance_sheet = xbrl.balance_sheet
income_statement = xbrl.income_statement
cash_flow = xbrl.cash_flow_statement
# Get specific facts
revenue = xbrl.get_facts("Revenues")
```
## Error Handling
The Filing class handles various edge cases gracefully:
- **Missing documents**: Returns None or empty collections
- **Network errors**: Raises appropriate HTTP exceptions
- **Malformed data**: Provides informative error messages
- **File access**: Handles permissions and missing files
## Integration with Other Classes
The Filing class works seamlessly with other edgartools components:
- **Company**: Get filings from companies, convert back to company context
- **Filings**: Part of filing collections with filtering and search
- **XBRLInstance**: Access structured financial data
- **Attachments**: Work with filing documents and exhibits
## Performance Considerations
- **Lazy loading**: Documents and data are loaded only when accessed
- **Caching**: Network requests are cached to improve performance
- **Streaming**: Large documents can be processed in chunks
- **Async support**: Some operations support asynchronous execution
This comprehensive API makes the Filing class the primary interface for working with SEC filing data in edgartools.

View File

@@ -0,0 +1,302 @@
# Filings Class Documentation
## Overview
The `Filings` class is a powerful container for SEC filing data that provides comprehensive functionality for filtering, searching, pagination, and data manipulation. It's built on PyArrow tables for efficient processing of large datasets and offers an intuitive interface for working with collections of SEC filings.
## Getting Filings
```python
filings = get_filings()
```
- **Parameters**:
- `year`: Year of filings (optional)
- `quarter`: Quarter of filings (optional)
- `amendments`: Include amended filings (default: True)
- `ticker`: Company ticker symbol (optional)
- `filing_date`: Date or date range for filtering (optional)
## Core Properties
| Property | Type | Description |
|----------|------|-------------|
| `data` | pa.Table | PyArrow table with filing information |
| `date_range` | Tuple[str, str] | Start and end dates of filings |
| `start_date` | str | Earliest filing date in collection |
| `end_date` | str | Latest filing date in collection |
| `empty` | bool | True if collection contains no filings |
| `summary` | str | Description of current page/total filings |
## Data Access & Conversion
### DataFrame Conversion
```python
# Convert to pandas DataFrame
df = filings.to_pandas() # All columns
df = filings.to_pandas('form', 'company') # Specific columns
```
### Individual Filing Access
```python
# Get filing by index
filing = filings.get_filing_at(0) # First filing
filing = filings[0] # Alternative syntax
# Get filing by accession number
filing = filings.get("0000320193-23-000077")
# Get filing by index or accession
filing = filings.get(5) # By index
filing = filings.get("0000320193-23-000077") # By accession
```
### Export & Persistence
```python
# Save as Parquet file
filings.save_parquet("filings_data.parquet")
filings.save("filings_data.parquet") # Alternative
# Convert to dictionary
data_dict = filings.to_dict(max_rows=1000)
```
## Filtering & Search
### Form-based Filtering
```python
# Single form type
filings.filter(form="10-K")
filings.filter(form="8-K")
# Multiple form types
filings.filter(form=["10-K", "10-Q"])
filings.filter(form=["8-K", "DEF 14A"])
# Include/exclude amendments
filings.filter(form="10-K", amendments=True) # Include amendments
filings.filter(form="10-K", amendments=False) # Exclude amendments
```
### Date Filtering
```python
# Specific date
filings.filter(date="2023-06-15")
filings.filter(filing_date="2023-06-15") # Alternative
# Date ranges
filings.filter(date="2023-01-01:2023-03-31") # Between dates
filings.filter(date="2023-01-01:") # From date onwards
filings.filter(date=":2023-03-31") # Up to date
```
### Company-based Filtering
```python
# By CIK (Central Index Key)
filings.filter(cik=320193) # Single CIK
filings.filter(cik=[320193, 789019]) # Multiple CIKs
# By ticker symbol
filings.filter(ticker="AAPL")
filings.filter(ticker=["AAPL", "MSFT"])
# By exchange
filings.filter(exchange="NASDAQ")
filings.filter(exchange=["NYSE", "NASDAQ"])
# By accession number
filings.filter(accession_number="0000320193-23-000077")
```
### Company Search
```python
# Search for company and filter
apple_filings = filings.find("Apple")
microsoft_filings = filings.find("Microsoft Corporation")
```
### Combined Filtering
```python
# Complex filtering example
filtered = filings.filter(
form=["10-K", "10-Q"],
date="2023-01-01:2023-12-31",
ticker=["AAPL", "MSFT", "GOOGL"],
amendments=False
)
```
## Data Selection & Sampling
### Latest Filings
```python
# Get most recent filings
latest_filing = filings.latest() # Most recent (default n=1)
latest_five = filings.latest(5) # Most recent 5
```
### Head & Tail
```python
# Get first/last n filings
first_ten = filings.head(10) # First 10 filings
last_ten = filings.tail(10) # Last 10 filings
```
### Random Sampling
```python
# Get random sample
sample = filings.sample(20) # Random 20 filings
```
## Pagination
### Navigation
```python
# Navigate through pages
current_page = filings.current() # Current page info
next_page = filings.next() # Next page
prev_page = filings.previous() # Previous page
```
### Page Information
```python
# Check pagination status
print(filings.summary) # "Page 1 of 50 (total: 12,543 filings)"
is_empty = filings.empty # Check if no results
```
## File Operations
### Download Filings
```python
# Download all filings in collection
filings.download() # Download to default directory
filings.download("./my_filings/") # Download to specific directory
```
## Integration with Other Classes
### Filing Objects
```python
# Each item returns a Filing object
for filing in filings:
print(f"Form: {filing.form}")
print(f"Company: {filing.company}")
print(f"Date: {filing.filing_date}")
# Access filing content
html_content = filing.html()
attachments = filing.attachments
xbrl_data = filing.xbrl()
```
### Company Integration
```python
# Convert filing to company context
filing = filings[0]
company = filing.get_entity() # Get Company object
company_filing = filing.as_company_filing() # Enhanced filing with company data
```
## Rich Console Display
The Filings class provides formatted console output showing:
- Filing table with Form, CIK, Ticker, Company, Filing Date, Accession Number
- Pagination information
- Navigation hints
```python
# Display in console
print(filings) # Rich formatted table
filings.view() # Alternative display method
```
## Common Usage Patterns
### Quarterly Filing Analysis
```python
# Get all 10-K filings for 2023
annual_reports = get_filings(2023).filter(form="10-K", amendments=False)
# Find latest 10-Q for major tech companies
tech_quarterlies = get_filings(2023, 4).filter(
form="10-Q",
ticker=["AAPL", "MSFT", "GOOGL", "TSLA"]
).latest(4)
```
### Company-Specific Research
```python
# Get all Apple filings from Q1 2023
apple_filings = get_filings(2023, 1).find("Apple Inc")
# Filter for specific forms
apple_major_filings = apple_filings.filter(
form=["10-K", "10-Q", "8-K"],
amendments=False
)
```
### Event-Driven Analysis
```python
# Find 8-K filings around specific dates
event_filings = get_filings(2023, 2).filter(
form="8-K",
date="2023-02-01:2023-02-28"
)
# Sample for analysis
sample_events = event_filings.sample(50)
```
### Bulk Data Processing
```python
# Get large dataset and save for later
all_2023_filings = get_filings(2023)
all_2023_filings.save_parquet("2023_filings.parquet")
# Convert to pandas for analysis
df = all_2023_filings.to_pandas('form', 'company', 'filing_date')
```
## Performance Considerations
- **PyArrow Backend**: Efficient columnar data processing
- **Lazy Evaluation**: Filters are applied efficiently without loading full documents
- **Pagination**: Large datasets are handled through pagination
- **Caching**: Network requests are cached for improved performance
- **Parallel Processing**: Some operations support concurrent execution
## Error Handling
The Filings class handles various scenarios gracefully:
- **Empty Results**: Returns empty Filings object with `empty=True`
- **Invalid Filters**: Raises informative ValueError with guidance
- **Network Issues**: Propagates HTTP errors with context
- **Data Type Mismatches**: Automatic type conversion where possible
## Method Chaining
Most filtering and selection methods return new Filings objects, enabling method chaining:
```python
# Chain multiple operations
result = (filings
.filter(form=["10-K", "10-Q"])
.filter(date="2023-01-01:2023-06-30")
.filter(amendments=False)
.latest(10))
```
## Schema Information
The underlying PyArrow table contains these key columns:
- `form`: SEC form type
- `cik`: Company Central Index Key
- `ticker`: Stock ticker symbol
- `company`: Company name
- `filing_date`: Date of filing
- `accession_number`: Unique SEC identifier
- Additional metadata columns for enhanced functionality
This comprehensive API makes the Filings class the primary interface for working with collections of SEC filing data in edgartools, providing both power and ease of use for financial data analysis.

View File

@@ -0,0 +1,49 @@
"""
EdgarTools HTML Parser v2.0
A high-performance, semantically-aware HTML parser for SEC filings.
"""
from edgar.documents.parser import HTMLParser
from edgar.documents.document import Document
from edgar.documents.config import ParserConfig
from edgar.documents.exceptions import ParsingError
from edgar.documents.types import NodeType, SemanticType, TableType
from edgar.documents.search import DocumentSearch, SearchResult, SearchMode
from edgar.documents.renderers import MarkdownRenderer, TextRenderer
# Version of this parser package (the "v2.0" parser named in the module docstring)
__version__ = "2.0.0"
# Public API of the package: every name imported above, plus the parse_html
# convenience function defined below
__all__ = [
'HTMLParser',
'Document',
'ParserConfig',
'ParsingError',
'NodeType',
'SemanticType',
'TableType',
'DocumentSearch',
'SearchResult',
'SearchMode',
'MarkdownRenderer',
'TextRenderer',
'parse_html'
]
def parse_html(html: str, config: ParserConfig = None) -> Document:
    """
    Parse an HTML string into a Document in a single call.

    Builds an HTMLParser with the supplied configuration and runs it over
    the input. When no configuration (or a falsy one) is supplied, a default
    ParserConfig() is used instead.

    Args:
        html: HTML content to parse
        config: Optional parser configuration

    Returns:
        The Document returned by HTMLParser.parse

    Example:
        >>> document = parse_html(html_content)
        >>> print(document.text()[:100])
    """
    # Fall back to the default configuration when the caller supplied none
    effective_config = config if config else ParserConfig()
    html_parser = HTMLParser(effective_config)
    return html_parser.parse(html)

View File

@@ -0,0 +1,83 @@
"""
Mixin class providing text caching functionality for document nodes.
This module consolidates the text caching pattern used across multiple node types
(DocumentNode, ParagraphNode, ContainerNode, TableNode, and Document).
"""
from typing import Callable, Any
class CacheableMixin:
    """
    Lazy text-cache support for document nodes.

    The rendered text of a node is stored in the instance attribute
    ``_text_cache``. It is produced on first request by a caller-supplied
    generator function, reused on later requests, and can be invalidated
    for a whole subtree with ``clear_text_cache``.

    Usage:
        class MyNode(CacheableMixin):
            def text(self, **kwargs):
                def generator():
                    # Generate text logic here
                    return "generated text"
                return self._get_cached_text(generator)
    """

    def _get_cached_text(self, generator_func: Callable[[], Any], *args, **kwargs) -> Any:
        """
        Return the cached text, generating and storing it on a cache miss.

        Args:
            generator_func: Callable invoked to build the text on a cache miss
            *args: Positional arguments forwarded to generator_func on a miss
            **kwargs: Keyword arguments forwarded to generator_func on a miss

        Returns:
            The cached value if one is present, otherwise the freshly
            generated value

        Note:
            The cache is a single slot and is not keyed by the arguments:
            once a non-None value is stored, it is returned regardless of
            what is passed on later calls. A generated value of None is
            stored but counts as "no cache", so the generator runs again
            on the next call.
        """
        cached = getattr(self, '_text_cache', None)
        if cached is not None:
            return cached

        # Cache miss: build the text once and remember it on the instance
        fresh = generator_func(*args, **kwargs)
        self._text_cache = fresh
        return fresh

    def clear_text_cache(self) -> None:
        """
        Invalidate the cached text of this node and of its descendants.

        The node's own ``_text_cache`` is reset to None only if the attribute
        already exists; it is never created here. Every item in
        ``self.children`` (when the node has that attribute) that offers a
        ``clear_text_cache`` method is then cleared in turn, so invalidating
        a parent also invalidates the subtree below it.

        Safe to call on nodes that have no cache attribute, no children, or
        children that do not provide ``clear_text_cache``.
        """
        # Reset (never create) this node's own cache slot
        if hasattr(self, '_text_cache'):
            self._text_cache = None

        # A node without a 'children' attribute is treated as a leaf
        for child in getattr(self, 'children', ()):
            if not hasattr(child, 'clear_text_cache'):
                continue
            child.clear_text_cache()

View File

@@ -0,0 +1,211 @@
"""
Configuration for the HTML parser.
"""
from dataclasses import dataclass, field
from typing import List, Dict, Any, Optional
@dataclass
class DetectionThresholds:
    """
    Tunable thresholds used by the section detection strategies.

    Every field has a default, so ``DetectionThresholds()`` yields a usable
    configuration. Each field is described where it is declared.
    """

    # Minimum confidence score for a section to be included (0.0-1.0)
    min_confidence: float = 0.6
    # Multiplier applied when multiple detection methods agree (>1.0)
    cross_validation_boost: float = 1.2
    # Multiplier applied when detection methods disagree (<1.0)
    disagreement_penalty: float = 0.8
    # Multiplier applied to overlapping sections (<1.0)
    boundary_overlap_penalty: float = 0.9
    # Whether to run cross-validation (slower but more accurate);
    # disabled by default for performance
    enable_cross_validation: bool = False
    # Filing-specific threshold overrides; each instance gets its own empty dict
    thresholds_by_form: Dict[str, Dict[str, float]] = field(default_factory=dict)
@dataclass
class ParserConfig:
    """
    Settings that control how the HTML parser behaves.

    Fields are grouped below by concern (performance, parsing, AI
    optimization, tables, sections, feature flags, header detection, text
    extraction, monitoring). Three preset constructors are provided:
    ``for_performance``, ``for_accuracy`` and ``for_ai``.

    Attributes:
        max_document_size: Maximum document size in bytes
        streaming_threshold: Document size threshold for streaming mode
        cache_size: Maximum number of cached items
        enable_parallel: Enable parallel processing for tables
        strict_mode: Fail on parsing errors vs. best effort
        extract_xbrl: Extract inline XBRL facts
        extract_styles: Extract and process CSS styles
        preserve_whitespace: Preserve original whitespace
        optimize_for_ai: Enable AI-specific optimizations
        max_token_estimation: Maximum estimated tokens for AI optimization
        features: Feature flags for optional functionality
    """

    # --- Performance settings ---
    max_document_size: int = 100 * 1024 * 1024  # 100MB (handles large filings like JPM)
    streaming_threshold: int = 10 * 1024 * 1024  # 10MB
    cache_size: int = 1000
    enable_parallel: bool = True
    max_workers: Optional[int] = None  # None = use CPU count

    # --- Parsing settings ---
    strict_mode: bool = False
    extract_xbrl: bool = True
    extract_styles: bool = True
    preserve_whitespace: bool = False
    normalize_text: bool = True
    extract_links: bool = True
    extract_images: bool = False

    # --- AI optimization ---
    optimize_for_ai: bool = True
    max_token_estimation: int = 100_000
    chunk_size: int = 512
    chunk_overlap: int = 128

    # --- Table processing ---
    table_extraction: bool = True
    detect_table_types: bool = True
    extract_table_relationships: bool = True
    # Fast renderer is now production-ready (7-10x faster than Rich)
    fast_table_rendering: bool = True

    # --- Section detection ---
    detect_sections: bool = True
    # Extract sections during parsing vs. on first access (default: lazy)
    eager_section_extraction: bool = False
    # Required for section detection (e.g. '10-K', '10-Q', '8-K')
    form: Optional[str] = None
    detection_thresholds: DetectionThresholds = field(default_factory=DetectionThresholds)
    # Regex patterns per section key
    section_patterns: Dict[str, List[str]] = field(default_factory=lambda: {
        'business': [
            r'item\s+1\.?\s*business',
            r'business\s+overview',
            r'our\s+business',
        ],
        'risk_factors': [
            r'item\s+1a\.?\s*risk\s+factors',
            r'risk\s+factors',
            r'factors\s+that\s+may\s+affect',
        ],
        'properties': [
            r'item\s+2\.?\s*properties',
            r'properties',
        ],
        'legal_proceedings': [
            r'item\s+3\.?\s*legal\s+proceedings',
            r'legal\s+proceedings',
            r'litigation',
        ],
        'mda': [
            r'item\s+7\.?\s*management\'?s?\s+discussion',
            r'md&a',
            r'management\'?s?\s+discussion\s+and\s+analysis',
        ],
        'financial_statements': [
            r'item\s+8\.?\s*financial\s+statements',
            r'consolidated\s+financial\s+statements',
            r'financial\s+statements',
        ],
    })

    # --- Feature flags (all enabled by default) ---
    features: Dict[str, bool] = field(default_factory=lambda: {
        'ml_header_detection': True,
        'semantic_analysis': True,
        'table_understanding': True,
        'xbrl_validation': True,
        'auto_section_detection': True,
        'smart_text_extraction': True,
        'footnote_linking': True,
        'cross_reference_resolution': True,
    })

    # --- Header detection settings ---
    header_detection_threshold: float = 0.6  # Minimum confidence
    header_detection_methods: List[str] = field(default_factory=lambda: [
        'style',
        'pattern',
        'structural',
        'contextual',
    ])

    # --- Text extraction settings ---
    min_text_length: int = 10  # Minimum text length to keep
    merge_adjacent_nodes: bool = True
    merge_distance: int = 2  # Max distance between nodes to merge

    # --- Performance monitoring ---
    enable_profiling: bool = False
    log_performance: bool = False

    def to_dict(self) -> Dict[str, Any]:
        """
        Convert configuration to dictionary.

        Only a subset of the settings is exported: nine scalar performance
        and parsing options, followed by a shallow copy of ``features``.
        """
        exported = (
            'max_document_size',
            'streaming_threshold',
            'cache_size',
            'enable_parallel',
            'strict_mode',
            'extract_xbrl',
            'extract_styles',
            'preserve_whitespace',
            'optimize_for_ai',
        )
        snapshot: Dict[str, Any] = {name: getattr(self, name) for name in exported}
        # Copy so callers cannot mutate this config's flags through the result
        snapshot['features'] = self.features.copy()
        return snapshot

    @classmethod
    def for_performance(cls) -> 'ParserConfig':
        """
        Create config optimized for performance.

        Style and XBRL extraction are turned off, the cache is enlarged, and
        ``features`` is replaced by a dict holding only the four flags below,
        all set to False.
        """
        disabled = dict.fromkeys(
            (
                'ml_header_detection',
                'semantic_analysis',
                'table_understanding',
                'xbrl_validation',
            ),
            False,
        )
        return cls(
            extract_styles=False,
            extract_xbrl=False,
            enable_parallel=True,
            cache_size=5000,
            eager_section_extraction=False,  # Skip expensive section extraction
            fast_table_rendering=True,  # Fast renderer (enabled by default now)
            features=disabled,
        )

    @classmethod
    def for_accuracy(cls) -> 'ParserConfig':
        """
        Create config optimized for accuracy.

        Enables strict mode, style and XBRL extraction, and sets all eight
        feature flags to True.
        """
        enabled = dict.fromkeys(
            (
                'ml_header_detection',
                'semantic_analysis',
                'table_understanding',
                'xbrl_validation',
                'auto_section_detection',
                'smart_text_extraction',
                'footnote_linking',
                'cross_reference_resolution',
            ),
            True,
        )
        return cls(
            strict_mode=True,
            extract_styles=True,
            extract_xbrl=True,
            enable_parallel=True,
            features=enabled,
        )

    @classmethod
    def for_ai(cls) -> 'ParserConfig':
        """
        Create config optimized for AI/LLM processing.

        Skips style extraction, keeps XBRL extraction, text normalization and
        adjacent-node merging on, and replaces ``features`` with a dict
        holding only the three flags below, all set to True.
        """
        enabled = dict.fromkeys(
            (
                'ml_header_detection',
                'semantic_analysis',
                'smart_text_extraction',
            ),
            True,
        )
        return cls(
            optimize_for_ai=True,
            extract_styles=False,
            extract_xbrl=True,
            normalize_text=True,
            merge_adjacent_nodes=True,
            features=enabled,
        )

View File

@@ -0,0 +1,314 @@
# HTML Parser Rewrite - Status Report
**Generated**: 2025-10-08
**Branch**: `html_rewrite`
**Target**: Merge to `main`
---
## Overall Progress: ~95% Complete ✅
### Completed Phases
#### ✅ Phase 1: Core Implementation (100%)
- [x] Streaming parser for large documents
- [x] TableMatrix system for accurate table rendering
- [x] Section extraction with Part I/II detection
- [x] XBRL integration
- [x] Rich-based table rendering
- [x] Configuration system (ParserConfig)
- [x] Error handling and validation
#### ✅ Phase 2: Functional Testing (100%)
- [x] **Corpus Validation** - 40 diverse filings, 100% success rate
- [x] **Edge Cases** - 31 tests covering invalid inputs, malformed HTML, edge conditions
- [x] **Integration Tests** - 25 tests for Filing/Company integration, backward compatibility
- [x] **Regression Tests** - 15 tests preventing known bugs from returning
**Total Test Count**: 79 functional tests, all passing
#### ✅ Phase 3: Performance Profiling (100%)
- [x] **Benchmarking Infrastructure** - Comprehensive benchmark suite
- [x] **Hot Path Analysis** - Identified 3 critical bottlenecks (63% section extraction, 40% Rich rendering, 15% regex)
- [x] **Memory Profiling** - Found 255MB memory leak in MSFT 10-K, documented root causes
- [x] **Performance Regression Tests** - 15 tests locking in baseline thresholds
**Performance Baseline Established**:
- Average: 3.8MB/s throughput, 4.1MB memory per doc
- Small docs: 2.6MB/s (optimization opportunity)
- Large docs: 20.7MB/s (excellent streaming)
- Memory leak: 19-25x ratio on medium docs (needs fixing)
#### ✅ Phase 4: Test Data Augmentation (100%)
- [x] **HTML Fixtures** - Downloaded 32 files (155MB) from 16 companies across 6 industries
- [x] **Download Automation** - Created `download_html_fixtures.py` script
- [x] **Documentation** - Comprehensive fixture documentation
---
## Current Status: Ready for Optimization Phase
### What's Working Well ✅
1. **Parsing Accuracy**: 100% success rate across 40+ diverse filings
2. **Large Document Handling**: Excellent streaming performance (20.7MB/s on JPM 10-K)
3. **Table Extraction**: TableMatrix accurately handles colspan/rowspan
4. **Test Coverage**: 79 comprehensive tests covering edge cases, integration, regression
5. **Backward Compatibility**: Old TenK API still works for existing code
### Known Issues to Address 🔧
#### Critical (Must Fix Before Merge)
1. **Memory Leaks** (Priority: CRITICAL)
- MSFT 10-K: 255MB leak (19x document size)
- Apple 10-K: 41MB leak (23x document size)
- **Root Causes**:
- Rich Console objects retained (0.4MB per doc)
- Global caches not cleared on document deletion
- Circular references in node graph
- **Location**: `tests/perf/memory_analysis.md:90-130`
- **Impact**: Server crashes after 10-20 requests in production
2. **Performance Bottlenecks** (Priority: HIGH)
- Section extraction: 3.7s (63% of parse time)
- Rich rendering for text: 2.4s (40% of parse time)
- Regex normalization: 0.8s (15% of parse time)
- **Location**: `tests/perf/hotpath_analysis.md:9-66`
- **Impact**: 4x slower than necessary on medium documents
#### Non-Critical (Can Fix After Merge)
3. **Small Document Performance** (Priority: MEDIUM)
- 2.6MB/s vs desired 5MB/s
- Overhead dominates on <5MB documents
- **Optimization**: Lazy loading, reduce upfront processing
---
## Next Steps (In Order)
### Phase 5: Critical Fixes (2-4 days) 🔧
#### 5.1 Memory Leak Fixes (1-2 days)
**Goal**: Reduce memory leak from 255MB to <5MB
Tasks:
- [ ] Implement `Document.__del__()` to clear caches
- [ ] Replace Rich rendering in `text()` with direct string building
- [ ] Break circular references in node graph
- [ ] Use weak references for parent links
- [ ] Add `__slots__` to frequently created objects (Cell, TableNode)
**Expected Result**: MSFT 10-K leak: 255MB → <5MB (95% improvement)
**Validation**:
```bash
pytest tests/perf/test_performance_regression.py::TestMemoryRegression -v
```
#### 5.2 Performance Optimizations (1-2 days)
**Goal**: Improve parse speed from 1.2s → 0.3s on Apple 10-K (75% faster)
Tasks:
- [ ] Fix section detection - use headings instead of rendering entire document
- [ ] Implement fast text extraction without Rich overhead
- [ ] Optimize regex normalization - combine patterns, use compilation
**Expected Results**:
- Section extraction: 3.7s → 1.2s (68% faster)
- Text extraction: 2.4s → 1.2s (50% faster)
- Regex: 0.8s → 0.5s (40% faster)
**Validation**:
```bash
pytest tests/perf/test_performance_regression.py::TestParseSpeedRegression -v
```
### Phase 6: Final Validation (1 day) ✅
Tasks:
- [ ] Re-run all 79 functional tests
- [ ] Re-run performance regression tests (verify improvements)
- [ ] Run full corpus validation
- [ ] Memory profiling validation (confirm leaks fixed)
- [ ] Update CHANGELOG.md
- [ ] Create merge summary document
### Phase 7: Merge to Main (1 day) 🚀
Tasks:
- [ ] Final code review
- [ ] Squash commits or create clean merge
- [ ] Update version number
- [ ] Merge to main
- [ ] Tag release
- [ ] Monitor for issues
---
## Test Summary
### Current Test Status: 79/79 Passing (100%)
```
tests/corpus/test_corpus_validation.py 8 tests ✓
tests/test_html_parser_edge_cases.py 31 tests ✓
tests/test_html_parser_integration.py 25 tests ✓
tests/test_html_parser_regressions.py 15 tests ✓
tests/perf/test_performance_regression.py 15 tests ✓ (baseline established)
```
### Test Execution
```bash
# Functional tests (79 tests, ~30s)
pytest tests/corpus tests/test_html_parser_*.py -v
# Performance tests (15 tests, ~20s)
pytest tests/perf/test_performance_regression.py -m performance -v
# All tests
pytest tests/ -v
```
---
## Performance Metrics
### Current Baseline (Before Optimization)
| Document | Size | Parse Time | Throughput | Memory | Tables | Sections |
|----------|------|------------|------------|--------|--------|----------|
| Apple 10-Q | 1.1MB | 0.307s | 3.6MB/s | 27.9MB (25.6x) | 40 | 9 |
| Apple 10-K | 1.8MB | 0.500s | 3.6MB/s | 21.6MB (11.9x) | 63 | 8 |
| MSFT 10-K | 7.8MB | 1.501s | 5.2MB/s | 147.0MB (18.9x) | 85 | 0 |
| JPM 10-K | 52.4MB | 2.537s | 20.7MB/s | 0.6MB (0.01x) | 681 | 0 |
### Target Metrics (After Optimization)
| Metric | Current | Target | Improvement |
|--------|---------|--------|-------------|
| **Memory leak** | 41-255MB | <5MB | 95% reduction |
| **Memory ratio** | 19-25x | <3x | 87% reduction |
| **Parse time (Apple 10-K)** | 0.500s | 0.150s | 70% faster |
| **Throughput (small docs)** | 2.6MB/s | 5.0MB/s | 92% faster |
---
## File Organization
### Core Parser Files
```
edgar/documents/
├── __init__.py # Public API (parse_html)
├── parser.py # Main parser with streaming
├── config.py # ParserConfig
├── document_builder.py # Document tree construction
├── nodes/ # Node types (TableNode, SectionNode)
├── utils/
│ ├── streaming.py # Streaming parser (fixed JPM bug)
│ └── table_processing.py # TableMatrix system
└── exceptions.py # Custom exceptions
```
### Test Files
```
tests/
├── corpus/ # Corpus validation
│ ├── quick_corpus.py # Corpus builder
│ └── test_corpus_validation.py # 8 validation tests
├── fixtures/
│ ├── html/ # 32 HTML fixtures (155MB)
│ │ ├── {ticker}/10k/ # By company and form
│ │ └── README.md
│ └── download_html_fixtures.py # Download automation
├── perf/ # Performance testing
│ ├── benchmark_html_parser.py # Benchmarking
│ ├── profile_hotpaths.py # Hot path profiling
│ ├── profile_memory.py # Memory profiling
│ ├── test_performance_regression.py # Regression tests
│ ├── performance_report.md # Benchmark results
│ ├── hotpath_analysis.md # Bottleneck analysis
│ └── memory_analysis.md # Memory leak analysis
├── test_html_parser_edge_cases.py # 31 edge case tests
├── test_html_parser_integration.py # 25 integration tests
└── test_html_parser_regressions.py # 15 regression tests
```
---
## Risks and Mitigation
### Risk 1: Memory Leaks in Production
**Severity**: HIGH
**Probability**: HIGH (confirmed in testing)
**Mitigation**: Must fix before merge (Phase 5.1)
### Risk 2: Performance Regression
**Severity**: MEDIUM
**Probability**: LOW (baseline established, regression tests in place)
**Mitigation**: Performance regression tests will catch any degradation
### Risk 3: Backward Compatibility
**Severity**: LOW
**Probability**: LOW (integration tests passing)
**Mitigation**: 25 integration tests verify old API still works
---
## Estimated Timeline to Merge
```
Phase 5.1: Memory leak fixes 1-2 days
Phase 5.2: Performance optimization 1-2 days
Phase 6: Final validation 1 day
Phase 7: Merge to main 1 day
----------------------------------------
Total: 4-6 days
```
**Target Merge Date**: October 12-14, 2025
---
## Decision Points
### Should We Merge Now or After Optimization?
**Option A: Merge Now (Not Recommended)**
- ✅ Functional tests passing
- ✅ Backward compatible
- ❌ Memory leaks (production risk)
- ❌ Performance issues
- ❌ Will require hotfix soon
**Option B: Fix Critical Issues First (Recommended)**
- ✅ Production-ready
- ✅ Performance validated
- ✅ Memory efficient
- ❌ 4-6 days delay
- ✅ Clean, professional release
**Recommendation**: **Option B** - Fix critical memory leaks and performance issues before merge. The 4-6 day investment prevents production incidents and ensures a polished release.
---
## Questions for Review
1. **Scope**: Should we fix only critical issues (memory + performance) or also tackle small-doc optimization?
2. **Timeline**: Is 4-6 days acceptable, or do we need to merge sooner?
3. **Testing**: Are 79 functional tests + 15 performance tests sufficient coverage?
4. **Documentation**: Do we need user-facing documentation updates?
---
## Conclusion
The HTML parser rewrite is **95% complete** with excellent functional testing but critical memory and performance issues identified. The smart path forward is:
1. ✅ Complete critical fixes (4-6 days)
2. ✅ Validate improvements
3. ✅ Merge to main with confidence
This approach ensures a production-ready, performant parser rather than merging now and hotfixing later.

View File

@@ -0,0 +1,437 @@
# HTML Parser Rewrite - Progress Assessment
**Date**: 2025-10-07
**Status**: Active Development (html_rewrite branch)
---
## Executive Summary
The HTML parser rewrite is **substantially complete** for core functionality with **excellent progress** on Item/section detection. Recent bug fixes (2025-10-07) have addressed critical table rendering issues and 10-Q Part I/II distinction, bringing the parser close to production-ready quality.
### Overall Progress: **~90% Complete**
- ✅ Core parsing infrastructure: **100% Complete**
- ✅ Table processing: **95% Complete** (recent fixes)
- ✅ Section/Item detection: **95% Complete** (Part I/II fixed, needs validation)
- ⚠️ Performance optimization: **70% Complete**
- ⚠️ Comprehensive testing: **65% Complete** (added 10-Q Part tests)
- ⚠️ Documentation: **75% Complete**
---
## Goal Achievement Analysis
### Primary Goals (from goals.md)
#### 1. **Semantic Meaning Preservation** ✅ **ACHIEVED**
> "Read text, tables and ixbrl data preserving greatest semantic meaning"
**Status**: ✅ Fully implemented
- Text extraction with structure preservation
- Advanced table matrix system for accurate table rendering
- XBRL fact extraction before preprocessing
- Hierarchical node model maintains document structure
**Recent Improvements**:
- Header detection fixes (Oracle Table 6, Tesla Table 16)
- Spacing column filter now preserves header columns (MSFT Table 39)
- Multi-row header normalization
#### 2. **AI Channel (Primary) + Human Channel (Secondary)** ✅ **ACHIEVED**
> "AI context is the primary goal, with human context being secondary"
**Status**: ✅ Both channels working
- **AI Channel**:
- Clean text output optimized for LLMs
- Structured table rendering for context windows
- Section-level extraction for chunking
- Semantic divisibility supported
- **Human Channel**:
- Rich console rendering with proper formatting
- Markdown export
- Visual table alignment (recently fixed)
#### 3. **Section-Level Processing** ✅ **ACHIEVED**
> "Work at full document level and section level - breaking into independently processable sections"
**Status**: ✅ Implemented with good coverage
- `SectionExtractor` class fully functional
- TOC-based section detection
- Pattern-based section identification
- Lazy loading support for large documents
**What Works**:
```python
# Section detection is operational
doc = parse_html(html)
sections = doc.sections # Dict of section names -> SectionNode
# Access specific sections
business = sections.get('Item 1 - Business')
mda = sections.get('Item 7 - MD&A')
financials = sections.get('Item 8 - Financial Statements')
```
#### 4. **Standard Section Names (10-K, 10-Q, 8-K)** ✅ **ACHIEVED**
> "For some filing types (10-K, 10-Q, 8-K) identify sections by standard names"
**Status**: ✅ 95% Complete - Implemented with Part I/II distinction for 10-Q
**What's Implemented**:
- Pattern matching for standard Items:
- Item 1 - Business
- Item 1A - Risk Factors
- Item 7 - MD&A
- Item 7A - Market Risk
- Item 8 - Financial Statements
- And more...
- **10-Q Part I/Part II distinction** (newly fixed 2025-10-07):
- Part I - Item 1 (Financial Statements)
- Part II - Item 1 (Legal Proceedings)
- Proper boundary detection and context propagation
- Prevents Item number conflicts
**What's Remaining** (5%):
- Validation against large corpus of 10-K/10-Q filings
- Edge case handling (non-standard formatting)
- 8-K specific section patterns expansion
**Evidence from Code**:
```python
# edgar/documents/extractors/section_extractor.py
(r'^(Item|ITEM)\s+1\.?\s*Business', 'Item 1 - Business'),
(r'^(Item|ITEM)\s+1A\.?\s*Risk\s+Factors', 'Item 1A - Risk Factors'),
(r'^(Item|ITEM)\s+7\.?\s*Management.*Discussion', 'Item 7 - MD&A'),
(r'^(Item|ITEM)\s+8\.?\s*Financial\s+Statements', 'Item 8 - Financial Statements'),
# NEW: Part I/II detection (edgar/documents/extractors/section_extractor.py:294-324)
def _detect_10q_parts(self, headers) -> Dict[int, str]:
"""Detect Part I and Part II boundaries in 10-Q filings."""
```
#### 5. **Table Processing for AI Context** ✅ **ACHIEVED**
> "Getting tables in the right structure for rendering to text for AI context is more important than dataframes"
**Status**: ✅ Excellent progress with recent fixes
- Advanced TableMatrix system handles complex tables
- Multi-row header detection and normalization
- Spacing column filtering (preserves semantic columns)
- Currency symbol merging
- Clean text rendering for LLM consumption
**Recent Fixes (2025-10-07)**:
- ✅ Fixed spacing column filter removing legitimate headers (MSFT Table 39)
- ✅ Fixed header detection for date ranges (Oracle Table 6)
- ✅ Fixed long narrative text misclassification (Tesla Table 16)
- ✅ Header row normalization for alignment
#### 6. **Better Than Old Parser in Every Way** 🟡 **MOSTLY ACHIEVED**
> "Speed, accuracy, features, usability"
**Comparison**:
| Aspect | Old Parser | New Parser | Status |
|--------|-----------|------------|--------|
| **Speed** | Baseline | 1.4x faster (typical) | ✅ Better |
| **Accuracy** | Good | Excellent (with recent fixes) | ✅ Better |
| **Features** | Basic | Rich (XBRL, sections, multiple outputs) | ✅ Better |
| **Usability** | Simple | Powerful + Simple API | ✅ Better |
| **Table Rendering** | Basic alignment | Advanced matrix system | ✅ Better |
| **Section Detection** | Limited | Comprehensive | ✅ Better |
**Areas Needing Validation**:
- Performance on very large documents (>50MB)
- Memory usage under sustained load
- Edge case handling across diverse filings
---
## Item/Section Detection Deep Dive
### Current Capabilities
**10-K Sections Detected**:
- ✅ Item 1 - Business
- ✅ Item 1A - Risk Factors
- ✅ Item 1B - Unresolved Staff Comments
- ✅ Item 2 - Properties
- ✅ Item 3 - Legal Proceedings
- ✅ Item 4 - Mine Safety Disclosures
- ✅ Item 5 - Market for Stock
- ✅ Item 6 - Selected Financial Data
- ✅ Item 7 - MD&A
- ✅ Item 7A - Market Risk
- ✅ Item 8 - Financial Statements
- ✅ Item 9 - Changes in Accounting
- ✅ Item 9A - Controls and Procedures
- ✅ Item 9B - Other Information
- ✅ Item 10 - Directors and Officers
- ✅ Item 11 - Executive Compensation
- ✅ Item 12 - Security Ownership
- ✅ Item 13 - Related Transactions
- ✅ Item 14 - Principal Accountant
- ✅ Item 15 - Exhibits
**10-Q Sections Detected**:
- ✅ Part I Items (Financial Information):
- Part I - Item 1 - Financial Statements
- Part I - Item 2 - MD&A
- Part I - Item 3 - Market Risk
- Part I - Item 4 - Controls and Procedures
- ✅ Part II Items (Other Information):
- Part II - Item 1 - Legal Proceedings
- Part II - Item 1A - Risk Factors
- Part II - Item 2 - Unregistered Sales
- Part II - Item 6 - Exhibits
**✅ FIXED** (2025-10-07): Part I/Part II distinction now implemented!
- Part I Item 1 and Part II Item 1 are properly distinguished
- Section keys include Part context: "Part I - Item 1 - Financial Statements" vs "Part II - Item 1 - Legal Proceedings"
- Comprehensive test coverage added (5 tests in test_10q_part_detection.py)
**8-K Sections**:
- ⚠️ Limited - needs expansion
### Detection Methods
1. **TOC-based Detection**
- Analyzes Table of Contents
- Extracts anchor links
- Maps sections to content
2. **Pattern-based Detection**
- Regex matching for Item headers
- Heading analysis (h1-h6 tags)
- Text pattern recognition
3. **Hybrid Approach**
- Combines TOC + patterns
- Fallback mechanisms
- Cross-validation
### What's Working
```python
# This works today:
from edgar.documents import parse_html
html = filing.html()
doc = parse_html(html)
# Get all sections
sections = doc.sections # Returns dict
# Access specific Items
if 'Item 7 - MD&A' in sections:
mda = sections['Item 7 - MD&A']
mda_text = mda.text()
mda_tables = mda.tables()
```
### What Needs Work
1. **Validation Coverage** (20% remaining)
- Test against 100+ diverse 10-K filings
- Test against 10-Q filings
- Test against 8-K filings
- Capture edge cases and variations
2. **Edge Cases** (20% remaining)
- Non-standard Item formatting
- Missing TOC
- Nested sections
- Combined Items (e.g., "Items 10, 13, 14")
3. **8-K Support** (50% remaining)
- 8-K specific Item patterns
- Event-based section detection
- Exhibit handling
---
## Recent Achievements (Past 24 Hours, as of 2025-10-07)
### Critical Bug Fixes ✅
1. **Spacing Column Filter Fix** (MSFT Table 39)
- Problem: Legitimate headers removed as "spacing"
- Solution: Header content protection + colspan preservation
- Impact: Tables now render correctly with all headers
- Commits: `4e43276`, `d19ddd1`
2. **Header Detection Improvements**
- Oracle Table 6: Date ranges no longer misclassified
- Tesla Table 16: Long narrative text properly handled
- Multi-row header normalization
- Comprehensive test coverage (16 new tests)
3. **Documentation Updates**
- TESTING.md clarified output limits
- CHANGELOG updated with fixes
- Bug reports and research docs completed
### Quality Metrics
**Test Coverage**:
- 16 new tests added (all passing)
- 0 regressions in existing tests
- Comprehensive edge case coverage
**Code Quality**:
- Clean implementation following plan
- Well-documented changes
- Proper commit messages with Claude Code attribution
---
## Path to 100% Completion
### High Priority (Next Steps)
**📋 Detailed plans available**:
- **Performance**: See `docs-internal/planning/active-tasks/2025-10-07-performance-optimization-plan.md`
- **Testing**: See `docs-internal/planning/active-tasks/2025-10-07-comprehensive-testing-plan.md`
1. **Performance Optimization** (1-2 weeks)
- [ ] Phase 1: Benchmarking & profiling (2-3 days)
- [ ] Phase 2: Algorithm optimizations (3-4 days)
- [ ] Phase 3: Validation & regression tests (2-3 days)
- [ ] Phase 4: Documentation & monitoring (1 day)
- **Goal**: Maintain 1.3x+ speed advantage, <2x memory usage
2. **Comprehensive Testing** (2-3 weeks)
- [ ] Phase 1: Corpus validation - 100+ filings (3-4 days)
- [ ] Phase 2: Edge cases & error handling (2-3 days)
- [ ] Phase 3: Integration testing (2-3 days)
- [ ] Phase 4: Regression prevention (1-2 days)
- [ ] Phase 5: Documentation & sign-off (1 day)
- **Goal**: >95% success rate, >80% test coverage
3. **Item Detection Validation** (included in testing plan)
- [ ] Test against 50+ diverse 10-K filings
- [ ] Test against 20+ 10-Q filings
- [ ] Document any pattern variations found
- [ ] Add regression tests for edge cases
### Medium Priority
4. **8-K Support** (1-2 days)
- [ ] Research 8-K Item patterns
- [ ] Implement detection patterns
- [ ] Test against sample 8-K filings
5. **Documentation** (1 day)
- [ ] User guide for section access
- [ ] API documentation
- [ ] Migration guide from old parser
- [ ] Examples and recipes
### Low Priority (Polish)
6. **Final Polish**
- [ ] Error message improvements
- [ ] Logging enhancements
- [ ] Configuration documentation
- [ ] Performance tuning
---
## Risk Assessment
### Low Risk ✅
- Core parsing functionality (stable)
- Table processing (recently fixed, well-tested)
- Text extraction (working well)
- XBRL extraction (functional)
### Medium Risk ⚠️
- Section detection edge cases (needs validation)
- Performance on very large docs (needs testing)
- Memory usage (needs profiling)
### Mitigation Strategy
1. Comprehensive validation testing (in progress)
2. Real-world filing corpus testing
3. Performance benchmarking suite
4. Gradual rollout with monitoring
---
## Recommendations
### Immediate Actions (This Week)
1. **Validate Item Detection** 🎯 **TOP PRIORITY**
```bash
# Run on diverse corpus
python tests/manual/compare_parsers.py --all
# Test specific sections
python -c "
from edgar.documents import parse_html
from pathlib import Path
for filing in ['Apple', 'Oracle', 'Tesla', 'Microsoft']:
html = Path(f'data/html/{filing}.10-K.html').read_text()
doc = parse_html(html)
print(f'{filing}: {list(doc.sections.keys())[:5]}...')
"
```
2. **Create Section Access Tests**
- Write tests that verify each Item can be accessed
- Validate text and table extraction from sections
- Test edge cases (missing Items, combined Items)
3. **User Acceptance Testing**
- Have maintainer review section detection output
- Validate against known-good filings
- Document any issues found
### Timeline to Production
**Optimistic**: 1 week
- If validation shows good Item detection
- If performance is acceptable
- If no major issues found
**Realistic**: 2-3 weeks
- Account for edge case fixes
- Additional testing needed
- Documentation completion
**Conservative**: 4 weeks
- Account for 8-K support
- Comprehensive testing across all filing types
- Full documentation
---
## Conclusion
The HTML parser rewrite is **very close to completion** with excellent progress on all goals:
**✅ Fully Achieved**:
- Semantic meaning preservation
- AI/Human channel support
- Section-level processing
- Table processing for AI context
- Superior to old parser (in most respects)
- **Standard Item detection for 10-K/10-Q** (with Part I/II distinction)
**⚠️ Remaining Work (10%)**:
- Validation against diverse corpus
- Edge case handling
- 8-K specific support expansion
- Final testing and documentation
**Bottom Line**: The parser is **close to production-ready for 10-K/10-Q**: Item detection is functional but still requires validation against a diverse corpus. The recent bug fixes have resolved critical table rendering issues. With 1-2 weeks of focused validation and testing, this can be shipped with confidence.
### Next Steps
1. Run comprehensive Item detection validation
2. Create section access test suite
3. Performance benchmark
4. Maintainer review and sign-off
5. Merge to main branch

View File

@@ -0,0 +1,233 @@
# HTML Parser Testing Quick Start
Quick reference for testing the HTML parser rewrite during quality improvement.
## Quick Start
```bash
# Use shortcuts (easy!)
python tests/manual/compare_parsers.py aapl # Apple 10-K
python tests/manual/compare_parsers.py nvda --tables # Nvidia tables
python tests/manual/compare_parsers.py 'aapl 10-q' # Apple 10-Q
python tests/manual/compare_parsers.py orcl --table 5 # Oracle table #5
# Or use full paths
python tests/manual/compare_parsers.py data/html/Apple.10-K.html
# Run all test files
python tests/manual/compare_parsers.py --all
```
**Available shortcuts:**
- **Companies**: `aapl`, `msft`, `tsla`, `nvda`, `orcl` (or full names like `apple`)
- **Filing types**: `10-k` (default), `10-q`, `8-k`
- **Combine**: `'aapl 10-q'`, `'orcl 8-k'`
## Common Use Cases
### 1. First Look at a Filing
```bash
# Get overview: speed, table count, sections
python tests/manual/compare_parsers.py orcl
```
**Shows**:
- Parse time comparison (OLD vs NEW)
- Tables found
- Text length
- Sections detected
- New features (headings, XBRL)
### 2. Check Table Rendering
```bash
# List tables with dimensions (first 20 tables only)
python tests/manual/compare_parsers.py aapl --tables
# Compare specific table side-by-side (FULL table, no truncation)
python tests/manual/compare_parsers.py aapl --table 7
# Compare a range of tables
python tests/manual/compare_parsers.py aapl --range 5:10
```
**Look for**:
- Currency symbols merged: `$1,234` not `$ | 1,234`
- Proper column alignment
- Correct row/column counts
- Clean rendering without extra spacing columns
**Note**: `--table N` shows the **complete table** with all rows - no truncation!
### 3. Verify Text Extraction
```bash
# See first 50 lines side-by-side (default limit)
python tests/manual/compare_parsers.py msft --text
# Show more lines (configurable)
python tests/manual/compare_parsers.py msft --text --lines 100
# Show first 200 lines
python tests/manual/compare_parsers.py msft --text --lines 200
```
**Check**:
- Semantic meaning preserved
- No missing content
- Clean formatting for LLM consumption
**Note**: Text mode shows first N lines only (default: 50). Use `--lines N` to adjust.
### 4. Check Section Detection
```bash
python tests/manual/compare_parsers.py aapl --sections
```
**Verify**:
- Standard sections identified (10-K/10-Q)
- Section boundaries correct
- Text length reasonable per section
### 5. Run Full Test Suite
```bash
# Test all files in corpus
python tests/manual/compare_parsers.py --all
```
**Results**:
- Summary table across all files
- Performance comparison
- Table detection comparison
## Test Files
Available in `data/html/`:
- `Apple.10-K.html` - 1.8MB, complex financials
- `Oracle.10-K.html` - Large filing
- `Nvidia.10-K.html` - Tech company
- `Apple.10-Q.html` - Quarterly format
- More files as needed...
## Command Reference
```
python tests/manual/compare_parsers.py [FILE] [OPTIONS]
Options:
--all Run on all test files
--tables Show tables summary (first 20 tables)
--table N Show specific table N side-by-side (FULL table)
--range START:END Show range of tables (e.g., 5:10)
--text Show text comparison (first 50 lines by default)
--sections Show sections comparison
--lines N Number of text lines to show (default: 50, only for --text)
--help Show full help
```
### Output Limits Summary
| Mode | Limit | Configurable | Notes |
|---------------|------------|-------------------|---------------------------------|
| `--table N` | None | N/A | Shows **complete table** |
| `--range N:M` | None | N/A | Shows **complete tables** in range |
| `--tables` | 20 tables | No | Lists first 20 tables only |
| `--text` | 50 lines | Yes (`--lines N`) | Preview only |
| `--sections` | None | N/A | Shows all sections |
## Output Interpretation
### Overview Table
```
┏━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━┓
┃ Metric ┃ Old Parser ┃ New Parser ┃ Notes ┃
┡━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━┩
│ Parse Time │ 454ms │ 334ms │ 1.4x faster│
│ Tables Found │ 63 │ 63 │ +0 │
│ Text Length │ 0 │ 159,388 │ NEW! │
└───────────────┴────────────┴────────────┴────────────┘
```
**Good signs**:
- ✅ New parser faster or similar speed
- ✅ Same or more tables found
- ✅ Text extracted (old parser shows 0)
- ✅ Sections detected
**Red flags**:
- ❌ Significantly slower
- ❌ Fewer tables (unless removing layout tables)
- ❌ Much shorter text (content missing)
### Table Comparison
```
Old Parser:
┌─────────┬──────────┬──────────┐
│ Year │ Revenue │ Profit │
├─────────┼──────────┼──────────┤
│ 2023 │ $ 100M │ $ 20M │ <- Currency separated
└─────────┴──────────┴──────────┘
New Parser:
┌─────────┬──────────┬──────────┐
│ Year │ Revenue │ Profit │
├─────────┼──────────┼──────────┤
│ 2023 │ $100M │ $20M │ <- Currency merged ✅
└─────────┴──────────┴──────────┘
```
**Look for**:
- Currency symbols merged with values
- No extra empty columns
- Proper alignment
- Clean numeric formatting
## Tips
1. **Start with overview** - Get the big picture first
2. **Check tables visually** - Automated metrics miss formatting issues
3. **Use specific table inspection** - Don't scroll through 60 tables manually
4. **Compare text for semantics** - Does it make sense for an LLM?
5. **Run --all periodically** - Catch regressions across files
## Troubleshooting
### Script fails with import error
```bash
# Clear cached modules
find . -type d -name __pycache__ -exec rm -rf {} +
python tests/manual/compare_parsers.py data/html/Apple.10-K.html
```
### File not found
```bash
# Check available files
ls -lh data/html/*.html
# Use full path
python tests/manual/compare_parsers.py /full/path/to/file.html
```
### Old parser shows 0 text
This is expected: the old parser extracts text differently, so the comparison reports a text length of 0 for it. Focus on:
- Table comparison
- Parse time
- Visual quality of output
## Next Steps
1. Run comparison on all test files
2. Document bugs in `quality-improvement-strategy.md`
3. Fix issues
4. Repeat until satisfied
See `edgar/documents/docs/quality-improvement-strategy.md` for full process.

Some files were not shown because too many files have changed in this diff Show More