Initial commit
This commit is contained in:
4
venv/lib/python3.10/site-packages/edgar/__about__.py
Normal file
4
venv/lib/python3.10/site-packages/edgar/__about__.py
Normal file
@@ -0,0 +1,4 @@
|
||||
# SPDX-FileCopyrightText: 2022-present Dwight Gunning <dgunning@gmail.com>
|
||||
#
|
||||
# SPDX-License-Identifier: MIT
|
||||
__version__ = '4.25.0'
|
||||
188
venv/lib/python3.10/site-packages/edgar/__init__.py
Normal file
188
venv/lib/python3.10/site-packages/edgar/__init__.py
Normal file
@@ -0,0 +1,188 @@
|
||||
# SPDX-FileCopyrightText: 2022-present Dwight Gunning <dgunning@gmail.com>
|
||||
#
|
||||
# SPDX-License-Identifier: MIT
|
||||
import re
|
||||
from functools import lru_cache, partial
|
||||
from typing import List, Optional, Union
|
||||
|
||||
from edgar._filings import Attachment, Attachments, Filing, FilingHeader, FilingHomepage, Filings, get_by_accession_number, get_by_accession_number_enriched, get_filings
|
||||
from edgar.core import CAUTION, CRAWL, NORMAL, edgar_mode, get_identity, listify, set_identity
|
||||
from edgar.current_filings import CurrentFilings, get_all_current_filings, get_current_filings, iter_current_filings_pages
|
||||
from edgar.entity import (
|
||||
Company,
|
||||
CompanyData,
|
||||
CompanyFiling,
|
||||
CompanyFilings,
|
||||
CompanySearchResults,
|
||||
Entity,
|
||||
EntityData,
|
||||
find_company,
|
||||
get_cik_lookup_data,
|
||||
get_company_facts,
|
||||
get_company_tickers,
|
||||
get_entity,
|
||||
get_entity_submissions,
|
||||
get_icon_from_ticker,
|
||||
get_ticker_to_cik_lookup,
|
||||
)
|
||||
from edgar.files import detect_page_breaks, mark_page_breaks
|
||||
from edgar.files.html import Document
|
||||
from edgar.financials import Financials, MultiFinancials
|
||||
from edgar.funds import FundClass, FundCompany, FundSeries, find_fund
|
||||
from edgar.funds.reports import NPORT_FORMS, FundReport
|
||||
from edgar.storage import download_edgar_data, download_filings, is_using_local_storage, set_local_storage_path, use_local_storage
|
||||
from edgar.storage_management import (
|
||||
StorageAnalysis,
|
||||
StorageInfo,
|
||||
analyze_storage,
|
||||
availability_summary,
|
||||
check_filing,
|
||||
check_filings_batch,
|
||||
cleanup_storage,
|
||||
clear_cache,
|
||||
optimize_storage,
|
||||
storage_info,
|
||||
)
|
||||
from edgar.thirteenf import THIRTEENF_FORMS, ThirteenF
|
||||
from edgar.xbrl import XBRL
|
||||
|
||||
# Fix for Issue #457: Clear locale-corrupted cache files on first import
|
||||
# This is a one-time operation that only runs if the marker file doesn't exist
|
||||
try:
    # The import lives inside the try so that a failing import is swallowed too.
    from edgar.httpclient import clear_locale_corrupted_cache
    clear_locale_corrupted_cache()
except Exception:
    # Silently continue if cache clearing fails - it's not critical
    pass
|
||||
|
||||
# Other names for get_current_filings (all three are bound to the same function object)
get_latest_filings = get_current_filings
latest_filings = get_current_filings
current_filings = get_current_filings

# Fund portfolio report filings
get_fund_portfolio_filings = partial(get_filings, form=NPORT_FORMS)

# Restricted stock sales
# NOTE(review): the form is given as an int here - presumably get_filings normalises it; confirm
get_restricted_stock_filings = partial(get_filings, form=[144])

# Insider transaction filings
get_insider_transaction_filings = partial(get_filings, form=[3, 4, 5])

# 13F filings - portfolio holdings
get_portfolio_holding_filings = partial(get_filings, form=THIRTEENF_FORMS)
|
||||
|
||||
|
||||
@lru_cache(maxsize=16)
def find(search_id: Union[str, int]) -> Optional[Union[Filing, Entity, CompanySearchResults, FundCompany, FundClass, FundSeries]]:
    """This is an uber search function that can take a variety of search ids and return the appropriate object
        - accession number -> returns a Filing
        - CIK -> returns an Entity
        - Class/Contract ID -> returns a FundClass
        - Series ID -> returns a FundSeries
        - Ticker -> returns a Company or a Fund if the ticker is a fund ticker
        - Company name -> returns CompanySearchResults

    The checks below run in order and the first one that matches decides the result,
    so the order of the branches matters. Results are memoised per ``search_id``
    by ``lru_cache(maxsize=16)``.

    :param search_id: an int (treated as a CIK) or a string in one of the formats above
    :return: the object found, or None when the id looks like a malformed accession number
    """
    if isinstance(search_id, int):
        return Entity(search_id)
    # Dashed accession number. The pattern is not anchored at the end, so trailing
    # characters are passed through to the lookup unchanged.
    elif re.match(r"\d{10}-\d{2}-\d{6}", search_id):
        return get_by_accession_number_enriched(search_id)
    elif re.match(r"^\d{18}$", search_id):  # accession number with no dashes
        # Re-insert the dashes: 10 digits - 2 digits - 6 digits
        accession_number = search_id[:10] + "-" + search_id[10:12] + "-" + search_id[12:]
        return get_by_accession_number_enriched(accession_number)
    # 4 to 10 digits: treated as a CIK
    elif re.match(r"\d{4,10}$", search_id):
        return Entity(search_id)
    # The character class leaves out "X", so an all-caps id containing an X is not
    # handled here and falls through to the branches below.
    elif re.match(r"^[A-WYZ]{1,5}([.-][A-Z])?$", search_id):  # Ticker (including dot or hyphenated)
        return Entity(search_id)
    elif re.match(r"^[A-Z]{4}X$", search_id):  # Mutual Fund Ticker
        return find_fund(search_id)
    # "C" or "S" followed by digits: class/contract id or series id
    elif re.match(r"^[CS]\d+$", search_id):
        return find_fund(search_id)
    elif re.match(r"^\d{6,}-", search_id):
        # Probably an invalid accession number
        return None
    else:
        # Anything else is treated as a company name search
        return find_company(search_id)
|
||||
|
||||
|
||||
def matches_form(sec_filing: Filing,
                 form: Union[str, List[str]]) -> bool:
    """Return True when the filing's form is one of ``form`` or its amended "/A" variant.

    :param sec_filing: the filing whose ``form`` attribute is tested
    :param form: a single form name or a list of form names
    """
    base_forms = listify(form)
    amended_forms = [f"{base}/A" for base in base_forms]
    return sec_filing.form in base_forms + amended_forms
|
||||
|
||||
|
||||
class DataObjectException(Exception):
    """Error stating that no data object could be created for a filing.

    The text is available both as ``str(exc)`` and as ``exc.message``.
    """

    def __init__(self, filing: Filing):
        message = f"Could not create a data object for Form {filing.form} filing: {filing.accession_no}"
        super().__init__(message)
        self.message = message
|
||||
|
||||
|
||||
def obj(sec_filing: Filing) -> Optional[object]:
    """
    Depending on the filing return the data object that contains the data for the filing

    This usually comes from the xml associated with the filing, but it can also come from the extracted xbrl

    Every form check goes through ``matches_form``, so the amended "/A" variant of each
    form is handled by the same branch as the base form.

    :param sec_filing: The filing
    :return: the form-specific data object; failing that, the filing's XBRL when
             ``sec_filing.xbrl()`` is truthy; otherwise None
    """
    # NOTE(review): imported inside the function - presumably to avoid circular imports; confirm
    from edgar.company_reports import CurrentReport, EightK, TenK, TenQ, TwentyF
    from edgar.effect import Effect
    from edgar.form144 import Form144
    from edgar.muniadvisors import MunicipalAdvisorForm
    from edgar.offerings import FormC, FormD
    from edgar.ownership import Form3, Form4, Form5, Ownership

    if matches_form(sec_filing, "6-K"):
        return CurrentReport(sec_filing)
    if matches_form(sec_filing, "8-K"):
        return EightK(sec_filing)
    elif matches_form(sec_filing, "10-Q"):
        return TenQ(sec_filing)
    elif matches_form(sec_filing, "10-K"):
        return TenK(sec_filing)
    elif matches_form(sec_filing, "20-F"):
        return TwentyF(sec_filing)
    elif matches_form(sec_filing, THIRTEENF_FORMS):
        # ThirteenF can work with either XML (2013+) or TXT (2012 and earlier) format
        return ThirteenF(sec_filing)
    elif matches_form(sec_filing, "144"):
        return Form144.from_filing(sec_filing)
    elif matches_form(sec_filing, "MA-I"):
        return MunicipalAdvisorForm.from_filing(sec_filing)
    # The xml-based branches below only return when the filing has xml; without it
    # they fall through to the XBRL fallback at the end of the function.
    elif matches_form(sec_filing, "3"):
        xml = sec_filing.xml()
        if xml:
            return Form3(**Ownership.parse_xml(xml))
    elif matches_form(sec_filing, "4"):
        xml = sec_filing.xml()
        if xml:
            return Form4(**Ownership.parse_xml(xml))
    elif matches_form(sec_filing, "5"):
        xml = sec_filing.xml()
        if xml:
            return Form5(**Ownership.parse_xml(xml))
    elif matches_form(sec_filing, "EFFECT"):
        xml = sec_filing.xml()
        if xml:
            return Effect.from_xml(xml)
    elif matches_form(sec_filing, "D"):
        xml = sec_filing.xml()
        if xml:
            return FormD.from_xml(xml)
    elif matches_form(sec_filing, ["C", "C-U", "C-AR", "C-TR"]):
        xml = sec_filing.xml()
        if xml:
            return FormC.from_xml(xml, form=sec_filing.form)

    elif matches_form(sec_filing, ["NPORT-P", "NPORT-EX"]):
        return FundReport.from_filing(sec_filing)

    # Fallback for every form not returned above: use the filing's XBRL if there is any.
    # When there is none the function ends here and implicitly returns None.
    filing_xbrl = sec_filing.xbrl()
    if filing_xbrl:
        return filing_xbrl
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
2012
venv/lib/python3.10/site-packages/edgar/_filings.py
Normal file
2012
venv/lib/python3.10/site-packages/edgar/_filings.py
Normal file
File diff suppressed because it is too large
Load Diff
127
venv/lib/python3.10/site-packages/edgar/_markdown.py
Normal file
127
venv/lib/python3.10/site-packages/edgar/_markdown.py
Normal file
@@ -0,0 +1,127 @@
|
||||
import re
|
||||
|
||||
from rich import box
|
||||
from rich.console import Console, Group
|
||||
from rich.markdown import Markdown
|
||||
from rich.panel import Panel
|
||||
from rich.table import Table
|
||||
|
||||
from edgar.files.html_documents import HtmlDocument
|
||||
from edgar.richtools import repr_rich
|
||||
|
||||
__all__ = [
|
||||
'convert_table',
|
||||
'MarkdownContent',
|
||||
'markdown_to_rich',
|
||||
'html_to_markdown',
|
||||
"fix_markdown",
|
||||
"text_to_markdown",
|
||||
]
|
||||
|
||||
|
||||
def _empty(row):
|
||||
if not row:
|
||||
return True
|
||||
chars = set(re.sub(r"\s", "", row.strip()))
|
||||
return chars == {'|'} or chars == {'-', '|'}
|
||||
|
||||
|
||||
def convert_table(table_markdown: str):
    """Build a rich Table from the markdown text of one table.

    Rows are separated by newlines or by a "| |" sequence. Rows for which
    ``_empty`` is true (blank or separator rows) are skipped. The remaining rows
    have their first and last character dropped and are split on "|" into cells.
    """
    lines = table_markdown.replace("| |", "|\n|").split("\n")

    # Just output a simple table with no headers
    header = " " * lines[0].count("|")
    table = Table(header, box=box.SIMPLE)
    for line in lines:
        if _empty(line):
            continue
        cells = [cell.strip() for cell in line[1:-1].strip().split("|")]
        table.add_row(*cells)
    return table
|
||||
|
||||
|
||||
# Document-wrapper tag names.
# NOTE(review): not referenced in the visible part of this module and not in __all__ - confirm it is still used
skip_tags = ["<DOCUMENT>", "<TYPE>", "<SEQUENCE>", "<FILENAME>", "<DESCRIPTION>", "<TEXT>"]
|
||||
|
||||
|
||||
def markdown_to_rich(md: str, title: str = "") -> Panel:
    """Convert the markdown to rich .. handling tables better than rich

    The text is scanned line by line. A line containing "| |" starts a table;
    the table runs until the next blank line (which is consumed) or the end of
    the input. Table text is rendered with ``convert_table``, everything else
    with rich's ``Markdown``.

    :param md: the markdown text
    :param title: used as both the title and the subtitle of the panel
    :return: a rounded Panel grouping the rendered pieces in document order
    """
    content = []
    buf = ""
    table_buf = ""
    is_table = False
    for line in md.split("\n"):
        if is_table:
            if not line.strip():
                # A blank line ends the current table
                content.append(convert_table(table_buf))
                is_table = False
                table_buf = ""
            else:
                table_buf += line + "\n"
        else:
            if "| |" in line:
                # Flush the text collected so far, then start collecting table rows
                content.append(Markdown(buf))
                buf = ""
                table_buf = line + "\n"
                is_table = True
            else:
                buf += line + "\n"
    # A table that runs to the very end of the input has no terminating blank line;
    # flush it here, otherwise it would be dropped.
    if is_table and table_buf:
        content.append(convert_table(table_buf))
    if buf:
        content.append(Markdown(buf))
    return Panel(Group(*content), title=title, subtitle=title, box=box.ROUNDED)
|
||||
|
||||
|
||||
def fix_markdown(md: str):
    """Apply a fixed sequence of regex clean-ups to markdown text and return the result.

    The substitutions run in the order listed; later ones see the output of
    earlier ones.
    """
    substitutions = (
        # Clean up missing spaces between sentences like "Condition.On"
        (r"([a-z]\.)([A-Z])", r"\1 \2", 0),
        # Remove asterisks inside Items
        (r"\*\*(Item)\*\*\xa0\*\*(\d)", r"\1 \2", re.IGNORECASE),
        # And fix split Item numbers e.g. "Item\n5.02"
        (r"(Item)[\n\xa0]\s?(\d)", r"\1 \2", re.IGNORECASE),
        # Fix items not on newlines e.g. ". Item 5.02"
        (r"\. (Item)\s?(\d.\d{,2})", r".\n \1 \2", re.IGNORECASE),
        # Fix items with no space before Item e.g. "ReservedItem 7"
        (r"(\S)(Item)\s?(\d.\d{,2})", r"\1\n\n \2 \3", re.IGNORECASE),
    )
    for pattern, replacement, flags in substitutions:
        md = re.sub(pattern, replacement, md, flags=flags)
    return md
|
||||
|
||||
|
||||
def html_to_markdown(html: str) -> str:
    """Parse ``html`` with HtmlDocument.from_html and return the document's ``markdown``."""
    return HtmlDocument.from_html(html).markdown
|
||||
|
||||
|
||||
def text_to_markdown(text: str) -> str:
    """Wrap ``text`` in a <pre> element, with a newline before and after the element."""
    template = """
<pre>{}</pre>
"""
    return template.format(text)
|
||||
|
||||
|
||||
class MarkdownContent:
    """Markdown text with a title that can be rendered through rich."""

    def __init__(self,
                 markdown: str,
                 title: str = ""):
        self.title = title
        self.md = markdown

    @classmethod
    def from_html(cls, html: str, title: str = ""):
        """Alternate constructor: convert ``html`` to markdown first."""
        return cls(markdown=html_to_markdown(html), title=title)

    def view(self):
        """Print the rendered content to a new rich Console."""
        Console().print(self.__rich__())

    def __rich__(self):
        return markdown_to_rich(self.md, title=self.title)

    def __repr__(self):
        return repr_rich(self.__rich__())
|
||||
257
venv/lib/python3.10/site-packages/edgar/_party.py
Normal file
257
venv/lib/python3.10/site-packages/edgar/_party.py
Normal file
@@ -0,0 +1,257 @@
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from bs4 import Tag
|
||||
from pydantic import BaseModel
|
||||
from rich.columns import Columns
|
||||
from rich.console import Group
|
||||
from rich.panel import Panel
|
||||
from rich.table import Table
|
||||
from rich.text import Text
|
||||
|
||||
from edgar.core import IntString
|
||||
from edgar.richtools import repr_rich
|
||||
from edgar.xmltools import child_text, child_value
|
||||
|
||||
__all__ = [
|
||||
'Address',
|
||||
'Issuer',
|
||||
'Person',
|
||||
'Name',
|
||||
'Filer',
|
||||
'get_addresses_as_columns'
|
||||
]
|
||||
|
||||
|
||||
class Address(BaseModel):
    """A postal address; every field is optional and defaults to None."""

    street1: Optional[str] = None
    street2: Optional[str] = None
    city: Optional[str] = None
    state_or_country: Optional[str] = None
    state_or_country_description: Optional[str] = None
    zipcode: Optional[str] = None

    @property
    def empty(self):
        """True when street1, street2, city, state_or_country and zipcode are all falsy.

        ``state_or_country_description`` is not taken into account.
        """
        return not self.street1 and not self.street2 and not self.city and not self.state_or_country and not self.zipcode

    @classmethod
    def from_dict(cls, address_dict: Dict[str, Any]):
        """Build an address from a dict keyed by 'STREET1', 'STREET2', 'CITY', 'STATE' and 'ZIP'.

        Missing keys become None. ``state_or_country_description`` is never set here.
        """
        return cls(
            street1=address_dict.get('STREET1'),
            street2=address_dict.get('STREET2'),
            city=address_dict.get('CITY'),
            state_or_country=address_dict.get('STATE'),
            zipcode=address_dict.get('ZIP')
        )

    def __str__(self):
        """Multi-line display form; the empty string when there is no street1."""
        if not self.street1:
            return ""
        address_format = "{street1}\n"
        if self.street2:
            address_format += "{street2}\n"
        address_format += "{city}, {state_or_country} {zipcode}"

        return address_format.format(
            street1=self.street1,
            street2=self.street2,
            city=self.city,
            # Prefer the long description when present
            state_or_country=self.state_or_country_description or self.state_or_country,
            zipcode=self.zipcode or ""
        )

    def __repr__(self):
        # The `or ""` must sit inside the braces; outside them it is emitted as literal text.
        return (f'Address(street1="{self.street1 or ""}", street2="{self.street2 or ""}", city="{self.city or ""}",'
                f'zipcode="{self.zipcode or ""}", state_or_country="{self.state_or_country or ""}")'
                )
|
||||
|
||||
|
||||
def get_addresses_as_columns(*,
                             mailing_address: Optional[Address],
                             business_address: Optional[Address]) -> Columns:
    """
    Returns a rich Columns object with mailing and business addresses

    An address is left out when it is None (or otherwise falsy) or when its
    ``empty`` property is true. The mailing address, if shown, comes first.
    """
    titled_addresses = (
        ('\U00002709 Mailing Address', mailing_address),
        ('\U0001F3E2 Business Address', business_address),
    )
    panels = [
        Panel(Text(str(address)), title=panel_title, width=40)
        for panel_title, address in titled_addresses
        if address and not address.empty
    ]
    return Columns(panels, equal=True, expand=True)
|
||||
|
||||
|
||||
class Issuer:
    """
    The issuer described by a ``<primaryIssuer>`` element, for example:

    <primaryIssuer>
        <cik>0001961089</cik>
        <entityName>1685 38th REIT, L.L.C.</entityName>
        <issuerAddress>
            <street1>2029 CENTURY PARK EAST</street1>
            <street2>SUITE 1370</street2>
            <city>LOS ANGELES</city>
            <stateOrCountry>CA</stateOrCountry>
            <stateOrCountryDescription>CALIFORNIA</stateOrCountryDescription>
            <zipCode>90067</zipCode>
        </issuerAddress>
        <issuerPhoneNumber>424-313-1550</issuerPhoneNumber>
        <jurisdictionOfInc>DELAWARE</jurisdictionOfInc>
        <issuerPreviousNameList>
            <value>None</value>
        </issuerPreviousNameList>
        <edgarPreviousNameList>
            <value>None</value>
        </edgarPreviousNameList>
        <entityType>Limited Liability Company</entityType>
        <yearOfInc>
            <withinFiveYears>true</withinFiveYears>
            <value>2022</value>
        </yearOfInc>
    </primaryIssuer>

    """

    def __init__(self,
                 cik: IntString,
                 entity_name: str,
                 entity_type: str,
                 primary_address: Address,
                 phone_number: str,
                 jurisdiction: str,
                 issuer_previous_names: List[str],
                 edgar_previous_names: List[str],
                 year_of_incorporation: IntString,
                 incorporated_within_5_years: bool):
        self.cik = cik
        self.entity_name: str = entity_name
        self.entity_type = entity_type
        self.primary_address: Address = primary_address
        self.phone_number: str = phone_number
        self.issuer_previous_names = issuer_previous_names
        self.edgar_previous_names = edgar_previous_names
        self.jurisdiction: str = jurisdiction
        self.year_of_incorporation = year_of_incorporation
        self.incorporated_within_5_years: bool = incorporated_within_5_years

    @staticmethod
    def _previous_names(name_list_el) -> List[str]:
        """Return the text of each <value> child, skipping the placeholder text 'None'.

        An absent list element gives an empty list.
        """
        if not name_list_el:
            return []
        return [el.text for el in name_list_el.find_all("value") if el.text != 'None']

    @classmethod
    def from_xml(cls, issuer_el: Tag):
        """Build an Issuer from a ``<primaryIssuer>`` element.

        :param issuer_el: the parsed element
        :return: a new instance of ``cls``
        """
        edgar_previous_names = cls._previous_names(issuer_el.find("edgarPreviousNameList"))
        issuer_previous_names = cls._previous_names(issuer_el.find("issuerPreviousNameList"))

        year_of_inc_el = issuer_el.find("yearOfInc")
        # Always a real bool: False (not None) when <yearOfInc> is absent
        within_five_years = (year_of_inc_el is not None
                             and child_text(year_of_inc_el, "withinFiveYears") == "true")

        # Address
        # NOTE(review): assumes <issuerAddress> is present or that child_text tolerates None - confirm
        issuer_address_el = issuer_el.find("issuerAddress")
        address: Address = Address(
            street1=child_text(issuer_address_el, "street1"),
            street2=child_text(issuer_address_el, "street2"),
            city=child_text(issuer_address_el, "city"),
            state_or_country=child_text(issuer_address_el, "stateOrCountry"),
            state_or_country_description=child_text(issuer_address_el, "stateOrCountryDescription"),
            zipcode=child_text(issuer_address_el, "zipCode")
        )

        return cls(
            cik=child_text(issuer_el, "cik"),
            entity_name=child_text(issuer_el, "entityName"),
            phone_number=child_text(issuer_el, "issuerPhoneNumber"),
            jurisdiction=child_text(issuer_el, "jurisdictionOfInc"),
            entity_type=child_text(issuer_el, "entityType"),
            edgar_previous_names=edgar_previous_names,
            primary_address=address,
            issuer_previous_names=issuer_previous_names,
            # presumably child_value reads the nested <value> element - verify in edgar.xmltools
            year_of_incorporation=child_value(issuer_el, "yearOfInc"),
            incorporated_within_5_years=within_five_years
        )

    def __rich__(self):
        """One-row table showing entity name, entity type and year of incorporation."""
        table = Table("issuer", "entity type", "incorporated")
        table.add_row(self.entity_name, self.entity_type, self.year_of_incorporation)
        return Group(table)

    def __repr__(self):
        return repr_rich(self.__rich__())
|
||||
|
||||
|
||||
class Person:
    """A person identified by first and last name, with an optional address."""

    def __init__(self,
                 first_name: str,
                 last_name: str,
                 address: Optional["Address"] = None):
        self.first_name = first_name
        self.last_name = last_name
        self.address: Optional["Address"] = address

    def __str__(self):
        # First name followed by LAST name (previously the first name was printed twice)
        return f"{self.first_name} {self.last_name}"

    def __repr__(self):
        return f"{self.first_name} {self.last_name}"
|
||||
|
||||
|
||||
class Name:
    """A personal name made of first, middle and last name plus an optional suffix."""

    def __init__(self,
                 first_name: str,
                 middle_name: str,
                 last_name: str,
                 suffix: Optional[str] = None):
        self.first_name = first_name
        self.middle_name = middle_name
        self.last_name = last_name
        self.suffix = suffix

    @property
    def full_name(self):
        """The name parts joined by single spaces.

        A missing (None or empty) middle name or suffix is left out without
        leaving a doubled or trailing space.
        """
        # Decide on the middle part first: `' ' + middle or ''` would bind as
        # `(' ' + middle) or ''`, failing on None and doubling the space on "".
        middle = f" {self.middle_name}" if self.middle_name else ""
        return f"{self.first_name}{middle} {self.last_name} {self.suffix or ''}".rstrip()

    def __str__(self):
        return self.full_name

    def __repr__(self):
        return self.full_name
|
||||
|
||||
|
||||
class Filer:
    """A filer identified by CIK, entity name and file number."""

    def __init__(self,
                 cik: str,
                 entity_name: str,
                 file_number: str
                 ):
        self.file_number: str = file_number
        self.entity_name: str = entity_name
        self.cik: str = cik

    def __str__(self):
        """Entity name followed by the CIK in parentheses."""
        return f"{self.entity_name} ({self.cik})"

    # repr shows the same text as str
    __repr__ = __str__
|
||||
|
||||
|
||||
class Contact:
    """A contact with a name, phone number and email address."""

    def __init__(self,
                 name: str,
                 phone_number: str,
                 email: str):
        self.email: str = email
        self.phone_number: str = phone_number
        self.name: str = name

    def __str__(self):
        """Name, then the phone number in parentheses, then the email."""
        return f"{self.name} ({self.phone_number}) {self.email}"

    # repr shows the same text as str
    __repr__ = __str__
|
||||
263
venv/lib/python3.10/site-packages/edgar/ai/__init__.py
Normal file
263
venv/lib/python3.10/site-packages/edgar/ai/__init__.py
Normal file
@@ -0,0 +1,263 @@
|
||||
"""
|
||||
EdgarTools AI: AI and LLM integration for SEC financial data analysis.
|
||||
|
||||
This package provides AI capabilities for EdgarTools including:
|
||||
- AI Skills: Portable documentation packages for Claude Desktop and other AI tools
|
||||
- AI-optimized text methods (.text()) with research-backed formats (Markdown-KV, TSV)
|
||||
- LLM context generation with token optimization
|
||||
- Model Context Protocol (MCP) server for Claude Desktop integration
|
||||
- Semantic enrichment of financial data
|
||||
- Token counting and optimization
|
||||
|
||||
Installation:
|
||||
pip install edgartools[ai]
|
||||
|
||||
Dependencies included:
|
||||
- mcp: Model Context Protocol server support
|
||||
- tiktoken: Token counting and optimization
|
||||
|
||||
Skills API:
|
||||
>>> from edgar.ai import install_skill, package_skill
|
||||
>>>
|
||||
>>> # Install skill to ~/.claude/skills/
|
||||
>>> install_skill()
|
||||
PosixPath('/Users/username/.claude/skills/edgartools')
|
||||
>>>
|
||||
>>> # Create ZIP for Claude Desktop upload
|
||||
>>> package_skill()
|
||||
PosixPath('edgartools.zip')
|
||||
|
||||
>>> # List available skills
|
||||
>>> from edgar.ai import list_skills
|
||||
>>> skills = list_skills()
|
||||
|
||||
AI-Optimized Objects:
|
||||
>>> from edgar import Company
|
||||
>>> company = Company("AAPL")
|
||||
>>>
|
||||
>>> # Get AI-optimized text representation (Markdown-KV format)
|
||||
>>> text = company.text(max_tokens=2000)
|
||||
>>> print(text)
|
||||
**Company:** Apple Inc.
|
||||
**CIK:** 0000320193
|
||||
**Ticker:** AAPL
|
||||
|
||||
Context Generation:
|
||||
>>> from edgar.ai import enhance_financial_fact_llm_context
|
||||
>>> context = enhance_financial_fact_llm_context(fact, detail_level='detailed')
|
||||
"""
|
||||
|
||||
# Check for AI dependencies
|
||||
# Names of the optional packages that could not be imported (filled in below)
MISSING_DEPS = []

# Each optional package is only imported to find out whether it is installed
try:
    import mcp
    MCP_AVAILABLE = True
except ImportError:
    MCP_AVAILABLE = False
    MISSING_DEPS.append("mcp")

try:
    import tiktoken
    TIKTOKEN_AVAILABLE = True
except ImportError:
    TIKTOKEN_AVAILABLE = False
    MISSING_DEPS.append("tiktoken")

# AI is available if we have at least some key dependencies
AI_AVAILABLE = MCP_AVAILABLE or TIKTOKEN_AVAILABLE
|
||||
|
||||
# Core functionality (always available)
|
||||
from edgar.ai.core import AIEnabled, SemanticEnricher, TokenOptimizer, check_ai_capabilities, enhance_financial_fact_llm_context
|
||||
|
||||
# Skills infrastructure (always available)
|
||||
from edgar.ai.skills.base import BaseSkill
|
||||
from edgar.ai.skills import list_skills, get_skill
|
||||
from edgar.ai.skills.core import edgartools_skill
|
||||
from edgar.ai.exporters import export_skill
|
||||
|
||||
# Convenience functions for common workflows
|
||||
def _print_banner(action_line):
    """Print the framed ASCII-art banner, then ``action_line`` and a blank line."""
    print("\n" + "="*60)
    print("""
___ _ _____ _
| __|__| |__ _ __ _ _ _ |_ _|__ ___ | |___
| _|/ _` / _` / _` | '_| | |/ _ \\/ _ \\| (_-<
|___\\__,_\\__, \\__,_|_| |_|\\___/\\___/|_/__/
|___/
""")
    print("="*60)
    print(action_line)
    print()


def install_skill(skill=None, to=None, quiet=False):
    """
    Install a skill to ~/.claude/skills/ for automatic discovery.

    Simple, delightful API for installing skills to Claude.

    Args:
        skill: Skill to install (defaults to edgartools_skill)
        to: Custom installation directory (defaults to ~/.claude/skills/)
        quiet: If True, suppress output messages (default: False)

    Returns:
        Path: Path to installed skill directory

    Examples:
        >>> from edgar.ai import install_skill
        >>>
        >>> # Install EdgarTools skill (default); banner output omitted here
        >>> install_skill()
        ✨ Installing EdgarTools skill...
        📁 Installed to: /Users/username/.claude/skills/edgartools
        ✅ Ready to use in Claude Desktop and Claude Code!
        >>>
        >>> # Install to custom location
        >>> install_skill(to="~/my-skills")
        PosixPath('/Users/username/my-skills/edgartools')
    """
    if skill is None:
        skill = edgartools_skill

    # Show delightful message
    if not quiet:
        _print_banner(f"✨ Installing {skill.name} skill...")

    result = export_skill(
        skill,
        format="claude-skills",
        output_dir=to,
        install=(to is None)  # Only use install flag if no custom dir
    )

    if not quiet:
        print(f"📁 Installed to: {result}")
        print("✅ Ready to use in Claude Desktop and Claude Code!")
        print("="*60 + "\n")

    return result


def package_skill(skill=None, output=None, quiet=False):
    """
    Create a ZIP package for Claude Desktop upload.

    Simple, delightful API for packaging skills as ZIP files.

    Args:
        skill: Skill to package (defaults to edgartools_skill)
        output: Output directory (defaults to current directory)
        quiet: If True, suppress output messages (default: False)

    Returns:
        Path: Path to created ZIP file

    Examples:
        >>> from edgar.ai import package_skill
        >>>
        >>> # Create ZIP in current directory (default); banner output omitted here
        >>> package_skill()
        📦 Packaging EdgarTools skill as ZIP...
        ✅ Created: edgartools.zip
        📍 Location: .
        💡 Ready to upload via Claude Desktop's skill upload interface!
        >>>
        >>> # Create ZIP in custom location
        >>> package_skill(output="~/Desktop")
        PosixPath('/Users/username/Desktop/edgartools.zip')
    """
    if skill is None:
        skill = edgartools_skill

    # Show delightful message
    if not quiet:
        _print_banner(f"📦 Packaging {skill.name} skill as ZIP...")

    result = export_skill(
        skill,
        format="claude-desktop",
        output_dir=output,
        create_zip=True
    )

    if not quiet:
        print(f"✅ Created: {result.name}")
        print(f"📍 Location: {result.parent}")
        print("💡 Ready to upload via Claude Desktop's skill upload interface!")
        print("="*60 + "\n")

    return result
|
||||
|
||||
# Optional MCP functionality
|
||||
# Note: The class-based MCPServer and EdgarToolsServer are deprecated.
|
||||
# Use the function-based API instead: from edgar.ai.mcp import main, test_server
|
||||
if MCP_AVAILABLE:
    # Provide stub classes for backward compatibility
    # Both names can still be imported, but creating an instance fails immediately.
    class MCPServer:
        def __init__(self, *args, **kwargs):
            # The warning class is raised as an exception here, not issued through warnings.warn
            raise DeprecationWarning(
                "MCPServer class is deprecated. "
                "Use function-based API: from edgar.ai.mcp import main, test_server"
            )

    class EdgarToolsServer:
        def __init__(self, *args, **kwargs):
            # Raised as an exception, same as MCPServer above
            raise DeprecationWarning(
                "EdgarToolsServer class is deprecated. "
                "Use function-based API: from edgar.ai.mcp import main, test_server"
            )
else:
    # Without the mcp package both names are one plain function that raises ImportError when called
    def MCPServer(*args, **kwargs):
        raise ImportError(
            "MCP support requires additional dependencies. "
            "Install with: pip install edgartools[ai]"
        )
    EdgarToolsServer = MCPServer
|
||||
|
||||
# Public API
|
||||
__all__ = [
|
||||
# Core
|
||||
"AIEnabled",
|
||||
"TokenOptimizer",
|
||||
"SemanticEnricher",
|
||||
"enhance_financial_fact_llm_context",
|
||||
"check_ai_capabilities",
|
||||
|
||||
# Skills
|
||||
"BaseSkill",
|
||||
"list_skills",
|
||||
"get_skill",
|
||||
"edgartools_skill",
|
||||
"export_skill",
|
||||
|
||||
# Convenience functions (delightful API)
|
||||
"install_skill",
|
||||
"package_skill",
|
||||
|
||||
# MCP
|
||||
"MCPServer",
|
||||
"EdgarToolsServer",
|
||||
|
||||
# Status flags
|
||||
"AI_AVAILABLE",
|
||||
"MCP_AVAILABLE",
|
||||
"TIKTOKEN_AVAILABLE",
|
||||
"MISSING_DEPS"
|
||||
]
|
||||
|
||||
def get_ai_info():
    """Return the module's AI status flags as a dict.

    The dict holds the three availability flags, the list of missing
    dependencies and, only when something is missing, the pip command that
    installs the extras (otherwise None).
    """
    install_hint = None
    if MISSING_DEPS:
        install_hint = "pip install edgartools[ai]"
    info = {}
    info["ai_available"] = AI_AVAILABLE
    info["mcp_available"] = MCP_AVAILABLE
    info["tiktoken_available"] = TIKTOKEN_AVAILABLE
    info["missing_dependencies"] = MISSING_DEPS
    info["install_command"] = install_hint
    return info
|
||||
16
venv/lib/python3.10/site-packages/edgar/ai/__main__.py
Normal file
16
venv/lib/python3.10/site-packages/edgar/ai/__main__.py
Normal file
@@ -0,0 +1,16 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
EdgarTools MCP Server Entry Point
|
||||
|
||||
Enables running the server via: python -m edgar.ai
|
||||
"""
|
||||
|
||||
if __name__ == "__main__":
    import sys
    from edgar.ai.mcp import main, test_server

    # "--test" / "-t" runs the self-test instead of starting the server
    run_self_test = any(flag in sys.argv for flag in ("--test", "-t"))
    if not run_self_test:
        main()
    else:
        # Exit code 0 when the self-test reports success, 1 otherwise
        sys.exit(0 if test_server() else 1)
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
391
venv/lib/python3.10/site-packages/edgar/ai/core.py
Normal file
391
venv/lib/python3.10/site-packages/edgar/ai/core.py
Normal file
@@ -0,0 +1,391 @@
|
||||
"""
|
||||
AI enhancements for EdgarTools entity models.
|
||||
|
||||
This module provides enhanced AI capabilities building on the existing
|
||||
to_llm_context() implementation, adding token optimization, semantic
|
||||
enrichment, and MCP compatibility.
|
||||
"""
|
||||
|
||||
import json
|
||||
from abc import ABC, abstractmethod
|
||||
from datetime import date
|
||||
from typing import Any, Dict, List, Optional, Union
|
||||
|
||||
|
||||
class TokenOptimizer:
    """Utilities for optimizing content for LLM token limits."""

    @staticmethod
    def estimate_tokens(content: Union[str, dict]) -> int:
        """
        Estimate token count for content.

        A dict is serialised with json.dumps first. The estimate is the text
        length divided by 4 (integer division): ~4 characters per token.
        """
        text = json.dumps(content) if isinstance(content, dict) else content
        return len(text) // 4

    @staticmethod
    def optimize_for_tokens(content: Dict[str, Any], max_tokens: int) -> Dict[str, Any]:
        """
        Reduce content to fit within a token limit.

        Content already within the limit is returned unchanged (the same object).
        Otherwise a new dict is built from the priority keys, in priority order,
        stopping at the first key that would push the estimate over the limit.
        If any key of the input was left out, '_truncated': True is added; that
        marker is not counted against the limit.
        """
        if TokenOptimizer.estimate_tokens(content) <= max_tokens:
            return content

        # Keys to retain, most important first
        priority_keys = (
            'concept', 'value', 'period', 'context',
            'quality', 'confidence', 'source',
        )

        kept: Dict[str, Any] = {}
        for key in priority_keys:
            if key not in content:
                continue
            kept[key] = content[key]
            if TokenOptimizer.estimate_tokens(kept) > max_tokens:
                # This key does not fit: take it out again and stop
                del kept[key]
                break

        # Add truncation indicator
        if len(kept) < len(content):
            kept['_truncated'] = True

        return kept
|
||||
|
||||
|
||||
class SemanticEnricher:
    """Add semantic context and interpretations to financial data."""

    # Concept definitions for common financial terms
    CONCEPT_DEFINITIONS = {
        "Revenue": "Total income generated from normal business operations",
        "Revenues": "Total income generated from normal business operations",
        "NetIncome": "Company's total earnings after all expenses and taxes",
        "NetIncomeLoss": "Company's total earnings or losses after all expenses",
        "Assets": "Resources owned by the company with economic value",
        "Liabilities": "Company's financial debts or obligations",
        "StockholdersEquity": "Residual interest in assets after deducting liabilities",
        "CashAndCashEquivalents": "Highly liquid assets readily convertible to cash",
        "OperatingIncome": "Profit from core business operations before interest and taxes",
        "EarningsPerShare": "Company's profit divided by outstanding shares",
        "CurrentAssets": "Assets expected to be converted to cash within one year",
        "CurrentLiabilities": "Obligations due within one year",
    }

    # Relationships between concepts
    CONCEPT_RELATIONSHIPS = {
        "Revenue": ["GrossProfit", "OperatingIncome", "NetIncome"],
        "Assets": ["CurrentAssets", "NonCurrentAssets", "CashAndCashEquivalents"],
        "Liabilities": ["CurrentLiabilities", "LongTermDebt"],
        "NetIncome": ["Revenue", "OperatingExpenses", "TaxExpense"],
        "StockholdersEquity": ["Assets", "Liabilities", "RetainedEarnings"],
    }

    @classmethod
    def get_concept_definition(cls, concept: str) -> Optional[str]:
        """Get human-readable definition for a concept.

        Any ``prefix:`` namespace is stripped before the lookup. Returns
        ``None`` when the concept has no entry.
        """
        concept_key = concept.split(':')[-1]
        return cls.CONCEPT_DEFINITIONS.get(concept_key)

    @classmethod
    def get_related_concepts(cls, concept: str) -> List[str]:
        """Get semantically related concepts.

        Any ``prefix:`` namespace is stripped before the lookup. Returns an
        empty list when the concept has no entry.
        """
        concept_key = concept.split(':')[-1]
        return cls.CONCEPT_RELATIONSHIPS.get(concept_key, [])

    @classmethod
    def interpret_value(cls, concept: str, value: Union[int, float],
                        unit: str, period_type: Optional[str] = None) -> str:
        """
        Generate business interpretation of a financial value.

        Args:
            concept: The financial concept (e.g., "Revenue")
            value: The numeric value
            unit: The unit of measurement (e.g., "USD"); not used by the
                current rules
            period_type: 'instant' or 'duration'; not used by the current
                rules

        Returns:
            Human-readable interpretation, or an empty string when the
            concept has no rule or the value cannot be compared with a
            number (e.g. ``None`` or text).
        """
        concept_key = concept.split(':')[-1]

        try:
            # Revenue interpretations
            if concept_key in ("Revenue", "Revenues"):
                if value > 1_000_000_000:
                    scale = "billion-dollar"
                elif value > 100_000_000:
                    scale = "multi-million dollar"
                else:
                    scale = "smaller-scale"
                return f"The company is a {scale} business based on revenue"

            # Profitability interpretations
            if concept_key in ("NetIncome", "NetIncomeLoss"):
                if value > 0:
                    return "The company is profitable"
                if value == 0:
                    return "The company broke even"
                return "The company reported a net loss"

            # Asset interpretations
            if concept_key == "CashAndCashEquivalents":
                if value > 10_000_000_000:
                    return "Very strong cash position providing significant financial flexibility"
                if value > 1_000_000_000:
                    return "Healthy cash reserves for operations and investments"
                if value > 100_000_000:
                    return "Adequate cash position for normal operations"
                return "Limited cash reserves may constrain growth opportunities"
        except TypeError:
            # Non-numeric value: there is nothing meaningful to interpret.
            return ""

        return ""
|
||||
|
||||
|
||||
class AIEnabled(ABC):
    """
    Abstract mixin describing the AI-facing methods of EdgarTools classes.

    Subclasses must implement ``to_llm_context`` and
    ``get_semantic_description``; ``to_agent_tool`` is provided here.
    """

    @abstractmethod
    def to_llm_context(self, detail_level: str = 'standard',
                       max_tokens: Optional[int] = None) -> Dict[str, Any]:
        """
        Render the object as a context dictionary for an LLM.

        Args:
            detail_level: One of 'minimal', 'standard', 'detailed'
            max_tokens: Optional token budget for the result

        Returns:
            Dictionary intended for LLM consumption
        """

    def to_agent_tool(self) -> Dict[str, Any]:
        """
        Build an MCP-style tool response for this object.

        Returns:
            Dictionary with ``data`` (``self.to_dict()`` when that method
            exists, otherwise ``{}``), ``context`` (``self.to_llm_context()``)
            and ``metadata`` (source, class name, today's ISO date).
        """
        if hasattr(self, 'to_dict'):
            payload = self.to_dict()
        else:
            payload = {}

        llm_context = self.to_llm_context()

        metadata = {
            "source": "SEC EDGAR",
            "object_type": type(self).__name__,
            "timestamp": date.today().isoformat(),
        }
        return {"data": payload, "context": llm_context, "metadata": metadata}

    @abstractmethod
    def get_semantic_description(self) -> str:
        """
        Describe the object in natural language.

        Returns:
            Human-readable description with key insights
        """
|
||||
|
||||
|
||||
def enhance_financial_fact_llm_context(fact, detail_level='standard', max_tokens=None):
    """
    Enhanced version of FinancialFact.to_llm_context() with new features.

    Starts from ``fact.to_llm_context()`` and layers extra keys on top, so
    the base output is preserved.

    Args:
        fact: FinancialFact-like object. Reads ``concept``, ``numeric_value``,
            ``value``, ``unit`` and ``period_type`` for 'standard' and above;
            additionally ``taxonomy``, ``scale``, ``statement_type`` and the
            optional ``decimals`` / ``calculation_context`` for 'detailed'.
        detail_level: 'minimal', 'standard', or 'detailed'
        max_tokens: Optional token limit; a falsy value (None or 0) means
            no trimming is applied.

    Returns:
        Enhanced LLM context dictionary
    """
    # Start with the existing implementation
    context = fact.to_llm_context()

    # Add semantic enrichment based on detail level
    if detail_level in ('standard', 'detailed'):
        definition = SemanticEnricher.get_concept_definition(fact.concept)
        if definition:
            context['definition'] = definition

        # Prefer the numeric value whenever it is present. An explicit
        # ``is not None`` test is required: a plain ``or`` would discard a
        # legitimate 0 and fall back to the raw (possibly textual) value.
        numeric_value = fact.numeric_value
        value_to_interpret = numeric_value if numeric_value is not None else fact.value

        interpretation = SemanticEnricher.interpret_value(
            fact.concept,
            value_to_interpret,
            fact.unit,
            fact.period_type
        )
        if interpretation:
            context['interpretation'] = interpretation

    if detail_level == 'detailed':
        related = SemanticEnricher.get_related_concepts(fact.concept)
        if related:
            context['related_concepts'] = related

        context['metadata'] = {
            'taxonomy': fact.taxonomy,
            'scale': fact.scale,
            'decimals': getattr(fact, 'decimals', None),
            'statement_type': fact.statement_type
        }

        # Add calculation context if available
        calculation_context = getattr(fact, 'calculation_context', None)
        if calculation_context:
            context['calculation_context'] = calculation_context

    # Optimize for token limit if specified
    if max_tokens:
        context = TokenOptimizer.optimize_for_tokens(context, max_tokens)

    return context
|
||||
|
||||
|
||||
class FinancialFactAIWrapper:
    """
    Adapter that gives an existing FinancialFact instance AI-facing methods.

    The wrapped fact is stored on ``self.fact`` and is never modified; every
    method reads from it.
    """

    def __init__(self, fact):
        self.fact = fact

    def to_llm_context(self, detail_level='standard', max_tokens=None):
        """Return the enriched LLM context for the wrapped fact."""
        return enhance_financial_fact_llm_context(self.fact, detail_level, max_tokens)

    def to_agent_tool(self):
        """Build an MCP-style tool response (data, context, metadata)."""
        fact = self.fact

        data = {
            "concept": fact.concept,
            "value": fact.value,
            "numeric_value": fact.numeric_value,
            "unit": fact.unit,
            # Dates are rendered as ISO strings; a falsy date becomes None.
            "period_end": fact.period_end.isoformat() if fact.period_end else None,
            "fiscal_period": fact.fiscal_period,
            "fiscal_year": fact.fiscal_year
        }
        llm_context = self.to_llm_context()
        metadata = {
            "source": f"SEC {fact.form_type}",
            "filed": fact.filing_date.isoformat() if fact.filing_date else None,
            "quality": fact.data_quality.value,
            "confidence": fact.confidence_score
        }

        return {"data": data, "context": llm_context, "metadata": metadata}

    def get_semantic_description(self):
        """Return a one-line sentence built from the fact's base LLM context."""
        ctx = self.fact.to_llm_context()

        concept, value, unit = ctx['concept'], ctx['value'], ctx['unit']
        period, source = ctx['period'], ctx['source']
        return f"{concept} of {value} {unit} {period} from {source}"
|
||||
|
||||
|
||||
def check_ai_capabilities():
    """
    Report which AI features can be used with the installed packages.

    ``basic`` and ``semantic_enrichment`` are always True. ``mcp`` and
    ``token_optimization`` become True only when the ``mcp`` and
    ``tiktoken`` modules, respectively, can be imported.

    Returns:
        Dictionary with capability flags
    """
    capabilities = {
        'basic': True,
        'mcp': False,
        'token_optimization': False,
        'semantic_enrichment': True,
    }

    # (capability flag, module whose import enables it)
    optional_modules = (
        ('mcp', 'mcp'),
        ('token_optimization', 'tiktoken'),
    )
    for capability, module_name in optional_modules:
        try:
            __import__(module_name)
        except ImportError:
            continue
        capabilities[capability] = True

    return capabilities
|
||||
|
||||
|
||||
# Example usage demonstrating the enhanced capabilities
|
||||
if __name__ == "__main__":
    # Stand-ins for the real types, which would be imported from
    # edgar.entity.models
    from dataclasses import dataclass
    from enum import Enum

    class DataQuality(Enum):
        HIGH = "high"

    @dataclass
    class MockFinancialFact:
        """Mock class for demonstration"""
        concept: str = "us-gaap:Revenue"
        taxonomy: str = "us-gaap"
        value: float = 125_000_000_000
        numeric_value: float = 125_000_000_000
        unit: str = "USD"
        scale: int = 1
        period_end: date = date(2024, 3, 31)
        period_type: str = "duration"
        fiscal_period: str = "Q1"
        fiscal_year: int = 2024
        form_type: str = "10-Q"
        filing_date: date = date(2024, 4, 30)
        data_quality: DataQuality = DataQuality.HIGH
        confidence_score: float = 0.95
        statement_type: str = "IncomeStatement"

        def to_llm_context(self):
            # Fixed output imitating the existing implementation
            simulated = {}
            simulated["concept"] = "Revenue"
            simulated["value"] = "125,000 million"
            simulated["unit"] = "USD"
            simulated["period"] = "for Q1 2024"
            simulated["context"] = ""
            simulated["quality"] = "high"
            simulated["confidence"] = 0.95
            simulated["source"] = "10-Q filed 2024-04-30"
            return simulated

    # Build a mock fact and wrap it with the AI enhancements
    fact = MockFinancialFact()
    ai_fact = FinancialFactAIWrapper(fact)

    # Test different detail levels
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -0,0 +1,5 @@
|
||||
"""
|
||||
EdgarTools AI examples.
|
||||
|
||||
This package contains example scripts demonstrating AI capabilities.
|
||||
"""
|
||||
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,187 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Basic usage examples for EdgarTools AI features.
|
||||
|
||||
This script demonstrates how to use the AI capabilities including
|
||||
LLM context generation and MCP server functionality.
|
||||
"""
|
||||
|
||||
import json
|
||||
from datetime import date
|
||||
|
||||
# Check if AI features are available
|
||||
try:
    from edgar.ai import (
        AI_AVAILABLE,
        MCP_AVAILABLE,
        get_ai_info,
        enhance_financial_fact_llm_context,
        check_ai_capabilities
    )
except ImportError:
    print("EdgarTools AI features not available.")
    # Same extra name that get_ai_info() reports as its install command.
    print("Install with: pip install edgartools[ai]")
    # Raise SystemExit directly: the bare exit() helper is injected by the
    # site module and is not guaranteed to exist (e.g. under `python -S`).
    raise SystemExit(1)
|
||||
|
||||
|
||||
def demonstrate_ai_capabilities():
    """Print the AI availability flags and the per-capability checklist."""
    print("=== AI Capabilities ===")
    info = get_ai_info()
    print(f"AI Available: {info['ai_available']}")
    print(f"MCP Available: {info['mcp_available']}")
    print(f"Token Optimization: {info['tiktoken_available']}")

    # Only mention installation when something is reported missing.
    missing = info['missing_dependencies']
    if missing:
        print(f"\nMissing dependencies: {', '.join(missing)}")
        print(f"Install with: {info['install_command']}")

    print("\nDetailed capabilities:")
    for capability, available in check_ai_capabilities().items():
        if available:
            status = "✓"
        else:
            status = "✗"
        print(f"  {status} {capability}")
|
||||
|
||||
|
||||
def demonstrate_financial_fact_enhancement():
    """Print one mock fact rendered at several detail levels and token budgets."""
    print("\n=== Financial Fact Enhancement ===")

    # A stand-in financial fact (in real usage, this would come from EdgarTools)
    from dataclasses import dataclass
    from enum import Enum

    class DataQuality(Enum):
        HIGH = "high"

    @dataclass
    class MockFinancialFact:
        concept: str = "us-gaap:Revenue"
        taxonomy: str = "us-gaap"
        label: str = "Revenue"
        value: float = 125_000_000_000
        numeric_value: float = 125_000_000_000
        unit: str = "USD"
        scale: int = 1
        period_end: date = date(2024, 3, 31)
        period_type: str = "duration"
        fiscal_period: str = "Q1"
        fiscal_year: int = 2024
        filing_date: date = date(2024, 4, 30)
        form_type: str = "10-Q"
        data_quality: DataQuality = DataQuality.HIGH
        confidence_score: float = 0.95
        statement_type: str = "IncomeStatement"

        def to_llm_context(self):
            """Basic LLM context (existing in EdgarTools)."""
            basic = {}
            basic["concept"] = self.label
            basic["value"] = f"{self.value:,.0f}"
            basic["unit"] = self.unit
            basic["period"] = f"for {self.fiscal_period} {self.fiscal_year}"
            basic["quality"] = self.data_quality.value
            basic["confidence"] = self.confidence_score
            basic["source"] = f"{self.form_type} filed {self.filing_date}"
            return basic

    fact = MockFinancialFact()

    # (heading, keyword arguments) for each rendering of the same fact
    scenarios = (
        ("\nMinimal context:", {'detail_level': 'minimal'}),
        ("\nStandard context (with semantic enrichment):", {'detail_level': 'standard'}),
        ("\nToken-limited context (100 tokens):", {'detail_level': 'detailed', 'max_tokens': 100}),
    )
    for heading, options in scenarios:
        print(heading)
        rendered = enhance_financial_fact_llm_context(fact, **options)
        print(json.dumps(rendered, indent=2))
|
||||
|
||||
|
||||
def demonstrate_mcp_server():
    """Create the simple MCP server and print how to run and configure it.

    Returns early with an install hint when ``MCP_AVAILABLE`` is falsy. An
    ``ImportError`` raised while importing or creating the server is
    reported on stdout instead of propagating.
    """
    print("\n=== MCP Server Setup ===")

    if not MCP_AVAILABLE:
        # Same extra name that get_ai_info() reports as its install command.
        print("MCP not available. Install with: pip install edgartools[ai]")
        return

    try:
        from edgar.ai.mcp import get_simple_server

        server = get_simple_server()
        print("MCP Server created successfully!")
        print(f"Server name: {server.name}")

        print("\nTo run the server:")
        print("  python edgar/ai/run_mcp_server.py")

        print("\nOr use in Claude Desktop config:")
        print("""  {
    "tools": [
      {
        "type": "mcp",
        "name": "edgartools",
        "config": {
          "command": "python",
          "args": ["edgar/ai/run_mcp_server.py"]
        }
      }
    ]
  }""")

    except ImportError as e:
        print(f"Error creating MCP server: {e}")
|
||||
|
||||
|
||||
def demonstrate_usage_with_company():
    """Look up a real company and print its LLM context when it offers one.

    Any exception raised along the way is reported on stdout rather than
    propagated.
    """
    print("\n=== Usage with EdgarTools Company ===")

    try:
        from edgar import Company

        # Get a company
        company = Company("AAPL")
        print(f"Company: {company.name} ({company.get_ticker()})")

        # Company may not provide to_llm_context (future enhancement)
        if not hasattr(company, 'to_llm_context'):
            print("\nNote: Company.to_llm_context() will be available in future versions")
            print("For now, use the AI wrapper functions to enhance EdgarTools objects")
        else:
            context = company.to_llm_context()
            print("\nLLM Context:")
            print(json.dumps(context, indent=2))

    except Exception as e:
        print(f"Error demonstrating company usage: {e}")
        print("This example requires a working internet connection and valid SEC API access")
|
||||
|
||||
|
||||
def main():
    """Run every demonstration in sequence between a banner and a footer."""
    banner_rule = "=" * 50
    print("EdgarTools AI Features Demonstration")
    print(banner_rule)

    # Capabilities, fact enhancement, MCP setup, then real EdgarTools objects
    demonstrations = (
        demonstrate_ai_capabilities,
        demonstrate_financial_fact_enhancement,
        demonstrate_mcp_server,
        demonstrate_usage_with_company,
    )
    for run_demo in demonstrations:
        run_demo()

    print("\n" + banner_rule)
    print("For more examples, see the documentation in edgar/ai/docs/")


if __name__ == "__main__":
    main()
|
||||
@@ -0,0 +1,53 @@
|
||||
"""
|
||||
EdgarTools AI skill exporters.
|
||||
|
||||
Provides functions to export skills in various formats for AI tool integration.
|
||||
"""
|
||||
|
||||
from edgar.ai.exporters.claude_desktop import export_claude_desktop
|
||||
from edgar.ai.exporters.claude_skills import export_claude_skills
|
||||
|
||||
__all__ = ['export_claude_desktop', 'export_claude_skills', 'export_skill']
|
||||
|
||||
|
||||
def export_skill(skill, format: str = "claude-skills", output_dir=None, **kwargs):
    """
    Export a skill in the specified format.

    Args:
        skill: BaseSkill instance to export
        format: Export format:
            - "claude-skills": Official Claude Skills format (default, ~/.claude/skills/)
            - "claude-desktop": Portable format (current directory)
        output_dir: Optional output directory (format-specific defaults)
        **kwargs: Additional format-specific parameters, forwarded unchanged:
            - claude-skills: install (bool, default True)
            - claude-desktop: create_zip (bool, default True)

    Returns:
        Path: Path to exported skill directory or archive

    Raises:
        ValueError: If ``format`` is not one of the supported formats

    Examples:
        >>> from edgar.ai.skills import edgartools_skill

        >>> # Export to ~/.claude/skills/ (default)
        >>> export_skill(edgartools_skill, format="claude-skills")
        PosixPath('/Users/username/.claude/skills/edgartools')

        >>> # Export a zip archive into the current directory
        >>> export_skill(edgartools_skill, format="claude-desktop")
        PosixPath('/current/dir/edgartools.zip')

        >>> # Export as a plain directory instead of an archive
        >>> export_skill(edgartools_skill, format="claude-desktop", create_zip=False)
        PosixPath('/current/dir/edgartools')
    """
    if format == "claude-skills":
        exporter = export_claude_skills
    elif format == "claude-desktop":
        exporter = export_claude_desktop
    else:
        raise ValueError(
            f"Unknown export format: {format}. "
            "Supported formats: 'claude-skills', 'claude-desktop'"
        )

    return exporter(skill, output_dir=output_dir, **kwargs)
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,173 @@
|
||||
"""
|
||||
Claude Desktop skill exporter.
|
||||
|
||||
Exports EdgarTools skills for Claude Desktop upload:
|
||||
- Creates ZIP file with SKILL.md at root (required by Claude Desktop)
|
||||
- Validates YAML frontmatter structure
|
||||
- Includes all supporting markdown files and API reference
|
||||
"""
|
||||
|
||||
import shutil
|
||||
import zipfile
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
import re
|
||||
|
||||
|
||||
def export_claude_desktop(skill, output_dir: Optional[Path] = None, create_zip: bool = True) -> Path:
    """
    Export a skill for Claude Desktop upload.

    Copies every ``*.md`` file from ``skill.content_dir`` plus the skill's
    object documentation (into ``api-reference/``) to
    ``<output_dir>/<skill-name>/``, then optionally zips that directory.
    Any existing ``<output_dir>/<skill-name>/`` directory is replaced.

    Args:
        skill: BaseSkill instance to export
        output_dir: Optional output directory (defaults to current directory)
        create_zip: If True (default), create a zip archive and remove the
            staging directory; if False, keep the directory

    Returns:
        Path: Path to exported ZIP file (or directory if create_zip=False)

    Raises:
        TypeError: If ``skill`` is not a BaseSkill instance
        ValueError: If the skill has no markdown files, or a copied file
            fails frontmatter validation

    Examples:
        >>> from edgar.ai.skills import edgartools_skill

        >>> # Create ZIP for Claude Desktop upload (default)
        >>> export_claude_desktop(edgartools_skill)
        PosixPath('/current/dir/edgartools.zip')

        >>> # Create directory for manual installation
        >>> export_claude_desktop(edgartools_skill, create_zip=False)
        PosixPath('/current/dir/edgartools')
    """
    from edgar.ai.skills.base import BaseSkill

    if not isinstance(skill, BaseSkill):
        raise TypeError(f"Expected BaseSkill instance, got {type(skill)}")

    output_dir = Path.cwd() if output_dir is None else Path(output_dir)

    # Check the source content BEFORE touching the destination, so that a
    # skill with nothing to export cannot wipe a previous export.
    content_dir = skill.content_dir
    markdown_files = list(content_dir.glob("*.md"))
    if not markdown_files:
        raise ValueError(f"No markdown files found in {content_dir}")

    # Create skill-specific directory name (kebab-case from skill name)
    skill_dir_name = skill.name.lower().replace(' ', '-')
    skill_output_dir = output_dir / skill_dir_name

    # Start from a clean directory
    if skill_output_dir.exists():
        shutil.rmtree(skill_output_dir)
    skill_output_dir.mkdir(parents=True, exist_ok=True)

    # Copy and validate each markdown file
    for md_file in markdown_files:
        _copy_and_validate_markdown(md_file, skill_output_dir)

    # Copy centralized object documentation (API reference)
    object_docs = skill.get_object_docs()
    if object_docs:
        api_ref_dir = skill_output_dir / "api-reference"
        api_ref_dir.mkdir(exist_ok=True)

        for doc_path in object_docs:
            # Missing docs are skipped silently (allows for optional docs)
            if doc_path.exists():
                shutil.copy2(doc_path, api_ref_dir / doc_path.name)

    if not create_zip:
        return skill_output_dir

    zip_path = output_dir / f"{skill_dir_name}.zip"
    _create_zip_archive(skill_output_dir, zip_path)
    # The directory was only a staging area for the archive
    shutil.rmtree(skill_output_dir)
    return zip_path
|
||||
|
||||
|
||||
def _copy_and_validate_markdown(source: Path, destination_dir: Path) -> None:
    """
    Validate a markdown file's YAML frontmatter, then copy it.

    ``SKILL.md`` must start with a ``---`` delimited frontmatter block whose
    required fields are present. Other files may omit frontmatter, but if
    they start with ``---`` the block must be closed.

    Args:
        source: Source markdown file path
        destination_dir: Destination directory

    Raises:
        ValueError: If YAML frontmatter is invalid or missing in SKILL.md
    """
    text = source.read_text(encoding='utf-8')
    is_skill_file = source.name == 'SKILL.md'
    has_frontmatter = text.startswith('---')

    # Frontmatter is mandatory only for SKILL.md
    if is_skill_file and not has_frontmatter:
        raise ValueError(f"Missing YAML frontmatter in {source.name}")

    if has_frontmatter:
        # Expect: '' | frontmatter | body
        pieces = text.split('---', 2)
        if len(pieces) < 3:
            raise ValueError(f"Invalid YAML frontmatter structure in {source.name}")
        if is_skill_file:
            _validate_skill_frontmatter(pieces[1].strip(), source.name)

    shutil.copy2(source, destination_dir / source.name)
|
||||
|
||||
|
||||
def _validate_skill_frontmatter(frontmatter: str, filename: str) -> None:
    """
    Check that the frontmatter declares every required field.

    Only ``name`` and ``description`` are required. A field counts as
    present when some line starts with ``<field>:``.

    Args:
        frontmatter: YAML frontmatter content
        filename: Source filename (for error messages)

    Raises:
        ValueError: If required fields are missing
    """
    # Line-anchored regex instead of full YAML parsing (avoids a dependency)
    first_missing = next(
        (
            field
            for field in ('name', 'description')
            if re.search(rf'^{field}:', frontmatter, re.MULTILINE) is None
        ),
        None,
    )
    if first_missing is not None:
        raise ValueError(f"Missing required field '{first_missing}' in {filename} frontmatter")
|
||||
|
||||
|
||||
def _create_zip_archive(source_dir: Path, zip_path: Path) -> None:
    """
    Write every file under ``source_dir`` into a deflate-compressed zip.

    Entry names are relative to ``source_dir.parent``, so each entry is
    stored beneath a top-level folder named after ``source_dir``.

    Args:
        source_dir: Source directory to zip
        zip_path: Output zip file path
    """
    archive_root = source_dir.parent
    with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as archive:
        for entry in source_dir.rglob('*'):
            # Directories are implied by the entry names; store files only
            if not entry.is_file():
                continue
            archive.write(entry, entry.relative_to(archive_root))
|
||||
@@ -0,0 +1,163 @@
|
||||
"""
|
||||
Claude Skills exporter.
|
||||
|
||||
Exports EdgarTools skills in official Anthropic Claude Skills format:
|
||||
- Installs to ~/.claude/skills/ by default
|
||||
- Main file: SKILL.md (uppercase, per Anthropic spec)
|
||||
- Keeps all supporting markdown files
|
||||
- Validates YAML frontmatter structure
|
||||
"""
|
||||
|
||||
import shutil
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
import re
|
||||
|
||||
|
||||
def export_claude_skills(skill, output_dir: Optional[Path] = None, install: bool = True) -> Path:
    """
    Export a skill in official Claude Skills format.

    Copies every ``*.md`` file from ``skill.content_dir`` (validating the
    frontmatter of ``SKILL.md``) plus the skill's object documentation (into
    ``api-reference/``) to ``<output_dir>/<skill-name>/``. Any existing
    directory at that location is replaced.

    Args:
        skill: BaseSkill instance to export
        output_dir: Optional output directory. When omitted, the target is
            ~/.claude/skills/ if ``install`` is True, otherwise the current
            directory.
        install: Only consulted when ``output_dir`` is None (see above)

    Returns:
        Path: Path to exported skill directory

    Raises:
        TypeError: If ``skill`` is not a BaseSkill instance
        ValueError: If the skill has no markdown files, has no SKILL.md, or
            SKILL.md fails frontmatter validation

    Examples:
        >>> from edgar.ai.skills import edgartools_skill
        >>> export_claude_skills(edgartools_skill)
        PosixPath('/Users/username/.claude/skills/edgartools')

        >>> # Export to custom location
        >>> export_claude_skills(edgartools_skill,
        ...                      output_dir="./my-skills",
        ...                      install=False)
        PosixPath('my-skills/edgartools')
    """
    from edgar.ai.skills.base import BaseSkill

    if not isinstance(skill, BaseSkill):
        raise TypeError(f"Expected BaseSkill instance, got {type(skill)}")

    # Determine output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
    elif install:
        # Default: Install to ~/.claude/skills/
        output_dir = Path.home() / ".claude" / "skills"
    else:
        output_dir = Path.cwd()

    # Check the source content BEFORE touching the destination, so that a
    # skill that cannot be exported does not wipe a previous installation.
    content_dir = skill.content_dir
    markdown_files = list(content_dir.glob("*.md"))
    if not markdown_files:
        raise ValueError(f"No markdown files found in {content_dir}")
    if not any(md_file.name == 'SKILL.md' for md_file in markdown_files):
        raise ValueError("No SKILL.md found in skill content directory")

    # Create skill-specific directory name (kebab-case from skill name)
    skill_dir_name = skill.name.lower().replace(' ', '-')
    skill_output_dir = output_dir / skill_dir_name

    # Start from a clean directory
    if skill_output_dir.exists():
        shutil.rmtree(skill_output_dir)
    skill_output_dir.mkdir(parents=True, exist_ok=True)

    for md_file in markdown_files:
        if md_file.name == 'SKILL.md':
            # The main skill file must carry valid frontmatter
            _copy_and_validate_skill_md(md_file, skill_output_dir)
        else:
            # Supporting markdown files are copied as-is
            shutil.copy2(md_file, skill_output_dir / md_file.name)

    # Copy centralized object documentation (API reference)
    object_docs = skill.get_object_docs()
    if object_docs:
        api_ref_dir = skill_output_dir / "api-reference"
        api_ref_dir.mkdir(exist_ok=True)

        for doc_path in object_docs:
            # Missing docs are skipped silently (allows for optional docs)
            if doc_path.exists():
                shutil.copy2(doc_path, api_ref_dir / doc_path.name)

    return skill_output_dir
|
||||
|
||||
|
||||
def _copy_and_validate_skill_md(source: Path, destination_dir: Path) -> None:
    """
    Validate SKILL.md's YAML frontmatter and write it to the destination.

    The file must start with ``---``, contain a closing ``---`` and declare
    the required frontmatter fields. The content is written out as UTF-8
    text under the same file name.

    Args:
        source: Source SKILL.md file path
        destination_dir: Destination directory

    Raises:
        ValueError: If YAML frontmatter is invalid or missing
    """
    text = source.read_text(encoding='utf-8')

    if not text.startswith('---'):
        raise ValueError(f"Missing YAML frontmatter in {source.name}")

    # Expect: '' | frontmatter | body
    pieces = text.split('---', 2)
    if len(pieces) < 3:
        raise ValueError(f"Invalid YAML frontmatter structure in {source.name}")

    _validate_skill_frontmatter(pieces[1].strip(), source.name)

    target = destination_dir / source.name
    target.write_text(text, encoding='utf-8')
|
||||
|
||||
|
||||
def _validate_skill_frontmatter(frontmatter: str, filename: str) -> None:
    """
    Ensure the SKILL.md frontmatter declares every mandatory key.

    The mandatory keys are:
    - name: skill identifier (lowercase with hyphens)
    - description: clear description of what the skill does

    The check is a line-anchored regex search, not a YAML parse (this keeps
    the module free of a YAML dependency). A key only counts when it starts
    a line with no leading whitespace; its value is not inspected.

    Args:
        frontmatter: Text found between the '---' delimiters
        filename: Source filename, used only in the error message

    Raises:
        ValueError: Naming the first mandatory key that is absent
    """
    mandatory_keys = ('name', 'description')

    # First key (in declaration order) with no "<key>:" at the start of any line
    absent = next(
        (
            key
            for key in mandatory_keys
            if re.search(rf'^{key}:', frontmatter, re.MULTILINE) is None
        ),
        None,
    )

    if absent is not None:
        raise ValueError(
            f"Missing required field '{absent}' in {filename} frontmatter. "
            f"Claude Skills require both 'name' and 'description' fields."
        )
|
||||
101
venv/lib/python3.10/site-packages/edgar/ai/formats.py
Normal file
101
venv/lib/python3.10/site-packages/edgar/ai/formats.py
Normal file
@@ -0,0 +1,101 @@
|
||||
"""
|
||||
AI-optimized text formatting utilities for EdgarTools.
|
||||
|
||||
Provides research-backed text formats optimized for LLM accuracy and token efficiency:
|
||||
- Markdown-KV: Best accuracy (60.7%) for metadata
|
||||
- TSV: Most efficient for tabular data
|
||||
|
||||
Based on research from improvingagents.com/blog/best-input-data-format-for-llms
|
||||
"""
|
||||
|
||||
from typing import List, Dict
|
||||
|
||||
__all__ = ['to_markdown_kv', 'to_tsv']
|
||||
|
||||
|
||||
def to_markdown_kv(data: dict, max_tokens: int = 2000) -> str:
    """
    Render a dict as Markdown key-value lines, one ``**Key:** value`` per entry.

    Keys have underscores replaced by spaces and are title-cased. Entries
    whose value is None are left out. The joined text is cut at
    ``max_tokens * 4`` characters, after which a truncation marker is appended.

    The format choice follows the comparison published at
    improvingagents.com/blog/best-input-data-format-for-llms, which reports
    60.7% accuracy for Markdown-KV and about 25% fewer tokens than JSON.

    Args:
        data: Dictionary with string keys and simple values
        max_tokens: Approximate token limit (4 chars/token heuristic)

    Returns:
        Markdown-formatted key-value text

    Example:
        >>> to_markdown_kv({"name": "Apple Inc.", "cik": "320193"})
        '**Name:** Apple Inc.\\n**Cik:** 320193'
    """
    rendered = "\n".join(
        f"**{name.replace('_', ' ').title()}:** {val}"
        for name, val in data.items()
        if val is not None
    )

    # Character budget derived from the 4 chars/token heuristic
    budget = max_tokens * 4
    if len(rendered) <= budget:
        return rendered

    return rendered[:budget] + "\n\n[Truncated for token limit]"
|
||||
|
||||
|
||||
def to_tsv(rows: List[Dict], headers: List[str], max_tokens: int = 2000, limit: int = 10) -> str:
    """
    Convert list of dicts to TSV (tab-separated values) format.

    The first line is the header row; each following line holds one row's
    values in header order. A key missing from a row is rendered as "N/A".

    Args:
        rows: List of dicts with consistent keys
        headers: Column headers to include
        max_tokens: Approximate token limit (4 chars/token heuristic)
        limit: Maximum rows to include (default: 10)

    Returns:
        Tab-separated values with header row. When more rows were supplied
        than ``limit``, a "[Showing N of M rows]" note is appended. When the
        text exceeds the character budget it is rebuilt from the leading
        lines (header included) estimated to fit, followed by a
        "[Truncated for token limit]" marker that replaces the note.

    Example:
        >>> rows = [{"form": "10-K", "cik": "320193"}, {"form": "10-Q", "cik": "789019"}]
        >>> to_tsv(rows, ["form", "cik"], limit=2)
        'form\\tcik\\n10-K\\t320193\\n10-Q\\t789019'
    """
    # Header row first, then at most `limit` data rows
    lines = ["\t".join(headers)]
    lines.extend(
        "\t".join(str(row.get(h, "N/A")) for h in headers)
        for row in rows[:limit]
    )

    text = "\n".join(lines)

    # Add summary if the row limit cut the input
    if len(rows) > limit:
        text += f"\n\n[Showing {limit} of {len(rows)} rows]"

    # Token limiting
    max_chars = max_tokens * 4
    if len(text) > max_chars:
        # Estimate how many lines fit. The average is clamped to 1 because
        # empty lines (e.g. no headers) make the integer average 0, which
        # previously raised ZeroDivisionError on the next line.
        avg_row_size = max(1, len(text) // len(lines))
        rows_that_fit = max(1, max_chars // avg_row_size)
        text = "\n".join(lines[:rows_that_fit]) + "\n\n[Truncated for token limit]"

    return text
|
||||
667
venv/lib/python3.10/site-packages/edgar/ai/helpers.py
Normal file
667
venv/lib/python3.10/site-packages/edgar/ai/helpers.py
Normal file
@@ -0,0 +1,667 @@
|
||||
"""
|
||||
Helper functions for common SEC filing analysis tasks.
|
||||
|
||||
These convenience wrappers provide simple, high-level access to EdgarTools functionality
|
||||
for common SEC filing analysis patterns.
|
||||
"""
|
||||
from typing import Optional, List, Dict, Union
|
||||
import pandas as pd
|
||||
from edgar import get_filings, get_current_filings, Company
|
||||
|
||||
__all__ = [
|
||||
# Filing retrieval
|
||||
'get_filings_by_period',
|
||||
'get_today_filings',
|
||||
# Financial analysis
|
||||
'get_revenue_trend',
|
||||
'get_filing_statement',
|
||||
'compare_companies_revenue',
|
||||
# Industry and company subset filtering
|
||||
'filter_by_industry',
|
||||
'filter_by_company_subset',
|
||||
# Company subset convenience functions
|
||||
'get_companies_by_state',
|
||||
'get_pharmaceutical_companies',
|
||||
'get_biotechnology_companies',
|
||||
'get_software_companies',
|
||||
'get_semiconductor_companies',
|
||||
'get_banking_companies',
|
||||
'get_investment_companies',
|
||||
'get_insurance_companies',
|
||||
'get_real_estate_companies',
|
||||
'get_oil_gas_companies',
|
||||
'get_retail_companies',
|
||||
]
|
||||
|
||||
|
||||
def get_filings_by_period(
    year: int,
    quarter: int,
    form: Optional[str] = None,
    filing_date: Optional[str] = None
):
    """
    Fetch the published filings of one year/quarter from the SEC quarterly indexes.

    Thin convenience wrapper: all arguments are forwarded unchanged to
    get_filings(), and whatever it returns is handed back.

    Args:
        year: Year (e.g., 2023)
        quarter: Quarter 1-4 (1=Jan-Mar, 2=Apr-Jun, 3=Jul-Sep, 4=Oct-Dec)
        form: Optional form type filter (e.g., "10-K", "10-Q", "S-1")
        filing_date: Optional date or range filter (e.g., "2023-02-01:2023-02-28")

    Returns:
        Filings collection that can be further filtered or iterated

    Raises:
        Nothing is raised here directly; errors from get_filings()
        (documented as HTTPError on request failure and ValueError on
        invalid year/quarter) propagate to the caller.

    Examples:
        >>> # Everything filed in Q1 2023
        >>> filings = get_filings_by_period(2023, 1)

        >>> # Only 10-K filings from Q1 2023
        >>> filings = get_filings_by_period(2023, 1, form="10-K")

        >>> # S-1 filings from February 2023
        >>> filings = get_filings_by_period(
        ...     2023, 1,
        ...     form="S-1",
        ...     filing_date="2023-02-01:2023-02-28"
        ... )

    See Also:
        - get_filings() - The underlying raw API function
        - get_today_filings() - For real-time filings (last 24h)
        - Company.get_filings() - For company-specific filings
    """
    optional_filters = {"form": form, "filing_date": filing_date}
    return get_filings(year, quarter, **optional_filters)
|
||||
|
||||
|
||||
def get_today_filings():
    """
    Return the most recent filings (roughly the last 24 hours) from the SEC RSS feed.

    Alias with a friendlier name: it calls get_current_filings() with no
    arguments and returns the result untouched.

    Returns:
        CurrentFilings collection with recent submissions

    Raises:
        Nothing is raised here directly; errors from get_current_filings()
        (documented as HTTPError when the feed request fails) propagate.

    Examples:
        >>> current = get_today_filings()
        >>> print(f"Found {len(current)} filings in last 24 hours")

        >>> # Narrow to specific forms
        >>> reports = current.filter(form=["10-K", "10-Q"])

        >>> # Narrow to specific companies
        >>> tech_filings = current.filter(ticker=["AAPL", "MSFT", "GOOGL"])

    See Also:
        - get_current_filings() - The underlying raw API function
        - get_filings_by_period() - For historical filings by quarter
    """
    latest = get_current_filings()
    return latest
|
||||
|
||||
|
||||
def get_revenue_trend(
    ticker: str,
    periods: int = 3,
    quarterly: bool = False
):
    """
    Return a multi-period income statement for one company.

    Builds a Company from the ticker and calls its income_statement()
    method, translating the ``quarterly`` flag into that method's
    ``annual`` argument (annual = not quarterly).

    Args:
        ticker: Company ticker symbol (e.g., "AAPL", "MSFT", "GOOGL")
        periods: Number of periods to retrieve (default: 3)
            - For annual: last N fiscal years
            - For quarterly: last N quarters
        quarterly: If True, get quarterly data; if False, get annual data
            (default: False for annual)

    Returns:
        MultiPeriodStatement object containing income statement data across
        multiple periods. Can be printed directly or accessed
        programmatically via the .periods attribute.

    Raises:
        Nothing is raised here directly; errors from Company() and
        Company.income_statement() propagate (documented as ValueError for
        an unknown ticker, HTTPError on request failure and
        NoCompanyFactsFound when the company has no financial data).

    Examples:
        >>> # Three fiscal years (default)
        >>> income = get_revenue_trend("AAPL")
        >>> print(income)

        >>> # Four quarters
        >>> quarterly = get_revenue_trend("TSLA", periods=4, quarterly=True)
        >>> print(quarterly)

        >>> # Five years for long-term analysis
        >>> long_term = get_revenue_trend("MSFT", periods=5)

        >>> # Programmatic access to one period
        >>> income = get_revenue_trend("AAPL", periods=3)
        >>> latest = income.periods[0]

    See Also:
        - Company.income_statement() - The underlying raw API method
        - get_filing_statement() - For statement from specific filing
        - compare_companies_revenue() - For multi-company comparison
    """
    want_annual = not quarterly
    return Company(ticker).income_statement(periods=periods, annual=want_annual)
|
||||
|
||||
|
||||
def get_filing_statement(
    ticker: str,
    year: int,
    form: str,
    statement_type: str = "income"
):
    """
    Get a specific financial statement from a company's filing using XBRL.

    The first filing returned by ``Company.get_filings(year=..., form=...)``
    is parsed with ``Filing.xbrl()`` and the requested statement is read
    from ``xbrl.statements``. For multi-period comparison, consider
    get_revenue_trend() instead.

    ``statement_type`` is validated before any company or filing lookup, so
    an unsupported value fails immediately instead of after the filing has
    been fetched and parsed.

    Args:
        ticker: Company ticker symbol (e.g., "AAPL", "MSFT")
        year: Filing year (e.g., 2023)
        form: Form type (e.g., "10-K" for annual, "10-Q" for quarterly)
        statement_type: Type of statement to retrieve (default: "income")
            - "income" - Income statement
            - "balance" - Balance sheet
            - "cash_flow" - Cash flow statement

    Returns:
        Statement object with detailed line items from the filing.
        Can be printed directly or accessed programmatically.

    Raises:
        ValueError: If statement_type is not one of the three supported values
        IndexError: Presumably raised by the ``[0]`` lookup when no filing
            matches the year/form (depends on the Filings type - confirm)

        Errors from Company(), get_filings() and xbrl() propagate unchanged.

    Examples:
        >>> # Income statement from Apple's 2023 10-K
        >>> income = get_filing_statement("AAPL", 2023, "10-K", "income")
        >>> print(income)

        >>> # Balance sheet from a quarterly filing
        >>> balance = get_filing_statement("AAPL", 2023, "10-Q", "balance")

        >>> # Cash flow statement
        >>> cash_flow = get_filing_statement("MSFT", 2023, "10-K", "cash_flow")

    See Also:
        - Filing.xbrl() - The underlying XBRL parsing method
        - get_revenue_trend() - More efficient for multi-period data
        - Company.get_filings() - For accessing filings directly
    """
    # Supported statement types mapped to the accessor on xbrl.statements
    accessor_by_type = {
        "income": "income_statement",
        "balance": "balance_sheet",
        "cash_flow": "cash_flow_statement",
    }

    # Fail fast: reject a bad statement_type before doing any remote work.
    # TypeError covers unhashable arguments, which are equally "unknown".
    try:
        accessor_name = accessor_by_type[statement_type]
    except (KeyError, TypeError):
        raise ValueError(
            f"Unknown statement type: {statement_type}. "
            f"Must be 'income', 'balance', or 'cash_flow'"
        ) from None

    company = Company(ticker)
    filing = company.get_filings(year=year, form=form)[0]
    xbrl = filing.xbrl()

    return getattr(xbrl.statements, accessor_name)()
|
||||
|
||||
|
||||
def compare_companies_revenue(
    tickers: Union[List[str], tuple],
    periods: int = 3
) -> Dict[str, 'MultiPeriodStatement']:
    """
    Collect the income statement trend of several companies into one dict.

    For each ticker, in the order given, a Company is created and its
    income_statement(periods=...) result is stored under that ticker. A
    ticker listed more than once is looked up each time and keeps the last
    result. Any error for one ticker aborts the whole call.

    Args:
        tickers: List or tuple of ticker symbols (e.g., ["AAPL", "MSFT", "GOOGL"])
        periods: Number of periods to compare (default: 3 fiscal years)

    Returns:
        Dictionary mapping ticker symbol to MultiPeriodStatement.
        Access individual company data via results["TICKER"].

    Raises:
        Nothing is raised here directly; errors from Company() and
        Company.income_statement() propagate (documented as ValueError for
        an invalid ticker and HTTPError on request failure).

    Examples:
        >>> results = compare_companies_revenue(["AAPL", "MSFT", "GOOGL"], periods=3)
        >>> print(results["AAPL"])
        >>> print(results["MSFT"])

        >>> # A tuple works as well
        >>> results = compare_companies_revenue(("AAPL", "MSFT"), periods=5)

        >>> # Walk through every result
        >>> for ticker, statement in results.items():
        ...     print(ticker)
        ...     print(statement)

        >>> # To tolerate bad tickers, loop yourself and catch per ticker
        >>> results = {}
        >>> for ticker in ["AAPL", "INVALID", "MSFT"]:
        ...     try:
        ...         results[ticker] = Company(ticker).income_statement(periods=3)
        ...     except Exception as e:
        ...         print(f"Error with {ticker}: {e}")

    See Also:
        - get_revenue_trend() - For single company analysis
        - Company.income_statement() - The underlying method used
    """
    return {
        symbol: Company(symbol).income_statement(periods=periods)
        for symbol in tickers
    }
|
||||
|
||||
|
||||
def filter_by_industry(
    filings: 'Filings',
    sic: Optional[Union[int, List[int]]] = None,
    sic_range: Optional[tuple[int, int]] = None,
    sic_description_contains: Optional[str] = None,
) -> 'Filings':
    """
    Keep only the filings whose company belongs to the requested industry.

    The industry's companies are resolved through
    edgar.reference.get_companies_by_industry(); their 'cik' column is then
    passed to ``filings.filter(cik=...)``. An empty input collection is
    returned as-is without any lookup.

    The original notes describe this dataset-based approach as replacing a
    per-filing SEC API lookup (about 9 minutes for Q4 2023 8-Ks) with a
    local one (about 30s on first use, under 1s once cached).

    Args:
        filings: Filings collection to filter (from get_filings() or similar)
        sic: Single SIC code or list (e.g., 2834 or [2834, 2835, 2836])
        sic_range: SIC range tuple (e.g., (7300, 7400) for tech)
            Note: Use EXCLUSIVE upper bound (7400 means up to 7399)
        sic_description_contains: Search SIC description (e.g., "software")

    Returns:
        Filtered Filings collection containing only filings from companies
        in the specified industry

    Raises:
        ValueError: Documented for the case where no filter parameter is
            given; not raised in this function, so presumably by
            get_companies_by_industry() - confirm

    Examples:
        >>> from edgar import get_filings
        >>> from edgar.ai.helpers import filter_by_industry
        >>>
        >>> # Pharmaceutical 10-Ks
        >>> filings = get_filings(2023, 4, form="10-K")
        >>> pharma_10ks = filter_by_industry(filings, sic=2834)
        >>>
        >>> # Technology 8-Ks (SIC 7300-7399)
        >>> filings = get_filings(2023, 4, form="8-K")
        >>> tech_8ks = filter_by_industry(filings, sic_range=(7300, 7400))
        >>>
        >>> # Description search
        >>> filings = get_filings(2023, 4)
        >>> software = filter_by_industry(filings, sic_description_contains="software")
        >>>
        >>> # Chained after other filters
        >>> nyse = get_filings(2023, 4, form="10-K").filter(exchange="NYSE")
        >>> pharma_nyse = filter_by_industry(nyse, sic=2834)

    See Also:
        - filter_by_company_subset() - Filter using CompanySubset fluent interface
        - get_companies_by_industry() - Get company list directly (from edgar.reference)
        - Filings.filter() - The underlying filter method
    """
    from edgar.reference import get_companies_by_industry

    # Nothing to filter: hand the empty collection straight back
    if len(filings) == 0:
        return filings

    # Resolve the industry to its member companies
    industry_companies = get_companies_by_industry(
        sic=sic,
        sic_range=sic_range,
        sic_description_contains=sic_description_contains
    )
    industry_ciks = industry_companies['cik'].tolist()

    # With no matching company the filter receives an empty CIK list
    return filings.filter(cik=industry_ciks or [])
|
||||
|
||||
|
||||
def filter_by_company_subset(
    filings: 'Filings',
    companies: Union['CompanySubset', pd.DataFrame]
) -> 'Filings':
    """
    Restrict filings to the companies in a CompanySubset or company DataFrame.

    A CompanySubset is first turned into its DataFrame via ``.get()``. The
    frame's 'cik' column is then passed to ``filings.filter(cik=...)``.

    Args:
        filings: Filings collection to filter
        companies: CompanySubset object or pandas DataFrame with 'cik' column

    Returns:
        Filtered Filings collection

    Raises:
        ValueError: If the companies DataFrame has no 'cik' column

    Examples:
        >>> from edgar import get_filings
        >>> from edgar.reference import CompanySubset
        >>> from edgar.ai.helpers import filter_by_company_subset
        >>>
        >>> filings = get_filings(2023, 4, form="10-K")
        >>>
        >>> # Delaware pharmaceutical companies, sample of 10
        >>> companies = (CompanySubset()
        ...     .from_industry(sic=2834)
        ...     .from_state('DE')
        ...     .sample(10, random_state=42))
        >>> pharma_de_filings = filter_by_company_subset(filings, companies)
        >>>
        >>> # A plain DataFrame works too
        >>> from edgar.reference import get_pharmaceutical_companies
        >>> pharma = get_pharmaceutical_companies()
        >>> pharma_filings = filter_by_company_subset(filings, pharma)

    See Also:
        - filter_by_industry() - Simpler industry-only filtering
        - CompanySubset - Fluent interface for complex filtering (from edgar.reference)
    """
    from edgar.reference import CompanySubset

    # Normalise the input to a DataFrame
    frame = companies.get() if isinstance(companies, CompanySubset) else companies

    if 'cik' not in frame.columns:
        raise ValueError("companies DataFrame must have 'cik' column")

    # With no company in the frame the filter receives an empty CIK list
    return filings.filter(cik=frame['cik'].tolist() or [])
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Company Subset Convenience Functions
|
||||
# ============================================================================
|
||||
|
||||
def get_companies_by_state(states: Union[str, List[str]]) -> pd.DataFrame:
    """
    Look up companies by their state of incorporation.

    Delegates to edgar.reference.get_companies_by_state(), imported at call
    time, and returns its result unchanged.

    Args:
        states: State code(s) (e.g., 'DE' or ['DE', 'NV'])

    Returns:
        DataFrame with companies incorporated in the specified state(s).
        Columns: cik, ticker, name, exchange, sic, sic_description,
        state_of_incorporation, state_of_incorporation_description,
        fiscal_year_end, entity_type, ein

    Examples:
        >>> # Delaware companies (most common)
        >>> de_companies = get_companies_by_state('DE')
        >>> print(f"Found {len(de_companies)} Delaware companies")
        >>>
        >>> # Several states at once
        >>> tech_hubs = get_companies_by_state(['DE', 'CA', 'NV'])
        >>> print(tech_hubs[['ticker', 'name', 'state_of_incorporation']].head())

    See Also:
        - filter_by_company_subset() - Filter filings by company subset
        - CompanySubset.from_state() - Fluent interface (from edgar.reference)
    """
    from edgar.reference import get_companies_by_state as lookup_by_state

    matches = lookup_by_state(states)
    return matches
|
||||
|
||||
|
||||
def get_pharmaceutical_companies() -> pd.DataFrame:
    """
    Return the pharmaceutical companies (SIC 2834 - Pharmaceutical Preparations).

    Delegates to edgar.reference.get_pharmaceutical_companies(), imported
    at call time, and returns its result unchanged.

    Returns:
        DataFrame with pharmaceutical companies and comprehensive metadata.

    Examples:
        >>> pharma = get_pharmaceutical_companies()
        >>> print(f"Found {len(pharma)} pharmaceutical companies")
        >>> print(pharma[['ticker', 'name']].head())

    See Also:
        - get_biotechnology_companies() - Broader biotech category
        - filter_by_industry() - Filter filings by industry
    """
    from edgar.reference import get_pharmaceutical_companies as load_pharma

    pharma_frame = load_pharma()
    return pharma_frame
|
||||
|
||||
|
||||
def get_biotechnology_companies() -> pd.DataFrame:
    """
    Return the biotechnology companies (SIC 2833-2836).

    Delegates to edgar.reference.get_biotechnology_companies(), imported
    at call time, and returns its result unchanged.

    Returns:
        DataFrame with biotechnology companies and comprehensive metadata.

    Examples:
        >>> biotech = get_biotechnology_companies()
        >>> print(f"Found {len(biotech)} biotechnology companies")

    See Also:
        - get_pharmaceutical_companies() - Narrower pharma category
        - filter_by_industry() - Filter filings by industry
    """
    from edgar.reference import get_biotechnology_companies as load_biotech

    biotech_frame = load_biotech()
    return biotech_frame
|
||||
|
||||
|
||||
def get_software_companies() -> pd.DataFrame:
    """
    Return the software companies (SIC 7371-7379 - Computer Programming and Software).

    Delegates to edgar.reference.get_software_companies(), imported at
    call time, and returns its result unchanged.

    Returns:
        DataFrame with software companies and comprehensive metadata.

    Examples:
        >>> software = get_software_companies()
        >>> print(f"Found {len(software)} software companies")
        >>> # Recent 10-K filings from software companies
        >>> from edgar import get_filings
        >>> filings = get_filings(2023, 4, form="10-K")
        >>> software_10ks = filter_by_company_subset(filings, software)

    See Also:
        - get_semiconductor_companies() - Hardware tech companies
        - filter_by_industry() - Filter filings by industry
    """
    from edgar.reference import get_software_companies as load_software

    software_frame = load_software()
    return software_frame
|
||||
|
||||
|
||||
def get_semiconductor_companies() -> pd.DataFrame:
    """
    Return the semiconductor companies (SIC 3674 - Semiconductors and Related Devices).

    Delegates to edgar.reference.get_semiconductor_companies(), imported
    at call time, and returns its result unchanged.

    Returns:
        DataFrame with semiconductor companies and comprehensive metadata.

    Examples:
        >>> semis = get_semiconductor_companies()
        >>> print(f"Found {len(semis)} semiconductor companies")

    See Also:
        - get_software_companies() - Software tech companies
        - filter_by_industry() - Filter filings by industry
    """
    from edgar.reference import get_semiconductor_companies as load_semis

    semis_frame = load_semis()
    return semis_frame
|
||||
|
||||
|
||||
def get_banking_companies() -> pd.DataFrame:
    """
    Return the banking companies (SIC 6020-6029 - Commercial Banks).

    Delegates to edgar.reference.get_banking_companies(), imported at call
    time, and returns its result unchanged.

    Returns:
        DataFrame with banking companies and comprehensive metadata.

    Examples:
        >>> banks = get_banking_companies()
        >>> print(f"Found {len(banks)} banking companies")

    See Also:
        - get_investment_companies() - Investment/securities firms
        - get_insurance_companies() - Insurance companies
        - filter_by_industry() - Filter filings by industry
    """
    from edgar.reference import get_banking_companies as load_banks

    banks_frame = load_banks()
    return banks_frame
|
||||
|
||||
|
||||
def get_investment_companies() -> pd.DataFrame:
    """
    Return the investment companies (SIC 6200-6299 - Security and Commodity Brokers).

    Delegates to edgar.reference.get_investment_companies(), imported at
    call time, and returns its result unchanged.

    Returns:
        DataFrame with investment companies and comprehensive metadata.

    Examples:
        >>> investments = get_investment_companies()
        >>> print(f"Found {len(investments)} investment companies")

    See Also:
        - get_banking_companies() - Commercial banks
        - get_insurance_companies() - Insurance companies
        - filter_by_industry() - Filter filings by industry
    """
    from edgar.reference import get_investment_companies as load_investment

    investment_frame = load_investment()
    return investment_frame
|
||||
|
||||
|
||||
def get_insurance_companies() -> pd.DataFrame:
    """
    Return the insurance companies (SIC 6300-6399 - Insurance Carriers).

    Delegates to edgar.reference.get_insurance_companies(), imported at
    call time, and returns its result unchanged.

    Returns:
        DataFrame with insurance companies and comprehensive metadata.

    Examples:
        >>> insurance = get_insurance_companies()
        >>> print(f"Found {len(insurance)} insurance companies")

    See Also:
        - get_banking_companies() - Commercial banks
        - get_investment_companies() - Investment firms
        - filter_by_industry() - Filter filings by industry
    """
    from edgar.reference import get_insurance_companies as load_insurers

    insurers_frame = load_insurers()
    return insurers_frame
|
||||
|
||||
|
||||
def get_real_estate_companies() -> pd.DataFrame:
    """
    Return the real estate companies (SIC 6500-6599 - Real Estate).

    Delegates to edgar.reference.get_real_estate_companies(), imported at
    call time, and returns its result unchanged.

    Returns:
        DataFrame with real estate companies and comprehensive metadata.

    Examples:
        >>> real_estate = get_real_estate_companies()
        >>> print(f"Found {len(real_estate)} real estate companies")

    See Also:
        - filter_by_industry() - Filter filings by industry
    """
    from edgar.reference import get_real_estate_companies as load_real_estate

    real_estate_frame = load_real_estate()
    return real_estate_frame
|
||||
|
||||
|
||||
def get_oil_gas_companies() -> pd.DataFrame:
    """
    Return the oil and gas companies (SIC 1300-1399 - Oil and Gas Extraction).

    Delegates to edgar.reference.get_oil_gas_companies(), imported at call
    time, and returns its result unchanged.

    Returns:
        DataFrame with oil and gas companies and comprehensive metadata.

    Examples:
        >>> oil_gas = get_oil_gas_companies()
        >>> print(f"Found {len(oil_gas)} oil and gas companies")

    See Also:
        - filter_by_industry() - Filter filings by industry
    """
    from edgar.reference import get_oil_gas_companies as load_oil_gas

    oil_gas_frame = load_oil_gas()
    return oil_gas_frame
|
||||
|
||||
|
||||
def get_retail_companies() -> pd.DataFrame:
    """
    Return the retail companies (SIC 5200-5999 - Retail Trade).

    Delegates to edgar.reference.get_retail_companies(), imported at call
    time, and returns its result unchanged.

    Returns:
        DataFrame with retail companies and comprehensive metadata.

    Examples:
        >>> retail = get_retail_companies()
        >>> print(f"Found {len(retail)} retail companies")

    See Also:
        - filter_by_industry() - Filter filings by industry
    """
    from edgar.reference import get_retail_companies as load_retail

    retail_frame = load_retail()
    return retail_frame
|
||||
27
venv/lib/python3.10/site-packages/edgar/ai/mcp/__init__.py
Normal file
27
venv/lib/python3.10/site-packages/edgar/ai/mcp/__init__.py
Normal file
@@ -0,0 +1,27 @@
|
||||
"""
|
||||
Model Context Protocol (MCP) server for EdgarTools.
|
||||
|
||||
This module provides MCP server functionality to expose EdgarTools
|
||||
capabilities to AI agents and assistants like Claude Desktop.
|
||||
|
||||
Usage:
|
||||
# Start the server
|
||||
python -m edgar.ai
|
||||
|
||||
# Or via console script
|
||||
edgartools-mcp
|
||||
|
||||
# Test the server configuration
|
||||
python -m edgar.ai --test
|
||||
|
||||
For configuration and setup instructions, see:
|
||||
edgar/ai/mcp/docs/MCP_QUICKSTART.md
|
||||
"""
|
||||
|
||||
from edgar.ai.mcp.server import main, test_server
|
||||
|
||||
__all__ = [
|
||||
"main",
|
||||
"test_server",
|
||||
]
|
||||
|
||||
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,467 @@
|
||||
# EdgarTools MCP Quickstart Guide
|
||||
|
||||
This guide helps you get started with EdgarTools MCP server in under 5 minutes.
|
||||
|
||||
## Installation
|
||||
|
||||
```bash
|
||||
# Install EdgarTools with AI features
|
||||
pip install edgartools[ai]
|
||||
```
|
||||
|
||||
## Starting the Server
|
||||
|
||||
EdgarTools provides two ways to start the MCP server:
|
||||
|
||||
### Option 1: Python Module (Recommended)
|
||||
```bash
|
||||
python -m edgar.ai
|
||||
```
|
||||
|
||||
### Option 2: Console Script
|
||||
```bash
|
||||
edgartools-mcp
|
||||
```
|
||||
|
||||
Both methods work identically and will start the MCP server listening on stdin/stdout.
|
||||
|
||||
## Client Configuration
|
||||
|
||||
### Claude Desktop
|
||||
|
||||
**Step 1: Install Claude Desktop**
|
||||
- Download from https://claude.ai/download (macOS or Windows)
|
||||
|
||||
**Step 2: Configure the Server**
|
||||
|
||||
You can configure EdgarTools MCP in two ways:
|
||||
|
||||
**Option A: Using Claude Desktop Settings (Easier)**
|
||||
1. Open Claude Desktop
|
||||
2. Go to Settings (macOS: `Cmd+,` / Windows: `Ctrl+,`)
|
||||
3. Navigate to **Developer** tab
|
||||
4. Click **Edit Config** button
|
||||
5. This will open `claude_desktop_config.json` in your default editor
|
||||
|
||||
**Option B: Edit Configuration File Directly**
|
||||
|
||||
Configuration file location:
|
||||
- **macOS**: `~/Library/Application Support/Claude/claude_desktop_config.json`
|
||||
- **Windows**: `%APPDATA%\Claude\claude_desktop_config.json`
|
||||
|
||||
**Configuration (macOS):**
|
||||
```json
|
||||
{
|
||||
"mcpServers": {
|
||||
"edgartools": {
|
||||
"command": "python3",
|
||||
"args": ["-m", "edgar.ai"],
|
||||
"env": {
|
||||
"EDGAR_IDENTITY": "Your Name your.email@example.com"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
**Configuration (Windows):**
|
||||
```json
|
||||
{
|
||||
"mcpServers": {
|
||||
"edgartools": {
|
||||
"command": "python",
|
||||
"args": ["-m", "edgar.ai"],
|
||||
"env": {
|
||||
"EDGAR_IDENTITY": "Your Name your.email@example.com"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
**Important:** On macOS, use `python3` (not `python`) as the command. On Windows, use `python`.
|
||||
|
||||
**Important Notes:**
|
||||
- Replace `"Your Name your.email@example.com"` with your actual name and email
|
||||
- The `EDGAR_IDENTITY` is required by the SEC for API requests
|
||||
- Use forward slashes in paths, even on Windows
|
||||
|
||||
**Step 3: Restart and Verify**
|
||||
1. Save the configuration file
|
||||
2. Restart Claude Desktop
|
||||
3. Look for the MCP server indicator (🔨) in the bottom-right corner of the chat input
|
||||
4. Try asking: "Research Apple Inc with financials"
|
||||
|
||||
### Cline (VS Code Extension)
|
||||
|
||||
**Configuration File:** `.vscode/cline_mcp_settings.json` in your project
|
||||
|
||||
```json
|
||||
{
|
||||
"mcpServers": {
|
||||
"edgartools": {
|
||||
"command": "python3",
|
||||
"args": ["-m", "edgar.ai"],
|
||||
"env": {
|
||||
"EDGAR_IDENTITY": "Your Name your.email@example.com"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
**Note:** Use `python3` on macOS/Linux, or `python` on Windows.
|
||||
|
||||
### Continue.dev
|
||||
|
||||
**Configuration File:** `~/.continue/config.json`
|
||||
|
||||
```json
|
||||
{
|
||||
"mcpServers": {
|
||||
"edgartools": {
|
||||
"command": "python3",
|
||||
"args": ["-m", "edgar.ai"],
|
||||
"env": {
|
||||
"EDGAR_IDENTITY": "Your Name your.email@example.com"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
**Note:** Use `python3` on macOS/Linux, or `python` on Windows.
|
||||
|
||||
## Available Tools
|
||||
|
||||
Once connected, AI agents have access to workflow-oriented tools designed for real-world research tasks:
|
||||
|
||||
### Workflow Tools (Recommended)
|
||||
|
||||
#### 1. edgar_company_research
|
||||
Comprehensive company intelligence combining profile, financials, recent activity, and ownership in a single workflow.
|
||||
|
||||
**Example prompts:**
|
||||
- "Research Tesla including financials and recent filings"
|
||||
- "Give me a detailed analysis of Apple Inc"
|
||||
- "Show me Microsoft's company profile with ownership data"
|
||||
|
||||
**Parameters:**
|
||||
- `identifier` (required): Company ticker, CIK, or name
|
||||
- `include_financials` (default: true): Include latest financial statements
|
||||
- `include_filings` (default: true): Include recent filing activity summary
|
||||
- `include_ownership` (default: false): Include insider/institutional ownership highlights
|
||||
- `detail_level` (default: "standard"): Response detail - "minimal", "standard", or "detailed"
|
||||
|
||||
**What it provides:**
|
||||
- Company profile (name, CIK, ticker, industry)
|
||||
- Latest financial metrics and statements
|
||||
- Recent filing activity summary
|
||||
- Ownership highlights (when requested)
|
||||
|
||||
#### 2. edgar_analyze_financials
|
||||
Multi-period financial statement analysis for trend analysis and comparisons.
|
||||
|
||||
**Example prompts:**
|
||||
- "Analyze Apple's income statement for the last 4 years"
|
||||
- "Show me Tesla's quarterly cash flow for the last 8 quarters"
|
||||
- "Compare Microsoft's income, balance sheet, and cash flow statements"
|
||||
|
||||
**Parameters:**
|
||||
- `company` (required): Company ticker, CIK, or name
|
||||
- `periods` (default: 4): Number of periods to analyze
|
||||
- `annual` (default: true): Annual (true) or quarterly (false) periods
|
||||
- `statement_types` (default: ["income"]): Statements to include - "income", "balance", "cash_flow"
|
||||
|
||||
**What it provides:**
|
||||
- Multi-period income statements
|
||||
- Multi-period balance sheets
|
||||
- Multi-period cash flow statements
|
||||
- Formatted for AI analysis and comparison
|
||||
|
||||
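### Basic Tools (Backward Compatibility)

> **Note:** The tool list registered in `edgar/ai/mcp/server.py` contains `edgar_company_research`, `edgar_analyze_financials`, `edgar_industry_overview`, and `edgar_compare_industry_companies`. Confirm that the basic tools described below are exposed by your installed version before relying on them.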
|
||||
|
||||
#### 3. edgar_get_company
|
||||
Get basic company information from SEC filings.
|
||||
|
||||
**Example prompts:**
|
||||
- "Get information about Tesla"
|
||||
- "Show me Apple's company details"
|
||||
|
||||
**Parameters:**
|
||||
- `identifier` (required): Company ticker, CIK, or name
|
||||
- `include_financials` (optional): Include latest financial statements
|
||||
|
||||
#### 4. edgar_current_filings
|
||||
Get the most recent SEC filings across all companies.
|
||||
|
||||
**Example prompts:**
|
||||
- "Show me the latest SEC filings"
|
||||
- "What are the most recent 10-K filings?"
|
||||
- "Get current 8-K filings"
|
||||
|
||||
**Parameters:**
|
||||
- `limit` (optional): Number of filings to return (default: 20)
|
||||
- `form_type` (optional): Filter by form type (e.g., "10-K", "10-Q", "8-K")
|
||||
|
||||
## Environment Variables
|
||||
|
||||
### EDGAR_IDENTITY (Recommended)
|
||||
|
||||
The SEC requires proper identification for all API requests. You can configure this in two ways:
|
||||
|
||||
**Option 1: In MCP Client Configuration (Recommended)**
|
||||
|
||||
Set it in your MCP client config as shown in the examples above:
|
||||
```json
|
||||
"env": {
|
||||
"EDGAR_IDENTITY": "Your Name your.email@example.com"
|
||||
}
|
||||
```
|
||||
|
||||
**Option 2: Shell Environment Variable**
|
||||
|
||||
Add to your `~/.bashrc` or `~/.zshrc`:
|
||||
```bash
|
||||
export EDGAR_IDENTITY="Your Name your.email@example.com"
|
||||
```
|
||||
|
||||
**What happens if not set:**
|
||||
- Server starts with a warning message
|
||||
- SEC API may rate-limit or return errors
|
||||
- The server will log helpful instructions for configuring it
|
||||
|
||||
**SEC Requirements:**
|
||||
- Format: "Full Name email@domain.com"
|
||||
- Must be a valid email you monitor
|
||||
- Used by SEC to contact you if issues arise with your API usage
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Finding Logs
|
||||
|
||||
Claude Desktop logs MCP server activity to help diagnose issues:
|
||||
|
||||
**Log Locations:**
|
||||
- **macOS**: `~/Library/Logs/Claude/`
|
||||
- Main log: `mcp.log`
|
||||
- Server-specific: `mcp-server-edgartools.log`
|
||||
- **Windows**: `%APPDATA%\Claude\logs\`
|
||||
|
||||
**Viewing logs:**
|
||||
```bash
|
||||
# macOS - watch logs in real-time
|
||||
tail -f ~/Library/Logs/Claude/mcp-server-edgartools.log
|
||||
|
||||
# macOS - view recent errors
|
||||
tail -50 ~/Library/Logs/Claude/mcp-server-edgartools.log | grep error
|
||||
```
|
||||
|
||||
### "spawn python ENOENT" Error
|
||||
|
||||
**Issue:** Claude Desktop logs show `spawn python ENOENT` error
|
||||
|
||||
**Where to check:** View logs at `~/Library/Logs/Claude/mcp-server-edgartools.log`
|
||||
|
||||
**Cause:** The `python` command is not found in your system PATH. This is the most common issue on macOS.
|
||||
|
||||
**Solution:**
|
||||
|
||||
1. **Use `python3` instead of `python` (macOS/Linux):**
|
||||
```json
|
||||
{
|
||||
"mcpServers": {
|
||||
"edgartools": {
|
||||
"command": "python3",
|
||||
"args": ["-m", "edgar.ai"]
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
2. **Or specify the full Python path:**
|
||||
|
||||
Find your Python path:
|
||||
```bash
|
||||
which python3
|
||||
```
|
||||
|
||||
Then use the full path in your configuration:
|
||||
```json
|
||||
{
|
||||
"mcpServers": {
|
||||
"edgartools": {
|
||||
"command": "/opt/homebrew/bin/python3",
|
||||
"args": ["-m", "edgar.ai"]
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
3. **Verify Python is accessible:**
|
||||
```bash
|
||||
python3 --version
|
||||
# Should show: Python 3.10.x or higher (MCP requires Python 3.10+)
|
||||
```
|
||||
|
||||
### Server won't start
|
||||
|
||||
**Issue:** `ModuleNotFoundError: No module named 'mcp'`
|
||||
|
||||
**Solution:** Install AI dependencies
|
||||
```bash
|
||||
pip install edgartools[ai]
|
||||
# or with pip3
|
||||
pip3 install edgartools[ai]
|
||||
```
|
||||
|
||||
### Client can't find server
|
||||
|
||||
**Issue:** Claude Desktop shows connection error
|
||||
|
||||
**Solution:** Verify the command works from terminal first
|
||||
```bash
|
||||
python3 -m edgar.ai
|
||||
# Should show: Starting EdgarTools MCP Server v...
|
||||
# Press Ctrl+C to stop
|
||||
```
|
||||
|
||||
### Wrong Python version
|
||||
|
||||
**Issue:** Server starts but tools don't work
|
||||
|
||||
**Solution:** MCP requires Python 3.10+. Check your version:
|
||||
```bash
|
||||
python3 --version   # macOS/Linux; on Windows use: python --version
|
||||
```
|
||||
|
||||
If using Python 3.9 or earlier, upgrade Python:
|
||||
```bash
|
||||
# macOS with Homebrew
|
||||
brew install python@3.11
|
||||
|
||||
```

Then update your config to use the specific version:

```json
|
||||
{
|
||||
"mcpServers": {
|
||||
"edgartools": {
|
||||
"command": "/opt/homebrew/bin/python3.11",
|
||||
"args": ["-m", "edgar.ai"]
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## Verification
|
||||
|
||||
### Quick Test
|
||||
|
||||
Before configuring your MCP client, verify the server is working:
|
||||
|
||||
```bash
|
||||
python -m edgar.ai --test
|
||||
```
|
||||
|
||||
**Expected output:**
|
||||
```
|
||||
Testing EdgarTools MCP Server Configuration...
|
||||
|
||||
✓ EdgarTools v4.18.0 imports successfully
|
||||
✓ MCP framework available
|
||||
✓ EDGAR_IDENTITY configured: Your Name your@email.com
|
||||
✓ Core EdgarTools functionality available
|
||||
|
||||
✓ All checks passed - MCP server is ready to run
|
||||
```
|
||||
|
||||
If any checks fail, the test will show specific error messages and installation instructions.
|
||||
|
||||
### Full Integration Test
|
||||
|
||||
1. **Start the server manually:**
|
||||
```bash
|
||||
python -m edgar.ai
|
||||
```
|
||||
You should see: `Starting EdgarTools MCP Server v4.18.0`
|
||||
|
||||
2. **Configure your MCP client** (see configurations above)
|
||||
|
||||
3. **Test in your MCP client:**
|
||||
|
||||
Try these example prompts:
|
||||
- "Research Apple Inc with financials and recent filings"
|
||||
- "Analyze Tesla's quarterly income statement for the last 4 quarters"
|
||||
- "Get the latest 10-K filings"
|
||||
|
||||
4. **Check server logs:**
|
||||
The server logs to stderr. Check your MCP client's developer console for any errors.
|
||||
|
||||
5. **Verify tool availability:**
|
||||
In Claude Desktop, look for the MCP indicator (🔨) in the bottom-right corner of the chat input. Clicking it should show available EdgarTools tools.
|
||||
|
||||
## Migration from Legacy Setup
|
||||
|
||||
If you're currently using the old `run_mcp_server.py` entry point, here's how to migrate:
|
||||
|
||||
### Old Configuration (Deprecated):
|
||||
```json
|
||||
{
|
||||
"mcpServers": {
|
||||
"edgartools": {
|
||||
"command": "python",
|
||||
"args": ["/absolute/path/to/edgartools/edgar/ai/run_mcp_server.py"]
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### New Configuration (macOS):
|
||||
```json
|
||||
{
|
||||
"mcpServers": {
|
||||
"edgartools": {
|
||||
"command": "python3",
|
||||
"args": ["-m", "edgar.ai"],
|
||||
"env": {
|
||||
"EDGAR_IDENTITY": "Your Name your@email.com"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### New Configuration (Windows):
|
||||
```json
|
||||
{
|
||||
"mcpServers": {
|
||||
"edgartools": {
|
||||
"command": "python",
|
||||
"args": ["-m", "edgar.ai"],
|
||||
"env": {
|
||||
"EDGAR_IDENTITY": "Your Name your@email.com"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### Benefits of Migrating:
|
||||
- ✅ No absolute file paths required
|
||||
- ✅ Works from any directory
|
||||
- ✅ Proper SEC identity configuration
|
||||
- ✅ Simpler configuration
|
||||
- ✅ Better error messages
|
||||
- ✅ Verification tool support (`--test` flag)
|
||||
|
||||
**Note:** The old entry point still works but shows a deprecation warning. It will be removed in a future version.
|
||||
|
||||
## Next Steps
|
||||
|
||||
- Read the [full MCP documentation](../../../docs-internal/features/edgartools-mcp-ai-support.md) for advanced features
|
||||
- See [AI package structure](../../../docs-internal/features/ai-mcp-package-structure-plan.md) for architecture details
|
||||
- Explore example notebooks showing MCP workflows
|
||||
|
||||
## Support
|
||||
|
||||
- **Issues:** https://github.com/dgunning/edgartools/issues
|
||||
- **Discussions:** https://github.com/dgunning/edgartools/discussions
|
||||
- **Documentation:** https://dgunning.github.io/edgartools/
|
||||
394
venv/lib/python3.10/site-packages/edgar/ai/mcp/server.py
Normal file
394
venv/lib/python3.10/site-packages/edgar/ai/mcp/server.py
Normal file
@@ -0,0 +1,394 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
EdgarTools MCP Server
|
||||
|
||||
MCP (Model Context Protocol) server providing AI agents access to SEC filing data.
|
||||
This module provides the main entry point for the MCP server.
|
||||
|
||||
Usage:
|
||||
    python -m edgar.ai          # Via module
|
||||
edgartools-mcp # Via console script
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
import os
|
||||
from typing import Any
|
||||
|
||||
from mcp import Resource, Tool
|
||||
from mcp.server import NotificationOptions, Server
|
||||
from mcp.server.models import InitializationOptions
|
||||
from mcp.server.stdio import stdio_server
|
||||
from mcp.types import TextContent
|
||||
|
||||
# Set up logging
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
|
||||
)
|
||||
logger = logging.getLogger("edgartools-mcp")
|
||||
|
||||
|
||||
def setup_edgar_identity():
    """Apply the SEC identity found in the ``EDGAR_IDENTITY`` environment variable.

    When the variable holds a non-empty value it is passed to
    ``edgar.set_identity`` and the configured value is logged at INFO level.
    When it is missing or empty, a warning with setup instructions is logged
    and nothing is configured.

    Any exception raised along the way (including a failure to import
    ``edgar``) is logged as an error and swallowed, so this function never
    raises to its caller.
    """
    try:
        from edgar import set_identity

        configured = os.environ.get('EDGAR_IDENTITY')
        if configured:
            set_identity(configured)
            logger.info(f"SEC identity configured: {configured}")
        else:
            # No identity available: explain how to provide one and carry on.
            logger.warning(
                "EDGAR_IDENTITY environment variable not set. "
                "The SEC requires proper identification for API requests.\n"
                "Add to your MCP client configuration:\n"
                ' "env": {"EDGAR_IDENTITY": "Your Name your.email@example.com"}\n'
                'Or set in your shell: export EDGAR_IDENTITY="Your Name your.email@example.com"'
            )
    except Exception as exc:
        logger.error(f"Error setting up EDGAR identity: {exc}")
|
||||
|
||||
# Create the server
|
||||
app = Server("edgartools")
|
||||
|
||||
|
||||
@app.list_tools()
async def list_tools() -> list[Tool]:
    """Describe the tools this server advertises to MCP clients.

    Returns:
        Four ``Tool`` definitions, in this order: ``edgar_company_research``,
        ``edgar_analyze_financials``, ``edgar_industry_overview`` and
        ``edgar_compare_industry_companies``. Each one carries an
        ``inputSchema`` dictionary describing its arguments.
    """

    def boolean_flag(description: str, default: bool) -> dict[str, Any]:
        # Schema fragment for an optional true/false argument.
        return {"type": "boolean", "description": description, "default": default}

    def integer_option(description: str, default: int) -> dict[str, Any]:
        # Schema fragment for an optional integer argument.
        return {"type": "integer", "description": description, "default": default}

    def industry_selector() -> dict[str, Any]:
        # Built fresh on every call so the two industry tools never share a dict.
        return {
            "type": "string",
            "enum": [
                "pharmaceuticals", "biotechnology", "software",
                "semiconductors", "banking", "investment",
                "insurance", "real_estate", "oil_gas", "retail",
            ],
            "description": "Industry sector to analyze",
        }

    company_research = Tool(
        name="edgar_company_research",
        description="Get company overview and background. Returns profile, 3-year financial trends, and recent filing activity. Use this for initial company research or to get a snapshot of recent performance.",
        inputSchema={
            "type": "object",
            "properties": {
                "identifier": {
                    "type": "string",
                    "description": "Company ticker (AAPL), CIK (0000320193), or name (Apple Inc)",
                },
                "include_financials": boolean_flag(
                    "Include 3-year income statement showing revenue and profit trends",
                    True,
                ),
                "include_filings": boolean_flag(
                    "Include summary of last 5 SEC filings",
                    True,
                ),
                "include_ownership": boolean_flag(
                    "Include insider and institutional ownership data (currently not implemented)",
                    False,
                ),
                "detail_level": {
                    "type": "string",
                    "enum": ["minimal", "standard", "detailed"],
                    "description": "Response detail: 'minimal' (key metrics only), 'standard' (balanced), 'detailed' (comprehensive data)",
                    "default": "standard",
                },
            },
            "required": ["identifier"],
        },
    )

    analyze_financials = Tool(
        name="edgar_analyze_financials",
        description="Detailed financial statement analysis across multiple periods. Use this for trend analysis, growth calculations, or comparing financial performance over time.",
        inputSchema={
            "type": "object",
            "properties": {
                "company": {
                    "type": "string",
                    "description": "Company ticker (TSLA), CIK (0001318605), or name (Tesla Inc)",
                },
                "periods": integer_option(
                    "Number of periods: 4-5 for trends, 8-10 for patterns (max 10)",
                    4,
                ),
                "annual": boolean_flag(
                    "Use annual periods (true) for long-term trends and year-over-year comparisons, or quarterly periods (false) for recent performance and current earnings. Quarterly provides more recent data but may show seasonal volatility.",
                    True,
                ),
                "statement_types": {
                    "type": "array",
                    "items": {"type": "string", "enum": ["income", "balance", "cash_flow"]},
                    "description": "Statements to include: 'income' (revenue, profit, growth), 'balance' (assets, liabilities, equity), 'cash_flow' (operating, investing, financing cash flows)",
                    "default": ["income"],
                },
            },
            "required": ["company"],
        },
    )

    industry_overview = Tool(
        name="edgar_industry_overview",
        description="Get overview of an industry sector including company count, major players, and aggregate metrics. Use this to understand industry landscape before diving into specific companies.",
        inputSchema={
            "type": "object",
            "properties": {
                "industry": industry_selector(),
                "include_top_companies": boolean_flag(
                    "Include list of major companies in the sector",
                    True,
                ),
                "limit": integer_option(
                    "Number of top companies to show (by filing activity)",
                    10,
                ),
            },
            "required": ["industry"],
        },
    )

    compare_industry_companies = Tool(
        name="edgar_compare_industry_companies",
        description="Compare financial performance of companies within an industry sector. Automatically selects top companies or accepts custom company list for side-by-side financial comparison.",
        inputSchema={
            "type": "object",
            "properties": {
                "industry": industry_selector(),
                "companies": {
                    "type": "array",
                    "items": {"type": "string"},
                    "description": "Optional: Specific tickers to compare (e.g., ['AAPL', 'MSFT', 'GOOGL']). If omitted, uses top companies by market presence.",
                    "default": None,
                },
                "limit": integer_option(
                    "Number of companies to compare if not specified (default 5, max 10)",
                    5,
                ),
                "periods": integer_option(
                    "Number of periods for comparison (default 3)",
                    3,
                ),
                "annual": boolean_flag(
                    "Annual (true) or quarterly (false) comparison",
                    True,
                ),
            },
            "required": ["industry"],
        },
    )

    return [
        company_research,
        analyze_financials,
        industry_overview,
        compare_industry_companies,
    ]
|
||||
|
||||
|
||||
@app.call_tool()
async def call_tool(name: str, arguments: dict[str, Any] | None) -> list[TextContent]:
    """Route a tool invocation to the handler registered for ``name``.

    Handler modules are imported only when their tool is requested.

    Args:
        name: Tool name as advertised by ``list_tools``.
        arguments: Tool arguments; ``None`` is treated as an empty dict.

    Returns:
        The handler's result. If anything raises - including the
        ``ValueError`` produced for an unknown tool name - the error is
        logged and a single ``TextContent`` of the form ``"Error: <message>"``
        is returned instead of propagating the exception.
    """
    tool_args = {} if arguments is None else arguments

    try:
        # Each branch only selects the handler; it is awaited once below.
        if name == "edgar_company_research":
            from edgar.ai.mcp.tools.company_research import handle_company_research as handler
        elif name == "edgar_analyze_financials":
            from edgar.ai.mcp.tools.financial_analysis import handle_analyze_financials as handler
        elif name == "edgar_industry_overview":
            from edgar.ai.mcp.tools.industry_analysis import handle_industry_overview as handler
        elif name == "edgar_compare_industry_companies":
            from edgar.ai.mcp.tools.industry_analysis import handle_compare_industry_companies as handler
        else:
            raise ValueError(f"Unknown tool: {name}")

        return await handler(tool_args)

    except Exception as exc:
        logger.error("Error in tool %s: %s", name, exc)
        failure = TextContent(type="text", text=f"Error: {str(exc)}")
        return [failure]
|
||||
|
||||
|
||||
@app.list_resources()
async def list_resources() -> list[Resource]:
    """Describe the resources this server advertises.

    Returns:
        A one-element list holding the Markdown quickstart guide resource,
        addressed as ``edgartools://docs/quickstart``.
    """
    quickstart_guide = Resource(
        uri="edgartools://docs/quickstart",
        name="EdgarTools Quickstart Guide",
        description="Quick start guide for using EdgarTools",
        mimeType="text/markdown",
    )
    return [quickstart_guide]
|
||||
|
||||
|
||||
@app.read_resource()
async def read_resource(uri: str) -> str:
    """Return the content of a resource advertised by ``list_resources``.

    Args:
        uri: Identifier of the requested resource. It is normalised with
            ``str()`` before comparison, so a URL object is accepted as well
            as a plain string.

    Returns:
        The Markdown quickstart guide for ``edgartools://docs/quickstart``.

    Raises:
        ValueError: If ``uri`` does not name a known resource.
    """
    # NOTE(review): the MCP SDK may hand this handler a URL object rather than
    # a str, in which case a direct ``uri == "..."`` comparison never matches.
    # Comparing the string form works for both - confirm against the SDK docs.
    if str(uri) != "edgartools://docs/quickstart":
        raise ValueError(f"Unknown resource: {uri}")

    # The tool names below mirror the ones registered in list_tools/call_tool.
    return """# EdgarTools Quickstart

## Basic Usage

```python
from edgar import Company, get_current_filings

# Get company information
company = Company("AAPL")
print(f"{company.name} - CIK: {company.cik}")

# Get filings
filings = company.get_filings(form="10-K", limit=5)
for filing in filings:
    print(f"{filing.form} - {filing.filing_date}")

# Get current filings across all companies
current = get_current_filings(limit=20)
for filing in current.data.to_pylist():
    print(f"{filing['company']} - {filing['form']}")
```

## Available Tools

- **edgar_company_research**: Company overview with profile, financial trends, and recent filings
- **edgar_analyze_financials**: Multi-period financial statement analysis
- **edgar_industry_overview**: Overview of an industry sector and its major companies
- **edgar_compare_industry_companies**: Side-by-side financial comparison of companies in an industry

## Example Queries

- "Research Apple Inc including recent financials"
- "Analyze Tesla's income statement for the last 4 years"
- "Give me an overview of the semiconductors industry"
- "Compare the top 5 software companies"
"""
|
||||
|
||||
|
||||
def main():
    """Start the EdgarTools MCP server on the stdio transport.

    Configures the SEC identity from the environment, then runs the server
    until it exits. The package version is reported as the server version.

    A ``KeyboardInterrupt`` is logged and swallowed. Any other exception is
    logged with its traceback and re-raised.
    """
    try:
        # The package version doubles as the advertised server version.
        from edgar.__about__ import __version__

        setup_edgar_identity()

        async def run_server():
            """Serve MCP requests over stdin/stdout until the streams close."""
            logger.info(f"Starting EdgarTools MCP Server v{__version__}")

            async with stdio_server() as (incoming, outgoing):
                capabilities = app.get_capabilities(
                    notification_options=NotificationOptions(),
                    experimental_capabilities={},
                )
                init_options = InitializationOptions(
                    server_name="edgartools",
                    server_version=__version__,
                    capabilities=capabilities,
                )
                await app.run(incoming, outgoing, init_options)

        asyncio.run(run_server())

    except KeyboardInterrupt:
        logger.info("Server stopped by user")
    except Exception as exc:
        logger.error(f"Server error: {exc}", exc_info=True)
        raise
|
||||
|
||||
|
||||
def test_server():
    """Run the self-check behind ``--test`` and print a report to stdout.

    Four checks run in order: the ``edgar`` package imports, the ``mcp``
    framework imports, ``EDGAR_IDENTITY`` is set, and
    ``edgar.get_current_filings`` can be imported. A missing identity only
    produces a warning; it does not fail the run.

    Returns:
        bool: True if all checks pass, False otherwise
    """
    print("Testing EdgarTools MCP Server Configuration...\n")

    failures = 0

    # Check 1: the EdgarTools package itself
    try:
        from edgar import Company
        from edgar.__about__ import __version__
        print(f"✓ EdgarTools v{__version__} imports successfully")
    except ImportError as exc:
        print(f"✗ EdgarTools import error: {exc}")
        print(" Install with: pip install edgartools")
        failures += 1

    # Check 2: the MCP framework
    try:
        from mcp.server import Server
        print("✓ MCP framework available")
    except ImportError as exc:
        print(f"✗ MCP framework not installed: {exc}")
        print(" Install with: pip install edgartools[ai]")
        failures += 1

    # Check 3: SEC identity (advisory only, never counted as a failure)
    configured_identity = os.environ.get('EDGAR_IDENTITY')
    if not configured_identity:
        print("⚠ EDGAR_IDENTITY not set (recommended)")
        print(" Set with: export EDGAR_IDENTITY=\"Your Name your@email.com\"")
        print(" Or configure in MCP client's env settings")
    else:
        print(f"✓ EDGAR_IDENTITY configured: {configured_identity}")

    # Check 4: a core entry point can be imported
    try:
        from edgar import get_current_filings
        print("✓ Core EdgarTools functionality available")
    except Exception as exc:
        print(f"✗ EdgarTools functionality check failed: {exc}")
        failures += 1

    # Summary
    print()
    if failures:
        print("✗ Some checks failed - please fix the issues above")
        return False

    print("✓ All checks passed - MCP server is ready to run")
    print("\nTo start the server:")
    print(" python -m edgar.ai")
    print(" or")
    print(" edgartools-mcp")
    return True
|
||||
|
||||
|
||||
if __name__ == "__main__":
    import sys

    # "--test" / "-t" runs the configuration self-check instead of the server;
    # the process exit code reflects the outcome (0 = all checks passed).
    self_check_requested = any(flag in sys.argv for flag in ("--test", "-t"))
    if self_check_requested:
        sys.exit(0 if test_server() else 1)

    main()
|
||||
@@ -0,0 +1,15 @@
|
||||
"""
|
||||
EdgarTools MCP Tool Handlers
|
||||
|
||||
This module contains workflow-oriented tool handlers for the MCP server.
|
||||
"""
|
||||
|
||||
from edgar.ai.mcp.tools.utils import (
|
||||
check_output_size,
|
||||
format_error_with_suggestions,
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
"check_output_size",
|
||||
"format_error_with_suggestions",
|
||||
]
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,192 @@
|
||||
"""
|
||||
Company Research Tool Handler
|
||||
|
||||
Provides comprehensive company intelligence including profile,
|
||||
financials, recent activity, and ownership information.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from typing import Any
|
||||
|
||||
from mcp.types import TextContent
|
||||
|
||||
from edgar import Company
|
||||
from edgar.ai.mcp.tools.utils import (
|
||||
build_company_profile,
|
||||
check_output_size,
|
||||
format_error_with_suggestions,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
async def handle_company_research(args: dict[str, Any]) -> list[TextContent]:
    """
    Build a single text report about one company.

    The report always starts with the company profile. Up to three optional
    sections follow, each guarded separately so that a failure in one is
    reported inline without aborting the others:

    - Latest financials
    - Recent filings
    - Ownership highlights

    Args:
        args: Tool arguments containing:
            - identifier (required): Company ticker, CIK, or name
            - include_financials (default True): Include latest financials
            - include_filings (default True): Include recent filing summary
            - include_ownership (default False): Include ownership highlights
            - detail_level (default "standard"): minimal/standard/detailed

    Returns:
        A one-element list with the report text. If ``identifier`` is missing
        the text is an error message; if the report cannot be built the text
        comes from ``format_error_with_suggestions``.
    """
    identifier = args.get("identifier")
    if not identifier:
        return [TextContent(
            type="text",
            text="Error: identifier parameter is required"
        )]

    detail_level = args.get("detail_level", "standard")

    # One row per optional section:
    # (requested?, heading on success, label on failure, noun for the log, producer)
    optional_sections = (
        (
            args.get("include_financials", True),
            "Latest Financials",
            "Financials",
            "financials",
            lambda target: extract_latest_financials(target, detail_level),
        ),
        (
            args.get("include_filings", True),
            "Recent Filings",
            "Recent Filings",
            "filings",
            lambda target: recent_filing_summary(target, detail_level),
        ),
        (
            args.get("include_ownership", False),
            "Ownership Highlights",
            "Ownership",
            "ownership",
            lambda target: ownership_highlights(target),
        ),
    )

    try:
        company = Company(identifier)

        # The profile always comes first.
        chunks = [build_company_profile(company, detail_level)]

        for requested, heading, failure_label, log_noun, produce in optional_sections:
            if not requested:
                continue
            try:
                section_text = produce(company)
                if section_text:
                    chunks.append(f"\n\n{heading}:")
                    chunks.append(section_text)
            except Exception as exc:
                logger.warning(f"Could not retrieve {log_noun}: {exc}")
                chunks.append(f"\n\n{failure_label}: Not available ({str(exc)})")

        # Join the pieces, then let the size check trim the text if needed.
        report = check_output_size("\n".join(chunks))
        return [TextContent(type="text", text=report)]

    except Exception as exc:
        logger.error(f"Error in company research: {exc}", exc_info=True)
        return [TextContent(
            type="text",
            text=format_error_with_suggestions(exc)
        )]
|
||||
|
||||
|
||||
def extract_latest_financials(company: Any, detail_level: str = "standard") -> str:
    """
    Extract latest financial information for a company.

    Requests the three most recent annual periods of the income statement
    in concise format and renders them with ``to_llm_string()``.

    Args:
        company: Company object exposing ``income_statement(...)``
        detail_level: Level of detail to include. Every level currently
            produces the same output; the parameter is kept so callers do
            not have to change once per-level metrics are implemented.

    Returns:
        Formatted financial summary, or an empty string when the statement
        is missing or cannot be retrieved (the failure is logged)
    """
    try:
        # Get income statement with 3 periods for trend analysis (annual) with concise format for LLM
        stmt = company.income_statement(periods=3, annual=True, concise_format=True)

        if stmt is None:
            # Report "no data" explicitly instead of via an AttributeError on None
            logger.warning("Could not extract financials: no income statement available")
            return ""

        # TODO: Extract specific metrics for detail_level == "minimal" once we understand the API better
        return stmt.to_llm_string()

    except Exception as e:
        logger.warning(f"Could not extract financials: {e}")
        return ""
|
||||
|
||||
|
||||
def recent_filing_summary(company: Any, detail_level: str = "standard") -> str:
    """
    Summarise the company's most recent filings as a bulleted list.

    Args:
        company: Company object exposing ``get_filings(limit=...)``
        detail_level: "minimal" prints one compact line per filing; any
            other level also prints the filing description when present

    Returns:
        Formatted filing summary, "No recent filings found" when there are
        none, or an empty string if retrieval fails (the failure is logged)
    """
    try:
        # Only the five most recent filings are summarised
        recent = company.get_filings(limit=5)
        if not recent:
            return "No recent filings found"

        lines = []
        for entry in recent:
            if detail_level == "minimal":
                lines.append(f"- {entry.form} ({entry.filing_date})")
                continue

            lines.append(f"- {entry.form} - {entry.filing_date}")
            # Not every filing object carries a description
            description = getattr(entry, 'description', None)
            if description:
                lines.append(f" {description}")

        return "\n".join(lines)

    except Exception as e:
        logger.warning(f"Could not retrieve filings: {e}")
        return ""
|
||||
|
||||
|
||||
def ownership_highlights(company: Any) -> str:
    """
    Placeholder for ownership highlights (insider/institutional activity).

    Args:
        company: Company object (currently unused)

    Returns:
        A fixed message stating that the feature is not yet implemented
    """
    # TODO: Implement once we understand ownership data access
    # This might require analyzing Form 4 (insider) and 13F (institutional) filings
    placeholder = "Ownership data: Feature not yet implemented"
    logger.info("Ownership highlights not yet implemented")
    return placeholder
|
||||
@@ -0,0 +1,106 @@
|
||||
"""
|
||||
Financial Analysis Tool Handler
|
||||
|
||||
Provides multi-period financial statement analysis.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from typing import Any
|
||||
|
||||
from mcp.types import TextContent
|
||||
|
||||
from edgar import Company
|
||||
from edgar.ai.mcp.tools.utils import (
|
||||
check_output_size,
|
||||
format_error_with_suggestions,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
async def handle_analyze_financials(args: dict[str, Any]) -> list[TextContent]:
    """
    Handle financial analysis tool requests.

    Provides multi-period financial statement analysis using Company
    convenience methods (income_statement, balance_sheet, cash_flow).
    Statements are always emitted in the order income, balance, cash_flow.
    A statement that cannot be retrieved or rendered is reported as
    "Not available" without aborting the remaining statements.

    Args:
        args: Tool arguments containing:
            - company (required): Company ticker, CIK, or name
            - periods (default 4): Number of periods to analyze
            - annual (default True): Annual (true) or quarterly (false)
            - statement_types (default ["income"]): Statements to include;
              an explicit null is treated like the default

    Returns:
        List containing TextContent with financial analysis results, or a
        single TextContent describing the error
    """
    company_id = args.get("company")
    periods = args.get("periods", 4)
    annual = args.get("annual", True)
    statement_types = args.get("statement_types", ["income"])

    if not company_id:
        return [TextContent(
            type="text",
            text="Error: company parameter is required"
        )]

    # An explicit null would make the membership tests below raise TypeError
    if statement_types is None:
        statement_types = ["income"]

    # (statement_types key, section title, failure label, log label, Company method name)
    statement_specs = (
        ("income", "Income Statement", "Income Statement", "income statement", "income_statement"),
        ("balance", "Balance Sheet", "Balance Sheet", "balance sheet", "balance_sheet"),
        ("cash_flow", "Cash Flow Statement", "Cash Flow", "cash flow", "cash_flow"),
    )

    try:
        # Get company
        company = Company(company_id)

        response_parts = [
            f"Financial Analysis: {company.name}",
            f"Periods: {periods} {'Annual' if annual else 'Quarterly'}",
            "",
        ]

        # Process each requested statement type
        for key, title, failure_label, log_label, method_name in statement_specs:
            if key not in statement_types:
                continue
            try:
                stmt = getattr(company, method_name)(periods=periods, annual=annual, concise_format=True)
                # Render before emitting the header so that a failure never
                # leaves a dangling "=== ... ===" heading in the output
                rendered = stmt.to_llm_string()
            except Exception as e:
                logger.warning(f"Could not retrieve {log_label}: {e}")
                response_parts.append(f"{failure_label}: Not available ({str(e)})")
                response_parts.append("")
                continue

            response_parts.append(f"=== {title} ===")
            response_parts.append(rendered)
            response_parts.append("")

        # Combine response
        response_text = "\n".join(response_parts)

        # Check output size and truncate if needed
        response_text = check_output_size(response_text, max_tokens=3000)  # Larger limit for financials

        return [TextContent(type="text", text=response_text)]

    except Exception as e:
        logger.error(f"Error in financial analysis: {e}", exc_info=True)
        return [TextContent(
            type="text",
            text=format_error_with_suggestions(e)
        )]
|
||||
@@ -0,0 +1,238 @@
|
||||
"""
|
||||
Industry Analysis Tool Handlers
|
||||
|
||||
Provides industry sector analysis and competitive benchmarking capabilities.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from typing import Any
|
||||
|
||||
from mcp.types import TextContent
|
||||
|
||||
from edgar import Company
|
||||
from edgar.ai.mcp.tools.utils import (
|
||||
check_output_size,
|
||||
format_error_with_suggestions,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Industry function mapping.
# Keys are the accepted values of the `industry` tool argument; each value is
# the name of the function that the handlers in this module look up on
# `edgar.ai.helpers` with getattr() and call to obtain that sector's companies.
INDUSTRY_FUNCTIONS = {
    "pharmaceuticals": "get_pharmaceutical_companies",
    "biotechnology": "get_biotechnology_companies",
    "software": "get_software_companies",
    "semiconductors": "get_semiconductor_companies",
    "banking": "get_banking_companies",
    "investment": "get_investment_companies",
    "insurance": "get_insurance_companies",
    "real_estate": "get_real_estate_companies",
    "oil_gas": "get_oil_gas_companies",
    "retail": "get_retail_companies",
}
|
||||
|
||||
|
||||
async def handle_industry_overview(args: dict[str, Any]) -> list[TextContent]:
    """
    Handle industry overview tool requests.

    Provides overview of an industry sector including:
    - Total company count
    - SIC code(s)
    - Industry description (taken from the first company row)
    - Major public companies (rows that have a ticker)

    Args:
        args: Tool arguments containing:
            - industry (required): Industry sector name
            - include_top_companies (default True): Include major companies
            - limit (default 10): Number of top companies to show

    Returns:
        List containing TextContent with industry overview, or a single
        TextContent describing the error
    """
    industry = args.get("industry")
    include_top = args.get("include_top_companies", True)
    limit = args.get("limit", 10)

    if not industry:
        return [TextContent(
            type="text",
            text="Error: industry parameter is required"
        )]

    if industry not in INDUSTRY_FUNCTIONS:
        return [TextContent(
            type="text",
            text=f"Error: Unknown industry '{industry}'. Must be one of: {', '.join(INDUSTRY_FUNCTIONS.keys())}"
        )]

    def _text_or_na(value: Any) -> str:
        """Return value if it is a non-empty string, else 'N/A'."""
        # A plain truthiness test is not enough: a missing cell that is NaN
        # is truthy and would be rendered as "nan".
        return value if isinstance(value, str) and value else 'N/A'

    try:
        # Import and call the appropriate industry function
        from edgar.ai import helpers
        get_companies = getattr(helpers, INDUSTRY_FUNCTIONS[industry])
        companies = get_companies()

        # Build response
        response_parts = [
            f"# {industry.replace('_', ' ').title()} Industry Overview",
            "",
            f"**Total Companies**: {len(companies):,}",
        ]

        # Get unique SIC codes
        sic_codes = sorted(companies['sic'].unique().tolist())
        if len(sic_codes) == 1:
            response_parts.append(f"**SIC Code**: {sic_codes[0]}")
        else:
            response_parts.append(f"**SIC Codes**: {', '.join(map(str, sic_codes))}")

        # Get primary description (from first company)
        if len(companies) > 0 and 'sic_description' in companies.columns:
            primary_desc = companies['sic_description'].iloc[0]
            response_parts.append(f"**Description**: {primary_desc}")

        response_parts.append("")

        # Add major companies if requested
        if include_top and len(companies) > 0:
            # Filter to companies with tickers (publicly traded)
            public = companies[companies['ticker'].notna()].copy()

            if len(public) > 0:
                response_parts.append("## Major Public Companies")
                response_parts.append("")

                # Show top N companies
                for _, row in public.head(limit).iterrows():
                    ticker = _text_or_na(row['ticker'])
                    # .get() keeps the listing usable when there is no exchange column
                    exchange = _text_or_na(row.get('exchange'))
                    response_parts.append(
                        f"- **{ticker}** - {row['name']} ({exchange})"
                    )
            else:
                response_parts.append("*No public companies found in this sector*")

        # Combine response
        response_text = "\n".join(response_parts)

        # Check output size
        response_text = check_output_size(response_text)

        return [TextContent(type="text", text=response_text)]

    except Exception as e:
        logger.error(f"Error in industry overview: {e}", exc_info=True)
        return [TextContent(
            type="text",
            text=format_error_with_suggestions(e)
        )]
|
||||
|
||||
|
||||
async def handle_compare_industry_companies(args: dict[str, Any]) -> list[TextContent]:
    """
    Handle industry company comparison tool requests.

    Compares financial performance of companies within an industry sector
    by rendering each selected company's income statement. A company whose
    financials cannot be loaded is listed with a "not available" note and
    does not abort the comparison.

    Args:
        args: Tool arguments containing:
            - industry (required): Industry sector name
            - companies (optional): Specific tickers to compare (a single
              ticker string is accepted as a one-element list)
            - limit (default 5): Number of companies if not specified
            - periods (default 3): Number of periods for comparison
            - annual (default True): Annual (true) or quarterly (false)

    Returns:
        List containing TextContent with comparative analysis, or a single
        TextContent describing the error
    """
    industry = args.get("industry")
    company_tickers = args.get("companies")
    limit = args.get("limit", 5)
    periods = args.get("periods", 3)
    annual = args.get("annual", True)

    if not industry:
        return [TextContent(
            type="text",
            text="Error: industry parameter is required"
        )]

    if industry not in INDUSTRY_FUNCTIONS:
        return [TextContent(
            type="text",
            text=f"Error: Unknown industry '{industry}'. Must be one of: {', '.join(INDUSTRY_FUNCTIONS.keys())}"
        )]

    # Series.isin() only accepts list-likes, so wrap a bare ticker string
    if isinstance(company_tickers, str):
        company_tickers = [company_tickers]

    try:
        # Import and call the appropriate industry function
        from edgar.ai import helpers
        get_companies = getattr(helpers, INDUSTRY_FUNCTIONS[industry])
        companies = get_companies()

        # Select companies
        if company_tickers:
            # Filter to specified tickers
            selected = companies[companies['ticker'].isin(company_tickers)].copy()
            if len(selected) == 0:
                return [TextContent(
                    type="text",
                    text=f"Error: None of the specified tickers found in {industry} industry"
                )]
        else:
            # Use top N companies with tickers
            public = companies[companies['ticker'].notna()].copy()
            if len(public) == 0:
                return [TextContent(
                    type="text",
                    text=f"Error: No public companies found in {industry} industry"
                )]
            selected = public.head(limit)

        # Compare financials
        response_parts = [
            f"# {industry.replace('_', ' ').title()} Industry Comparison",
            "",
            f"Comparing {len(selected)} companies over {periods} {'annual' if annual else 'quarterly'} periods",
            "",
        ]

        for _, row in selected.iterrows():
            ticker = row['ticker']
            # Emit the heading exactly once, up front, so a failure while
            # rendering cannot produce a duplicated "## ticker - name" line
            response_parts.append(f"## {ticker} - {row['name']}")
            try:
                company = Company(ticker)
                stmt = company.income_statement(
                    periods=periods,
                    annual=annual,
                    concise_format=True
                )
                rendered = stmt.to_llm_string()
            except Exception as e:
                logger.warning(f"Could not get financials for {ticker}: {e}")
                response_parts.append(f"*Financial data not available: {str(e)}*")
                response_parts.append("")
                continue

            response_parts.append("")
            response_parts.append(rendered)
            response_parts.append("")

        # Combine response
        response_text = "\n".join(response_parts)

        # Check output size (larger limit for comparative data)
        response_text = check_output_size(response_text, max_tokens=5000)

        return [TextContent(type="text", text=response_text)]

    except Exception as e:
        logger.error(f"Error in industry comparison: {e}", exc_info=True)
        return [TextContent(
            type="text",
            text=format_error_with_suggestions(e)
        )]
|
||||
137
venv/lib/python3.10/site-packages/edgar/ai/mcp/tools/utils.py
Normal file
137
venv/lib/python3.10/site-packages/edgar/ai/mcp/tools/utils.py
Normal file
@@ -0,0 +1,137 @@
|
||||
"""
|
||||
Utility functions for MCP tool handlers.
|
||||
|
||||
Provides helper functions for output management, error handling,
|
||||
and data formatting for MCP responses.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from typing import Any
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def check_output_size(data: str, max_tokens: int = 2000) -> str:
    """
    Keep a response within an approximate token budget.

    The token count is estimated from the character count (about four
    characters per token). Text within the budget is returned unchanged;
    longer text is cut to 90% of the budget and a truncation notice is
    appended.

    Args:
        data: The text data to check
        max_tokens: Maximum allowed tokens (default: 2000)

    Returns:
        Original data if under limit, truncated data otherwise
    """
    # Rough estimation: 1 token ≈ 4 characters
    approx_tokens = len(data) / 4
    if approx_tokens <= max_tokens:
        return data

    # Simple truncation with ellipsis
    # TODO: Implement smarter summarization in future
    keep_chars = int(max_tokens * 4 * 0.9)  # 90% of limit to be safe
    logger.warning(f"Output truncated: {int(approx_tokens)} tokens -> {max_tokens} tokens")
    return f"{data[:keep_chars]}\n\n... (output truncated to stay within token limit)"
|
||||
|
||||
|
||||
def format_error_with_suggestions(error: Exception) -> str:
    """
    Provide helpful error messages with alternatives.

    Creates AI-friendly error messages that include specific suggestions
    for common error types. Suggestions are chosen by walking the
    exception's class hierarchy from most to least specific, so a subclass
    of a known error (for example an ``HTTPStatusError`` deriving from
    ``HTTPError``) receives its parent's suggestions rather than the
    generic fallback.

    Args:
        error: The exception that occurred

    Returns:
        Formatted error message with suggestions: the message, the concrete
        exception type name, then a numbered list of suggestions
    """
    error_type = type(error).__name__
    error_message = str(error)

    # Define helpful suggestions for common errors
    suggestions_map = {
        "CompanyNotFound": [
            "Try searching by CIK instead of ticker",
            "Use the full company name",
            "Check spelling of ticker symbol"
        ],
        "NoFinancialsAvailable": [
            "Company may not have filed recent 10-K/10-Q",
            "Try include_financials=False for basic info",
            "Check filing history with edgar_market_monitor tool"
        ],
        "FileNotFoundError": [
            "The requested filing may not be available",
            "Try a different form type or date range",
            "Verify the company has filed this type of document"
        ],
        "HTTPError": [
            "SEC EDGAR website may be temporarily unavailable",
            "Check your internet connection",
            "Try again in a few moments"
        ],
        "ValueError": [
            "Check that all required parameters are provided",
            "Verify parameter formats (e.g., valid ticker symbols)",
            "Review the tool's parameter documentation"
        ]
    }

    default_suggestions = [
        "Try rephrasing your request",
        "Check parameter values",
        "Consult the tool documentation"
    ]

    # The MRO starts with the concrete class, so an exact match still wins
    suggestions = next(
        (
            suggestions_map[cls.__name__]
            for cls in type(error).__mro__
            if cls.__name__ in suggestions_map
        ),
        default_suggestions,
    )

    # Format the error response
    response_parts = [
        f"Error: {error_message}",
        f"Error Type: {error_type}",
        "",
        "Suggestions:"
    ]
    response_parts.extend(
        f"{i}. {suggestion}" for i, suggestion in enumerate(suggestions, 1)
    )

    return "\n".join(response_parts)
|
||||
|
||||
|
||||
def build_company_profile(company: Any, detail_level: str = "standard") -> str:
    """
    Build a company profile summary.

    Optional fields (ticker, industry, description) are included only when
    the attribute exists and has a non-empty value, so the profile never
    contains lines such as "Industry: None".

    Args:
        company: Company object; must provide ``name`` and ``cik``
        detail_level: Level of detail (minimal/standard/detailed)

    Returns:
        Formatted company profile text, one field per line
    """
    parts = [
        f"Company: {company.name}",
        f"CIK: {company.cik}",
    ]

    # Add ticker if available (the first listed ticker is shown)
    tickers = getattr(company, 'tickers', None)
    if tickers:
        parts.append(f"Ticker: {tickers[0]}")

    # Add industry/sector if available and detail level permits
    if detail_level in ("standard", "detailed"):
        industry = getattr(company, 'sic_description', None)
        if industry:
            parts.append(f"Industry: {industry}")

    # Add description for detailed level
    if detail_level == "detailed":
        description = getattr(company, 'description', None)
        if description:
            parts.append(f"\nDescription: {description}")

    return "\n".join(parts)
|
||||
@@ -0,0 +1,63 @@
|
||||
"""
|
||||
EdgarTools AI Skills - Skill discovery and management.
|
||||
|
||||
Skills are self-contained packages of documentation and helper functions
|
||||
that enable AI agents to perform domain-specific tasks with EdgarTools.
|
||||
"""
|
||||
|
||||
from edgar.ai.skills.base import BaseSkill
|
||||
from edgar.ai.skills.core import edgartools_skill, EdgarToolsSkill
|
||||
|
||||
__all__ = [
|
||||
'BaseSkill',
|
||||
'EdgarToolsSkill',
|
||||
'edgartools_skill',
|
||||
'list_skills',
|
||||
'get_skill',
|
||||
]
|
||||
|
||||
|
||||
def list_skills() -> list:
    """
    Return every skill that is currently available.

    Returns:
        List of BaseSkill instances (a new list on each call)

    Example:
        >>> from edgar.ai.skills import list_skills
        >>> skills = list_skills()
        >>> for skill in skills:
        ...     print(f"{skill.name}: {skill.description}")
    """
    # Currently only one built-in skill
    # External packages can register additional skills here
    available = [edgartools_skill]
    return available
|
||||
|
||||
|
||||
def get_skill(name: str) -> BaseSkill:
    """
    Look up a skill by its exact name.

    Args:
        name: Skill name (e.g., "EdgarTools")

    Returns:
        The first BaseSkill instance whose ``name`` equals ``name``

    Raises:
        ValueError: If skill not found; the message lists the available names

    Example:
        >>> from edgar.ai.skills import get_skill
        >>> skill = get_skill("EdgarTools")
        >>> docs = skill.get_documents()
    """
    skills = list_skills()
    matches = [candidate for candidate in skills if candidate.name == name]
    if matches:
        return matches[0]

    known_names = ', '.join(candidate.name for candidate in skills)
    raise ValueError(
        f"Skill '{name}' not found. Available skills: {known_names}"
    )
|
||||
Binary file not shown.
Binary file not shown.
213
venv/lib/python3.10/site-packages/edgar/ai/skills/base.py
Normal file
213
venv/lib/python3.10/site-packages/edgar/ai/skills/base.py
Normal file
@@ -0,0 +1,213 @@
|
||||
"""
|
||||
Base class for EdgarTools AI skills.
|
||||
|
||||
Provides the foundation for creating AI skills that integrate with
|
||||
edgar.ai infrastructure. External packages can subclass BaseSkill to
|
||||
create specialized skills (e.g., insider trading detection, fraud analysis).
|
||||
"""
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Optional, Callable
|
||||
|
||||
__all__ = ['BaseSkill']
|
||||
|
||||
|
||||
class BaseSkill(ABC):
    """
    Abstract base class for EdgarTools AI skills.

    A skill packages:
    - Documentation (markdown files with YAML frontmatter)
    - Helper functions (workflow wrappers)
    - Examples and patterns

    External packages can subclass this to create specialized skills
    that integrate seamlessly with edgar.ai infrastructure.

    Example:
        >>> from edgar.ai.skills.base import BaseSkill
        >>> from pathlib import Path
        >>>
        >>> class InsiderTradingSkill(BaseSkill):
        ...     @property
        ...     def name(self) -> str:
        ...         return "Insider Trading Detection"
        ...
        ...     @property
        ...     def description(self) -> str:
        ...         return "Analyze Form 4 filings for insider trading patterns"
        ...
        ...     @property
        ...     def content_dir(self) -> Path:
        ...         return Path(__file__).parent / "content"
        ...
        ...     def get_helpers(self) -> Dict[str, Callable]:
        ...         return {
        ...             'detect_unusual_trades': self.detect_unusual_trades,
        ...         }
    """

    @property
    @abstractmethod
    def name(self) -> str:
        """
        Skill name for display and identification.

        Should be descriptive and unique. Example: "SEC Filing Analysis"

        Returns:
            Human-readable skill name
        """
        pass

    @property
    @abstractmethod
    def description(self) -> str:
        """
        Brief description of skill capabilities.

        Used by AI agents to determine when to activate the skill.
        Should clearly describe what problems the skill solves.

        Returns:
            One-sentence skill description
        """
        pass

    @property
    @abstractmethod
    def content_dir(self) -> Path:
        """
        Directory containing skill documentation (markdown files).

        This directory should contain:
        - skill.md: Main skill documentation with YAML frontmatter
        - objects.md: Object reference (optional)
        - workflows.md: Workflow patterns (optional)
        - readme.md: Installation/overview (optional)

        Returns:
            Path to skill content directory
        """
        pass

    @abstractmethod
    def get_helpers(self) -> Dict[str, Callable]:
        """
        Return dictionary of helper functions this skill provides.

        Helper functions are convenience wrappers that simplify
        common workflows for the skill's domain.

        Returns:
            Dict mapping function names to callable objects

        Example:
            >>> {
            ...     'get_revenue_trend': helpers.get_revenue_trend,
            ...     'compare_companies': helpers.compare_companies,
            ... }
        """
        pass

    # Non-abstract methods with default implementations

    def get_object_docs(self) -> List[Path]:
        """
        Return paths to centralized object documentation files to include in exports.

        Override this method to specify which centralized API reference docs
        should be included when exporting the skill. These docs are copied to
        an 'api-reference/' subdirectory in the exported skill package.

        Returns:
            List of Path objects pointing to markdown documentation files

        Example:
            >>> def get_object_docs(self) -> List[Path]:
            ...     from pathlib import Path
            ...     root = Path(__file__).parent.parent.parent
            ...     return [
            ...         root / "entity/docs/Company.md",
            ...         root / "xbrl/docs/XBRL.md",
            ...     ]
        """
        return []  # Default: no object docs

    def get_documents(self) -> List[str]:
        """
        List of markdown documents in this skill.

        Returns:
            Alphabetically sorted list of document names (without .md
            extension); empty if the content directory does not exist
        """
        content_dir = self.content_dir
        if not content_dir.exists():
            return []
        # glob() yields entries in filesystem-dependent order; sort so the
        # listing is stable across platforms and runs
        return sorted(f.stem for f in content_dir.glob("*.md"))

    def get_document_content(self, name: str) -> str:
        """
        Get content of a specific markdown document.

        Args:
            name: Document name (with or without .md extension)

        Returns:
            Full markdown content as string (decoded as UTF-8)

        Raises:
            FileNotFoundError: If document doesn't exist
        """
        doc_name = name if name.endswith('.md') else f"{name}.md"
        doc_path = self.content_dir / doc_name

        if not doc_path.exists():
            available = ", ".join(self.get_documents())
            raise FileNotFoundError(
                f"Document '{name}' not found in skill '{self.name}'. "
                f"Available: {available}"
            )

        # Explicit encoding: the default depends on the platform locale and
        # can fail or garble non-ASCII markdown
        return doc_path.read_text(encoding="utf-8")

    def export(self, format: str = "claude-desktop", output_dir: Optional[Path] = None, **kwargs) -> Path:
        """
        Export skill in specified format.

        Args:
            format: Export format (default: "claude-desktop")
                - "claude-desktop": Claude Desktop Skills format (ZIP)
                - "claude-skills": Official Claude Skills format (~/.claude/skills/)
            output_dir: Where to create export (default: ./skills_export/)
            **kwargs: Additional format-specific parameters
                - create_zip (bool): For claude-desktop format (default: True)
                - install (bool): For claude-skills format (default: True)

        Returns:
            Path to exported skill directory or archive

        Example:
            >>> skill = EdgarToolsSkill()
            >>> # Export as ZIP for Claude Desktop upload
            >>> path = skill.export(format="claude-desktop")
            >>> # Export to ~/.claude/skills/ for automatic discovery
            >>> path = skill.export(format="claude-skills")
        """
        # Imported lazily; the exporter is only needed when export() is called
        from edgar.ai.exporters import export_skill
        return export_skill(self, format=format, output_dir=output_dir, **kwargs)

    def __repr__(self) -> str:
        """String representation of the skill."""
        return f"{self.__class__.__name__}(name='{self.name}')"

    def __str__(self) -> str:
        """Human-readable skill description."""
        docs_count = len(self.get_documents())
        helpers_count = len(self.get_helpers())
        return (
            f"Skill: {self.name}\n"
            f"Description: {self.description}\n"
            f"Documents: {docs_count}\n"
            f"Helper Functions: {helpers_count}"
        )
|
||||
@@ -0,0 +1,119 @@
|
||||
"""
|
||||
EdgarTools Skill - Core EdgarTools AI skill.
|
||||
|
||||
Provides comprehensive documentation and helper functions for analyzing
|
||||
SEC filings and financial statements using EdgarTools.
|
||||
"""
|
||||
|
||||
from pathlib import Path
|
||||
from typing import Dict, Callable
|
||||
from edgar.ai.skills.base import BaseSkill
|
||||
|
||||
__all__ = ['EdgarToolsSkill', 'edgartools_skill']
|
||||
|
||||
|
||||
class EdgarToolsSkill(BaseSkill):
|
||||
"""
|
||||
EdgarTools - AI skill for SEC filing analysis.
|
||||
|
||||
This skill provides:
|
||||
- Comprehensive API documentation for SEC filing analysis
|
||||
- Helper functions for common workflows
|
||||
- Object reference with token estimates
|
||||
- Workflow patterns for multi-step analysis
|
||||
|
||||
The skill covers:
|
||||
- Getting filings (3 approaches: Published, Current, Company-specific)
|
||||
- Getting financials (2 approaches: Entity Facts, Filing XBRL)
|
||||
- Multi-company analysis
|
||||
- Object representations optimized for AI
|
||||
|
||||
Example:
|
||||
>>> from edgar.ai.skills.core import edgartools_skill
|
||||
>>>
|
||||
>>> # List available documentation
|
||||
>>> print(edgartools_skill.get_documents())
|
||||
>>> ['skill', 'objects', 'workflows', 'readme']
|
||||
>>>
|
||||
>>> # Get main skill documentation
|
||||
>>> guide = edgartools_skill.get_document_content("skill")
|
||||
>>>
|
||||
>>> # Access helper functions
|
||||
>>> helpers = edgartools_skill.get_helpers()
|
||||
>>> get_revenue_trend = helpers['get_revenue_trend']
|
||||
>>> income = get_revenue_trend("AAPL", periods=3)
|
||||
>>>
|
||||
>>> # Export skill for Claude Desktop
|
||||
>>> path = edgartools_skill.export(
|
||||
... format="claude-desktop",
|
||||
... output_dir="~/.config/claude/skills"
|
||||
... )
|
||||
"""
|
||||
|
||||
@property
|
||||
def name(self) -> str:
|
||||
"""Skill name: 'EdgarTools'"""
|
||||
return "EdgarTools"
|
||||
|
||||
@property
|
||||
def description(self) -> str:
|
||||
"""Skill description for AI agents."""
|
||||
return (
|
||||
"Query and analyze SEC filings and financial statements using EdgarTools. "
|
||||
"Get company data, filings, XBRL financials, and perform multi-company analysis."
|
||||
)
|
||||
|
||||
@property
|
||||
def content_dir(self) -> Path:
|
||||
"""Path to skill documentation directory."""
|
||||
return Path(__file__).parent
|
||||
|
||||
def get_object_docs(self) -> list[Path]:
|
||||
"""
|
||||
Return centralized object documentation to include in skill exports.
|
||||
|
||||
Returns paths to detailed API reference docs that complement the
|
||||
skill's tutorial documentation.
|
||||
|
||||
Returns:
|
||||
List of Path objects to centralized markdown documentation files
|
||||
"""
|
||||
# Navigate from edgar/ai/skills/core/ to edgar/ root
|
||||
edgar_root = Path(__file__).parent.parent.parent.parent
|
||||
|
||||
return [
|
||||
edgar_root / "entity/docs/Company.md",
|
||||
edgar_root / "entity/docs/EntityFiling.md",
|
||||
edgar_root / "entity/docs/EntityFilings.md",
|
||||
edgar_root / "xbrl/docs/XBRL.md",
|
||||
edgar_root / "xbrl/docs/Statement.md",
|
||||
]
|
||||
|
||||
def get_helpers(self) -> Dict[str, Callable]:
    """
    Expose the helper functions that ship with this skill.

    The helpers wrap common SEC analysis workflows:
    - get_filings_by_period: Get filings for a specific quarter
    - get_today_filings: Get recent filings (last ~24 hours)
    - get_revenue_trend: Get multi-period income statement
    - get_filing_statement: Get statement from specific filing
    - compare_companies_revenue: Compare revenue across companies

    Returns:
        Dict mapping function names to callable objects
    """
    # Deferred import: loading this at module import time would create a cycle
    from edgar.ai import helpers

    helper_names = (
        'get_filings_by_period',
        'get_today_filings',
        'get_revenue_trend',
        'get_filing_statement',
        'compare_companies_revenue',
    )
    return {helper_name: getattr(helpers, helper_name) for helper_name in helper_names}
|
||||
|
||||
|
||||
# Module-level singleton instance, created at import time so callers can use
# the skill without instantiating EdgarToolsSkill themselves.
edgartools_skill = EdgarToolsSkill()
|
||||
Binary file not shown.
950
venv/lib/python3.10/site-packages/edgar/attachments.py
Normal file
950
venv/lib/python3.10/site-packages/edgar/attachments.py
Normal file
@@ -0,0 +1,950 @@
|
||||
import http.server
|
||||
import os
|
||||
import re
|
||||
import signal
|
||||
import socketserver
|
||||
import tempfile
|
||||
import time
|
||||
import webbrowser
|
||||
import zipfile
|
||||
from functools import lru_cache
|
||||
from pathlib import Path
|
||||
from threading import Thread
|
||||
from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from edgar.company_reports import Report
|
||||
from edgar.sgml.sgml_common import FilingSGML, SGMLDocument
|
||||
|
||||
import textwrap
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
from pydantic import BaseModel
|
||||
from rich import box
|
||||
from rich.columns import Columns
|
||||
from rich.console import Group
|
||||
from rich.panel import Panel
|
||||
from rich.table import Column, Table
|
||||
from rich.text import Text
|
||||
|
||||
from edgar.core import binary_extensions, has_html_content, sec_dot_gov, text_extensions
|
||||
from edgar.files.html_documents import get_clean_html
|
||||
from edgar.files.markdown import to_markdown
|
||||
from edgar.httpclient import async_http_client
|
||||
from edgar.httprequests import download_file, download_file_async, get_with_retry
|
||||
from edgar.richtools import print_rich, print_xml, repr_rich, rich_to_text
|
||||
|
||||
# Attachment descriptions that identify an XBRL instance document; compared
# against each data file's description by FilingHomepage.xbrl_document.
xbrl_document_types = ['XBRL INSTANCE DOCUMENT', 'XBRL INSTANCE FILE', 'EXTRACTED XBRL INSTANCE DOCUMENT']

# Names exported by `from ... import *`.
__all__ = ['Attachment', 'Attachments', 'FilingHomepage', 'FilerInfo', 'AttachmentServer', 'sec_document_url', 'get_document_type']
|
||||
|
||||
|
||||
def sec_document_url(attachment_url: str) -> str:
    """
    Build the full SEC URL for an attachment path.

    Any inline viewer prefix ("ix?doc=/" or "ix.xhtml?doc=/") is removed from
    the path before it is appended to ``sec_dot_gov``.
    """
    viewer_prefix = re.compile(r"ix(\.xhtml)?\?doc=/")
    plain_path = viewer_prefix.sub("", attachment_url)
    return "{}{}".format(sec_dot_gov, plain_path)
|
||||
|
||||
def sequence_sort_key(x):
    """
    Sort key that orders attachments by their ``sequence_number`` string.

    The first tuple element ranks the kind of value: ``(0, number)`` for
    numeric sequences, ``(1, text)`` for non-numeric ones and ``(inf, '')``
    for blank ones, so blank sequence numbers sort to the end.
    """
    raw = x.sequence_number
    if not raw.strip():
        # Empty or whitespace-only: push to the end using infinity
        return (float('inf'), '')
    try:
        numeric = float(raw)
    except ValueError:
        # Not a number: keep the raw text, ranked after all numeric values
        return (1, raw)
    return (0, numeric)
|
||||
|
||||
|
||||
# Mapping of SEC filing file types to Unicode symbols.
# Keys are either declared document types (e.g. "10-K", "EX-101.SCH", "XML")
# or file extensions including the dot (e.g. ".css"); get_file_icon looks up
# both kinds of key in this table.
# NOTE(review): get_file_icon assigns a fixed icon to any "EX-" type that does
# not start with "EX-101." before consulting this table, so the plain exhibit
# entries below (EX-21.1 ... EX-97.1) do not appear to be reached from there.
FILE_TYPE_SYMBOLS: Dict[str, str] = {
    # Main SEC filing documents
    "10-K": "📄",  # Page for the main filing
    "EX-21.1": "📎",  # Paperclip for exhibits
    "EX-23.1": "📎",
    "EX-31.1": "📎",
    "EX-31.2": "📎",
    "EX-32.1": "📎",
    "EX-97.1": "📎",

    # XBRL-related documents
    "EX-101.SCH": "🔰",  # Beginner symbol for schema
    "EX-101.CAL": "📊",  # Bar chart for calculations
    "EX-101.DEF": "📚",  # Books for definitions
    "EX-101.LAB": "📎",  # Paperclip for labels
    "EX-101.PRE": "📈",  # Rising chart for presentation

    # Common file types
    "XML": "🔷",  # Blue diamond for XML files
    "HTML": "🌍",  # Globe for HTML files
    "GRAPHIC": "🎨",  # Artist palette for images
    "EXCEL": "📊",  # Bar chart for Excel
    "JSON": "📝",  # Memo for JSON
    "ZIP": "📦",  # Package for ZIP
    "CSS": "📃",  # Curled page for CSS
    "JS": "📄",  # Page for JavaScript
    ".css": "📃",  # Curled page for CSS extension
    ".js": "📄",  # Page for JS extension
    "PDF": "📕",  # Closed book for PDF
    ".pdf": "📕",  # Closed book for PDF extension
    "INFORMATION TABLE": "📊"  # Bar chart for tables
}
|
||||
|
||||
|
||||
def get_extension(filename: str) -> str:
    """Return everything from the last '.' in *filename* onward, or '' when there is no '.'."""
    dot_at = filename.rfind('.')
    return '' if dot_at < 0 else filename[dot_at:]
|
||||
|
||||
def get_document_type(filename: str, declared_document_type: str) -> str:
    """
    Sometimes the SEC gets the document type wrong. This function uses the extension to determine the document type

    Only generic, extension-like declared types (XML, HTML, PDF, ...) are
    second-guessed; any other declared type (e.g. "10-K", "EX-99.1") is
    returned unchanged.

    Args:
        filename: The attachment file name, e.g. "report.htm"
        declared_document_type: The document type declared for the attachment

    Returns:
        The upper-cased file extension without the dot ("HTM"/"HTML" are both
        reported as "HTML") when the declared type is generic and the file has
        an extension; otherwise the declared type as given.
    """
    generic_types = ("XML", "HTML", "PDF", "HTM", "JS", "CSS", "ZIP", "XLS", "XLSX", "JSON")
    if declared_document_type.upper() not in generic_types:
        return declared_document_type

    # Text after the last '.', same rule as get_extension() without the dot
    _, dot, suffix = filename.rpartition('.')
    if not dot or not suffix:
        # No usable extension: keep the declared type rather than returning ''
        return declared_document_type

    document_type = suffix.upper()
    if document_type in ("HTM", "HTML"):
        return "HTML"
    return document_type
|
||||
|
||||
def get_file_icon(file_type: str, sequence: str = None, filename: str = None) -> str:
    """
    Pick the Unicode symbol shown next to an attachment.

    Args:
        file_type: The type of the file from SEC filing
        sequence: The sequence number of the file in the filing
        filename: The name of the file to extract the extension

    Returns:
        Unicode symbol corresponding to the file type.
        If sequence is "1", returns the scroll symbol for the main filing document.
        Falls back to the FILE_TYPE_SYMBOLS entry for file_type, and to the
        plain page symbol when the type is not found there.
        A single-character symbol is returned with one trailing space.
    """
    def choose():
        # Rules are tried in order; the first match wins
        if sequence == "1":
            return "📜"  # Scroll emoji for main document
        if file_type.startswith("EX-101."):
            # XBRL exhibit (EX-101.*)
            return FILE_TYPE_SYMBOLS.get(file_type, "📄")
        if file_type.startswith("EX-"):
            # Any other exhibit
            return "📋"
        if filename:
            # Non-exhibit: try the file extension; None when it is not mapped
            return FILE_TYPE_SYMBOLS.get(get_extension(filename))
        return None

    symbol = choose() or FILE_TYPE_SYMBOLS.get(file_type, "📄")
    if len(symbol) == 1:
        symbol = f"{symbol} "  # Pad single-character symbols
    return symbol
|
||||
|
||||
|
||||
class FilerInfo(BaseModel):
    """Name, CIK, identification text and addresses of one filer on a filing."""

    company_name: str
    cik: str
    identification: str
    addresses: List[str]

    def __rich__(self):
        """Render the filer as a panel titled with the company name.

        The panel shows the identification text followed by up to the first
        two addresses. Slicing (rather than indexing positions 0 and 1) keeps
        rendering from raising IndexError when fewer than two addresses exist.
        """
        return Panel(
            Columns([self.identification, Text(" "), *self.addresses[:2]]),
            title=self.company_name
        )

    def __repr__(self):
        return repr_rich(self.__rich__())
|
||||
|
||||
|
||||
class Attachment:
    """
    A class to represent an attachment in an SEC filing

    Holds the metadata of one listed document (sequence number, description,
    file name, path, declared type, size). Content comes from the attached
    SGML document when one was supplied and is downloaded from ``url``
    otherwise.
    """

    def __init__(self,
                 sequence_number: str,
                 description: str,
                 document: str,
                 ixbrl: bool,
                 path: str,
                 document_type: str,
                 size: Optional[int],
                 sgml_document: Optional['SGMLDocument'] = None,
                 purpose: Optional[str] = None,
                 filing_sgml: Optional['FilingSGML'] = None):
        self.sequence_number = sequence_number
        self.description = description
        self.document = document
        self.ixbrl = ixbrl
        self.path = path
        self.document_type = document_type
        self.size = size
        self.sgml_document: Optional['SGMLDocument'] = sgml_document
        self.sgml = filing_sgml
        self.purpose = purpose
        # Allows tests to override content via property patching
        self._content_override = None

    @property
    def content(self):
        """The attachment content.

        Resolution order: an override installed through the setter, an empty
        string for synthetic "/test/" paths, the SGML document content, and
        finally a download from ``url``.
        """
        # If tests have overridden content using the property's setter, honor it
        override = getattr(self, "_content_override", None)
        if override is not None:
            if isinstance(override, property) and override.fget is not None:
                return override.fget(self)
            try:
                return override(self)  # callable override
            except TypeError:
                return override  # direct value

        # Avoid real network calls for synthetic test paths
        if isinstance(self.path, str) and self.path.startswith("/test/"):
            return ""

        if self.sgml_document:
            return self.sgml_document.content
        else:
            return download_file(self.url)

    @content.setter
    def content(self, value):
        # Enable tests to patch instance property via unittest.mock.patch.object
        self._content_override = value

    @content.deleter
    def content(self):
        self._content_override = None

    @property
    def url(self):
        """Full SEC URL of the attachment, built from ``path``."""
        return sec_document_url(self.path)

    @property
    def extension(self):
        """The actual extension of the filing document
        Usually one of .xml or .html or .pdf or .txt or .paper

        Returns '' when the attachment has no document name.
        """
        # `document` may be None for empty attachments (see `empty`)
        return os.path.splitext(self.document or '')[1]

    @property
    def display_extension(self) -> str:
        """This is the extension displayed in the html e.g. "es220296680_4-davis.html"
        The actual extension would be "es220296680_4-davis.xml", that displays as html in the browser

        In this class it is derived from ``document`` exactly like ``extension``.
        """
        return self.extension

    def validate_sequence_number(self, v):
        """Return *v* if it is all digits or empty; raise ValueError otherwise."""
        if not v.isdigit() and v != '':
            raise ValueError('sequence_number must be digits or an empty string')
        return v

    def is_text(self) -> bool:
        """Is this a text document"""
        return self.extension in text_extensions

    def is_xml(self):
        """True when the extension is .xsd, .xml or .xbrl (case-insensitive)."""
        return self.extension.lower() in [".xsd", ".xml", ".xbrl"]

    def is_html(self):
        """True when the extension is .htm or .html (case-insensitive)."""
        return self.extension.lower() in [".htm", ".html"]

    def is_binary(self) -> bool:
        """Is this a binary document"""
        return self.extension in binary_extensions

    @property
    def empty(self):
        """Some older filings have no document url. So effectively this attachment is empty"""
        return self.document is None or self.document.strip() == ''

    def download(self, path: Optional[Union[str, Path]] = None) -> Optional[Union[str, bytes]]:
        """
        Download the file to a specified path.
        If the path is not provided, return the downloaded content as text or bytes.
        If the path is a directory, the file is saved with its original name in that directory.
        If the path is a file, the file is saved with the given path name.

        Returns:
            The content when no path is given, otherwise the path written to as a string.
        """
        # Resolve the content exactly once: each access may trigger a download
        content = self.content
        if path is None:
            return content

        # Ensure path is a Path object
        path = Path(path)

        # An existing directory receives the original file name
        file_path = path / self.document if path.is_dir() else path

        # Save the file
        if isinstance(content, bytes):
            file_path.write_bytes(content)
        else:
            # Explicit UTF-8, matching Attachments.download, instead of the locale default
            file_path.write_text(content, encoding='utf-8')

        return str(file_path)

    def view(self):
        """Print the attachment to the console.

        Report attachments are shown through the filing summary report when
        SGML is available. Otherwise text attachments are rendered as a parsed
        HTML document or as XML; anything else prints nothing.
        """
        # Check if this is a report
        if self.is_report() and self.sgml:
            report = self.sgml.filing_summary.reports.get_by_filename(self.document)
            if report:
                report.view()
            return

        if not self.is_text():
            return

        content = self.content
        if self.is_html() or has_html_content(content):
            from edgar import Document
            document = Document.parse(content)
            print_rich(document)
        elif self.is_xml():
            print_xml(content)

    def is_report(self) -> bool:
        """True when the document name starts like a report file ("R<digits>.htm")."""
        return bool(self.document) and re.match(r"R\d+\.htm", self.document) is not None

    def text(self):
        """Return the attachment as plain text, or None if it is not a text document.

        Reports found in the filing summary supply their own text; HTML content
        is parsed and rendered to text; other text content is returned as is.
        """
        # Check if this is a report
        if self.is_report() and self.sgml:
            report = self.sgml.filing_summary.reports.get_by_filename(self.document)
            if report:
                return report.text()

        if self.is_text():
            content = self.content
            if self.is_html() or has_html_content(content):
                from edgar import Document
                document = Document.parse(content)
                return rich_to_text(document)
            else:
                return content
        return None

    def markdown(self, include_page_breaks: bool = False, start_page_number: int = 0) -> Optional[str]:
        """
        Convert the attachment to markdown format if it's HTML content.

        Args:
            include_page_breaks: If True, include page break delimiters in the markdown
            start_page_number: Starting page number for page break markers (default: 0)

        Returns:
            The markdown text, or None if the attachment is not HTML or cannot be converted.
        """
        if not self.is_html():
            return None

        content = self.content
        if not content:
            return None

        # Check if content has HTML structure
        if not has_html_content(content):
            return None

        # Use the same approach as Filing.markdown() but with page break support
        clean_html = get_clean_html(content)
        if clean_html:
            return to_markdown(clean_html, include_page_breaks=include_page_breaks, start_page_number=start_page_number)

        return None

    def __rich__(self):
        icon = get_file_icon(self.document_type, self.sequence_number, self.document)
        text = Text.assemble((f"{self.sequence_number:<3} ", "dim italic"),
                             " ",
                             (self.document, "bold"),
                             " ", (self.purpose or self.description, "grey54"),
                             " ",
                             (icon, ""),
                             " ",
                             (self.document_type,
                              "bold deep_sky_blue1" if self.sequence_number == "1" else "")
                             )
        return Panel(text, box=box.ROUNDED, width=200, expand=False)

    def __repr__(self):
        return repr_rich(self.__rich__())

    def __str__(self):
        return repr_rich(self.__rich__())
|
||||
|
||||
|
||||
class Attachments:
    """
    A class to represent the attachments of an SEC filing

    Keeps the document files and the (optional) data files of a filing, plus
    the subset of documents considered primary.
    """

    def __init__(self,
                 document_files: List['Attachment'],
                 data_files: Optional[List['Attachment']],
                 primary_documents: List['Attachment'],
                 sgml: Optional['FilingSGML'] = None):
        self.documents = document_files
        self.data_files = data_files
        self._attachments = document_files + (data_files or [])
        self.primary_documents = primary_documents
        self.sgml = sgml
        # Cursor for the legacy __next__ protocol only; `for` loops do not use it
        self.n = 0

    def __getitem__(self, item: Union[int, str]):
        """
        Get the attachment by sequence number as set in the SEC filing SGML file

        An int or an all-digit string is treated as a sequence number; any
        other string is matched against the document name.

        Raises:
            KeyError: if no attachment matches.
        """
        if isinstance(item, int) or item.isdigit():
            return self.get_by_sequence(item)
        elif isinstance(item, str):
            for doc in self._attachments:
                if doc.document == item:
                    return doc
        raise KeyError(f"Document not found: {item}")

    def get_by_sequence(self, sequence: Union[str, int]):
        """
        Get the attachment by sequence number starting at 1
        The sequence number is the exact sequence number in the filing

        Raises:
            KeyError: if no attachment has that sequence number.
        """
        wanted = str(sequence)
        for doc in self._attachments:
            if doc.sequence_number == wanted:
                return doc
        raise KeyError(f"Document not found: {sequence}")

    def get_by_index(self, index: int):
        """
        Get the attachment by its zero-based position in the combined list
        of document files followed by data files
        """
        return self._attachments[index]

    def get_report(self, filename: str) -> Optional['Report']:
        """
        Get a report by filename, or None when no SGML/reports are available
        """
        if self.sgml:
            reports = self.sgml.filing_summary.reports
            if reports:
                return reports.get_by_filename(filename)
        return None

    @property
    def primary_html_document(self) -> Optional['Attachment']:
        """Get the primary html document on the filing"""
        for doc in self.primary_documents:
            if doc.display_extension == ".html" or doc.display_extension == '.htm':
                return doc
        # Most filings have html primary documents. Some don't.
        # E.g. Form's 3,4,5 do when loaded directly from edgar but not when loaded from local files
        # However, there are unusual filings with endings like ".fil" that require a return. So return the first one
        if len(self.primary_documents) > 0:
            return self.primary_documents[0]
        return None

    @property
    def primary_xml_document(self) -> Optional['Attachment']:
        """Get the primary xml document on the filing"""
        for doc in self.primary_documents:
            if doc.display_extension == ".xml":
                return doc
        return None

    @property
    def text_document(self):
        """The document described as "Complete submission text file", searched from the end; None if absent."""
        for doc in reversed(self.documents):
            if doc.description == "Complete submission text file":
                return doc
        return None

    @property
    def exhibits(self):
        """
        Get all the exhibits in the filing.
        This is the primary document plus all the documents listed as EX-XX
        """
        primary = self.primary_html_document
        # Leave the primary slot out entirely when the filing has none,
        # instead of placing None among the attachments
        primary_documents = [primary] if primary is not None else []
        exhibits_documents = self.query("re.match('EX-', document_type)", False).documents
        return Attachments(
            document_files=primary_documents + exhibits_documents,
            data_files=[],
            primary_documents=primary_documents,
            sgml=self.sgml)

    @property
    def graphics(self):
        """Attachments whose document type is 'GRAPHIC'."""
        return self.query("document_type=='GRAPHIC'")

    def query(self, query_str: str, include_data_files: bool = True):
        """
        Query attachments based on a simple query string.
        Supports conditions on 'document', 'description', and 'document_type'.
        Example query: "document.endswith('.htm') and 'RELEASE' in description and document_type in ['EX-99.1', 'EX-99', 'EX-99.01']"

        SECURITY: the query string is executed with eval(). Only pass query
        strings written by the developer, never untrusted input.

        Returns:
            A new Attachments holding the matching documents (and data files,
            unless include_data_files is False).
        """
        allowed_attrs = {'document', 'description', 'document_type'}

        # Precompile regex for finding attributes and match patterns
        attr_regex = re.compile(rf"\b({'|'.join(allowed_attrs)})\b")
        match_regex = re.compile(r"re\.match\('(.*)', (\w+)\)")

        def safe_eval(attachment, query):
            # Replace attribute references with attachment attributes
            query = attr_regex.sub(lambda m: f"attachment.{m.group(0)}", query)

            # Handle regex match explicitly
            match = match_regex.search(query)
            if match:
                pattern, attr = match.groups()
                query = query.replace(f"re.match('{pattern}', {attr})",
                                      f"re.match(r'{pattern}', attachment.{attr})")

            # NOTE: despite the helper's name this is a plain eval of the rewritten query
            return eval(query, {"re": re, "attachment": attachment})

        # Evaluate the query for documents and data files
        new_documents = [attachment for attachment in self.documents if safe_eval(attachment, query_str)]
        if include_data_files:
            new_data_files = [attachment for attachment in self.data_files if
                              safe_eval(attachment, query_str)] if self.data_files else None
        else:
            new_data_files = []

        return Attachments(document_files=new_documents, data_files=new_data_files,
                           primary_documents=self.primary_documents, sgml=self.sgml)

    @staticmethod
    async def _download_all_attachments(attachments: List['Attachment']):
        """Download every attachment concurrently; results are in the same order as *attachments*."""
        import asyncio

        async with async_http_client() as client:
            return await asyncio.gather(
                *[download_file_async(client, attachment.url, as_text=attachment.is_text()) for attachment in attachments])

    def download(self, path: Union[str, Path], archive: bool = False):
        """
        Download all the attachments to a specified path.
        When the filing SGML is available the download is delegated to it.
        Otherwise, without archive, the path must be an existing directory and
        each file is saved with its original name in that directory.
        If archive is True, the attachments are saved in a zip file and the
        path must be the zip file name, not a directory.
        path: str or Path - The path to save the attachments
        archive: bool (default False) - If True, save the attachments in a zip file

        Raises:
            ValueError: if the path does not fit the chosen mode.
        """
        if self.sgml:
            self.sgml.download(path, archive)
            return

        import asyncio
        # NOTE(review): get_event_loop() without a running loop is deprecated on
        # newer Python versions - confirm before upgrading the interpreter
        loop = asyncio.get_event_loop()
        downloaded_files = loop.run_until_complete(Attachments._download_all_attachments(self._attachments))

        # Ensure path is a Path object
        path = Path(path)

        if archive:
            if path.is_dir():
                raise ValueError("Path must be a zip file name to create zipfile")
            with zipfile.ZipFile(path, 'w') as zipf:
                for attachment, downloaded in zip(self._attachments, downloaded_files, strict=False):
                    if isinstance(downloaded, bytes):
                        zipf.writestr(attachment.document, downloaded)
                    else:
                        zipf.writestr(attachment.document, downloaded.encode('utf-8'))
        else:
            if not path.is_dir():
                raise ValueError("Path must be a directory")
            for attachment, downloaded in zip(self._attachments, downloaded_files, strict=False):
                file_path = path / attachment.document
                if isinstance(downloaded, bytes):
                    file_path.write_bytes(downloaded)
                else:
                    file_path.write_text(downloaded, encoding='utf-8')

    def serve(self, port: int = 8000) -> Tuple[Thread, socketserver.TCPServer, str]:
        """
        Serve the attachment on a local server
        The server can be stopped using CTRL-C
        port: int (default 8000) - The port to serve the attachment

        Blocks until the server thread ends, then returns the thread, the
        server and the URL that was opened. The files are served from a
        temporary directory that is removed when this method returns.
        """
        with tempfile.TemporaryDirectory() as temp_dir:
            temp_path = Path(temp_dir)
            self.download(temp_path)

            class Handler(http.server.SimpleHTTPRequestHandler):
                def __init__(self, *args, **kwargs):
                    super().__init__(*args, directory=temp_dir, **kwargs)

            primary_html = os.path.basename(self.primary_html_document.path)

            url = f'http://localhost:{port}/{primary_html}'

            httpd = socketserver.TCPServer(("", port), Handler)

            def serve_forever():
                with httpd:
                    httpd.serve_forever()

            thread = Thread(target=serve_forever)
            thread.daemon = True
            thread.start()

            # Wait for the server to start
            time.sleep(1)

            def signal_handler(sig, frame):
                httpd.shutdown()
                thread.join()

            signal.signal(signal.SIGINT, signal_handler)
            webbrowser.open(url)

            # Keep the main thread alive to handle signals
            while thread.is_alive():
                time.sleep(0.1)

            return thread, httpd, url

    def markdown(self, include_page_breaks: bool = False, start_page_number: int = 0) -> Dict[str, str]:
        """
        Convert all HTML attachments to markdown format.

        Args:
            include_page_breaks: If True, include page break delimiters in the markdown
            start_page_number: Starting page number for page break markers (default: 0)

        Returns:
            A dictionary mapping attachment document names to their markdown content.
            Only includes attachments that can be successfully converted to markdown.
        """
        markdown_attachments = {}

        for attachment in self._attachments:
            if attachment.is_html():
                md_content = attachment.markdown(include_page_breaks=include_page_breaks, start_page_number=start_page_number)
                if md_content:
                    markdown_attachments[attachment.document] = md_content

        return markdown_attachments

    def __len__(self):
        return len(self._attachments)

    def __iter__(self):
        """Return a fresh, independent iterator over all attachments.

        Each loop gets its own iterator so nested or repeated iteration over
        the same object works. The legacy cursor is still reset so code that
        calls ``iter(x)`` and then ``next(x)`` keeps working.
        """
        self.n = 0
        return iter(self._attachments)

    def __next__(self):
        """Legacy cursor-based iteration, kept for callers that use ``next()`` on the object itself."""
        if self.n < len(self):
            _attachment = self._attachments[self.n]
            self.n += 1
            return _attachment
        raise StopIteration

    def __rich__(self):
        highlight = "bold deep_sky_blue1"

        # Document files
        document_table = Table(Column('Seq', header_style="dim"),
                               Column('Document', header_style="dim"),
                               Column('Description', header_style="dim", min_width=60),
                               Column('Type', header_style="dim", min_width=16),
                               title='Attachments',
                               row_styles=["", "bold"],
                               box=box.SIMPLE_HEAD)
        all_attachments = sorted(self.documents + (self.data_files or []), key=sequence_sort_key)

        for attachment in all_attachments:
            # The row for sequence "1" is highlighted
            is_primary = attachment.sequence_number == "1"
            # Get the file icon for each attachment
            icon = get_file_icon(file_type=attachment.document_type,
                                 sequence=attachment.sequence_number,
                                 filename=attachment.document)
            description = "\n".join(textwrap.wrap(attachment.purpose or attachment.description, 100))
            cells = [attachment.sequence_number, attachment.document, description]
            if is_primary:
                cells = [Text(cell, style=highlight) for cell in cells]
            document_table.add_row(*cells,
                                   Text.assemble((icon, ""), " ",
                                                 (attachment.document_type, highlight if is_primary else "")))

        return document_table

    def __repr__(self):
        return repr_rich(self.__rich__())

    @classmethod
    def load(cls, soup: 'BeautifulSoup'):
        """
        Load the attachments from the SEC filing home page

        The first table with class "tableFile" lists the document files, the
        second one (if present) the data files.
        """
        tables = soup.find_all('table', class_='tableFile')

        def parse_table(table, documents: bool):
            min_seq = None
            # The list of attachments which are primary. This is the first document in the filing
            # Plus additional document with the same sequence number
            primary_documents: List['Attachment'] = []

            rows = table.find_all('tr')[1:]  # Skip header row
            attachments = []
            for row in rows:
                cols = row.find_all('td')
                sequence_number = cols[0].text.strip().replace('\xa0', '-')

                description = cols[1].text.strip()
                # The document text is the text of the document link.
                document_text = cols[2].text.strip()
                document = document_text.split(' ')[0].strip()
                iXbrl = 'iXBRL' in document_text
                # A row without a link gets an empty path instead of raising
                link = cols[2].a
                path = link['href'].strip() if link is not None else ''
                document_type = cols[3].text.strip()
                size = cols[4].text.strip()

                try:
                    size = int(size)
                except ValueError:
                    size = None

                attachment = Attachment(
                    sequence_number=sequence_number,
                    description=description,
                    document=document,
                    ixbrl=iXbrl,
                    path=path,
                    document_type=document_type,
                    size=size
                )
                # Add the attachment to the list
                attachments.append(attachment)

                # The first document, and any that share its sequence number, are primary
                if documents:
                    if min_seq is None:
                        min_seq = sequence_number
                    if sequence_number == min_seq:
                        primary_documents.append(attachment)
            return attachments, primary_documents

        if tables:
            document_files, primary_documents = parse_table(tables[0], documents=True)
        else:
            document_files, primary_documents = [], []

        if len(tables) > 1:
            data_files, _ = parse_table(tables[1], documents=False)
        else:
            data_files = None

        return cls(document_files, data_files, primary_documents)
|
||||
|
||||
|
||||
class AttachmentServer:
    """Serve a filing's attachments from a temporary directory over local HTTP.

    Construction downloads the attachments, creates the server bound to
    ``port`` and installs a SIGINT handler; ``start()`` begins serving and
    opens the primary document in the browser.
    """

    def __init__(self, attachments: 'Attachments', port: int = 8000):
        self.attachments = attachments
        self.port = port
        self.thread = None
        self.httpd = None
        self.url = None
        # Owned here so stop() can remove the downloaded files
        self._temp_dir = None
        self.setup()

    def setup(self):
        """Download the attachments and prepare (but do not start) the server thread."""
        temp_dir = tempfile.TemporaryDirectory()
        self._temp_dir = temp_dir
        serve_from = temp_dir.name
        self.attachments.download(Path(serve_from))

        class Handler(http.server.SimpleHTTPRequestHandler):
            def __init__(self, *args, **kwargs):
                super().__init__(*args, directory=serve_from, **kwargs)

        primary_html = os.path.basename(self.attachments.primary_html_document.path)

        self.url = f'http://localhost:{self.port}/{primary_html}'

        self.httpd = socketserver.TCPServer(("", self.port), Handler)

        def serve_forever():
            with self.httpd:
                self.httpd.serve_forever()

        self.thread = Thread(target=serve_forever)
        self.thread.daemon = True

        signal.signal(signal.SIGINT, self.signal_handler)

    def start(self):
        """Start serving, open the browser and block until the server thread ends."""
        self.thread.start()
        webbrowser.open(self.url)

        # Keep the main thread alive to handle signals
        while self.thread.is_alive():
            time.sleep(0.1)

    def stop(self):
        """Shut the server down and remove the temporary files.

        Safe to call before start(): shutdown() blocks until serve_forever()
        returns, so it is only used when the server thread is running;
        otherwise the listening socket is simply closed.
        """
        if self.thread is not None and self.thread.is_alive():
            self.httpd.shutdown()
            self.thread.join()
        elif self.httpd is not None:
            self.httpd.server_close()

        if self._temp_dir is not None:
            self._temp_dir.cleanup()
            self._temp_dir = None

    def signal_handler(self, sig, frame):
        """SIGINT handler: stop the server, then exit the program with status 0."""
        self.stop()
        # Raised directly rather than via the site-provided exit() helper
        raise SystemExit(0)
|
||||
|
||||
|
||||
|
||||
class FilingHomepage:
|
||||
|
||||
def __init__(self,
|
||||
url: str,
|
||||
soup: BeautifulSoup,
|
||||
attachments: Attachments):
|
||||
self.attachments = attachments
|
||||
self.url = url
|
||||
self._soup = soup
|
||||
|
||||
def open(self):
|
||||
webbrowser.open(self.url)
|
||||
|
||||
@property
|
||||
def documents(self):
|
||||
return self.attachments.documents
|
||||
|
||||
@property
|
||||
def datafiles(self):
|
||||
return self.attachments.data_files
|
||||
|
||||
@property
|
||||
def primary_html_document(self) -> Optional[Attachment]:
|
||||
"""Get the primary html document on the filing"""
|
||||
return self.attachments.primary_html_document
|
||||
|
||||
@property
|
||||
def primary_xml_document(self) -> Optional[Attachment]:
|
||||
"""Get the primary xml document on the filing"""
|
||||
return self.attachments.primary_xml_document
|
||||
|
||||
@property
|
||||
def primary_documents(self):
|
||||
return self.attachments.primary_documents
|
||||
|
||||
@property
|
||||
def text_document(self):
|
||||
return self.attachments.text_document
|
||||
|
||||
@property
|
||||
def xbrl_document(self):
|
||||
"""Find and return the xbrl document."""
|
||||
|
||||
if self.datafiles is None:
|
||||
return None
|
||||
for datafile in reversed(self.datafiles):
|
||||
if datafile.description in xbrl_document_types:
|
||||
return datafile
|
||||
|
||||
@lru_cache(maxsize=1)
|
||||
def get_filers(self):
|
||||
filer_divs = self._soup.find_all("div", id="filerDiv")
|
||||
filer_infos = []
|
||||
for filer_div in filer_divs:
|
||||
|
||||
# Get the company name
|
||||
company_info_div = filer_div.find("div", class_="companyInfo")
|
||||
|
||||
company_name_span = company_info_div.find("span", class_="companyName")
|
||||
|
||||
if company_name_span:
|
||||
full_text = company_name_span.text.strip()
|
||||
# Split the text into company name and CIK
|
||||
parts = full_text.split('CIK: ')
|
||||
company_name = parts[0].strip()
|
||||
cik = parts[1].split()[0] if len(parts) > 1 else ""
|
||||
|
||||
# Clean up the company name
|
||||
company_name = re.sub("\n", "", company_name).replace("(Filer)", "").strip()
|
||||
else:
|
||||
company_name = ""
|
||||
cik = ""
|
||||
|
||||
# Get the identification information
|
||||
ident_info_div = company_info_div.find("p", class_="identInfo")
|
||||
|
||||
# Replace <br> with newlines
|
||||
for br in ident_info_div.find_all("br"):
|
||||
br.replace_with("\n")
|
||||
|
||||
identification = ident_info_div.text
|
||||
|
||||
# Get the mailing information
|
||||
mailer_divs = filer_div.find_all("div", class_="mailer")
|
||||
# For each mailed_div.text remove multiple spaces after a newline
|
||||
|
||||
addresses = [re.sub(r'\n\s+', '\n', mailer_div.text.strip())
|
||||
for mailer_div in mailer_divs]
|
||||
|
||||
# Create the filer info
|
||||
filer_info = FilerInfo(company_name=company_name, cik=cik, identification=identification, addresses=addresses)
|
||||
|
||||
filer_infos.append(filer_info)
|
||||
|
||||
return filer_infos
|
||||
|
||||
@property
def period_of_report(self) -> Optional[str]:
    """Get the period of report.

    Returns None when ``get_filing_dates()`` finds no dates or the page has
    no period-of-report grouping.
    """
    filing_dates = self.get_filing_dates()
    if filing_dates is None:
        # get_filing_dates() returns None when the page has no form groupings
        return None
    return filing_dates[2]
|
||||
|
||||
@lru_cache(maxsize=None)
def get_filing_dates(self) -> Optional[Tuple[str, str, Optional[str]]]:
    """Read the filing, accepted and period dates from the homepage.

    The first ``div.formGrouping`` supplies the filing date and accepted date
    from its first two ``div.info`` children. The second grouping, when
    present, supplies the period of report from its first ``div.info``.

    Returns:
        ``(filing_date, accepted_date, period)`` where ``period`` may be None,
        or None when the page has no form groupings or the first grouping
        does not contain both dates.
    """
    # NOTE(review): lru_cache on a method keeps every instance it has seen
    # referenced for the lifetime of the cache.
    grouping_divs = self._soup.find_all("div", class_="formGrouping")
    if not grouping_divs:
        return None

    info_divs = grouping_divs[0].find_all("div", class_="info")
    if len(info_divs) < 2:
        # Both dates are required; report "not found" instead of raising IndexError
        return None
    filing_date = info_divs[0].text.strip()
    accepted_date = info_divs[1].text.strip()

    period = None
    if len(grouping_divs) > 1:
        first_info_div = grouping_divs[1].find("div", class_="info")
        if first_info_div is not None:
            period = first_info_div.text.strip()
    return filing_date, accepted_date, period
|
||||
|
||||
@classmethod
def load(cls, url: str):
    """Fetch the page at ``url``, parse it and build an instance from it.

    The response text is parsed with the 'html.parser' backend and the
    attachments are loaded from the parsed page.
    """
    page = get_with_retry(url)
    parsed_page = BeautifulSoup(page.text, 'html.parser')
    return cls(url, parsed_page, Attachments.load(parsed_page))
|
||||
|
||||
def __repr__(self):
    """Return the rich rendering of this object as a string."""
    renderable = self.__rich__()
    return repr_rich(renderable)
|
||||
|
||||
def __rich__(self):
    """Render the attachments followed by one entry per filer inside a panel."""
    attachments = self.attachments
    filer_renderables = [filer.__rich__() for filer in self.get_filers()]
    return Panel(Group(attachments, Group(*filer_renderables)))
|
||||
930
venv/lib/python3.10/site-packages/edgar/company_reports.py
Normal file
930
venv/lib/python3.10/site-packages/edgar/company_reports.py
Normal file
@@ -0,0 +1,930 @@
|
||||
import re
|
||||
from datetime import datetime
|
||||
from functools import cached_property, lru_cache, partial
|
||||
from typing import Dict, List, Optional
|
||||
|
||||
from rich import box, print
|
||||
from rich.console import Group, Text
|
||||
from rich.padding import Padding
|
||||
from rich.panel import Panel
|
||||
from rich.table import Table
|
||||
from rich.tree import Tree
|
||||
|
||||
from edgar._filings import Attachment, Attachments
|
||||
from edgar._markdown import MarkdownContent
|
||||
from edgar.files.html import Document
|
||||
from edgar.files.html_documents import HtmlDocument
|
||||
from edgar.files.htmltools import ChunkedDocument, adjust_for_empty_items, chunks2df, detect_decimal_items
|
||||
from edgar.financials import Financials
|
||||
from edgar.formatting import datefmt
|
||||
from edgar.richtools import repr_rich, rich_to_text
|
||||
|
||||
# Names exported when this module is star-imported.
__all__ = [
    'TenK',
    'TenQ',
    'TwentyF',
    'CurrentReport',
    'SixK',
    'EightK',
    'PressRelease',
    'PressReleases',
    'is_valid_item_for_filing'
]
|
||||
|
||||
|
||||
class CompanyReport:
    """Base class for report wrappers built around a single filing.

    Exposes basic filing attributes, the extracted financials and item-level
    access to the filing document through a chunked document.
    """

    def __init__(self, filing):
        self._filing = filing

    @property
    def filing_date(self):
        """The filing date of the underlying filing."""
        return self._filing.filing_date

    @property
    def form(self):
        """The form type of the underlying filing."""
        return self._filing.form

    @property
    def company(self):
        """The company of the underlying filing."""
        return self._filing.company

    @property
    def income_statement(self):
        """The income statement from the financials, or None without financials."""
        return self.financials.income_statement() if self.financials else None

    @property
    def balance_sheet(self):
        """The balance sheet from the financials, or None without financials."""
        return self.financials.balance_sheet() if self.financials else None

    @property
    def cash_flow_statement(self):
        """The cash flow statement from the financials, or None without financials."""
        return self.financials.cashflow_statement() if self.financials else None

    @cached_property
    def financials(self):
        """Get the financials for this filing"""
        return Financials.extract(self._filing)

    @property
    def period_of_report(self):
        """The period of report taken from the filing header."""
        return self._filing.header.period_of_report

    @cached_property
    def chunked_document(self):
        """The filing HTML wrapped in a ``ChunkedDocument`` (built once per instance)."""
        return ChunkedDocument(self._filing.html())

    @property
    def doc(self):
        """Alias for ``chunked_document``."""
        return self.chunked_document

    @property
    def items(self) -> List[str]:
        """The items listed by the chunked document."""
        return self.chunked_document.list_items()

    def __getitem__(self, item_or_part: str):
        # Show the item or part from the filing document. e.g. Item 1 Business from 10-K or Part I from 10-Q
        item_text = self.chunked_document[item_or_part]
        return item_text

    def view(self, item_or_part: str):
        """Get the Item or Part from the filing document. e.g. Item 1 Business from 10-K or Part I from 10-Q"""
        item_text = self[item_or_part]
        if item_text:
            print(item_text)

    def __rich__(self):
        return Panel(
            Group(
                self._filing.__rich__(),
                # ``financials`` is a cached_property, so it is read, not called
                self.financials or Text("No financial data available")
            )
        )

    def __repr__(self):
        return repr_rich(self.__rich__())
|
||||
|
||||
|
||||
class FilingStructure:
    """Lookup helper over a ``{PART: {ITEM: info}}`` mapping with upper-case keys."""

    def __init__(self, structure: Dict):
        self.structure = structure

    def get_part(self, part: str):
        """Return the items mapping for ``part`` (case-insensitive), or None."""
        part_key = part.upper()
        return self.structure.get(part_key)

    def get_item(self, item: str, part: str = None):
        """Return the info for ``item`` (case-insensitive), or None.

        With ``part`` given only that part is consulted; otherwise the first
        part that contains the item wins.
        """
        item_key = item.upper()
        if not part:
            for part_items in self.structure.values():
                if item_key in part_items:
                    return part_items[item_key]
            return None
        part_items = self.get_part(part)
        return part_items.get(item_key) if part_items else None

    def is_valid_item(self, item: str, part: str = None):
        """Tell whether ``get_item`` finds something for ``item``."""
        found = self.get_item(item, part)
        return found is not None
|
||||
|
||||
|
||||
class ItemOnlyFilingStructure(FilingStructure):
    """Filing structure that maps items directly to their info, without parts."""

    def get_part(self, part: str):
        """Always None: this structure has no parts."""
        return None

    def get_item(self, item: str, part: str = None):
        """Return the info for ``item`` (case-insensitive); ``part`` is ignored."""
        item_key = item.upper()
        return self.structure.get(item_key)
|
||||
|
||||
|
||||
class TenK(CompanyReport):
    """Report wrapper for 10-K and 10-K/A filings.

    ``structure`` describes the parts and items of the form. It drives the
    structure tree shown by ``get_structure`` and is passed to the id-based
    HTML parser in ``id_parse_document``.
    """
    structure = FilingStructure({
        "PART I": {
            # special case for 10-K
            # Items 1 and 2. Business and Properties
            "ITEM 1": {
                "Title": "Business",
                "Description": "Overview of the company's business operations, products, services, and market environment."
            },
            "ITEM 1A": {
                "Title": "Risk Factors",
                "Description": "Discussion of risks and uncertainties that could materially affect the company's " +
                               "financial condition or results of operations."
            },
            "ITEM 1B": {
                "Title": "Unresolved Staff Comments",
                "Description": "Any comments from the SEC staff on the company's previous filings " +
                               "that remain unresolved."
            },
            "ITEM 1C": {
                "Title": "Cybersecurity",
                "Description": "Cybersecurity risk management, strategy, and governance disclosures."
            },
            "ITEM 2": {
                "Title": "Properties",
                "Description": "Information about the physical properties owned or leased by the company."
            },
            "ITEM 3": {
                "Title": "Legal Proceedings",
                "Description": "Details of significant ongoing legal proceedings."
            },
            "ITEM 4": {
                "Title": "Mine Safety Disclosures",
                "Description": "Relevant for mining companies, disclosures about mine safety and regulatory compliance."
            }
        },
        "PART II": {
            "ITEM 5": {
                "Title": "Market for Registrant’s Common Equity",
                "Description": "Information on the company’s equity, including stock performance " +
                               "and shareholder matters."
            },
            "ITEM 6": {
                "Title": "Selected Financial Data",
                "Description": "Financial data summary for the last five fiscal years."
            },
            "ITEM 7": {
                "Title": "Management’s Discussion and Analysis (MD&A)",
                "Description": "Management’s perspective on the financial condition, changes in financial condition, " +
                               "and results of operations."
            },
            "ITEM 7A": {
                "Title": "Quantitative and Qualitative Disclosures About Market Risk",
                "Description": "Information on the company's exposure to market risk, such as interest rate risk, " +
                               "foreign currency exchange risk, commodity price risk, etc."
            },
            "ITEM 8": {
                "Title": "Financial Statements",
                "Description": "Complete audited financial statements, including balance sheet, income statement, " +
                               "cash flow statement, and notes to the financial statements."
            },
            # NOTE(review): ITEM 9 carries the same title as ITEM 9A; confirm
            # against the Form 10-K item list before relying on this title.
            "ITEM 9": {
                "Title": "Controls and Procedures",
                "Description": "Evaluation of the effectiveness of the design and operation of the company’s disclosure controls and procedures."
            },
            "ITEM 9A": {
                "Title": "Controls and Procedures",
                "Description": "Evaluation of internal controls over financial reporting."
            },
            "ITEM 9B": {
                "Title": "Other Information",
                "Description": "Any other relevant information not covered in other sections."
            },
            "ITEM 9C": {
                "Title": "Disclosure Regarding Foreign Jurisdictions That Prevent Inspections",
                "Description": "Disclosure Regarding Foreign Jurisdictions That Prevent Inspections."
            }
        },
        "PART III": {
            "ITEM 10": {
                "Title": "Directors, Executive Officers, and Corporate Governance",
                "Description": "Information about the company's directors, executive officers, and governance policies."
            },
            "ITEM 11": {
                "Title": "Executive Compensation",
                "Description": "Details of compensation paid to key executives."
            },
            "ITEM 12": {
                "Title": "Security Ownership of Certain Beneficial Owners and Management",
                "Description": "Information about stock ownership of major shareholders, directors, and management."
            },
            "ITEM 13": {
                "Title": "Certain Relationships and Related Transactions, and Director Independence",
                "Description": "Information on transactions between the company and its directors, officers, " +
                               "and significant shareholders."
            },
            "ITEM 14": {
                "Title": "Principal Accounting Fees and Services",
                "Description": "Fees paid to the principal accountant and services rendered."
            }
        },
        "PART IV": {
            "ITEM 15": {
                "Title": "Exhibits, Financial Statement Schedules",
                "Description": "Legal documents and financial schedules that support the financial statements " +
                               "and disclosures."
            },
            "ITEM 16": {
                "Title": "Form 10-K Summary",
                "Description": "Form 10-K Summary"
            }
        }
    })

    def __init__(self, filing):
        assert filing.form in ['10-K', '10-K/A'], f"This form should be a 10-K but was {filing.form}"
        super().__init__(filing)

    @property
    def business(self):
        """The text of Item 1."""
        return self['Item 1']

    @property
    def risk_factors(self):
        """The text of Item 1A."""
        return self['Item 1A']

    @property
    def management_discussion(self):
        """The text of Item 7."""
        return self['Item 7']

    @property
    def directors_officers_and_governance(self):
        """The text of Item 10."""
        return self['Item 10']

    @cached_property
    def chunked_document(self):
        """The filing HTML as a ``ChunkedDocument``, with the filing base dir as ``prefix_src``."""
        return ChunkedDocument(self._filing.html(), prefix_src=self._filing.base_dir)

    # NOTE(review): lru_cache on a method keeps ``self`` referenced by the
    # cache, and maxsize=1 means alternating ``markdown`` values re-parse.
    @lru_cache(maxsize=1)
    def id_parse_document(self, markdown:bool=False):
        """Parse the filing HTML with ``ParsedHtml10K`` using this form's structure."""
        from edgar.files.html_documents_id_parser import ParsedHtml10K
        return ParsedHtml10K().extract_html(self._filing.html(), self.structure, markdown=markdown)

    def __str__(self):
        return f"""TenK('{self.company}')"""

    def __getitem__(self, item_or_part: str):
        """Return the text of an item or part, without a trailing "PART ..." heading line."""
        # Show the item or part from the filing document. e.g. Item 1 Business from 10-K or Part I from 10-Q
        item_text = self.chunked_document[item_or_part]
        if item_text:
            item_text = item_text.rstrip()
            last_line = item_text.split("\n")[-1]
            if re.match(r'^\b(PART\s+[IVXLC]+)\b', last_line):
                # Cut exactly the last line. str.rstrip(chars) treats its
                # argument as a set of characters, not as a suffix.
                item_text = item_text[:len(item_text) - len(last_line)]
        return item_text

    def get_item_with_part(self, part: str, item: str, markdown:bool=True):
        """Return the text of ``item``, optionally scoped to ``part``.

        Without a part the item is looked up in the id-parsed document. With a
        part the chunked document is asked first; when that yields nothing or
        only whitespace, the id-parsed document is used as a fallback.
        """
        if not part:
            return self.id_parse_document(markdown).get(item.lower())
        # Show the item or part from the filing document. e.g. Item 1 Business from 10-K or Part I from 10-Q
        item_text = self.chunked_document.get_item_with_part(part, item, markdown=markdown)
        # remove first line or last line (redundant part information)
        if not item_text or not item_text.strip():
            return self.id_parse_document(markdown).get(part.lower(), {}).get(item.lower())
        return item_text

    def get_structure(self):
        """Build a tree of the form structure, highlighting items found in the filing."""
        # Create the main tree
        tree = Tree("📄 ")

        # Get the actual items from the filing
        actual_items = self.items

        # Create a mapping of uppercase to actual case items
        case_mapping = {item.upper(): item for item in actual_items}

        # Process each part in the structure
        for part, items in self.structure.structure.items():
            # Create a branch for each part
            part_tree = tree.add(f"[bold blue]{part}[/]")

            # Add items under each part
            for item_key, item_data in items.items():
                # Check if this item exists in the actual filing
                if item_key in case_mapping:
                    # Use the actual case from the filing
                    actual_item = case_mapping[item_key]
                    item_text = Text.assemble(
                        (f"{actual_item:<7} ", "bold green"),
                        (f"{item_data['Title']}", "bold"),
                    )
                else:
                    # Item doesn't exist - show in grey with original structure case
                    item_text = Text.assemble(
                        (f"{item_key}: ", "dim"),
                        (f"{item_data['Title']}", "dim"),
                    )

                part_tree.add(item_text)

        return tree

    def __rich__(self):
        title = Text.assemble(
            (f"{self.company}", "bold deep_sky_blue1"),
            (" ", ""),
            (f"{self.form}", "bold"),
        )
        periods = Text.assemble(
            ("Period ending ", "grey70"),
            (f"{datefmt(self.period_of_report, '%B %d, %Y')}", "bold"),
            (" filed on ", "grey70"),
            (f"{datefmt(self.filing_date, '%B %d, %Y')}", "bold"),
        )
        panel = Panel(
            Group(
                periods,
                Padding(" ", (1, 0, 0, 0)),
                self.get_structure(),
                Padding(" ", (1, 0, 0, 0)),
                self.financials or Text("No financial data available", style="italic")
            ),
            title=title,
            box=box.ROUNDED,
        )
        return panel
|
||||
|
||||
|
||||
class TenQ(CompanyReport):
    """Report wrapper for 10-Q and 10-Q/A filings.

    ``structure`` describes the parts and items of the form. It drives the
    structure tree shown by ``get_structure`` and is passed to the id-based
    HTML parser in ``id_parse_document``.
    """
    structure = FilingStructure({
        "PART I": {  # Financial Information
            "ITEM 1": {
                "Title": "Financial Statements",
                "Description": "Unaudited financial statements including balance sheets, income statements, " +
                               "and cash flow statements."
            },
            "ITEM 2": {
                "Title": "Management’s Discussion and Analysis of Financial Condition and Results of Operations (MD&A)",
                "Description": "Management’s perspective on the financial condition and results of operations."
            },
            "ITEM 3": {
                "Title": "Quantitative and Qualitative Disclosures About Market Risk",
                "Description": "Information on the company's exposure to market risk."
            },
            "ITEM 4": {
                "Title": "Controls and Procedures",
                "Description": "Evaluation of the effectiveness of disclosure controls and procedures."
            }
        },
        "PART II": {  # Other Information
            "ITEM 1": {
                "Title": "Legal Proceedings",
                "Description": "Brief description of any significant pending legal proceedings."
            },
            "ITEM 1A": {
                "Title": "Risk Factors",
                "Description": "An update on risk factors that may affect future results."
            },
            "ITEM 2": {
                "Title": "Unregistered Sales of Equity Securities and Use of Proceeds",
                "Description": "Details of unregistered sales of equity securities."
            },
            "ITEM 3": {
                "Title": "Defaults Upon Senior Securities",
                "Description": "Information regarding any defaults on senior securities."
            },
            "ITEM 4": {
                "Title": "Mine Safety Disclosures",
                "Description": "Required for companies with mining operations."
            },
            "ITEM 5": {
                "Title": "Other Information",
                "Description": "Any other information that should be disclosed to investors."
            },
            "ITEM 6": {
                "Title": "Exhibits",
                "Description": "List of exhibits required by Item 601 of Regulation S-K."
            }
        }
    })

    def __init__(self, filing):
        assert filing.form in ['10-Q', '10-Q/A'], f"This form should be a 10-Q but was {filing.form}"
        super().__init__(filing)

    def __str__(self):
        return f"""TenQ('{self.company}')"""

    def __getitem__(self, item_or_part: str):
        # Show the item or part from the filing document. e.g. Item 1 Business from 10-K or Part I from 10-Q
        item_text = self.chunked_document[item_or_part]
        return item_text

    def get_item_with_part(self, part: str, item: str, markdown:bool=True):
        """Return the text of ``item``, optionally scoped to ``part``.

        Without a part, every part of the id-parsed document is searched and
        the first match is returned (None when no part has the item). With a
        part the chunked document is asked first; when that yields nothing or
        only whitespace, the id-parsed document is used as a fallback.
        """
        if not part:
            # The id-parsed document is indexed by part and then by item (see
            # the fallback below), so without a part each part is searched.
            # presumably every value is a dict of items; non-dict values are skipped
            wanted = item.lower()
            for part_items in self.id_parse_document(markdown).values():
                if isinstance(part_items, dict) and wanted in part_items:
                    return part_items[wanted]
            return None
        # Show the item or part from the filing document. e.g. Item 1 Business from 10-K or Part I from 10-Q
        item_text = self.chunked_document.get_item_with_part(part, item, markdown=markdown)
        # remove first line or last line (redundant part information)
        if not item_text or not item_text.strip():
            return self.id_parse_document(markdown).get(part.lower(), {}).get(item.lower())
        return item_text

    # NOTE(review): lru_cache on a method keeps ``self`` referenced by the
    # cache, and maxsize=1 means alternating ``markdown`` values re-parse.
    @lru_cache(maxsize=1)
    def id_parse_document(self, markdown:bool=True):
        """Parse the filing HTML with ``ParsedHtml10Q`` using this form's structure."""
        from edgar.files.html_documents_id_parser import ParsedHtml10Q
        return ParsedHtml10Q().extract_html(self._filing.html(), self.structure, markdown=markdown)

    @cached_property
    def chunked_document(self):
        """The filing HTML as a ``ChunkedDocument``, with the filing base dir as ``prefix_src``."""
        return ChunkedDocument(self._filing.html(), prefix_src=self._filing.base_dir)

    def get_structure(self):
        """Build a tree of the form structure, highlighting items found in the filing."""
        # Create the main tree
        tree = Tree("📄 ")

        # Get the actual items from the filing
        actual_items = self.items

        # Create a mapping of uppercase to actual case items
        case_mapping = {item.upper(): item for item in actual_items}

        # Process each part in the structure
        for part, items in self.structure.structure.items():
            # Create a branch for each part
            part_tree = tree.add(f"[bold blue]{part}[/]")

            # Add items under each part
            for item_key, item_data in items.items():
                # Check if this item exists in the actual filing
                if item_key in case_mapping:
                    # Use the actual case from the filing
                    actual_item = case_mapping[item_key]
                    item_text = Text.assemble(
                        (f"{actual_item:<7} ", "bold green"),
                        (f"{item_data['Title']}", "bold"),
                    )
                else:
                    # Item doesn't exist - show in grey with original structure case
                    item_text = Text.assemble(
                        (f"{item_key}: ", "dim"),
                        (f"{item_data['Title']}", "dim"),
                    )

                part_tree.add(item_text)

        return tree

    def __rich__(self):
        title = Text.assemble(
            (f"{self.company}", "bold deep_sky_blue1"),
            (" ", ""),
            (f"{self.form}", "bold"),
        )
        periods = Text.assemble(
            ("Period ending ", "grey70"),
            (f"{datefmt(self.period_of_report, '%B %d, %Y')}", "bold"),
            (" filed on ", "grey70"),
            (f"{datefmt(self.filing_date, '%B %d, %Y')}", "bold"),
        )
        panel = Panel(
            Group(
                periods,
                Padding(" ", (1, 0, 0, 0)),
                self.get_structure(),
                Padding(" ", (1, 0, 0, 0)),
                self.financials or Text("No financial data available", style="italic")
            ),
            title=title,
            box=box.ROUNDED,
        )
        return panel
|
||||
|
||||
|
||||
class TwentyF(CompanyReport):
    """Report wrapper for 20-F and 20-F/A filings; ``structure`` lists the form's parts and items."""
    structure = FilingStructure({
        "PART I": {
            "ITEM 1": {
                "Title": "Identity of Directors, Senior Management, and Advisers",
                "Description": "Information about the company's directors, senior management, and advisers.",
            },
            "ITEM 2": {
                "Title": "Offer Statistics and Expected Timetable",
                "Description": "Details on recent and expected offers of securities.",
            },
            "ITEM 3": {
                "Title": "Key Information",
                "Description": "Financial and other key information about the company, including risk factors and ratios.",
            },
            "ITEM 4": {
                "Title": "Information on the Company",
                "Description": "Detailed information about the company's operations and properties.",
            },
            "ITEM 4A": {
                "Title": "Unresolved Staff Comments",
                "Description": (
                    "Any comments from the SEC staff on the company’s previous filings that "
                    "remain unresolved."
                ),
            },
        },
        "PART II": {
            "ITEM 5": {
                "Title": "Operating and Financial Review and Prospects",
                "Description": "Management’s discussion and analysis of financial condition and results of operations.",
            },
            "ITEM 6": {
                "Title": "Directors, Senior Management, and Employees",
                "Description": "Information about the company's directors, senior management, and employees.",
            },
            "ITEM 7": {
                "Title": "Major Shareholders and Related Party Transactions",
                "Description": "Information about major shareholders and transactions with related parties.",
            },
            "ITEM 8": {
                "Title": "Financial Information",
                "Description": "Audited financial statements and supplementary financial information.",
            },
            "ITEM 9": {
                "Title": "The Offer and Listing",
                "Description": "Details on the company's securities and markets where they are traded.",
            },
        },
        "PART III": {
            "ITEM 10": {
                "Title": "Additional Information",
                "Description": "Additional information such as share capital, memoranda, and articles of association.",
            },
            "ITEM 11": {
                "Title": "Quantitative and Qualitative Disclosures About Market Risk",
                "Description": "Information on the company's exposure to market risk.",
            },
            "ITEM 12": {
                "Title": "Description of Securities Other Than Equity Securities",
                "Description": "Detailed information on securities other than equity.",
            },
        },
        "PART IV": {
            "ITEM 13": {
                "Title": "Defaults, Dividend Arrearages, and Delinquencies",
                "Description": "Information about defaults on payments and arrearages.",
            },
            "ITEM 14": {
                "Title": "Material Modifications to the Rights of Security Holders and Use of Proceeds",
                "Description": "Details on any modifications to the rights of security holders.",
            },
            "ITEM 15": {
                "Title": "Controls and Procedures",
                "Description": "Assessment of the effectiveness of disclosure controls and internal controls over financial reporting.",
            },
            "ITEM 16": {
                "Title": "Various Disclosures",
                "Description": (
                    "Includes disclosures related to audit committee financial experts, code of ethics, "
                    "principal accountant fees and services, and other corporate governance matters."
                ),
            },
        },
        "PART V": {
            "ITEM 17": {
                "Title": "Financial Statements",
                "Description": "Financial statements prepared in accordance with or reconciled to U.S. GAAP or IFRS.",
            },
            "ITEM 18": {
                "Title": "Financial Statements",
                "Description": (
                    "If different from Item 17, financial statements prepared in accordance with "
                    "home country standards."
                ),
            },
            "ITEM 19": {
                "Title": "Exhibits",
                "Description": "Legal and financial documents supporting the information in the report.",
            },
        },
    })

    def __init__(self, filing):
        # Only 20-F filings and their amendments are accepted
        assert filing.form in ('20-F', '20-F/A'), f"This form should be a 20-F but was {filing.form}"
        super().__init__(filing)

    def __str__(self):
        return "TwentyF('{}')".format(self.company)
|
||||
|
||||
|
||||
class CurrentReport(CompanyReport):
|
||||
structure = ItemOnlyFilingStructure({
|
||||
"ITEM 1.01": {
|
||||
"Title": "Entry into a Material Definitive Agreement",
|
||||
"Description": "Reports any material agreement not made in the ordinary course of business."
|
||||
},
|
||||
"ITEM 1.02": {
|
||||
"Title": "Termination of a Material Definitive Agreement",
|
||||
"Description": "Reports the termination of any material agreement."
|
||||
},
|
||||
"ITEM 1.03": {
|
||||
"Title": "Bankruptcy or Receivership",
|
||||
"Description": "Reports any bankruptcy or receivership."
|
||||
},
|
||||
"ITEM 2.01": {"Title": "Completion of Acquisition or Disposition of Assets",
|
||||
"Description": "Reports the completion of an acquisition or disposition of a significant " +
|
||||
"amount of assets."},
|
||||
"ITEM 2.02": {"Title": "Results of Operations and Financial Condition",
|
||||
"Description": "Reports on the company's results of operations and financial condition."},
|
||||
"ITEM 2.03": {
|
||||
"Title": "Creation of a Direct Financial Obligation or an Obligation under an Off-Balance Sheet " +
|
||||
"Arrangement of a Registrant",
|
||||
"Description": "Reports the creation of a direct financial obligation."},
|
||||
"ITEM 2.04": {
|
||||
"Title": "Triggering Events That Accelerate or Increase a Direct Financial Obligation or an Obligation "
|
||||
+ "under an Off-Balance Sheet Arrangement",
|
||||
"Description": "Reports any triggering events."},
|
||||
"ITEM 2.05": {"Title": "Costs Associated with Exit or Disposal Activities",
|
||||
"Description": "Reports costs related to exit or disposal activities."},
|
||||
"ITEM 2.06": {"Title": "Material Impairments", "Description": "Reports on any material impairments."},
|
||||
"ITEM 3.01": {
|
||||
"Title": "Notice of Delisting or Failure to Satisfy a Continued Listing Rule or Standard; " +
|
||||
"Transfer of Listing",
|
||||
"Description": "Reports on delisting or failure to satisfy listing rules."},
|
||||
"ITEM 3.02": {"Title": "Unregistered Sales of Equity Securities",
|
||||
"Description": "Reports on the sale of unregistered equity securities."},
|
||||
"ITEM 3.03": {"Title": "Material Modification to Rights of Security Holders",
|
||||
"Description": "Reports on any modifications to the rights of security holders."},
|
||||
"ITEM 4.01": {"Title": "Changes in Registrant's Certifying Accountant",
|
||||
"Description": "Reports any change in the company's accountant."},
|
||||
"ITEM 4.02": {
|
||||
"Title": "Non-Reliance on Previously Issued Financial Statements or a Related Audit Report or " +
|
||||
"Completed Interim Review",
|
||||
"Description": "Reports on non-reliance on previously issued financial statements."},
|
||||
"ITEM 5.01": {"Title": "Changes in Control of Registrant",
|
||||
"Description": "Reports changes in control of the company."},
|
||||
"ITEM 5.02": {
|
||||
"Title": "Departure of Directors or Certain Officers; Election of Directors; Appointment of Certain " +
|
||||
"Officers",
|
||||
"Description": "Compensatory Arrangements of Certain Officers: Reports any changes in the company's " +
|
||||
"directors or certain officers."},
|
||||
"ITEM 5.03": {"Title": "Amendments to Articles of Incorporation or Bylaws; Change in Fiscal Year",
|
||||
"Description": "Reports on amendments to articles of incorporation or bylaws."},
|
||||
"ITEM 5.04": {
|
||||
"Title": "Temporary Suspension of Trading Under Registrant’s Employee Benefit Plans",
|
||||
"Description": "Reports on the temporary suspension of trading under the company’s employee benefit plans."
|
||||
},
|
||||
"ITEM 5.05": {
|
||||
"Title": "Amendment to the Registrant’s Code of Ethics, or Waiver of a Provision of the Code of Ethics",
|
||||
"Description": "Reports on amendments or waivers to the code of ethics."},
|
||||
"ITEM 5.06": {"Title": "Change in Shell Company Status",
|
||||
"Description": "Reports a change in the company's shell company status."},
|
||||
"ITEM 5.07": {"Title": "Submission of Matters to a Vote of Security Holders",
|
||||
"Description": "Reports on matters submitted to a vote of security holders."},
|
||||
"ITEM 5.08": {"Title": "Shareholder Director Nominations",
|
||||
"Description": "Reports on shareholder director nominations."},
|
||||
"ITEM 6.01": {"Title": "ABS Informational and Computational Material",
|
||||
"Description": "Reports ABS informational and computational material."},
|
||||
"ITEM 6.02": {"Title": "Change of Servicer or Trustee",
|
||||
"Description": "Reports on the change of servicer or trustee."},
|
||||
"ITEM 6.03": {"Title": "Change in Credit Enhancement or Other External Support",
|
||||
"Description": "Reports on changes in credit enhancement or external support."},
|
||||
"ITEM 6.04": {"Title": "Failure to Make a Required Distribution",
|
||||
"Description": "Reports on the failure to make a required distribution."},
|
||||
"ITEM 6.05": {"Title": "Securities Act Updating Disclosure",
|
||||
"Description": "Reports on Securities Act updating disclosure."},
|
||||
"ITEM 9.01": {
|
||||
"Title": "Financial Statements and Exhibits",
|
||||
"Description": "Reports financial statements and other exhibits related to the events reported in the 8-K."
|
||||
}
|
||||
})
|
||||
|
||||
def __init__(self, filing):
    """Wrap ``filing``, which must be an 8-K or 6-K form or an amendment of one."""
    # The message names both accepted form families, matching the check itself
    assert filing.form in ['8-K', '8-K/A', '6-K', '6-K/A'], \
        f"This form should be an 8-K or 6-K but was {filing.form}"
    super().__init__(filing)
|
||||
|
||||
@property
def has_press_release(self):
    """True when ``press_releases`` returns something other than None."""
    releases = self.press_releases
    return releases is not None
|
||||
|
||||
@property
def press_releases(self):
    """Return the press release attachments wrapped in ``PressReleases``, or None.

    The attachments are queried for .htm documents that either have a
    description matching '.*RELEASE' or one of the EX-99 document types
    listed in the query.
    """
    attachments: Attachments = self._filing.attachments
    # This query for press release currently includes EX-99, EX-99.1, EX-99.01 but not EX-99.2
    # Here is what we think so far
    is_html = "document.endswith('.htm')"
    looks_like_release = "re.match('.*RELEASE', description)"
    is_ex_99 = "document_type in ['EX-99.1', 'EX-99', 'EX-99.01']"
    query = "{} and ({} or {})".format(is_html, looks_like_release, is_ex_99)
    matches = attachments.query(query)
    if not matches:
        return None
    return PressReleases(matches)
|
||||
|
||||
@cached_property
def chunked_document(self):
    """Build the chunked document for the filing HTML, or None without HTML.

    Chunking uses ``chunks2df`` preconfigured with the decimal item detector,
    the empty-item adjuster and this class's item structure.
    """
    html = self._filing.html()
    if html:
        chunker = partial(
            chunks2df,
            item_detector=detect_decimal_items,
            item_adjuster=adjust_for_empty_items,
            item_structure=self.structure,
        )
        return ChunkedDocument(html, chunk_fn=chunker)
    return None
|
||||
|
||||
@property
def doc(self):
    """Alias for ``chunked_document``."""
    document = self.chunked_document
    return document
|
||||
|
||||
@property
def items(self) -> List[str]:
    """The items listed by the chunked document, or [] when there is none."""
    if not self.chunked_document:
        return []
    return self.chunked_document.list_items()
|
||||
|
||||
def __getitem__(self, item_or_part: str):
    """Return the text of an item from the filing document, e.g. "Item 2.02".

    Returns None when the filing has no chunked document, which is the case
    when ``chunked_document`` finds no HTML for the filing.
    """
    document = self.chunked_document
    if document is None:
        # Same situation that ``items`` guards against: nothing to index into
        return None
    return document[item_or_part]
|
||||
|
||||
def view(self, item_or_part: str):
    """Print the Item or Part from the filing document, if it has any text."""
    content = self[item_or_part]
    if not content:
        return
    print(content)
|
||||
|
||||
@property
def date_of_report(self):
    """Return the header's period of report formatted like "March 05, 2024".

    The header value is parsed as YYYY-MM-DD; an empty string is returned
    when the header has no period of report.
    """
    raw_period = self._filing.header.period_of_report
    if not raw_period:
        return ""
    return datetime.strptime(raw_period, "%Y-%m-%d").strftime("%B %d, %Y")
|
||||
|
||||
def _get_exhibit_content(self, exhibit: Attachment) -> Optional[str]:
    """
    Get the content of the exhibit

    Returns ``None`` when no content could be obtained.
    """
    if not exhibit.empty:
        # The exhibit has its own document: download it and render the parsed document as text
        downloaded = exhibit.download()
        if downloaded:
            parsed = Document.parse(downloaded)
            return repr_rich(parsed, width=200, force_terminal=False)
        return None

    # For old filings the exhibit might not have a document, so take the full text
    # from the filing's SGML, located by the exhibit's sequence number
    sgml_document = self._filing.sgml().get_document_by_sequence(exhibit.sequence_number)
    if sgml_document:
        return sgml_document.text()
    return None
|
||||
|
||||
def _content_renderables(self):
    """Get the content of the exhibits as renderables: one panel per exhibit that has content."""
    panels = []
    for exhibit in self._filing.exhibits:
        if exhibit.is_binary():
            # Binary files are skipped
            continue

        content = self._get_exhibit_content(exhibit)
        if not content:
            continue

        # Rewrite text like [/she] to (/she) to prevent it being treated as rich markup
        safe_content = re.sub(r'\[(/[^]]*)]', r'(\1)', content)
        panel_title = Text.assemble(("Exhibit ", "bold gray54"), (exhibit.document_type, "bold green"))
        panel_subtitle = Text(exhibit.description, style="gray54")
        panels.append(Panel(safe_content,
                            title=panel_title,
                            subtitle=panel_subtitle,
                            box=box.SIMPLE))
    return Group(*panels)
|
||||
|
||||
def text(self):
    """Get the text of the EightK filing

    The text is produced by converting the exhibit content renderables to plain text,
    so it includes the text content of all the exhibits
    """
    exhibit_renderables = self._content_renderables()
    return rich_to_text(exhibit_renderables)
|
||||
|
||||
def __rich__(self):
    """Render the report as a panel: a table of exhibits followed by the exhibit content."""
    # List the exhibits as a table
    exhibit_table = Table("", "Type", "Description",
                          title="Exhibits", show_header=True, header_style="bold", box=box.ROUNDED)
    for exhibit in self._filing.exhibits:
        exhibit_table.add_row(exhibit.sequence_number, exhibit.document_type, exhibit.description)

    # Panel title: company, form and date of report, each in its own style
    heading = Text.assemble(
        (f"{self.company}", "bold deep_sky_blue1"),
        (" ", ""),
        (f"{self.form}", "bold green"),
        (" ", ""),
        (f"{self.date_of_report}", "bold yellow")
    )

    # The table comes first, then the content of the exhibits
    body = Group(exhibit_table, self._content_renderables())
    return Panel(body, title=heading, box=box.SIMPLE)
|
||||
|
||||
def __str__(self):
    """Return a one-line summary: company, form and date of report separated by spaces."""
    summary_parts = (f"{self.company}", f"{self.form}", f"{self.date_of_report}")
    return " ".join(summary_parts)
|
||||
|
||||
def __repr__(self):
    """Return the rich rendering of this report converted to a string by ``repr_rich``."""
    rendered = self.__rich__()
    return repr_rich(rendered)
|
||||
|
||||
# Aliases for the current report
|
||||
EightK = CurrentReport
|
||||
SixK = CurrentReport
|
||||
|
||||
class PressReleases:
    """
    Wraps the attachments of an 8-K filing that could be press releases
    """

    def __init__(self, attachments: Attachments):
        self.attachments: Attachments = attachments

    def __len__(self):
        """Number of wrapped attachments."""
        return len(self.attachments)

    def __getitem__(self, item):
        """Return the attachment at ``item`` wrapped as a PressRelease, or None when the lookup finds nothing."""
        found = self.attachments.get_by_index(item)
        return PressRelease(found) if found else None

    def __rich__(self):
        """Delegate rendering to the wrapped attachments."""
        return self.attachments.__rich__()

    def __repr__(self):
        rendered = self.__rich__()
        return repr_rich(rendered)
|
||||
|
||||
|
||||
class PressRelease:
    """
    Represents a press release attachment from an 8-K filing
    With the Type EX-99.1
    """

    # Class-level defaults for the per-instance download memo; html() shadows them on the
    # instance after the first download. A per-instance memo replaces the previous
    # lru_cache(maxsize=1) on the method, which was a single slot shared by all instances
    # (alternating instances re-downloaded every time) and kept a reference to the instance.
    _html_fetched = False
    _html_cache = None

    def __init__(self, attachment: "Attachment"):
        self.attachment: "Attachment" = attachment

    def url(self):
        """Return the URL of the underlying attachment."""
        return self.attachment.url

    @property
    def document(self) -> str:
        """The ``document`` value of the underlying attachment."""
        return self.attachment.document

    @property
    def description(self) -> str:
        """The ``description`` value of the underlying attachment."""
        return self.attachment.description

    def html(self) -> str:
        """Download the attachment content, at most once per instance.

        The first result is remembered (including an empty/None result) and
        returned by every later call.
        """
        if not self._html_fetched:
            self._html_cache = self.attachment.download()
            self._html_fetched = True
        return self._html_cache

    def text(self) -> str:
        """Return the text of the downloaded HTML, or None when there is no HTML."""
        html = self.html()
        if html:
            return HtmlDocument.from_html(html, extract_data=False).text

    def open(self):
        """Delegate to the attachment's ``open()``."""
        self.attachment.open()

    def view(self):
        """View the markdown rendering of the press release."""
        return self.to_markdown().view()

    def to_markdown(self):
        """Convert the downloaded HTML to markdown content titled "8-K Press Release"."""
        html = self.html()
        markdown_content = MarkdownContent.from_html(html, title="8-K Press Release")
        return markdown_content

    def __rich__(self):
        return self.to_markdown()

    def __repr__(self):
        return repr_rich(self.__rich__())
|
||||
|
||||
|
||||
def is_valid_item_for_filing(filing_structure: Dict, item: str, part: str = None):
    """Return true if the item is valid

    ``item`` (and ``part`` when given) are upper-cased before lookup. With a
    part, only that part's entry in ``filing_structure`` is checked; without
    one, every part is searched.
    """
    wanted = item.upper()
    if not part:
        return any(wanted in part_items for part_items in filing_structure.values())

    part_items = filing_structure.get(part.upper())
    # An unknown (or empty) part cannot contain the item
    return wanted in part_items if part_items else False
|
||||
689
venv/lib/python3.10/site-packages/edgar/core.py
Normal file
689
venv/lib/python3.10/site-packages/edgar/core.py
Normal file
@@ -0,0 +1,689 @@
|
||||
import asyncio
|
||||
import datetime
|
||||
import logging.config
|
||||
import os
|
||||
import random
|
||||
import re
|
||||
import sys
|
||||
import threading
|
||||
from _thread import interrupt_main
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from dataclasses import dataclass
|
||||
from datetime import date
|
||||
from functools import lru_cache, partial, wraps
|
||||
from pathlib import Path
|
||||
from typing import Callable, Iterable, List, Optional, Tuple, TypeVar, Union
|
||||
|
||||
import httpx
|
||||
import pandas as pd
|
||||
import pyarrow as pa
|
||||
import pytz
|
||||
from pandas.tseries.offsets import BDay
|
||||
from rich.logging import RichHandler
|
||||
from rich.prompt import Prompt
|
||||
|
||||
from edgar.datatools import PagingState
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
def parse_pandas_version():
    """Parse pandas version without external dependencies

    Returns:
        tuple: ``(major, minor, patch)`` as ints. Each component is taken from the
        leading digits of the corresponding dot-separated part of ``pd.__version__``,
        so suffixes such as ``rc1``, ``a1``, ``b2``, ``dev0`` or ``+build`` are ignored.
        A missing or non-numeric component counts as 0.
    """
    def leading_int(component: str) -> int:
        # Only the leading run of digits is numeric; anything after it is pre-release/build metadata
        digits = re.match(r'\d+', component)
        return int(digits.group()) if digits else 0

    version_parts = pd.__version__.split('.')
    major = leading_int(version_parts[0])
    minor = leading_int(version_parts[1]) if len(version_parts) > 1 else 0
    patch = leading_int(version_parts[2]) if len(version_parts) > 2 else 0
    return (major, minor, patch)
|
||||
|
||||
pandas_version = parse_pandas_version()
|
||||
|
||||
# Interpreter version as a (major, minor, micro) tuple of ints.
# Taken from sys.version_info rather than parsed from sys.version, whose first token can
# carry non-numeric suffixes (e.g. "3.13.0rc1" or "3.12.0+") that int() rejects.
python_version = tuple(sys.version_info[:3])
|
||||
|
||||
__all__ = [
|
||||
'log',
|
||||
'Result',
|
||||
'get_bool',
|
||||
'edgar_mode',
|
||||
'NORMAL',
|
||||
'CRAWL',
|
||||
'CAUTION',
|
||||
'sec_edgar',
|
||||
'IntString',
|
||||
'sec_dot_gov',
|
||||
'get_identity',
|
||||
'python_version',
|
||||
'set_identity',
|
||||
'strtobool',
|
||||
'listify',
|
||||
'decode_content',
|
||||
'cache_except_none',
|
||||
'text_extensions',
|
||||
'binary_extensions',
|
||||
'ask_for_identity',
|
||||
'is_start_of_quarter',
|
||||
'run_async_or_sync',
|
||||
'get_edgar_data_directory',
|
||||
'is_probably_html',
|
||||
'has_html_content',
|
||||
'default_page_size',
|
||||
'parse_acceptance_datetime',
|
||||
'PagingState',
|
||||
'Years',
|
||||
'Quarters',
|
||||
'YearAndQuarter',
|
||||
'YearAndQuarters',
|
||||
'quarters_in_year',
|
||||
'parallel_thread_map',
|
||||
'pandas_version'
|
||||
]
|
||||
|
||||
IntString = Union[str, int]
|
||||
quarters_in_year: List[int] = list(range(1, 5))
|
||||
|
||||
YearAndQuarter = Tuple[int, int]
|
||||
YearAndQuarters = List[YearAndQuarter]
|
||||
Years = Union[int, List[int], range]
|
||||
Quarters = Union[int, List[int], range]
|
||||
|
||||
# Date patterns
|
||||
YYYY_MM_DD = "\\d{4}-\\d{2}-\\d{2}"
|
||||
DATE_PATTERN = re.compile(YYYY_MM_DD)
|
||||
DATE_RANGE_PATTERN = re.compile(f"^({YYYY_MM_DD}(:({YYYY_MM_DD})?)?|:({YYYY_MM_DD}))$")
|
||||
|
||||
default_http_timeout: int = 12
|
||||
default_page_size = 50
|
||||
default_max_connections = 10
|
||||
default_retries = 3
|
||||
|
||||
limits = httpx.Limits(max_connections=default_max_connections)
|
||||
|
||||
|
||||
def strtobool(val: str):
    """Convert a string representation of truth to ``True`` or ``False``.

    True values are case insensitive 'y', 'yes', 't', 'true', 'on', and '1'.
    Everything else yields ``False``: the false values 'n', 'no', 'f', 'false',
    'off' and '0', empty or ``None`` input, and any unrecognised text.
    No exception is raised for unrecognised values.

    Surrounding whitespace is ignored, and non-string input is converted with
    ``str()`` before matching.
    """
    if not val:
        return False
    return str(val).strip().lower() in ('y', 'yes', 't', 'true', 'on', '1')
|
||||
|
||||
|
||||
@dataclass
class EdgarSettings:
    """Settings for accessing Edgar: request timeout, connection cap and retry count.

    Equality and hashing are value-based over all three fields.
    """
    http_timeout: int  # presumably seconds - confirm against the http client
    max_connections: int
    retries: int = 3

    @property
    def limits(self):
        """httpx connection limits built from this instance's ``max_connections``.

        Built from the instance rather than the module-level default, so each
        settings object reports its own connection cap.
        """
        return httpx.Limits(max_connections=self.max_connections)

    def __eq__(self, othr):
        return (isinstance(othr, type(self))
                and (self.http_timeout, self.max_connections, self.retries) ==
                (othr.http_timeout, othr.max_connections, othr.retries))

    def __hash__(self):
        return hash((self.http_timeout, self.max_connections, self.retries))
|
||||
|
||||
|
||||
# Modes of accessing edgar
|
||||
|
||||
# The normal mode of accessing edgar
|
||||
NORMAL = EdgarSettings(http_timeout=15, max_connections=10)
|
||||
|
||||
# A bit more cautious mode of accessing edgar
|
||||
CAUTION = EdgarSettings(http_timeout=20, max_connections=5)
|
||||
|
||||
# Use this setting when you have long-running jobs and want to avoid breaching Edgar limits
|
||||
CRAWL = EdgarSettings(http_timeout=25, max_connections=2, retries=2)
|
||||
|
||||
edgar_access_mode = os.getenv('EDGAR_ACCESS_MODE', 'NORMAL')
|
||||
if edgar_access_mode == 'CAUTION':
|
||||
# A bit more cautious mode of accessing edgar
|
||||
edgar_mode = CAUTION
|
||||
elif edgar_access_mode == 'CRAWL':
|
||||
# Use this setting when you have long-running jobs and want to avoid breaching Edgar limits
|
||||
edgar_mode = CRAWL
|
||||
else:
|
||||
# The normal mode of accessing edgar
|
||||
edgar_mode = NORMAL
|
||||
|
||||
edgar_identity = 'EDGAR_IDENTITY'
|
||||
|
||||
# SEC urls
|
||||
sec_dot_gov = "https://www.sec.gov"
|
||||
sec_edgar = "https://www.sec.gov/Archives/edgar"
|
||||
|
||||
# Local storage directory.
|
||||
edgar_data_dir = os.path.join(os.path.expanduser("~"), ".edgar")
|
||||
|
||||
|
||||
def set_identity(user_identity: str):
    """
    Store the identity used to call Edgar in the environment variable EDGAR_IDENTITY

    The user identity looks like

        "Sample Company Name AdminContact@<sample company domain>.com"

    See https://www.sec.gov/os/accessing-edgar-data

    :param user_identity: the identity string to store
    """
    os.environ[edgar_identity] = user_identity
    log.info("Identity of the Edgar REST client set to [%s]", user_identity)

    # Imported at call time, then used to close any httpx clients so the identity is reset
    from edgar.httpclient import close_clients
    close_clients()
|
||||
|
||||
|
||||
identity_prompt = """
|
||||
[bold turquoise4]Identify your client to SEC Edgar[/bold turquoise4]
|
||||
------------------------------------------------------------------------------
|
||||
|
||||
Before running [bold]edgartools[/bold] it needs to know the UserAgent string to send to Edgar.
|
||||
See https://www.sec.gov/os/accessing-edgar-data
|
||||
|
||||
This can be set in the environment variable [bold green]EDGAR_IDENTITY[/bold green].
|
||||
|
||||
1. Set an OS environment variable
|
||||
[bold]EDGAR_IDENTITY=[green]Name email@domain.com[/green][/bold]
|
||||
2. Or a Python environment variable
|
||||
import os
|
||||
[bold]os.environ['EDGAR_IDENTITY']=[green]"Name email@domain.com"[/green][/bold]
|
||||
3. Or use [bold magenta]edgartools.set_identity[/bold magenta]
|
||||
from edgar import set_identity
|
||||
[bold]set_identity([green]'Name email@domain.com'[/green])[/bold]
|
||||
|
||||
But since you are already using [bold]edgartools[/bold] you can set it here
|
||||
|
||||
Enter your [bold green]EDGAR_IDENTITY[/bold green] e.g. [bold italic green]Name email@domain.com[/bold italic green]
|
||||
"""
|
||||
|
||||
|
||||
def ask_for_identity(user_prompt: str = identity_prompt,
                     timeout: int = 60):
    """Prompt for the Edgar identity and return the stripped answer.

    :param user_prompt: the text shown by the prompt
    :param timeout: seconds before the prompt is interrupted
    :raises TimeoutError: when the prompt is interrupted by KeyboardInterrupt,
        which is what the timer's ``interrupt_main`` call produces
    """
    # When the timer fires it calls interrupt_main, which surfaces below as KeyboardInterrupt
    watchdog = threading.Timer(timeout, interrupt_main)
    watchdog.start()

    try:
        # Ask, then strip surrounding whitespace from the answer
        answer = Prompt.ask(user_prompt)
        answer = answer.strip()
    except KeyboardInterrupt:
        message = "You did not enter your Edgar user identity. Try again .. or set environment variable EDGAR_IDENTITY"
        log.warning(message)
        raise TimeoutError(message) from None
    finally:
        # Always cancel the timer so it cannot interrupt the main thread later
        watchdog.cancel()

    return answer
|
||||
|
||||
|
||||
def get_identity() -> str:
    """
    Get the sec identity used to set the UserAgent string

    The value is read from the environment; when it is missing or empty the
    user is asked for it and the answer is stored back into the environment.
    :return: the identity string
    """
    identity = os.environ.get(edgar_identity)
    if identity:
        return identity

    identity = ask_for_identity()
    os.environ[edgar_identity] = identity
    return identity
|
||||
|
||||
def decode_content(content: bytes):
    """Decode bytes as UTF-8, falling back to Latin-1 when they are not valid UTF-8."""
    try:
        decoded = content.decode('utf-8')
    except UnicodeDecodeError:
        decoded = content.decode('latin-1')
    return decoded
|
||||
|
||||
|
||||
text_extensions = (".txt", ".htm", ".html", ".xsd", ".xml", "XML", ".json", ".idx", ".paper")
|
||||
binary_extensions = (".pdf", ".jpg", ".jpeg", "png", ".gif", ".tif", ".tiff", ".bmp", ".ico", ".svg", ".webp", ".avif",
|
||||
".apng")
|
||||
|
||||
|
||||
def get_bool(value: str = None) -> Optional[bool]:
    """Convert the value to a boolean

    True only for 1, "1", "Y", "true", "True" and "TRUE"; anything else
    (including ``None``) is False.
    """
    truthy_markers = (1, "1", "Y", "true", "True", "TRUE")
    return value in truthy_markers
|
||||
|
||||
|
||||
class Result:
    """
    This class represents the result of an operation which can succeed or fail.
    It allows for handling the failures more gracefully than using error handling
    """

    def __init__(self,
                 success: bool,
                 error: Optional[str] = None,
                 value: Optional[object] = None):
        self.success, self.error, self.value = success, error, value

    @property
    def failure(self) -> bool:
        """:return True if the operation failed"""
        return not self.success

    def __str__(self):
        return '[Success]' if self.success else f'[Failure] "{self.error}"'

    def __repr__(self):
        if not self.success:
            return f'Result (success={self.success}, message="{self.error}")'
        return f"Result (success={self.success})"

    @classmethod
    def Fail(cls,
             error: str):
        """Create a Result for a failed operation"""
        return cls(False, error=error, value=None)

    @classmethod
    def Ok(cls,
           value: object):
        """Create a Result for a successful operation"""
        return cls(success=True, value=value, error=None)
|
||||
|
||||
|
||||
def get_resource(file: str):
    """Return ``importlib.resources.path(edgar, file)`` for a file packaged with ``edgar``.

    :param file: name of the resource inside the ``edgar`` package
    :return: the object returned by ``importlib.resources.path`` (a context manager
        that yields the file system path)
    """
    # The submodule is imported explicitly: a bare ``import importlib`` does not guarantee
    # that ``importlib.resources`` has been loaded as an attribute of the package.
    import importlib.resources

    import edgar
    return importlib.resources.path(edgar, file)
|
||||
|
||||
|
||||
def get_edgar_data_directory() -> Path:
    """Get the edgar data directory

    Uses the ``EDGAR_LOCAL_DATA_DIR`` environment variable when set, otherwise
    ``~/.edgar``. The directory is created if it does not exist yet.
    """
    fallback_dir = Path(os.path.expanduser("~")) / ".edgar"
    data_dir = Path(os.getenv('EDGAR_LOCAL_DATA_DIR', fallback_dir))
    data_dir.mkdir(parents=True, exist_ok=True)
    return data_dir
|
||||
|
||||
|
||||
class TooManyRequestsException(Exception):
    """Exception that must be constructed with a message."""

    def __init__(self, message: str):
        # The message is passed straight through to Exception
        super().__init__(message)
|
||||
|
||||
|
||||
def filing_date_to_year_quarters(filing_date: str) -> List[Tuple[int, int]]:
    """Expand a filing date or date range into the (year, quarter) pairs it covers.

    ``filing_date`` is either a single ``YYYY-MM-DD`` date or a range
    ``start:end``. In a range an empty start means "1994-06-01" and an empty
    end means today.
    """
    def quarter_of(month: int) -> int:
        return (month - 1) // 3 + 1

    if ":" not in filing_date:
        year, month, _ = map(int, filing_date.split("-"))
        return [(year, quarter_of(month))]

    start_date, end_date = filing_date.split(":")
    start_date = start_date or "1994-06-01"
    end_date = end_date or date.today().strftime("%Y-%m-%d")

    first_year, first_month, _ = map(int, start_date.split("-"))
    last_year, last_month, _ = map(int, end_date.split("-"))
    first_quarter = quarter_of(first_month)
    last_quarter = quarter_of(last_month)

    year_quarters = []
    for year in range(first_year, last_year + 1):
        # The first and last years are partial; every year in between is complete
        lowest = first_quarter if year == first_year else 1
        highest = last_quarter if year == last_year else 4
        year_quarters.extend((year, quarter) for quarter in range(lowest, highest + 1))
    return year_quarters
|
||||
|
||||
|
||||
def current_year_and_quarter() -> Tuple[int, int]:
    """Return the current (year, quarter) as seen in the America/New_York timezone."""
    now_eastern = datetime.datetime.now(pytz.timezone('America/New_York'))
    quarter = (now_eastern.month - 1) // 3 + 1
    return now_eastern.year, quarter
|
||||
|
||||
|
||||
def filter_by_date(data: pa.Table,
                   date: Union[str, datetime.datetime],
                   date_col: str) -> pa.Table:
    """Normalise ``date`` to a ``YYYY-MM-DD`` string when it is a date/datetime object.

    NOTE(review): the body of this function appears to be truncated in this file.
    It ends after the normalisation step, so ``data`` and ``date_col`` are never
    used and the function returns ``None`` despite the ``pa.Table`` annotation.
    Restore the filtering logic from the upstream source before relying on it.
    """
    # datetime.datetime is a subclass of datetime.date; both are converted to a string
    if isinstance(date, (datetime.date, datetime.datetime)):
        date = date.strftime('%Y-%m-%d')
|
||||
|
||||
# NOTE(review): this repeats an identical decode_content definition that appears
# earlier in this module; the later definition is the one that stays bound.
def decode_content(content: bytes):
    """Decode bytes as UTF-8, falling back to Latin-1 when they are not valid UTF-8."""
    try:
        decoded = content.decode('utf-8')
    except UnicodeDecodeError:
        decoded = content.decode('latin-1')
    return decoded
|
||||
|
||||
|
||||
text_extensions = (".txt", ".htm", ".html", ".xsd", ".xml", "XML", ".json", ".idx", ".paper")
|
||||
binary_extensions = (".pdf", ".jpg", ".jpeg", "png", ".gif", ".tif", ".tiff", ".bmp", ".ico", ".svg", ".webp", ".avif",
|
||||
".apng")
|
||||
|
||||
|
||||
class DataPager:
    """Pages through a pyarrow Table or pandas DataFrame in fixed-size pages.

    Pages are numbered from 1. ``total_pages`` is at least 1, even for empty data.
    """

    def __init__(self,
                 data: Union[pa.Table, pd.DataFrame],
                 page_size=default_page_size):
        self.data: Union[pa.Table, pd.DataFrame] = data
        self.page_size = page_size
        # Ceiling division: a length that is an exact multiple of page_size must not
        # produce an extra, empty trailing page. Empty data still counts as one page.
        self.total_pages = max(1, -(-len(self.data) // page_size))
        self.current_page = 1

    def has_next(self):
        """True when there is a page after the current one."""
        return self.current_page < self.total_pages

    def has_previous(self):
        """True when there is a page before the current one."""
        return self.current_page > 1

    def next(self):
        """Get the next page of data, or None when already on the last page"""
        if self.has_next():
            self.current_page += 1
            return self.current()
        else:
            return None

    def previous(self):
        """Get the previous page of data, or None when already on the first page"""
        if self.has_previous():
            self.current_page -= 1
            return self.current()
        else:
            return None

    @property
    def _current_range(self) -> Tuple[int, int]:
        """Get the current start and end index for the data, with the end clamped to the data length"""
        start_index = (self.current_page - 1) * self.page_size
        end_index = min(len(self.data), start_index + self.page_size)
        return start_index, end_index

    def current(self) -> pa.Table:
        """
        Get the current data page

        A pyarrow Table is sliced with ``Table.slice``; any other data is sliced with ``.iloc``.
        :return: the rows of the current page
        """
        start_index = (self.current_page - 1) * self.page_size
        if isinstance(self.data, pa.Table):
            return self.data.slice(offset=start_index, length=self.page_size)
        return self.data.iloc[start_index:start_index + self.page_size]

    @property
    def start_index(self):
        """Index of the first row of the current page."""
        return (self.current_page - 1) * self.page_size

    @property
    def end_index(self):
        """``start_index + page_size``; unlike ``_current_range`` this is not clamped to the data length."""
        return self.start_index + self.page_size
|
||||
|
||||
|
||||
# NOTE(review): PagingState is also imported from edgar.datatools at the top of this
# module; this later definition rebinds the name - confirm which one is intended.
@dataclass
class PagingState:
    # presumably the index at which the current page starts - confirm against callers
    page_start: int
    # presumably the number of records being paged - confirm against callers
    num_records: int
|
||||
|
||||
def parse_acceptance_datetime(acceptance_datetime: str) -> datetime.datetime:
    """Parse an ISO-format acceptance timestamp, treating 'Z' as the UTC offset ``+00:00``."""
    # Every 'Z' in the string is rewritten to an explicit offset before parsing
    iso_text = acceptance_datetime.replace('Z', '+00:00')
    return datetime.datetime.fromisoformat(iso_text)
|
||||
|
||||
def sample_table(table, n=None, frac=None, replace=False, random_state=None):
    """Take a sample from a pyarrow Table

    Args:
        table: the table to sample; it must support ``len()`` and ``take(indices)``
        n: number of rows to take. Without ``n`` and ``frac`` all rows are returned in shuffled order
        frac: fraction of the rows to take; when given it overrides ``n``
        replace: sample with replacement when True
        random_state: seed for a reproducible sample. Any value other than ``None``
            is used as a seed, including ``0``

    Returns:
        The result of ``table.take`` for the sampled row indices
    """
    # A seeded sample uses its own generator, so the module-level random state is left
    # untouched; the sequence equals what random.seed(random_state) would produce.
    rng = random if random_state is None else random.Random(random_state)
    row_count = len(table)

    if frac is not None:
        n = int(row_count * frac)

    if n is None:
        indices = rng.sample(range(row_count), row_count)
    elif replace:
        # randint(0, -1) is an error, so an empty table samples to nothing
        indices = [rng.randint(0, row_count - 1) for _ in range(n)] if row_count else []
    else:
        indices = rng.sample(range(row_count), min(n, row_count))

    return table.take(indices)
|
||||
|
||||
|
||||
def run_async_or_sync(coroutine):
    """Run a coroutine to completion and return its result.

    Outside IPython the coroutine is run with ``asyncio.run``. Inside IPython
    it is run on the current event loop, applying ``nest_asyncio`` when no loop
    can be obtained or when the loop is already running.
    """
    try:
        # A KeyError here means IPython has not been imported: handled at the bottom
        ipython = sys.modules['IPython']

        if 'asyncio' not in sys.modules:
            # We're in IPython but asyncio is not available
            return ipython.get_ipython().run_cell_magic('time', '', f'import asyncio; asyncio.run({coroutine!r})')

        # try is needed for ipython console
        try:
            loop = asyncio.get_event_loop()
        except RuntimeError:
            import nest_asyncio
            nest_asyncio.apply()
            loop = asyncio.get_event_loop()

        if loop.is_running():
            # We're in a notebook with an active event loop, so re-entrant running must be allowed
            import nest_asyncio
            nest_asyncio.apply()
        return loop.run_until_complete(coroutine)
    except (KeyError, AttributeError):
        # We're not in an IPython environment, use asyncio.run()
        return asyncio.run(coroutine)
|
||||
|
||||
|
||||
def listify(value):
    """
    Convert the input to a list if it's not already a list.

    A list is returned unchanged, a range is expanded into a list, and
    anything else is wrapped in a single-element list.

    Args:
        value: Any type of input

    Returns:
        list: The input as a list
    """
    if isinstance(value, range):
        return list(value)
    return value if isinstance(value, list) else [value]
|
||||
|
||||
|
||||
def is_start_of_quarter():
    """Return True while today is no later than one business day after the first day of a quarter.

    Only the first five days of January, April, July and October are considered.
    """
    today = datetime.datetime.now().date()

    # Outside the first few days of a quarter's first month the answer is always False
    if today.month not in (1, 4, 7, 10) or today.day > 5:
        return False

    quarter_start = datetime.date(today.year, today.month, 1)
    # One business day after the start of the quarter
    cutoff = (quarter_start + BDay(1)).date()
    return today <= cutoff
|
||||
|
||||
|
||||
def cache_except_none(maxsize=128):
    """
    A decorator that caches the result of a function, but only if the result is not None.

    A ``None`` result is returned to the caller without being stored, so the next
    call with the same arguments invokes the function again. Other cached entries
    are left untouched. The wrapper exposes ``cache_info`` and ``cache_clear``.
    """
    def decorator(func):

        class _NoneResult(Exception):
            """Raised inside the cached function to stop lru_cache storing a None result."""

        @lru_cache(maxsize=maxsize)
        def cached_func(*args, **kwargs):
            result = func(*args, **kwargs)
            if result is None:
                # lru_cache stores a value only after the call returns, so clearing the
                # cache from in here cannot prevent None being stored. Raising does:
                # calls that raise are never cached.
                raise _NoneResult
            return result

        @wraps(func)
        def wrapper(*args, **kwargs):
            try:
                return cached_func(*args, **kwargs)
            except _NoneResult:
                return None

        # Preserve cache methods
        wrapper.cache_info = cached_func.cache_info
        wrapper.cache_clear = cached_func.cache_clear
        return wrapper

    return decorator
|
||||
|
||||
def is_probably_html(content: str) -> bool:
    """Does it have html tags

    Bytes are decoded as UTF-8 (ignoring errors) first; the search for the
    common tags is case-insensitive.
    """
    if isinstance(content, bytes):
        content = content.decode('utf-8', errors='ignore')

    lowered = content.lower()
    # Check for common HTML tags
    for marker in ('<html>', '<body>', '<head>', '<title>', '<div', '<span', '<p>'):
        if marker in lowered:
            return True
    return False
|
||||
|
||||
def has_html_content(content: str) -> bool:
    """
    Check if the content is HTML or inline XBRL HTML

    Bytes are decoded as UTF-8 (ignoring errors) and leading whitespace is
    skipped before the checks. ``None`` is never HTML.
    """
    if content is None:
        return False

    if isinstance(content, bytes):
        content = content.decode('utf-8', errors='ignore')

    # Strip only leading whitespace and get first 200 chars for doctype check
    content = content.lstrip()
    first_200_lower = content[:200].lower()

    # Check for XHTML / HTML 4.01 transitional doctype declarations
    doctype_markers = (
        '<!doctype html public "-//w3c//dtd xhtml',
        '<!doctype html system "http://www.w3.org/tr/xhtml1/dtd/',
        '<!doctype html public "-//w3c//dtd html 4.01 transitional//en"',
    )
    if any(marker in first_200_lower for marker in doctype_markers):
        return True

    # Look for common XML/HTML indicators in first 1000 chars
    first_1000 = content[:1000]

    # Check for standard XHTML namespace
    if 'xmlns="http://www.w3.org/1999/xhtml"' in first_1000:
        return True

    # A lower-case <html root element counts as HTML whether or not it carries inline XBRL
    # namespaces; this also catches cases like <html style="...">
    if '<html' in first_1000:
        return True

    # Fallback for other casings such as <HTML> ... </HTML>: the document must open with the
    # bare tag and close with the end tag. Trailing whitespace after the end tag is ignored,
    # since only leading whitespace was stripped above.
    return first_200_lower.startswith('<html>') and content.rstrip()[-7:].lower() == '</html>'
|
||||
|
||||
|
||||
T = TypeVar('T')
R = TypeVar('R')


def parallel_thread_map(func: Callable[[T], R],
                        items: Iterable[T],
                        **kwargs) -> List[R]:
    """
    Run a function in parallel across multiple items using ThreadPoolExecutor.

    This is a replacement for fastcore's parallel function, supporting only the threadpool
    execution mode. It does not include progress bars.

    Args:
        func: The function to apply to each item
        items: The items to process
        **kwargs: Additional keyword arguments to pass to func. The special
            keyword ``n_workers`` is consumed here and sets the pool size

    Returns:
        List of results from applying func to each item, in the order of ``items``
    """
    # Default to min(32, cores+4) which is a good balance for I/O-bound tasks
    worker_count = kwargs.pop('n_workers', None) or min(32, (os.cpu_count() or 1) + 4)

    # Materialise the items before the pool starts
    pending = list(items)

    # Remaining keyword arguments are bound to the function once, up front
    task = partial(func, **kwargs) if kwargs else func

    with ThreadPoolExecutor(max_workers=worker_count) as pool:
        return list(pool.map(task, pending))
|
||||
|
||||
|
||||
def initialize_rich_logging():
    """Configure root logging at INFO level through a RichHandler and quieten third-party loggers."""
    # Rich logging
    logging.basicConfig(
        level="INFO",
        format="%(message)s",
        datefmt="[%X]",
        handlers=[RichHandler(rich_tracebacks=True)]
    )

    # Turn down 3rd party logging
    for noisy_logger in ("httpx", "httpxthrottlecache"):
        logging.getLogger(noisy_logger).setLevel(logging.WARNING)

    # TODO: Temporary, until next pyrate_limiter update that reduces the spurious "async" message
    logging.getLogger("pyrate_limiter").setLevel(logging.CRITICAL)
|
||||
|
||||
|
||||
# Turn on rich logging if the environment variable is set
|
||||
if os.getenv('EDGAR_USE_RICH_LOGGING', '0') == '1':
|
||||
initialize_rich_logging()
|
||||
428
venv/lib/python3.10/site-packages/edgar/current_filings.py
Normal file
428
venv/lib/python3.10/site-packages/edgar/current_filings.py
Normal file
@@ -0,0 +1,428 @@
|
||||
import re
|
||||
from datetime import datetime
|
||||
from functools import lru_cache
|
||||
from typing import Optional
|
||||
|
||||
import pyarrow as pa
|
||||
import pyarrow.compute as pc
|
||||
from bs4 import BeautifulSoup
|
||||
from rich import box
|
||||
from rich.console import Group
|
||||
from rich.panel import Panel
|
||||
from rich.status import Status
|
||||
from rich.table import Table
|
||||
from rich.text import Text
|
||||
|
||||
from edgar._filings import Filings
|
||||
from edgar.core import IntString
|
||||
from edgar.formatting import accepted_time_text, accession_number_text
|
||||
from edgar.httprequests import get_with_retry
|
||||
from edgar.reference.tickers import find_ticker
|
||||
from edgar.xmltools import child_text
|
||||
|
||||
__all__ = [
|
||||
'CurrentFilings',
|
||||
'get_current_filings',
|
||||
'get_all_current_filings',
|
||||
'iter_current_filings_pages',
|
||||
]
|
||||
|
||||
summary_regex = re.compile(r'<b>([^<]+):</b>\s+([^<\s]+)')
|
||||
title_regex = re.compile(r"(.*?) - (.*) \((\d+)\) \((.*)\)")
|
||||
|
||||
"""
|
||||
Get the current filings from the SEC. Use this to get the filings filed after the 5:30 deadline
|
||||
"""
|
||||
GET_CURRENT_URL = "https://www.sec.gov/cgi-bin/browse-edgar?action=getcurrent&output=atom&owner=only&count=100"
|
||||
|
||||
|
||||
def _empty_filing_index():
    """Create an empty pyarrow Table with the filing index columns and types."""
    columns = [
        ('form', pa.string()),
        ('company', pa.string()),
        ('cik', pa.int32()),
        ('filing_date', pa.date32()),
        ('accession_number', pa.string()),
        ('accepted', pa.timestamp('s')),
    ]
    schema = pa.schema(columns)

    # One empty array per column, typed to match the schema
    empty_arrays = [pa.array([], type=column_type) for _, column_type in columns]
    return pa.Table.from_arrays(empty_arrays, schema=schema)
|
||||
|
||||
def parse_title(title: str):
    """
    Split an entry title such as

        "144 - monday.com Ltd. (0001845338) (Subject)"

    into a tuple of form type, company name, CIK, and status using ``title_regex``.

    :raises ValueError: when the title does not match the regex
    """
    parsed = title_regex.match(title)
    if parsed:
        return parsed.groups()
    raise ValueError(f"Could not parse title: {title} using regex: {title_regex}")
|
||||
|
||||
def parse_summary(summary: str):
    """
    Extract the filing date and accession number from an entry summary.

    The summary is scanned with ``summary_regex`` for ``<b>Label:</b> value`` pairs;
    the ``Filed`` and ``AccNo`` labels are required.

    :return: a tuple of (filing date as a ``date``, accession number)
    :raises ValueError: when ``Filed`` or ``AccNo`` is missing, or the filed date
        is not in ``YYYY-MM-DD`` format
    """
    # Collect the label/value pairs; all-digit values become ints
    fields = {}
    for label, raw_value in re.findall(summary_regex, summary):
        fields[label.strip()] = int(raw_value) if raw_value.isdigit() else raw_value

    filed_date = fields.get('Filed')
    if not filed_date:
        raise ValueError(f"Could not find 'Filed' date in summary: {summary}")

    accession_no = fields.get('AccNo')
    if not accession_no:
        raise ValueError(f"Could not find 'AccNo' in summary: {summary}")

    try:
        filing_date = datetime.strptime(str(filed_date), '%Y-%m-%d').date()
    except ValueError as e:
        raise ValueError(f"Invalid date format in summary: {filed_date}") from e

    return filing_date, accession_no
|
||||
|
||||
|
||||
def get_current_url(atom: bool = True,
                    count: int = 100,
                    start: int = 0,
                    form: str = '',
                    owner: str = 'include'):
    """
    Build the URL of the SEC "current filings" listing.

    Args:
        atom: When true, ``&output=atom`` is appended to request the Atom feed
        count: Page size; any value other than 10, 20, 40, 80 or 100 becomes 40
        start: Value of the ``start`` query parameter
        form: Value of the ``type`` query parameter (inserted as given)
        owner: 'include', 'exclude' or 'only'; anything else becomes 'include'

    Returns:
        The URL as a string.
    """
    if count not in (10, 20, 40, 80, 100):
        count = 40
    if owner not in ('include', 'exclude', 'only'):
        owner = 'include'

    query = f"&count={count}&start={start}&type={form}&owner={owner}"
    output = "&output=atom" if atom else ""
    return "https://www.sec.gov/cgi-bin/browse-edgar?action=getcurrent" + query + output
|
||||
|
||||
|
||||
@lru_cache(maxsize=32)
def get_current_entries_on_page(count: int, start: int, form: Optional[str] = None, owner: str = 'include'):
    """
    Fetch one page of the current-filings Atom feed and parse its entries.

    Because of the ``lru_cache`` decorator, a repeated call with the same
    arguments returns the previously built list object instead of fetching again.

    Args:
        count: Page size passed to ``get_current_url``
        start: Feed offset passed to ``get_current_url``
        form: Optional form type filter; ``None``/empty means no filter
        owner: Owner filter passed to ``get_current_url``

    Returns:
        A list of dicts with the keys 'form', 'company', 'cik', 'filing_date',
        'accession_number' and 'accepted', one per ``<entry>`` in the feed.
    """
    feed_url = get_current_url(count=count, start=start, form=form or '', owner=owner, atom=True)
    feed = BeautifulSoup(get_with_retry(feed_url).text, features="xml")

    page_entries = []
    for node in feed.find_all("entry"):
        # Title e.g. "4 - WILKS LEWIS (0001076463) (Reporting)"
        form_type, company_name, cik, _status = parse_title(child_text(node, "title"))
        # Summary carries the filing date and the accession number
        filing_date, accession_number = parse_summary(child_text(node, "summary"))
        accepted = datetime.fromisoformat(child_text(node, "updated"))

        page_entries.append({
            'form': form_type,
            'company': company_name,
            'cik': int(cik),
            'filing_date': filing_date,
            'accession_number': accession_number,
            'accepted': accepted,
        })
    return page_entries
|
||||
|
||||
|
||||
class CurrentFilings(Filings):
    """
    This version of the Filings class is used to get the current filings from the SEC
    page by page.

    ``_start`` is the 1-based position of the first filing on the loaded page, so the
    feed offset handed to ``get_current_entries_on_page`` is always ``_start - 1``.
    The indexes shown by ``__rich__`` and accepted by ``get`` are 0-based positions
    in the whole feed (``_start - 1`` for the first row of the page).
    """

    def __init__(self,
                 filing_index: pa.Table,
                 form: str = '',
                 start: int = 1,
                 page_size: int = 40,
                 owner: str = 'include'):
        super().__init__(filing_index, original_state=None)
        self._start = start
        self._page_size = page_size
        self.owner = owner
        self.form = form

    def next(self):
        """Load the following page into this object. Returns self, or None when there is no further page."""
        # If the number of entries is less than the page size then we are at the end of the data
        if len(self.data) < self._page_size:
            return None
        start = self._start + len(self.data)
        next_entries = get_current_entries_on_page(start=start - 1, count=self._page_size, form=self.form, owner=self.owner)
        if next_entries:
            # Copy the values to this Filings object and return it
            self.data = pa.Table.from_pylist(next_entries)
            self._start = start
            return self
        return None

    def previous(self):
        """Load the preceding page into this object. Returns self, or None when already on the first page."""
        # If start = 1 then there are no previous entries
        if self._start == 1:
            return None
        start = max(1, self._start - self._page_size)
        # _start is 1-based while the feed offset is 0-based (the first page is fetched
        # with offset 0 and next() passes start - 1), so subtract 1 here as well.
        previous_entries = get_current_entries_on_page(start=start - 1, count=self._page_size, form=self.form, owner=self.owner)
        if previous_entries:
            # Copy the values to this Filings object and return it
            self.data = pa.Table.from_pylist(previous_entries)
            self._start = start
            return self
        return None

    def __getitem__(self, item):  # type: ignore
        """Like ``get`` but raises IndexError / KeyError instead of returning None."""
        result = self.get(item)
        if result is None:
            if isinstance(item, int) or item.isdigit():
                raise IndexError(f"Filing index {item} is out of range for current page")
            else:
                raise KeyError(f"Filing with accession number '{item}' not found")
        return result

    def __iter__(self):
        """Override to reset iteration index for current page"""
        self.n = 0
        return self

    def __next__(self):
        """Override to handle pagination properly - use page-relative indices"""
        if self.n < len(self.data):
            filing = super().get_filing_at(self.n)  # Use page-relative index directly
            self.n += 1
            return filing
        else:
            raise StopIteration

    def get(self, index_or_accession_number: IntString):
        """
        Look up a filing by feed index or by accession number.

        An int (or all-digit string) is treated as a 0-based index into the whole
        feed and only resolves if it falls on the currently loaded page. Any other
        string is treated as an accession number: the loaded page is searched first,
        then the feed is re-read from the start in pages of 100 until it is found
        or the feed is exhausted.

        Returns:
            The matching filing, or None if it was not found.
        """
        if isinstance(index_or_accession_number, int) or index_or_accession_number.isdigit():
            idx = int(index_or_accession_number)
            if self._start - 1 <= idx < self._start - 1 + len(self.data):
                # Where on this page is the index
                idx_on_page = idx - (self._start - 1)
                return super().get_filing_at(idx_on_page)
            # Index is out of bounds for current page
            return None
        else:
            accession_number = index_or_accession_number.strip()
            # See if the filing is in this page
            filing = super().get(accession_number)
            if filing:
                return filing

            current_filings = get_current_filings(self.form, self.owner, page_size=100)
            filing = CurrentFilings._get_current_filing_by_accession_number(current_filings.data, accession_number)
            if filing:
                return filing
            with Status(f"[bold deep_sky_blue1]Searching through the most recent filings for {accession_number}...",
                        spinner="dots2"):
                while True:
                    current_filings = current_filings.next()
                    if current_filings is None:
                        return None
                    filing = CurrentFilings._get_current_filing_by_accession_number(current_filings.data,
                                                                                    accession_number)
                    if filing:
                        return filing

    @staticmethod
    def _get_current_filing_by_accession_number(data: pa.Table, accession_number: str):
        """Return a Filing built from the first row of ``data`` with this accession number, else None."""
        from edgar import Filing
        mask = pc.equal(data['accession_number'], accession_number)
        try:
            idx = mask.index(True).as_py()
            if idx > -1:
                return Filing(
                    cik=data['cik'][idx].as_py(),
                    company=data['company'][idx].as_py(),
                    form=data['form'][idx].as_py(),
                    filing_date=data['filing_date'][idx].as_py(),
                    accession_no=data['accession_number'][idx].as_py(),
                )
        except ValueError:
            # Accession number not found in this batch
            pass
        return None

    def __rich__(self):
        """Render the loaded page as a rich Panel containing a table and a paging hint."""
        # Create table with appropriate columns and styling
        table = Table(
            show_header=True,
            header_style="bold",
            show_edge=True,
            expand=False,
            padding=(0, 1),
            box=box.SIMPLE,
        )

        # Add columns with specific styling and alignment
        table.add_column("#", style="dim", justify="right")
        table.add_column("Form", width=14)
        table.add_column("CIK", style="dim", width=10, justify="right")
        table.add_column("Ticker", width=6, style="yellow")
        table.add_column("Company", style="bold green", width=38, no_wrap=True)
        table.add_column("Accepted", width=20)
        table.add_column("Accession Number", width=20)
        table.add_column(" ", width=1, style="cyan dim")  # Group indicator column

        num_rows = len(self.data)
        start_idx = self._start - 1

        # Get accession numbers for grouping
        accession_numbers = self.data.column('accession_number').to_pylist()

        # Identify groups of consecutive filings with same accession number
        groups = {}
        last_position = len(accession_numbers) - 1
        for i, acc_no in enumerate(accession_numbers):
            # Check previous and next accession numbers
            prev_acc = accession_numbers[i - 1] if i > 0 else None
            next_acc = accession_numbers[i + 1] if i < last_position else None

            if acc_no != prev_acc and acc_no == next_acc:
                groups[i] = '┐'  # Start of group
            elif acc_no == prev_acc and acc_no == next_acc:
                groups[i] = '│'  # Middle of group
            elif acc_no == prev_acc and acc_no != next_acc:
                groups[i] = '┘'  # End of group
            else:
                groups[i] = ' '  # Standalone filing

        # Iterate through the PyArrow table directly
        for idx in range(num_rows):
            row_index = start_idx + idx
            cik = self.data['cik'][idx].as_py()
            ticker = find_ticker(cik)

            row = [
                str(row_index),
                self.data['form'][idx].as_py(),
                str(cik),
                ticker,
                self.data['company'][idx].as_py(),
                accepted_time_text(self.data['accepted'][idx].as_py()),
                accession_number_text(self.data['accession_number'][idx].as_py()),
                groups.get(idx, ' ')  # Add group indicator
            ]
            table.add_row(*row)

        # The paging information is always shown below the table
        elements = [table]

        page_info = Text.assemble(
            ("Showing ", "dim"),
            (f"{start_idx:,}", "bold red"),
            (" to ", "dim"),
            (f"{start_idx + num_rows - 1:,}", "bold red"),
            (" most recent filings.", "dim"),
            (" Page using ", "dim"),
            ("← prev()", "bold gray54"),
            (" and ", "dim"),
            ("next() →", "bold gray54")
        )

        elements.extend([Text("\n"), page_info])

        subtitle = "Most recent filings from the SEC"
        return Panel(
            Group(*elements),
            title="SEC Filings",
            subtitle=subtitle,
            border_style="bold grey54",
            expand=False
        )
|
||||
|
||||
|
||||
def get_all_current_filings(form: str = '',
                            owner: str = 'include',
                            page_size: int = 100) -> 'Filings':
    """
    Collect every current filing by walking the feed page by page.

    Args:
        form: Form type to filter by (e.g., "10-K", "8-K")
        owner: Owner filter ('include', 'exclude', 'only')
        page_size: Number of filings per page (10, 20, 40, 80, 100)

    Returns:
        Filings: A regular Filings object (not CurrentFilings) holding the rows of
        all pages; an empty Filings when the feed has no entries.

    Example:
        >>> all_filings = get_all_current_filings(form="10-K")
        >>> print(f"Found {len(all_filings)} total current 10-K filings")
    """
    from edgar._filings import Filings

    collected = []
    pages = iter_current_filings_pages(form=form, owner=owner, page_size=page_size)
    for page in pages:
        # Each page's PyArrow table is turned into a list of row dicts
        collected += page.data.to_pylist()

    if collected:
        return Filings(pa.Table.from_pylist(collected))
    return Filings(_empty_filing_index())
|
||||
|
||||
|
||||
def get_current_filings(form: str = '',
                        owner: str = 'include',
                        page_size: int = 40):
    """
    Get the first page of the current filings from the SEC.

    Args:
        form: Form type to filter by; empty means all forms
        owner: 'include', 'exclude' or 'only'; anything else becomes 'include'
        page_size: 10, 20, 40, 80 or 100; anything else becomes 100

    :return: A CurrentFilings positioned on the first page (empty if the feed
             returned no entries)
    """
    if owner not in ('include', 'exclude', 'only'):
        owner = 'include'
    if page_size not in (10, 20, 40, 80, 100):
        page_size = 100

    first_page = get_current_entries_on_page(count=page_size, start=0, form=form, owner=owner)
    if first_page:
        index = pa.Table.from_pylist(first_page)
    else:
        index = _empty_filing_index()
    return CurrentFilings(filing_index=index, owner=owner, form=form, page_size=page_size)
|
||||
|
||||
|
||||
def iter_current_filings_pages(form: str = '',
                               owner: str = 'include',
                               page_size: int = 100):
    """
    Generator over the pages of the current-filings feed.

    Starts from ``get_current_filings`` and keeps calling ``next()`` on the
    yielded page until it returns None.

    Args:
        form: Form type to filter by (e.g., "10-K", "8-K")
        owner: Owner filter ('include', 'exclude', 'only')
        page_size: Number of filings per page (10, 20, 40, 80, 100)

    Yields:
        CurrentFilings: Each page of current filings until no more pages

    Example:
        >>> for page in iter_current_filings_pages(form="10-K"):
        ...     print(f"Processing {len(page)} filings")
        ...     # Process each page
    """
    page = get_current_filings(form=form, owner=owner, page_size=page_size)
    while True:
        if page is None:
            break
        yield page
        page = page.next()
|
||||
387
venv/lib/python3.10/site-packages/edgar/datatools.py
Normal file
387
venv/lib/python3.10/site-packages/edgar/datatools.py
Normal file
@@ -0,0 +1,387 @@
|
||||
from dataclasses import dataclass
|
||||
from typing import Union
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import pyarrow as pa
|
||||
from lxml import html as lxml_html
|
||||
|
||||
__all__ = [
|
||||
"compress_dataframe",
|
||||
"table_html_to_dataframe",
|
||||
"table_tag_to_dataframe",
|
||||
"markdown_to_dataframe",
|
||||
"dataframe_to_text",
|
||||
"clean_column_text",
|
||||
'convert_to_numeric',
|
||||
'describe_dataframe',
|
||||
'na_value',
|
||||
'replace_all_na_with_empty',
|
||||
'convert_to_pyarrow_backend',
|
||||
'drop_duplicates_pyarrow',
|
||||
'repr_df',
|
||||
'DataPager',
|
||||
'PagingState',
|
||||
]
|
||||
|
||||
|
||||
def clean_column_text(text: str):
    """Collapse every run of whitespace (including newlines) to a single space and trim the ends.

    '  Per  Share  ' -> 'Per Share'
    'Per\\nShare' -> 'Per Share'
    'Per Share' -> 'Per Share'
    """
    # split() with no separator drops leading/trailing whitespace and splits on any run of it
    words = text.split()
    return ' '.join(words)
|
||||
|
||||
|
||||
def compress_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """Drop the rows and columns of a DataFrame that contain nothing but blanks.

    Args:
        df: DataFrame to compress

    Returns:
        A DataFrame without all-empty columns and rows; remaining missing
        values are returned as empty strings
    """
    # Treat empty strings as missing so dropna can see them
    blanks_as_na = df.replace('', pd.NA)
    without_empty_columns = blanks_as_na.dropna(axis=1, how="all")
    without_empty_rows = without_empty_columns.dropna(axis=0, how="all")
    # Turn the remaining missing values back into empty strings
    return without_empty_rows.fillna('')
|
||||
|
||||
|
||||
def repr_df(df: pd.DataFrame, hide_index: bool = True) -> str:
    """Render a DataFrame as text via ``DataFrame.to_string``.

    Args:
        df: DataFrame to render
        hide_index: When true the index column is left out of the output

    Returns:
        The text rendering of the DataFrame
    """
    show_index = not hide_index
    return df.to_string(index=show_index)
|
||||
|
||||
|
||||
@dataclass
class PagingState:
    """Position of a pager: current page number, page size and item count."""
    # 1-based number of the current page
    page: int = 1
    # number of items on a full page
    page_size: int = 50
    # number of items in the data being paged
    total_items: int = 0

    @property
    def start_idx(self) -> int:
        """Offset of the first item on the current page."""
        pages_before = self.page - 1
        return pages_before * self.page_size

    @property
    def end_idx(self) -> int:
        """Exclusive end offset of the current page, capped at ``total_items``."""
        page_end = self.start_idx + self.page_size
        return min(page_end, self.total_items)

    @property
    def has_more(self) -> bool:
        """True when items remain after the current page."""
        return self.total_items > self.end_idx
|
||||
|
||||
|
||||
class DataPager:
    """Serves fixed-size slices ("pages") of a DataFrame or PyArrow table."""

    def __init__(self, data: Union[pd.DataFrame, pa.Table], page_size: int = 50):
        """Set up paging over ``data``.

        Args:
            data: Data to paginate through
            page_size: Number of items per page
        """
        self.data = data
        self.state = PagingState(total_items=len(data), page_size=page_size)

    def get_page(self, page: int = 1) -> Union[pd.DataFrame, pa.Table]:
        """Move to ``page`` and return its slice of the data.

        Args:
            page: Page number to get (1-based)

        Returns:
            ``data[start_idx:end_idx]`` for the requested page
        """
        state = self.state
        state.page = page
        first, last = state.start_idx, state.end_idx
        return self.data[first:last]
|
||||
|
||||
def adjust_column_headers(df: pd.DataFrame):
    """Blank out the column labels when every label is an int.

    All-int labels are what pandas assigns when no header was given. The
    DataFrame is modified in place and also returned.
    """
    for label in df.columns:
        if not isinstance(label, int):
            # At least one real label: leave the headers alone
            return df
    df.columns = [''] * len(df.columns)
    return df
|
||||
|
||||
|
||||
def should_promote_to_header(df: pd.DataFrame) -> bool:
|
||||
if df.shape[0] > 1:
|
||||
first_row = df.iloc[0]
|
||||
|
||||
# Check for uniformity and non-numeric nature
|
||||
if all(isinstance(item, str) for item in first_row):
|
||||
# Pattern matching for typical header keywords
|
||||
header_keywords = {'title', 'name', 'number', 'description', 'date', 'total', 'id'}
|
||||
if any(any(keyword in str(cell).lower() for keyword in header_keywords) for cell in first_row):
|
||||
return True
|
||||
|
||||
# Check distinctiveness compared to the second row (simple heuristic)
|
||||
second_row = df.iloc[1]
|
||||
difference_count = sum(1 for f, s in zip(first_row, second_row, strict=False) if f != s)
|
||||
if difference_count > len(first_row) / 2: # Arbitrary threshold: more than half are different
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
|
||||
def table_html_to_dataframe(html_str: str) -> pd.DataFrame:
    """Convert the first ``<table>`` found in an HTML string into a DataFrame.

    Every ``<tr>`` becomes a row and every ``<td>``/``<th>`` a cell holding the
    cleaned text content. Default integer headers are blanked and all-empty
    rows and columns are removed.

    Raises:
        IndexError: if the HTML contains no ``<table>`` element.
    """
    root = lxml_html.fromstring(html_str)
    first_table = root.xpath("//table")[0]

    records = []
    for tr in first_table.xpath(".//tr"):
        # Handle both 'td' and 'th' cells
        cells = tr.xpath(".//td | .//th")
        records.append([
            clean_column_text(lxml_html.tostring(cell, method='text', encoding='unicode').strip())
            for cell in cells
        ])

    frame = adjust_column_headers(pd.DataFrame(records))
    return compress_dataframe(frame)
|
||||
|
||||
|
||||
def table_tag_to_dataframe(table_tag):
    """Convert a BeautifulSoup table Tag to a DataFrame.

    One row is produced per ``<tr>``; the cells are the cleaned texts of its
    ``<td>`` tags only (``<th>`` cells are not read).
    """
    records = [
        # get_text flattens any nested tags inside the cell
        [clean_column_text(td.get_text(strip=True)) for td in tr.find_all('td')]
        for tr in table_tag.find_all('tr')
    ]
    return pd.DataFrame(records)
|
||||
|
||||
|
||||
def markdown_to_dataframe(markdown_table):
    """Parse a pipe-delimited markdown table into a DataFrame.

    The first line supplies the column names, the second line (the separator
    row) is skipped, and every following non-blank line becomes a data row.
    When there are no data rows the header itself becomes the single row under
    blank column names. All-empty rows and columns are removed from the result.
    """
    lines = markdown_table.split('\n')
    header = [cell.strip() for cell in lines[0].split('|')]

    body = [
        [cell.strip() for cell in line.split('|')]
        for line in lines[2:]
        if line.strip()
    ]

    if body:
        df = pd.DataFrame(body, columns=header)
    else:
        df = pd.DataFrame([header], columns=["" for col in header])
    return compress_dataframe(df)
|
||||
|
||||
|
||||
def dataframe_to_text(df, include_index=False, include_headers=False):
|
||||
"""
|
||||
Convert a Pandas DataFrame to a plain text string, with formatting options for including
|
||||
the index and column headers.
|
||||
|
||||
Parameters:
|
||||
- df (pd.DataFrame): The dataframe to convert
|
||||
- include_index (bool): Whether to include the index in the text output. Defaults to True.
|
||||
- include_headers (bool): Whether to include column headers in the text output. Defaults to True.
|
||||
|
||||
Returns:
|
||||
str: The dataframe converted to a text string.
|
||||
"""
|
||||
# Getting the maximum width for each column
|
||||
column_widths = df.apply(lambda col: col.astype(str).str.len().max())
|
||||
|
||||
# If including indexes, get the maximum width of the index
|
||||
|
||||
index_label = ''
|
||||
if include_index:
|
||||
index_label = "Index"
|
||||
index_width = max(df.index.astype(str).map(len).max(), len(index_label))
|
||||
else:
|
||||
index_width = 0
|
||||
|
||||
# Initialize an empty string to store the text
|
||||
text_output = ""
|
||||
|
||||
# Include column headers if specified
|
||||
if include_headers:
|
||||
# Add index label if specified
|
||||
if include_index:
|
||||
text_output += f"{index_label:<{index_width}}\t"
|
||||
|
||||
# Create and add the header row
|
||||
headers = [f"{col:<{width}}" for col, width in zip(df.columns, column_widths, strict=False)]
|
||||
text_output += '\t'.join(headers) + '\n'
|
||||
|
||||
# Loop through each row of the dataframe
|
||||
for index, row in df.iterrows():
|
||||
# Include index if specified
|
||||
if include_index:
|
||||
text_output += f"{index:<{index_width}}\t"
|
||||
|
||||
# Format each value according to the column width and concatenate
|
||||
row_values = [f"{val:<{width}}" for val, width in zip(row.astype(str), column_widths, strict=False)]
|
||||
text_output += '\t'.join(row_values) + '\n'
|
||||
|
||||
return text_output
|
||||
|
||||
|
||||
def convert_to_numeric(series):
    """Convert a pandas Series to numeric if possible, otherwise return the original series.

    Args:
        series: The Series to convert

    Returns:
        The result of ``pd.to_numeric(series)``, or ``series`` itself (the same
        object) when it cannot be converted.
    """
    try:
        return pd.to_numeric(series)
    except (ValueError, TypeError):
        # ValueError: unparseable strings; TypeError: values such as lists that
        # to_numeric rejects outright. Both mean "not convertible".
        return series
|
||||
|
||||
|
||||
def describe_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """Summarise the dtypes and memory footprint of a DataFrame.

    Returns:
        A DataFrame with one row for the index ('Index'), one per column and a
        final 'Total' row, with the columns 'Data type' and 'Memory Usage (KB)'
        (deep memory usage in kilobytes, rounded to 2 decimals).
    """
    # dtype of the index first, then the dtype of every column
    dtype_column = pd.concat([pd.Series(df.index.dtype, index=['Index']), df.dtypes])

    # Deep memory usage per entry (index included), in KB
    usage_kb = df.memory_usage(deep=True) / 1024
    usage_kb.index = usage_kb.index.astype(str)  # Ensure index labels are string type
    usage_kb = usage_kb.round(2)

    summary = pd.DataFrame(
        {
            'Data type': dtype_column.to_numpy(),
            'Memory Usage (KB)': usage_kb.to_numpy(),
        },
        index=dtype_column.index,
    )

    # The total is the sum of the rounded per-entry values
    total = pd.DataFrame(
        {
            'Data type': [''],
            'Memory Usage (KB)': [usage_kb.sum()],
        },
        index=['Total'],
    )
    return pd.concat([summary, total])
|
||||
|
||||
|
||||
def convert_to_pyarrow_backend(data:pd.DataFrame):
|
||||
# Convert dtypes carefully
|
||||
for col in data.columns:
|
||||
if data[col].dtype == 'object':
|
||||
# For object columns, convert to string
|
||||
data[col] = data[col].astype(str)
|
||||
elif data[col].dtype == 'float64':
|
||||
# For float columns, use float32 to match PyArrow's default
|
||||
data[col] = data[col].astype('float32')
|
||||
|
||||
# Now convert to PyArrow
|
||||
return data.convert_dtypes(dtype_backend="pyarrow")
|
||||
|
||||
|
||||
def replace_all_na_with_empty(df_or_series):
    """Replace columns (or a Series) that are entirely NA with empty strings.

    Args:
        df_or_series: A DataFrame or a Series

    Returns:
        - DataFrame input: the same DataFrame, modified in place, where every
          all-NA column now holds '' in every row
        - Series input: a new Series of '' with the same index and name when the
          input is all NA, otherwise the input Series itself
        - any other input: None
    """
    if isinstance(df_or_series, pd.DataFrame):
        for column in df_or_series.columns:
            # Check if the column is all NA or None
            if df_or_series[column].isna().all():
                # Build the replacement on the frame's own index: column assignment
                # aligns on index labels, so a default RangeIndex would leave NaN
                # behind for any frame that is not range-indexed.
                df_or_series[column] = pd.Series('', index=df_or_series.index, name=column)
        return df_or_series
    elif isinstance(df_or_series, pd.Series):
        # Check if the series is all NA or None
        if df_or_series.isna().all():
            # Create a new Series of empty strings with the same index and name
            return pd.Series('', index=df_or_series.index, name=df_or_series.name)
        # If not all NA, return the original series
        return df_or_series
|
||||
|
||||
def na_value(value, default_value: object = ''):
    """Return ``default_value`` when ``pd.isna(value)`` is true, otherwise ``value`` unchanged."""
    return default_value if pd.isna(value) else value
|
||||
|
||||
|
||||
def drop_duplicates_pyarrow(table, column_name, keep='first'):
    """
    Drop duplicates from a PyArrow Table based on a specified column.

    Row order is preserved: the result is the input filtered by a boolean mask.

    Parameters:
    - table (pa.Table): The input PyArrow Table
    - column_name (str): The column to check for duplicates
    - keep (str): 'first' to keep first occurrence, 'last' to keep last occurrence

    Returns:
    - pa.Table: A new table with duplicates removed

    Raises:
    - ValueError: if the column does not exist or ``keep`` is not 'first'/'last'
    """
    if column_name not in table.column_names:
        raise ValueError(f"Column '{column_name}' not found in table")

    if keep not in ['first', 'last']:
        raise ValueError("Parameter 'keep' must be 'first' or 'last'")

    # np.unique sorts the values, so they must be mutually comparable
    # NOTE(review): behaviour with null values in the column is not handled here - confirm
    values = table[column_name].to_numpy()

    if keep == 'first':
        # return_index gives the first occurrence of every distinct value
        _, keep_indices = np.unique(values, return_index=True)
    else:  # keep == 'last'
        # First occurrence in the reversed array == last occurrence in the original
        _, reversed_first = np.unique(values[::-1], return_index=True)
        keep_indices = len(values) - 1 - reversed_first

    # A boolean mask is order-independent, so the indices need no sorting
    mask = np.zeros(len(table), dtype=bool)
    mask[keep_indices] = True

    # Filter the table using the mask
    return table.filter(pa.array(mask))
|
||||
76
venv/lib/python3.10/site-packages/edgar/dates.py
Normal file
76
venv/lib/python3.10/site-packages/edgar/dates.py
Normal file
@@ -0,0 +1,76 @@
|
||||
import datetime
|
||||
from typing import Optional, Tuple
|
||||
|
||||
__all__ = [
|
||||
"extract_dates",
|
||||
"InvalidDateException"
|
||||
]
|
||||
|
||||
class InvalidDateException(Exception):
    """Raised when a date or date-range string cannot be interpreted."""

    def __init__(self, message: str):
        # The message is the exception's only argument
        Exception.__init__(self, message)
|
||||
|
||||
def extract_dates(date_str: str) -> Tuple[Optional[datetime.datetime], Optional[datetime.datetime], bool]:
    """
    Parse a single date or a ':'-separated date range.

    Examples:
        extract_dates("2022-03-04") -> 2022-03-04, None, False
        extract_dates("2022-03-04:2022-04-05") -> 2022-03-04, 2022-04-05, True
        extract_dates("2022-03-04:") -> 2022-03-04, <current_date>, True
        extract_dates(":2022-03-04") -> 1994-07-01, 2022-03-04, True

    Args:
        date_str: Date string in YYYY-MM-DD format, optionally with a range separator ':'

    Returns:
        Tuple of (start_date, end_date, is_range). The dates are datetime objects;
        end_date is None for a single date, and an open-ended range ends at
        midnight of the current day. is_range tells whether a ':' was present.

    Raises:
        InvalidDateException: If the string is empty, has more than one ':',
            cannot be parsed, or the start date is after the end date
    """
    if not date_str:
        raise InvalidDateException("Empty date string provided")

    date_format = "%Y-%m-%d"
    try:
        is_range = ':' in date_str
        if not is_range:
            # Single date: no end date and no ordering check
            return datetime.datetime.strptime(date_str, date_format), None, is_range

        pieces = date_str.split(':')
        if len(pieces) != 2:
            raise InvalidDateException("Invalid date range format")
        start_text, end_text = pieces

        # A missing start falls back to 1994-07-01
        if start_text:
            start_date = datetime.datetime.strptime(start_text, date_format)
        else:
            start_date = datetime.datetime.strptime('1994-07-01', '%Y-%m-%d')

        # A missing end falls back to today at midnight
        if end_text:
            end_date = datetime.datetime.strptime(end_text, date_format)
        else:
            end_date = datetime.datetime.now().replace(hour=0, minute=0, second=0, microsecond=0)

        if start_date > end_date:
            raise InvalidDateException(
                f"Invalid date range: start date ({start_date.date()}) "
                f"cannot be after end date ({end_date.date()})"
            )

        return start_date, end_date, is_range

    except ValueError as e:
        raise InvalidDateException(f"""
Cannot extract a date or date range from string {date_str}
Provide either
1. A date in the format "YYYY-MM-DD" e.g. "2022-10-27"
2. A date range in the format "YYYY-MM-DD:YYYY-MM-DD" e.g. "2022-10-01:2022-10-27"
3. A partial date range "YYYY-MM-DD:" to specify dates after the value e.g. "2022-10-01:"
4. A partial date range ":YYYY-MM-DD" to specify dates before the value e.g. ":2022-10-27"
""") from e
|
||||
263
venv/lib/python3.10/site-packages/edgar/docs/Filing.md
Normal file
263
venv/lib/python3.10/site-packages/edgar/docs/Filing.md
Normal file
@@ -0,0 +1,263 @@
|
||||
# Filing Class Documentation
|
||||
|
||||
## Overview
|
||||
|
||||
The `Filing` class is the core object in edgartools for working with individual SEC filings. It provides comprehensive access to filing content, metadata, documents, and related functionality, making it easy to analyze and extract data from SEC filings.
|
||||
|
||||
## Common Actions
|
||||
|
||||
Quick reference for the most frequently used Filing methods:
|
||||
|
||||
### Access Filing Content
|
||||
```python
|
||||
# Get HTML content
|
||||
html = filing.html()
|
||||
|
||||
# Get plain text
|
||||
text = filing.text()
|
||||
|
||||
# Get markdown formatted content
|
||||
markdown = filing.markdown()
|
||||
```
|
||||
|
||||
### Get Structured Data
|
||||
```python
|
||||
# Get form-specific object (10-K, 10-Q, 8-K, etc.)
|
||||
report = filing.obj()
|
||||
|
||||
# Get XBRL financial data
|
||||
xbrl = filing.xbrl()
|
||||
```
|
||||
|
||||
### View in Browser
|
||||
```python
|
||||
# Open filing in web browser
|
||||
filing.open()
|
||||
```
|
||||
|
||||
### Get Attachments
|
||||
```python
|
||||
# Access all filing attachments
|
||||
attachments = filing.attachments
|
||||
```
|
||||
|
||||
## Constructor
|
||||
|
||||
```python
|
||||
Filing(cik: int, company: str, form: str, filing_date: str, accession_no: str)
|
||||
```
|
||||
|
||||
**Parameters:**
|
||||
- `cik`: Company's Central Index Key (integer)
|
||||
- `company`: Company name (string)
|
||||
- `form`: SEC form type (e.g., "10-K", "8-K", "DEF 14A")
|
||||
- `filing_date`: Date of filing (YYYY-MM-DD format)
|
||||
- `accession_no`: Unique SEC accession number
|
||||
|
||||
## Core Properties
|
||||
|
||||
| Property | Type | Description |
|
||||
|----------|------|-------------|
|
||||
| `cik` | int | Company's Central Index Key |
|
||||
| `company` | str | Company name |
|
||||
| `form` | str | SEC form type |
|
||||
| `filing_date` | str | Filing date |
|
||||
| `accession_no` | str | SEC accession number |
|
||||
| `accession_number` | str | Alias for accession_no |
|
||||
|
||||
## Document Access
|
||||
|
||||
### Primary Documents
|
||||
- **`document`** - The primary display document (HTML/XHTML)
|
||||
- **`primary_documents`** - List of all primary documents
|
||||
- **`attachments`** - All filing attachments
|
||||
- **`exhibits`** - Filing exhibits
|
||||
|
||||
### Content Formats
|
||||
- **`html()`** - HTML content of the primary document
|
||||
- **`xml()`** - XML content of the primary document
|
||||
- **`text()`** - Plain text version of the document
|
||||
- **`markdown()`** - Markdown formatted version
|
||||
|
||||
## Financial Data Access
|
||||
|
||||
### XBRL Data
|
||||
```python
|
||||
# Access structured financial data
|
||||
filing.xbrl() # Returns XBRLInstance with financial statements
|
||||
filing.statements # Direct access to financial statements
|
||||
```
|
||||
|
||||
### SGML Data
|
||||
```python
|
||||
# Access SGML filing data
|
||||
filing.sgml() # Returns SGMLFiling object
|
||||
```
|
||||
|
||||
## Navigation & URLs
|
||||
|
||||
| Property/Method | Description |
|
||||
|----------------|-------------|
|
||||
| `homepage` | Filing homepage information |
|
||||
| `homepage_url` | URL to the filing homepage |
|
||||
| `filing_url` | URL to the main filing document |
|
||||
| `text_url` | URL to the text version |
|
||||
| `base_dir` | Base directory URL for the filing |
|
||||
|
||||
## Search & Analysis
|
||||
|
||||
### Content Search
|
||||
```python
|
||||
# Search filing content
|
||||
results = filing.search("revenue recognition", regex=False)
|
||||
|
||||
# Search with regex
|
||||
results = filing.search(r"\b\d+\.\d+%", regex=True)
|
||||
```
|
||||
|
||||
### Document Structure
|
||||
- **`sections()`** - Get HTML sections for advanced search
|
||||
- **`period_of_report`** - Get the reporting period
|
||||
|
||||
## Entity Relationships
|
||||
|
||||
### Company Integration
|
||||
```python
|
||||
# Get the associated Company object
|
||||
company = filing.get_entity()
|
||||
|
||||
# Convert to company filing with additional data
|
||||
company_filing = filing.as_company_filing()
|
||||
|
||||
# Find related filings
|
||||
related = filing.related_filings()
|
||||
```
|
||||
|
||||
## Display & Interaction
|
||||
|
||||
### Console Display
|
||||
```python
|
||||
# Rich console display
|
||||
filing.view() # Display in console with rich formatting
|
||||
|
||||
# String representations
|
||||
str(filing) # Concise string representation
|
||||
repr(filing) # Detailed representation
|
||||
```
|
||||
|
||||
### Browser Integration
|
||||
```python
|
||||
# Open filing in web browser
|
||||
filing.open() # Open main document
|
||||
filing.open_homepage() # Open filing homepage
|
||||
|
||||
# Serve filing locally
|
||||
filing.serve(port=8000) # Serve on localhost:8000
|
||||
```
|
||||
|
||||
## Data Export & Persistence
|
||||
|
||||
### Export Formats
|
||||
```python
|
||||
# Convert to different formats
|
||||
filing_dict = filing.to_dict() # Dictionary
|
||||
filing_df = filing.to_pandas() # DataFrame
|
||||
summary_df = filing.summary() # Summary DataFrame
|
||||
```
|
||||
|
||||
### Save & Load
|
||||
```python
|
||||
# Save filing for later use
|
||||
filing.save("my_filing.pkl") # Save to file
|
||||
filing.save("/path/to/directory/") # Save to directory
|
||||
|
||||
# Load saved filing
|
||||
loaded_filing = Filing.load("my_filing.pkl")
|
||||
```
|
||||
|
||||
## Class Methods
|
||||
|
||||
### Alternative Constructors
|
||||
```python
|
||||
# Create from dictionary
|
||||
filing = Filing.from_dict(data_dict)
|
||||
|
||||
# Create from JSON file
|
||||
filing = Filing.from_json("filing_data.json")
|
||||
|
||||
# Create from SGML data
|
||||
filing = Filing.from_sgml(sgml_source)
|
||||
```
|
||||
|
||||
## Common Usage Patterns
|
||||
|
||||
### Basic Filing Analysis
|
||||
```python
|
||||
# Get a filing and explore its content
|
||||
filing = company.get_filings(form="10-K").latest()
|
||||
|
||||
# Access financial statements
|
||||
statements = filing.xbrl()
|
||||
income_statement = statements.income_statement
|
||||
|
||||
# Search for specific content
|
||||
results = filing.search("risk factors")
|
||||
|
||||
# View in browser
|
||||
filing.open()
|
||||
```
|
||||
|
||||
### Working with Attachments
|
||||
```python
|
||||
# Get all attachments
|
||||
attachments = filing.attachments
|
||||
|
||||
# Find specific exhibits
|
||||
exhibits = filing.exhibits
|
||||
exhibit_99_1 = [ex for ex in exhibits if "99.1" in ex.description]
|
||||
|
||||
# Access exhibit content
|
||||
if exhibit_99_1:
|
||||
content = exhibit_99_1[0].html()
|
||||
```
|
||||
|
||||
### Financial Data Extraction
|
||||
```python
|
||||
# Get financial statements
|
||||
xbrl = filing.xbrl()
|
||||
|
||||
# Access different statement types
|
||||
balance_sheet = xbrl.balance_sheet
|
||||
income_statement = xbrl.income_statement
|
||||
cash_flow = xbrl.cash_flow_statement
|
||||
|
||||
# Get specific facts
|
||||
revenue = xbrl.get_facts("Revenues")
|
||||
```
|
||||
|
||||
## Error Handling
|
||||
|
||||
The Filing class handles various edge cases gracefully:
|
||||
|
||||
- **Missing documents**: Returns None or empty collections
|
||||
- **Network errors**: Raises appropriate HTTP exceptions
|
||||
- **Malformed data**: Provides informative error messages
|
||||
- **File access**: Handles permissions and missing files
|
||||
|
||||
## Integration with Other Classes
|
||||
|
||||
The Filing class works seamlessly with other edgartools components:
|
||||
|
||||
- **Company**: Get filings from companies, convert back to company context
|
||||
- **Filings**: Part of filing collections with filtering and search
|
||||
- **XBRLInstance**: Access structured financial data
|
||||
- **Attachments**: Work with filing documents and exhibits
|
||||
|
||||
## Performance Considerations
|
||||
|
||||
- **Lazy loading**: Documents and data are loaded only when accessed
|
||||
- **Caching**: Network requests are cached to improve performance
|
||||
- **Streaming**: Large documents can be processed in chunks
|
||||
- **Async support**: Some operations support asynchronous execution
|
||||
|
||||
This comprehensive API makes the Filing class the primary interface for working with SEC filing data in edgartools.
|
||||
302
venv/lib/python3.10/site-packages/edgar/docs/Filings.md
Normal file
302
venv/lib/python3.10/site-packages/edgar/docs/Filings.md
Normal file
@@ -0,0 +1,302 @@
|
||||
# Filings Class Documentation
|
||||
|
||||
## Overview
|
||||
|
||||
The `Filings` class is a powerful container for SEC filing data that provides comprehensive functionality for filtering, searching, pagination, and data manipulation. It's built on PyArrow tables for efficient processing of large datasets and offers an intuitive interface for working with collections of SEC filings.
|
||||
|
||||
## Getting Filings
|
||||
|
||||
```python
|
||||
filings = get_filings()
|
||||
```
|
||||
- **Parameters**:
|
||||
- `year`: Year of filings (optional)
|
||||
- `quarter`: Quarter of filings (optional)
|
||||
- `amendments`: Include amended filings (default: True)
|
||||
- `ticker`: Company ticker symbol (optional)
|
||||
- `filing_date`: Date or date range for filtering (optional)
|
||||
|
||||
## Core Properties
|
||||
|
||||
| Property | Type | Description |
|
||||
|----------|------|-------------|
|
||||
| `data` | pa.Table | PyArrow table with filing information |
|
||||
| `date_range` | Tuple[str, str] | Start and end dates of filings |
|
||||
| `start_date` | str | Earliest filing date in collection |
|
||||
| `end_date` | str | Latest filing date in collection |
|
||||
| `empty` | bool | True if collection contains no filings |
|
||||
| `summary` | str | Description of current page/total filings |
|
||||
|
||||
## Data Access & Conversion
|
||||
|
||||
### DataFrame Conversion
|
||||
```python
|
||||
# Convert to pandas DataFrame
|
||||
df = filings.to_pandas() # All columns
|
||||
df = filings.to_pandas('form', 'company') # Specific columns
|
||||
```
|
||||
|
||||
### Individual Filing Access
|
||||
```python
|
||||
# Get filing by index
|
||||
filing = filings.get_filing_at(0) # First filing
|
||||
filing = filings[0] # Alternative syntax
|
||||
|
||||
# Get filing by accession number
|
||||
filing = filings.get("0000320193-23-000077")
|
||||
|
||||
# Get filing by index or accession
|
||||
filing = filings.get(5) # By index
|
||||
filing = filings.get("0000320193-23-000077") # By accession
|
||||
```
|
||||
|
||||
### Export & Persistence
|
||||
```python
|
||||
# Save as Parquet file
|
||||
filings.save_parquet("filings_data.parquet")
|
||||
filings.save("filings_data.parquet") # Alternative
|
||||
|
||||
# Convert to dictionary
|
||||
data_dict = filings.to_dict(max_rows=1000)
|
||||
```
|
||||
|
||||
## Filtering & Search
|
||||
|
||||
### Form-based Filtering
|
||||
```python
|
||||
# Single form type
|
||||
filings.filter(form="10-K")
|
||||
filings.filter(form="8-K")
|
||||
|
||||
# Multiple form types
|
||||
filings.filter(form=["10-K", "10-Q"])
|
||||
filings.filter(form=["8-K", "DEF 14A"])
|
||||
|
||||
# Include/exclude amendments
|
||||
filings.filter(form="10-K", amendments=True) # Include amendments
|
||||
filings.filter(form="10-K", amendments=False) # Exclude amendments
|
||||
```
|
||||
|
||||
### Date Filtering
|
||||
```python
|
||||
# Specific date
|
||||
filings.filter(date="2023-06-15")
|
||||
filings.filter(filing_date="2023-06-15") # Alternative
|
||||
|
||||
# Date ranges
|
||||
filings.filter(date="2023-01-01:2023-03-31") # Between dates
|
||||
filings.filter(date="2023-01-01:") # From date onwards
|
||||
filings.filter(date=":2023-03-31") # Up to date
|
||||
```
|
||||
|
||||
### Company-based Filtering
|
||||
```python
|
||||
# By CIK (Central Index Key)
|
||||
filings.filter(cik=320193) # Single CIK
|
||||
filings.filter(cik=[320193, 789019]) # Multiple CIKs
|
||||
|
||||
# By ticker symbol
|
||||
filings.filter(ticker="AAPL")
|
||||
filings.filter(ticker=["AAPL", "MSFT"])
|
||||
|
||||
# By exchange
|
||||
filings.filter(exchange="NASDAQ")
|
||||
filings.filter(exchange=["NYSE", "NASDAQ"])
|
||||
|
||||
# By accession number
|
||||
filings.filter(accession_number="0000320193-23-000077")
|
||||
```
|
||||
|
||||
### Company Search
|
||||
```python
|
||||
# Search for company and filter
|
||||
apple_filings = filings.find("Apple")
|
||||
microsoft_filings = filings.find("Microsoft Corporation")
|
||||
```
|
||||
|
||||
### Combined Filtering
|
||||
```python
|
||||
# Complex filtering example
|
||||
filtered = filings.filter(
|
||||
form=["10-K", "10-Q"],
|
||||
date="2023-01-01:2023-12-31",
|
||||
ticker=["AAPL", "MSFT", "GOOGL"],
|
||||
amendments=False
|
||||
)
|
||||
```
|
||||
|
||||
## Data Selection & Sampling
|
||||
|
||||
### Latest Filings
|
||||
```python
|
||||
# Get most recent filings
|
||||
latest_filing = filings.latest() # Most recent (default n=1)
|
||||
latest_five = filings.latest(5) # Most recent 5
|
||||
```
|
||||
|
||||
### Head & Tail
|
||||
```python
|
||||
# Get first/last n filings
|
||||
first_ten = filings.head(10) # First 10 filings
|
||||
last_ten = filings.tail(10) # Last 10 filings
|
||||
```
|
||||
|
||||
### Random Sampling
|
||||
```python
|
||||
# Get random sample
|
||||
sample = filings.sample(20) # Random 20 filings
|
||||
```
|
||||
|
||||
## Pagination
|
||||
|
||||
### Navigation
|
||||
```python
|
||||
# Navigate through pages
|
||||
current_page = filings.current() # Current page info
|
||||
next_page = filings.next() # Next page
|
||||
prev_page = filings.previous() # Previous page
|
||||
```
|
||||
|
||||
### Page Information
|
||||
```python
|
||||
# Check pagination status
|
||||
print(filings.summary) # "Page 1 of 50 (total: 12,543 filings)"
|
||||
is_empty = filings.empty # Check if no results
|
||||
```
|
||||
|
||||
## File Operations
|
||||
|
||||
### Download Filings
|
||||
```python
|
||||
# Download all filings in collection
|
||||
filings.download() # Download to default directory
|
||||
filings.download("./my_filings/") # Download to specific directory
|
||||
```
|
||||
|
||||
## Integration with Other Classes
|
||||
|
||||
### Filing Objects
|
||||
```python
|
||||
# Each item returns a Filing object
|
||||
for filing in filings:
|
||||
print(f"Form: {filing.form}")
|
||||
print(f"Company: {filing.company}")
|
||||
print(f"Date: {filing.filing_date}")
|
||||
|
||||
# Access filing content
|
||||
html_content = filing.html()
|
||||
attachments = filing.attachments
|
||||
xbrl_data = filing.xbrl()
|
||||
```
|
||||
|
||||
### Company Integration
|
||||
```python
|
||||
# Convert filing to company context
|
||||
filing = filings[0]
|
||||
company = filing.get_entity() # Get Company object
|
||||
company_filing = filing.as_company_filing() # Enhanced filing with company data
|
||||
```
|
||||
|
||||
## Rich Console Display
|
||||
|
||||
The Filings class provides formatted console output showing:
|
||||
- Filing table with Form, CIK, Ticker, Company, Filing Date, Accession Number
|
||||
- Pagination information
|
||||
- Navigation hints
|
||||
|
||||
```python
|
||||
# Display in console
|
||||
print(filings) # Rich formatted table
|
||||
filings.view() # Alternative display method
|
||||
```
|
||||
|
||||
## Common Usage Patterns
|
||||
|
||||
### Quarterly Filing Analysis
|
||||
```python
|
||||
# Get all 10-K filings for 2023
|
||||
annual_reports = get_filings(2023).filter(form="10-K", amendments=False)
|
||||
|
||||
# Find latest 10-Q for major tech companies
|
||||
tech_quarterlies = get_filings(2023, 4).filter(
|
||||
form="10-Q",
|
||||
ticker=["AAPL", "MSFT", "GOOGL", "TSLA"]
|
||||
).latest(4)
|
||||
```
|
||||
|
||||
### Company-Specific Research
|
||||
```python
|
||||
# Get all Apple filings from Q1 2023
|
||||
apple_filings = get_filings(2023, 1).find("Apple Inc")
|
||||
|
||||
# Filter for specific forms
|
||||
apple_major_filings = apple_filings.filter(
|
||||
form=["10-K", "10-Q", "8-K"],
|
||||
amendments=False
|
||||
)
|
||||
```
|
||||
|
||||
### Event-Driven Analysis
|
||||
```python
|
||||
# Find 8-K filings around specific dates
|
||||
event_filings = get_filings(2023, 2).filter(
|
||||
form="8-K",
|
||||
date="2023-02-01:2023-02-28"
|
||||
)
|
||||
|
||||
# Sample for analysis
|
||||
sample_events = event_filings.sample(50)
|
||||
```
|
||||
|
||||
### Bulk Data Processing
|
||||
```python
|
||||
# Get large dataset and save for later
|
||||
all_2023_filings = get_filings(2023)
|
||||
all_2023_filings.save_parquet("2023_filings.parquet")
|
||||
|
||||
# Convert to pandas for analysis
|
||||
df = all_2023_filings.to_pandas('form', 'company', 'filing_date')
|
||||
```
|
||||
|
||||
## Performance Considerations
|
||||
|
||||
- **PyArrow Backend**: Efficient columnar data processing
|
||||
- **Lazy Evaluation**: Filters are applied efficiently without loading full documents
|
||||
- **Pagination**: Large datasets are handled through pagination
|
||||
- **Caching**: Network requests are cached for improved performance
|
||||
- **Parallel Processing**: Some operations support concurrent execution
|
||||
|
||||
## Error Handling
|
||||
|
||||
The Filings class handles various scenarios gracefully:
|
||||
|
||||
- **Empty Results**: Returns empty Filings object with `empty=True`
|
||||
- **Invalid Filters**: Raises informative ValueError with guidance
|
||||
- **Network Issues**: Propagates HTTP errors with context
|
||||
- **Data Type Mismatches**: Automatic type conversion where possible
|
||||
|
||||
## Method Chaining
|
||||
|
||||
Most filtering and selection methods return new Filings objects, enabling method chaining:
|
||||
|
||||
```python
|
||||
# Chain multiple operations
|
||||
result = (filings
|
||||
.filter(form=["10-K", "10-Q"])
|
||||
.filter(date="2023-01-01:2023-06-30")
|
||||
.filter(amendments=False)
|
||||
.latest(10))
|
||||
```
|
||||
|
||||
## Schema Information
|
||||
|
||||
The underlying PyArrow table contains these key columns:
|
||||
- `form`: SEC form type
|
||||
- `cik`: Company Central Index Key
|
||||
- `ticker`: Stock ticker symbol
|
||||
- `company`: Company name
|
||||
- `filing_date`: Date of filing
|
||||
- `accession_number`: Unique SEC identifier
|
||||
- Additional metadata columns for enhanced functionality
|
||||
|
||||
This comprehensive API makes the Filings class the primary interface for working with collections of SEC filing data in edgartools, providing both power and ease of use for financial data analysis.
|
||||
@@ -0,0 +1,49 @@
|
||||
"""
|
||||
EdgarTools HTML Parser v2.0
|
||||
|
||||
A high-performance, semantically-aware HTML parser for SEC filings.
|
||||
"""
|
||||
|
||||
from edgar.documents.parser import HTMLParser
|
||||
from edgar.documents.document import Document
|
||||
from edgar.documents.config import ParserConfig
|
||||
from edgar.documents.exceptions import ParsingError
|
||||
from edgar.documents.types import NodeType, SemanticType, TableType
|
||||
from edgar.documents.search import DocumentSearch, SearchResult, SearchMode
|
||||
from edgar.documents.renderers import MarkdownRenderer, TextRenderer
|
||||
|
||||
__version__ = "2.0.0"
|
||||
__all__ = [
|
||||
'HTMLParser',
|
||||
'Document',
|
||||
'ParserConfig',
|
||||
'ParsingError',
|
||||
'NodeType',
|
||||
'SemanticType',
|
||||
'TableType',
|
||||
'DocumentSearch',
|
||||
'SearchResult',
|
||||
'SearchMode',
|
||||
'MarkdownRenderer',
|
||||
'TextRenderer',
|
||||
'parse_html'
|
||||
]
|
||||
|
||||
|
||||
def parse_html(html: str, config: ParserConfig = None) -> Document:
    """
    Parse an HTML string into a Document in a single call.

    An HTMLParser is built from ``config``; when ``config`` is omitted (or is
    otherwise falsy) a default-constructed ParserConfig is used instead. The
    return value is whatever that parser's ``parse`` method yields for
    ``html``.

    Args:
        html: HTML content to parse
        config: Optional parser configuration

    Returns:
        Parsed Document object

    Example:
        >>> document = parse_html(html_content)
        >>> print(document.text()[:100])
    """
    # Fall back to the default configuration when the caller supplied none.
    if not config:
        config = ParserConfig()
    return HTMLParser(config).parse(html)
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,83 @@
|
||||
"""
|
||||
Mixin class providing text caching functionality for document nodes.
|
||||
|
||||
This module consolidates the text caching pattern used across multiple node types
|
||||
(DocumentNode, ParagraphNode, ContainerNode, TableNode, and Document).
|
||||
"""
|
||||
|
||||
from typing import Callable, Any
|
||||
|
||||
|
||||
class CacheableMixin:
    """
    Mixin that memoises a node's generated text on the instance.

    The cached value lives in the ``_text_cache`` instance attribute:

    1. ``_get_cached_text`` returns the stored value when one is present
       (attribute exists and is not None).
    2. Otherwise it calls the supplied generator, stores the result and
       returns it.
    3. ``clear_text_cache`` resets the stored value and cascades to any
       ``children`` that also expose ``clear_text_cache``.

    Usage:
        class MyNode(CacheableMixin):
            def text(self, **kwargs):
                def generator():
                    # Generate text logic here
                    return "generated text"
                return self._get_cached_text(generator)
    """

    def _get_cached_text(self, generator_func: Callable[[], Any], *args, **kwargs) -> Any:
        """
        Return the cached text, generating and storing it on a cache miss.

        Args:
            generator_func: Callable invoked to produce the text on a miss
            *args: Positional arguments forwarded to ``generator_func``
            **kwargs: Keyword arguments forwarded to ``generator_func``

        Returns:
            The cached or newly generated text

        Note:
            The cache is a single slot and is not keyed by the forwarded
            arguments: once a non-None value is stored it is returned for
            every later call until ``clear_text_cache`` runs. A generator
            result of None is stored but counts as a miss next time.
        """
        cached = getattr(self, '_text_cache', None)
        if cached is None:
            # Miss: either no cache attribute yet, or it was cleared to None.
            cached = generator_func(*args, **kwargs)
            self._text_cache = cached
        return cached

    def clear_text_cache(self) -> None:
        """
        Invalidate this node's cached text and that of its descendants.

        The node's own ``_text_cache`` is reset to None only when the
        attribute already exists, so calling this on a node that never
        cached anything does not create the attribute. Each entry of
        ``children`` (when the node has that attribute) is then asked to
        clear itself; children without a ``clear_text_cache`` attribute are
        skipped.
        """
        if hasattr(self, '_text_cache'):
            self._text_cache = None

        # Nodes without a 'children' attribute simply have nothing to cascade to.
        for child in getattr(self, 'children', ()):
            if not hasattr(child, 'clear_text_cache'):
                continue
            child.clear_text_cache()
|
||||
211
venv/lib/python3.10/site-packages/edgar/documents/config.py
Normal file
211
venv/lib/python3.10/site-packages/edgar/documents/config.py
Normal file
@@ -0,0 +1,211 @@
|
||||
"""
|
||||
Configuration for the HTML parser.
|
||||
"""
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from typing import List, Dict, Any, Optional
|
||||
|
||||
|
||||
@dataclass
class DetectionThresholds:
    """
    Tunable confidence settings used by the section detection strategies.

    All values have defaults, so ``DetectionThresholds()`` yields the stock
    configuration. Each instance receives its own ``thresholds_by_form``
    dictionary.
    """
    # Minimum confidence score for a section to be included (0.0-1.0).
    min_confidence: float = 0.6
    # Multiplier applied when multiple detection methods agree (>1.0).
    cross_validation_boost: float = 1.2
    # Multiplier applied when detection methods disagree (<1.0).
    disagreement_penalty: float = 0.8
    # Multiplier applied to overlapping sections (<1.0).
    boundary_overlap_penalty: float = 0.9
    # Cross-validation is slower but more accurate; off by default for performance.
    enable_cross_validation: bool = False
    # Filing-specific threshold overrides.
    thresholds_by_form: Dict[str, Dict[str, float]] = field(default_factory=dict)
|
||||
|
||||
|
||||
@dataclass
class ParserConfig:
    """
    Settings bundle consumed by the HTML parser.

    Fields are grouped below by concern: performance limits, parsing
    switches, AI-oriented output, table handling, section detection, feature
    flags, header detection, text extraction and performance monitoring.
    Three presets are offered as alternate constructors:
    ``for_performance()``, ``for_accuracy()`` and ``for_ai()``.

    Attributes:
        max_document_size: Maximum document size in bytes
        streaming_threshold: Document size threshold for streaming mode
        cache_size: Maximum number of cached items
        enable_parallel: Enable parallel processing for tables
        max_workers: Worker count; None means use the CPU count
        strict_mode: Fail on parsing errors vs. best effort
        extract_xbrl: Extract inline XBRL facts
        extract_styles: Extract and process CSS styles
        preserve_whitespace: Preserve original whitespace
        optimize_for_ai: Enable AI-specific optimizations
        max_token_estimation: Maximum estimated tokens for AI optimization
        form: Filing form (e.g. '10-K'); required for section detection
        detection_thresholds: Confidence thresholds for section detection
        section_patterns: Regex patterns keyed by section name
        features: Feature flags for optional functionality
    """

    # --- Performance settings ---
    max_document_size: int = 100 * 1024 * 1024  # 100MB (handles large filings like JPM)
    streaming_threshold: int = 10 * 1024 * 1024  # 10MB
    cache_size: int = 1000
    enable_parallel: bool = True
    max_workers: Optional[int] = None  # None = use CPU count

    # --- Parsing settings ---
    strict_mode: bool = False
    extract_xbrl: bool = True
    extract_styles: bool = True
    preserve_whitespace: bool = False
    normalize_text: bool = True
    extract_links: bool = True
    extract_images: bool = False

    # --- AI optimization ---
    optimize_for_ai: bool = True
    max_token_estimation: int = 100_000
    chunk_size: int = 512
    chunk_overlap: int = 128

    # --- Table processing ---
    table_extraction: bool = True
    detect_table_types: bool = True
    extract_table_relationships: bool = True
    # Fast renderer is now production-ready (7-10x faster than Rich)
    fast_table_rendering: bool = True

    # --- Section detection ---
    detect_sections: bool = True
    # Extract sections during parsing vs. on first access (default: lazy)
    eager_section_extraction: bool = False
    # Required for section detection (e.g. '10-K', '10-Q', '8-K')
    form: Optional[str] = None
    detection_thresholds: DetectionThresholds = field(default_factory=DetectionThresholds)
    # Each section name maps to its candidate regex patterns, in the order listed.
    section_patterns: Dict[str, List[str]] = field(default_factory=lambda: {
        'business': [
            r'item\s+1\.?\s*business',
            r'business\s+overview',
            r'our\s+business',
        ],
        'risk_factors': [
            r'item\s+1a\.?\s*risk\s+factors',
            r'risk\s+factors',
            r'factors\s+that\s+may\s+affect',
        ],
        'properties': [
            r'item\s+2\.?\s*properties',
            r'properties',
        ],
        'legal_proceedings': [
            r'item\s+3\.?\s*legal\s+proceedings',
            r'legal\s+proceedings',
            r'litigation',
        ],
        'mda': [
            r'item\s+7\.?\s*management\'?s?\s+discussion',
            r'md&a',
            r'management\'?s?\s+discussion\s+and\s+analysis',
        ],
        'financial_statements': [
            r'item\s+8\.?\s*financial\s+statements',
            r'consolidated\s+financial\s+statements',
            r'financial\s+statements',
        ],
    })

    # --- Feature flags (every flag enabled by default) ---
    features: Dict[str, bool] = field(default_factory=lambda: dict.fromkeys(
        (
            'ml_header_detection',
            'semantic_analysis',
            'table_understanding',
            'xbrl_validation',
            'auto_section_detection',
            'smart_text_extraction',
            'footnote_linking',
            'cross_reference_resolution',
        ),
        True,
    ))

    # --- Header detection settings ---
    header_detection_threshold: float = 0.6  # Minimum confidence
    header_detection_methods: List[str] = field(
        default_factory=lambda: ['style', 'pattern', 'structural', 'contextual']
    )

    # --- Text extraction settings ---
    min_text_length: int = 10  # Minimum text length to keep
    merge_adjacent_nodes: bool = True
    merge_distance: int = 2  # Max distance between nodes to merge

    # --- Performance monitoring ---
    enable_profiling: bool = False
    log_performance: bool = False

    def to_dict(self) -> Dict[str, Any]:
        """
        Convert configuration to dictionary.

        Only a fixed subset of the fields is exported (the names listed
        below plus ``features``); ``features`` is copied so that mutating the
        returned dictionary does not alter this configuration's flags.
        """
        exported = (
            'max_document_size',
            'streaming_threshold',
            'cache_size',
            'enable_parallel',
            'strict_mode',
            'extract_xbrl',
            'extract_styles',
            'preserve_whitespace',
            'optimize_for_ai',
        )
        snapshot: Dict[str, Any] = {name: getattr(self, name) for name in exported}
        snapshot['features'] = self.features.copy()
        return snapshot

    @classmethod
    def for_performance(cls) -> 'ParserConfig':
        """Create config optimized for performance."""
        # This dict replaces the default flag set; only these four keys are present.
        switched_off = dict.fromkeys(
            (
                'ml_header_detection',
                'semantic_analysis',
                'table_understanding',
                'xbrl_validation',
            ),
            False,
        )
        return cls(
            extract_styles=False,
            extract_xbrl=False,
            enable_parallel=True,
            cache_size=5000,
            eager_section_extraction=False,  # Skip expensive section extraction
            fast_table_rendering=True,  # Fast renderer (enabled by default now)
            features=switched_off,
        )

    @classmethod
    def for_accuracy(cls) -> 'ParserConfig':
        """Create config optimized for accuracy."""
        switched_on = dict.fromkeys(
            (
                'ml_header_detection',
                'semantic_analysis',
                'table_understanding',
                'xbrl_validation',
                'auto_section_detection',
                'smart_text_extraction',
                'footnote_linking',
                'cross_reference_resolution',
            ),
            True,
        )
        return cls(
            strict_mode=True,
            extract_styles=True,
            extract_xbrl=True,
            enable_parallel=True,
            features=switched_on,
        )

    @classmethod
    def for_ai(cls) -> 'ParserConfig':
        """Create config optimized for AI/LLM processing."""
        # This dict replaces the default flag set; only these three keys are present.
        switched_on = dict.fromkeys(
            (
                'ml_header_detection',
                'semantic_analysis',
                'smart_text_extraction',
            ),
            True,
        )
        return cls(
            optimize_for_ai=True,
            extract_styles=False,
            extract_xbrl=True,
            normalize_text=True,
            merge_adjacent_nodes=True,
            features=switched_on,
        )
|
||||
@@ -0,0 +1,314 @@
|
||||
# HTML Parser Rewrite - Status Report
|
||||
|
||||
**Generated**: 2025-10-08
|
||||
**Branch**: `html_rewrite`
|
||||
**Target**: Merge to `main`
|
||||
|
||||
---
|
||||
|
||||
## Overall Progress: ~95% Complete ✅
|
||||
|
||||
### Completed Phases
|
||||
|
||||
#### ✅ Phase 1: Core Implementation (100%)
|
||||
- [x] Streaming parser for large documents
|
||||
- [x] TableMatrix system for accurate table rendering
|
||||
- [x] Section extraction with Part I/II detection
|
||||
- [x] XBRL integration
|
||||
- [x] Rich-based table rendering
|
||||
- [x] Configuration system (ParserConfig)
|
||||
- [x] Error handling and validation
|
||||
|
||||
#### ✅ Phase 2: Functional Testing (100%)
|
||||
- [x] **Corpus Validation** - 40 diverse filings, 100% success rate
|
||||
- [x] **Edge Cases** - 31 tests covering invalid inputs, malformed HTML, edge conditions
|
||||
- [x] **Integration Tests** - 25 tests for Filing/Company integration, backward compatibility
|
||||
- [x] **Regression Tests** - 15 tests preventing known bugs from returning
|
||||
|
||||
**Total Test Count**: 79 functional tests, all passing
|
||||
|
||||
#### ✅ Phase 3: Performance Profiling (100%)
|
||||
- [x] **Benchmarking Infrastructure** - Comprehensive benchmark suite
|
||||
- [x] **Hot Path Analysis** - Identified 3 critical bottlenecks (63% section extraction, 40% Rich rendering, 15% regex)
|
||||
- [x] **Memory Profiling** - Found 255MB memory leak in MSFT 10-K, documented root causes
|
||||
- [x] **Performance Regression Tests** - 15 tests locking in baseline thresholds
|
||||
|
||||
**Performance Baseline Established**:
|
||||
- Average: 3.8MB/s throughput, 4.1MB memory per doc
|
||||
- Small docs: 2.6MB/s (optimization opportunity)
|
||||
- Large docs: 20.7MB/s (excellent streaming)
|
||||
- Memory leak: 19-25x ratio on medium docs (needs fixing)
|
||||
|
||||
#### ✅ Phase 4: Test Data Augmentation (100%)
|
||||
- [x] **HTML Fixtures** - Downloaded 32 files (155MB) from 16 companies across 6 industries
|
||||
- [x] **Download Automation** - Created `download_html_fixtures.py` script
|
||||
- [x] **Documentation** - Comprehensive fixture documentation
|
||||
|
||||
---
|
||||
|
||||
## Current Status: Ready for Optimization Phase
|
||||
|
||||
### What's Working Well ✅
|
||||
|
||||
1. **Parsing Accuracy**: 100% success rate across 40+ diverse filings
|
||||
2. **Large Document Handling**: Excellent streaming performance (20.7MB/s on JPM 10-K)
|
||||
3. **Table Extraction**: TableMatrix accurately handles colspan/rowspan
|
||||
4. **Test Coverage**: 79 comprehensive tests covering edge cases, integration, regression
|
||||
5. **Backward Compatibility**: Old TenK API still works for existing code
|
||||
|
||||
### Known Issues to Address 🔧
|
||||
|
||||
#### Critical (Must Fix Before Merge)
|
||||
|
||||
1. **Memory Leaks** (Priority: CRITICAL)
|
||||
- MSFT 10-K: 255MB leak (19x document size)
|
||||
- Apple 10-K: 41MB leak (23x document size)
|
||||
- **Root Causes**:
|
||||
- Rich Console objects retained (0.4MB per doc)
|
||||
- Global caches not cleared on document deletion
|
||||
- Circular references in node graph
|
||||
- **Location**: `tests/perf/memory_analysis.md:90-130`
|
||||
- **Impact**: Servers could crash after 10-20 requests if this shipped to production
|
||||
|
||||
2. **Performance Bottlenecks** (Priority: HIGH)
|
||||
- Section extraction: 3.7s (63% of parse time)
|
||||
- Rich rendering for text: 2.4s (40% of parse time)
|
||||
- Regex normalization: 0.8s (15% of parse time)
|
||||
- **Location**: `tests/perf/hotpath_analysis.md:9-66`
|
||||
- **Impact**: 4x slower than necessary on medium documents
|
||||
|
||||
#### Non-Critical (Can Fix After Merge)
|
||||
|
||||
3. **Small Document Performance** (Priority: MEDIUM)
|
||||
- 2.6MB/s vs desired 5MB/s
|
||||
- Overhead dominates on <5MB documents
|
||||
- **Optimization**: Lazy loading, reduce upfront processing
|
||||
|
||||
---
|
||||
|
||||
## Next Steps (In Order)
|
||||
|
||||
### Phase 5: Critical Fixes (2-4 days) 🔧
|
||||
|
||||
#### 5.1 Memory Leak Fixes (1-2 days)
|
||||
**Goal**: Reduce memory leak from 255MB to <5MB
|
||||
|
||||
Tasks:
|
||||
- [ ] Implement `Document.__del__()` to clear caches
|
||||
- [ ] Replace Rich rendering in `text()` with direct string building
|
||||
- [ ] Break circular references in node graph
|
||||
- [ ] Use weak references for parent links
|
||||
- [ ] Add `__slots__` to frequently created objects (Cell, TableNode)
|
||||
|
||||
**Expected Result**: MSFT 10-K leak: 255MB → <5MB (95% improvement)
|
||||
|
||||
**Validation**:
|
||||
```bash
|
||||
pytest tests/perf/test_performance_regression.py::TestMemoryRegression -v
|
||||
```
|
||||
|
||||
#### 5.2 Performance Optimizations (1-2 days)
|
||||
**Goal**: Improve parse speed from 1.2s to 0.3s on Apple 10-K (75% faster)
|
||||
|
||||
Tasks:
|
||||
- [ ] Fix section detection - use headings instead of rendering entire document
|
||||
- [ ] Implement fast text extraction without Rich overhead
|
||||
- [ ] Optimize regex normalization - combine patterns, use compilation
|
||||
|
||||
**Expected Results**:
|
||||
- Section extraction: 3.7s → 1.2s (68% faster)
|
||||
- Text extraction: 2.4s → 1.2s (50% faster)
|
||||
- Regex: 0.8s → 0.5s (40% faster)
|
||||
|
||||
**Validation**:
|
||||
```bash
|
||||
pytest tests/perf/test_performance_regression.py::TestParseSpeedRegression -v
|
||||
```
|
||||
|
||||
### Phase 6: Final Validation (1 day) ✅
|
||||
|
||||
Tasks:
|
||||
- [ ] Re-run all 79 functional tests
|
||||
- [ ] Re-run performance regression tests (verify improvements)
|
||||
- [ ] Run full corpus validation
|
||||
- [ ] Memory profiling validation (confirm leaks fixed)
|
||||
- [ ] Update CHANGELOG.md
|
||||
- [ ] Create merge summary document
|
||||
|
||||
### Phase 7: Merge to Main (1 day) 🚀
|
||||
|
||||
Tasks:
|
||||
- [ ] Final code review
|
||||
- [ ] Squash commits or create clean merge
|
||||
- [ ] Update version number
|
||||
- [ ] Merge to main
|
||||
- [ ] Tag release
|
||||
- [ ] Monitor for issues
|
||||
|
||||
---
|
||||
|
||||
## Test Summary
|
||||
|
||||
### Current Test Status: 79/79 Passing (100%)
|
||||
|
||||
```
|
||||
tests/corpus/test_corpus_validation.py 8 tests ✓
|
||||
tests/test_html_parser_edge_cases.py 31 tests ✓
|
||||
tests/test_html_parser_integration.py 25 tests ✓
|
||||
tests/test_html_parser_regressions.py 15 tests ✓
|
||||
tests/perf/test_performance_regression.py 15 tests ✓ (baseline established)
|
||||
```
|
||||
|
||||
### Test Execution
|
||||
|
||||
```bash
|
||||
# Functional tests (79 tests, ~30s)
|
||||
pytest tests/corpus tests/test_html_parser_*.py -v
|
||||
|
||||
# Performance tests (15 tests, ~20s)
|
||||
pytest tests/perf/test_performance_regression.py -m performance -v
|
||||
|
||||
# All tests
|
||||
pytest tests/ -v
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Performance Metrics
|
||||
|
||||
### Current Baseline (Before Optimization)
|
||||
|
||||
| Document | Size | Parse Time | Throughput | Memory | Tables | Sections |
|
||||
|----------|------|------------|------------|--------|--------|----------|
|
||||
| Apple 10-Q | 1.1MB | 0.307s | 3.6MB/s | 27.9MB (25.6x) | 40 | 9 |
|
||||
| Apple 10-K | 1.8MB | 0.500s | 3.6MB/s | 21.6MB (11.9x) | 63 | 8 |
|
||||
| MSFT 10-K | 7.8MB | 1.501s | 5.2MB/s | 147.0MB (18.9x) | 85 | 0 |
|
||||
| JPM 10-K | 52.4MB | 2.537s | 20.7MB/s | 0.6MB (0.01x) | 681 | 0 |
|
||||
|
||||
### Target Metrics (After Optimization)
|
||||
|
||||
| Metric | Current | Target | Improvement |
|
||||
|--------|---------|--------|-------------|
|
||||
| **Memory leak** | 41-255MB | <5MB | 95% reduction |
|
||||
| **Memory ratio** | 19-25x | <3x | 87% reduction |
|
||||
| **Parse time (Apple 10-K)** | 0.500s | 0.150s | 70% faster |
|
||||
| **Throughput (small docs)** | 2.6MB/s | 5.0MB/s | 92% faster |
|
||||
|
||||
---
|
||||
|
||||
## File Organization
|
||||
|
||||
### Core Parser Files
|
||||
```
|
||||
edgar/documents/
|
||||
├── __init__.py # Public API (parse_html)
|
||||
├── parser.py # Main parser with streaming
|
||||
├── config.py # ParserConfig
|
||||
├── document_builder.py # Document tree construction
|
||||
├── nodes/ # Node types (TableNode, SectionNode)
|
||||
├── utils/
|
||||
│ ├── streaming.py # Streaming parser (fixed JPM bug)
|
||||
│ └── table_processing.py # TableMatrix system
|
||||
└── exceptions.py # Custom exceptions
|
||||
```
|
||||
|
||||
### Test Files
|
||||
```
|
||||
tests/
|
||||
├── corpus/ # Corpus validation
|
||||
│ ├── quick_corpus.py # Corpus builder
|
||||
│ └── test_corpus_validation.py # 8 validation tests
|
||||
├── fixtures/
|
||||
│ ├── html/ # 32 HTML fixtures (155MB)
|
||||
│ │ ├── {ticker}/10k/ # By company and form
|
||||
│ │ └── README.md
|
||||
│ └── download_html_fixtures.py # Download automation
|
||||
├── perf/ # Performance testing
|
||||
│ ├── benchmark_html_parser.py # Benchmarking
|
||||
│ ├── profile_hotpaths.py # Hot path profiling
|
||||
│ ├── profile_memory.py # Memory profiling
|
||||
│ ├── test_performance_regression.py # Regression tests
|
||||
│ ├── performance_report.md # Benchmark results
|
||||
│ ├── hotpath_analysis.md # Bottleneck analysis
|
||||
│ └── memory_analysis.md # Memory leak analysis
|
||||
├── test_html_parser_edge_cases.py # 31 edge case tests
|
||||
├── test_html_parser_integration.py # 25 integration tests
|
||||
└── test_html_parser_regressions.py # 15 regression tests
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Risks and Mitigation
|
||||
|
||||
### Risk 1: Memory Leaks in Production
|
||||
**Severity**: HIGH
|
||||
**Probability**: HIGH (confirmed in testing)
|
||||
**Mitigation**: Must fix before merge (Phase 5.1)
|
||||
|
||||
### Risk 2: Performance Regression
|
||||
**Severity**: MEDIUM
|
||||
**Probability**: LOW (baseline established, regression tests in place)
|
||||
**Mitigation**: Performance regression tests will catch any degradation
|
||||
|
||||
### Risk 3: Backward Compatibility
|
||||
**Severity**: LOW
|
||||
**Probability**: LOW (integration tests passing)
|
||||
**Mitigation**: 25 integration tests verify old API still works
|
||||
|
||||
---
|
||||
|
||||
## Estimated Timeline to Merge
|
||||
|
||||
```
|
||||
Phase 5.1: Memory leak fixes 1-2 days
|
||||
Phase 5.2: Performance optimization 1-2 days
|
||||
Phase 6: Final validation 1 day
|
||||
Phase 7: Merge to main 1 day
|
||||
----------------------------------------
|
||||
Total: 4-6 days
|
||||
```
|
||||
|
||||
**Target Merge Date**: October 12-14, 2025
|
||||
|
||||
---
|
||||
|
||||
## Decision Points
|
||||
|
||||
### Should We Merge Now or After Optimization?
|
||||
|
||||
**Option A: Merge Now (Not Recommended)**
|
||||
- ✅ Functional tests passing
|
||||
- ✅ Backward compatible
|
||||
- ❌ Memory leaks (production risk)
|
||||
- ❌ Performance issues
|
||||
- ❌ Will require hotfix soon
|
||||
|
||||
**Option B: Fix Critical Issues First (Recommended)**
|
||||
- ✅ Production-ready
|
||||
- ✅ Performance validated
|
||||
- ✅ Memory efficient
|
||||
- ❌ 4-6 day delay
|
||||
- ✅ Clean, professional release
|
||||
|
||||
**Recommendation**: **Option B** - Fix critical memory leaks and performance issues before merge. The 4-6 day investment prevents production incidents and ensures a polished release.
|
||||
|
||||
---
|
||||
|
||||
## Questions for Review
|
||||
|
||||
1. **Scope**: Should we fix only critical issues (memory + performance) or also tackle small-doc optimization?
|
||||
2. **Timeline**: Is 4-6 days acceptable, or do we need to merge sooner?
|
||||
3. **Testing**: Are 79 functional tests + 15 performance tests sufficient coverage?
|
||||
4. **Documentation**: Do we need user-facing documentation updates?
|
||||
|
||||
---
|
||||
|
||||
## Conclusion
|
||||
|
||||
The HTML parser rewrite is **95% complete** and has excellent functional test coverage, but critical memory and performance issues remain to be fixed. The recommended path forward is:
|
||||
|
||||
1. ✅ Complete critical fixes (4-6 days)
|
||||
2. ✅ Validate improvements
|
||||
3. ✅ Merge to main with confidence
|
||||
|
||||
This approach ensures a production-ready, performant parser rather than merging now and hotfixing later.
|
||||
@@ -0,0 +1,437 @@
|
||||
# HTML Parser Rewrite - Progress Assessment
|
||||
|
||||
**Date**: 2025-10-07
|
||||
**Status**: Active Development (html_rewrite branch)
|
||||
|
||||
---
|
||||
|
||||
## Executive Summary
|
||||
|
||||
The HTML parser rewrite is **substantially complete** for core functionality, with **excellent progress** on Item/section detection. Recent bug fixes (2025-10-07) have addressed critical table rendering issues and the 10-Q Part I/II distinction, bringing the parser close to production-ready quality.
|
||||
|
||||
### Overall Progress: **~90% Complete**
|
||||
|
||||
- ✅ Core parsing infrastructure: **100% Complete**
|
||||
- ✅ Table processing: **95% Complete** (recent fixes)
|
||||
- ✅ Section/Item detection: **95% Complete** (Part I/II fixed, needs validation)
|
||||
- ⚠️ Performance optimization: **70% Complete**
|
||||
- ⚠️ Comprehensive testing: **65% Complete** (added 10-Q Part tests)
|
||||
- ⚠️ Documentation: **75% Complete**
|
||||
|
||||
---
|
||||
|
||||
## Goal Achievement Analysis
|
||||
|
||||
### Primary Goals (from goals.md)
|
||||
|
||||
#### 1. **Semantic Meaning Preservation** ✅ **ACHIEVED**
|
||||
> "Read text, tables and ixbrl data preserving greatest semantic meaning"
|
||||
|
||||
**Status**: ✅ Fully implemented
|
||||
- Text extraction with structure preservation
|
||||
- Advanced table matrix system for accurate table rendering
|
||||
- XBRL fact extraction before preprocessing
|
||||
- Hierarchical node model maintains document structure
|
||||
|
||||
**Recent Improvements**:
|
||||
- Header detection fixes (Oracle Table 6, Tesla Table 16)
|
||||
- Spacing column filter now preserves header columns (MSFT Table 39)
|
||||
- Multi-row header normalization
|
||||
|
||||
#### 2. **AI Channel (Primary) + Human Channel (Secondary)** ✅ **ACHIEVED**
|
||||
> "AI context is the primary goal, with human context being secondary"
|
||||
|
||||
**Status**: ✅ Both channels working
|
||||
- **AI Channel**:
|
||||
- Clean text output optimized for LLMs
|
||||
- Structured table rendering for context windows
|
||||
- Section-level extraction for chunking
|
||||
- Semantic divisibility supported
|
||||
|
||||
- **Human Channel**:
|
||||
- Rich console rendering with proper formatting
|
||||
- Markdown export
|
||||
- Visual table alignment (recently fixed)
|
||||
|
||||
#### 3. **Section-Level Processing** ✅ **ACHIEVED**
|
||||
> "Work at full document level and section level - breaking into independently processable sections"
|
||||
|
||||
**Status**: ✅ Implemented with good coverage
|
||||
- `SectionExtractor` class fully functional
|
||||
- TOC-based section detection
|
||||
- Pattern-based section identification
|
||||
- Lazy loading support for large documents
|
||||
|
||||
**What Works**:
|
||||
```python
|
||||
# Section detection is operational
|
||||
doc = parse_html(html)
|
||||
sections = doc.sections # Dict of section names -> SectionNode
|
||||
|
||||
# Access specific sections
|
||||
business = sections.get('Item 1 - Business')
|
||||
mda = sections.get('Item 7 - MD&A')
|
||||
financials = sections.get('Item 8 - Financial Statements')
|
||||
```
|
||||
|
||||
#### 4. **Standard Section Names (10-K, 10-Q, 8-K)** ✅ **ACHIEVED**
|
||||
> "For some filing types (10-K, 10-Q, 8-K) identify sections by standard names"
|
||||
|
||||
**Status**: ✅ 95% Complete - Implemented with Part I/II distinction for 10-Q
|
||||
|
||||
**What's Implemented**:
|
||||
- Pattern matching for standard Items:
|
||||
- Item 1 - Business
|
||||
- Item 1A - Risk Factors
|
||||
- Item 7 - MD&A
|
||||
- Item 7A - Market Risk
|
||||
- Item 8 - Financial Statements
|
||||
- And more...
|
||||
- **10-Q Part I/Part II distinction** (newly fixed 2025-10-07):
|
||||
- Part I - Item 1 (Financial Statements)
|
||||
- Part II - Item 1 (Legal Proceedings)
|
||||
- Proper boundary detection and context propagation
|
||||
- Prevents Item number conflicts
|
||||
|
||||
**What's Remaining** (5%):
|
||||
- Validation against large corpus of 10-K/10-Q filings
|
||||
- Edge case handling (non-standard formatting)
|
||||
- 8-K specific section patterns expansion
|
||||
|
||||
**Evidence from Code**:
|
||||
```python
|
||||
# edgar/documents/extractors/section_extractor.py
|
||||
(r'^(Item|ITEM)\s+1\.?\s*Business', 'Item 1 - Business'),
|
||||
(r'^(Item|ITEM)\s+1A\.?\s*Risk\s+Factors', 'Item 1A - Risk Factors'),
|
||||
(r'^(Item|ITEM)\s+7\.?\s*Management.*Discussion', 'Item 7 - MD&A'),
|
||||
(r'^(Item|ITEM)\s+8\.?\s*Financial\s+Statements', 'Item 8 - Financial Statements'),
|
||||
|
||||
# NEW: Part I/II detection (edgar/documents/extractors/section_extractor.py:294-324)
|
||||
def _detect_10q_parts(self, headers) -> Dict[int, str]:
|
||||
"""Detect Part I and Part II boundaries in 10-Q filings."""
|
||||
```
|
||||
|
||||
#### 5. **Table Processing for AI Context** ✅ **ACHIEVED**
|
||||
> "Getting tables in the right structure for rendering to text for AI context is more important than dataframes"
|
||||
|
||||
**Status**: ✅ Excellent progress with recent fixes
|
||||
- Advanced TableMatrix system handles complex tables
|
||||
- Multi-row header detection and normalization
|
||||
- Spacing column filtering (preserves semantic columns)
|
||||
- Currency symbol merging
|
||||
- Clean text rendering for LLM consumption
|
||||
|
||||
**Recent Fixes (2025-10-07)**:
|
||||
- ✅ Fixed spacing column filter removing legitimate headers (MSFT Table 39)
|
||||
- ✅ Fixed header detection for date ranges (Oracle Table 6)
|
||||
- ✅ Fixed long narrative text misclassification (Tesla Table 16)
|
||||
- ✅ Header row normalization for alignment
|
||||
|
||||
#### 6. **Better Than Old Parser in Every Way** 🟡 **MOSTLY ACHIEVED**
|
||||
> "Speed, accuracy, features, usability"
|
||||
|
||||
**Comparison**:
|
||||
|
||||
| Aspect | Old Parser | New Parser | Status |
|
||||
|--------|-----------|------------|--------|
|
||||
| **Speed** | Baseline | 1.4x faster (typical) | ✅ Better |
|
||||
| **Accuracy** | Good | Excellent (with recent fixes) | ✅ Better |
|
||||
| **Features** | Basic | Rich (XBRL, sections, multiple outputs) | ✅ Better |
|
||||
| **Usability** | Simple | Powerful + Simple API | ✅ Better |
|
||||
| **Table Rendering** | Basic alignment | Advanced matrix system | ✅ Better |
|
||||
| **Section Detection** | Limited | Comprehensive | ✅ Better |
|
||||
|
||||
**Areas Needing Validation**:
|
||||
- Performance on very large documents (>50MB)
|
||||
- Memory usage under sustained load
|
||||
- Edge case handling across diverse filings
|
||||
|
||||
---
|
||||
|
||||
## Item/Section Detection Deep Dive
|
||||
|
||||
### Current Capabilities
|
||||
|
||||
**10-K Sections Detected**:
|
||||
- ✅ Item 1 - Business
|
||||
- ✅ Item 1A - Risk Factors
|
||||
- ✅ Item 1B - Unresolved Staff Comments
|
||||
- ✅ Item 2 - Properties
|
||||
- ✅ Item 3 - Legal Proceedings
|
||||
- ✅ Item 4 - Mine Safety Disclosures
|
||||
- ✅ Item 5 - Market for Stock
|
||||
- ✅ Item 6 - Selected Financial Data
|
||||
- ✅ Item 7 - MD&A
|
||||
- ✅ Item 7A - Market Risk
|
||||
- ✅ Item 8 - Financial Statements
|
||||
- ✅ Item 9 - Changes in Accounting
|
||||
- ✅ Item 9A - Controls and Procedures
|
||||
- ✅ Item 9B - Other Information
|
||||
- ✅ Item 10 - Directors and Officers
|
||||
- ✅ Item 11 - Executive Compensation
|
||||
- ✅ Item 12 - Security Ownership
|
||||
- ✅ Item 13 - Related Transactions
|
||||
- ✅ Item 14 - Principal Accountant
|
||||
- ✅ Item 15 - Exhibits
|
||||
|
||||
**10-Q Sections Detected**:
|
||||
- ✅ Part I Items (Financial Information):
|
||||
- Part I - Item 1 - Financial Statements
|
||||
- Part I - Item 2 - MD&A
|
||||
- Part I - Item 3 - Market Risk
|
||||
- Part I - Item 4 - Controls and Procedures
|
||||
- ✅ Part II Items (Other Information):
|
||||
- Part II - Item 1 - Legal Proceedings
|
||||
- Part II - Item 1A - Risk Factors
|
||||
- Part II - Item 2 - Unregistered Sales
|
||||
- Part II - Item 6 - Exhibits
|
||||
|
||||
**✅ FIXED** (2025-10-07): Part I/Part II distinction now implemented!
|
||||
- Part I Item 1 and Part II Item 1 are properly distinguished
|
||||
- Section keys include Part context: "Part I - Item 1 - Financial Statements" vs "Part II - Item 1 - Legal Proceedings"
|
||||
- Comprehensive test coverage added (5 tests in test_10q_part_detection.py)
|
||||
|
||||
**8-K Sections**:
|
||||
- ⚠️ Limited - needs expansion
|
||||
|
||||
### Detection Methods
|
||||
|
||||
1. **TOC-based Detection** ✅
|
||||
- Analyzes Table of Contents
|
||||
- Extracts anchor links
|
||||
- Maps sections to content
|
||||
|
||||
2. **Pattern-based Detection** ✅
|
||||
- Regex matching for Item headers
|
||||
- Heading analysis (h1-h6 tags)
|
||||
- Text pattern recognition
|
||||
|
||||
3. **Hybrid Approach** ✅
|
||||
- Combines TOC + patterns
|
||||
- Fallback mechanisms
|
||||
- Cross-validation
|
||||
|
||||
### What's Working
|
||||
|
||||
```python
|
||||
# This works today:
|
||||
from edgar.documents import parse_html
|
||||
|
||||
html = filing.html()
|
||||
doc = parse_html(html)
|
||||
|
||||
# Get all sections
|
||||
sections = doc.sections # Returns dict
|
||||
|
||||
# Access specific Items
|
||||
if 'Item 7 - MD&A' in sections:
|
||||
mda = sections['Item 7 - MD&A']
|
||||
mda_text = mda.text()
|
||||
mda_tables = mda.tables()
|
||||
```
|
||||
|
||||
### What Needs Work
|
||||
|
||||
1. **Validation Coverage** (20% remaining)
|
||||
- Test against 100+ diverse 10-K filings
|
||||
- Test against 10-Q filings
|
||||
- Test against 8-K filings
|
||||
- Capture edge cases and variations
|
||||
|
||||
2. **Edge Cases** (20% remaining)
|
||||
- Non-standard Item formatting
|
||||
- Missing TOC
|
||||
- Nested sections
|
||||
- Combined Items (e.g., "Items 10, 13, 14")
|
||||
|
||||
3. **8-K Support** (50% remaining)
|
||||
- 8-K specific Item patterns
|
||||
- Event-based section detection
|
||||
- Exhibit handling
|
||||
|
||||
---
|
||||
|
||||
## Recent Achievements (Past 24 Hours)
|
||||
|
||||
### Critical Bug Fixes ✅
|
||||
|
||||
1. **Spacing Column Filter Fix** (MSFT Table 39)
|
||||
- Problem: Legitimate headers removed as "spacing"
|
||||
- Solution: Header content protection + colspan preservation
|
||||
- Impact: Tables now render correctly with all headers
|
||||
- Commits: `4e43276`, `d19ddd1`
|
||||
|
||||
2. **Header Detection Improvements**
|
||||
- Oracle Table 6: Date ranges no longer misclassified
|
||||
- Tesla Table 16: Long narrative text properly handled
|
||||
- Multi-row header normalization
|
||||
- Comprehensive test coverage (16 new tests)
|
||||
|
||||
3. **Documentation Updates**
|
||||
- TESTING.md clarified output limits
|
||||
- CHANGELOG updated with fixes
|
||||
- Bug reports and research docs completed
|
||||
|
||||
### Quality Metrics
|
||||
|
||||
**Test Coverage**:
|
||||
- 16 new tests added (all passing)
|
||||
- 0 regressions in existing tests
|
||||
- Comprehensive edge case coverage
|
||||
|
||||
**Code Quality**:
|
||||
- Clean implementation following plan
|
||||
- Well-documented changes
|
||||
- Proper commit messages with Claude Code attribution
|
||||
|
||||
---
|
||||
|
||||
## Path to 100% Completion
|
||||
|
||||
### High Priority (Next Steps)
|
||||
|
||||
**📋 Detailed plans available**:
|
||||
- **Performance**: See `docs-internal/planning/active-tasks/2025-10-07-performance-optimization-plan.md`
|
||||
- **Testing**: See `docs-internal/planning/active-tasks/2025-10-07-comprehensive-testing-plan.md`
|
||||
|
||||
1. **Performance Optimization** (1-2 weeks)
|
||||
- [ ] Phase 1: Benchmarking & profiling (2-3 days)
|
||||
- [ ] Phase 2: Algorithm optimizations (3-4 days)
|
||||
- [ ] Phase 3: Validation & regression tests (2-3 days)
|
||||
- [ ] Phase 4: Documentation & monitoring (1 day)
|
||||
- **Goal**: Maintain 1.3x+ speed advantage, <2x memory usage
|
||||
|
||||
2. **Comprehensive Testing** (2-3 weeks)
|
||||
- [ ] Phase 1: Corpus validation - 100+ filings (3-4 days)
|
||||
- [ ] Phase 2: Edge cases & error handling (2-3 days)
|
||||
- [ ] Phase 3: Integration testing (2-3 days)
|
||||
- [ ] Phase 4: Regression prevention (1-2 days)
|
||||
- [ ] Phase 5: Documentation & sign-off (1 day)
|
||||
- **Goal**: >95% success rate, >80% test coverage
|
||||
|
||||
3. **Item Detection Validation** (included in testing plan)
|
||||
- [ ] Test against 50+ diverse 10-K filings
|
||||
- [ ] Test against 20+ 10-Q filings
|
||||
- [ ] Document any pattern variations found
|
||||
- [ ] Add regression tests for edge cases
|
||||
|
||||
### Medium Priority
|
||||
|
||||
4. **8-K Support** (1-2 days)
|
||||
- [ ] Research 8-K Item patterns
|
||||
- [ ] Implement detection patterns
|
||||
- [ ] Test against sample 8-K filings
|
||||
|
||||
5. **Documentation** (1 day)
|
||||
- [ ] User guide for section access
|
||||
- [ ] API documentation
|
||||
- [ ] Migration guide from old parser
|
||||
- [ ] Examples and recipes
|
||||
|
||||
### Low Priority (Polish)
|
||||
|
||||
6. **Final Polish**
|
||||
- [ ] Error message improvements
|
||||
- [ ] Logging enhancements
|
||||
- [ ] Configuration documentation
|
||||
- [ ] Performance tuning
|
||||
|
||||
---
|
||||
|
||||
## Risk Assessment
|
||||
|
||||
### Low Risk ✅
|
||||
- Core parsing functionality (stable)
|
||||
- Table processing (recently fixed, well-tested)
|
||||
- Text extraction (working well)
|
||||
- XBRL extraction (functional)
|
||||
|
||||
### Medium Risk ⚠️
|
||||
- Section detection edge cases (needs validation)
|
||||
- Performance on very large docs (needs testing)
|
||||
- Memory usage (needs profiling)
|
||||
|
||||
### Mitigation Strategy
|
||||
1. Comprehensive validation testing (in progress)
|
||||
2. Real-world filing corpus testing
|
||||
3. Performance benchmarking suite
|
||||
4. Gradual rollout with monitoring
|
||||
|
||||
---
|
||||
|
||||
## Recommendations
|
||||
|
||||
### Immediate Actions (This Week)
|
||||
|
||||
1. **Validate Item Detection** 🎯 **TOP PRIORITY**
|
||||
```bash
|
||||
# Run on diverse corpus
|
||||
python tests/manual/compare_parsers.py --all
|
||||
|
||||
# Test specific sections
|
||||
python -c "
|
||||
from edgar.documents import parse_html
|
||||
from pathlib import Path
|
||||
|
||||
for filing in ['Apple', 'Oracle', 'Tesla', 'Microsoft']:
|
||||
html = Path(f'data/html/{filing}.10-K.html').read_text()
|
||||
doc = parse_html(html)
|
||||
print(f'{filing}: {list(doc.sections.keys())[:5]}...')
|
||||
"
|
||||
```
|
||||
|
||||
2. **Create Section Access Tests**
|
||||
- Write tests that verify each Item can be accessed
|
||||
- Validate text and table extraction from sections
|
||||
- Test edge cases (missing Items, combined Items)
|
||||
|
||||
3. **User Acceptance Testing**
|
||||
- Have maintainer review section detection output
|
||||
- Validate against known-good filings
|
||||
- Document any issues found
|
||||
|
||||
### Timeline to Production
|
||||
|
||||
**Optimistic**: 1 week
|
||||
- If validation shows good Item detection
|
||||
- If performance is acceptable
|
||||
- If no major issues found
|
||||
|
||||
**Realistic**: 2-3 weeks
|
||||
- Account for edge case fixes
|
||||
- Additional testing needed
|
||||
- Documentation completion
|
||||
|
||||
**Conservative**: 4 weeks
|
||||
- Account for 8-K support
|
||||
- Comprehensive testing across all filing types
|
||||
- Full documentation
|
||||
|
||||
---
|
||||
|
||||
## Conclusion
|
||||
|
||||
The HTML parser rewrite is **very close to completion** with excellent progress on all goals:
|
||||
|
||||
**✅ Fully Achieved**:
|
||||
- Semantic meaning preservation
|
||||
- AI/Human channel support
|
||||
- Section-level processing
|
||||
- Table processing for AI context
|
||||
- Superior to old parser (in most respects)
|
||||
- **Standard Item detection for 10-K/10-Q** (with Part I/II distinction)
|
||||
|
||||
**⚠️ Remaining Work (10%)**:
|
||||
- Validation against diverse corpus
|
||||
- Edge case handling
|
||||
- 8-K specific support expansion
|
||||
- Final testing and documentation
|
||||
|
||||
**Bottom Line**: The parser is **close to production-ready for 10-K/10-Q**: Item detection is functional but still requires validation against a diverse corpus. The recent bug fixes have resolved critical table rendering issues. With 1-2 weeks of focused validation and testing, this can be shipped with confidence.
|
||||
|
||||
### Next Steps
|
||||
1. Run comprehensive Item detection validation
|
||||
2. Create section access test suite
|
||||
3. Performance benchmark
|
||||
4. Maintainer review and sign-off
|
||||
5. Merge to main branch
|
||||
@@ -0,0 +1,233 @@
|
||||
# HTML Parser Testing Quick Start
|
||||
|
||||
Quick reference for testing the HTML parser rewrite during quality improvement.
|
||||
|
||||
## Quick Start
|
||||
|
||||
```bash
|
||||
# Use shortcuts (easy!)
|
||||
python tests/manual/compare_parsers.py aapl # Apple 10-K
|
||||
python tests/manual/compare_parsers.py nvda --tables # Nvidia tables
|
||||
python tests/manual/compare_parsers.py 'aapl 10-q' # Apple 10-Q
|
||||
python tests/manual/compare_parsers.py orcl --table 5 # Oracle table #5
|
||||
|
||||
# Or use full paths
|
||||
python tests/manual/compare_parsers.py data/html/Apple.10-K.html
|
||||
|
||||
# Run all test files
|
||||
python tests/manual/compare_parsers.py --all
|
||||
```
|
||||
|
||||
**Available shortcuts:**
|
||||
- **Companies**: `aapl`, `msft`, `tsla`, `nvda`, `orcl` (or full names like `apple`)
|
||||
- **Filing types**: `10-k` (default), `10-q`, `8-k`
|
||||
- **Combine**: `'aapl 10-q'`, `'orcl 8-k'`
|
||||
|
||||
## Common Use Cases
|
||||
|
||||
### 1. First Look at a Filing
|
||||
|
||||
```bash
|
||||
# Get overview: speed, table count, sections
|
||||
python tests/manual/compare_parsers.py orcl
|
||||
```
|
||||
|
||||
**Shows**:
|
||||
- Parse time comparison (OLD vs NEW)
|
||||
- Tables found
|
||||
- Text length
|
||||
- Sections detected
|
||||
- New features (headings, XBRL)
|
||||
|
||||
### 2. Check Table Rendering
|
||||
|
||||
```bash
|
||||
# List all tables with dimensions (shows first 20 tables)
|
||||
python tests/manual/compare_parsers.py aapl --tables
|
||||
|
||||
# Compare specific table side-by-side (FULL table, no truncation)
|
||||
python tests/manual/compare_parsers.py aapl --table 7
|
||||
|
||||
# Compare a range of tables
|
||||
python tests/manual/compare_parsers.py aapl --range 5:10
|
||||
```
|
||||
|
||||
**Look for**:
|
||||
- Currency symbols merged: `$1,234` not `$ | 1,234`
|
||||
- Proper column alignment
|
||||
- Correct row/column counts
|
||||
- Clean rendering without extra spacing columns
|
||||
|
||||
**Note**: `--table N` shows the **complete table** with all rows - no truncation!
|
||||
|
||||
### 3. Verify Text Extraction
|
||||
|
||||
```bash
|
||||
# See first 50 lines side-by-side (default limit)
|
||||
python tests/manual/compare_parsers.py msft --text
|
||||
|
||||
# Show more lines (configurable)
|
||||
python tests/manual/compare_parsers.py msft --text --lines 100
|
||||
|
||||
# Show first 200 lines
|
||||
python tests/manual/compare_parsers.py msft --text --lines 200
|
||||
```
|
||||
|
||||
**Check**:
|
||||
- Semantic meaning preserved
|
||||
- No missing content
|
||||
- Clean formatting for LLM consumption
|
||||
|
||||
**Note**: Text mode shows only the first N lines (default: 50). Use `--lines N` to adjust.
|
||||
|
||||
### 4. Check Section Detection
|
||||
|
||||
```bash
|
||||
python tests/manual/compare_parsers.py aapl --sections
|
||||
```
|
||||
|
||||
**Verify**:
|
||||
- Standard sections identified (10-K/10-Q)
|
||||
- Section boundaries correct
|
||||
- Text length reasonable per section
|
||||
|
||||
### 5. Run Full Test Suite
|
||||
|
||||
```bash
|
||||
# Test all files in corpus
|
||||
python tests/manual/compare_parsers.py --all
|
||||
```
|
||||
|
||||
**Results**:
|
||||
- Summary table across all files
|
||||
- Performance comparison
|
||||
- Table detection comparison
|
||||
|
||||
## Test Files
|
||||
|
||||
Available in `data/html/`:
|
||||
|
||||
- `Apple.10-K.html` - 1.8MB, complex financials
|
||||
- `Oracle.10-K.html` - Large filing
|
||||
- `Nvidia.10-K.html` - Tech company
|
||||
- `Apple.10-Q.html` - Quarterly format
|
||||
- More files as needed...
|
||||
|
||||
## Command Reference
|
||||
|
||||
```
|
||||
python tests/manual/compare_parsers.py [FILE|SHORTCUT] [OPTIONS]
|
||||
|
||||
Options:
|
||||
--all Run on all test files
|
||||
--tables Show tables summary (first 20 tables)
|
||||
--table N Show specific table N side-by-side (FULL table)
|
||||
--range START:END Show range of tables (e.g., 5:10)
|
||||
--text Show text comparison (first 50 lines by default)
|
||||
--sections Show sections comparison
|
||||
--lines N Number of text lines to show (default: 50, only for --text)
|
||||
--help Show full help
|
||||
```
|
||||
|
||||
### Output Limits Summary
|
||||
|
||||
| Mode | Limit | Configurable | Notes |
|
||||
|---------------|------------|-------------------|---------------------------------|
|
||||
| `--table N` | None | N/A | Shows **complete table** |
|
||||
| `--range N:M` | None | N/A | Shows **complete tables** in range |
|
||||
| `--tables` | 20 tables | No | Lists first 20 tables only |
|
||||
| `--text` | 50 lines | Yes (`--lines N`) | Preview only |
|
||||
| `--sections` | None | N/A | Shows all sections |
|
||||
|
||||
## Output Interpretation
|
||||
|
||||
### Overview Table
|
||||
|
||||
```
|
||||
┏━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━┓
|
||||
┃ Metric ┃ Old Parser ┃ New Parser ┃ Notes ┃
|
||||
┡━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━┩
|
||||
│ Parse Time │ 454ms │ 334ms │ 1.4x faster│
|
||||
│ Tables Found │ 63 │ 63 │ +0 │
|
||||
│ Text Length │ 0 │ 159,388 │ NEW! │
|
||||
└───────────────┴────────────┴────────────┴────────────┘
|
||||
```
|
||||
|
||||
**Good signs**:
|
||||
- ✅ New parser faster or similar speed
|
||||
- ✅ Same or more tables found
|
||||
- ✅ Text extracted (old parser shows 0)
|
||||
- ✅ Sections detected
|
||||
|
||||
**Red flags**:
|
||||
- ❌ Significantly slower
|
||||
- ❌ Fewer tables (unless the new parser is intentionally dropping layout tables)
|
||||
- ❌ Much shorter text (content missing)
|
||||
|
||||
### Table Comparison
|
||||
|
||||
```
|
||||
Old Parser:
|
||||
┌─────────┬──────────┬──────────┐
|
||||
│ Year │ Revenue │ Profit │
|
||||
├─────────┼──────────┼──────────┤
|
||||
│ 2023 │ $ 100M │ $ 20M │ <- Currency separated
|
||||
└─────────┴──────────┴──────────┘
|
||||
|
||||
New Parser:
|
||||
┌─────────┬──────────┬──────────┐
|
||||
│ Year │ Revenue │ Profit │
|
||||
├─────────┼──────────┼──────────┤
|
||||
│ 2023 │ $100M │ $20M │ <- Currency merged ✅
|
||||
└─────────┴──────────┴──────────┘
|
||||
```
|
||||
|
||||
**Look for**:
|
||||
- Currency symbols merged with values
|
||||
- No extra empty columns
|
||||
- Proper alignment
|
||||
- Clean numeric formatting
|
||||
|
||||
## Tips
|
||||
|
||||
1. **Start with overview** - Get the big picture first
|
||||
2. **Check tables visually** - Automated metrics miss formatting issues
|
||||
3. **Use specific table inspection** - Don't scroll through 60 tables manually
|
||||
4. **Compare text for semantics** - Does it make sense for an LLM?
|
||||
5. **Run --all periodically** - Catch regressions across files
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Script fails with import error
|
||||
|
||||
```bash
|
||||
# Clear cached modules
|
||||
find . -type d -name __pycache__ -exec rm -rf {} +
|
||||
python tests/manual/compare_parsers.py data/html/Apple.10-K.html
|
||||
```
|
||||
|
||||
### File not found
|
||||
|
||||
```bash
|
||||
# Check available files
|
||||
ls -lh data/html/*.html
|
||||
|
||||
# Use full path
|
||||
python tests/manual/compare_parsers.py /full/path/to/file.html
|
||||
```
|
||||
|
||||
### Old parser shows 0 text
|
||||
|
||||
This is expected: the old parser uses a different text extraction approach. Focus instead on:
|
||||
- Table comparison
|
||||
- Parse time
|
||||
- Visual quality of output
|
||||
|
||||
## Next Steps
|
||||
|
||||
1. Run comparison on all test files
|
||||
2. Document bugs in `quality-improvement-strategy.md`
|
||||
3. Fix issues
|
||||
4. Repeat until satisfied
|
||||
|
||||
See `edgar/documents/docs/quality-improvement-strategy.md` for full process.
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user