Initial commit
This commit is contained in:
101
venv/lib/python3.10/site-packages/edgar/ai/formats.py
Normal file
101
venv/lib/python3.10/site-packages/edgar/ai/formats.py
Normal file
@@ -0,0 +1,101 @@
|
||||
"""
|
||||
AI-optimized text formatting utilities for EdgarTools.
|
||||
|
||||
Provides research-backed text formats optimized for LLM accuracy and token efficiency:
|
||||
- Markdown-KV: Best accuracy (60.7%) for metadata
|
||||
- TSV: Most efficient for tabular data
|
||||
|
||||
Based on research from improvingagents.com/blog/best-input-data-format-for-llms
|
||||
"""
|
||||
|
||||
from typing import List, Dict
|
||||
|
||||
__all__ = ['to_markdown_kv', 'to_tsv']
|
||||
|
||||
|
||||
def to_markdown_kv(data: dict, max_tokens: int = 2000) -> str:
|
||||
"""
|
||||
Convert dict to Markdown Key-Value format optimized for LLMs.
|
||||
|
||||
Research shows Markdown-KV format provides:
|
||||
- 60.7% accuracy (best among tested formats)
|
||||
- 25% fewer tokens than JSON
|
||||
- Better readability for both humans and AI
|
||||
|
||||
Source: improvingagents.com/blog/best-input-data-format-for-llms
|
||||
|
||||
Args:
|
||||
data: Dictionary with string keys and simple values
|
||||
max_tokens: Approximate token limit (4 chars/token heuristic)
|
||||
|
||||
Returns:
|
||||
Markdown-formatted key-value text
|
||||
|
||||
Example:
|
||||
>>> to_markdown_kv({"name": "Apple Inc.", "cik": "320193"})
|
||||
'**Name:** Apple Inc.\\n**Cik:** 320193'
|
||||
"""
|
||||
lines = []
|
||||
for key, value in data.items():
|
||||
if value is None:
|
||||
continue
|
||||
# Convert key to title case for readability
|
||||
display_key = key.replace('_', ' ').title()
|
||||
lines.append(f"**{display_key}:** {value}")
|
||||
|
||||
text = "\n".join(lines)
|
||||
|
||||
# Token limiting (4 chars/token heuristic)
|
||||
max_chars = max_tokens * 4
|
||||
if len(text) > max_chars:
|
||||
text = text[:max_chars] + "\n\n[Truncated for token limit]"
|
||||
|
||||
return text
|
||||
|
||||
|
||||
def to_tsv(rows: List[Dict], headers: List[str], max_tokens: int = 2000, limit: int = 10) -> str:
|
||||
"""
|
||||
Convert list of dicts to TSV (tab-separated values) format.
|
||||
|
||||
TSV is extremely token-efficient for tabular data and provides better
|
||||
accuracy than CSV. This pattern is proven in MultiPeriodStatement.to_llm_string().
|
||||
|
||||
Args:
|
||||
rows: List of dicts with consistent keys
|
||||
headers: Column headers to include
|
||||
max_tokens: Approximate token limit (4 chars/token heuristic)
|
||||
limit: Maximum rows to include (default: 10)
|
||||
|
||||
Returns:
|
||||
Tab-separated values with header row
|
||||
|
||||
Example:
|
||||
>>> rows = [{"form": "10-K", "cik": "320193"}, {"form": "10-Q", "cik": "789019"}]
|
||||
>>> to_tsv(rows, ["form", "cik"], limit=2)
|
||||
'form\\tcik\\n10-K\\t320193\\n10-Q\\t789019'
|
||||
"""
|
||||
lines = []
|
||||
|
||||
# Header row
|
||||
lines.append("\t".join(headers))
|
||||
|
||||
# Data rows
|
||||
for row in rows[:limit]:
|
||||
values = [str(row.get(h, "N/A")) for h in headers]
|
||||
lines.append("\t".join(values))
|
||||
|
||||
text = "\n".join(lines)
|
||||
|
||||
# Add summary if truncated
|
||||
if len(rows) > limit:
|
||||
text += f"\n\n[Showing {limit} of {len(rows)} rows]"
|
||||
|
||||
# Token limiting
|
||||
max_chars = max_tokens * 4
|
||||
if len(text) > max_chars:
|
||||
# Estimate rows that fit
|
||||
avg_row_size = len(text) // len(lines) if lines else 100
|
||||
rows_that_fit = max(1, max_chars // avg_row_size)
|
||||
text = "\n".join(lines[:rows_that_fit]) + "\n\n[Truncated for token limit]"
|
||||
|
||||
return text
|
||||
Reference in New Issue
Block a user