Initial commit

2025-12-09 12:13:01 +01:00
commit 8e654ed209
13332 changed files with 2695056 additions and 0 deletions
--- a/venv/lib/python3.10/site-packages/edgar/ai/formats.py
+++ b/venv/lib/python3.10/site-packages/edgar/ai/formats.py
@@ -0,0 +1,101 @@
+"""
+AI-optimized text formatting utilities for EdgarTools.
+
+Provides research-backed text formats optimized for LLM accuracy and token efficiency:
+- Markdown-KV: Best accuracy (60.7%) for metadata
+- TSV: Most efficient for tabular data
+
+Based on research from improvingagents.com/blog/best-input-data-format-for-llms
+"""
+
+from typing import List, Dict
+
+__all__ = ['to_markdown_kv', 'to_tsv']  # public API: the two formatter functions defined below
+
+
+def to_markdown_kv(data: dict, max_tokens: int = 2000) -> str:
+    """
+    Convert dict to Markdown Key-Value format optimized for LLMs.
+
+    Research shows Markdown-KV format provides:
+    - 60.7% accuracy (best among tested formats)
+    - 25% fewer tokens than JSON
+    - Better readability for both humans and AI
+
+    Source: improvingagents.com/blog/best-input-data-format-for-llms
+
+    Args:
+        data: Dictionary with string keys and simple values
+        max_tokens: Approximate token limit (4 chars/token heuristic)
+
+    Returns:
+        Markdown-formatted key-value text
+
+    Example:
+        >>> to_markdown_kv({"name": "Apple Inc.", "cik": "320193"})
+        '**Name:** Apple Inc.\\n**Cik:** 320193'
+    """
+    lines = []
+    for key, value in data.items():
+        if value is None:
+            continue
+        # Convert key to title case for readability
+        display_key = key.replace('_', ' ').title()
+        lines.append(f"**{display_key}:** {value}")
+
+    text = "\n".join(lines)
+
+    # Token limiting (4 chars/token heuristic)
+    max_chars = max_tokens * 4
+    if len(text) > max_chars:
+        text = text[:max_chars] + "\n\n[Truncated for token limit]"
+
+    return text
+
+
+def to_tsv(rows: List[Dict], headers: List[str], max_tokens: int = 2000, limit: int = 10) -> str:
+    """
+    Convert list of dicts to TSV (tab-separated values) format.
+
+    TSV is extremely token-efficient for tabular data and provides better
+    accuracy than CSV. This pattern is proven in MultiPeriodStatement.to_llm_string().
+
+    Args:
+        rows: List of dicts with consistent keys
+        headers: Column headers to include
+        max_tokens: Approximate token limit (4 chars/token heuristic)
+        limit: Maximum rows to include (default: 10)
+
+    Returns:
+        Tab-separated values with header row
+
+    Example:
+        >>> rows = [{"form": "10-K", "cik": "320193"}, {"form": "10-Q", "cik": "789019"}]
+        >>> to_tsv(rows, ["form", "cik"], limit=2)
+        'form\\tcik\\n10-K\\t320193\\n10-Q\\t789019'
+    """
+    lines = []
+
+    # Header row
+    lines.append("\t".join(headers))
+
+    # Data rows
+    for row in rows[:limit]:
+        values = [str(row.get(h, "N/A")) for h in headers]
+        lines.append("\t".join(values))
+
+    text = "\n".join(lines)
+
+    # Add summary if truncated
+    if len(rows) > limit:
+        text += f"\n\n[Showing {limit} of {len(rows)} rows]"
+
+    # Token limiting
+    max_chars = max_tokens * 4
+    if len(text) > max_chars:
+        # Estimate rows that fit
+        avg_row_size = len(text) // len(lines) if lines else 100
+        rows_that_fit = max(1, max_chars // avg_row_size)
+        text = "\n".join(lines[:rows_that_fit]) + "\n\n[Truncated for token limit]"
+
+    return text