931 lines
33 KiB
Python
931 lines
33 KiB
Python
"""
|
|
Document model for parsed HTML.
|
|
"""
|
|
|
|
from dataclasses import dataclass, field
|
|
from typing import Dict, List, Optional, Any, Iterator
|
|
|
|
from rich.table import Table as RichTable
|
|
from rich.console import Group
|
|
from rich.text import Text
|
|
from edgar.richtools import repr_rich
|
|
|
|
from edgar.documents.nodes import Node, SectionNode
|
|
from edgar.documents.table_nodes import TableNode
|
|
from edgar.documents.types import XBRLFact, SearchResult
|
|
|
|
|
|
@dataclass
|
|
class DocumentMetadata:
|
|
"""
|
|
Document metadata.
|
|
|
|
Contains information about the source document and parsing process.
|
|
"""
|
|
source: Optional[str] = None
|
|
form: Optional[str] = None
|
|
company: Optional[str] = None
|
|
cik: Optional[str] = None
|
|
accession_number: Optional[str] = None
|
|
filing_date: Optional[str] = None
|
|
report_date: Optional[str] = None
|
|
url: Optional[str] = None
|
|
size: int = 0
|
|
parse_time: float = 0.0
|
|
parser_version: str = "2.0.0"
|
|
xbrl_data: Optional[List[XBRLFact]] = None
|
|
preserve_whitespace: bool = False
|
|
original_html: Optional[str] = None # Store original HTML for anchor analysis
|
|
|
|
def to_dict(self) -> Dict[str, Any]:
|
|
"""Convert metadata to dictionary."""
|
|
return {
|
|
'source': self.source,
|
|
'form': self.form,
|
|
'company': self.company,
|
|
'cik': self.cik,
|
|
'accession_number': self.accession_number,
|
|
'filing_date': self.filing_date,
|
|
'report_date': self.report_date,
|
|
'url': self.url,
|
|
'size': self.size,
|
|
'parse_time': self.parse_time,
|
|
'parser_version': self.parser_version,
|
|
'xbrl_data': [fact.to_dict() for fact in self.xbrl_data] if self.xbrl_data else None
|
|
}
|
|
|
|
|
|
@dataclass
|
|
class Section:
|
|
"""
|
|
Document section representation.
|
|
|
|
Represents a logical section of the document (e.g., Risk Factors, MD&A).
|
|
|
|
Attributes:
|
|
name: Section identifier (e.g., "item_1", "part_i_item_1", "risk_factors")
|
|
title: Display title (e.g., "Item 1 - Business")
|
|
node: Node containing section content
|
|
start_offset: Character position where section starts
|
|
end_offset: Character position where section ends
|
|
confidence: Detection confidence score (0.0-1.0)
|
|
detection_method: How section was detected ('toc', 'heading', 'pattern')
|
|
validated: Whether section has been cross-validated
|
|
part: Optional part identifier for 10-Q filings ("I", "II", or None for 10-K)
|
|
item: Optional item identifier (e.g., "1", "1A", "2")
|
|
_text_extractor: Optional callback for lazy text extraction (for TOC-based sections)
|
|
"""
|
|
name: str
|
|
title: str
|
|
node: SectionNode
|
|
start_offset: int = 0
|
|
end_offset: int = 0
|
|
confidence: float = 1.0 # Detection confidence (0.0-1.0)
|
|
detection_method: str = 'unknown' # 'toc', 'heading', 'pattern', or 'unknown'
|
|
validated: bool = False # Cross-validated flag
|
|
part: Optional[str] = None # Part identifier for 10-Q: "I", "II", or None for 10-K
|
|
item: Optional[str] = None # Item identifier: "1", "1A", "2", etc.
|
|
_text_extractor: Optional[Any] = field(default=None, repr=False) # Callback for lazy text extraction
|
|
|
|
def text(self, **kwargs) -> str:
|
|
"""Extract text from section."""
|
|
# If we have a text extractor callback (TOC-based sections), use it
|
|
if self._text_extractor is not None:
|
|
return self._text_extractor(self.name, **kwargs)
|
|
|
|
# Otherwise extract from node (heading/pattern-based sections)
|
|
from edgar.documents.extractors.text_extractor import TextExtractor
|
|
extractor = TextExtractor(**kwargs)
|
|
return extractor.extract_from_node(self.node)
|
|
|
|
def tables(self) -> List[TableNode]:
|
|
"""Get all tables in section."""
|
|
return self.node.find(lambda n: isinstance(n, TableNode))
|
|
|
|
def search(self, query: str) -> List[SearchResult]:
|
|
"""Search within section."""
|
|
# Implementation would use semantic search
|
|
results = []
|
|
# Simple text search for now
|
|
text = self.text().lower()
|
|
query_lower = query.lower()
|
|
|
|
if query_lower in text:
|
|
# Find snippet around match
|
|
index = text.find(query_lower)
|
|
start = max(0, index - 50)
|
|
end = min(len(text), index + len(query) + 50)
|
|
snippet = text[start:end]
|
|
|
|
results.append(SearchResult(
|
|
node=self.node,
|
|
score=1.0,
|
|
snippet=snippet,
|
|
section=self.name
|
|
))
|
|
|
|
return results
|
|
|
|
@staticmethod
|
|
def parse_section_name(section_name: str) -> tuple[Optional[str], Optional[str]]:
|
|
"""
|
|
Parse section name to extract part and item identifiers.
|
|
|
|
Handles both 10-Q part-aware names and 10-K simple names.
|
|
|
|
Args:
|
|
section_name: Section identifier (e.g., "part_i_item_1", "item_1a", "risk_factors")
|
|
|
|
Returns:
|
|
Tuple of (part, item) where:
|
|
- part: "I", "II", or None for 10-K sections
|
|
- item: "1", "1A", "2", etc. or None if not an item section
|
|
|
|
Examples:
|
|
>>> Section.parse_section_name("part_i_item_1")
|
|
("I", "1")
|
|
>>> Section.parse_section_name("part_ii_item_1a")
|
|
("II", "1A")
|
|
>>> Section.parse_section_name("item_7")
|
|
(None, "7")
|
|
>>> Section.parse_section_name("risk_factors")
|
|
(None, None)
|
|
"""
|
|
import re
|
|
|
|
section_lower = section_name.lower()
|
|
|
|
# Match 10-Q format: "part_i_item_1", "part_ii_item_1a"
|
|
part_item_match = re.match(r'part_([ivx]+)_item_(\d+[a-z]?)', section_lower)
|
|
if part_item_match:
|
|
part_roman = part_item_match.group(1).upper()
|
|
item_num = part_item_match.group(2).upper()
|
|
return (part_roman, item_num)
|
|
|
|
# Match 10-K format: "item_1", "item_1a", "item_7"
|
|
item_match = re.match(r'item_(\d+[a-z]?)', section_lower)
|
|
if item_match:
|
|
item_num = item_match.group(1).upper()
|
|
return (None, item_num)
|
|
|
|
# Not a structured item section
|
|
return (None, None)
|
|
|
|
|
|
class Sections(Dict[str, Section]):
|
|
"""
|
|
Dictionary wrapper for sections with rich display support.
|
|
|
|
Behaves like a normal dict but provides beautiful terminal display
|
|
via __rich__() method when printed in rich-enabled environments.
|
|
"""
|
|
|
|
def __rich__(self):
|
|
"""Return rich representation for display."""
|
|
if not self:
|
|
return Text("No sections detected", style="dim")
|
|
|
|
# Create summary table
|
|
table = RichTable(title="Document Sections", show_header=True, header_style="bold magenta")
|
|
table.add_column("Section", style="cyan", no_wrap=True)
|
|
table.add_column("Title", style="white")
|
|
table.add_column("Confidence", justify="right", style="green")
|
|
table.add_column("Method", style="yellow")
|
|
table.add_column("Part/Item", style="blue")
|
|
|
|
# Sort sections by part (roman numeral) and item number
|
|
def sort_key(item):
|
|
name, section = item
|
|
# Convert roman numerals to integers for sorting
|
|
roman_to_int = {'i': 1, 'ii': 2, 'iii': 3, 'iv': 4, 'v': 5}
|
|
|
|
part = section.part.lower() if section.part else ''
|
|
item_str = section.item if section.item else ''
|
|
|
|
# Extract part number
|
|
part_num = roman_to_int.get(part, 0)
|
|
|
|
# Extract item number and letter
|
|
import re
|
|
if item_str:
|
|
match = re.match(r'(\d+)([a-z]?)', item_str.lower())
|
|
if match:
|
|
item_num = int(match.group(1))
|
|
item_letter = match.group(2) or ''
|
|
return (part_num, item_num, item_letter)
|
|
|
|
# Fallback to name sorting
|
|
return (part_num, 999, name)
|
|
|
|
sorted_sections = sorted(self.items(), key=sort_key)
|
|
|
|
# Add rows for each section
|
|
for name, section in sorted_sections:
|
|
# Format confidence as percentage
|
|
confidence = f"{section.confidence:.1%}"
|
|
|
|
# Format part/item info
|
|
part_item = ""
|
|
if section.part and section.item:
|
|
part_item = f"Part {section.part}, Item {section.item}"
|
|
elif section.item:
|
|
part_item = f"Item {section.item}"
|
|
elif section.part:
|
|
part_item = f"Part {section.part}"
|
|
|
|
# Truncate title if too long
|
|
title = section.title
|
|
if len(title) > 50:
|
|
title = title[:47] + "..."
|
|
|
|
table.add_row(
|
|
name,
|
|
title,
|
|
confidence,
|
|
section.detection_method,
|
|
part_item
|
|
)
|
|
|
|
# Create summary stats
|
|
total = len(self)
|
|
high_conf = sum(1 for s in self.values() if s.confidence >= 0.8)
|
|
methods = {}
|
|
for section in self.values():
|
|
methods[section.detection_method] = methods.get(section.detection_method, 0) + 1
|
|
|
|
summary = Text()
|
|
summary.append(f"\nTotal: {total} sections | ", style="dim")
|
|
summary.append(f"High confidence (≥80%): {high_conf} | ", style="dim")
|
|
summary.append(f"Methods: {', '.join(f'{m}={c}' for m, c in methods.items())}", style="dim")
|
|
|
|
return Group(table, summary)
|
|
|
|
def __repr__(self):
|
|
return repr_rich(self.__rich__())
|
|
|
|
def get_item(self, item: str, part: str = None) -> Optional[Section]:
|
|
"""
|
|
Get section by item number with optional part specification.
|
|
|
|
Args:
|
|
item: Item identifier (e.g., "1", "1A", "7", "Item 1", "Item 7A")
|
|
part: Optional part specification (e.g., "I", "II", "Part I", "Part II")
|
|
If not specified and multiple parts contain the item, returns first match.
|
|
|
|
Returns:
|
|
Section object if found, None otherwise
|
|
|
|
Examples:
|
|
>>> sections.get_item("1") # Returns first Item 1 (any part)
|
|
>>> sections.get_item("1", "I") # Returns Part I, Item 1
|
|
>>> sections.get_item("Item 1A") # Returns first Item 1A
|
|
>>> sections.get_item("7A", "II") # Returns Part II, Item 7A
|
|
"""
|
|
# Normalize item string - remove "Item " prefix if present
|
|
item_clean = item.replace("Item ", "").replace("item ", "").strip().upper()
|
|
|
|
# Normalize part string if provided
|
|
part_clean = None
|
|
if part:
|
|
part_clean = part.replace("Part ", "").replace("part ", "").replace("PART ", "").strip().upper()
|
|
|
|
# Search through sections
|
|
for name, section in self.items():
|
|
if section.item and section.item.upper() == item_clean:
|
|
if part_clean is None:
|
|
# No part specified - return first match
|
|
return section
|
|
elif section.part and section.part.upper() == part_clean:
|
|
# Part matches
|
|
return section
|
|
|
|
return None
|
|
|
|
def get_part(self, part: str) -> Dict[str, Section]:
|
|
"""
|
|
Get all sections in a specific part.
|
|
|
|
Args:
|
|
part: Part identifier (e.g., "I", "II", "Part I", "Part II")
|
|
|
|
Returns:
|
|
Dictionary of sections in that part
|
|
|
|
Examples:
|
|
>>> sections.get_part("I") # All Part I sections
|
|
>>> sections.get_part("Part II") # All Part II sections
|
|
"""
|
|
# Normalize part string
|
|
part_clean = part.replace("Part ", "").replace("part ", "").replace("PART ", "").strip().upper()
|
|
|
|
result = {}
|
|
for name, section in self.items():
|
|
if section.part and section.part.upper() == part_clean:
|
|
result[name] = section
|
|
|
|
return result
|
|
|
|
def get(self, key, default=None):
|
|
"""
|
|
Enhanced get method that supports flexible key formats.
|
|
|
|
Supports:
|
|
- Standard dict key: "part_i_item_1"
|
|
- Item number: "Item 1", "1", "1A"
|
|
- Part+Item: ("I", "1"), ("Part II", "7A")
|
|
|
|
Args:
|
|
key: Section key (string or tuple)
|
|
default: Default value if not found
|
|
|
|
Returns:
|
|
Section object or default value
|
|
"""
|
|
# Try standard dict lookup first
|
|
if isinstance(key, str):
|
|
result = super().get(key, None)
|
|
if result is not None:
|
|
return result
|
|
|
|
# Try as item number
|
|
result = self.get_item(key)
|
|
if result is not None:
|
|
return result
|
|
|
|
# Try as (part, item) tuple
|
|
elif isinstance(key, tuple) and len(key) == 2:
|
|
part, item = key
|
|
result = self.get_item(item, part)
|
|
if result is not None:
|
|
return result
|
|
|
|
return default
|
|
|
|
def __getitem__(self, key):
|
|
"""
|
|
Enhanced __getitem__ that supports flexible key formats.
|
|
|
|
Supports:
|
|
- Standard dict key: sections["part_i_item_1"]
|
|
- Item number: sections["Item 1"], sections["1A"]
|
|
- Part+Item tuple: sections[("I", "1")], sections[("II", "7A")]
|
|
|
|
Raises KeyError if not found (standard dict behavior).
|
|
"""
|
|
# Try standard dict lookup first
|
|
if isinstance(key, str):
|
|
try:
|
|
return super().__getitem__(key)
|
|
except KeyError:
|
|
# Try as item number
|
|
result = self.get_item(key)
|
|
if result is not None:
|
|
return result
|
|
|
|
# Try as (part, item) tuple
|
|
elif isinstance(key, tuple) and len(key) == 2:
|
|
part, item = key
|
|
result = self.get_item(item, part)
|
|
if result is not None:
|
|
return result
|
|
|
|
# Not found - raise KeyError
|
|
raise KeyError(key)
|
|
|
|
|
|
@dataclass
|
|
class Document:
|
|
"""
|
|
Main document class.
|
|
|
|
Represents a parsed HTML document with methods for content extraction,
|
|
search, and transformation.
|
|
"""
|
|
|
|
# Core properties
|
|
root: Node
|
|
metadata: DocumentMetadata = field(default_factory=DocumentMetadata)
|
|
|
|
# Cached extractions
|
|
_sections: Optional[Sections] = field(default=None, init=False, repr=False)
|
|
_tables: Optional[List[TableNode]] = field(default=None, init=False, repr=False)
|
|
_headings: Optional[List[Node]] = field(default=None, init=False, repr=False)
|
|
_xbrl_facts: Optional[List[XBRLFact]] = field(default=None, init=False, repr=False)
|
|
_text_cache: Optional[str] = field(default=None, init=False, repr=False)
|
|
_config: Optional[Any] = field(default=None, init=False, repr=False) # ParserConfig reference
|
|
|
|
@property
|
|
def sections(self) -> Sections:
|
|
"""
|
|
Get document sections using hybrid multi-strategy detection.
|
|
|
|
Tries detection methods in order of reliability:
|
|
1. TOC-based (0.95 confidence)
|
|
2. Heading-based (0.7-0.9 confidence)
|
|
3. Pattern-based (0.6 confidence)
|
|
|
|
Returns a Sections dictionary wrapper that provides rich terminal display
|
|
via __rich__() method. Each section includes confidence score and detection method.
|
|
"""
|
|
if self._sections is None:
|
|
# Get form type from config or metadata
|
|
form = None
|
|
if self._config and hasattr(self._config, 'form'):
|
|
form = self._config.form
|
|
elif self.metadata and self.metadata.form:
|
|
form = self.metadata.form
|
|
|
|
# Only detect sections for supported form types (including amendments)
|
|
# Normalize form type by removing /A suffix for amendments
|
|
base_form = form.replace('/A', '') if form else None
|
|
|
|
if base_form and base_form in ['10-K', '10-Q', '8-K']:
|
|
from edgar.documents.extractors.hybrid_section_detector import HybridSectionDetector
|
|
# Pass thresholds from config if available
|
|
thresholds = self._config.detection_thresholds if self._config else None
|
|
# Use base form type for detection (10-K/A → 10-K)
|
|
detector = HybridSectionDetector(self, base_form, thresholds)
|
|
detected_sections = detector.detect_sections()
|
|
else:
|
|
# Fallback to pattern-based for other types or unknown
|
|
from edgar.documents.extractors.pattern_section_extractor import SectionExtractor
|
|
extractor = SectionExtractor(form) if form else SectionExtractor()
|
|
detected_sections = extractor.extract(self)
|
|
|
|
# Wrap detected sections in Sections class for rich display
|
|
self._sections = Sections(detected_sections)
|
|
|
|
return self._sections
|
|
|
|
@property
|
|
def tables(self) -> List[TableNode]:
|
|
"""Get all tables in document."""
|
|
if self._tables is None:
|
|
self._tables = self.root.find(lambda n: isinstance(n, TableNode))
|
|
return self._tables
|
|
|
|
@property
|
|
def headings(self) -> List[Node]:
|
|
"""Get all headings in document."""
|
|
if self._headings is None:
|
|
from edgar.documents.nodes import HeadingNode
|
|
self._headings = self.root.find(lambda n: isinstance(n, HeadingNode))
|
|
return self._headings
|
|
|
|
@property
|
|
def xbrl_facts(self) -> List[XBRLFact]:
|
|
"""Get all XBRL facts in document."""
|
|
if self._xbrl_facts is None:
|
|
self._xbrl_facts = self._extract_xbrl_facts()
|
|
return self._xbrl_facts
|
|
|
|
def text(self,
|
|
clean: bool = True,
|
|
include_tables: bool = True,
|
|
include_metadata: bool = False,
|
|
max_length: Optional[int] = None) -> str:
|
|
"""
|
|
Extract text from document.
|
|
|
|
Args:
|
|
clean: Clean and normalize text
|
|
include_tables: Include table content in text
|
|
include_metadata: Include metadata annotations
|
|
max_length: Maximum text length
|
|
|
|
Returns:
|
|
Extracted text
|
|
"""
|
|
# Use cache if available and parameters match
|
|
if (self._text_cache is not None and
|
|
clean and not include_tables and not include_metadata and max_length is None):
|
|
return self._text_cache
|
|
|
|
# If whitespace was preserved during parsing and clean is default (True),
|
|
# respect the preserve_whitespace setting
|
|
if self.metadata.preserve_whitespace and clean:
|
|
clean = False
|
|
|
|
from edgar.documents.extractors.text_extractor import TextExtractor
|
|
extractor = TextExtractor(
|
|
clean=clean,
|
|
include_tables=include_tables,
|
|
include_metadata=include_metadata,
|
|
max_length=max_length
|
|
)
|
|
text = extractor.extract(self)
|
|
|
|
# Apply navigation link filtering when cleaning
|
|
if clean:
|
|
# Use cached/integrated navigation filtering (optimized approach)
|
|
try:
|
|
from edgar.documents.utils.anchor_cache import filter_with_cached_patterns
|
|
# Use minimal cached approach (no memory overhead)
|
|
original_html = getattr(self.metadata, 'original_html', None)
|
|
text = filter_with_cached_patterns(text, html_content=original_html)
|
|
except:
|
|
# Fallback to pattern-based filtering
|
|
from edgar.documents.utils.toc_filter import filter_toc_links
|
|
text = filter_toc_links(text)
|
|
|
|
# Cache if using default parameters
|
|
if clean and not include_tables and not include_metadata and max_length is None:
|
|
self._text_cache = text
|
|
|
|
return text
|
|
|
|
def search(self, query: str, top_k: int = 10) -> List[SearchResult]:
|
|
"""
|
|
Search document for query.
|
|
|
|
Args:
|
|
query: Search query
|
|
top_k: Maximum results to return
|
|
|
|
Returns:
|
|
List of search results
|
|
"""
|
|
from edgar.documents.search import DocumentSearch
|
|
searcher = DocumentSearch(self)
|
|
return searcher.search(query, top_k=top_k)
|
|
|
|
def get_section(self, section_name: str, part: Optional[str] = None) -> Optional[Section]:
|
|
"""
|
|
Get section by name with optional part specification for 10-Q filings.
|
|
|
|
Args:
|
|
section_name: Section identifier (e.g., "item_1", "part_i_item_1")
|
|
part: Optional part specification for 10-Q ("I", "II", "i", "ii")
|
|
If provided, searches for "part_{part}_{section_name}"
|
|
|
|
Returns:
|
|
Section object if found, None otherwise
|
|
|
|
Examples:
|
|
# 10-K usage (unchanged)
|
|
>>> doc.get_section("item_1") # Returns Item 1
|
|
|
|
# 10-Q usage with explicit part
|
|
>>> doc.get_section("item_1", part="I") # Returns Part I Item 1
|
|
>>> doc.get_section("item_1", part="II") # Returns Part II Item 1
|
|
|
|
# 10-Q usage with full name
|
|
>>> doc.get_section("part_i_item_1") # Returns Part I Item 1
|
|
"""
|
|
# If part is specified, construct part-aware name
|
|
if part:
|
|
part_normalized = part.upper()
|
|
# Remove "item_" prefix if present in section_name
|
|
item_name = section_name.replace("item_", "") if section_name.startswith("item_") else section_name
|
|
full_name = f"part_{part_normalized.lower()}_item_{item_name.lower()}"
|
|
return self.sections.get(full_name)
|
|
|
|
# Direct lookup (works for both 10-K "item_1" and 10-Q "part_i_item_1")
|
|
section = self.sections.get(section_name)
|
|
if section:
|
|
return section
|
|
|
|
# If not found and looks like an item without part, check if we have multiple parts
|
|
# In that case, raise a helpful error
|
|
if section_name.startswith("item_") or section_name.replace("_", "").startswith("item"):
|
|
# Check if we have part-aware sections (10-Q)
|
|
matching_sections = [name for name in self.sections.keys()
|
|
if section_name in name and "part_" in name]
|
|
if matching_sections:
|
|
# Multiple parts available - user needs to specify which one
|
|
parts = sorted(set(s.split("_")[1] for s in matching_sections if s.startswith("part_")))
|
|
raise ValueError(
|
|
f"Ambiguous section '{section_name}' in 10-Q filing. "
|
|
f"Found in parts: {parts}. "
|
|
f"Please specify part: get_section('{section_name}', part='I') or part='II'"
|
|
)
|
|
|
|
return None
|
|
|
|
def extract_section_text(self, section_name: str) -> Optional[str]:
|
|
"""Extract text from specific section."""
|
|
section = self.get_section(section_name)
|
|
if section:
|
|
return section.text()
|
|
return None
|
|
|
|
def get_sec_section(self, section_name: str, clean: bool = True,
|
|
include_subsections: bool = True) -> Optional[str]:
|
|
"""
|
|
Extract content from a specific SEC filing section using anchor analysis.
|
|
|
|
Args:
|
|
section_name: Section name (e.g., "Item 1", "Item 1A", "Part I")
|
|
clean: Whether to apply text cleaning and navigation filtering
|
|
include_subsections: Whether to include subsections
|
|
|
|
Returns:
|
|
Section text content or None if section not found
|
|
|
|
Examples:
|
|
>>> doc.get_sec_section("Item 1") # Business description
|
|
>>> doc.get_sec_section("Item 1A") # Risk factors
|
|
>>> doc.get_sec_section("Item 7") # MD&A
|
|
"""
|
|
# Lazy-load section extractor
|
|
if not hasattr(self, '_section_extractor'):
|
|
from edgar.documents.extractors.toc_section_extractor import SECSectionExtractor
|
|
self._section_extractor = SECSectionExtractor(self)
|
|
|
|
return self._section_extractor.get_section_text(
|
|
section_name, include_subsections, clean
|
|
)
|
|
|
|
def get_available_sec_sections(self) -> List[str]:
|
|
"""
|
|
Get list of SEC sections available for extraction.
|
|
|
|
Returns:
|
|
List of section names that can be passed to get_sec_section()
|
|
|
|
Example:
|
|
>>> sections = doc.get_available_sec_sections()
|
|
>>> print(sections)
|
|
['Part I', 'Item 1', 'Item 1A', 'Item 1B', 'Item 2', ...]
|
|
"""
|
|
if not hasattr(self, '_section_extractor'):
|
|
from edgar.documents.extractors.toc_section_extractor import SECSectionExtractor
|
|
self._section_extractor = SECSectionExtractor(self)
|
|
|
|
return self._section_extractor.get_available_sections()
|
|
|
|
def get_sec_section_info(self, section_name: str) -> Optional[Dict]:
|
|
"""
|
|
Get detailed information about an SEC section.
|
|
|
|
Args:
|
|
section_name: Section name to look up
|
|
|
|
Returns:
|
|
Dict with section metadata including anchor info
|
|
"""
|
|
if not hasattr(self, '_section_extractor'):
|
|
from edgar.documents.extractors.toc_section_extractor import SECSectionExtractor
|
|
self._section_extractor = SECSectionExtractor(self)
|
|
|
|
return self._section_extractor.get_section_info(section_name)
|
|
|
|
def to_markdown(self) -> str:
|
|
"""Convert document to Markdown."""
|
|
from edgar.documents.renderers.markdown_renderer import MarkdownRenderer
|
|
renderer = MarkdownRenderer()
|
|
return renderer.render(self)
|
|
|
|
def to_json(self, include_content: bool = True) -> Dict[str, Any]:
|
|
"""
|
|
Convert document to JSON.
|
|
|
|
Args:
|
|
include_content: Include full content or just structure
|
|
|
|
Returns:
|
|
JSON-serializable dictionary
|
|
"""
|
|
result = {
|
|
'metadata': self.metadata.to_dict(),
|
|
'sections': list(self.sections.keys()),
|
|
'table_count': len(self.tables),
|
|
'xbrl_fact_count': len(self.xbrl_facts)
|
|
}
|
|
|
|
if include_content:
|
|
result['sections_detail'] = {
|
|
name: {
|
|
'title': section.title,
|
|
'text_length': len(section.text()),
|
|
'table_count': len(section.tables())
|
|
}
|
|
for name, section in self.sections.items()
|
|
}
|
|
|
|
result['tables'] = [
|
|
{
|
|
'type': table.table_type.name,
|
|
'rows': len(table.rows),
|
|
'columns': len(table.headers[0]) if table.headers else 0,
|
|
'caption': table.caption
|
|
}
|
|
for table in self.tables
|
|
]
|
|
|
|
return result
|
|
|
|
def to_dataframe(self) -> 'pd.DataFrame':
|
|
"""
|
|
Convert document tables to pandas DataFrame.
|
|
|
|
Returns a DataFrame with all tables concatenated.
|
|
"""
|
|
import pandas as pd
|
|
|
|
if not self.tables:
|
|
return pd.DataFrame()
|
|
|
|
# Convert each table to DataFrame
|
|
dfs = []
|
|
for i, table in enumerate(self.tables):
|
|
df = table.to_dataframe()
|
|
# Add table index
|
|
df['_table_index'] = i
|
|
df['_table_type'] = table.table_type.name
|
|
if table.caption:
|
|
df['_table_caption'] = table.caption
|
|
dfs.append(df)
|
|
|
|
# Concatenate all tables
|
|
return pd.concat(dfs, ignore_index=True)
|
|
|
|
def chunks(self, chunk_size: int = 512, overlap: int = 128) -> Iterator['DocumentChunk']:
|
|
"""
|
|
Generate document chunks for processing.
|
|
|
|
Args:
|
|
chunk_size: Target chunk size in tokens
|
|
overlap: Overlap between chunks
|
|
|
|
Yields:
|
|
Document chunks
|
|
"""
|
|
from edgar.documents.extractors.chunk_extractor import ChunkExtractor
|
|
extractor = ChunkExtractor(chunk_size=chunk_size, overlap=overlap)
|
|
return extractor.extract(self)
|
|
|
|
def prepare_for_llm(self,
|
|
max_tokens: int = 4000,
|
|
preserve_structure: bool = True,
|
|
focus_sections: Optional[List[str]] = None) -> 'LLMDocument':
|
|
"""
|
|
Prepare document for LLM processing.
|
|
|
|
Args:
|
|
max_tokens: Maximum tokens
|
|
preserve_structure: Preserve document structure
|
|
focus_sections: Sections to focus on
|
|
|
|
Returns:
|
|
LLM-optimized document
|
|
"""
|
|
from edgar.documents.ai.llm_optimizer import LLMOptimizer
|
|
optimizer = LLMOptimizer()
|
|
return optimizer.optimize(
|
|
self,
|
|
max_tokens=max_tokens,
|
|
preserve_structure=preserve_structure,
|
|
focus_sections=focus_sections
|
|
)
|
|
|
|
def extract_key_information(self) -> Dict[str, Any]:
|
|
"""Extract key information from document."""
|
|
return {
|
|
'company': self.metadata.company,
|
|
'form': self.metadata.form,
|
|
'filing_date': self.metadata.filing_date,
|
|
'sections': list(self.sections.keys()),
|
|
'financial_tables': sum(1 for t in self.tables if t.is_financial_table),
|
|
'total_tables': len(self.tables),
|
|
'xbrl_facts': len(self.xbrl_facts),
|
|
'document_length': len(self.text())
|
|
}
|
|
|
|
def _extract_xbrl_facts(self) -> List[XBRLFact]:
|
|
"""Extract XBRL facts from document."""
|
|
facts = []
|
|
|
|
# Find all nodes with XBRL metadata
|
|
xbrl_nodes = self.root.find(
|
|
lambda n: n.get_metadata('ix_tag') is not None
|
|
)
|
|
|
|
for node in xbrl_nodes:
|
|
fact = XBRLFact(
|
|
concept=node.get_metadata('ix_tag'),
|
|
value=node.text(),
|
|
context_ref=node.get_metadata('ix_context'),
|
|
unit_ref=node.get_metadata('ix_unit'),
|
|
decimals=node.get_metadata('ix_decimals'),
|
|
scale=node.get_metadata('ix_scale')
|
|
)
|
|
facts.append(fact)
|
|
|
|
return facts
|
|
|
|
def __len__(self) -> int:
|
|
"""Get number of top-level nodes."""
|
|
return len(self.root.children)
|
|
|
|
def __iter__(self) -> Iterator[Node]:
|
|
"""Iterate over top-level nodes."""
|
|
return iter(self.root.children)
|
|
|
|
def __repr__(self) -> str:
|
|
return self.text()
|
|
|
|
def walk(self) -> Iterator[Node]:
|
|
"""Walk entire document tree."""
|
|
return self.root.walk()
|
|
|
|
def find_nodes(self, predicate) -> List[Node]:
|
|
"""Find all nodes matching predicate."""
|
|
return self.root.find(predicate)
|
|
|
|
def find_first_node(self, predicate) -> Optional[Node]:
|
|
"""Find first node matching predicate."""
|
|
return self.root.find_first(predicate)
|
|
|
|
@property
|
|
def is_empty(self) -> bool:
|
|
"""Check if document is empty."""
|
|
return len(self.root.children) == 0
|
|
|
|
@property
|
|
def has_tables(self) -> bool:
|
|
"""Check if document has tables."""
|
|
return len(self.tables) > 0
|
|
|
|
@property
|
|
def has_xbrl(self) -> bool:
|
|
"""Check if document has XBRL data."""
|
|
return len(self.xbrl_facts) > 0
|
|
|
|
def validate(self) -> List[str]:
|
|
"""
|
|
Validate document structure.
|
|
|
|
Returns list of validation issues.
|
|
"""
|
|
issues = []
|
|
|
|
# Check for empty document
|
|
if self.is_empty:
|
|
issues.append("Document is empty")
|
|
|
|
# Check for sections
|
|
if not self.sections:
|
|
issues.append("No sections detected")
|
|
|
|
# Check for common sections in filings
|
|
if self.metadata.form in ['10-K', '10-Q']:
|
|
expected_sections = ['business', 'risk_factors', 'mda']
|
|
missing = [s for s in expected_sections if s not in self.sections]
|
|
if missing:
|
|
issues.append(f"Missing expected sections: {', '.join(missing)}")
|
|
|
|
# Check for orphaned nodes
|
|
orphaned = self.root.find(lambda n: n.parent is None and n != self.root)
|
|
if orphaned:
|
|
issues.append(f"Found {len(orphaned)} orphaned nodes")
|
|
|
|
return issues
|
|
|
|
|
|
@dataclass
|
|
class DocumentChunk:
|
|
"""Represents a chunk of document for processing."""
|
|
content: str
|
|
start_node: Node
|
|
end_node: Node
|
|
section: Optional[str] = None
|
|
token_count: int = 0
|
|
|
|
def to_dict(self) -> Dict[str, Any]:
|
|
"""Convert chunk to dictionary."""
|
|
return {
|
|
'content': self.content,
|
|
'section': self.section,
|
|
'token_count': self.token_count,
|
|
'start_path': self.start_node.path,
|
|
'end_path': self.end_node.path
|
|
}
|
|
|
|
|
|
@dataclass
|
|
class LLMDocument:
|
|
"""Document optimized for LLM processing."""
|
|
content: str
|
|
metadata: Dict[str, Any]
|
|
token_count: int
|
|
sections: List[str]
|
|
truncated: bool = False
|
|
|
|
def to_prompt(self) -> str:
|
|
"""Convert to LLM prompt."""
|
|
parts = []
|
|
|
|
# Add metadata context
|
|
parts.append(f"Document: {self.metadata.get('form', 'Unknown')}")
|
|
parts.append(f"Company: {self.metadata.get('company', 'Unknown')}")
|
|
parts.append(f"Date: {self.metadata.get('filing_date', 'Unknown')}")
|
|
parts.append("")
|
|
|
|
# Add content
|
|
parts.append(self.content)
|
|
|
|
if self.truncated:
|
|
parts.append("\n[Content truncated due to length]")
|
|
|
|
return '\n'.join(parts) |