Initial commit

kdusek
2025-12-09 12:13:01 +01:00
commit 8e654ed209
13332 changed files with 2695056 additions and 0 deletions


@@ -0,0 +1,15 @@
"""
Content extractors for documents.
"""
from edgar.documents.extractors.text_extractor import TextExtractor
from edgar.documents.extractors.pattern_section_extractor import SectionExtractor
from edgar.documents.extractors.hybrid_section_detector import HybridSectionDetector
from edgar.documents.extractors.toc_section_detector import TOCSectionDetector
__all__ = [
'TextExtractor',
'SectionExtractor',
'HybridSectionDetector',
'TOCSectionDetector'
]
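
A minimal usage sketch tying these extractors together, assuming a parsed `Document` instance is already available (how the document gets parsed is outside this package):

# Sketch only; `document` is assumed to be an already-parsed edgar Document.
from edgar.documents.extractors import TextExtractor, HybridSectionDetector

def extract_filing(document):
    # Plain text suitable for NLP, with tables kept in place.
    text = TextExtractor(clean=True, include_tables=True).extract(document)
    # Section detection with TOC -> heading -> pattern fallback.
    sections = HybridSectionDetector(document, form='10-K').detect_sections()
    for name, section in sections.items():
        print(f"{name}: {section.confidence:.2f} ({section.detection_method})")
    return text, sections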


@@ -0,0 +1,170 @@
"""
Heading-based section detection strategy.
Detects sections by analyzing heading nodes with HeaderInfo metadata.
This strategy provides moderate confidence (0.7-0.9) and serves as a
fallback when TOC-based detection is not available.
"""
import logging
from typing import Dict, Optional
from edgar.documents.document import Document, Section
from edgar.documents.nodes import HeadingNode, SectionNode
from edgar.documents.types import HeaderInfo
logger = logging.getLogger(__name__)
class HeadingSectionDetector:
"""
Heading-based section detection using HeaderInfo.
Analyzes heading nodes that have been annotated with HeaderInfo
during parsing. Detects sections based on:
- Item numbers (Item 1, Item 1A, etc.)
- Heading confidence scores
- Heading hierarchy
Provides moderate confidence (0.7-0.9) detection.
"""
def __init__(
self,
document: Document,
form: Optional[str] = None,
min_confidence: float = 0.5 # Lower threshold, let hybrid detector filter
):
"""
Initialize heading-based detector.
Args:
document: Document to analyze
form: Optional filing type for context ('10-K', '10-Q', '8-K')
min_confidence: Minimum confidence for headings (default 0.5)
"""
self.document = document
self.form = form
self.min_confidence = min_confidence
def detect(self) -> Optional[Dict[str, Section]]:
"""
Detect sections from heading nodes with HeaderInfo.
Returns:
Dictionary of sections if successful, None if no sections found
"""
try:
# Get heading nodes from document
headings = self.document.headings
if not headings:
logger.debug("No headings found in document")
return None
sections = {}
for heading in headings:
# Check if heading has header info
if not hasattr(heading, 'header_info') or not heading.header_info:
continue
header_info = heading.header_info
# Only use headings with sufficient confidence
if header_info.confidence < self.min_confidence:
continue
# Check if it's an item header
if not header_info.is_item:
continue
# Extract section from this heading
section = self._extract_section_from_heading(heading, header_info)
if section:
section.confidence = header_info.confidence
section.detection_method = 'heading'
sections[section.name] = section
if not sections:
logger.debug("No item headers found with sufficient confidence")
return None
logger.info(f"Heading detection found {len(sections)} sections")
return sections
except Exception as e:
logger.warning(f"Heading detection failed: {e}")
return None
def _extract_section_from_heading(
self, heading: HeadingNode, header_info: HeaderInfo
) -> Optional[Section]:
"""
Extract section content from heading node to next heading.
Args:
heading: HeadingNode representing section start
header_info: HeaderInfo with section metadata
Returns:
Section object if successful, None otherwise
"""
try:
# Create section name from item number
if header_info.item_number:
# Normalize: "1A" -> "item_1a", "7" -> "item_7"
section_name = f"item_{header_info.item_number.replace('.', '_').lower()}"
else:
section_name = "unknown"
# Create section node
section_node = SectionNode(section_name=section_name)
# Find next heading at same or higher level to determine section end
current_level = header_info.level
parent = heading.parent
if not parent:
logger.debug(f"Heading {header_info.text} has no parent")
return None
# Find heading position in parent's children
try:
heading_index = parent.children.index(heading)
except ValueError:
logger.debug(f"Could not find heading in parent's children")
return None
# Collect nodes until next section heading
for i in range(heading_index + 1, len(parent.children)):
child = parent.children[i]
# Stop at next heading of same or higher level
if isinstance(child, HeadingNode):
if hasattr(child, 'header_info') and child.header_info:
if child.header_info.level <= current_level:
break
# Add child to section
section_node.add_child(child)
# Parse section name to extract part and item identifiers
part, item = Section.parse_section_name(section_name)
# Create Section object
section = Section(
name=section_name,
title=header_info.text,
node=section_node,
start_offset=0, # Would need actual text position
end_offset=0, # Would need actual text position
confidence=header_info.confidence,
detection_method='heading',
part=part,
item=item
)
return section
except Exception as e:
logger.warning(f"Failed to extract section from heading: {e}")
return None
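
A short sketch of running the heading strategy on its own, assuming the document's headings were annotated with HeaderInfo during parsing:

# Sketch only; assumes `document` carries HeaderInfo-annotated headings.
detector = HeadingSectionDetector(document, form='10-K', min_confidence=0.5)
sections = detector.detect() or {}
for name, section in sections.items():
    print(f"{name}: {section.title} (confidence={section.confidence:.2f})")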


@@ -0,0 +1,489 @@
"""
Hybrid section detection system with multiple fallback strategies.
This module implements a multi-strategy approach to section detection:
1. TOC-based (primary): High confidence, uses Table of Contents structure
2. Heading-based (fallback): Moderate confidence, uses multi-strategy heading detection
3. Pattern-based (last resort): Lower confidence, uses regex pattern matching
"""
import logging
from typing import Dict, Optional, List
from dataclasses import dataclass
from functools import lru_cache
from edgar.documents.document import Document, Section
from edgar.documents.nodes import SectionNode, HeadingNode
from edgar.documents.extractors.toc_section_detector import TOCSectionDetector
from edgar.documents.extractors.pattern_section_extractor import SectionExtractor
from edgar.documents.config import DetectionThresholds
logger = logging.getLogger(__name__)
class HybridSectionDetector:
"""
Multi-strategy section detector with fallback.
Tries strategies in order of reliability:
1. TOC-based (0.95 confidence) - Most reliable
2. Multi-strategy heading detection (0.7-0.9 confidence) - Fallback
3. Pattern matching (0.6 confidence) - Last resort
Example:
>>> detector = HybridSectionDetector(document, '10-K')
>>> sections = detector.detect_sections()
>>> for name, section in sections.items():
... print(f"{name}: {section.confidence:.2f} ({section.detection_method})")
"""
def __init__(self, document: Document, form: str, thresholds: Optional[DetectionThresholds] = None):
"""
Initialize hybrid detector.
Args:
document: Document to extract sections from
form: Filing type ('10-K', '10-Q', '8-K')
thresholds: Detection thresholds configuration
"""
self.document = document
self.form = form
self.thresholds = thresholds or DetectionThresholds()
# Initialize detection strategies
self.toc_detector = TOCSectionDetector(document)
self.pattern_extractor = SectionExtractor(form)
def detect_sections(self) -> Dict[str, Section]:
"""
Detect sections using hybrid approach with fallback and validation.
Returns:
Dictionary mapping section names to Section objects with confidence scores
"""
# Strategy 1: TOC-based (most reliable)
logger.debug("Trying TOC-based detection...")
sections = self.toc_detector.detect()
if sections:
logger.info(f"TOC detection successful: {len(sections)} sections found")
return self._validate_pipeline(sections, enable_cross_validation=True)
# Strategy 2: Heading-based (fallback)
logger.debug("TOC detection failed, trying heading detection...")
sections = self._try_heading_detection()
if sections:
logger.info(f"Heading detection successful: {len(sections)} sections found")
return self._validate_pipeline(sections, enable_cross_validation=False)
# Strategy 3: Pattern-based (last resort)
logger.debug("Heading detection failed, trying pattern matching...")
sections = self._try_pattern_detection()
if sections:
logger.info(f"Pattern detection successful: {len(sections)} sections found")
return self._validate_pipeline(sections, enable_cross_validation=False)
logger.warning("All detection strategies failed, no sections found")
return {}
def _validate_pipeline(
self,
sections: Dict[str, Section],
enable_cross_validation: bool = False
) -> Dict[str, Section]:
"""
Apply validation pipeline to sections.
Centralizes validation logic to eliminate duplication.
Args:
sections: Sections to validate
enable_cross_validation: Whether to enable cross-validation (expensive)
Returns:
Validated sections
"""
if not sections:
return sections
# Cross-validate (optional, expensive)
if enable_cross_validation and self.thresholds.enable_cross_validation:
sections = self._cross_validate(sections)
# Validate boundaries
sections = self._validate_boundaries(sections)
# Deduplicate
sections = self._deduplicate(sections)
# Filter by confidence
sections = self._filter_by_confidence(sections)
return sections
def _try_heading_detection(self) -> Optional[Dict[str, Section]]:
"""
Try multi-strategy heading detection.
Returns:
Dictionary of sections if successful, None if failed
"""
try:
# Get heading nodes from document
headings = self.document.headings
if not headings:
return None
sections = {}
for heading in headings:
# Check if heading has header info
if not hasattr(heading, 'header_info') or not heading.header_info:
continue
header_info = heading.header_info
# Only use headings with sufficient confidence
if header_info.confidence < 0.7:
continue
# Check if it's an item header
if not header_info.is_item:
continue
# Extract section from this heading to next
section = self._extract_section_from_heading(heading, header_info)
if section:
section.confidence = header_info.confidence
section.detection_method = 'heading'
sections[section.name] = section
return sections if sections else None
except Exception as e:
logger.warning(f"Heading detection failed: {e}")
return None
def _try_pattern_detection(self) -> Optional[Dict[str, Section]]:
"""
Try pattern-based extraction.
Returns:
Dictionary of sections if successful, None if failed
"""
try:
# Use pattern extractor
sections = self.pattern_extractor.extract(self.document)
# Mark with pattern detection confidence
for section in sections.values():
section.confidence = 0.6 # Pattern-based = lower confidence
section.detection_method = 'pattern'
return sections if sections else None
except Exception as e:
logger.warning(f"Pattern detection failed: {e}")
return None
def _extract_section_from_heading(self, heading: HeadingNode, header_info) -> Optional[Section]:
"""
Extract section content from heading node to next heading.
Args:
heading: HeadingNode representing section start
header_info: HeaderInfo with section metadata
Returns:
Section object if successful, None otherwise
"""
try:
# Create section name from item number
section_name = f"item_{header_info.item_number.replace('.', '_')}" if header_info.item_number else "unknown"
# Create section node
section_node = SectionNode(section_name=section_name)
# Find next heading at same or higher level to determine section end
current_level = header_info.level
parent = heading.parent
if not parent:
return None
# Find heading position in parent's children
try:
heading_index = parent.children.index(heading)
except ValueError:
return None
# Collect nodes until next section heading
for i in range(heading_index + 1, len(parent.children)):
child = parent.children[i]
# Stop at next heading of same or higher level
if isinstance(child, HeadingNode):
if hasattr(child, 'header_info') and child.header_info:
if child.header_info.level <= current_level:
break
# Add child to section
section_node.add_child(child)
# Create Section object
section = Section(
name=section_name,
title=header_info.text,
node=section_node,
start_offset=0, # Would need actual text position
end_offset=0, # Would need actual text position
confidence=header_info.confidence,
detection_method='heading'
)
return section
except Exception as e:
logger.warning(f"Failed to extract section from heading: {e}")
return None
def _cross_validate(self, sections: Dict[str, Section]) -> Dict[str, Section]:
"""
Cross-validate sections using multiple detection methods.
Boosts confidence if multiple methods detect the same section.
Reduces confidence if methods disagree.
Args:
sections: Sections detected by primary method
Returns:
Validated sections with adjusted confidence scores
"""
validated = {}
# Get pattern-based sections once for comparison (not per section)
try:
pattern_sections = self.pattern_extractor.extract(self.document)
except Exception as e:
logger.debug(f"Pattern extraction failed for cross-validation: {e}")
pattern_sections = {}
for name, section in sections.items():
# Try alternative detection (pattern matching for validation)
try:
# Check if this section is also found by pattern matching
found_in_patterns = False
for pattern_name, pattern_section in pattern_sections.items():
# Check for name similarity or overlap
if self._sections_similar(section, pattern_section):
found_in_patterns = True
break
# Boost confidence if methods agree
if found_in_patterns:
section.confidence = min(section.confidence * self.thresholds.cross_validation_boost, 1.0)
section.validated = True
logger.debug(f"Section {name} validated by multiple methods, confidence boosted to {section.confidence:.2f}")
else:
# Slight reduction if not validated
section.confidence *= self.thresholds.disagreement_penalty
section.validated = False
except Exception as e:
logger.debug(f"Cross-validation failed for {name}: {e}")
# Keep original confidence if validation fails
pass
validated[name] = section
return validated
def _validate_boundaries(self, sections: Dict[str, Section]) -> Dict[str, Section]:
"""
Validate section boundaries for overlaps, gaps, and ordering.
Args:
sections: Sections to validate
Returns:
Sections with validated boundaries
"""
if not sections:
return sections
# Sort by start offset
sorted_sections = sorted(sections.items(), key=lambda x: x[1].start_offset)
validated = {}
prev_section = None
for name, section in sorted_sections:
# Check for overlap with previous section
if prev_section and section.start_offset > 0:
if section.start_offset < prev_section[1].end_offset:
# Overlap detected - adjust boundary at midpoint
gap_mid = (prev_section[1].end_offset + section.start_offset) // 2
prev_section[1].end_offset = gap_mid
section.start_offset = gap_mid
# Reduce confidence due to boundary adjustment
section.confidence *= self.thresholds.boundary_overlap_penalty
prev_section[1].confidence *= self.thresholds.boundary_overlap_penalty
logger.debug(f"Adjusted boundary between {prev_section[0]} and {name}")
# Check for an unusually large gap before this section (fixed threshold, not document-relative)
elif prev_section[1].end_offset > 0:
gap_size = section.start_offset - prev_section[1].end_offset
if gap_size > 100000: # Arbitrary large gap threshold
# Large gap - might indicate missing section
section.confidence *= 0.9
logger.debug(f"Large gap detected before {name}")
validated[name] = section
prev_section = (name, section)
return validated
def _deduplicate(self, sections: Dict[str, Section]) -> Dict[str, Section]:
"""
Remove duplicate sections detected by multiple methods.
Keeps the detection with highest confidence.
Args:
sections: Sections possibly containing duplicates
Returns:
Deduplicated sections
"""
if len(sections) <= 1:
return sections
# Group similar sections
groups = self._group_similar_sections(sections)
deduplicated = {}
for group in groups:
if len(group) == 1:
# No duplicates
deduplicated[group[0].name] = group[0]
else:
# Keep section with highest confidence
best = max(group, key=lambda s: s.confidence)
# Merge detection methods
methods = set(s.detection_method for s in group)
if len(methods) > 1:
best.detection_method = ','.join(sorted(methods))
# Boost confidence for multi-method detection
best.confidence = min(best.confidence * 1.15, 1.0)
best.validated = True
logger.debug(f"Merged duplicate sections for {best.name}, methods: {best.detection_method}")
deduplicated[best.name] = best
return deduplicated
def _group_similar_sections(self, sections: Dict[str, Section]) -> List[List[Section]]:
"""
Group sections that appear to be duplicates.
Args:
sections: Sections to group
Returns:
List of section groups
"""
groups = []
used = set()
for name1, section1 in sections.items():
if name1 in used:
continue
group = [section1]
used.add(name1)
for name2, section2 in sections.items():
if name2 in used:
continue
# Check if sections are similar
if self._sections_similar(section1, section2):
group.append(section2)
used.add(name2)
groups.append(group)
return groups
def _sections_similar(self, section1: Section, section2: Section) -> bool:
"""
Check if two sections are similar (likely duplicates).
Args:
section1: First section
section2: Second section
Returns:
True if sections are similar
"""
# Normalize names for comparison
name1 = section1.name.lower().replace('_', ' ').strip()
name2 = section2.name.lower().replace('_', ' ').strip()
# Check exact match after normalization
if name1 == name2:
return True
# Check title similarity (exact match)
title1 = section1.title.lower().strip()
title2 = section2.title.lower().strip()
if title1 == title2:
return True
# Check for position overlap (if positions are set)
if section1.start_offset > 0 and section2.start_offset > 0:
# Calculate overlap
overlap_start = max(section1.start_offset, section2.start_offset)
overlap_end = min(section1.end_offset, section2.end_offset)
if overlap_end > overlap_start:
# There is overlap
overlap_size = overlap_end - overlap_start
min_size = min(
section1.end_offset - section1.start_offset,
section2.end_offset - section2.start_offset
)
# If overlap is >50% of smaller section, consider similar
if min_size > 0 and overlap_size / min_size > 0.5:
return True
return False
def _filter_by_confidence(self, sections: Dict[str, Section]) -> Dict[str, Section]:
"""
Filter sections by minimum confidence threshold.
Args:
sections: Sections to filter
Returns:
Filtered sections meeting minimum confidence
"""
# Check for filing-specific thresholds
min_conf = self.thresholds.min_confidence
if self.form in self.thresholds.thresholds_by_form:
filing_thresholds = self.thresholds.thresholds_by_form[self.form]
min_conf = filing_thresholds.get('min_confidence', min_conf)
filtered = {}
for name, section in sections.items():
if section.confidence >= min_conf:
filtered[name] = section
else:
logger.debug(f"Filtered out section {name} with confidence {section.confidence:.2f} < {min_conf:.2f}")
return filtered
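
A sketch of tuning the validation pipeline. It assumes DetectionThresholds is a dataclass accepting the fields referenced above (min_confidence, enable_cross_validation, thresholds_by_form, ...) as keyword arguments:

# Sketch only; the DetectionThresholds constructor signature is assumed, not confirmed here.
from edgar.documents.config import DetectionThresholds

thresholds = DetectionThresholds(
    min_confidence=0.6,
    enable_cross_validation=True,
    thresholds_by_form={'8-K': {'min_confidence': 0.5}},
)
detector = HybridSectionDetector(document, form='8-K', thresholds=thresholds)
sections = detector.detect_sections()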


@@ -0,0 +1,405 @@
"""
Section extraction from documents.
"""
import re
from typing import Dict, List, Optional, Tuple
from edgar.documents.document import Document, Section
from edgar.documents.nodes import Node, HeadingNode, SectionNode
class SectionExtractor:
"""
Extracts logical sections from documents.
Identifies document sections like:
- Business Overview (Item 1)
- Risk Factors (Item 1A)
- MD&A (Item 7)
- Financial Statements (Item 8)
"""
# Common section patterns for different filing types
SECTION_PATTERNS = {
'10-K': {
'business': [
(r'^(Item|ITEM)\s+1\.?\s*Business', 'Item 1 - Business'),
(r'^Business\s*$', 'Business'),
(r'^Business Overview', 'Business Overview'),
(r'^Our Business', 'Our Business'),
(r'^Company Overview', 'Company Overview')
],
'risk_factors': [
(r'^(Item|ITEM)\s+1A\.?\s*Risk\s+Factors', 'Item 1A - Risk Factors'),
(r'^Risk\s+Factors', 'Risk Factors'),
(r'^Factors\s+That\s+May\s+Affect', 'Risk Factors')
],
'properties': [
(r'^(Item|ITEM)\s+2\.?\s*Properties', 'Item 2 - Properties'),
(r'^Properties', 'Properties'),
(r'^Real\s+Estate', 'Real Estate')
],
'legal_proceedings': [
(r'^(Item|ITEM)\s+3\.?\s*Legal\s+Proceedings', 'Item 3 - Legal Proceedings'),
(r'^Legal\s+Proceedings', 'Legal Proceedings'),
(r'^Litigation', 'Litigation')
],
'market_risk': [
(r'^(Item|ITEM)\s+7A\.?\s*Quantitative.*Disclosures', 'Item 7A - Market Risk'),
(r'^Market\s+Risk', 'Market Risk'),
(r'^Quantitative.*Qualitative.*Market\s+Risk', 'Market Risk')
],
'mda': [
(r'^(Item|ITEM)\s+7\.?\s*Management.*Discussion', 'Item 7 - MD&A'),
(r'^Management.*Discussion.*Analysis', 'MD&A'),
(r'^MD&A', 'MD&A')
],
'financial_statements': [
(r'^(Item|ITEM)\s+8\.?\s*Financial\s+Statements', 'Item 8 - Financial Statements'),
(r'^Financial\s+Statements', 'Financial Statements'),
(r'^Consolidated\s+Financial\s+Statements', 'Consolidated Financial Statements')
],
'controls_procedures': [
(r'^(Item|ITEM)\s+9A\.?\s*Controls.*Procedures', 'Item 9A - Controls and Procedures'),
(r'^Controls.*Procedures', 'Controls and Procedures'),
(r'^Internal\s+Control', 'Internal Controls')
]
},
'10-Q': {
'financial_statements': [
(r'^(Item|ITEM)\s+1\.?\s*Financial\s+Statements', 'Item 1 - Financial Statements'),
(r'^Financial\s+Statements', 'Financial Statements'),
(r'^Condensed.*Financial\s+Statements', 'Condensed Financial Statements')
],
'mda': [
(r'^(Item|ITEM)\s+2\.?\s*Management.*Discussion', 'Item 2 - MD&A'),
(r'^Management.*Discussion.*Analysis', 'MD&A')
],
'market_risk': [
(r'^(Item|ITEM)\s+3\.?\s*Quantitative.*Disclosures', 'Item 3 - Market Risk'),
(r'^Market\s+Risk', 'Market Risk')
],
'controls_procedures': [
(r'^(Item|ITEM)\s+4\.?\s*Controls.*Procedures', 'Item 4 - Controls and Procedures'),
(r'^Controls.*Procedures', 'Controls and Procedures')
],
'legal_proceedings': [
(r'^(Item|ITEM)\s+1\.?\s*Legal\s+Proceedings', 'Item 1 - Legal Proceedings'),
(r'^Legal\s+Proceedings', 'Legal Proceedings')
],
'risk_factors': [
(r'^(Item|ITEM)\s+1A\.?\s*Risk\s+Factors', 'Item 1A - Risk Factors'),
(r'^Risk\s+Factors', 'Risk Factors')
]
},
'8-K': {
'item_101': [
(r'^(Item|ITEM)\s+1\.01', 'Item 1.01 - Entry into Material Agreement'),
(r'^Entry.*Material.*Agreement', 'Material Agreement')
],
'item_201': [
(r'^(Item|ITEM)\s+2\.01', 'Item 2.01 - Completion of Acquisition'),
(r'^Completion.*Acquisition', 'Acquisition')
],
'item_202': [
(r'^(Item|ITEM)\s+2\.02', 'Item 2.02 - Results of Operations'),
(r'^Results.*Operations', 'Results of Operations')
],
'item_503': [
(r'^(Item|ITEM)\s+5\.03', 'Item 5.03 - Amendments to Articles of Incorporation or Bylaws'),
(r'^Amendments.*Articles', 'Charter Amendments')
],
'item_801': [
(r'^(Item|ITEM)\s+8\.01', 'Item 8.01 - Other Events'),
(r'^Other\s+Events', 'Other Events')
],
'item_901': [
(r'^(Item|ITEM)\s+9\.01', 'Item 9.01 - Financial Statements and Exhibits'),
(r'^Financial.*Exhibits', 'Financial Statements and Exhibits')
]
}
}
def __init__(self, form: Optional[str] = None):
"""
Initialize section extractor.
Args:
form: Type of filing (10-K, 10-Q, 8-K, etc.)
"""
self.form = form
def extract(self, document: Document) -> Dict[str, Section]:
"""
Extract sections from document.
Args:
document: Document to extract sections from
Returns:
Dictionary mapping section names to Section objects
"""
# Get filing type from instance, metadata, or document config
# NOTE: We no longer auto-detect filing type (expensive and unnecessary)
form = None
if self.form:
form = self.form
elif document.metadata and document.metadata.form:
form = document.metadata.form
elif hasattr(document, '_config') and document._config and document._config.form:
form = document._config.form
# Only extract sections for forms that have standard sections
if not form or form not in ['10-K', '10-Q', '8-K']:
return {} # No filing type or unsupported form = no section detection
# Get patterns for filing type
patterns = self.SECTION_PATTERNS.get(form, {})
if not patterns:
return {} # No patterns defined for this form type
# Find section headers
headers = self._find_section_headers(document)
# For 10-Q, detect Part I/Part II boundaries
part_context = None
if form == '10-Q':
part_context = self._detect_10q_parts(headers)
# Match headers to sections
sections = self._match_sections(headers, patterns, document, part_context)
# Create section objects
return self._create_sections(sections, document)
# NOTE: _detect_form() removed - form type should be known from context
# Filing metadata should be set by the caller (Filing class, TenK/TenQ, etc.)
# NOTE: _infer_form_from_headers() kept for backward compatibility but not used
# in normal flow anymore. Form type should always be provided explicitly.
def _infer_form_from_headers(self, document: Document) -> str:
"""
Infer filing type from section headers.
NOTE: This method is kept for backward compatibility but should not be used
in the normal flow. Form type should be explicitly provided via config or metadata.
"""
headers = document.headings
header_texts = [h.text().upper() for h in headers if h.text()]
# Check for 10-K specific sections
has_10k_sections = any(
'ITEM 1.' in text or 'ITEM 1A.' in text or 'ITEM 7.' in text or 'ITEM 8.' in text
for text in header_texts
)
# Check for 10-Q specific sections
has_10q_sections = any(
('ITEM 1.' in text and 'FINANCIAL STATEMENTS' in text) or
('ITEM 2.' in text and 'MANAGEMENT' in text) or
'ITEM 3.' in text or 'ITEM 4.' in text
for text in header_texts
)
# Check for 8-K specific sections
has_8k_sections = any(
re.search(r'ITEM \d\.\d{2}', text) for text in header_texts
)
if has_10k_sections and not has_10q_sections:
return '10-K'
elif has_10q_sections:
return '10-Q'
elif has_8k_sections:
return '8-K'
else:
return 'UNKNOWN'
def _get_general_patterns(self) -> Dict[str, List[Tuple[str, str]]]:
"""Get general section patterns."""
return {
'business': [
(r'^Business', 'Business'),
(r'^Overview', 'Overview'),
(r'^Company', 'Company')
],
'financial': [
(r'^Financial\s+Statements', 'Financial Statements'),
(r'^Consolidated.*Statements', 'Consolidated Statements')
],
'notes': [
(r'^Notes\s+to.*Financial\s+Statements', 'Notes to Financial Statements'),
(r'^Notes\s+to.*Statements', 'Notes')
]
}
def _find_section_headers(self, document: Document) -> List[Tuple[Node, str, int]]:
"""Find all potential section headers."""
headers = []
# Find all heading nodes
heading_nodes = document.root.find(lambda n: isinstance(n, HeadingNode))
for node in heading_nodes:
text = node.text()
if text:
# Get position in document
position = self._get_node_position(node, document)
headers.append((node, text, position))
# Also check for section nodes
section_nodes = document.root.find(lambda n: isinstance(n, SectionNode))
for node in section_nodes:
# Get first heading in section
first_heading = node.find_first(lambda n: isinstance(n, HeadingNode))
if first_heading:
text = first_heading.text()
if text:
position = self._get_node_position(node, document)
headers.append((node, text, position))
# Sort by position
headers.sort(key=lambda x: x[2])
return headers
def _get_node_position(self, node: Node, document: Document) -> int:
"""Get position of node in document."""
position = 0
for n in document.root.walk():
if n == node:
return position
position += 1
return position
def _detect_10q_parts(self, headers: List[Tuple[Node, str, int]]) -> Dict[int, str]:
"""
Detect Part I and Part II boundaries in 10-Q filings.
Args:
headers: List of (node, text, position) tuples
Returns:
Dict mapping header index to part name ("Part I" or "Part II")
"""
part_context = {}
current_part = None
part_i_pattern = re.compile(r'^\s*PART\s+I\b', re.IGNORECASE)
part_ii_pattern = re.compile(r'^\s*PART\s+II\b', re.IGNORECASE)
for i, (node, text, position) in enumerate(headers):
text_stripped = text.strip()
# Check if this is a Part I or Part II header
if part_i_pattern.match(text_stripped):
current_part = "Part I"
part_context[i] = current_part
elif part_ii_pattern.match(text_stripped):
current_part = "Part II"
part_context[i] = current_part
elif current_part:
# Headers after a Part declaration belong to that part
part_context[i] = current_part
return part_context
def _match_sections(self,
headers: List[Tuple[Node, str, int]],
patterns: Dict[str, List[Tuple[str, str]]],
document: Document,
part_context: Optional[Dict[int, str]] = None) -> Dict[str, Tuple[Node, str, int, int]]:
"""Match headers to section patterns."""
matched_sections = {}
used_headers = set()
# Try to match each pattern
for section_name, section_patterns in patterns.items():
for pattern, title in section_patterns:
for i, (node, text, position) in enumerate(headers):
if i in used_headers:
continue
# Try to match pattern
if re.match(pattern, text.strip(), re.IGNORECASE):
# Find end position (next section or end of document)
end_position = self._find_section_end(i, headers, document)
# For 10-Q, prefix with Part I or Part II
final_title = title
if part_context and i in part_context:
final_title = f"{part_context[i]} - {title}"
# Use final_title as key to avoid conflicts
section_key = final_title if part_context and i in part_context else section_name
matched_sections[section_key] = (node, final_title, position, end_position)
used_headers.add(i)
break
# If we found a match, move to next section
if section_name in matched_sections:
break
return matched_sections
def _find_section_end(self,
section_index: int,
headers: List[Tuple[Node, str, int]],
document: Document) -> int:
"""Find where section ends."""
# Next section starts where next header at same or higher level begins
if section_index + 1 < len(headers):
current_node = headers[section_index][0]
current_level = current_node.level if isinstance(current_node, HeadingNode) else 1
for i in range(section_index + 1, len(headers)):
next_node = headers[i][0]
next_level = next_node.level if isinstance(next_node, HeadingNode) else 1
# If next header is at same or higher level, that's our end
if next_level <= current_level:
return headers[i][2]
# Otherwise, section goes to end of document
return sum(1 for _ in document.root.walk())
def _create_sections(self,
matched_sections: Dict[str, Tuple[Node, str, int, int]],
document: Document) -> Dict[str, Section]:
"""Create Section objects from matches."""
sections = {}
for section_name, (node, title, start_pos, end_pos) in matched_sections.items():
# Create section node containing all content in range
section_node = SectionNode(section_name=section_name)
# Find all nodes in position range
position = 0
for n in document.root.walk():
if start_pos <= position < end_pos:
# Clone node and add to section
# (In real implementation, would properly handle node hierarchy)
section_node.add_child(n)
position += 1
# Parse section name to extract part and item identifiers
part, item = Section.parse_section_name(section_name)
# Create Section object
section = Section(
name=section_name,
title=title,
node=section_node,
start_offset=start_pos,
end_offset=end_pos,
confidence=0.7, # Pattern-based detection = moderate confidence
detection_method='pattern', # Method: regex pattern matching
part=part,
item=item
)
sections[section_name] = section
return sections
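
A sketch of the pattern strategy used directly; the form can come from the constructor, document metadata, or parser config, so passing it explicitly as below is the simplest path:

# Sketch only; pattern matching yields moderate-confidence (0.7) sections.
extractor = SectionExtractor(form='10-Q')
sections = extractor.extract(document)
for name, section in sections.items():
    print(name, '->', section.title)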


@@ -0,0 +1,348 @@
"""
Text extraction from documents with various options.
"""
import re
from typing import List, Optional, Set
from edgar.documents.document import Document
from edgar.documents.nodes import Node, TextNode, HeadingNode, ParagraphNode
from edgar.documents.table_nodes import TableNode
from edgar.documents.types import NodeType
class TextExtractor:
"""
Extracts text from documents with configurable options.
Supports:
- Clean text extraction for AI/NLP
- Table inclusion/exclusion
- Metadata annotations
- Length limiting
- Smart whitespace handling
"""
def __init__(self,
clean: bool = True,
include_tables: bool = True,
include_metadata: bool = False,
include_links: bool = False,
max_length: Optional[int] = None,
preserve_structure: bool = False):
"""
Initialize text extractor.
Args:
clean: Clean and normalize text
include_tables: Include table content
include_metadata: Include metadata annotations
include_links: Include link URLs
max_length: Maximum text length
preserve_structure: Preserve document structure with markers
"""
self.clean = clean
self.include_tables = include_tables
self.include_metadata = include_metadata
self.include_links = include_links
self.max_length = max_length
self.preserve_structure = preserve_structure
# Track what we've extracted to avoid duplicates
self._extracted_ids: Set[str] = set()
def extract(self, document: Document) -> str:
"""
Extract text from document.
Args:
document: Document to extract from
Returns:
Extracted text
"""
parts = []
self._extracted_ids.clear()
# Extract from root
self._extract_from_node(document.root, parts, depth=0)
# Join parts
if self.preserve_structure:
text = '\n'.join(parts)
else:
text = '\n\n'.join(filter(None, parts))
# Apply minimal global cleaning - tables are already handled appropriately per node
if self.clean:
text = self._clean_document_text(text)
# Limit length if requested
if self.max_length and len(text) > self.max_length:
text = self._truncate_text(text, self.max_length)
return text
def extract_from_node(self, node: Node) -> str:
"""Extract text from a specific node."""
parts = []
self._extracted_ids.clear()
self._extract_from_node(node, parts, depth=0)
text = '\n\n'.join(filter(None, parts))
if self.clean:
text = self._clean_document_text(text)
return text
def _extract_from_node(self, node: Node, parts: List[str], depth: int):
"""Recursively extract text from node - render each node type appropriately."""
# Skip if already extracted (handles shared nodes)
if node.id in self._extracted_ids:
return
self._extracted_ids.add(node.id)
# Handle based on node type - like old parser's block.get_text()
if isinstance(node, TableNode):
if self.include_tables:
# Tables render themselves - preserve their formatting
self._extract_table(node, parts)
elif isinstance(node, HeadingNode):
# Headings get cleaned text
self._extract_heading(node, parts, depth)
elif isinstance(node, TextNode):
# Text nodes get cleaned if cleaning is enabled
text = node.text()
if text:
if self.clean:
text = self._clean_text_content(text) # Clean non-table text
if self.include_metadata and node.metadata:
text = self._annotate_with_metadata(text, node.metadata)
parts.append(text)
elif isinstance(node, ParagraphNode):
# Extract paragraph as unified text to maintain flow of inline elements
text = node.text()
if text:
if self.clean:
text = self._clean_text_content(text)
if self.include_metadata and node.metadata:
text = self._annotate_with_metadata(text, node.metadata)
parts.append(text)
# Don't process children since we already got the paragraph text
return
else:
# Check if this looks like a bullet point container that should flow together
if self._is_bullet_point_container(node):
# Extract text from bullet point children and join with spaces (not newlines)
bullet_parts = []
for child in node.children:
child_text = child.text() if hasattr(child, 'text') else ""
if child_text and child_text.strip():
bullet_parts.append(child_text.strip())
if bullet_parts:
# Join with spaces for bullet points
text = ' '.join(bullet_parts)
if self.clean:
text = self._clean_text_content(text)
if self.include_metadata and node.metadata:
text = self._annotate_with_metadata(text, node.metadata)
parts.append(text)
# Don't process children since we already got the unified text
return
# For other nodes, extract text content and clean if appropriate
if hasattr(node, 'content') and isinstance(node.content, str):
text = node.content
if text and text.strip():
if self.clean:
text = self._clean_text_content(text) # Clean non-table text
if self.include_metadata and node.metadata:
text = self._annotate_with_metadata(text, node.metadata)
parts.append(text)
# Process children
for child in node.children:
self._extract_from_node(child, parts, depth + 1)
def _extract_heading(self, node: HeadingNode, parts: List[str], depth: int):
"""Extract heading with optional structure markers."""
text = node.text()
if not text:
return
if self.preserve_structure:
# Add structure markers
marker = '#' * node.level
text = f"{marker} {text}"
if self.include_metadata and node.metadata:
text = self._annotate_with_metadata(text, node.metadata)
parts.append(text)
def _extract_table(self, table: TableNode, parts: List[str]):
"""Extract table content - preserve original formatting like old parser."""
if self.preserve_structure:
parts.append("[TABLE START]")
# Add table caption if present
if table.caption:
caption_text = table.caption
if self.clean:
caption_text = self._clean_text_content(caption_text) # Clean caption but not table content
if self.preserve_structure:
parts.append(f"Caption: {caption_text}")
else:
parts.append(caption_text)
# Extract table text - PRESERVE FORMATTING (like old parser's TableBlock.get_text())
table_text = table.text()
if table_text:
# Tables render their own formatting - don't apply text cleaning to preserve alignment
parts.append(table_text) # Keep original spacing and alignment
if self.preserve_structure:
parts.append("[TABLE END]")
def _annotate_with_metadata(self, text: str, metadata: dict) -> str:
"""Add metadata annotations to text."""
annotations = []
# Add XBRL annotations
if 'ix_tag' in metadata:
annotations.append(f"[XBRL: {metadata['ix_tag']}]")
# Add section annotations
if 'section_name' in metadata:
annotations.append(f"[Section: {metadata['section_name']}]")
# Add semantic type
if 'semantic_type' in metadata:
annotations.append(f"[Type: {metadata['semantic_type']}]")
if annotations:
return f"{' '.join(annotations)} {text}"
return text
def _clean_text_content(self, text: str) -> str:
"""Clean regular text content (not tables) - like old parser text cleaning."""
if not text:
return text
# Replace multiple spaces with single space for regular text
text = re.sub(r' {2,}', ' ', text)
# Clean up space around newlines
text = re.sub(r' *\n *', '\n', text)
# Remove leading/trailing whitespace from lines
lines = text.split('\n')
lines = [line.strip() for line in lines]
text = '\n'.join(lines)
# Normalize quotes and dashes
text = self._normalize_punctuation(text)
return text
def _is_bullet_point_container(self, node) -> bool:
"""Check if a container node represents a bullet point that should flow as one line."""
from edgar.documents.nodes import ContainerNode
if not isinstance(node, ContainerNode):
return False
# Must have at least 2 children (bullet + content)
if len(node.children) < 2:
return False
# Get the text of all children to check for bullet patterns
all_text = node.text()
if not all_text:
return False
# Check if starts with common bullet characters
bullet_chars = ['\u2022', '\u25e6', '\u25aa', '\u2023', '\u25cf', '\u00b7', '-', '*']  # common bullet glyphs plus dash/asterisk
starts_with_bullet = any(all_text.strip().startswith(char) for char in bullet_chars)
if not starts_with_bullet:
return False
# Check if container has flex display (common for bullet point layouts)
if hasattr(node, 'style') and node.style and hasattr(node.style, 'display'):
if node.style.display == 'flex':
return True
# Check if it has bullet-like structure: short first child + longer content
if len(node.children) >= 2:
first_child_text = node.children[0].text() if hasattr(node.children[0], 'text') else ""
second_child_text = node.children[1].text() if hasattr(node.children[1], 'text') else ""
# First child is very short (likely bullet), second is longer (content)
if len(first_child_text.strip()) <= 3 and len(second_child_text.strip()) > 10:
return True
return False
def _clean_document_text(self, text: str) -> str:
"""Apply minimal document-level cleaning that preserves table formatting."""
if not text:
return text
# Only apply global formatting that doesn't affect table alignment:
# Replace excessive newlines (4+ consecutive) with triple newline
text = re.sub(r'\n{4,}', '\n\n\n', text)
# Remove empty lines at start/end only
text = text.strip()
return text
def _normalize_punctuation(self, text: str) -> str:
"""Normalize punctuation for cleaner text."""
# Normalize quotes
text = text.replace('"', '"').replace('"', '"')
text = text.replace(''', "'").replace(''', "'")
# Normalize dashes
text = text.replace('', ' - ') # em dash
text = text.replace('', ' - ') # en dash
# Fix spacing around punctuation
text = re.sub(r'\s+([.,;!?])', r'\1', text)
text = re.sub(r'([.,;!?])\s+', r'\1 ', text)  # collapse whitespace after punctuation without splitting decimals like 1.5
# Remove extra spaces
text = re.sub(r' {2,}', ' ', text)
return text.strip()
def _truncate_text(self, text: str, max_length: int) -> str:
"""Truncate text intelligently."""
if len(text) <= max_length:
return text
# Try to truncate at sentence boundary
truncated = text[:max_length]
last_period = truncated.rfind('.')
last_newline = truncated.rfind('\n')
# Choose the better truncation point
truncate_at = max(last_period, last_newline)
if truncate_at > max_length * 0.8: # If we found a good boundary
return text[:truncate_at + 1].strip()
# Otherwise truncate at word boundary
last_space = truncated.rfind(' ')
if last_space > max_length * 0.9:
return text[:last_space].strip() + '...'
# Last resort: hard truncate
return text[:max_length - 3].strip() + '...'
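
A sketch of the extraction options, assuming a parsed `Document` (the values shown are illustrative, not defaults):

# Sketch only.
extractor = TextExtractor(
    clean=True,
    include_tables=True,       # keep table text with its original alignment
    include_metadata=True,     # prefix text with [XBRL]/[Section]/[Type] annotations
    preserve_structure=True,   # emit '#' heading markers and [TABLE START]/[TABLE END]
    max_length=50_000,         # truncate at a sentence or word boundary
)
text = extractor.extract(document)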


@@ -0,0 +1,178 @@
"""
TOC-based section detection strategy.
Detects sections using Table of Contents structure. Provides highest
confidence (0.95) and includes full text extraction capabilities.
This detector wraps SECSectionExtractor which has proven implementations of:
- Multi-column TOC support (checks all preceding table cells)
- Nested anchor handling (traverses up to find content container)
- Full section text extraction
"""
import logging
from typing import Dict, Optional
from edgar.documents.document import Document, Section
from edgar.documents.nodes import SectionNode
from edgar.documents.extractors.toc_section_extractor import SECSectionExtractor
logger = logging.getLogger(__name__)
class TOCSectionDetector:
"""
TOC-based section detection strategy.
Uses Table of Contents structure to identify section boundaries and
extract full section content. Provides high confidence (0.95) detection.
This implementation wraps the proven SECSectionExtractor which includes:
- Multi-column TOC support for edge cases like Morgan Stanley
- Nested anchor handling for sections with no sibling content
- Complete text extraction with proper boundary detection
"""
def __init__(self, document: Document):
"""
Initialize TOC-based detector.
Args:
document: Document to analyze (must have metadata.original_html)
"""
self.document = document
self.extractor = SECSectionExtractor(document)
def detect(self) -> Optional[Dict[str, Section]]:
"""
Detect sections using TOC structure.
Returns:
Dictionary mapping section names to Section objects, or None if unavailable
Note:
Requires document.metadata.original_html to be available.
Returns None if HTML is not available or no sections found.
"""
# Check if original HTML is available
html_content = getattr(self.document.metadata, 'original_html', None)
if not html_content:
logger.debug("TOC detection unavailable: original_html not in document metadata")
return None
try:
# Get available sections from TOC
available = self.extractor.get_available_sections()
if not available:
logger.debug("No sections found in TOC")
return None
sections = {}
# Extract each section
for section_name in available:
# Get section metadata first to check for subsections
section_info = self.extractor.get_section_info(section_name)
if not section_info:
logger.debug(f"Skipping {section_name}: no section info")
continue
# Get section text (may be empty for container sections)
section_text = self.extractor.get_section_text(section_name, include_subsections=True)
# Check if this section has subsections
has_subsections = section_info.get('subsections', [])
if not section_text and not has_subsections:
# Skip only if no text AND no subsections
logger.debug(f"Skipping {section_name}: no text and no subsections")
continue
# Create section node (placeholder - actual content extracted lazily)
section_node = SectionNode(section_name=section_name)
# For container sections (Item 1, Item 10), text will include all subsections
section_length = len(section_text) if section_text else 0
# Create text extractor callback for lazy loading
def make_text_extractor(extractor, name):
"""Create a closure that captures extractor and section name."""
def extract_text(section_name=None, **kwargs):
# Use captured name, ignore passed section_name
clean = kwargs.get('clean', True)
return extractor.get_section_text(name, include_subsections=True, clean=clean) or ""
return extract_text
# Parse section name to extract part and item identifiers
part, item = Section.parse_section_name(section_name)
# Create Section with TOC confidence
section = Section(
name=section_name,
title=section_info.get('canonical_name', section_name),
node=section_node,
start_offset=0, # Would need actual offsets from parsing
end_offset=section_length,
confidence=0.95, # TOC-based = high confidence
detection_method='toc',
part=part,
item=item,
_text_extractor=make_text_extractor(self.extractor, section_name)
)
sections[section_name] = section
if sections:
logger.info(f"TOC detection found {len(sections)} sections")
return sections
return None
except Exception as e:
logger.warning(f"TOC detection failed: {e}", exc_info=True)
return None
def get_section_text(document: Document, section_name: str) -> Optional[str]:
"""
Get section text using TOC-based extraction.
Args:
document: Document to extract from
section_name: Section name (e.g., 'Item 1', 'Item 1A')
Returns:
Section text if available, None otherwise
"""
html_content = getattr(document.metadata, 'original_html', None)
if not html_content:
return None
try:
extractor = SECSectionExtractor(document)
return extractor.get_section_text(section_name)
except Exception as e:
logger.warning(f"Failed to get section text for {section_name}: {e}")
return None
def get_available_sections(document: Document) -> list[str]:
"""
Get list of available sections from TOC.
Args:
document: Document to analyze
Returns:
List of section names found in TOC
"""
html_content = getattr(document.metadata, 'original_html', None)
if not html_content:
return []
try:
extractor = SECSectionExtractor(document)
return extractor.get_available_sections()
except Exception as e:
logger.warning(f"Failed to get available sections: {e}")
return []
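
A sketch using the detector and the module-level helpers, assuming document.metadata.original_html was preserved during parsing:

# Sketch only; all of these return None/[]/{} when original_html is unavailable.
detector = TOCSectionDetector(document)
sections = detector.detect() or {}

names = get_available_sections(document)
if 'Item 1A' in names:
    risk_factors = get_section_text(document, 'Item 1A')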


@@ -0,0 +1,383 @@
"""
Section extraction for SEC filings using Table of Contents analysis.
This system uses TOC structure to extract specific sections like "Item 1",
"Item 1A", etc. from SEC filings. This approach works consistently across
all SEC filings regardless of whether they use semantic anchors or generated IDs.
"""
import re
from typing import Dict, List, Optional, Tuple, Set
from dataclasses import dataclass
from lxml import html as lxml_html
from edgar.documents.nodes import Node, SectionNode
from edgar.documents.document import Document
from edgar.documents.utils.toc_analyzer import TOCAnalyzer
@dataclass
class SectionBoundary:
"""Represents the boundaries of a document section."""
name: str
anchor_id: str
start_element_id: Optional[str] = None
end_element_id: Optional[str] = None
start_node: Optional[Node] = None
end_node: Optional[Node] = None
text_start: Optional[int] = None # Character position in full text
text_end: Optional[int] = None
confidence: float = 1.0 # Detection confidence (0.0-1.0)
detection_method: str = 'unknown' # How section was detected
class SECSectionExtractor:
"""
Extract specific sections from SEC filings using Table of Contents analysis.
This uses TOC structure to identify section boundaries and extract content
between them. Works consistently for all SEC filings.
"""
def __init__(self, document: Document):
self.document = document
self.section_map = {} # Maps section names to canonical names
self.section_boundaries = {} # Maps section names to boundaries
self.toc_analyzer = TOCAnalyzer()
self._analyze_sections()
def _analyze_sections(self) -> None:
"""
Analyze the document using TOC structure to identify section boundaries.
This creates a map of section names to their anchor positions using
Table of Contents analysis, which works for all SEC filings.
"""
# Get the original HTML if available
html_content = getattr(self.document.metadata, 'original_html', None)
if not html_content:
return
# Use TOC analysis to find sections
toc_mapping = self.toc_analyzer.analyze_toc_structure(html_content)
if not toc_mapping:
return # No sections found
# Handle XML declaration issues
if html_content.startswith('<?xml'):
html_content = re.sub(r'<\?xml[^>]*\?>', '', html_content, count=1)
tree = lxml_html.fromstring(html_content)
sec_sections = {}
for section_name, anchor_id in toc_mapping.items():
# Verify the anchor target exists
target_elements = tree.xpath(f'//*[@id="{anchor_id}"]')
if target_elements:
element = target_elements[0]
# Use TOC-based section info
section_type, order = self.toc_analyzer._get_section_type_and_order(section_name)
sec_sections[section_name] = {
'anchor_id': anchor_id,
'element': element,
'canonical_name': section_name,
'type': section_type,
'order': order,
'confidence': 0.95, # TOC-based detection = high confidence
'detection_method': 'toc' # Method: Table of Contents
}
if not sec_sections:
return # No valid sections found
# Sort sections by their logical order
sorted_sections = sorted(sec_sections.items(), key=lambda x: x[1]['order'])
# Calculate section boundaries
for i, (section_name, section_data) in enumerate(sorted_sections):
start_anchor = section_data['anchor_id']
# End boundary is the start of the next section (if any)
end_anchor = None
if i + 1 < len(sorted_sections):
next_section = sorted_sections[i + 1][1]
end_anchor = next_section['anchor_id']
self.section_boundaries[section_name] = SectionBoundary(
name=section_name,
anchor_id=start_anchor,
end_element_id=end_anchor,
confidence=section_data.get('confidence', 0.95),
detection_method=section_data.get('detection_method', 'toc')
)
self.section_map = {name: data['canonical_name'] for name, data in sec_sections.items()}
def get_available_sections(self) -> List[str]:
"""
Get list of available sections that can be extracted.
Returns:
List of section names
"""
return sorted(self.section_boundaries.keys(),
key=lambda x: self.section_boundaries[x].anchor_id)
def get_section_text(self, section_name: str,
include_subsections: bool = True,
clean: bool = True) -> Optional[str]:
"""
Extract text content for a specific section.
Args:
section_name: Name of section (e.g., "Item 1", "Item 1A", "Part I")
include_subsections: Whether to include subsections
clean: Whether to apply text cleaning
Returns:
Section text content or None if section not found
"""
# Normalize section name
normalized_name = self._normalize_section_name(section_name)
if normalized_name not in self.section_boundaries:
return None
boundary = self.section_boundaries[normalized_name]
# Extract content between boundaries using HTML parsing
html_content = getattr(self.document.metadata, 'original_html', None)
if not html_content:
return None
try:
section_text = self._extract_section_content(html_content, boundary, include_subsections, clean)
# If no direct content but include_subsections=True, aggregate subsection text
if not section_text and include_subsections:
subsections = self._get_subsections(normalized_name)
if subsections:
# Recursively get text from all subsections
subsection_texts = []
for subsection_name in subsections:
subsection_text = self.get_section_text(subsection_name, include_subsections=True, clean=clean)
if subsection_text:
subsection_texts.append(subsection_text)
if subsection_texts:
section_text = '\n\n'.join(subsection_texts)
return section_text
except Exception as e:
# Fallback to simple text extraction
return self._extract_section_fallback(section_name, clean)
def _normalize_section_name(self, section_name: str) -> str:
"""Normalize section name for lookup."""
# Handle common variations
name = section_name.strip()
# "Item 1" vs "Item 1." vs "Item 1:"
name = re.sub(r'[.:]$', '', name)
# Case normalization
if re.match(r'item\s+\d+', name, re.IGNORECASE):
match = re.match(r'item\s+(\d+[a-z]?)', name, re.IGNORECASE)
if match:
name = f"Item {match.group(1).upper()}"
elif re.match(r'part\s+[ivx]+', name, re.IGNORECASE):
match = re.match(r'part\s+([ivx]+)', name, re.IGNORECASE)
if match:
name = f"Part {match.group(1).upper()}"
return name
def _extract_section_content(self, html_content: str, boundary: SectionBoundary,
include_subsections: bool, clean: bool) -> str:
"""
Extract section content from HTML between anchors.
Args:
html_content: Full HTML content
boundary: Section boundary info
include_subsections: Whether to include subsections
clean: Whether to clean the text
Returns:
Extracted section text
"""
# Handle XML declaration issues
if html_content.startswith('<?xml'):
html_content = re.sub(r'<\?xml[^>]*\?>', '', html_content, count=1)
tree = lxml_html.fromstring(html_content)
# Find start element
start_elements = tree.xpath(f'//*[@id="{boundary.anchor_id}"]')
if not start_elements:
return ""
start_element = start_elements[0]
# Collect content until we hit the end boundary (if specified)
content_elements = []
# If anchor has no siblings (nested in empty container), traverse up to find content container
# This handles cases like <div id="item7"><div></div></div> where content is after the container
current = start_element.getnext()
if current is None:
# No sibling - traverse up to find a container with siblings
container = start_element.getparent()
while container is not None and container.getnext() is None:
container = container.getparent()
# Start from the container's next sibling if found
if container is not None:
current = container.getnext()
# Collect content from siblings
if current is not None:
# Normal case - anchor has siblings
while current is not None:
# Check if we've reached the end boundary
if boundary.end_element_id:
current_id = current.get('id', '')
if current_id == boundary.end_element_id:
break
# Also check if this is a sibling section we should stop at
if not include_subsections and self._is_sibling_section(current_id, boundary.name):
break
content_elements.append(current)
current = current.getnext()
# Extract text from collected elements
section_texts = []
for element in content_elements:
text = self._extract_element_text(element)
if text.strip():
section_texts.append(text)
combined_text = '\n\n'.join(section_texts)
# Apply cleaning if requested
if clean:
combined_text = self._clean_section_text(combined_text)
return combined_text
def _is_sibling_section(self, element_id: str, current_section: str) -> bool:
"""Check if element ID represents a sibling section."""
if not element_id:
return False
# Check if this looks like another item at the same level
if 'item' in current_section.lower() and 'item' in element_id.lower():
current_item = re.search(r'item\s*(\d+)', current_section, re.IGNORECASE)
other_item = re.search(r'item[\s_]*(\d+)', element_id, re.IGNORECASE)
if current_item and other_item:
return current_item.group(1) != other_item.group(1)
return False
def _extract_element_text(self, element) -> str:
"""Extract clean text from an HTML element."""
# This would integrate with your existing text extraction logic
# For now, simple text extraction
return element.text_content() or ""
def _clean_section_text(self, text: str) -> str:
"""Clean extracted section text."""
# Apply the same cleaning as the main document
from edgar.documents.utils.anchor_cache import filter_with_cached_patterns
# Remove excessive whitespace
text = re.sub(r'\n\s*\n\s*\n', '\n\n', text)
# Filter navigation links
html_content = getattr(self.document.metadata, 'original_html', None)
if html_content:
text = filter_with_cached_patterns(text, html_content)
return text.strip()
def _extract_section_fallback(self, section_name: str, clean: bool) -> Optional[str]:
"""
Fallback section extraction using document nodes.
This is used when HTML-based extraction fails.
"""
# Search through document sections
for name, section in self.document.sections.items():
if section_name.lower() in name.lower():
return section.text(clean=clean)
return None
def get_section_info(self, section_name: str) -> Optional[Dict]:
"""
Get detailed information about a section.
Args:
section_name: Section name to look up
Returns:
Dict with section metadata
"""
normalized_name = self._normalize_section_name(section_name)
if normalized_name not in self.section_boundaries:
return None
boundary = self.section_boundaries[normalized_name]
return {
'name': boundary.name,
'anchor_id': boundary.anchor_id,
'available': True,
'estimated_length': None, # Could calculate if needed
'subsections': self._get_subsections(normalized_name)
}
def _get_subsections(self, parent_section: str) -> List[str]:
"""
Get subsections of a parent section.
For example:
- "Item 1" has subsections "Item 1A", "Item 1B" (valid)
- "Item 1" does NOT have subsection "Item 10" (invalid - different item)
"""
subsections = []
# Look for sections that start with the parent name
for section_name in self.section_boundaries:
if section_name == parent_section:
continue
if section_name.startswith(parent_section):
# Check if this is a true subsection (e.g., Item 1A)
# vs a different section that happens to start with same prefix (e.g., Item 10)
remainder = section_name[len(parent_section):]
# Valid subsection patterns:
# - "Item 1A" (remainder: "A") - letter suffix
# - "Item 1 - Business" (remainder: " - Business") - has separator
# Invalid patterns:
# - "Item 10" (remainder: "0") - digit continues the number
if remainder and remainder[0].isalpha():
# Letter suffix like "A", "B" - valid subsection
subsections.append(section_name)
elif remainder and remainder[0] in [' ', '-', '.', ':']:
# Has separator - could be descriptive title
subsections.append(section_name)
# If remainder starts with digit, it's NOT a subsection (e.g., "Item 10")
return sorted(subsections)
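
A sketch driving SECSectionExtractor directly, again assuming document.metadata.original_html is available:

# Sketch only; section names follow the normalized "Item 1A" / "Part I" forms.
extractor = SECSectionExtractor(document)
for name in extractor.get_available_sections():
    info = extractor.get_section_info(name)
    print(name, 'subsections:', info['subsections'])
mda_text = extractor.get_section_text('Item 7', include_subsections=True)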