Initial commit
@@ -0,0 +1,15 @@
"""
Content extractors for documents.
"""

from edgar.documents.extractors.text_extractor import TextExtractor
from edgar.documents.extractors.pattern_section_extractor import SectionExtractor
from edgar.documents.extractors.hybrid_section_detector import HybridSectionDetector
from edgar.documents.extractors.toc_section_detector import TOCSectionDetector

__all__ = [
    'TextExtractor',
    'SectionExtractor',
    'HybridSectionDetector',
    'TOCSectionDetector'
]
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,170 @@
|
||||
"""
|
||||
Heading-based section detection strategy.
|
||||
|
||||
Detects sections by analyzing heading nodes with HeaderInfo metadata.
|
||||
This strategy provides moderate confidence (0.7-0.9) and serves as a
|
||||
fallback when TOC-based detection is not available.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from typing import Dict, Optional
|
||||
|
||||
from edgar.documents.document import Document, Section
|
||||
from edgar.documents.nodes import HeadingNode, SectionNode
|
||||
from edgar.documents.types import HeaderInfo
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class HeadingSectionDetector:
|
||||
"""
|
||||
Heading-based section detection using HeaderInfo.
|
||||
|
||||
Analyzes heading nodes that have been annotated with HeaderInfo
|
||||
during parsing. Detects sections based on:
|
||||
- Item numbers (Item 1, Item 1A, etc.)
|
||||
- Heading confidence scores
|
||||
- Heading hierarchy
|
||||
|
||||
Provides moderate confidence (0.7-0.9) detection.
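
Example (illustrative sketch; assumes a parsed Document whose headings carry HeaderInfo metadata):
>>> detector = HeadingSectionDetector(document, form='10-K')
>>> sections = detector.detect()
>>> if sections:
...     for name, section in sections.items():
...         print(name, section.confidence, section.detection_method)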
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
document: Document,
|
||||
form: Optional[str] = None,
|
||||
min_confidence: float = 0.5 # Lower threshold, let hybrid detector filter
|
||||
):
|
||||
"""
|
||||
Initialize heading-based detector.
|
||||
|
||||
Args:
|
||||
document: Document to analyze
|
||||
form: Optional filing type for context ('10-K', '10-Q', '8-K')
|
||||
min_confidence: Minimum confidence for headings (default 0.5)
|
||||
"""
|
||||
self.document = document
|
||||
self.form = form
|
||||
self.min_confidence = min_confidence
|
||||
|
||||
def detect(self) -> Optional[Dict[str, Section]]:
|
||||
"""
|
||||
Detect sections from heading nodes with HeaderInfo.
|
||||
|
||||
Returns:
|
||||
Dictionary of sections if successful, None if no sections found
|
||||
"""
|
||||
try:
|
||||
# Get heading nodes from document
|
||||
headings = self.document.headings
|
||||
if not headings:
|
||||
logger.debug("No headings found in document")
|
||||
return None
|
||||
|
||||
sections = {}
|
||||
|
||||
for heading in headings:
|
||||
# Check if heading has header info
|
||||
if not hasattr(heading, 'header_info') or not heading.header_info:
|
||||
continue
|
||||
|
||||
header_info = heading.header_info
|
||||
|
||||
# Only use headings with sufficient confidence
|
||||
if header_info.confidence < self.min_confidence:
|
||||
continue
|
||||
|
||||
# Check if it's an item header
|
||||
if not header_info.is_item:
|
||||
continue
|
||||
|
||||
# Extract section from this heading
|
||||
section = self._extract_section_from_heading(heading, header_info)
|
||||
if section:
|
||||
section.confidence = header_info.confidence
|
||||
section.detection_method = 'heading'
|
||||
sections[section.name] = section
|
||||
|
||||
if not sections:
|
||||
logger.debug("No item headers found with sufficient confidence")
|
||||
return None
|
||||
|
||||
logger.info(f"Heading detection found {len(sections)} sections")
|
||||
return sections
|
||||
|
||||
except Exception as e:
|
||||
logger.warning(f"Heading detection failed: {e}")
|
||||
return None
|
||||
|
||||
def _extract_section_from_heading(
|
||||
self, heading: HeadingNode, header_info: HeaderInfo
|
||||
) -> Optional[Section]:
|
||||
"""
|
||||
Extract section content from heading node to next heading.
|
||||
|
||||
Args:
|
||||
heading: HeadingNode representing section start
|
||||
header_info: HeaderInfo with section metadata
|
||||
|
||||
Returns:
|
||||
Section object if successful, None otherwise
|
||||
"""
|
||||
try:
|
||||
# Create section name from item number
|
||||
if header_info.item_number:
|
||||
# Normalize: "1A" -> "item_1a", "7" -> "item_7"
|
||||
section_name = f"item_{header_info.item_number.replace('.', '_').lower()}"
|
||||
else:
|
||||
section_name = "unknown"
|
||||
|
||||
# Create section node
|
||||
section_node = SectionNode(section_name=section_name)
|
||||
|
||||
# Find next heading at same or higher level to determine section end
|
||||
current_level = header_info.level
|
||||
parent = heading.parent
|
||||
if not parent:
|
||||
logger.debug(f"Heading {header_info.text} has no parent")
|
||||
return None
|
||||
|
||||
# Find heading position in parent's children
|
||||
try:
|
||||
heading_index = parent.children.index(heading)
|
||||
except ValueError:
|
||||
logger.debug(f"Could not find heading in parent's children")
|
||||
return None
|
||||
|
||||
# Collect nodes until next section heading
|
||||
for i in range(heading_index + 1, len(parent.children)):
|
||||
child = parent.children[i]
|
||||
|
||||
# Stop at next heading of same or higher level
|
||||
if isinstance(child, HeadingNode):
|
||||
if hasattr(child, 'header_info') and child.header_info:
|
||||
if child.header_info.level <= current_level:
|
||||
break
|
||||
|
||||
# Add child to section
|
||||
section_node.add_child(child)
|
||||
|
||||
# Parse section name to extract part and item identifiers
|
||||
part, item = Section.parse_section_name(section_name)
|
||||
|
||||
# Create Section object
|
||||
section = Section(
|
||||
name=section_name,
|
||||
title=header_info.text,
|
||||
node=section_node,
|
||||
start_offset=0, # Would need actual text position
|
||||
end_offset=0, # Would need actual text position
|
||||
confidence=header_info.confidence,
|
||||
detection_method='heading',
|
||||
part=part,
|
||||
item=item
|
||||
)
|
||||
|
||||
return section
|
||||
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to extract section from heading: {e}")
|
||||
return None
|
||||
@@ -0,0 +1,489 @@
|
||||
"""
|
||||
Hybrid section detection system with multiple fallback strategies.
|
||||
|
||||
This module implements a multi-strategy approach to section detection:
|
||||
1. TOC-based (primary): High confidence, uses Table of Contents structure
|
||||
2. Heading-based (fallback): Moderate confidence, uses multi-strategy heading detection
|
||||
3. Pattern-based (last resort): Lower confidence, uses regex pattern matching
|
||||
"""
|
||||
|
||||
import logging
|
||||
from typing import Dict, Optional, List
|
||||
from dataclasses import dataclass
|
||||
from functools import lru_cache
|
||||
|
||||
from edgar.documents.document import Document, Section
|
||||
from edgar.documents.nodes import SectionNode, HeadingNode
|
||||
from edgar.documents.extractors.toc_section_detector import TOCSectionDetector
|
||||
from edgar.documents.extractors.pattern_section_extractor import SectionExtractor
|
||||
from edgar.documents.config import DetectionThresholds
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class HybridSectionDetector:
|
||||
"""
|
||||
Multi-strategy section detector with fallback.
|
||||
|
||||
Tries strategies in order of reliability:
|
||||
1. TOC-based (0.95 confidence) - Most reliable
|
||||
2. Multi-strategy heading detection (0.7-0.9 confidence) - Fallback
|
||||
3. Pattern matching (0.6 confidence) - Last resort
|
||||
|
||||
Example:
|
||||
>>> detector = HybridSectionDetector(document, '10-K')
|
||||
>>> sections = detector.detect_sections()
|
||||
>>> for name, section in sections.items():
|
||||
... print(f"{name}: {section.confidence:.2f} ({section.detection_method})")
|
||||
"""
|
||||
|
||||
def __init__(self, document: Document, form: str, thresholds: Optional[DetectionThresholds] = None):
|
||||
"""
|
||||
Initialize hybrid detector.
|
||||
|
||||
Args:
|
||||
document: Document to extract sections from
|
||||
form: Filing type ('10-K', '10-Q', '8-K')
|
||||
thresholds: Detection thresholds configuration
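
Example (illustrative; assumes DetectionThresholds accepts its fields as keyword arguments):
>>> thresholds = DetectionThresholds(min_confidence=0.6)
>>> detector = HybridSectionDetector(document, '10-K', thresholds=thresholds)
>>> sections = detector.detect_sections()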
|
||||
"""
|
||||
self.document = document
|
||||
self.form = form
|
||||
self.thresholds = thresholds or DetectionThresholds()
|
||||
|
||||
# Initialize detection strategies
|
||||
self.toc_detector = TOCSectionDetector(document)
|
||||
self.pattern_extractor = SectionExtractor(form)
|
||||
|
||||
def detect_sections(self) -> Dict[str, Section]:
|
||||
"""
|
||||
Detect sections using hybrid approach with fallback and validation.
|
||||
|
||||
Returns:
|
||||
Dictionary mapping section names to Section objects with confidence scores
|
||||
"""
|
||||
# Strategy 1: TOC-based (most reliable)
|
||||
logger.debug("Trying TOC-based detection...")
|
||||
sections = self.toc_detector.detect()
|
||||
if sections:
|
||||
logger.info(f"TOC detection successful: {len(sections)} sections found")
|
||||
return self._validate_pipeline(sections, enable_cross_validation=True)
|
||||
|
||||
# Strategy 2: Heading-based (fallback)
|
||||
logger.debug("TOC detection failed, trying heading detection...")
|
||||
sections = self._try_heading_detection()
|
||||
if sections:
|
||||
logger.info(f"Heading detection successful: {len(sections)} sections found")
|
||||
return self._validate_pipeline(sections, enable_cross_validation=False)
|
||||
|
||||
# Strategy 3: Pattern-based (last resort)
|
||||
logger.debug("Heading detection failed, trying pattern matching...")
|
||||
sections = self._try_pattern_detection()
|
||||
if sections:
|
||||
logger.info(f"Pattern detection successful: {len(sections)} sections found")
|
||||
return self._validate_pipeline(sections, enable_cross_validation=False)
|
||||
|
||||
logger.warning("All detection strategies failed, no sections found")
|
||||
return {}
|
||||
|
||||
def _validate_pipeline(
|
||||
self,
|
||||
sections: Dict[str, Section],
|
||||
enable_cross_validation: bool = False
|
||||
) -> Dict[str, Section]:
|
||||
"""
|
||||
Apply validation pipeline to sections.
|
||||
|
||||
Centralizes validation logic to eliminate duplication.
|
||||
|
||||
Args:
|
||||
sections: Sections to validate
|
||||
enable_cross_validation: Whether to enable cross-validation (expensive)
|
||||
|
||||
Returns:
|
||||
Validated sections
|
||||
"""
|
||||
if not sections:
|
||||
return sections
|
||||
|
||||
# Cross-validate (optional, expensive)
|
||||
if enable_cross_validation and self.thresholds.enable_cross_validation:
|
||||
sections = self._cross_validate(sections)
|
||||
|
||||
# Validate boundaries
|
||||
sections = self._validate_boundaries(sections)
|
||||
|
||||
# Deduplicate
|
||||
sections = self._deduplicate(sections)
|
||||
|
||||
# Filter by confidence
|
||||
sections = self._filter_by_confidence(sections)
|
||||
|
||||
return sections
|
||||
|
||||
def _try_heading_detection(self) -> Optional[Dict[str, Section]]:
|
||||
"""
|
||||
Try multi-strategy heading detection.
|
||||
|
||||
Returns:
|
||||
Dictionary of sections if successful, None if failed
|
||||
"""
|
||||
try:
|
||||
# Get heading nodes from document
|
||||
headings = self.document.headings
|
||||
if not headings:
|
||||
return None
|
||||
|
||||
sections = {}
|
||||
|
||||
for heading in headings:
|
||||
# Check if heading has header info
|
||||
if not hasattr(heading, 'header_info') or not heading.header_info:
|
||||
continue
|
||||
|
||||
header_info = heading.header_info
|
||||
|
||||
# Only use headings with sufficient confidence
|
||||
if header_info.confidence < 0.7:
|
||||
continue
|
||||
|
||||
# Check if it's an item header
|
||||
if not header_info.is_item:
|
||||
continue
|
||||
|
||||
# Extract section from this heading to next
|
||||
section = self._extract_section_from_heading(heading, header_info)
|
||||
if section:
|
||||
section.confidence = header_info.confidence
|
||||
section.detection_method = 'heading'
|
||||
sections[section.name] = section
|
||||
|
||||
return sections if sections else None
|
||||
|
||||
except Exception as e:
|
||||
logger.warning(f"Heading detection failed: {e}")
|
||||
return None
|
||||
|
||||
def _try_pattern_detection(self) -> Optional[Dict[str, Section]]:
|
||||
"""
|
||||
Try pattern-based extraction.
|
||||
|
||||
Returns:
|
||||
Dictionary of sections if successful, None if failed
|
||||
"""
|
||||
try:
|
||||
# Use pattern extractor
|
||||
sections = self.pattern_extractor.extract(self.document)
|
||||
|
||||
# Mark with pattern detection confidence
|
||||
for section in sections.values():
|
||||
section.confidence = 0.6 # Pattern-based = lower confidence
|
||||
section.detection_method = 'pattern'
|
||||
|
||||
return sections if sections else None
|
||||
|
||||
except Exception as e:
|
||||
logger.warning(f"Pattern detection failed: {e}")
|
||||
return None
|
||||
|
||||
def _extract_section_from_heading(self, heading: HeadingNode, header_info) -> Optional[Section]:
|
||||
"""
|
||||
Extract section content from heading node to next heading.
|
||||
|
||||
Args:
|
||||
heading: HeadingNode representing section start
|
||||
header_info: HeaderInfo with section metadata
|
||||
|
||||
Returns:
|
||||
Section object if successful, None otherwise
|
||||
"""
|
||||
try:
|
||||
# Create section name from item number (normalize like the heading detector: "1A" -> "item_1a")
section_name = f"item_{header_info.item_number.replace('.', '_').lower()}" if header_info.item_number else "unknown"
|
||||
|
||||
# Create section node
|
||||
section_node = SectionNode(section_name=section_name)
|
||||
|
||||
# Find next heading at same or higher level to determine section end
|
||||
current_level = header_info.level
|
||||
parent = heading.parent
|
||||
if not parent:
|
||||
return None
|
||||
|
||||
# Find heading position in parent's children
|
||||
try:
|
||||
heading_index = parent.children.index(heading)
|
||||
except ValueError:
|
||||
return None
|
||||
|
||||
# Collect nodes until next section heading
|
||||
for i in range(heading_index + 1, len(parent.children)):
|
||||
child = parent.children[i]
|
||||
|
||||
# Stop at next heading of same or higher level
|
||||
if isinstance(child, HeadingNode):
|
||||
if hasattr(child, 'header_info') and child.header_info:
|
||||
if child.header_info.level <= current_level:
|
||||
break
|
||||
|
||||
# Add child to section
|
||||
section_node.add_child(child)
|
||||
|
||||
# Create Section object
|
||||
section = Section(
|
||||
name=section_name,
|
||||
title=header_info.text,
|
||||
node=section_node,
|
||||
start_offset=0, # Would need actual text position
|
||||
end_offset=0, # Would need actual text position
|
||||
confidence=header_info.confidence,
|
||||
detection_method='heading'
|
||||
)
|
||||
|
||||
return section
|
||||
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to extract section from heading: {e}")
|
||||
return None
|
||||
|
||||
def _cross_validate(self, sections: Dict[str, Section]) -> Dict[str, Section]:
|
||||
"""
|
||||
Cross-validate sections using multiple detection methods.
|
||||
|
||||
Boosts confidence if multiple methods detect the same section.
|
||||
Reduces confidence if methods disagree.
|
||||
|
||||
Args:
|
||||
sections: Sections detected by primary method
|
||||
|
||||
Returns:
|
||||
Validated sections with adjusted confidence scores
|
||||
"""
|
||||
validated = {}
|
||||
|
||||
# Get pattern-based sections once for comparison (not per section)
|
||||
try:
|
||||
pattern_sections = self.pattern_extractor.extract(self.document)
|
||||
except Exception as e:
|
||||
logger.debug(f"Pattern extraction failed for cross-validation: {e}")
|
||||
pattern_sections = {}
|
||||
|
||||
for name, section in sections.items():
|
||||
# Try alternative detection (pattern matching for validation)
|
||||
try:
|
||||
# Check if this section is also found by pattern matching
|
||||
found_in_patterns = False
|
||||
for pattern_name, pattern_section in pattern_sections.items():
|
||||
# Check for name similarity or overlap
|
||||
if self._sections_similar(section, pattern_section):
|
||||
found_in_patterns = True
|
||||
break
|
||||
|
||||
# Boost confidence if methods agree
|
||||
if found_in_patterns:
|
||||
section.confidence = min(section.confidence * self.thresholds.cross_validation_boost, 1.0)
|
||||
section.validated = True
|
||||
logger.debug(f"Section {name} validated by multiple methods, confidence boosted to {section.confidence:.2f}")
|
||||
else:
|
||||
# Slight reduction if not validated
|
||||
section.confidence *= self.thresholds.disagreement_penalty
|
||||
section.validated = False
|
||||
|
||||
except Exception as e:
|
||||
logger.debug(f"Cross-validation failed for {name}: {e}")
|
||||
# Keep original confidence if validation fails
|
||||
pass
|
||||
|
||||
validated[name] = section
|
||||
|
||||
return validated
|
||||
|
||||
def _validate_boundaries(self, sections: Dict[str, Section]) -> Dict[str, Section]:
|
||||
"""
|
||||
Validate section boundaries for overlaps, gaps, and ordering.
|
||||
|
||||
Args:
|
||||
sections: Sections to validate
|
||||
|
||||
Returns:
|
||||
Sections with validated boundaries
|
||||
"""
|
||||
if not sections:
|
||||
return sections
|
||||
|
||||
# Sort by start offset
|
||||
sorted_sections = sorted(sections.items(), key=lambda x: x[1].start_offset)
|
||||
|
||||
validated = {}
|
||||
prev_section = None
|
||||
|
||||
for name, section in sorted_sections:
|
||||
# Check for overlap with previous section
|
||||
if prev_section and section.start_offset > 0:
|
||||
if section.start_offset < prev_section[1].end_offset:
|
||||
# Overlap detected - adjust boundary at midpoint
|
||||
gap_mid = (prev_section[1].end_offset + section.start_offset) // 2
|
||||
prev_section[1].end_offset = gap_mid
|
||||
section.start_offset = gap_mid
|
||||
|
||||
# Reduce confidence due to boundary adjustment
|
||||
section.confidence *= self.thresholds.boundary_overlap_penalty
|
||||
prev_section[1].confidence *= self.thresholds.boundary_overlap_penalty
|
||||
|
||||
logger.debug(f"Adjusted boundary between {prev_section[0]} and {name}")
|
||||
|
||||
# Check for an unusually large gap between sections
|
||||
elif prev_section[1].end_offset > 0:
|
||||
gap_size = section.start_offset - prev_section[1].end_offset
|
||||
if gap_size > 100000: # Arbitrary large gap threshold
|
||||
# Large gap - might indicate missing section
|
||||
section.confidence *= 0.9
|
||||
logger.debug(f"Large gap detected before {name}")
|
||||
|
||||
validated[name] = section
|
||||
prev_section = (name, section)
|
||||
|
||||
return validated
|
||||
|
||||
def _deduplicate(self, sections: Dict[str, Section]) -> Dict[str, Section]:
|
||||
"""
|
||||
Remove duplicate sections detected by multiple methods.
|
||||
|
||||
Keeps the detection with highest confidence.
|
||||
|
||||
Args:
|
||||
sections: Sections possibly containing duplicates
|
||||
|
||||
Returns:
|
||||
Deduplicated sections
|
||||
"""
|
||||
if len(sections) <= 1:
|
||||
return sections
|
||||
|
||||
# Group similar sections
|
||||
groups = self._group_similar_sections(sections)
|
||||
|
||||
deduplicated = {}
|
||||
for group in groups:
|
||||
if len(group) == 1:
|
||||
# No duplicates
|
||||
deduplicated[group[0].name] = group[0]
|
||||
else:
|
||||
# Keep section with highest confidence
|
||||
best = max(group, key=lambda s: s.confidence)
|
||||
|
||||
# Merge detection methods
|
||||
methods = set(s.detection_method for s in group)
|
||||
if len(methods) > 1:
|
||||
best.detection_method = ','.join(sorted(methods))
|
||||
# Boost confidence for multi-method detection
|
||||
best.confidence = min(best.confidence * 1.15, 1.0)
|
||||
best.validated = True
|
||||
logger.debug(f"Merged duplicate sections for {best.name}, methods: {best.detection_method}")
|
||||
|
||||
deduplicated[best.name] = best
|
||||
|
||||
return deduplicated
|
||||
|
||||
def _group_similar_sections(self, sections: Dict[str, Section]) -> List[List[Section]]:
|
||||
"""
|
||||
Group sections that appear to be duplicates.
|
||||
|
||||
Args:
|
||||
sections: Sections to group
|
||||
|
||||
Returns:
|
||||
List of section groups
|
||||
"""
|
||||
groups = []
|
||||
used = set()
|
||||
|
||||
for name1, section1 in sections.items():
|
||||
if name1 in used:
|
||||
continue
|
||||
|
||||
group = [section1]
|
||||
used.add(name1)
|
||||
|
||||
for name2, section2 in sections.items():
|
||||
if name2 in used:
|
||||
continue
|
||||
|
||||
# Check if sections are similar
|
||||
if self._sections_similar(section1, section2):
|
||||
group.append(section2)
|
||||
used.add(name2)
|
||||
|
||||
groups.append(group)
|
||||
|
||||
return groups
|
||||
|
||||
def _sections_similar(self, section1: Section, section2: Section) -> bool:
|
||||
"""
|
||||
Check if two sections are similar (likely duplicates).
|
||||
|
||||
Args:
|
||||
section1: First section
|
||||
section2: Second section
|
||||
|
||||
Returns:
|
||||
True if sections are similar
|
||||
"""
|
||||
# Normalize names for comparison
|
||||
name1 = section1.name.lower().replace('_', ' ').strip()
|
||||
name2 = section2.name.lower().replace('_', ' ').strip()
|
||||
|
||||
# Check exact match after normalization
|
||||
if name1 == name2:
|
||||
return True
|
||||
|
||||
# Check title similarity (exact match)
|
||||
title1 = section1.title.lower().strip()
|
||||
title2 = section2.title.lower().strip()
|
||||
|
||||
if title1 == title2:
|
||||
return True
|
||||
|
||||
# Check for position overlap (if positions are set)
|
||||
if section1.start_offset > 0 and section2.start_offset > 0:
|
||||
# Calculate overlap
|
||||
overlap_start = max(section1.start_offset, section2.start_offset)
|
||||
overlap_end = min(section1.end_offset, section2.end_offset)
|
||||
|
||||
if overlap_end > overlap_start:
|
||||
# There is overlap
|
||||
overlap_size = overlap_end - overlap_start
|
||||
min_size = min(
|
||||
section1.end_offset - section1.start_offset,
|
||||
section2.end_offset - section2.start_offset
|
||||
)
|
||||
|
||||
# If overlap is >50% of smaller section, consider similar
|
||||
if min_size > 0 and overlap_size / min_size > 0.5:
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
def _filter_by_confidence(self, sections: Dict[str, Section]) -> Dict[str, Section]:
|
||||
"""
|
||||
Filter sections by minimum confidence threshold.
|
||||
|
||||
Args:
|
||||
sections: Sections to filter
|
||||
|
||||
Returns:
|
||||
Filtered sections meeting minimum confidence
|
||||
"""
|
||||
# Check for filing-specific thresholds
|
||||
min_conf = self.thresholds.min_confidence
|
||||
if self.form in self.thresholds.thresholds_by_form:
|
||||
filing_thresholds = self.thresholds.thresholds_by_form[self.form]
|
||||
min_conf = filing_thresholds.get('min_confidence', min_conf)
|
||||
|
||||
filtered = {}
|
||||
for name, section in sections.items():
|
||||
if section.confidence >= min_conf:
|
||||
filtered[name] = section
|
||||
else:
|
||||
logger.debug(f"Filtered out section {name} with confidence {section.confidence:.2f} < {min_conf:.2f}")
|
||||
|
||||
return filtered
|
||||
@@ -0,0 +1,405 @@
|
||||
"""
|
||||
Section extraction from documents.
|
||||
"""
|
||||
|
||||
import re
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
|
||||
from edgar.documents.document import Document, Section
|
||||
from edgar.documents.nodes import Node, HeadingNode, SectionNode
|
||||
|
||||
|
||||
class SectionExtractor:
|
||||
"""
|
||||
Extracts logical sections from documents.
|
||||
|
||||
Identifies document sections like:
|
||||
- Business Overview (Item 1)
|
||||
- Risk Factors (Item 1A)
|
||||
- MD&A (Item 7)
|
||||
- Financial Statements (Item 8)
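
Example (illustrative sketch; assumes a parsed 10-K Document):
>>> extractor = SectionExtractor(form='10-K')
>>> sections = extractor.extract(document)
>>> for name, section in sections.items():
...     print(name, section.title)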
|
||||
"""
|
||||
|
||||
# Common section patterns for different filing types
|
||||
SECTION_PATTERNS = {
|
||||
'10-K': {
|
||||
'business': [
|
||||
(r'^(Item|ITEM)\s+1\.?\s*Business', 'Item 1 - Business'),
|
||||
(r'^Business\s*$', 'Business'),
|
||||
(r'^Business Overview', 'Business Overview'),
|
||||
(r'^Our Business', 'Our Business'),
|
||||
(r'^Company Overview', 'Company Overview')
|
||||
],
|
||||
'risk_factors': [
|
||||
(r'^(Item|ITEM)\s+1A\.?\s*Risk\s+Factors', 'Item 1A - Risk Factors'),
|
||||
(r'^Risk\s+Factors', 'Risk Factors'),
|
||||
(r'^Factors\s+That\s+May\s+Affect', 'Risk Factors')
|
||||
],
|
||||
'properties': [
|
||||
(r'^(Item|ITEM)\s+2\.?\s*Properties', 'Item 2 - Properties'),
|
||||
(r'^Properties', 'Properties'),
|
||||
(r'^Real\s+Estate', 'Real Estate')
|
||||
],
|
||||
'legal_proceedings': [
|
||||
(r'^(Item|ITEM)\s+3\.?\s*Legal\s+Proceedings', 'Item 3 - Legal Proceedings'),
|
||||
(r'^Legal\s+Proceedings', 'Legal Proceedings'),
|
||||
(r'^Litigation', 'Litigation')
|
||||
],
|
||||
'market_risk': [
|
||||
(r'^(Item|ITEM)\s+7A\.?\s*Quantitative.*Disclosures', 'Item 7A - Market Risk'),
|
||||
(r'^Market\s+Risk', 'Market Risk'),
|
||||
(r'^Quantitative.*Qualitative.*Market\s+Risk', 'Market Risk')
|
||||
],
|
||||
'mda': [
|
||||
(r'^(Item|ITEM)\s+7\.?\s*Management.*Discussion', 'Item 7 - MD&A'),
|
||||
(r'^Management.*Discussion.*Analysis', 'MD&A'),
|
||||
(r'^MD&A', 'MD&A')
|
||||
],
|
||||
'financial_statements': [
|
||||
(r'^(Item|ITEM)\s+8\.?\s*Financial\s+Statements', 'Item 8 - Financial Statements'),
|
||||
(r'^Financial\s+Statements', 'Financial Statements'),
|
||||
(r'^Consolidated\s+Financial\s+Statements', 'Consolidated Financial Statements')
|
||||
],
|
||||
'controls_procedures': [
|
||||
(r'^(Item|ITEM)\s+9A\.?\s*Controls.*Procedures', 'Item 9A - Controls and Procedures'),
|
||||
(r'^Controls.*Procedures', 'Controls and Procedures'),
|
||||
(r'^Internal\s+Control', 'Internal Controls')
|
||||
]
|
||||
},
|
||||
'10-Q': {
|
||||
'financial_statements': [
|
||||
(r'^(Item|ITEM)\s+1\.?\s*Financial\s+Statements', 'Item 1 - Financial Statements'),
|
||||
(r'^Financial\s+Statements', 'Financial Statements'),
|
||||
(r'^Condensed.*Financial\s+Statements', 'Condensed Financial Statements')
|
||||
],
|
||||
'mda': [
|
||||
(r'^(Item|ITEM)\s+2\.?\s*Management.*Discussion', 'Item 2 - MD&A'),
|
||||
(r'^Management.*Discussion.*Analysis', 'MD&A')
|
||||
],
|
||||
'market_risk': [
|
||||
(r'^(Item|ITEM)\s+3\.?\s*Quantitative.*Disclosures', 'Item 3 - Market Risk'),
|
||||
(r'^Market\s+Risk', 'Market Risk')
|
||||
],
|
||||
'controls_procedures': [
|
||||
(r'^(Item|ITEM)\s+4\.?\s*Controls.*Procedures', 'Item 4 - Controls and Procedures'),
|
||||
(r'^Controls.*Procedures', 'Controls and Procedures')
|
||||
],
|
||||
'legal_proceedings': [
|
||||
(r'^(Item|ITEM)\s+1\.?\s*Legal\s+Proceedings', 'Item 1 - Legal Proceedings'),
|
||||
(r'^Legal\s+Proceedings', 'Legal Proceedings')
|
||||
],
|
||||
'risk_factors': [
|
||||
(r'^(Item|ITEM)\s+1A\.?\s*Risk\s+Factors', 'Item 1A - Risk Factors'),
|
||||
(r'^Risk\s+Factors', 'Risk Factors')
|
||||
]
|
||||
},
|
||||
'8-K': {
|
||||
'item_101': [
|
||||
(r'^(Item|ITEM)\s+1\.01', 'Item 1.01 - Entry into Material Agreement'),
|
||||
(r'^Entry.*Material.*Agreement', 'Material Agreement')
|
||||
],
|
||||
'item_201': [
|
||||
(r'^(Item|ITEM)\s+2\.01', 'Item 2.01 - Completion of Acquisition'),
|
||||
(r'^Completion.*Acquisition', 'Acquisition')
|
||||
],
|
||||
'item_202': [
|
||||
(r'^(Item|ITEM)\s+2\.02', 'Item 2.02 - Results of Operations'),
|
||||
(r'^Results.*Operations', 'Results of Operations')
|
||||
],
|
||||
'item_503': [
|
||||
(r'^(Item|ITEM)\s+5\.03', 'Item 5.03 - Amendments to Articles/Bylaws'),
|
||||
(r'^Amendments.*Articles', 'Charter Amendments')
|
||||
],
|
||||
'item_801': [
|
||||
(r'^(Item|ITEM)\s+8\.01', 'Item 8.01 - Other Events'),
|
||||
(r'^Other\s+Events', 'Other Events')
|
||||
],
|
||||
'item_901': [
|
||||
(r'^(Item|ITEM)\s+9\.01', 'Item 9.01 - Financial Statements and Exhibits'),
|
||||
(r'^Financial.*Exhibits', 'Financial Statements and Exhibits')
|
||||
]
|
||||
}
|
||||
}
|
||||
|
||||
def __init__(self, form: Optional[str] = None):
|
||||
"""
|
||||
Initialize section extractor.
|
||||
|
||||
Args:
|
||||
form: Type of filing (10-K, 10-Q, 8-K, etc.)
|
||||
"""
|
||||
self.form = form
|
||||
|
||||
def extract(self, document: Document) -> Dict[str, Section]:
|
||||
"""
|
||||
Extract sections from document.
|
||||
|
||||
Args:
|
||||
document: Document to extract sections from
|
||||
|
||||
Returns:
|
||||
Dictionary mapping section names to Section objects
|
||||
"""
|
||||
# Get filing type from instance, metadata, or document config
|
||||
# NOTE: We no longer auto-detect filing type (expensive and unnecessary)
|
||||
form = None
|
||||
|
||||
if self.form:
|
||||
form = self.form
|
||||
elif document.metadata and document.metadata.form:
|
||||
form = document.metadata.form
|
||||
elif hasattr(document, '_config') and document._config and document._config.form:
|
||||
form = document._config.form
|
||||
|
||||
# Only extract sections for forms that have standard sections
|
||||
if not form or form not in ['10-K', '10-Q', '8-K']:
|
||||
return {} # No filing type or unsupported form = no section detection
|
||||
|
||||
# Get patterns for filing type
|
||||
patterns = self.SECTION_PATTERNS.get(form, {})
|
||||
if not patterns:
|
||||
return {} # No patterns defined for this form type
|
||||
|
||||
# Find section headers
|
||||
headers = self._find_section_headers(document)
|
||||
|
||||
# For 10-Q, detect Part I/Part II boundaries
|
||||
part_context = None
|
||||
if form == '10-Q':
|
||||
part_context = self._detect_10q_parts(headers)
|
||||
|
||||
# Match headers to sections
|
||||
sections = self._match_sections(headers, patterns, document, part_context)
|
||||
|
||||
# Create section objects
|
||||
return self._create_sections(sections, document)
|
||||
|
||||
# NOTE: _detect_form() removed - form type should be known from context
|
||||
# Filing metadata should be set by the caller (Filing class, TenK/TenQ, etc.)
|
||||
|
||||
# NOTE: _infer_form_from_headers() kept for backward compatibility but not used
|
||||
# in normal flow anymore. Form type should always be provided explicitly.
|
||||
def _infer_form_from_headers(self, document: Document) -> str:
|
||||
"""
|
||||
Infer filing type from section headers.
|
||||
|
||||
NOTE: This method is kept for backward compatibility but should not be used
|
||||
in the normal flow. Form type should be explicitly provided via config or metadata.
|
||||
"""
|
||||
headers = document.headings
|
||||
header_texts = [h.text().upper() for h in headers if h.text()]
|
||||
|
||||
# Check for 10-K specific sections
|
||||
has_10k_sections = any(
|
||||
'ITEM 1.' in text or 'ITEM 1A.' in text or 'ITEM 7.' in text or 'ITEM 8.' in text
|
||||
for text in header_texts
|
||||
)
|
||||
|
||||
# Check for 10-Q specific sections
|
||||
has_10q_sections = any(
|
||||
('ITEM 1.' in text and 'FINANCIAL STATEMENTS' in text) or
|
||||
('ITEM 2.' in text and 'MANAGEMENT' in text) or
|
||||
'ITEM 3.' in text or 'ITEM 4.' in text
|
||||
for text in header_texts
|
||||
)
|
||||
|
||||
# Check for 8-K specific sections
|
||||
has_8k_sections = any(
|
||||
re.search(r'ITEM \d\.\d{2}', text) for text in header_texts
|
||||
)
|
||||
|
||||
if has_10k_sections and not has_10q_sections:
|
||||
return '10-K'
|
||||
elif has_10q_sections:
|
||||
return '10-Q'
|
||||
elif has_8k_sections:
|
||||
return '8-K'
|
||||
else:
|
||||
return 'UNKNOWN'
|
||||
|
||||
def _get_general_patterns(self) -> Dict[str, List[Tuple[str, str]]]:
|
||||
"""Get general section patterns."""
|
||||
return {
|
||||
'business': [
|
||||
(r'^Business', 'Business'),
|
||||
(r'^Overview', 'Overview'),
|
||||
(r'^Company', 'Company')
|
||||
],
|
||||
'financial': [
|
||||
(r'^Financial\s+Statements', 'Financial Statements'),
|
||||
(r'^Consolidated.*Statements', 'Consolidated Statements')
|
||||
],
|
||||
'notes': [
|
||||
(r'^Notes\s+to.*Financial\s+Statements', 'Notes to Financial Statements'),
|
||||
(r'^Notes\s+to.*Statements', 'Notes')
|
||||
]
|
||||
}
|
||||
|
||||
def _find_section_headers(self, document: Document) -> List[Tuple[Node, str, int]]:
|
||||
"""Find all potential section headers."""
|
||||
headers = []
|
||||
|
||||
# Find all heading nodes
|
||||
heading_nodes = document.root.find(lambda n: isinstance(n, HeadingNode))
|
||||
|
||||
for node in heading_nodes:
|
||||
text = node.text()
|
||||
if text:
|
||||
# Get position in document
|
||||
position = self._get_node_position(node, document)
|
||||
headers.append((node, text, position))
|
||||
|
||||
# Also check for section nodes
|
||||
section_nodes = document.root.find(lambda n: isinstance(n, SectionNode))
|
||||
for node in section_nodes:
|
||||
# Get first heading in section
|
||||
first_heading = node.find_first(lambda n: isinstance(n, HeadingNode))
|
||||
if first_heading:
|
||||
text = first_heading.text()
|
||||
if text:
|
||||
position = self._get_node_position(node, document)
|
||||
headers.append((node, text, position))
|
||||
|
||||
# Sort by position
|
||||
headers.sort(key=lambda x: x[2])
|
||||
|
||||
return headers
|
||||
|
||||
def _get_node_position(self, node: Node, document: Document) -> int:
|
||||
"""Get position of node in document."""
|
||||
position = 0
|
||||
for n in document.root.walk():
|
||||
if n == node:
|
||||
return position
|
||||
position += 1
|
||||
return position
|
||||
|
||||
def _detect_10q_parts(self, headers: List[Tuple[Node, str, int]]) -> Dict[int, str]:
|
||||
"""
|
||||
Detect Part I and Part II boundaries in 10-Q filings.
|
||||
|
||||
Args:
|
||||
headers: List of (node, text, position) tuples
|
||||
|
||||
Returns:
|
||||
Dict mapping header index to part name ("Part I" or "Part II")
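
Example (illustrative): for headers ["PART I", "Item 1. Financial Statements", "PART II", "Item 1. Legal Proceedings"], the result is {0: "Part I", 1: "Part I", 2: "Part II", 3: "Part II"}.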
|
||||
"""
|
||||
part_context = {}
|
||||
current_part = None
|
||||
|
||||
part_i_pattern = re.compile(r'^\s*PART\s+I\b', re.IGNORECASE)
|
||||
part_ii_pattern = re.compile(r'^\s*PART\s+II\b', re.IGNORECASE)
|
||||
|
||||
for i, (node, text, position) in enumerate(headers):
|
||||
text_stripped = text.strip()
|
||||
|
||||
# Check if this is a Part I or Part II header
|
||||
if part_i_pattern.match(text_stripped):
|
||||
current_part = "Part I"
|
||||
part_context[i] = current_part
|
||||
elif part_ii_pattern.match(text_stripped):
|
||||
current_part = "Part II"
|
||||
part_context[i] = current_part
|
||||
elif current_part:
|
||||
# Headers after a Part declaration belong to that part
|
||||
part_context[i] = current_part
|
||||
|
||||
return part_context
|
||||
|
||||
def _match_sections(self,
|
||||
headers: List[Tuple[Node, str, int]],
|
||||
patterns: Dict[str, List[Tuple[str, str]]],
|
||||
document: Document,
|
||||
part_context: Optional[Dict[int, str]] = None) -> Dict[str, Tuple[Node, str, int, int]]:
|
||||
"""Match headers to section patterns."""
|
||||
matched_sections = {}
|
||||
used_headers = set()
|
||||
|
||||
# Try to match each pattern
|
||||
for section_name, section_patterns in patterns.items():
|
||||
for pattern, title in section_patterns:
|
||||
for i, (node, text, position) in enumerate(headers):
|
||||
if i in used_headers:
|
||||
continue
|
||||
|
||||
# Try to match pattern
|
||||
if re.match(pattern, text.strip(), re.IGNORECASE):
|
||||
# Find end position (next section or end of document)
|
||||
end_position = self._find_section_end(i, headers, document)
|
||||
|
||||
# For 10-Q, prefix with Part I or Part II
|
||||
final_title = title
|
||||
if part_context and i in part_context:
|
||||
final_title = f"{part_context[i]} - {title}"
|
||||
|
||||
# Use final_title as key to avoid conflicts
|
||||
section_key = final_title if part_context and i in part_context else section_name
|
||||
matched_sections[section_key] = (node, final_title, position, end_position)
|
||||
used_headers.add(i)
|
||||
break
|
||||
|
||||
# If we found a match, move to next section
|
||||
if section_name in matched_sections:
|
||||
break
|
||||
|
||||
return matched_sections
|
||||
|
||||
def _find_section_end(self,
|
||||
section_index: int,
|
||||
headers: List[Tuple[Node, str, int]],
|
||||
document: Document) -> int:
|
||||
"""Find where section ends."""
|
||||
# Next section starts where next header at same or higher level begins
|
||||
if section_index + 1 < len(headers):
|
||||
current_node = headers[section_index][0]
|
||||
current_level = current_node.level if isinstance(current_node, HeadingNode) else 1
|
||||
|
||||
for i in range(section_index + 1, len(headers)):
|
||||
next_node = headers[i][0]
|
||||
next_level = next_node.level if isinstance(next_node, HeadingNode) else 1
|
||||
|
||||
# If next header is at same or higher level, that's our end
|
||||
if next_level <= current_level:
|
||||
return headers[i][2]
|
||||
|
||||
# Otherwise, section goes to end of document
|
||||
return sum(1 for _ in document.root.walk())
|
||||
|
||||
def _create_sections(self,
|
||||
matched_sections: Dict[str, Tuple[Node, str, int, int]],
|
||||
document: Document) -> Dict[str, Section]:
|
||||
"""Create Section objects from matches."""
|
||||
sections = {}
|
||||
|
||||
for section_name, (node, title, start_pos, end_pos) in matched_sections.items():
|
||||
# Create section node containing all content in range
|
||||
section_node = SectionNode(section_name=section_name)
|
||||
|
||||
# Find all nodes in position range
|
||||
position = 0
|
||||
for n in document.root.walk():
|
||||
if start_pos <= position < end_pos:
|
||||
# Clone node and add to section
|
||||
# (In real implementation, would properly handle node hierarchy)
|
||||
section_node.add_child(n)
|
||||
position += 1
|
||||
|
||||
# Parse section name to extract part and item identifiers
|
||||
part, item = Section.parse_section_name(section_name)
|
||||
|
||||
# Create Section object
|
||||
section = Section(
|
||||
name=section_name,
|
||||
title=title,
|
||||
node=section_node,
|
||||
start_offset=start_pos,
|
||||
end_offset=end_pos,
|
||||
confidence=0.7, # Pattern-based detection = moderate confidence
|
||||
detection_method='pattern', # Method: regex pattern matching
|
||||
part=part,
|
||||
item=item
|
||||
)
|
||||
|
||||
sections[section_name] = section
|
||||
|
||||
return sections
|
||||
@@ -0,0 +1,348 @@
|
||||
"""
|
||||
Text extraction from documents with various options.
|
||||
"""
|
||||
|
||||
import re
|
||||
from typing import List, Optional, Set
|
||||
from edgar.documents.document import Document
|
||||
from edgar.documents.nodes import Node, TextNode, HeadingNode, ParagraphNode
|
||||
from edgar.documents.table_nodes import TableNode
|
||||
from edgar.documents.types import NodeType
|
||||
|
||||
|
||||
class TextExtractor:
|
||||
"""
|
||||
Extracts text from documents with configurable options.
|
||||
|
||||
Supports:
|
||||
- Clean text extraction for AI/NLP
|
||||
- Table inclusion/exclusion
|
||||
- Metadata annotations
|
||||
- Length limiting
|
||||
- Smart whitespace handling
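
Example (illustrative; assumes a parsed Document):
>>> extractor = TextExtractor(clean=True, include_tables=True, max_length=50000)
>>> text = extractor.extract(document)
>>> print(text[:500])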
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
clean: bool = True,
|
||||
include_tables: bool = True,
|
||||
include_metadata: bool = False,
|
||||
include_links: bool = False,
|
||||
max_length: Optional[int] = None,
|
||||
preserve_structure: bool = False):
|
||||
"""
|
||||
Initialize text extractor.
|
||||
|
||||
Args:
|
||||
clean: Clean and normalize text
|
||||
include_tables: Include table content
|
||||
include_metadata: Include metadata annotations
|
||||
include_links: Include link URLs
|
||||
max_length: Maximum text length
|
||||
preserve_structure: Preserve document structure with markers
|
||||
"""
|
||||
self.clean = clean
|
||||
self.include_tables = include_tables
|
||||
self.include_metadata = include_metadata
|
||||
self.include_links = include_links
|
||||
self.max_length = max_length
|
||||
self.preserve_structure = preserve_structure
|
||||
|
||||
# Track what we've extracted to avoid duplicates
|
||||
self._extracted_ids: Set[str] = set()
|
||||
|
||||
def extract(self, document: Document) -> str:
|
||||
"""
|
||||
Extract text from document.
|
||||
|
||||
Args:
|
||||
document: Document to extract from
|
||||
|
||||
Returns:
|
||||
Extracted text
|
||||
"""
|
||||
parts = []
|
||||
self._extracted_ids.clear()
|
||||
|
||||
# Extract from root
|
||||
self._extract_from_node(document.root, parts, depth=0)
|
||||
|
||||
# Join parts
|
||||
if self.preserve_structure:
|
||||
text = '\n'.join(parts)
|
||||
else:
|
||||
text = '\n\n'.join(filter(None, parts))
|
||||
|
||||
# Apply minimal global cleaning - tables are already handled appropriately per node
|
||||
if self.clean:
|
||||
text = self._clean_document_text(text)
|
||||
|
||||
# Limit length if requested
|
||||
if self.max_length and len(text) > self.max_length:
|
||||
text = self._truncate_text(text, self.max_length)
|
||||
|
||||
return text
|
||||
|
||||
def extract_from_node(self, node: Node) -> str:
|
||||
"""Extract text from a specific node."""
|
||||
parts = []
|
||||
self._extracted_ids.clear()
|
||||
self._extract_from_node(node, parts, depth=0)
|
||||
|
||||
text = '\n\n'.join(filter(None, parts))
|
||||
|
||||
if self.clean:
|
||||
text = self._clean_document_text(text)
|
||||
|
||||
return text
|
||||
|
||||
def _extract_from_node(self, node: Node, parts: List[str], depth: int):
|
||||
"""Recursively extract text from node - render each node type appropriately."""
|
||||
# Skip if already extracted (handles shared nodes)
|
||||
if node.id in self._extracted_ids:
|
||||
return
|
||||
self._extracted_ids.add(node.id)
|
||||
|
||||
# Handle based on node type - like old parser's block.get_text()
|
||||
if isinstance(node, TableNode):
|
||||
if self.include_tables:
|
||||
# Tables render themselves - preserve their formatting
|
||||
self._extract_table(node, parts)
|
||||
|
||||
elif isinstance(node, HeadingNode):
|
||||
# Headings get cleaned text
|
||||
self._extract_heading(node, parts, depth)
|
||||
|
||||
elif isinstance(node, TextNode):
|
||||
# Text nodes get cleaned if cleaning is enabled
|
||||
text = node.text()
|
||||
if text:
|
||||
if self.clean:
|
||||
text = self._clean_text_content(text) # Clean non-table text
|
||||
if self.include_metadata and node.metadata:
|
||||
text = self._annotate_with_metadata(text, node.metadata)
|
||||
parts.append(text)
|
||||
|
||||
elif isinstance(node, ParagraphNode):
|
||||
# Extract paragraph as unified text to maintain flow of inline elements
|
||||
text = node.text()
|
||||
if text:
|
||||
if self.clean:
|
||||
text = self._clean_text_content(text)
|
||||
if self.include_metadata and node.metadata:
|
||||
text = self._annotate_with_metadata(text, node.metadata)
|
||||
parts.append(text)
|
||||
# Don't process children since we already got the paragraph text
|
||||
return
|
||||
|
||||
else:
|
||||
# Check if this looks like a bullet point container that should flow together
|
||||
if self._is_bullet_point_container(node):
|
||||
# Extract text from bullet point children and join with spaces (not newlines)
|
||||
bullet_parts = []
|
||||
for child in node.children:
|
||||
child_text = child.text() if hasattr(child, 'text') else ""
|
||||
if child_text and child_text.strip():
|
||||
bullet_parts.append(child_text.strip())
|
||||
|
||||
if bullet_parts:
|
||||
# Join with spaces for bullet points
|
||||
text = ' '.join(bullet_parts)
|
||||
if self.clean:
|
||||
text = self._clean_text_content(text)
|
||||
if self.include_metadata and node.metadata:
|
||||
text = self._annotate_with_metadata(text, node.metadata)
|
||||
parts.append(text)
|
||||
# Don't process children since we already got the unified text
|
||||
return
|
||||
|
||||
# For other nodes, extract text content and clean if appropriate
|
||||
if hasattr(node, 'content') and isinstance(node.content, str):
|
||||
text = node.content
|
||||
if text and text.strip():
|
||||
if self.clean:
|
||||
text = self._clean_text_content(text) # Clean non-table text
|
||||
if self.include_metadata and node.metadata:
|
||||
text = self._annotate_with_metadata(text, node.metadata)
|
||||
parts.append(text)
|
||||
|
||||
# Process children
|
||||
for child in node.children:
|
||||
self._extract_from_node(child, parts, depth + 1)
|
||||
|
||||
def _extract_heading(self, node: HeadingNode, parts: List[str], depth: int):
|
||||
"""Extract heading with optional structure markers."""
|
||||
text = node.text()
|
||||
if not text:
|
||||
return
|
||||
|
||||
if self.preserve_structure:
|
||||
# Add structure markers
|
||||
marker = '#' * node.level
|
||||
text = f"{marker} {text}"
|
||||
|
||||
if self.include_metadata and node.metadata:
|
||||
text = self._annotate_with_metadata(text, node.metadata)
|
||||
|
||||
parts.append(text)
|
||||
|
||||
def _extract_table(self, table: TableNode, parts: List[str]):
|
||||
"""Extract table content - preserve original formatting like old parser."""
|
||||
if self.preserve_structure:
|
||||
parts.append("[TABLE START]")
|
||||
|
||||
# Add table caption if present
|
||||
if table.caption:
|
||||
caption_text = table.caption
|
||||
if self.clean:
|
||||
caption_text = self._clean_text_content(caption_text) # Clean caption but not table content
|
||||
if self.preserve_structure:
|
||||
parts.append(f"Caption: {caption_text}")
|
||||
else:
|
||||
parts.append(caption_text)
|
||||
|
||||
# Extract table text - PRESERVE FORMATTING (like old parser's TableBlock.get_text())
|
||||
table_text = table.text()
|
||||
if table_text:
|
||||
# Tables render their own formatting - don't apply text cleaning to preserve alignment
|
||||
parts.append(table_text) # Keep original spacing and alignment
|
||||
|
||||
if self.preserve_structure:
|
||||
parts.append("[TABLE END]")
|
||||
|
||||
def _annotate_with_metadata(self, text: str, metadata: dict) -> str:
|
||||
"""Add metadata annotations to text."""
|
||||
annotations = []
|
||||
|
||||
# Add XBRL annotations
|
||||
if 'ix_tag' in metadata:
|
||||
annotations.append(f"[XBRL: {metadata['ix_tag']}]")
|
||||
|
||||
# Add section annotations
|
||||
if 'section_name' in metadata:
|
||||
annotations.append(f"[Section: {metadata['section_name']}]")
|
||||
|
||||
# Add semantic type
|
||||
if 'semantic_type' in metadata:
|
||||
annotations.append(f"[Type: {metadata['semantic_type']}]")
|
||||
|
||||
if annotations:
|
||||
return f"{' '.join(annotations)} {text}"
|
||||
|
||||
return text
|
||||
|
||||
def _clean_text_content(self, text: str) -> str:
|
||||
"""Clean regular text content (not tables) - like old parser text cleaning."""
|
||||
if not text:
|
||||
return text
|
||||
|
||||
# Replace multiple spaces with single space for regular text
|
||||
text = re.sub(r' {2,}', ' ', text)
|
||||
|
||||
# Clean up space around newlines
|
||||
text = re.sub(r' *\n *', '\n', text)
|
||||
|
||||
# Remove leading/trailing whitespace from lines
|
||||
lines = text.split('\n')
|
||||
lines = [line.strip() for line in lines]
|
||||
text = '\n'.join(lines)
|
||||
|
||||
# Normalize quotes and dashes
|
||||
text = self._normalize_punctuation(text)
|
||||
|
||||
return text
|
||||
|
||||
def _is_bullet_point_container(self, node) -> bool:
|
||||
"""Check if a container node represents a bullet point that should flow as one line."""
|
||||
from edgar.documents.nodes import ContainerNode
|
||||
|
||||
if not isinstance(node, ContainerNode):
|
||||
return False
|
||||
|
||||
# Must have at least 2 children (bullet + content)
|
||||
if len(node.children) < 2:
|
||||
return False
|
||||
|
||||
# Get the text of all children to check for bullet patterns
|
||||
all_text = node.text()
|
||||
if not all_text:
|
||||
return False
|
||||
|
||||
# Check if starts with common bullet characters
|
||||
bullet_chars = ['•', '●', '▪', '▫', '◦', '‣', '-', '*']
|
||||
starts_with_bullet = any(all_text.strip().startswith(char) for char in bullet_chars)
|
||||
|
||||
if not starts_with_bullet:
|
||||
return False
|
||||
|
||||
# Check if container has flex display (common for bullet point layouts)
|
||||
if hasattr(node, 'style') and node.style and hasattr(node.style, 'display'):
|
||||
if node.style.display == 'flex':
|
||||
return True
|
||||
|
||||
# Check if it has bullet-like structure: short first child + longer content
|
||||
if len(node.children) >= 2:
|
||||
first_child_text = node.children[0].text() if hasattr(node.children[0], 'text') else ""
|
||||
second_child_text = node.children[1].text() if hasattr(node.children[1], 'text') else ""
|
||||
|
||||
# First child is very short (likely bullet), second is longer (content)
|
||||
if len(first_child_text.strip()) <= 3 and len(second_child_text.strip()) > 10:
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
def _clean_document_text(self, text: str) -> str:
|
||||
"""Apply minimal document-level cleaning that preserves table formatting."""
|
||||
if not text:
|
||||
return text
|
||||
|
||||
# Only apply global formatting that doesn't affect table alignment:
|
||||
|
||||
# Replace excessive newlines (4+ consecutive) with triple newline
|
||||
text = re.sub(r'\n{4,}', '\n\n\n', text)
|
||||
|
||||
# Remove empty lines at start/end only
|
||||
text = text.strip()
|
||||
|
||||
return text
|
||||
|
||||
def _normalize_punctuation(self, text: str) -> str:
|
||||
"""Normalize punctuation for cleaner text."""
|
||||
# Normalize quotes
|
||||
text = text.replace('\u201c', '"').replace('\u201d', '"')  # curly double quotes -> straight
text = text.replace('\u2018', "'").replace('\u2019', "'")  # curly single quotes -> straight
|
||||
|
||||
# Normalize dashes
|
||||
text = text.replace('—', ' - ') # em dash
|
||||
text = text.replace('–', ' - ') # en dash
|
||||
|
||||
# Fix spacing around punctuation
|
||||
text = re.sub(r'\s+([.,;!?])', r'\1', text)
|
||||
text = re.sub(r'([.,;!?])\s*', r'\1 ', text)
|
||||
|
||||
# Remove extra spaces
|
||||
text = re.sub(r' {2,}', ' ', text)
|
||||
|
||||
return text.strip()
|
||||
|
||||
def _truncate_text(self, text: str, max_length: int) -> str:
|
||||
"""Truncate text intelligently."""
|
||||
if len(text) <= max_length:
|
||||
return text
|
||||
|
||||
# Try to truncate at sentence boundary
|
||||
truncated = text[:max_length]
|
||||
last_period = truncated.rfind('.')
|
||||
last_newline = truncated.rfind('\n')
|
||||
|
||||
# Choose the better truncation point
|
||||
truncate_at = max(last_period, last_newline)
|
||||
if truncate_at > max_length * 0.8: # If we found a good boundary
|
||||
return text[:truncate_at + 1].strip()
|
||||
|
||||
# Otherwise truncate at word boundary
|
||||
last_space = truncated.rfind(' ')
|
||||
if last_space > max_length * 0.9:
|
||||
return text[:last_space].strip() + '...'
|
||||
|
||||
# Last resort: hard truncate
|
||||
return text[:max_length - 3].strip() + '...'
|
||||
@@ -0,0 +1,178 @@
|
||||
"""
|
||||
TOC-based section detection strategy.
|
||||
|
||||
Detects sections using Table of Contents structure. Provides highest
|
||||
confidence (0.95) and includes full text extraction capabilities.
|
||||
|
||||
This detector wraps SECSectionExtractor which has proven implementations of:
|
||||
- Multi-column TOC support (checks all preceding table cells)
|
||||
- Nested anchor handling (traverses up to find content container)
|
||||
- Full section text extraction
|
||||
"""
|
||||
|
||||
import logging
|
||||
from typing import Dict, Optional
|
||||
|
||||
from edgar.documents.document import Document, Section
|
||||
from edgar.documents.nodes import SectionNode
|
||||
from edgar.documents.extractors.toc_section_extractor import SECSectionExtractor
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class TOCSectionDetector:
|
||||
"""
|
||||
TOC-based section detection strategy.
|
||||
|
||||
Uses Table of Contents structure to identify section boundaries and
|
||||
extract full section content. Provides high confidence (0.95) detection.
|
||||
|
||||
This implementation wraps the proven SECSectionExtractor which includes:
|
||||
- Multi-column TOC support for edge cases like Morgan Stanley
|
||||
- Nested anchor handling for sections with no sibling content
|
||||
- Complete text extraction with proper boundary detection
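
Example (illustrative; assumes document.metadata.original_html is populated):
>>> detector = TOCSectionDetector(document)
>>> sections = detector.detect()
>>> if sections:
...     for name, section in sections.items():
...         print(name, section.confidence, section.detection_method)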
|
||||
"""
|
||||
|
||||
def __init__(self, document: Document):
|
||||
"""
|
||||
Initialize TOC-based detector.
|
||||
|
||||
Args:
|
||||
document: Document to analyze (must have metadata.original_html)
|
||||
"""
|
||||
self.document = document
|
||||
self.extractor = SECSectionExtractor(document)
|
||||
|
||||
def detect(self) -> Optional[Dict[str, Section]]:
|
||||
"""
|
||||
Detect sections using TOC structure.
|
||||
|
||||
Returns:
|
||||
Dictionary mapping section names to Section objects, or None if unavailable
|
||||
|
||||
Note:
|
||||
Requires document.metadata.original_html to be available.
|
||||
Returns None if HTML is not available or no sections found.
|
||||
"""
|
||||
# Check if original HTML is available
|
||||
html_content = getattr(self.document.metadata, 'original_html', None)
|
||||
if not html_content:
|
||||
logger.debug("TOC detection unavailable: original_html not in document metadata")
|
||||
return None
|
||||
|
||||
try:
|
||||
# Get available sections from TOC
|
||||
available = self.extractor.get_available_sections()
|
||||
if not available:
|
||||
logger.debug("No sections found in TOC")
|
||||
return None
|
||||
|
||||
sections = {}
|
||||
|
||||
# Extract each section
|
||||
for section_name in available:
|
||||
# Get section metadata first to check for subsections
|
||||
section_info = self.extractor.get_section_info(section_name)
|
||||
if not section_info:
|
||||
logger.debug(f"Skipping {section_name}: no section info")
|
||||
continue
|
||||
|
||||
# Get section text (may be empty for container sections)
|
||||
section_text = self.extractor.get_section_text(section_name, include_subsections=True)
|
||||
|
||||
# Check if this section has subsections
|
||||
has_subsections = section_info.get('subsections', [])
|
||||
|
||||
if not section_text and not has_subsections:
|
||||
# Skip only if no text AND no subsections
|
||||
logger.debug(f"Skipping {section_name}: no text and no subsections")
|
||||
continue
|
||||
|
||||
# Create section node (placeholder - actual content extracted lazily)
|
||||
section_node = SectionNode(section_name=section_name)
|
||||
|
||||
# For container sections (Item 1, Item 10), text will include all subsections
|
||||
section_length = len(section_text) if section_text else 0
|
||||
|
||||
# Create text extractor callback for lazy loading
|
||||
def make_text_extractor(extractor, name):
|
||||
"""Create a closure that captures extractor and section name."""
|
||||
def extract_text(section_name=None, **kwargs):
|
||||
# Use captured name, ignore passed section_name
|
||||
clean = kwargs.get('clean', True)
|
||||
return extractor.get_section_text(name, include_subsections=True, clean=clean) or ""
|
||||
return extract_text
|
||||
|
||||
# Parse section name to extract part and item identifiers
|
||||
part, item = Section.parse_section_name(section_name)
|
||||
|
||||
# Create Section with TOC confidence
|
||||
section = Section(
|
||||
name=section_name,
|
||||
title=section_info.get('canonical_name', section_name),
|
||||
node=section_node,
|
||||
start_offset=0, # Would need actual offsets from parsing
|
||||
end_offset=section_length,
|
||||
confidence=0.95, # TOC-based = high confidence
|
||||
detection_method='toc',
|
||||
part=part,
|
||||
item=item,
|
||||
_text_extractor=make_text_extractor(self.extractor, section_name)
|
||||
)
|
||||
|
||||
sections[section_name] = section
|
||||
|
||||
if sections:
|
||||
logger.info(f"TOC detection found {len(sections)} sections")
|
||||
return sections
|
||||
|
||||
return None
|
||||
|
||||
except Exception as e:
|
||||
logger.warning(f"TOC detection failed: {e}", exc_info=True)
|
||||
return None
|
||||
|
||||
|
||||
def get_section_text(document: Document, section_name: str) -> Optional[str]:
|
||||
"""
|
||||
Get section text using TOC-based extraction.
|
||||
|
||||
Args:
|
||||
document: Document to extract from
|
||||
section_name: Section name (e.g., 'Item 1', 'Item 1A')
|
||||
|
||||
Returns:
|
||||
Section text if available, None otherwise
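
Example (illustrative):
>>> text = get_section_text(document, 'Item 1A')
>>> if text:
...     print(text[:200])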
|
||||
"""
|
||||
html_content = getattr(document.metadata, 'original_html', None)
|
||||
if not html_content:
|
||||
return None
|
||||
|
||||
try:
|
||||
extractor = SECSectionExtractor(document)
|
||||
return extractor.get_section_text(section_name)
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to get section text for {section_name}: {e}")
|
||||
return None


def get_available_sections(document: Document) -> list[str]:
    """
    Get list of available sections from TOC.

    Args:
        document: Document to analyze

    Returns:
        List of section names found in TOC
    """
    html_content = getattr(document.metadata, 'original_html', None)
    if not html_content:
        return []

    try:
        extractor = SECSectionExtractor(document)
        return extractor.get_available_sections()
    except Exception as e:
        logger.warning(f"Failed to get available sections: {e}")
        return []
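
# Usage sketch (hypothetical `document`; section names depend on the filing's TOC):
#
#   for name in get_available_sections(document):
#       text = get_section_text(document, name)
#       print(name, len(text or ""))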
@@ -0,0 +1,383 @@
"""
Section extraction for SEC filings using Table of Contents analysis.

This system uses TOC structure to extract specific sections like "Item 1",
"Item 1A", etc. from SEC filings. This approach works consistently across
all SEC filings regardless of whether they use semantic anchors or generated IDs.
"""
import re
from typing import Dict, List, Optional, Tuple, Set
from dataclasses import dataclass
from lxml import html as lxml_html

from edgar.documents.nodes import Node, SectionNode
from edgar.documents.document import Document
from edgar.documents.utils.toc_analyzer import TOCAnalyzer


@dataclass
class SectionBoundary:
    """Represents the boundaries of a document section."""
    name: str
    anchor_id: str
    start_element_id: Optional[str] = None
    end_element_id: Optional[str] = None
    start_node: Optional[Node] = None
    end_node: Optional[Node] = None
    text_start: Optional[int] = None  # Character position in full text
    text_end: Optional[int] = None
    confidence: float = 1.0  # Detection confidence (0.0-1.0)
    detection_method: str = 'unknown'  # How section was detected
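
    # Example (illustrative values, not from a real filing): a boundary running from
    # the "Item 1A" anchor up to the "Item 1B" anchor would be
    #   SectionBoundary(name="Item 1A", anchor_id="item1a", end_element_id="item1b",
    #                   confidence=0.95, detection_method='toc')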


class SECSectionExtractor:
    """
    Extract specific sections from SEC filings using Table of Contents analysis.

    This uses TOC structure to identify section boundaries and extract content
    between them. Works consistently for all SEC filings.
    """

    def __init__(self, document: Document):
        self.document = document
        self.section_map = {}  # Maps section names to canonical names
        self.section_boundaries = {}  # Maps section names to boundaries
        self.toc_analyzer = TOCAnalyzer()
        self._analyze_sections()

    def _analyze_sections(self) -> None:
        """
        Analyze the document using TOC structure to identify section boundaries.

        This creates a map of section names to their anchor positions using
        Table of Contents analysis, which works for all SEC filings.
        """
        # Get the original HTML if available
        html_content = getattr(self.document.metadata, 'original_html', None)
        if not html_content:
            return

        # Use TOC analysis to find sections
        toc_mapping = self.toc_analyzer.analyze_toc_structure(html_content)

        if not toc_mapping:
            return  # No sections found

        # Handle XML declaration issues
        if html_content.startswith('<?xml'):
            html_content = re.sub(r'<\?xml[^>]*\?>', '', html_content, count=1)

        tree = lxml_html.fromstring(html_content)

        sec_sections = {}

        for section_name, anchor_id in toc_mapping.items():
            # Verify the anchor target exists
            target_elements = tree.xpath(f'//*[@id="{anchor_id}"]')
            if target_elements:
                element = target_elements[0]

                # Use TOC-based section info
                section_type, order = self.toc_analyzer._get_section_type_and_order(section_name)

                sec_sections[section_name] = {
                    'anchor_id': anchor_id,
                    'element': element,
                    'canonical_name': section_name,
                    'type': section_type,
                    'order': order,
                    'confidence': 0.95,  # TOC-based detection = high confidence
                    'detection_method': 'toc'  # Method: Table of Contents
                }

        if not sec_sections:
            return  # No valid sections found

        # Sort sections by their logical order
        sorted_sections = sorted(sec_sections.items(), key=lambda x: x[1]['order'])

        # Calculate section boundaries
        for i, (section_name, section_data) in enumerate(sorted_sections):
            start_anchor = section_data['anchor_id']

            # End boundary is the start of the next section (if any)
            end_anchor = None
            if i + 1 < len(sorted_sections):
                next_section = sorted_sections[i + 1][1]
                end_anchor = next_section['anchor_id']

            self.section_boundaries[section_name] = SectionBoundary(
                name=section_name,
                anchor_id=start_anchor,
                end_element_id=end_anchor,
                confidence=section_data.get('confidence', 0.95),
                detection_method=section_data.get('detection_method', 'toc')
            )

        self.section_map = {name: data['canonical_name'] for name, data in sec_sections.items()}
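
    # After analysis (sketch of the resulting state): self.section_boundaries maps a
    # section name such as "Item 1A" to a SectionBoundary whose anchor_id is its TOC
    # target and whose end_element_id is the next section's anchor, while
    # self.section_map maps each detected name to its canonical name.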

    def get_available_sections(self) -> List[str]:
        """
        Get list of available sections that can be extracted.

        Returns:
            List of section names
        """
        return sorted(self.section_boundaries.keys(),
                      key=lambda x: self.section_boundaries[x].anchor_id)
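
    # Example: for a typical 10-K this might return entries such as
    # ['Item 1', 'Item 1A', 'Item 2', ...], ordered by anchor id.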

    def get_section_text(self, section_name: str,
                         include_subsections: bool = True,
                         clean: bool = True) -> Optional[str]:
        """
        Extract text content for a specific section.

        Args:
            section_name: Name of section (e.g., "Item 1", "Item 1A", "Part I")
            include_subsections: Whether to include subsections
            clean: Whether to apply text cleaning

        Returns:
            Section text content or None if section not found
        """
        # Normalize section name
        normalized_name = self._normalize_section_name(section_name)

        if normalized_name not in self.section_boundaries:
            return None

        boundary = self.section_boundaries[normalized_name]

        # Extract content between boundaries using HTML parsing
        html_content = getattr(self.document.metadata, 'original_html', None)
        if not html_content:
            return None

        try:
            section_text = self._extract_section_content(html_content, boundary, include_subsections, clean)

            # If no direct content but include_subsections=True, aggregate subsection text
            if not section_text and include_subsections:
                subsections = self._get_subsections(normalized_name)
                if subsections:
                    # Recursively get text from all subsections
                    subsection_texts = []
                    for subsection_name in subsections:
                        subsection_text = self.get_section_text(subsection_name, include_subsections=True, clean=clean)
                        if subsection_text:
                            subsection_texts.append(subsection_text)

                    if subsection_texts:
                        section_text = '\n\n'.join(subsection_texts)

            return section_text
        except Exception as e:
            # Fallback to simple text extraction
            return self._extract_section_fallback(section_name, clean)
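
    # Usage sketch (hypothetical `document`):
    #   extractor = SECSectionExtractor(document)
    #   mdna = extractor.get_section_text("item 7")   # name is normalized to "Item 7"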

    def _normalize_section_name(self, section_name: str) -> str:
        """Normalize section name for lookup."""
        # Handle common variations
        name = section_name.strip()

        # "Item 1" vs "Item 1." vs "Item 1:"
        name = re.sub(r'[.:]$', '', name)

        # Case normalization
        if re.match(r'item\s+\d+', name, re.IGNORECASE):
            match = re.match(r'item\s+(\d+[a-z]?)', name, re.IGNORECASE)
            if match:
                name = f"Item {match.group(1).upper()}"
        elif re.match(r'part\s+[ivx]+', name, re.IGNORECASE):
            match = re.match(r'part\s+([ivx]+)', name, re.IGNORECASE)
            if match:
                name = f"Part {match.group(1).upper()}"

        return name
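
    # Normalization examples (derived from the regexes above):
    #   "item 1a."  -> "Item 1A"
    #   "Item 1:"   -> "Item 1"
    #   "part ii"   -> "Part II"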

    def _extract_section_content(self, html_content: str, boundary: SectionBoundary,
                                 include_subsections: bool, clean: bool) -> str:
        """
        Extract section content from HTML between anchors.

        Args:
            html_content: Full HTML content
            boundary: Section boundary info
            include_subsections: Whether to include subsections
            clean: Whether to clean the text

        Returns:
            Extracted section text
        """
        # Handle XML declaration issues
        if html_content.startswith('<?xml'):
            html_content = re.sub(r'<\?xml[^>]*\?>', '', html_content, count=1)

        tree = lxml_html.fromstring(html_content)

        # Find start element
        start_elements = tree.xpath(f'//*[@id="{boundary.anchor_id}"]')
        if not start_elements:
            return ""

        start_element = start_elements[0]

        # Collect content until we hit the end boundary (if specified)
        content_elements = []

        # If anchor has no siblings (nested in empty container), traverse up to find content container
        # This handles cases like <div id="item7"><div></div></div> where content is after the container
        current = start_element.getnext()
        if current is None:
            # No sibling - traverse up to find a container with siblings
            container = start_element.getparent()
            while container is not None and container.getnext() is None:
                container = container.getparent()

            # Start from the container's next sibling if found
            if container is not None:
                current = container.getnext()

        # Collect content from siblings
        if current is not None:
            # Normal case - anchor has siblings
            while current is not None:
                # Read the id up front so both boundary checks below can use it
                current_id = current.get('id', '')

                # Check if we've reached the end boundary
                if boundary.end_element_id and current_id == boundary.end_element_id:
                    break

                # Also check if this is a sibling section we should stop at
                if not include_subsections and self._is_sibling_section(current_id, boundary.name):
                    break

                content_elements.append(current)
                current = current.getnext()

        # Extract text from collected elements
        section_texts = []
        for element in content_elements:
            text = self._extract_element_text(element)
            if text.strip():
                section_texts.append(text)

        combined_text = '\n\n'.join(section_texts)

        # Apply cleaning if requested
        if clean:
            combined_text = self._clean_section_text(combined_text)

        return combined_text

    def _is_sibling_section(self, element_id: str, current_section: str) -> bool:
        """Check if element ID represents a sibling section."""
        if not element_id:
            return False

        # Check if this looks like another item at the same level
        if 'item' in current_section.lower() and 'item' in element_id.lower():
            current_item = re.search(r'item\s*(\d+)', current_section, re.IGNORECASE)
            other_item = re.search(r'item[\s_]*(\d+)', element_id, re.IGNORECASE)

            if current_item and other_item:
                return current_item.group(1) != other_item.group(1)

        return False
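
    # Examples (illustrative anchor ids):
    #   _is_sibling_section("item_2_properties", "Item 1")    -> True  (different item number)
    #   _is_sibling_section("item_1a_risk_factors", "Item 1") -> False (same item number)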

    def _extract_element_text(self, element) -> str:
        """Extract clean text from an HTML element."""
        # This would integrate with the existing text extraction logic.
        # For now, simple text extraction.
        return element.text_content() or ""

    def _clean_section_text(self, text: str) -> str:
        """Clean extracted section text."""
        # Apply the same cleaning as the main document
        from edgar.documents.utils.anchor_cache import filter_with_cached_patterns

        # Remove excessive whitespace
        text = re.sub(r'\n\s*\n\s*\n', '\n\n', text)

        # Filter navigation links
        html_content = getattr(self.document.metadata, 'original_html', None)
        if html_content:
            text = filter_with_cached_patterns(text, html_content)

        return text.strip()

    def _extract_section_fallback(self, section_name: str, clean: bool) -> Optional[str]:
        """
        Fallback section extraction using document nodes.

        This is used when HTML-based extraction fails.
        """
        # Search through document sections
        for name, section in self.document.sections.items():
            if section_name.lower() in name.lower():
                return section.text(clean=clean)

        return None

    def get_section_info(self, section_name: str) -> Optional[Dict]:
        """
        Get detailed information about a section.

        Args:
            section_name: Section name to look up

        Returns:
            Dict with section metadata
        """
        normalized_name = self._normalize_section_name(section_name)

        if normalized_name not in self.section_boundaries:
            return None

        boundary = self.section_boundaries[normalized_name]

        return {
            'name': boundary.name,
            'anchor_id': boundary.anchor_id,
            'available': True,
            'estimated_length': None,  # Could calculate if needed
            'subsections': self._get_subsections(normalized_name)
        }
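
    # Example return value (illustrative anchor id):
    #   get_section_info("Item 1") ->
    #   {'name': 'Item 1', 'anchor_id': 'item1', 'available': True,
    #    'estimated_length': None, 'subsections': ['Item 1A', 'Item 1B']}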

    def _get_subsections(self, parent_section: str) -> List[str]:
        """
        Get subsections of a parent section.

        For example:
        - "Item 1" has subsections "Item 1A", "Item 1B" (valid)
        - "Item 1" does NOT have subsection "Item 10" (invalid - different item)
        """
        subsections = []

        # Look for sections that start with the parent name
        for section_name in self.section_boundaries:
            if section_name == parent_section:
                continue

            if section_name.startswith(parent_section):
                # Check if this is a true subsection (e.g., Item 1A)
                # vs a different section that happens to start with same prefix (e.g., Item 10)
                remainder = section_name[len(parent_section):]

                # Valid subsection patterns:
                # - "Item 1A" (remainder: "A") - letter suffix
                # - "Item 1 - Business" (remainder: " - Business") - has separator
                # Invalid patterns:
                # - "Item 10" (remainder: "0") - digit continues the number

                if remainder and remainder[0].isalpha():
                    # Letter suffix like "A", "B" - valid subsection
                    subsections.append(section_name)
                elif remainder and remainder[0] in [' ', '-', '.', ':']:
                    # Has separator - could be descriptive title
                    subsections.append(section_name)
                # If remainder starts with digit, it's NOT a subsection (e.g., "Item 10")

        return sorted(subsections)