Initial commit

kdusek
2025-12-09 12:13:01 +01:00
commit 8e654ed209
13332 changed files with 2695056 additions and 0 deletions


@@ -0,0 +1,15 @@
"""
Content extractors for documents.
"""
from edgar.documents.extractors.text_extractor import TextExtractor
from edgar.documents.extractors.pattern_section_extractor import SectionExtractor
from edgar.documents.extractors.hybrid_section_detector import HybridSectionDetector
from edgar.documents.extractors.toc_section_detector import TOCSectionDetector
__all__ = [
'TextExtractor',
'SectionExtractor',
'HybridSectionDetector',
'TOCSectionDetector'
]
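
A minimal usage sketch tying these extractors together, assuming a parsed `Document` instance is already available (how the document gets parsed is outside this package):

# Sketch only; `document` is assumed to be an already-parsed edgar Document.
from edgar.documents.extractors import TextExtractor, HybridSectionDetector

def extract_filing(document):
    # Plain text suitable for NLP, with tables kept in place.
    text = TextExtractor(clean=True, include_tables=True).extract(document)
    # Section detection with TOC -> heading -> pattern fallback.
    sections = HybridSectionDetector(document, form='10-K').detect_sections()
    for name, section in sections.items():
        print(f"{name}: {section.confidence:.2f} ({section.detection_method})")
    return text, sections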


@@ -0,0 +1,170 @@
"""
Heading-based section detection strategy.
Detects sections by analyzing heading nodes with HeaderInfo metadata.
This strategy provides moderate confidence (0.7-0.9) and serves as a
fallback when TOC-based detection is not available.
"""
import logging
from typing import Dict, Optional
from edgar.documents.document import Document, Section
from edgar.documents.nodes import HeadingNode, SectionNode
from edgar.documents.types import HeaderInfo
logger = logging.getLogger(__name__)
class HeadingSectionDetector:
"""
Heading-based section detection using HeaderInfo.
Analyzes heading nodes that have been annotated with HeaderInfo
during parsing. Detects sections based on:
- Item numbers (Item 1, Item 1A, etc.)
- Heading confidence scores
- Heading hierarchy
Provides moderate confidence (0.7-0.9) detection.
"""
def __init__(
self,
document: Document,
form: Optional[str] = None,
min_confidence: float = 0.5 # Lower threshold, let hybrid detector filter
):
"""
Initialize heading-based detector.
Args:
document: Document to analyze
form: Optional filing type for context ('10-K', '10-Q', '8-K')
min_confidence: Minimum confidence for headings (default 0.5)
"""
self.document = document
self.form = form
self.min_confidence = min_confidence
def detect(self) -> Optional[Dict[str, Section]]:
"""
Detect sections from heading nodes with HeaderInfo.
Returns:
Dictionary of sections if successful, None if no sections found
"""
try:
# Get heading nodes from document
headings = self.document.headings
if not headings:
logger.debug("No headings found in document")
return None
sections = {}
for heading in headings:
# Check if heading has header info
if not hasattr(heading, 'header_info') or not heading.header_info:
continue
header_info = heading.header_info
# Only use headings with sufficient confidence
if header_info.confidence < self.min_confidence:
continue
# Check if it's an item header
if not header_info.is_item:
continue
# Extract section from this heading
section = self._extract_section_from_heading(heading, header_info)
if section:
section.confidence = header_info.confidence
section.detection_method = 'heading'
sections[section.name] = section
if not sections:
logger.debug("No item headers found with sufficient confidence")
return None
logger.info(f"Heading detection found {len(sections)} sections")
return sections
except Exception as e:
logger.warning(f"Heading detection failed: {e}")
return None
def _extract_section_from_heading(
self, heading: HeadingNode, header_info: HeaderInfo
) -> Optional[Section]:
"""
Extract section content from heading node to next heading.
Args:
heading: HeadingNode representing section start
header_info: HeaderInfo with section metadata
Returns:
Section object if successful, None otherwise
"""
try:
# Create section name from item number
if header_info.item_number:
# Normalize: "1A" -> "item_1a", "7" -> "item_7"
section_name = f"item_{header_info.item_number.replace('.', '_').lower()}"
else:
section_name = "unknown"
# Create section node
section_node = SectionNode(section_name=section_name)
# Find next heading at same or higher level to determine section end
current_level = header_info.level
parent = heading.parent
if not parent:
logger.debug(f"Heading {header_info.text} has no parent")
return None
# Find heading position in parent's children
try:
heading_index = parent.children.index(heading)
except ValueError:
logger.debug(f"Could not find heading in parent's children")
return None
# Collect nodes until next section heading
for i in range(heading_index + 1, len(parent.children)):
child = parent.children[i]
# Stop at next heading of same or higher level
if isinstance(child, HeadingNode):
if hasattr(child, 'header_info') and child.header_info:
if child.header_info.level <= current_level:
break
# Add child to section
section_node.add_child(child)
# Parse section name to extract part and item identifiers
part, item = Section.parse_section_name(section_name)
# Create Section object
section = Section(
name=section_name,
title=header_info.text,
node=section_node,
start_offset=0, # Would need actual text position
end_offset=0, # Would need actual text position
confidence=header_info.confidence,
detection_method='heading',
part=part,
item=item
)
return section
except Exception as e:
logger.warning(f"Failed to extract section from heading: {e}")
return None
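
A short sketch of running the heading strategy on its own, assuming the document's headings were annotated with HeaderInfo during parsing:

# Sketch only; assumes `document` carries HeaderInfo-annotated headings.
detector = HeadingSectionDetector(document, form='10-K', min_confidence=0.5)
sections = detector.detect() or {}
for name, section in sections.items():
    print(f"{name}: {section.title} (confidence={section.confidence:.2f})")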


@@ -0,0 +1,489 @@
"""
Hybrid section detection system with multiple fallback strategies.
This module implements a multi-strategy approach to section detection:
1. TOC-based (primary): High confidence, uses Table of Contents structure
2. Heading-based (fallback): Moderate confidence, uses multi-strategy heading detection
3. Pattern-based (last resort): Lower confidence, uses regex pattern matching
"""
import logging
from typing import Dict, Optional, List
from dataclasses import dataclass
from functools import lru_cache
from edgar.documents.document import Document, Section
from edgar.documents.nodes import SectionNode, HeadingNode
from edgar.documents.extractors.toc_section_detector import TOCSectionDetector
from edgar.documents.extractors.pattern_section_extractor import SectionExtractor
from edgar.documents.config import DetectionThresholds
logger = logging.getLogger(__name__)
class HybridSectionDetector:
"""
Multi-strategy section detector with fallback.
Tries strategies in order of reliability:
1. TOC-based (0.95 confidence) - Most reliable
2. Multi-strategy heading detection (0.7-0.9 confidence) - Fallback
3. Pattern matching (0.6 confidence) - Last resort
Example:
>>> detector = HybridSectionDetector(document, '10-K')
>>> sections = detector.detect_sections()
>>> for name, section in sections.items():
... print(f"{name}: {section.confidence:.2f} ({section.detection_method})")
"""
def __init__(self, document: Document, form: str, thresholds: Optional[DetectionThresholds] = None):
"""
Initialize hybrid detector.
Args:
document: Document to extract sections from
form: Filing type ('10-K', '10-Q', '8-K')
thresholds: Detection thresholds configuration
"""
self.document = document
self.form = form
self.thresholds = thresholds or DetectionThresholds()
# Initialize detection strategies
self.toc_detector = TOCSectionDetector(document)
self.pattern_extractor = SectionExtractor(form)
def detect_sections(self) -> Dict[str, Section]:
"""
Detect sections using hybrid approach with fallback and validation.
Returns:
Dictionary mapping section names to Section objects with confidence scores
"""
# Strategy 1: TOC-based (most reliable)
logger.debug("Trying TOC-based detection...")
sections = self.toc_detector.detect()
if sections:
logger.info(f"TOC detection successful: {len(sections)} sections found")
return self._validate_pipeline(sections, enable_cross_validation=True)
# Strategy 2: Heading-based (fallback)
logger.debug("TOC detection failed, trying heading detection...")
sections = self._try_heading_detection()
if sections:
logger.info(f"Heading detection successful: {len(sections)} sections found")
return self._validate_pipeline(sections, enable_cross_validation=False)
# Strategy 3: Pattern-based (last resort)
logger.debug("Heading detection failed, trying pattern matching...")
sections = self._try_pattern_detection()
if sections:
logger.info(f"Pattern detection successful: {len(sections)} sections found")
return self._validate_pipeline(sections, enable_cross_validation=False)
logger.warning("All detection strategies failed, no sections found")
return {}
def _validate_pipeline(
self,
sections: Dict[str, Section],
enable_cross_validation: bool = False
) -> Dict[str, Section]:
"""
Apply validation pipeline to sections.
Centralizes validation logic to eliminate duplication.
Args:
sections: Sections to validate
enable_cross_validation: Whether to enable cross-validation (expensive)
Returns:
Validated sections
"""
if not sections:
return sections
# Cross-validate (optional, expensive)
if enable_cross_validation and self.thresholds.enable_cross_validation:
sections = self._cross_validate(sections)
# Validate boundaries
sections = self._validate_boundaries(sections)
# Deduplicate
sections = self._deduplicate(sections)
# Filter by confidence
sections = self._filter_by_confidence(sections)
return sections
def _try_heading_detection(self) -> Optional[Dict[str, Section]]:
"""
Try multi-strategy heading detection.
Returns:
Dictionary of sections if successful, None if failed
"""
try:
# Get heading nodes from document
headings = self.document.headings
if not headings:
return None
sections = {}
for heading in headings:
# Check if heading has header info
if not hasattr(heading, 'header_info') or not heading.header_info:
continue
header_info = heading.header_info
# Only use headings with sufficient confidence
if header_info.confidence < 0.7:
continue
# Check if it's an item header
if not header_info.is_item:
continue
# Extract section from this heading to next
section = self._extract_section_from_heading(heading, header_info)
if section:
section.confidence = header_info.confidence
section.detection_method = 'heading'
sections[section.name] = section
return sections if sections else None
except Exception as e:
logger.warning(f"Heading detection failed: {e}")
return None
def _try_pattern_detection(self) -> Optional[Dict[str, Section]]:
"""
Try pattern-based extraction.
Returns:
Dictionary of sections if successful, None if failed
"""
try:
# Use pattern extractor
sections = self.pattern_extractor.extract(self.document)
# Mark with pattern detection confidence
for section in sections.values():
section.confidence = 0.6 # Pattern-based = lower confidence
section.detection_method = 'pattern'
return sections if sections else None
except Exception as e:
logger.warning(f"Pattern detection failed: {e}")
return None
def _extract_section_from_heading(self, heading: HeadingNode, header_info) -> Optional[Section]:
"""
Extract section content from heading node to next heading.
Args:
heading: HeadingNode representing section start
header_info: HeaderInfo with section metadata
Returns:
Section object if successful, None otherwise
"""
try:
# Create section name from item number
section_name = f"item_{header_info.item_number.replace('.', '_')}" if header_info.item_number else "unknown"
# Create section node
section_node = SectionNode(section_name=section_name)
# Find next heading at same or higher level to determine section end
current_level = header_info.level
parent = heading.parent
if not parent:
return None
# Find heading position in parent's children
try:
heading_index = parent.children.index(heading)
except ValueError:
return None
# Collect nodes until next section heading
for i in range(heading_index + 1, len(parent.children)):
child = parent.children[i]
# Stop at next heading of same or higher level
if isinstance(child, HeadingNode):
if hasattr(child, 'header_info') and child.header_info:
if child.header_info.level <= current_level:
break
# Add child to section
section_node.add_child(child)
# Create Section object
section = Section(
name=section_name,
title=header_info.text,
node=section_node,
start_offset=0, # Would need actual text position
end_offset=0, # Would need actual text position
confidence=header_info.confidence,
detection_method='heading'
)
return section
except Exception as e:
logger.warning(f"Failed to extract section from heading: {e}")
return None
def _cross_validate(self, sections: Dict[str, Section]) -> Dict[str, Section]:
"""
Cross-validate sections using multiple detection methods.
Boosts confidence if multiple methods detect the same section.
Reduces confidence if methods disagree.
Args:
sections: Sections detected by primary method
Returns:
Validated sections with adjusted confidence scores
"""
validated = {}
# Get pattern-based sections once for comparison (not per section)
try:
pattern_sections = self.pattern_extractor.extract(self.document)
except Exception as e:
logger.debug(f"Pattern extraction failed for cross-validation: {e}")
pattern_sections = {}
for name, section in sections.items():
# Try alternative detection (pattern matching for validation)
try:
# Check if this section is also found by pattern matching
found_in_patterns = False
for pattern_name, pattern_section in pattern_sections.items():
# Check for name similarity or overlap
if self._sections_similar(section, pattern_section):
found_in_patterns = True
break
# Boost confidence if methods agree
if found_in_patterns:
section.confidence = min(section.confidence * self.thresholds.cross_validation_boost, 1.0)
section.validated = True
logger.debug(f"Section {name} validated by multiple methods, confidence boosted to {section.confidence:.2f}")
else:
# Slight reduction if not validated
section.confidence *= self.thresholds.disagreement_penalty
section.validated = False
except Exception as e:
logger.debug(f"Cross-validation failed for {name}: {e}")
# Keep original confidence if validation fails
pass
validated[name] = section
return validated
def _validate_boundaries(self, sections: Dict[str, Section]) -> Dict[str, Section]:
"""
Validate section boundaries for overlaps, gaps, and ordering.
Args:
sections: Sections to validate
Returns:
Sections with validated boundaries
"""
if not sections:
return sections
# Sort by start offset
sorted_sections = sorted(sections.items(), key=lambda x: x[1].start_offset)
validated = {}
prev_section = None
for name, section in sorted_sections:
# Check for overlap with previous section
if prev_section and section.start_offset > 0:
if section.start_offset < prev_section[1].end_offset:
# Overlap detected - adjust boundary at midpoint
gap_mid = (prev_section[1].end_offset + section.start_offset) // 2
prev_section[1].end_offset = gap_mid
section.start_offset = gap_mid
# Reduce confidence due to boundary adjustment
section.confidence *= self.thresholds.boundary_overlap_penalty
prev_section[1].confidence *= self.thresholds.boundary_overlap_penalty
logger.debug(f"Adjusted boundary between {prev_section[0]} and {name}")
# Check for an unusually large gap before this section (fixed threshold, not document-relative)
elif prev_section[1].end_offset > 0:
gap_size = section.start_offset - prev_section[1].end_offset
if gap_size > 100000: # Arbitrary large gap threshold
# Large gap - might indicate missing section
section.confidence *= 0.9
logger.debug(f"Large gap detected before {name}")
validated[name] = section
prev_section = (name, section)
return validated
def _deduplicate(self, sections: Dict[str, Section]) -> Dict[str, Section]:
"""
Remove duplicate sections detected by multiple methods.
Keeps the detection with highest confidence.
Args:
sections: Sections possibly containing duplicates
Returns:
Deduplicated sections
"""
if len(sections) <= 1:
return sections
# Group similar sections
groups = self._group_similar_sections(sections)
deduplicated = {}
for group in groups:
if len(group) == 1:
# No duplicates
deduplicated[group[0].name] = group[0]
else:
# Keep section with highest confidence
best = max(group, key=lambda s: s.confidence)
# Merge detection methods
methods = set(s.detection_method for s in group)
if len(methods) > 1:
best.detection_method = ','.join(sorted(methods))
# Boost confidence for multi-method detection
best.confidence = min(best.confidence * 1.15, 1.0)
best.validated = True
logger.debug(f"Merged duplicate sections for {best.name}, methods: {best.detection_method}")
deduplicated[best.name] = best
return deduplicated
def _group_similar_sections(self, sections: Dict[str, Section]) -> List[List[Section]]:
"""
Group sections that appear to be duplicates.
Args:
sections: Sections to group
Returns:
List of section groups
"""
groups = []
used = set()
for name1, section1 in sections.items():
if name1 in used:
continue
group = [section1]
used.add(name1)
for name2, section2 in sections.items():
if name2 in used:
continue
# Check if sections are similar
if self._sections_similar(section1, section2):
group.append(section2)
used.add(name2)
groups.append(group)
return groups
def _sections_similar(self, section1: Section, section2: Section) -> bool:
"""
Check if two sections are similar (likely duplicates).
Args:
section1: First section
section2: Second section
Returns:
True if sections are similar
"""
# Normalize names for comparison
name1 = section1.name.lower().replace('_', ' ').strip()
name2 = section2.name.lower().replace('_', ' ').strip()
# Check exact match after normalization
if name1 == name2:
return True
# Check title similarity (exact match)
title1 = section1.title.lower().strip()
title2 = section2.title.lower().strip()
if title1 == title2:
return True
# Check for position overlap (if positions are set)
if section1.start_offset > 0 and section2.start_offset > 0:
# Calculate overlap
overlap_start = max(section1.start_offset, section2.start_offset)
overlap_end = min(section1.end_offset, section2.end_offset)
if overlap_end > overlap_start:
# There is overlap
overlap_size = overlap_end - overlap_start
min_size = min(
section1.end_offset - section1.start_offset,
section2.end_offset - section2.start_offset
)
# If overlap is >50% of smaller section, consider similar
if min_size > 0 and overlap_size / min_size > 0.5:
return True
return False
def _filter_by_confidence(self, sections: Dict[str, Section]) -> Dict[str, Section]:
"""
Filter sections by minimum confidence threshold.
Args:
sections: Sections to filter
Returns:
Filtered sections meeting minimum confidence
"""
# Check for filing-specific thresholds
min_conf = self.thresholds.min_confidence
if self.form in self.thresholds.thresholds_by_form:
filing_thresholds = self.thresholds.thresholds_by_form[self.form]
min_conf = filing_thresholds.get('min_confidence', min_conf)
filtered = {}
for name, section in sections.items():
if section.confidence >= min_conf:
filtered[name] = section
else:
logger.debug(f"Filtered out section {name} with confidence {section.confidence:.2f} < {min_conf:.2f}")
return filtered
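
A sketch of tuning the validation pipeline. It assumes DetectionThresholds is a dataclass accepting the fields referenced above (min_confidence, enable_cross_validation, thresholds_by_form, ...) as keyword arguments:

# Sketch only; the DetectionThresholds constructor signature is assumed, not confirmed here.
from edgar.documents.config import DetectionThresholds

thresholds = DetectionThresholds(
    min_confidence=0.6,
    enable_cross_validation=True,
    thresholds_by_form={'8-K': {'min_confidence': 0.5}},
)
detector = HybridSectionDetector(document, form='8-K', thresholds=thresholds)
sections = detector.detect_sections()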


@@ -0,0 +1,405 @@
"""
Section extraction from documents.
"""
import re
from typing import Dict, List, Optional, Tuple
from edgar.documents.document import Document, Section
from edgar.documents.nodes import Node, HeadingNode, SectionNode
class SectionExtractor:
"""
Extracts logical sections from documents.
Identifies document sections like:
- Business Overview (Item 1)
- Risk Factors (Item 1A)
- MD&A (Item 7)
- Financial Statements (Item 8)
"""
# Common section patterns for different filing types
SECTION_PATTERNS = {
'10-K': {
'business': [
(r'^(Item|ITEM)\s+1\.?\s*Business', 'Item 1 - Business'),
(r'^Business\s*$', 'Business'),
(r'^Business Overview', 'Business Overview'),
(r'^Our Business', 'Our Business'),
(r'^Company Overview', 'Company Overview')
],
'risk_factors': [
(r'^(Item|ITEM)\s+1A\.?\s*Risk\s+Factors', 'Item 1A - Risk Factors'),
(r'^Risk\s+Factors', 'Risk Factors'),
(r'^Factors\s+That\s+May\s+Affect', 'Risk Factors')
],
'properties': [
(r'^(Item|ITEM)\s+2\.?\s*Properties', 'Item 2 - Properties'),
(r'^Properties', 'Properties'),
(r'^Real\s+Estate', 'Real Estate')
],
'legal_proceedings': [
(r'^(Item|ITEM)\s+3\.?\s*Legal\s+Proceedings', 'Item 3 - Legal Proceedings'),
(r'^Legal\s+Proceedings', 'Legal Proceedings'),
(r'^Litigation', 'Litigation')
],
'market_risk': [
(r'^(Item|ITEM)\s+7A\.?\s*Quantitative.*Disclosures', 'Item 7A - Market Risk'),
(r'^Market\s+Risk', 'Market Risk'),
(r'^Quantitative.*Qualitative.*Market\s+Risk', 'Market Risk')
],
'mda': [
(r'^(Item|ITEM)\s+7\.?\s*Management.*Discussion', 'Item 7 - MD&A'),
(r'^Management.*Discussion.*Analysis', 'MD&A'),
(r'^MD&A', 'MD&A')
],
'financial_statements': [
(r'^(Item|ITEM)\s+8\.?\s*Financial\s+Statements', 'Item 8 - Financial Statements'),
(r'^Financial\s+Statements', 'Financial Statements'),
(r'^Consolidated\s+Financial\s+Statements', 'Consolidated Financial Statements')
],
'controls_procedures': [
(r'^(Item|ITEM)\s+9A\.?\s*Controls.*Procedures', 'Item 9A - Controls and Procedures'),
(r'^Controls.*Procedures', 'Controls and Procedures'),
(r'^Internal\s+Control', 'Internal Controls')
]
},
'10-Q': {
'financial_statements': [
(r'^(Item|ITEM)\s+1\.?\s*Financial\s+Statements', 'Item 1 - Financial Statements'),
(r'^Financial\s+Statements', 'Financial Statements'),
(r'^Condensed.*Financial\s+Statements', 'Condensed Financial Statements')
],
'mda': [
(r'^(Item|ITEM)\s+2\.?\s*Management.*Discussion', 'Item 2 - MD&A'),
(r'^Management.*Discussion.*Analysis', 'MD&A')
],
'market_risk': [
(r'^(Item|ITEM)\s+3\.?\s*Quantitative.*Disclosures', 'Item 3 - Market Risk'),
(r'^Market\s+Risk', 'Market Risk')
],
'controls_procedures': [
(r'^(Item|ITEM)\s+4\.?\s*Controls.*Procedures', 'Item 4 - Controls and Procedures'),
(r'^Controls.*Procedures', 'Controls and Procedures')
],
'legal_proceedings': [
(r'^(Item|ITEM)\s+1\.?\s*Legal\s+Proceedings', 'Item 1 - Legal Proceedings'),
(r'^Legal\s+Proceedings', 'Legal Proceedings')
],
'risk_factors': [
(r'^(Item|ITEM)\s+1A\.?\s*Risk\s+Factors', 'Item 1A - Risk Factors'),
(r'^Risk\s+Factors', 'Risk Factors')
]
},
'8-K': {
'item_101': [
(r'^(Item|ITEM)\s+1\.01', 'Item 1.01 - Entry into Material Agreement'),
(r'^Entry.*Material.*Agreement', 'Material Agreement')
],
'item_201': [
(r'^(Item|ITEM)\s+2\.01', 'Item 2.01 - Completion of Acquisition'),
(r'^Completion.*Acquisition', 'Acquisition')
],
'item_202': [
(r'^(Item|ITEM)\s+2\.02', 'Item 2.02 - Results of Operations'),
(r'^Results.*Operations', 'Results of Operations')
],
'item_503': [
(r'^(Item|ITEM)\s+5\.03', 'Item 5.03 - Amendments to Articles of Incorporation or Bylaws'),
(r'^Amendments.*Articles', 'Charter Amendments')
],
'item_801': [
(r'^(Item|ITEM)\s+8\.01', 'Item 8.01 - Other Events'),
(r'^Other\s+Events', 'Other Events')
],
'item_901': [
(r'^(Item|ITEM)\s+9\.01', 'Item 9.01 - Financial Statements and Exhibits'),
(r'^Financial.*Exhibits', 'Financial Statements and Exhibits')
]
}
}
def __init__(self, form: Optional[str] = None):
"""
Initialize section extractor.
Args:
form: Type of filing (10-K, 10-Q, 8-K, etc.)
"""
self.form = form
def extract(self, document: Document) -> Dict[str, Section]:
"""
Extract sections from document.
Args:
document: Document to extract sections from
Returns:
Dictionary mapping section names to Section objects
"""
# Get filing type from instance, metadata, or document config
# NOTE: We no longer auto-detect filing type (expensive and unnecessary)
form = None
if self.form:
form = self.form
elif document.metadata and document.metadata.form:
form = document.metadata.form
elif hasattr(document, '_config') and document._config and document._config.form:
form = document._config.form
# Only extract sections for forms that have standard sections
if not form or form not in ['10-K', '10-Q', '8-K']:
return {} # No filing type or unsupported form = no section detection
# Get patterns for filing type
patterns = self.SECTION_PATTERNS.get(form, {})
if not patterns:
return {} # No patterns defined for this form type
# Find section headers
headers = self._find_section_headers(document)
# For 10-Q, detect Part I/Part II boundaries
part_context = None
if form == '10-Q':
part_context = self._detect_10q_parts(headers)
# Match headers to sections
sections = self._match_sections(headers, patterns, document, part_context)
# Create section objects
return self._create_sections(sections, document)
# NOTE: _detect_form() removed - form type should be known from context
# Filing metadata should be set by the caller (Filing class, TenK/TenQ, etc.)
# NOTE: _infer_form_from_headers() kept for backward compatibility but not used
# in normal flow anymore. Form type should always be provided explicitly.
def _infer_form_from_headers(self, document: Document) -> str:
"""
Infer filing type from section headers.
NOTE: This method is kept for backward compatibility but should not be used
in the normal flow. Form type should be explicitly provided via config or metadata.
"""
headers = document.headings
header_texts = [h.text().upper() for h in headers if h.text()]
# Check for 10-K specific sections
has_10k_sections = any(
'ITEM 1.' in text or 'ITEM 1A.' in text or 'ITEM 7.' in text or 'ITEM 8.' in text
for text in header_texts
)
# Check for 10-Q specific sections
has_10q_sections = any(
('ITEM 1.' in text and 'FINANCIAL STATEMENTS' in text) or
('ITEM 2.' in text and 'MANAGEMENT' in text) or
'ITEM 3.' in text or 'ITEM 4.' in text
for text in header_texts
)
# Check for 8-K specific sections
has_8k_sections = any(
re.search(r'ITEM \d\.\d{2}', text) for text in header_texts
)
if has_10k_sections and not has_10q_sections:
return '10-K'
elif has_10q_sections:
return '10-Q'
elif has_8k_sections:
return '8-K'
else:
return 'UNKNOWN'
def _get_general_patterns(self) -> Dict[str, List[Tuple[str, str]]]:
"""Get general section patterns."""
return {
'business': [
(r'^Business', 'Business'),
(r'^Overview', 'Overview'),
(r'^Company', 'Company')
],
'financial': [
(r'^Financial\s+Statements', 'Financial Statements'),
(r'^Consolidated.*Statements', 'Consolidated Statements')
],
'notes': [
(r'^Notes\s+to.*Financial\s+Statements', 'Notes to Financial Statements'),
(r'^Notes\s+to.*Statements', 'Notes')
]
}
def _find_section_headers(self, document: Document) -> List[Tuple[Node, str, int]]:
"""Find all potential section headers."""
headers = []
# Find all heading nodes
heading_nodes = document.root.find(lambda n: isinstance(n, HeadingNode))
for node in heading_nodes:
text = node.text()
if text:
# Get position in document
position = self._get_node_position(node, document)
headers.append((node, text, position))
# Also check for section nodes
section_nodes = document.root.find(lambda n: isinstance(n, SectionNode))
for node in section_nodes:
# Get first heading in section
first_heading = node.find_first(lambda n: isinstance(n, HeadingNode))
if first_heading:
text = first_heading.text()
if text:
position = self._get_node_position(node, document)
headers.append((node, text, position))
# Sort by position
headers.sort(key=lambda x: x[2])
return headers
def _get_node_position(self, node: Node, document: Document) -> int:
"""Get position of node in document."""
position = 0
for n in document.root.walk():
if n == node:
return position
position += 1
return position
def _detect_10q_parts(self, headers: List[Tuple[Node, str, int]]) -> Dict[int, str]:
"""
Detect Part I and Part II boundaries in 10-Q filings.
Args:
headers: List of (node, text, position) tuples
Returns:
Dict mapping header index to part name ("Part I" or "Part II")
"""
part_context = {}
current_part = None
part_i_pattern = re.compile(r'^\s*PART\s+I\b', re.IGNORECASE)
part_ii_pattern = re.compile(r'^\s*PART\s+II\b', re.IGNORECASE)
for i, (node, text, position) in enumerate(headers):
text_stripped = text.strip()
# Check if this is a Part I or Part II header
if part_i_pattern.match(text_stripped):
current_part = "Part I"
part_context[i] = current_part
elif part_ii_pattern.match(text_stripped):
current_part = "Part II"
part_context[i] = current_part
elif current_part:
# Headers after a Part declaration belong to that part
part_context[i] = current_part
return part_context
def _match_sections(self,
headers: List[Tuple[Node, str, int]],
patterns: Dict[str, List[Tuple[str, str]]],
document: Document,
part_context: Optional[Dict[int, str]] = None) -> Dict[str, Tuple[Node, str, int, int]]:
"""Match headers to section patterns."""
matched_sections = {}
used_headers = set()
# Try to match each pattern
for section_name, section_patterns in patterns.items():
for pattern, title in section_patterns:
for i, (node, text, position) in enumerate(headers):
if i in used_headers:
continue
# Try to match pattern
if re.match(pattern, text.strip(), re.IGNORECASE):
# Find end position (next section or end of document)
end_position = self._find_section_end(i, headers, document)
# For 10-Q, prefix with Part I or Part II
final_title = title
if part_context and i in part_context:
final_title = f"{part_context[i]} - {title}"
# Use final_title as key to avoid conflicts
section_key = final_title if part_context and i in part_context else section_name
matched_sections[section_key] = (node, final_title, position, end_position)
used_headers.add(i)
break
# If we found a match, move to next section
if section_name in matched_sections:
break
return matched_sections
def _find_section_end(self,
section_index: int,
headers: List[Tuple[Node, str, int]],
document: Document) -> int:
"""Find where section ends."""
# Next section starts where next header at same or higher level begins
if section_index + 1 < len(headers):
current_node = headers[section_index][0]
current_level = current_node.level if isinstance(current_node, HeadingNode) else 1
for i in range(section_index + 1, len(headers)):
next_node = headers[i][0]
next_level = next_node.level if isinstance(next_node, HeadingNode) else 1
# If next header is at same or higher level, that's our end
if next_level <= current_level:
return headers[i][2]
# Otherwise, section goes to end of document
return sum(1 for _ in document.root.walk())
def _create_sections(self,
matched_sections: Dict[str, Tuple[Node, str, int, int]],
document: Document) -> Dict[str, Section]:
"""Create Section objects from matches."""
sections = {}
for section_name, (node, title, start_pos, end_pos) in matched_sections.items():
# Create section node containing all content in range
section_node = SectionNode(section_name=section_name)
# Find all nodes in position range
position = 0
for n in document.root.walk():
if start_pos <= position < end_pos:
# Clone node and add to section
# (In real implementation, would properly handle node hierarchy)
section_node.add_child(n)
position += 1
# Parse section name to extract part and item identifiers
part, item = Section.parse_section_name(section_name)
# Create Section object
section = Section(
name=section_name,
title=title,
node=section_node,
start_offset=start_pos,
end_offset=end_pos,
confidence=0.7, # Pattern-based detection = moderate confidence
detection_method='pattern', # Method: regex pattern matching
part=part,
item=item
)
sections[section_name] = section
return sections
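
A sketch of the pattern strategy used directly; the form can come from the constructor, document metadata, or parser config, so passing it explicitly as below is the simplest path:

# Sketch only; pattern matching yields moderate-confidence (0.7) sections.
extractor = SectionExtractor(form='10-Q')
sections = extractor.extract(document)
for name, section in sections.items():
    print(name, '->', section.title)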


@@ -0,0 +1,348 @@
"""
Text extraction from documents with various options.
"""
import re
from typing import List, Optional, Set
from edgar.documents.document import Document
from edgar.documents.nodes import Node, TextNode, HeadingNode, ParagraphNode
from edgar.documents.table_nodes import TableNode
from edgar.documents.types import NodeType
class TextExtractor:
"""
Extracts text from documents with configurable options.
Supports:
- Clean text extraction for AI/NLP
- Table inclusion/exclusion
- Metadata annotations
- Length limiting
- Smart whitespace handling
"""
def __init__(self,
clean: bool = True,
include_tables: bool = True,
include_metadata: bool = False,
include_links: bool = False,
max_length: Optional[int] = None,
preserve_structure: bool = False):
"""
Initialize text extractor.
Args:
clean: Clean and normalize text
include_tables: Include table content
include_metadata: Include metadata annotations
include_links: Include link URLs
max_length: Maximum text length
preserve_structure: Preserve document structure with markers
"""
self.clean = clean
self.include_tables = include_tables
self.include_metadata = include_metadata
self.include_links = include_links
self.max_length = max_length
self.preserve_structure = preserve_structure
# Track what we've extracted to avoid duplicates
self._extracted_ids: Set[str] = set()
def extract(self, document: Document) -> str:
"""
Extract text from document.
Args:
document: Document to extract from
Returns:
Extracted text
"""
parts = []
self._extracted_ids.clear()
# Extract from root
self._extract_from_node(document.root, parts, depth=0)
# Join parts
if self.preserve_structure:
text = '\n'.join(parts)
else:
text = '\n\n'.join(filter(None, parts))
# Apply minimal global cleaning - tables are already handled appropriately per node
if self.clean:
text = self._clean_document_text(text)
# Limit length if requested
if self.max_length and len(text) > self.max_length:
text = self._truncate_text(text, self.max_length)
return text
def extract_from_node(self, node: Node) -> str:
"""Extract text from a specific node."""
parts = []
self._extracted_ids.clear()
self._extract_from_node(node, parts, depth=0)
text = '\n\n'.join(filter(None, parts))
if self.clean:
text = self._clean_document_text(text)
return text
def _extract_from_node(self, node: Node, parts: List[str], depth: int):
"""Recursively extract text from node - render each node type appropriately."""
# Skip if already extracted (handles shared nodes)
if node.id in self._extracted_ids:
return
self._extracted_ids.add(node.id)
# Handle based on node type - like old parser's block.get_text()
if isinstance(node, TableNode):
if self.include_tables:
# Tables render themselves - preserve their formatting
self._extract_table(node, parts)
elif isinstance(node, HeadingNode):
# Headings get cleaned text
self._extract_heading(node, parts, depth)
elif isinstance(node, TextNode):
# Text nodes get cleaned if cleaning is enabled
text = node.text()
if text:
if self.clean:
text = self._clean_text_content(text) # Clean non-table text
if self.include_metadata and node.metadata:
text = self._annotate_with_metadata(text, node.metadata)
parts.append(text)
elif isinstance(node, ParagraphNode):
# Extract paragraph as unified text to maintain flow of inline elements
text = node.text()
if text:
if self.clean:
text = self._clean_text_content(text)
if self.include_metadata and node.metadata:
text = self._annotate_with_metadata(text, node.metadata)
parts.append(text)
# Don't process children since we already got the paragraph text
return
else:
# Check if this looks like a bullet point container that should flow together
if self._is_bullet_point_container(node):
# Extract text from bullet point children and join with spaces (not newlines)
bullet_parts = []
for child in node.children:
child_text = child.text() if hasattr(child, 'text') else ""
if child_text and child_text.strip():
bullet_parts.append(child_text.strip())
if bullet_parts:
# Join with spaces for bullet points
text = ' '.join(bullet_parts)
if self.clean:
text = self._clean_text_content(text)
if self.include_metadata and node.metadata:
text = self._annotate_with_metadata(text, node.metadata)
parts.append(text)
# Don't process children since we already got the unified text
return
# For other nodes, extract text content and clean if appropriate
if hasattr(node, 'content') and isinstance(node.content, str):
text = node.content
if text and text.strip():
if self.clean:
text = self._clean_text_content(text) # Clean non-table text
if self.include_metadata and node.metadata:
text = self._annotate_with_metadata(text, node.metadata)
parts.append(text)
# Process children
for child in node.children:
self._extract_from_node(child, parts, depth + 1)
def _extract_heading(self, node: HeadingNode, parts: List[str], depth: int):
"""Extract heading with optional structure markers."""
text = node.text()
if not text:
return
if self.preserve_structure:
# Add structure markers
marker = '#' * node.level
text = f"{marker} {text}"
if self.include_metadata and node.metadata:
text = self._annotate_with_metadata(text, node.metadata)
parts.append(text)
def _extract_table(self, table: TableNode, parts: List[str]):
"""Extract table content - preserve original formatting like old parser."""
if self.preserve_structure:
parts.append("[TABLE START]")
# Add table caption if present
if table.caption:
caption_text = table.caption
if self.clean:
caption_text = self._clean_text_content(caption_text) # Clean caption but not table content
if self.preserve_structure:
parts.append(f"Caption: {caption_text}")
else:
parts.append(caption_text)
# Extract table text - PRESERVE FORMATTING (like old parser's TableBlock.get_text())
table_text = table.text()
if table_text:
# Tables render their own formatting - don't apply text cleaning to preserve alignment
parts.append(table_text) # Keep original spacing and alignment
if self.preserve_structure:
parts.append("[TABLE END]")
def _annotate_with_metadata(self, text: str, metadata: dict) -> str:
"""Add metadata annotations to text."""
annotations = []
# Add XBRL annotations
if 'ix_tag' in metadata:
annotations.append(f"[XBRL: {metadata['ix_tag']}]")
# Add section annotations
if 'section_name' in metadata:
annotations.append(f"[Section: {metadata['section_name']}]")
# Add semantic type
if 'semantic_type' in metadata:
annotations.append(f"[Type: {metadata['semantic_type']}]")
if annotations:
return f"{' '.join(annotations)} {text}"
return text
def _clean_text_content(self, text: str) -> str:
"""Clean regular text content (not tables) - like old parser text cleaning."""
if not text:
return text
# Replace multiple spaces with single space for regular text
text = re.sub(r' {2,}', ' ', text)
# Clean up space around newlines
text = re.sub(r' *\n *', '\n', text)
# Remove leading/trailing whitespace from lines
lines = text.split('\n')
lines = [line.strip() for line in lines]
text = '\n'.join(lines)
# Normalize quotes and dashes
text = self._normalize_punctuation(text)
return text
def _is_bullet_point_container(self, node) -> bool:
"""Check if a container node represents a bullet point that should flow as one line."""
from edgar.documents.nodes import ContainerNode
if not isinstance(node, ContainerNode):
return False
# Must have at least 2 children (bullet + content)
if len(node.children) < 2:
return False
# Get the text of all children to check for bullet patterns
all_text = node.text()
if not all_text:
return False
# Check if starts with common bullet characters
bullet_chars = ['\u2022', '\u25e6', '\u25aa', '\u2023', '\u25cf', '\u00b7', '-', '*']  # common bullet glyphs plus dash/asterisk
starts_with_bullet = any(all_text.strip().startswith(char) for char in bullet_chars)
if not starts_with_bullet:
return False
# Check if container has flex display (common for bullet point layouts)
if hasattr(node, 'style') and node.style and hasattr(node.style, 'display'):
if node.style.display == 'flex':
return True
# Check if it has bullet-like structure: short first child + longer content
if len(node.children) >= 2:
first_child_text = node.children[0].text() if hasattr(node.children[0], 'text') else ""
second_child_text = node.children[1].text() if hasattr(node.children[1], 'text') else ""
# First child is very short (likely bullet), second is longer (content)
if len(first_child_text.strip()) <= 3 and len(second_child_text.strip()) > 10:
return True
return False
def _clean_document_text(self, text: str) -> str:
"""Apply minimal document-level cleaning that preserves table formatting."""
if not text:
return text
# Only apply global formatting that doesn't affect table alignment:
# Replace excessive newlines (4+ consecutive) with triple newline
text = re.sub(r'\n{4,}', '\n\n\n', text)
# Remove empty lines at start/end only
text = text.strip()
return text
def _normalize_punctuation(self, text: str) -> str:
"""Normalize punctuation for cleaner text."""
# Normalize quotes
text = text.replace('"', '"').replace('"', '"')
text = text.replace(''', "'").replace(''', "'")
# Normalize dashes
text = text.replace('', ' - ') # em dash
text = text.replace('', ' - ') # en dash
# Fix spacing around punctuation
text = re.sub(r'\s+([.,;!?])', r'\1', text)
text = re.sub(r'([.,;!?])\s+', r'\1 ', text)  # collapse whitespace after punctuation without splitting decimals like 1.5
# Remove extra spaces
text = re.sub(r' {2,}', ' ', text)
return text.strip()
def _truncate_text(self, text: str, max_length: int) -> str:
"""Truncate text intelligently."""
if len(text) <= max_length:
return text
# Try to truncate at sentence boundary
truncated = text[:max_length]
last_period = truncated.rfind('.')
last_newline = truncated.rfind('\n')
# Choose the better truncation point
truncate_at = max(last_period, last_newline)
if truncate_at > max_length * 0.8: # If we found a good boundary
return text[:truncate_at + 1].strip()
# Otherwise truncate at word boundary
last_space = truncated.rfind(' ')
if last_space > max_length * 0.9:
return text[:last_space].strip() + '...'
# Last resort: hard truncate
return text[:max_length - 3].strip() + '...'
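
A sketch of the extraction options, assuming a parsed `Document` (the values shown are illustrative, not defaults):

# Sketch only.
extractor = TextExtractor(
    clean=True,
    include_tables=True,       # keep table text with its original alignment
    include_metadata=True,     # prefix text with [XBRL]/[Section]/[Type] annotations
    preserve_structure=True,   # emit '#' heading markers and [TABLE START]/[TABLE END]
    max_length=50_000,         # truncate at a sentence or word boundary
)
text = extractor.extract(document)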


@@ -0,0 +1,178 @@
"""
TOC-based section detection strategy.
Detects sections using Table of Contents structure. Provides highest
confidence (0.95) and includes full text extraction capabilities.
This detector wraps SECSectionExtractor which has proven implementations of:
- Multi-column TOC support (checks all preceding table cells)
- Nested anchor handling (traverses up to find content container)
- Full section text extraction
"""
import logging
from typing import Dict, Optional
from edgar.documents.document import Document, Section
from edgar.documents.nodes import SectionNode
from edgar.documents.extractors.toc_section_extractor import SECSectionExtractor
logger = logging.getLogger(__name__)
class TOCSectionDetector:
"""
TOC-based section detection strategy.
Uses Table of Contents structure to identify section boundaries and
extract full section content. Provides high confidence (0.95) detection.
This implementation wraps the proven SECSectionExtractor which includes:
- Multi-column TOC support for edge cases like Morgan Stanley
- Nested anchor handling for sections with no sibling content
- Complete text extraction with proper boundary detection
"""
def __init__(self, document: Document):
"""
Initialize TOC-based detector.
Args:
document: Document to analyze (must have metadata.original_html)
"""
self.document = document
self.extractor = SECSectionExtractor(document)
def detect(self) -> Optional[Dict[str, Section]]:
"""
Detect sections using TOC structure.
Returns:
Dictionary mapping section names to Section objects, or None if unavailable
Note:
Requires document.metadata.original_html to be available.
Returns None if HTML is not available or no sections found.
"""
# Check if original HTML is available
html_content = getattr(self.document.metadata, 'original_html', None)
if not html_content:
logger.debug("TOC detection unavailable: original_html not in document metadata")
return None
try:
# Get available sections from TOC
available = self.extractor.get_available_sections()
if not available:
logger.debug("No sections found in TOC")
return None
sections = {}
# Extract each section
for section_name in available:
# Get section metadata first to check for subsections
section_info = self.extractor.get_section_info(section_name)
if not section_info:
logger.debug(f"Skipping {section_name}: no section info")
continue
# Get section text (may be empty for container sections)
section_text = self.extractor.get_section_text(section_name, include_subsections=True)
# Check if this section has subsections
has_subsections = section_info.get('subsections', [])
if not section_text and not has_subsections:
# Skip only if no text AND no subsections
logger.debug(f"Skipping {section_name}: no text and no subsections")
continue
# Create section node (placeholder - actual content extracted lazily)
section_node = SectionNode(section_name=section_name)
# For container sections (Item 1, Item 10), text will include all subsections
section_length = len(section_text) if section_text else 0
# Create text extractor callback for lazy loading
def make_text_extractor(extractor, name):
"""Create a closure that captures extractor and section name."""
def extract_text(section_name=None, **kwargs):
# Use captured name, ignore passed section_name
clean = kwargs.get('clean', True)
return extractor.get_section_text(name, include_subsections=True, clean=clean) or ""
return extract_text
# Parse section name to extract part and item identifiers
part, item = Section.parse_section_name(section_name)
# Create Section with TOC confidence
section = Section(
name=section_name,
title=section_info.get('canonical_name', section_name),
node=section_node,
start_offset=0, # Would need actual offsets from parsing
end_offset=section_length,
confidence=0.95, # TOC-based = high confidence
detection_method='toc',
part=part,
item=item,
_text_extractor=make_text_extractor(self.extractor, section_name)
)
sections[section_name] = section
if sections:
logger.info(f"TOC detection found {len(sections)} sections")
return sections
return None
except Exception as e:
logger.warning(f"TOC detection failed: {e}", exc_info=True)
return None
def get_section_text(document: Document, section_name: str) -> Optional[str]:
"""
Get section text using TOC-based extraction.
Args:
document: Document to extract from
section_name: Section name (e.g., 'Item 1', 'Item 1A')
Returns:
Section text if available, None otherwise
"""
html_content = getattr(document.metadata, 'original_html', None)
if not html_content:
return None
try:
extractor = SECSectionExtractor(document)
return extractor.get_section_text(section_name)
except Exception as e:
logger.warning(f"Failed to get section text for {section_name}: {e}")
return None
def get_available_sections(document: Document) -> list[str]:
"""
Get list of available sections from TOC.
Args:
document: Document to analyze
Returns:
List of section names found in TOC
"""
html_content = getattr(document.metadata, 'original_html', None)
if not html_content:
return []
try:
extractor = SECSectionExtractor(document)
return extractor.get_available_sections()
except Exception as e:
logger.warning(f"Failed to get available sections: {e}")
return []
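
A sketch using the detector and the module-level helpers, assuming document.metadata.original_html was preserved during parsing:

# Sketch only; all of these return None/[]/{} when original_html is unavailable.
detector = TOCSectionDetector(document)
sections = detector.detect() or {}

names = get_available_sections(document)
if 'Item 1A' in names:
    risk_factors = get_section_text(document, 'Item 1A')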


@@ -0,0 +1,383 @@
"""
Section extraction for SEC filings using Table of Contents analysis.
This system uses TOC structure to extract specific sections like "Item 1",
"Item 1A", etc. from SEC filings. This approach works consistently across
all SEC filings regardless of whether they use semantic anchors or generated IDs.
"""
import re
from typing import Dict, List, Optional, Tuple, Set
from dataclasses import dataclass
from lxml import html as lxml_html
from edgar.documents.nodes import Node, SectionNode
from edgar.documents.document import Document
from edgar.documents.utils.toc_analyzer import TOCAnalyzer
@dataclass
class SectionBoundary:
"""Represents the boundaries of a document section."""
name: str
anchor_id: str
start_element_id: Optional[str] = None
end_element_id: Optional[str] = None
start_node: Optional[Node] = None
end_node: Optional[Node] = None
text_start: Optional[int] = None # Character position in full text
text_end: Optional[int] = None
confidence: float = 1.0 # Detection confidence (0.0-1.0)
detection_method: str = 'unknown' # How section was detected
class SECSectionExtractor:
"""
Extract specific sections from SEC filings using Table of Contents analysis.
This uses TOC structure to identify section boundaries and extract content
between them. Works consistently for all SEC filings.
"""
def __init__(self, document: Document):
self.document = document
self.section_map = {} # Maps section names to canonical names
self.section_boundaries = {} # Maps section names to boundaries
self.toc_analyzer = TOCAnalyzer()
self._analyze_sections()
def _analyze_sections(self) -> None:
"""
Analyze the document using TOC structure to identify section boundaries.
This creates a map of section names to their anchor positions using
Table of Contents analysis, which works for all SEC filings.
"""
# Get the original HTML if available
html_content = getattr(self.document.metadata, 'original_html', None)
if not html_content:
return
# Use TOC analysis to find sections
toc_mapping = self.toc_analyzer.analyze_toc_structure(html_content)
if not toc_mapping:
return # No sections found
# Handle XML declaration issues
if html_content.startswith('<?xml'):
html_content = re.sub(r'<\?xml[^>]*\?>', '', html_content, count=1)
tree = lxml_html.fromstring(html_content)
sec_sections = {}
for section_name, anchor_id in toc_mapping.items():
# Verify the anchor target exists
target_elements = tree.xpath(f'//*[@id="{anchor_id}"]')
if target_elements:
element = target_elements[0]
# Use TOC-based section info
section_type, order = self.toc_analyzer._get_section_type_and_order(section_name)
sec_sections[section_name] = {
'anchor_id': anchor_id,
'element': element,
'canonical_name': section_name,
'type': section_type,
'order': order,
'confidence': 0.95, # TOC-based detection = high confidence
'detection_method': 'toc' # Method: Table of Contents
}
if not sec_sections:
return # No valid sections found
# Sort sections by their logical order
sorted_sections = sorted(sec_sections.items(), key=lambda x: x[1]['order'])
# Calculate section boundaries
for i, (section_name, section_data) in enumerate(sorted_sections):
start_anchor = section_data['anchor_id']
# End boundary is the start of the next section (if any)
end_anchor = None
if i + 1 < len(sorted_sections):
next_section = sorted_sections[i + 1][1]
end_anchor = next_section['anchor_id']
self.section_boundaries[section_name] = SectionBoundary(
name=section_name,
anchor_id=start_anchor,
end_element_id=end_anchor,
confidence=section_data.get('confidence', 0.95),
detection_method=section_data.get('detection_method', 'toc')
)
self.section_map = {name: data['canonical_name'] for name, data in sec_sections.items()}
def get_available_sections(self) -> List[str]:
"""
Get list of available sections that can be extracted.
Returns:
List of section names
"""
return sorted(self.section_boundaries.keys(),
key=lambda x: self.section_boundaries[x].anchor_id)
def get_section_text(self, section_name: str,
include_subsections: bool = True,
clean: bool = True) -> Optional[str]:
"""
Extract text content for a specific section.
Args:
section_name: Name of section (e.g., "Item 1", "Item 1A", "Part I")
include_subsections: Whether to include subsections
clean: Whether to apply text cleaning
Returns:
Section text content or None if section not found
"""
# Normalize section name
normalized_name = self._normalize_section_name(section_name)
if normalized_name not in self.section_boundaries:
return None
boundary = self.section_boundaries[normalized_name]
# Extract content between boundaries using HTML parsing
html_content = getattr(self.document.metadata, 'original_html', None)
if not html_content:
return None
try:
section_text = self._extract_section_content(html_content, boundary, include_subsections, clean)
# If no direct content but include_subsections=True, aggregate subsection text
if not section_text and include_subsections:
subsections = self._get_subsections(normalized_name)
if subsections:
# Recursively get text from all subsections
subsection_texts = []
for subsection_name in subsections:
subsection_text = self.get_section_text(subsection_name, include_subsections=True, clean=clean)
if subsection_text:
subsection_texts.append(subsection_text)
if subsection_texts:
section_text = '\n\n'.join(subsection_texts)
return section_text
except Exception as e:
# Fallback to simple text extraction
return self._extract_section_fallback(section_name, clean)
def _normalize_section_name(self, section_name: str) -> str:
"""Normalize section name for lookup."""
# Handle common variations
name = section_name.strip()
# "Item 1" vs "Item 1." vs "Item 1:"
name = re.sub(r'[.:]$', '', name)
# Case normalization
if re.match(r'item\s+\d+', name, re.IGNORECASE):
match = re.match(r'item\s+(\d+[a-z]?)', name, re.IGNORECASE)
if match:
name = f"Item {match.group(1).upper()}"
elif re.match(r'part\s+[ivx]+', name, re.IGNORECASE):
match = re.match(r'part\s+([ivx]+)', name, re.IGNORECASE)
if match:
name = f"Part {match.group(1).upper()}"
return name
def _extract_section_content(self, html_content: str, boundary: SectionBoundary,
include_subsections: bool, clean: bool) -> str:
"""
Extract section content from HTML between anchors.
Args:
html_content: Full HTML content
boundary: Section boundary info
include_subsections: Whether to include subsections
clean: Whether to clean the text
Returns:
Extracted section text
"""
# Handle XML declaration issues
if html_content.startswith('<?xml'):
html_content = re.sub(r'<\?xml[^>]*\?>', '', html_content, count=1)
tree = lxml_html.fromstring(html_content)
# Find start element
start_elements = tree.xpath(f'//*[@id="{boundary.anchor_id}"]')
if not start_elements:
return ""
start_element = start_elements[0]
# Collect content until we hit the end boundary (if specified)
content_elements = []
# If anchor has no siblings (nested in empty container), traverse up to find content container
# This handles cases like <div id="item7"><div></div></div> where content is after the container
current = start_element.getnext()
if current is None:
# No sibling - traverse up to find a container with siblings
container = start_element.getparent()
while container is not None and container.getnext() is None:
container = container.getparent()
# Start from the container's next sibling if found
if container is not None:
current = container.getnext()
# Collect content from siblings
if current is not None:
# Normal case - anchor has siblings
while current is not None:
# Check if we've reached the end boundary
if boundary.end_element_id:
current_id = current.get('id', '')
if current_id == boundary.end_element_id:
break
# Also check if this is a sibling section we should stop at
if not include_subsections and self._is_sibling_section(current_id, boundary.name):
break
content_elements.append(current)
current = current.getnext()
# Extract text from collected elements
section_texts = []
for element in content_elements:
text = self._extract_element_text(element)
if text.strip():
section_texts.append(text)
combined_text = '\n\n'.join(section_texts)
# Apply cleaning if requested
if clean:
combined_text = self._clean_section_text(combined_text)
return combined_text
def _is_sibling_section(self, element_id: str, current_section: str) -> bool:
"""Check if element ID represents a sibling section."""
if not element_id:
return False
# Check if this looks like another item at the same level
if 'item' in current_section.lower() and 'item' in element_id.lower():
current_item = re.search(r'item\s*(\d+)', current_section, re.IGNORECASE)
other_item = re.search(r'item[\s_]*(\d+)', element_id, re.IGNORECASE)
if current_item and other_item:
return current_item.group(1) != other_item.group(1)
return False
def _extract_element_text(self, element) -> str:
"""Extract clean text from an HTML element."""
# This would integrate with your existing text extraction logic
# For now, simple text extraction
return element.text_content() or ""
def _clean_section_text(self, text: str) -> str:
"""Clean extracted section text."""
# Apply the same cleaning as the main document
from edgar.documents.utils.anchor_cache import filter_with_cached_patterns
# Remove excessive whitespace
text = re.sub(r'\n\s*\n\s*\n', '\n\n', text)
# Filter navigation links
html_content = getattr(self.document.metadata, 'original_html', None)
if html_content:
text = filter_with_cached_patterns(text, html_content)
return text.strip()
def _extract_section_fallback(self, section_name: str, clean: bool) -> Optional[str]:
"""
Fallback section extraction using document nodes.
This is used when HTML-based extraction fails.
"""
# Search through document sections
for name, section in self.document.sections.items():
if section_name.lower() in name.lower():
return section.text(clean=clean)
return None
def get_section_info(self, section_name: str) -> Optional[Dict]:
"""
Get detailed information about a section.
Args:
section_name: Section name to look up
Returns:
Dict with section metadata
"""
normalized_name = self._normalize_section_name(section_name)
if normalized_name not in self.section_boundaries:
return None
boundary = self.section_boundaries[normalized_name]
return {
'name': boundary.name,
'anchor_id': boundary.anchor_id,
'available': True,
'estimated_length': None, # Could calculate if needed
'subsections': self._get_subsections(normalized_name)
}
def _get_subsections(self, parent_section: str) -> List[str]:
"""
Get subsections of a parent section.
For example:
- "Item 1" has subsections "Item 1A", "Item 1B" (valid)
- "Item 1" does NOT have subsection "Item 10" (invalid - different item)
"""
subsections = []
# Look for sections that start with the parent name
for section_name in self.section_boundaries:
if section_name == parent_section:
continue
if section_name.startswith(parent_section):
# Check if this is a true subsection (e.g., Item 1A)
# vs a different section that happens to start with same prefix (e.g., Item 10)
remainder = section_name[len(parent_section):]
# Valid subsection patterns:
# - "Item 1A" (remainder: "A") - letter suffix
# - "Item 1 - Business" (remainder: " - Business") - has separator
# Invalid patterns:
# - "Item 10" (remainder: "0") - digit continues the number
if remainder and remainder[0].isalpha():
# Letter suffix like "A", "B" - valid subsection
subsections.append(section_name)
elif remainder and remainder[0] in [' ', '-', '.', ':']:
# Has separator - could be descriptive title
subsections.append(section_name)
# If remainder starts with digit, it's NOT a subsection (e.g., "Item 10")
return sorted(subsections)
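
A sketch driving SECSectionExtractor directly, again assuming document.metadata.original_html is available:

# Sketch only; section names follow the normalized "Item 1A" / "Part I" forms.
extractor = SECSectionExtractor(document)
for name in extractor.get_available_sections():
    info = extractor.get_section_info(name)
    print(name, 'subsections:', info['subsections'])
mda_text = extractor.get_section_text('Item 7', include_subsections=True)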