Initial commit

This commit is contained in:
kdusek
2025-12-09 12:13:01 +01:00
commit 8e654ed209
13332 changed files with 2695056 additions and 0 deletions

View File

@@ -0,0 +1,4 @@
from edgar.sgml.filing_summary import FilingSummary, Report, Reports, Statements
from edgar.sgml.sgml_common import FilingSGML, iter_documents, list_documents
from edgar.sgml.sgml_header import Filer, FilingHeader, FilingMetadata, Issuer, ReportingOwner
from edgar.sgml.sgml_parser import SGMLDocument

View File

@@ -0,0 +1,666 @@
import re
from dataclasses import dataclass
from enum import Enum
from functools import lru_cache
from typing import Dict, List, Optional, Set, Tuple, Union
import pyarrow as pa
import pyarrow.compute as pc
from bs4 import BeautifulSoup
from rich import box
from rich.console import Group
from rich.panel import Panel
from rich.table import Table
from rich.text import Text
from edgar.core import DataPager, PagingState, log, strtobool
from edgar.files.html import Document
from edgar.richtools import print_rich, repr_rich, rich_to_text
from edgar.xmltools import child_text
__all__ = ['Report', 'Reports', 'File', 'FilingSummary']
class Reports:
    """
    A pageable collection of the reports listed in a filing summary.

    Wraps a pyarrow Table (one row per report) and provides iteration,
    lookup by name/category/position, paging via DataPager, and rich
    console rendering.
    """

    def __init__(self,
                 data: pa.Table,
                 filing_summary: Optional['FilingSummary'] = None,
                 original_state: Optional[PagingState] = None,
                 title: Optional[str] = "Reports"):
        self.data: pa.Table = data
        self.data_pager = DataPager(data)
        self._original_state = original_state or PagingState(0, len(self.data))
        self.n = 0  # cursor for the iterator protocol
        self._filing_summary = filing_summary  # back-reference set by FilingSummary
        self.title = title

    def _report_from_row(self, table: pa.Table, row: int) -> 'Report':
        """Build a Report from a single row of a reports table."""
        return Report(
            instance=table['instance'][row].as_py(),
            is_default=table['IsDefault'][row].as_py(),
            has_embedded_reports=table['HasEmbeddedReports'][row].as_py(),
            html_file_name=table['HtmlFileName'][row].as_py(),
            long_name=table['LongName'][row].as_py(),
            report_type=table['ReportType'][row].as_py(),
            role=table['Role'][row].as_py(),
            parent_role=table['ParentRole'][row].as_py(),
            short_name=table['ShortName'][row].as_py(),
            menu_category=table['MenuCategory'][row].as_py(),
            position=table['Position'][row].as_py(),
            reports=self
        )

    def __len__(self):
        return len(self.data)

    def __iter__(self):
        self.n = 0
        return self

    def __next__(self):
        if self.n >= len(self.data):
            raise StopIteration
        report = self._report_from_row(self.data, self.n)
        self.n += 1
        return report

    def current(self):
        """Display the current page ... which is the default for this filings object"""
        return self

    def next(self):
        """Show the next page, or None when already at the end of the data."""
        data_page = self.data_pager.next()
        if data_page is None:
            log.warning("End of data .. use prev() \u2190 ")
            return None
        start_index, _ = self.data_pager._current_range
        paging_state = PagingState(page_start=start_index, num_records=len(self))
        # Fix: carry the filing summary link and the title over to the new page
        # so reports from a paged collection can still resolve their content.
        return Reports(data_page,
                       filing_summary=self._filing_summary,
                       original_state=paging_state,
                       title=self.title)

    def previous(self):
        """Show the previous page, or None when already at the start."""
        data_page = self.data_pager.previous()
        if data_page is None:
            log.warning(" No previous data .. use next() \u2192 ")
            return None
        start_index, _ = self.data_pager._current_range
        paging_state = PagingState(page_start=start_index, num_records=len(self))
        # Fix: carry the filing summary link and the title over to the new page
        return Reports(data_page,
                       filing_summary=self._filing_summary,
                       original_state=paging_state,
                       title=self.title)

    def to_pandas(self):
        """Convert the underlying pyarrow table to a pandas DataFrame."""
        return self.data.to_pandas()

    def __getitem__(self, item):
        # Lookup is by the report's Position value, not by row index.
        # Returns None when no report has that position.
        record = self.filter("Position", str(item))
        if record:
            return record

    def create_from_record(self, data: pa.Table):
        """Build a Report from the first row of the given table."""
        return self._report_from_row(data, 0)

    @property
    def long_names(self) -> List[str]:
        """All LongName values in this collection."""
        return self.data['LongName'].to_pylist()

    @property
    def short_names(self) -> List[str]:
        """All ShortName values in this collection."""
        return self.data['ShortName'].to_pylist()

    def get_by_category(self, category: str):
        """
        Get the reports whose MenuCategory equals `category`.
        """
        data = self.data.filter(pc.equal(self.data['MenuCategory'], category))
        return Reports(data, filing_summary=self._filing_summary, title=category)

    @property
    def statements(self) -> Optional['Statements']:
        """
        Get all reports in the Statements category, or None when there are none.
        """
        reports = self.get_by_category('Statements')
        if reports:
            return Statements(reports)

    def get_by_filename(self, file_name: str):
        """
        Get a single report by file name. Returns None unless exactly one matches.
        """
        data = self.data.filter(pc.equal(self.data['HtmlFileName'], file_name))
        if len(data) == 1:
            return self.create_from_record(data)

    def get_by_short_name(self, short_name: str):
        """
        Get a single report by short name. Returns None unless exactly one matches.
        """
        data = self.data.filter(pc.equal(self.data['ShortName'], short_name))
        if len(data) == 1:
            return self.create_from_record(data)

    def filter(self, column: Union[str, List[str]], value: Union[str, List[str]]):
        """
        Filter reports where each named column's value is in the value set.

        Returns a single Report when exactly one row matches, otherwise
        a new Reports collection.
        """
        columns = [column] if isinstance(column, str) else column
        values = [value] if isinstance(value, str) else value
        # Convert the value list to a pyarrow array for proper comparison
        value_set = pa.array(values)
        # AND together an is_in mask per requested column
        mask = pc.is_in(self.data[columns[0]], value_set)
        for col in columns[1:]:
            mask = pc.and_(mask, pc.is_in(self.data[col], value_set))
        data = self.data.filter(mask)
        if len(data) == 1:
            return self.create_from_record(data)
        # Fix: keep the filing summary link so reports resolved from the
        # filtered collection can still load their content.
        return Reports(data, filing_summary=self._filing_summary)

    def __rich__(self):
        table = Table(
            show_header=True,
            header_style="dim",
            show_lines=True,
            box=box.SIMPLE,
            border_style="bold grey54",
            row_styles=["", "bold"]
        )
        table.add_column("#", style="dim", justify="left")
        table.add_column("Report", style="bold", width=60)
        table.add_column("Category", width=12)
        table.add_column("File", justify="left")
        # Iterate through rows in the current page
        for i in range(len(self)):
            position = self.data['Position'][i].as_py()
            table.add_row(
                str(position) if position else "-",
                self.data['ShortName'][i].as_py(),
                self.data['MenuCategory'][i].as_py() or "",
                self.data['HtmlFileName'][i].as_py() or ""
            )
        return Panel(table, title=self.title, expand=False)

    def __repr__(self):
        return repr_rich(self.__rich__())
class Report:
    """
    A single report (statement, note, or detail page) from a filing summary.

    A Report is usually attached to its owning Reports collection, which in
    turn links back to the FilingSummary and its FilingSGML so the report's
    HTML content can be resolved on demand.
    """

    def __init__(self,
                 instance: Optional[str],
                 is_default: Optional[bool],
                 has_embedded_reports: Optional[bool],
                 long_name: Optional[str],
                 short_name: Optional[str],
                 menu_category: Optional[str],
                 position: Optional[int],
                 html_file_name: Optional[str],
                 report_type: Optional[str],
                 role: Optional[str],
                 parent_role: Optional[str] = None,
                 reports=None):
        self.instance = instance
        self.is_default = is_default
        self.has_embedded_reports = has_embedded_reports
        self.long_name = long_name
        self.short_name = short_name
        self.menu_category = menu_category
        self.position = position
        self.html_file_name = html_file_name
        self.report_type = report_type
        self.role = role
        self.parent_role = parent_role
        self._reports = reports  # back-reference to the owning Reports collection
        # Per-instance cache for _get_report_table(); a separate flag lets
        # "parsed, but no table found" be cached too.
        self._table_cache = None
        self._table_cached = False

    @property
    def content(self):
        """
        The raw content of the report document, or None when this report is
        not attached to a filing's SGML.
        """
        # Guard the back-reference chain instead of raising AttributeError
        # for detached Report instances.
        if self._reports is None or self._reports._filing_summary is None:
            return None
        sgml = self._reports._filing_summary._filing_sgml
        if sgml:
            return sgml.get_content(self.html_file_name)

    def text(self):
        """
        Get the text content of the report's first table, or None.
        """
        table = self._get_report_table()
        if table:
            return rich_to_text(table.render(500))

    def _get_report_table(self):
        """
        Get the first table in the document, caching the parse per instance.

        Note: the previous @lru_cache keyed on `self` and therefore kept every
        Report alive for the lifetime of the cache (flake8-bugbear B019); a
        plain instance attribute avoids that.
        """
        if not self._table_cached:
            self._table_cached = True
            document = Document.parse(self.content)
            if len(document.tables) == 0:
                log.warning(f"No tables found in {self.html_file_name}")
            else:
                self._table_cache = document.tables[0]
        return self._table_cache

    def view(self):
        """Print the report's first table to the console."""
        table = self._get_report_table()
        if table:
            print_rich(table.render(500))

    def to_dataframe(self):
        """
        Extract the report's financial table as a pandas DataFrame.

        Returns:
            pd.DataFrame: Financial data with periods as columns and line items as index.
                Returns empty DataFrame if no tables found.

        The DataFrame includes metadata attributes:
            - currency: The currency used (e.g., 'USD')
            - units: The units description (e.g., 'thousands')
            - scaling_factor: Numeric scaling factor (e.g., 1000 for thousands)
            - period_type: 'instant' or 'duration' for the time periods
        """
        from edgar.sgml.table_to_dataframe import extract_statement_dataframe
        content = self.content
        if content:
            return extract_statement_dataframe(content)
        import pandas as pd
        return pd.DataFrame()

    def __str__(self):
        return f"Report(short_name={self.short_name}, category={self.menu_category}, file_name={self.html_file_name})"

    def __rich__(self):
        # Guard None fields: rich Text rejects non-str values.
        return Panel(
            Text.assemble(("Report ", "dim"), (self.long_name or "", "bold")),
            subtitle=Text(self.menu_category or "", style='dim italic'),
            expand=False,
            width=400,
            height=4
        )

    def __repr__(self):
        return repr_rich(self.__rich__())
@dataclass
class File:
    """A single input or supplemental file listed in FilingSummary.xml."""
    # Name of the file (text of the <File> element)
    file_name: str
    # Declared document type from the "doctype" attribute, if present
    doc_type: Optional[str]
    # Parsed from the "isDefinitelyFs" attribute (financial statement flag)
    is_definitely_fs: Optional[bool]
    # Parsed from the "isUsgaap" attribute
    is_usgaap: Optional[bool]
    # Value of the "original" attribute, if present
    original: Optional[str]
class FilingSummary:
    """
    Parsed representation of EDGAR's FilingSummary.xml: the filing's reports,
    input/supplemental files, and summary statistics.
    """

    def __init__(self,
                 reports: Reports,
                 short_name_map: Dict[str, Report],
                 category_map: Dict[str, List[Report]],
                 input_files: List[File],
                 supplemental_files: List[File],
                 report_format: Optional[str] = None,
                 context_count: Optional[int] = None,
                 element_count: Optional[int] = None,
                 entity_count: Optional[int] = None,
                 footnotes_reported: Optional[bool] = None,
                 segment_count: Optional[int] = None,
                 scenario_count: Optional[int] = None,
                 tuples_reported: Optional[bool] = None,
                 has_presentation_linkbase: Optional[bool] = None,
                 has_calculation_linkbase: Optional[bool] = None):
        self.reports: Reports = reports
        # Link the reports collection back to this summary
        self.reports._filing_summary = self
        self._short_name_map = short_name_map
        self._category_map = category_map
        self.input_files = input_files
        self.supplemental_files = supplemental_files
        self.report_format = report_format
        self.context_count = context_count
        self.element_count = element_count
        self.entity_count = entity_count
        self.footnotes_reported = footnotes_reported
        self.segment_count = segment_count
        self.scenario_count = scenario_count
        self.tuples_reported = tuples_reported
        self.has_presentation_linkbase = has_presentation_linkbase
        self.has_calculation_linkbase = has_calculation_linkbase
        # Set by FilingSGML after parsing so reports can resolve their content
        self._filing_sgml = None

    @staticmethod
    def _parse_files(container_tag) -> List[File]:
        """Parse the <File> children of an <InputFiles>/<SupplementalFiles> tag."""
        files: List[File] = []
        if container_tag:
            for file_tag in container_tag.find_all('File'):
                files.append(File(
                    file_name=file_tag.text,
                    doc_type=file_tag.get('doctype'),
                    is_definitely_fs=strtobool(file_tag.get('isDefinitelyFs')),
                    is_usgaap=strtobool(file_tag.get('isUsgaap')),
                    original=file_tag.get('original')
                ))
        return files

    @classmethod
    def parse(cls, xml_text: str):
        """Parse the text of FilingSummary.xml into a FilingSummary instance."""
        soup = BeautifulSoup(xml_text, 'xml')
        root = soup.find('FilingSummary')
        # Main fields
        report_format = child_text(root, 'ReportFormat')
        context_count = child_text(root, 'ContextCount')
        element_count = child_text(root, 'ElementCount')
        entity_count = child_text(root, 'EntityCount')
        footnotes_reported = strtobool(child_text(root, 'FootnotesReported'))
        segment_count = child_text(root, 'SegmentCount')
        scenario_count = child_text(root, 'ScenarioCount')
        tuples_reported = strtobool(child_text(root, 'TuplesReported'))
        has_presentation_linkbase = strtobool(child_text(root, 'HasPresentationLinkbase'))
        has_calculation_linkbase = strtobool(child_text(root, 'HasCalculationLinkbase'))
        # Reports: extract each field from the XML once and reuse the record
        # for both the pyarrow table and the Report object (previously every
        # field was parsed from the tag twice).
        reports: List[Report] = []
        short_name_map: Dict[str, Report] = {}
        category_map: Dict[str, List[Report]] = {}
        report_records = []
        for report_tag in root.find_all("Report"):
            record = {
                'instance': report_tag.get('instance'),
                'IsDefault': strtobool(child_text(report_tag, 'IsDefault')),
                'HasEmbeddedReports': strtobool(child_text(report_tag, 'HasEmbeddedReports')),
                'HtmlFileName': child_text(report_tag, 'HtmlFileName'),
                'LongName': child_text(report_tag, 'LongName'),
                'ReportType': child_text(report_tag, 'ReportType'),
                'Role': child_text(report_tag, 'Role'),
                'ParentRole': child_text(report_tag, 'ParentRole'),
                'ShortName': child_text(report_tag, 'ShortName'),
                'MenuCategory': child_text(report_tag, 'MenuCategory'),
                'Position': child_text(report_tag, 'Position')
            }
            report = Report(
                instance=record['instance'],
                is_default=record['IsDefault'],
                has_embedded_reports=record['HasEmbeddedReports'],
                html_file_name=record['HtmlFileName'],
                long_name=record['LongName'],
                report_type=record['ReportType'],
                role=record['Role'],
                parent_role=record['ParentRole'],
                short_name=record['ShortName'],
                menu_category=record['MenuCategory'],
                position=record['Position']
            )
            reports.append(report)
            report_records.append(record)
            short_name_map[report.short_name] = report
            category_map.setdefault(report.menu_category, []).append(report)
        # Reports Data
        reports_obj = Reports(data=pa.Table.from_pylist(report_records))
        # Input and supplemental files share the same <File> structure
        input_files = cls._parse_files(root.find('InputFiles'))
        supplemental_files = cls._parse_files(root.find('SupplementalFiles'))
        return cls(report_format=report_format,
                   short_name_map=short_name_map,
                   category_map=category_map,
                   context_count=context_count,
                   element_count=element_count,
                   entity_count=entity_count,
                   footnotes_reported=footnotes_reported,
                   segment_count=segment_count,
                   scenario_count=scenario_count,
                   tuples_reported=tuples_reported,
                   has_presentation_linkbase=has_presentation_linkbase,
                   has_calculation_linkbase=has_calculation_linkbase,
                   reports=reports_obj,
                   input_files=input_files,
                   supplemental_files=supplemental_files)

    def get_report_by_short_name(self, short_name: str) -> Optional[Report]:
        """Get a single report by its short name."""
        return self.reports.get_by_short_name(short_name)

    def get_reports_by_category(self, category: str) -> Reports:
        """Get the reports in the given menu category."""
        return self.reports.get_by_category(category)

    def get_reports_by_filename(self, file_name: str) -> Optional[Report]:
        """Get a single report by its HTML file name."""
        return self.reports.get_by_filename(file_name)

    @property
    def statements(self):
        """The detected financial statements (reports in the 'Statements' category)."""
        reports = self.get_reports_by_category('Statements')
        return Statements(reports)

    @property
    def tables(self):
        """The reports in the 'Tables' category."""
        return self.get_reports_by_category('Tables')

    def __str__(self):
        return f"FilingSummary(report_format={self.report_format})"

    def __rich__(self):
        renderables = [self.reports]
        return Panel(
            Group(*renderables),
            box=box.ROUNDED,
            title="Filing Summary"
        )

    def __repr__(self):
        return repr_rich(self.__rich__())
class StatementType(Enum):
    """Canonical financial-statement categories recognized by StatementMapper."""
    INCOME = "income"
    BALANCE = "balance"
    CASH_FLOW = "cash_flow"
    COMPREHENSIVE_INCOME = "comprehensive_income"
    EQUITY = "equity"
class StatementMapper:
    """
    Maps free-form statement titles to StatementType values using weighted
    regex patterns (weight 3 = strong direct match, 1 = weak hint).
    """

    def __init__(self):
        # Weighted pattern matchers for each statement type
        self.patterns = {
            StatementType.INCOME: [
                # Strong direct match, excluding combined "and comprehensive" titles
                (r'(?i)statement.*of.*(?:operation|income|earning)s?(?!\s+and\s+comprehensive)', 3),
                (r'(?i)(?:operation|income|earning)s?\s+statement', 2),  # alternative format
                (r'(?i)profit.*loss', 1),  # P&L reference
            ],
            StatementType.BALANCE: [
                (r'(?i)balance\s*sheet', 3),  # very consistent naming
                (r'(?i)statement.*of.*financial\s+position', 2),  # alternative format
            ],
            StatementType.CASH_FLOW: [
                (r'(?i)statement.*of.*cash\s*flows?', 3),  # primary pattern
                (r'(?i)cash\s*flows?\s*statement', 2),  # alternative format
            ],
            StatementType.COMPREHENSIVE_INCOME: [
                (r'(?i)statement.*of.*comprehensive\s*(?:income|loss)', 3),  # primary pattern
                (r'(?i)comprehensive\s*(?:income|loss)\s*statement', 2),  # alternative format
            ],
            StatementType.EQUITY: [
                (r'(?i)statement.*of.*(?:stockholders|shareholders|owners)[\'\s]*equity', 3),
                (r'(?i)statement.*of.*changes\s+in\s+(?:stockholders|shareholders|owners)[\'\s]*equity', 3),
                (r'(?i)statement.*of.*equity', 2),  # generic equity
            ]
        }
        # Titles that denote two statement types at once
        self.combined_patterns = [
            (r'(?i)statement.*of.*operations?\s+and\s+comprehensive\s*(?:income|loss)',
             {StatementType.INCOME, StatementType.COMPREHENSIVE_INCOME}),
        ]

    def normalize_statement(self, statement: str) -> str:
        """Normalize a title: trim, uppercase and drop CONSOLIDATED/COMBINED prefixes."""
        normalized = statement.strip().upper()
        for prefix in ('CONSOLIDATED', 'COMBINED'):
            normalized = normalized.removeprefix(prefix).strip()
        return normalized

    def match_statement(self, statement: str) -> Dict[StatementType, float]:
        """
        Score a statement title against every statement type.

        Returns {StatementType: confidence in 0..1}. Combined-statement titles
        short-circuit with a score of 1.0 for each constituent type.
        """
        normalized = self.normalize_statement(statement)
        # Combined statements take precedence
        for pattern, types in self.combined_patterns:
            if re.search(pattern, normalized):
                return {stmt_type: 1.0 for stmt_type in types}
        scores: Dict[StatementType, float] = {}
        for stmt_type, weighted_patterns in self.patterns.items():
            hits = [weight for pattern, weight in weighted_patterns
                    if re.search(pattern, normalized)]
            if hits:
                # Normalize the best pattern weight (max 3) into the 0-1 range
                scores[stmt_type] = max(hits) / 3.0
        return scores

    def classify_statement(self, statement: str, threshold: float = 0.5) -> Set[StatementType]:
        """Return the statement types whose score reaches the threshold."""
        return {stmt_type
                for stmt_type, score in self.match_statement(statement).items()
                if score >= threshold}

    def get_best_matches(self, statements: List[str]) -> Dict[StatementType, str]:
        """
        For each statement type, the best-scoring title among `statements`.
        """
        best: Dict[StatementType, Tuple[str, float]] = {}
        for title in statements:
            for stmt_type, score in self.match_statement(title).items():
                current = best.get(stmt_type)
                if current is None or score > current[1]:
                    best[stmt_type] = (title, score)
        return {stmt_type: title for stmt_type, (title, _) in best.items()}
class Statements:
    """
    The financial statements detected among a filing summary's
    'Statements' reports.

    Each report title is scored with StatementMapper, and the best-scoring
    title per statement type is remembered for later lookup.
    """

    def __init__(self, statement_reports: Reports):
        self._reports = statement_reports
        self.statements = [report.short_name for report in self._reports]
        self.mapper = StatementMapper()
        self._matches: Dict[StatementType, Tuple[str, float]] = {}
        self._initialize_matches()

    def _initialize_matches(self) -> None:
        """Remember, for every statement type, the best-scoring title."""
        for title in self.statements:
            for stmt_type, score in self.mapper.match_statement(title).items():
                best = self._matches.get(stmt_type)
                if best is None or score > best[1]:
                    self._matches[stmt_type] = (title, score)

    def _get_statement(self, stmt_type: StatementType, threshold: float = 0.5) -> Optional[Report]:
        """Resolve a statement type to its Report when the match clears the threshold."""
        match = self._matches.get(stmt_type)
        if match is None:
            return None
        title, score = match
        if score < threshold:
            return None
        return self._reports.get_by_short_name(title)

    def __getitem__(self, item):
        return self._reports[item]

    @property
    def balance_sheet(self) -> Optional[Report]:
        """Returns the detected balance sheet statement."""
        return self._get_statement(StatementType.BALANCE)

    @property
    def income_statement(self) -> Optional[Report]:
        """Returns the detected income statement."""
        return self._get_statement(StatementType.INCOME)

    @property
    def cash_flow_statement(self) -> Optional[Report]:
        """Returns the detected cash flow statement."""
        return self._get_statement(StatementType.CASH_FLOW)

    @property
    def comprehensive_income_statement(self) -> Optional[Report]:
        """Returns the detected comprehensive income statement."""
        return self._get_statement(StatementType.COMPREHENSIVE_INCOME)

    @property
    def equity_statement(self) -> Optional[Report]:
        """Returns the detected equity statement."""
        return self._get_statement(StatementType.EQUITY)

    @property
    def detected_statements(self) -> Dict[StatementType, str]:
        """Statement titles whose best match scored at or above 0.5."""
        detected: Dict[StatementType, str] = {}
        for stmt_type, (title, score) in self._matches.items():
            if score >= 0.5:
                detected[stmt_type] = title
        return detected

    def __rich__(self):
        return self._reports

    def __repr__(self):
        return repr_rich(self.__rich__())

View File

@@ -0,0 +1,482 @@
import re
import zipfile
from collections import defaultdict
from functools import cached_property
from pathlib import Path
from typing import TYPE_CHECKING, DefaultDict, Dict, Iterator, List, Optional, Tuple, Union
if TYPE_CHECKING:
from edgar._filings import Filing
from edgar.attachments import Attachment, Attachments, get_document_type
from edgar.httprequests import stream_with_retry
from edgar.sgml.filing_summary import FilingSummary
from edgar.sgml.sgml_header import FilingHeader
from edgar.sgml.sgml_parser import SGMLDocument, SGMLFormatType, SGMLParser
from edgar.sgml.tools import is_xml
__all__ = ['iter_documents', 'list_documents', 'FilingSGML', 'FilingHeader']
def parse_document(document_str: str) -> SGMLDocument:
    """
    Parse a single SGML document section, keeping the raw content intact.

    The TYPE/SEQUENCE/FILENAME/DESCRIPTION fields are each extracted with the
    same one-line pattern; missing fields become empty strings.
    """
    def field(tag: str) -> str:
        # Value runs to the next '<' or end of line; absent tags yield ""
        match = re.search(rf'<{tag}>([^<\n]+)', document_str)
        return match.group(1).strip() if match else ""

    return SGMLDocument(
        type=field('TYPE'),
        sequence=field('SEQUENCE'),
        filename=field('FILENAME'),
        description=field('DESCRIPTION'),
        raw_content=document_str
    )
def read_content(source: Union[str, Path]) -> Iterator[str]:
    """
    Read content from either a URL or file path, yielding lines as strings.
    Automatically handles gzip-compressed files with .gz extension.

    Args:
        source: Either a URL string or a file path

    Yields:
        str: Lines of content from the source

    Raises:
        TooManyRequestsError: If the server returns a 429 response
        FileNotFoundError: If the file path doesn't exist
        gzip.BadGzipFile: If the file is not a valid gzip file
    """
    if isinstance(source, str) and (source.startswith('http://') or source.startswith('https://')):
        # Handle URL using stream_with_retry
        for response in stream_with_retry(source):
            # NOTE(review): iter_lines() may yield bytes depending on
            # stream_with_retry's response type; `line + "\n"` assumes str,
            # while read_content_as_string downstream decodes bytes — confirm
            # which actually applies here.
            for line in response.iter_lines():
                if line is not None:
                    yield line + "\n"
    else:
        # Handle file path
        path = Path(source)
        # Check if the file is gzip-compressed (by extension)
        if str(path).endswith('.gz'):
            import gzip
            with gzip.open(path, 'rt', encoding='utf-8', errors='replace') as file:
                yield from file
        else:
            # Regular file handling
            with path.open('r', encoding='utf-8', errors='replace') as file:
                yield from file
def read_content_as_string(source: Union[str, Path]) -> str:
    """
    Read the full content of a URL or file path into a single string.

    Delegates to read_content() and decodes any bytes chunks as UTF-8
    (invalid sequences replaced).

    Args:
        source: Either a URL string or a file path

    Returns:
        str: Full content as string

    Raises:
        TooManyRequestsError: If the server returns a 429 response
        FileNotFoundError: If file path doesn't exist
    """
    decoded = (
        chunk.decode('utf-8', errors='replace') if isinstance(chunk, bytes) else chunk
        for chunk in read_content(source)
    )
    return ''.join(decoded)
def iter_documents(source: Union[str, Path]) -> Iterator[SGMLDocument]:
    """
    Stream SGML documents from either a URL or file path, yielding parsed documents.

    Args:
        source: Either a URL string or a file path (string or Path object)

    Yields:
        SGMLDocument objects containing the parsed content

    Raises:
        ValueError: If the source is invalid
        ConnectionError: If URL retrieval fails after retries
        FileNotFoundError: If the file path doesn't exist
    """
    document_re = re.compile(r'<DOCUMENT>([\s\S]*?)</DOCUMENT>')
    try:
        full_text = ''.join(read_content(source))
        for match in document_re.finditer(full_text):
            parsed = parse_document(match.group(1))
            if parsed:
                yield parsed
    except (ValueError, ConnectionError, FileNotFoundError) as e:
        # Re-raise the same exception type with the source in the message
        raise type(e)(f"Error processing source {source}: {str(e)}") from e
def list_documents(source: Union[str, Path]) -> list[SGMLDocument]:
    """
    Parse every SGML document from a source and return them as a list.

    Args:
        source: Either a URL string or a file path

    Returns:
        List of SGMLDocument objects
    """
    return [*iter_documents(source)]
def parse_file(source: Union[str, Path]) -> list[SGMLDocument]:
    """
    Parse all documents from a source into a list.

    Kept as a backward-compatible alias of list_documents(); the two
    functions previously carried identical duplicate implementations.

    Args:
        source: Either a URL string or a file path

    Returns:
        List of SGMLDocument objects
    """
    return list_documents(source)
def parse_submission_text(content: str) -> Tuple[FilingHeader, DefaultDict[str, List[SGMLDocument]]]:
    """
    Parses the raw submission text and returns the filing header along with
    a dictionary mapping document sequence numbers to lists of SGMLDocument objects.

    Args:
        content (str): The raw text content of the submission.

    Returns:
        Tuple[FilingHeader, DefaultDict[str, List[SGMLDocument]]]:
            A tuple where the first element is the FilingHeader object representing
            the parsed header information, and the second element is a defaultdict
            mapping document sequence identifiers to their corresponding list of SGMLDocument objects.

    Details:
        - For submissions with the SGMLFormatType.SUBMISSION format, the function uses
          the pre-parsed filer data to create the FilingHeader.
        - For SEC-DOCUMENT formatted content, the header is initially parsed from the SGML text;
          if this fails, the header is parsed again with preprocessing enabled.
        - The function creates an SGMLDocument for each parsed document and groups them by
          their sequence identifier.

    Raises:
        Exception: Any exceptions raised during header parsing (handled internally
            by attempting to preprocess the header in case of failure).
    """
    # Create parser and get structure including header and documents
    parser = SGMLParser()
    parsed_data = parser.parse(content)
    # Create FilingHeader using already parsed data
    if parsed_data['format'] == SGMLFormatType.SUBMISSION:
        # For submission format, we already have parsed filer data
        header = FilingHeader.parse_submission_format_header(parsed_data=parsed_data)
    else:
        # For SEC-DOCUMENT format, pass the header text to the
        # specialized header parser since we need additional processing
        try:
            header = FilingHeader.parse_from_sgml_text(parsed_data['header'])
        except Exception:
            # Fallback: retry with preprocessing enabled for malformed headers
            header = FilingHeader.parse_from_sgml_text(parsed_data['header'], preprocess=True)
    # Group documents by their sequence identifier
    documents = defaultdict(list)
    for doc_data in parsed_data['documents']:
        doc = SGMLDocument.from_parsed_data(doc_data)
        documents[doc.sequence].append(doc)
    return header, documents
class FilingSGML:
"""
Main class that parses and provides access to both the header and documents
from an SGML filing.
"""
__slots__ = ('header', '_documents_by_sequence', '__dict__') # Use slots for memory efficiency
def __init__(self, header: FilingHeader, documents: defaultdict[str, List[SGMLDocument]]):
    """
    Initialize FilingSGML with parsed header and documents.

    Args:
        header (FilingHeader): Parsed header information
        documents (defaultdict[str, List[SGMLDocument]]): Parsed documents keyed by sequence
    """
    self.header: FilingHeader = header
    self._documents_by_sequence: defaultdict[str, List[SGMLDocument]] = documents
    # Secondary index by filename; if two documents share a filename the
    # later one wins.
    self._documents_by_name: Dict[str, SGMLDocument] = {
        doc.filename: doc for doc_lst in documents.values() for doc in doc_lst
    }
@property
def accession_number(self):
    # EDGAR accession number taken from the parsed header
    return self.header.accession_number

@property
def cik(self):
    # Central Index Key taken from the parsed header
    return self.header.cik

@cached_property
def entity(self):
    # Resolve the CIK to an Entity; import is local to avoid a circular import.
    # Returns None when the header has no CIK.
    from edgar.entity import Entity
    cik = self.cik
    if cik:
        return Entity(cik)

@property
def form(self):
    # Form type declared in the header
    return self.header.form

@property
def filing_date(self):
    # Filing date declared in the header
    return self.header.filing_date

@property
def date_as_of_change(self):
    # "Date as of change" value from the header
    return self.header.date_as_of_change

@property
def period_of_report(self):
    # Reporting period declared in the header
    return self.header.period_of_report

@property
def effective_date(self):
    # Read from the header's metadata mapping; None when absent
    return self.header.filing_metadata.get('EFFECTIVE DATE')
@property
def path(self):
    """
    The root EDGAR archive path of the filing, derived from the CIK and
    the accession number (dashes removed). Falls back to a placeholder
    when no accession number is available.
    """
    accession = self.accession_number
    if not accession:
        return "/<SGML FILE>"
    return f"/Archives/edgar/data/{self.header.cik}/{accession.replace('-', '')}"
def _primary_document_text(self, attachment) -> Optional[str]:
    """
    Decode a primary attachment's content to str, or None when the
    attachment is missing, binary, or empty. Shared by html() and xml(),
    which previously duplicated this logic.
    """
    if attachment and not attachment.is_binary() and not attachment.empty:
        text = self.get_content(attachment.document)
        if isinstance(text, bytes):
            text = text.decode('utf-8')
        return text
    return None

def html(self):
    """Return the text of the primary HTML document, or None if unavailable."""
    return self._primary_document_text(self.attachments.primary_html_document)

def xml(self):
    """Return the text of the primary XML document, or None if unavailable."""
    return self._primary_document_text(self.attachments.primary_xml_document)
def get_content(self, filename: str) -> Optional[str]:
    """
    Return the content of the named document, or None when no document
    with that filename exists in the filing.
    """
    document = self._documents_by_name.get(filename)
    return document.content if document else None
@cached_property
def attachments(self) -> Attachments:
    """
    Get all attachments from the filing.

    Documents in sequence "1" are recorded as primary documents (and also
    listed among document files). Later documents are split between
    document files and data files.
    """
    # NOTE(review): is_datafile is sticky — once an XML document has been
    # seen, every subsequent non-sequence-1 document is routed to the data
    # files bucket. This appears to rely on document ordering within the
    # SGML (data files trailing the documents); confirm that assumption.
    is_datafile = False
    documents, datafiles, primary_files = [], [], []
    # Get the filing summary (may be None when FilingSummary.xml is absent)
    filing_summary = self.filing_summary
    for sequence, document_lst in self._documents_by_sequence.items():
        for document in document_lst:
            attachment = Attachment(
                sequence_number=sequence,
                ixbrl=False,
                path=f"{self.path}/{document.filename}",
                document=document.filename,
                document_type=get_document_type(filename=document.filename, declared_document_type=document.type),
                description=document.description,
                size=None,
                sgml_document=document,
                filing_sgml=self
            )
            # Enrich with the report's short name from the filing summary if available
            if filing_summary:
                report = filing_summary.get_reports_by_filename(document.filename)
                if report:
                    attachment.purpose = report.short_name
            # Sequence "1" marks the primary document
            if sequence == "1":
                primary_files.append(attachment)
                documents.append(attachment)
            else:
                if not is_datafile:
                    is_datafile = is_xml(filename=document.filename)
                if is_datafile:
                    datafiles.append(attachment)
                else:
                    documents.append(attachment)
    return Attachments(document_files=documents, data_files=datafiles, primary_documents=primary_files, sgml=self)
@cached_property
def filing_summary(self):
    """
    Parse and cache FilingSummary.xml when present in this filing;
    returns None when the filing has no FilingSummary.xml document.
    """
    summary_document = self._documents_by_name.get("FilingSummary.xml")
    if summary_document is None:
        return None
    summary = FilingSummary.parse(summary_document.content)
    # Wire up the back-references so reports can resolve their content
    summary.reports._filing_summary = summary
    summary._filing_sgml = self
    return summary
def download(self, path: Union[str, Path], archive: bool = False):
    """
    Download all the attachments to a specified path.

    If the path is a directory, each document is saved with its original
    name inside that directory. If archive is True, the path must name a
    zip file and all documents are written into that archive.

    Args:
        path: str or Path - The destination directory (or zip file name
            when archive=True)
        archive: bool (default False) - If True, save the attachments in a zip file

    Raises:
        ValueError: If path is a directory when archive=True, or is not a
            directory when archive=False
    """
    # Fix: the docstring promises str or Path, but `is_dir()` would raise
    # AttributeError on a plain string — coerce up front.
    path = Path(path)
    if archive:
        if path.is_dir():
            raise ValueError("Path must be a zip file name to create zipfile")
        with zipfile.ZipFile(path, 'w') as zipf:
            # writestr accepts both str and bytes content
            for document in self._documents_by_name.values():
                zipf.writestr(document.filename, document.content)
    else:
        if not path.is_dir():
            raise ValueError("Path must be a directory")
        for document in self._documents_by_name.values():
            file_path = path / document.filename
            content = document.content
            if isinstance(content, bytes):
                file_path.write_bytes(content)
            else:
                file_path.write_text(content, encoding='utf-8')
@property
def primary_documents(self):
    """
    Get the primary documents from the filing.

    Delegates to the attachments collection (documents with sequence "1").
    """
    return self.attachments.primary_documents
@classmethod
def from_source(cls, source: Union[str, Path]) -> "FilingSGML":
    """
    Create FilingSGML instance from either a URL or file path.
    Parses both header and documents.

    Args:
        source: Either a URL string or a file path

    Returns:
        FilingSGML: New instance with parsed header and documents

    Raises:
        ValueError: If header section cannot be found
        IOError: If file cannot be read
    """
    # Read content once so header and documents are parsed from the same text
    content = read_content_as_string(source)
    # Parse header and documents
    header, documents = parse_submission_text(content)
    # Create FilingSGML instance
    return cls(header=header, documents=documents)
@classmethod
def from_text(cls, full_text_submission: str) -> "FilingSGML":
    """
    Build a FilingSGML from a full text submission string.

    Args:
        full_text_submission: String containing full text submission
    Returns:
        FilingSGML: New instance with parsed header and documents
    Raises:
        ValueError: If header section cannot be found
    """
    parsed_header, parsed_documents = parse_submission_text(full_text_submission)
    return cls(header=parsed_header, documents=parsed_documents)
def get_document_by_sequence(self, sequence: str) -> Optional[SGMLDocument]:
    """
    Look up a document by its sequence number (O(1) dict access).

    Returns the first document for that sequence, or None when the
    sequence is unknown or has no documents.
    """
    matches = self._documents_by_sequence.get(sequence)
    return matches[0] if matches else None
def get_document_by_name(self, filename: str) -> Optional[SGMLDocument]:
    """
    Look up a document by filename (O(1) dict access); None if absent.
    """
    by_name = self._documents_by_name
    return by_name.get(filename)
@classmethod
def from_filing(cls, filing: 'Filing') -> 'FilingSGML':
    """Create from a Filing object that provides text_url.

    After parsing, backfills header metadata (accession number,
    zero-padded CIK, form type) from the Filing object when the SGML
    itself lacks those values.
    """
    filing_sgml = cls.from_source(filing.text_url)
    if not filing_sgml.accession_number:
        filing_sgml.header.filing_metadata.update('ACCESSION NUMBER', filing.accession_no)
    if not filing_sgml.header.filing_metadata.get("CIK"):
        # CIKs are conventionally rendered as 10-digit zero-padded strings
        filing_sgml.header.filing_metadata.update('CIK', str(filing.cik).zfill(10))
    if not filing_sgml.header.form:
        filing_sgml.header.filing_metadata.update("CONFORMED SUBMISSION TYPE", filing.form)
    return filing_sgml
def __str__(self) -> str:
    """String representation with basic filing info."""
    return (
        f"FilingSGML(accession={self.header.accession_number}, "
        f"document_count={len(self._documents_by_name)})"
    )
def __repr__(self) -> str:
    # Reuse __str__ so repr and str stay in sync
    return str(self)
def get_document_sequences(self) -> List[str]:
    """
    Return every document sequence key.

    Order is the dict's insertion order; no sorting is applied.
    """
    return [*self._documents_by_sequence]
def get_all_document_types(self) -> List[str]:
    """
    Return the unique document types present in this filing.

    A set is used for deduplication; result order is unspecified.
    """
    unique_types = set()
    for doc in self._documents_by_sequence.values():
        unique_types.add(doc.type)
    return list(unique_types)
def get_document_count(self) -> int:
    """Get total number of documents."""
    # The sequence index holds one entry per document
    return len(self._documents_by_sequence)

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,593 @@
import re
import warnings
from dataclasses import dataclass
from enum import Enum
from io import BytesIO
from typing import Iterator, Optional
from edgar.core import has_html_content
from edgar.sgml.tools import get_content_between_tags
from edgar.vendored import uu
__all__ = ['SGMLParser', 'SGMLFormatType', 'SGMLDocument', 'SECIdentityError', 'SECFilingNotFoundError', 'SECHTMLResponseError']
class SECIdentityError(Exception):
    """Raised when SEC rejects request due to invalid or missing EDGAR_IDENTITY.

    Corresponds to the SEC's "Undeclared Automated Tool" response page.
    """
    pass
class SECFilingNotFoundError(Exception):
    """Raised when SEC returns error for non-existent filing.

    Covers both S3 NoSuchKey XML errors and generic 404/Not Found pages.
    """
    pass
class SECHTMLResponseError(Exception):
    """Raised when SEC returns HTML content instead of expected SGML.

    Fallback error when the HTML/XML response matches no more specific case.
    """
    pass
class SGMLFormatType(Enum):
    # The two layouts EDGAR full-text submissions come in
    SEC_DOCUMENT = "sec_document"  # <SEC-DOCUMENT>...<SEC-HEADER> style
    SUBMISSION = "submission"  # <SUBMISSION>...<FILER> style
@dataclass
class SGMLDocument:
    """A single <DOCUMENT> section of an SGML filing, with lazy content decoding."""
    type: str
    sequence: str
    filename: str
    description: str
    raw_content: str = ""

    @classmethod
    def from_parsed_data(cls, data: dict) -> 'SGMLDocument':
        """Create document from parser output"""
        return cls(
            type=data['type'],
            sequence=data['sequence'],
            filename=data['filename'],
            description=data['description'],
            raw_content=data['content']
        )

    @property
    def content(self):
        """Content between the outermost known tags.

        UU-encoded payloads (starting with "begin") are decoded to bytes;
        everything else is returned as the extracted string.
        """
        raw_content = get_content_between_tags(self.raw_content)
        if raw_content and raw_content.startswith("begin"):
            # Suppress the binascii warning only for this decode. The
            # previous warnings.filterwarnings('ignore') disabled ALL
            # warnings for the rest of the process as a side effect.
            with warnings.catch_warnings():
                warnings.simplefilter('ignore')
                input_stream = BytesIO(raw_content.encode("utf-8"))
                output_stream = BytesIO()
                # Decode the UU content
                uu.decode(input_stream, output_stream, quiet=True)
                return output_stream.getvalue()
        return raw_content

    def __str__(self):
        return f"Document(type={self.type}, sequence={self.sequence}, filename={self.filename}, description={self.description})"

    def text(self) -> str:
        """Extract content between <TEXT> tags."""
        match = re.search(r'<TEXT>([\s\S]*?)</TEXT>', self.raw_content, re.DOTALL | re.IGNORECASE)
        return match.group(1).strip() if match else ""

    def xml(self) -> Optional[str]:
        """Extract content between <XML> tags if present."""
        match = re.search(r'<XML>([\s\S]*?)</XML>', self.raw_content, re.DOTALL | re.IGNORECASE)
        return match.group(1).strip() if match else None

    def html(self) -> Optional[str]:
        """Extract content between <HTML> tags if present."""
        match = re.search(r'<HTML>([\s\S]*?)</HTML>', self.raw_content, re.DOTALL | re.IGNORECASE)
        return match.group(1).strip() if match else None

    def xbrl(self) -> Optional[str]:
        """Extract content between <XBRL> tags if present."""
        match = re.search(r'<XBRL>([\s\S]*?)</XBRL>', self.raw_content, re.DOTALL | re.IGNORECASE)
        return match.group(1).strip() if match else None

    def get_content_type(self) -> str:
        """
        Determine the primary content type of the document.
        Returns: 'xml', 'html', 'xbrl', or 'text'
        """
        if self.xml():
            return 'xml'
        elif self.html():
            return 'html'
        elif self.xbrl():
            return 'xbrl'
        return 'text'
def _raise_sec_html_error(content: str):
    """
    Inspect HTML/XML error content returned by the SEC and raise the most
    specific exception that matches it.

    Args:
        content: HTML or XML content received from SEC
    Raises:
        SECIdentityError: For identity-related errors
        SECFilingNotFoundError: For missing filing errors
        SECHTMLResponseError: For other HTML/XML responses
    """
    # Identity / user-agent rejection page
    if "Your Request Originates from an Undeclared Automated Tool" in content:
        raise SECIdentityError(
            "SEC rejected request due to invalid or missing EDGAR_IDENTITY. "
            "Please set a valid identity using set_identity('Your Name your.email@domain.com'). "
            "See https://www.sec.gov/os/accessing-edgar-data"
        )
    # AWS S3 NoSuchKey error (XML format)
    s3_missing_key = (
        "<Code>NoSuchKey</Code>" in content
        and "<Message>The specified key does not exist.</Message>" in content
    )
    if s3_missing_key:
        raise SECFilingNotFoundError(
            "SEC filing not found - the specified key does not exist in EDGAR archives. "
            "Check that the accession number and filing date are correct."
        )
    # General not-found responses
    if any(marker in content for marker in ("Not Found", "404")):
        raise SECFilingNotFoundError(
            "SEC filing not found. Check that the accession number and filing date are correct."
        )
    # Anything else: generic HTML/XML response error
    raise SECHTMLResponseError(
        "SEC returned HTML or XML content instead of expected SGML filing data. "
        "This may indicate an invalid request or temporary SEC server issue."
    )
class SGMLParser:
    """Facade that detects the SGML layout and dispatches to the matching parser."""

    @staticmethod
    def detect_format(content: str) -> SGMLFormatType:
        """Detect SGML format based on root element"""
        stripped = content.lstrip()
        # Recognize valid SGML layouts before any error-content checks, to
        # avoid false positives when SGML embeds HTML inside <TEXT> sections.
        if stripped.startswith('<SUBMISSION>'):
            return SGMLFormatType.SUBMISSION
        # <IMS-DOCUMENT> covers old filings from the 1990's
        if any(marker in content for marker in ('<SEC-DOCUMENT>', '<IMS-DOCUMENT>')):
            return SGMLFormatType.SEC_DOCUMENT
        if '<DOCUMENT>' in content[:1000]:
            # Very old (1990's) filings can start directly with a document section
            return SGMLFormatType.SEC_DOCUMENT
        # Not valid SGML: classify the error content
        if has_html_content(content):
            _raise_sec_html_error(content)
        if stripped.startswith('<?xml') and '<Error>' in content:
            # XML error payloads such as AWS S3 NoSuchKey responses
            _raise_sec_html_error(content)
        raise ValueError("Unknown SGML format")

    def parse(self, content) -> dict:
        """Main entry point for parsing"""
        if self.detect_format(content) == SGMLFormatType.SUBMISSION:
            return self._parse_submission_format(content)
        return self._parse_sec_document_format(content)

    def _parse_submission_format(self, content):
        return SubmissionFormatParser().parse(content)

    def _parse_sec_document_format(self, content):
        return SecDocumentFormatParser().parse(content)
class SubmissionFormatParser:
    """Stateful parser for <SUBMISSION>-style SGML.

    Builds a nested dict mirroring the tag hierarchy of the header, then
    collects each <DOCUMENT> section verbatim for metadata extraction.
    """

    def __init__(self):
        # Initialize main data structure
        self.data = {
            'format': SGMLFormatType.SUBMISSION,
            'header': '',
            'documents': [],
        }
        # Parser state
        self.current_path = []  # Stack to track current position in hierarchy
        self.header_lines = []  # Collect header lines
        self.in_documents = False
        # Known section tags that can contain nested content
        self.SECTION_TAGS = {
            'FILER',
            'OWNER-DATA',
            'COMPANY-DATA',
            'REPORTING-OWNER',
            'ISSUER',
            'DEPOSITOR',
            'SECURITIZER',
            'UNDERWRITER',
            'ISSUING_ENTITY',
            'FORMER-COMPANY',
            'SUBJECT-COMPANY',
            'FILED-BY',
            'FORMER-NAME',
            'FILING-VALUES',
            'BUSINESS-ADDRESS',
            'MAIL-ADDRESS',
            'CLASS-CONTRACT',
            'SERIES',
            'NEW-SERIES',
            'NEW-CLASSES-CONTRACTS',
            'ACQUIRING-DATA',
            'TARGET-DATA',
            'SERIAL-COMPANY',
            'MERGER',
            'SERIES-AND-CLASSES-CONTRACTS-DATA',
            'NEW-SERIES-AND-CLASSES-CONTRACTS',
            'MERGER-SERIES-AND-CLASSES-CONTRACTS',
            'EXISTING-SERIES-AND-CLASSES-CONTRACTS',
            'RULE',
            'ITEM'
        }
        # Tags that can appear multiple times and should be stored as lists
        self.REPEATABLE_TAGS = {
            'FILER',
            'REPORTING-OWNER',
            'UNDERWRITER',
            'SERIES',
            'CLASS-CONTRACT',
            'FORMER-COMPANY',
            'SUBJECT-COMPANY',
            'ITEM'
        }

    def _get_current_context(self) -> dict:
        """Navigate to current position in data hierarchy."""
        context = self.data
        for path_element in self.current_path:
            tag, index = path_element
            # index is None for singleton sections; an int for repeatable ones
            if index is not None:
                context = context[tag][index]
            else:
                context = context[tag]
        return context

    def _is_unclosed_tag(self, line: str) -> bool:
        """Check if line is an unclosed tag with value (e.g. <ITEMS>value)."""
        line = line.strip()
        if not (line.startswith('<') and '>' in line and not line.startswith('</')):
            return False
        tag_end = line.index('>')
        content_after = line[tag_end + 1:].strip()
        return bool(content_after)

    def _is_section_end(self, line: str) -> bool:
        """Check if line ends a section (a closing </...> tag)."""
        return line.strip().startswith('</')

    def _is_section_start(self, line: str) -> bool:
        """Identifies if a line starts a new nested section.

        Only tags in SECTION_TAGS count; anything else is data or empty.
        """
        line = line.strip()
        if not line.startswith('<') or not line.endswith('>'):
            return False
        tag = line[1:-1]  # Remove < and >
        return tag in self.SECTION_TAGS

    def _is_data_tag(self, line: str) -> bool:
        """Identifies if a line contains a tag with a value."""
        line = line.strip()
        if not line.startswith('<'):
            return False
        parts = line.split('>')
        # Exactly one '>' and non-empty trailing text => <TAG>value
        return len(parts) == 2 and bool(parts[1].strip())

    def _is_empty_tag(self, line: str) -> bool:
        """Identifies if a line is an empty tag (no value, not a known section)."""
        line = line.strip()
        return (line.startswith('<') and
                line.endswith('>') and
                not line.startswith('</') and
                not self._is_section_start(line) and
                not self._is_data_tag(line))

    def _handle_section_start(self, line: str) -> None:
        """Handle start of nested section: push it onto the path stack."""
        tag = line.strip()[1:-1]  # Remove < and >
        current_context = self._get_current_context()
        # Initialize tag in current context if needed
        if tag not in current_context:
            if tag in self.REPEATABLE_TAGS:
                current_context[tag] = []
            else:
                current_context[tag] = {}
        # For repeatable tags, append new dict and track index
        if tag in self.REPEATABLE_TAGS:
            current_context[tag].append({})
            self.current_path.append((tag, len(current_context[tag]) - 1))
        else:
            self.current_path.append((tag, None))

    def _handle_section_end(self, line: str) -> None:
        """Handle end of nested section: pop the path stack.

        Raises:
            ValueError: if the closing tag does not match the open section.
        """
        tag = line.strip()[2:-1]  # Remove </ and >
        # Verify we're closing the correct tag
        current_tag, _ = self.current_path[-1]
        if tag != current_tag:
            raise ValueError(f"Mismatched tags: expected </{current_tag}>, got </{tag}>")
        # Pop the current section from the path
        self.current_path.pop()

    def _handle_data_tag(self, line: str) -> None:
        """Handle tags with values, storing them in the current context."""
        line = line.strip()
        tag_end = line.index('>')
        tag = line[1:tag_end]
        value = line[tag_end + 1:].strip()
        current_context = self._get_current_context()
        # Handle repeated tags: promote scalar to list on second occurrence
        if tag in current_context:
            if not isinstance(current_context[tag], list):
                current_context[tag] = [current_context[tag]]
            current_context[tag].append(value)
        else:
            current_context[tag] = value

    def _handle_empty_tag(self, line: str) -> None:
        """Handle empty tags by storing an empty-string value."""
        tag = line.strip()[1:-1]  # Remove < and >
        current_context = self._get_current_context()
        current_context[tag] = ""

    def _handle_unclosed_tag(self, line: str) -> None:
        """Handle tags like <ITEMS>value (no closing tag on the line)."""
        line = line.strip()
        tag_end = line.index('>')
        tag = line[1:tag_end]
        value = line[tag_end + 1:].strip()
        current_context = self._get_current_context()
        # Same scalar-to-list promotion as _handle_data_tag
        if tag in current_context:
            if not isinstance(current_context[tag], list):
                current_context[tag] = [current_context[tag]]
            current_context[tag].append(value)
        else:
            current_context[tag] = value

    def parse(self, content: str) -> dict:
        """Parse SGML content in SUBMISSION format.

        Returns the accumulated self.data dict: format, raw 'header' text,
        structured header fields, and a 'documents' list.
        """
        document_buffer = None
        for line in content.splitlines():
            # Check for document section
            if '<DOCUMENT>' in line:
                # NOTE(review): the header snapshot is (re)taken at each
                # <DOCUMENT>; a submission with no documents leaves
                # data['header'] empty — confirm this is acceptable.
                self.data['header'] = '\n'.join(self.header_lines)
                self.in_documents = True
                document_buffer = [line]
                continue
            if self.in_documents:
                if '</DOCUMENT>' in line:
                    document_buffer.append(line)
                    doc_content = '\n'.join(document_buffer)
                    doc_data = self._parse_document_section(doc_content)
                    if doc_data:
                        self.data['documents'].append(doc_data)
                    document_buffer = None
                elif document_buffer is not None:
                    document_buffer.append(line)
            else:
                # Header section parsing: keep the raw line, then classify
                # the stripped form through the handlers below.
                self.header_lines.append(line)
                line = line.strip()
                if not line:
                    continue
                if self._is_section_start(line):
                    self._handle_section_start(line)
                elif self._is_section_end(line):
                    self._handle_section_end(line)
                elif self._is_data_tag(line):
                    self._handle_data_tag(line)
                elif self._is_empty_tag(line):
                    self._handle_empty_tag(line)
                elif self._is_unclosed_tag(line):
                    self._handle_unclosed_tag(line)
        return self.data

    def _parse_document_section(self, content: str) -> dict:
        """Parse a single document section into metadata plus raw content."""
        doc_data = {
            'type': '',
            'sequence': '',
            'filename': '',
            'description': '',
            'content': content
        }
        # Extract document metadata (first occurrence of each tag)
        type_match = re.search(r'<TYPE>([^<\n]+)', content)
        if type_match:
            doc_data['type'] = type_match.group(1).strip()
        sequence_match = re.search(r'<SEQUENCE>([^<\n]+)', content)
        if sequence_match:
            doc_data['sequence'] = sequence_match.group(1).strip()
        filename_match = re.search(r'<FILENAME>([^<\n]+)', content)
        if filename_match:
            doc_data['filename'] = filename_match.group(1).strip()
        description_match = re.search(r'<DESCRIPTION>([^<\n]+)', content)
        if description_match:
            doc_data['description'] = description_match.group(1).strip()
        return doc_data
class SecDocumentFormatParser:
    """Parser for <SEC-DOCUMENT> style SGML"""

    def __init__(self):
        self.in_header = False  # True while between <SEC-HEADER>/<IMS-HEADER> tags
        self.data = {
            'format': SGMLFormatType.SEC_DOCUMENT,
            'header': '',
            'documents': [],
            'filer': {}
        }
        # NOTE(review): current_document is never written in this class —
        # possibly vestigial; confirm before removing.
        self.current_document = {}
        self.header_text = []

    def parse(self, content: str) -> dict:
        """Parse SGML content in SEC-DOCUMENT format
        Args:
            content: The full SGML content as string
        Returns:
            dict containing parsed header and documents
        """
        document_buffer = []
        for line in content.splitlines():
            if '<SEC-HEADER>' in line or '<IMS-HEADER>' in line:
                self.in_header = True
                continue
            elif '</SEC-HEADER>' in line or '</IMS-HEADER>' in line:
                self.in_header = False
                self.data['header'] = '\n'.join(self.header_text)
                continue
            if self.in_header:
                # Collect header text
                # NOTE(review): there is no `continue` here, so header lines
                # also fall through to the document buffering below; they are
                # discarded when the first <DOCUMENT> resets the buffer, but
                # confirm that was intended.
                self.header_text.append(line)
            # Handle document sections
            if '<DOCUMENT>' in line:
                document_buffer = []  # Start new document
            elif '</DOCUMENT>' in line and document_buffer:
                # Parse completed document
                doc_content = '\n'.join(document_buffer)
                doc_data = self._parse_document_section(doc_content)
                if doc_data:
                    self.data['documents'].append(doc_data)
                document_buffer = []
            elif document_buffer is not None:  # Currently collecting document content
                document_buffer.append(line)
        return self.data

    def _parse_document_section(self, content: str) -> dict:
        """Parse a single document section
        Args:
            content: Content between <DOCUMENT> tags
        Returns:
            dict with document metadata and content
        """
        doc_data = {
            'type': '',
            'sequence': '',
            'filename': '',
            'description': '',
            'content': content
        }
        # Extract document metadata using regex (first occurrence of each tag)
        type_match = re.search(r'<TYPE>([^<\n]+)', content)
        if type_match:
            doc_data['type'] = type_match.group(1).strip()
        sequence_match = re.search(r'<SEQUENCE>([^<\n]+)', content)
        if sequence_match:
            doc_data['sequence'] = sequence_match.group(1).strip()
        filename_match = re.search(r'<FILENAME>([^<\n]+)', content)
        if filename_match:
            doc_data['filename'] = filename_match.group(1).strip()
        description_match = re.search(r'<DESCRIPTION>([^<\n]+)', content)
        if description_match:
            doc_data['description'] = description_match.group(1).strip()
        return doc_data
def list_documents(content: str) -> list[SGMLDocument]:
    """
    Parse every SGML document section in *content* into a list.

    Args:
        content: The content string to parse
    Returns:
        List of SGMLDocument objects
    """
    documents = list(iter_documents(content))
    return documents
def iter_documents(content: str) -> Iterator[SGMLDocument]:
    """
    Yield an SGMLDocument for each <DOCUMENT>...</DOCUMENT> section found.

    Args:
        content: The content string to parse
    Yields:
        SGMLDocument objects containing the parsed content
    Raises:
        ValueError: If the source is invalid
        ConnectionError: If URL retrieval fails after retries
        FileNotFoundError: If the file path doesn't exist
    """
    for section in re.finditer(r'<DOCUMENT>([\s\S]*?)</DOCUMENT>', content):
        parsed = parse_document(section.group(1))
        if parsed:
            yield parsed
def parse_document(document_str: str) -> SGMLDocument:
    """
    Parse a single SGML document section, maintaining raw content.
    """
    def _field(tag: str) -> str:
        # First occurrence of <TAG>value on a line, stripped; "" if absent
        found = re.search(rf'<{tag}>([^<\n]+)', document_str)
        return found.group(1).strip() if found else ""

    return SGMLDocument(
        type=_field('TYPE'),
        sequence=_field('SEQUENCE'),
        filename=_field('FILENAME'),
        description=_field('DESCRIPTION'),
        raw_content=document_str,
    )

View File

@@ -0,0 +1,349 @@
"""
Module for converting HTML tables from filing reports to pandas DataFrames.
This provides an alternative to XBRL parsing by extracting data directly from
company-formatted HTML tables.
"""
import re
from dataclasses import dataclass
from typing import Optional, Union
import pandas as pd
from edgar.files.html import Document, TableNode
from edgar.files.tables import ProcessedTable
@dataclass
class TableMetadata:
    """Metadata extracted from table headers and content"""
    currency: Optional[str] = None  # matched currency token, e.g. '$' or 'USD'
    units: Optional[str] = None  # 'thousands' | 'millions' | 'billions'
    scaling_factor: Optional[int] = None  # multiplier implied by the units
    period_type: Optional[str] = None  # 'instant' or 'duration'
class FinancialTableExtractor:
    """Extract financial tables from HTML reports as pandas DataFrames"""
    # Common patterns for financial data
    # More comprehensive currency patterns
    CURRENCY_PATTERN = re.compile(
        r'\$|USD|EUR|GBP|JPY|CNY|CAD|AUD|CHF|'
        r'£|€|¥|₹|'  # Currency symbols
        r'\bDollars?\b|\bPounds?\b|\bEuros?\b|\bYen\b',
        re.IGNORECASE
    )
    # More flexible units pattern
    UNITS_PATTERN = re.compile(
        r'(?:in\s+)?(?:thousands?|millions?|billions?|000s?|000,000s?|mln|mil|bn)',
        re.IGNORECASE
    )
    # e.g. "1,000 = $1" style scaling statements
    SCALING_PATTERN = re.compile(r'(\d+(?:,\d{3})*)\s*=\s*\$?1')
    # More flexible date patterns to handle various formats
    PERIOD_PATTERN = re.compile(
        r'(\d{1,2}[\s/\-]\w{3,}[\s/\-]\d{2,4}|'  # 31-Dec-2024, 31/December/24
        r'\w{3,}\.?\s+\d{1,2},?\s+\d{4}|'  # December 31, 2024
        r'\d{4}[\s/\-]\d{1,2}[\s/\-]\d{1,2}|'  # 2024-12-31
        r'\d{1,2}[\s/\-]\d{1,2}[\s/\-]\d{2,4}|'  # 12/31/2024, 31-12-24
        r'Q[1-4]\s*\d{2,4}|'  # Q1 2024, Q12024
        r'\d{1}Q\s*\d{2,4}|'  # 1Q 2024, 1Q24
        r'FY\s*\d{2,4}|'  # FY 2024, FY24
        r'Fiscal\s+\d{4}|'  # Fiscal 2024
        r'Year\s+Ended)',  # Year Ended
        re.IGNORECASE
    )

    @classmethod
    def extract_table_to_dataframe(cls, table_node: TableNode) -> pd.DataFrame:
        """
        Convert a TableNode to a pandas DataFrame with appropriate data types.
        Args:
            table_node: The TableNode containing financial data
        Returns:
            pd.DataFrame with financial data, periods as columns, line items as index
        """
        try:
            # Get processed table
            processed_table = table_node._processed
            if not processed_table:
                return pd.DataFrame()
            # Extract metadata from headers
            metadata = cls._extract_metadata(table_node, processed_table)
            # Build DataFrame
            df = cls._build_dataframe(processed_table, metadata)
            # Apply data transformations
            df = cls._apply_transformations(df, metadata)
            return df
        except Exception:
            # Return empty DataFrame to allow processing to continue.
            # NOTE(review): comment below says "log" but nothing is logged —
            # consider logging the swallowed exception.
            # Log error but return empty DataFrame to allow processing to continue
            return pd.DataFrame()

    @classmethod
    def _extract_metadata(cls, table_node: TableNode, processed_table: ProcessedTable) -> TableMetadata:
        """Extract metadata from table headers and first few rows"""
        metadata = TableMetadata()
        # Check headers for currency and units
        if processed_table.headers:
            header_text = ' '.join(processed_table.headers)
            # Extract currency
            currency_match = cls.CURRENCY_PATTERN.search(header_text)
            if currency_match:
                metadata.currency = currency_match.group(0)
            # Extract units
            units_match = cls.UNITS_PATTERN.search(header_text)
            if units_match:
                unit_text = units_match.group(0).lower()
                if any(x in unit_text for x in ['thousand', '000s', '000,']):
                    metadata.scaling_factor = 1000
                    metadata.units = 'thousands'
                elif any(x in unit_text for x in ['million', 'mln', 'mil', '000,000']):
                    metadata.scaling_factor = 1000000
                    metadata.units = 'millions'
                elif any(x in unit_text for x in ['billion', 'bn']):
                    metadata.scaling_factor = 1000000000
                    metadata.units = 'billions'
        # Check if periods are durations or instants
        if processed_table.headers:
            period_headers = [h for h in processed_table.headers if cls.PERIOD_PATTERN.search(h)]
            if period_headers:
                # If headers contain "ended" it's likely duration periods
                if any('ended' in h.lower() for h in period_headers):
                    metadata.period_type = 'duration'
                else:
                    metadata.period_type = 'instant'
        return metadata

    @classmethod
    def _build_dataframe(cls, processed_table: ProcessedTable, metadata: TableMetadata) -> pd.DataFrame:
        """Build initial DataFrame from processed table.

        Detects "vertical" tables (label column + data columns, e.g. a Cover
        Page) heuristically and otherwise treats period-looking headers as
        data columns.
        """
        if not processed_table.data_rows:
            return pd.DataFrame()
        # Identify period columns and line item column
        headers = processed_table.headers or []
        period_cols = []
        line_item_col = 0
        # Check if this is a "vertical" table (like Cover Page)
        # where first column is labels and all others are data
        is_vertical_table = False
        if len(headers) >= 2:
            # Check if first column has label-like patterns
            first_header_lower = headers[0].lower() if headers[0] else ''
            first_is_label = any(pattern in first_header_lower for pattern in
                                 ['entity', 'line item', 'information', 'abstract', 'cover page',
                                  'detail', 'description', 'item'])
            # Check if this looks like a cover page or entity info table
            # by examining the first few data rows
            looks_like_entity_info = False
            if processed_table.data_rows and len(processed_table.data_rows) > 2:
                # Check if first column has entity/document field names
                first_col_values = []
                for row in processed_table.data_rows[:10]:  # Check more rows
                    if len(row) > 0 and isinstance(row[0], str):
                        first_col_values.append(row[0].lower())
                # More comprehensive patterns for vertical tables
                entity_patterns = ['entity', 'document', 'registrant', 'address',
                                   'file number', 'incorporation', 'fiscal', 'telephone',
                                   'securities', 'trading', 'exchange', 'ticker']
                # Count how many rows match entity patterns
                pattern_matches = sum(
                    any(pattern in val for pattern in entity_patterns)
                    for val in first_col_values
                )
                # If more than 30% of rows have entity-like labels, it's probably vertical
                looks_like_entity_info = pattern_matches >= len(first_col_values) * 0.3
            is_vertical_table = first_is_label or looks_like_entity_info
        if is_vertical_table:
            # For vertical tables, first column is index, rest are data
            line_item_col = 0
            period_cols = list(range(1, len(headers)))
            # Ensure we don't include the line item column
            if line_item_col in period_cols:
                period_cols.remove(line_item_col)
        else:
            # For standard tables, identify period columns
            for i, header in enumerate(headers):
                if cls.PERIOD_PATTERN.search(header):
                    period_cols.append(i)
                elif i == 0:  # First column is usually line items
                    line_item_col = i
        # Extract data
        data = []
        index = []
        for row in processed_table.data_rows:
            if len(row) > line_item_col:
                line_item = row[line_item_col].strip()
                if line_item and not line_item.isspace():
                    index.append(line_item)
                    row_data = []
                    for col_idx in period_cols:
                        if col_idx < len(row):
                            row_data.append(row[col_idx])
                        else:
                            row_data.append('')
                    data.append(row_data)
        # Create DataFrame
        if data:
            column_names = []
            for i, col_idx in enumerate(period_cols):
                if col_idx < len(headers):
                    # Clean up column name and make unique if needed
                    col_name = headers[col_idx].strip()
                    # If duplicate, append index
                    if col_name in column_names:
                        col_name = f"{col_name}_{i}"
                    column_names.append(col_name)
                else:
                    column_names.append(f'Col_{i}')
            df = pd.DataFrame(data, index=index, columns=column_names)
        else:
            df = pd.DataFrame()
        return df

    @classmethod
    def _apply_transformations(cls, df: pd.DataFrame, metadata: TableMetadata) -> pd.DataFrame:
        """Apply data type conversions and scaling"""
        if df.empty:
            return df
        # Convert numeric columns
        for col in df.columns:
            df[col] = df[col].apply(cls._parse_financial_value)
        # Apply scaling if specified
        if metadata.scaling_factor:
            numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
            df[numeric_cols] = df[numeric_cols] * metadata.scaling_factor
        # Add metadata as attributes
        df.attrs['currency'] = metadata.currency
        df.attrs['units'] = metadata.units
        df.attrs['scaling_factor'] = metadata.scaling_factor
        df.attrs['period_type'] = metadata.period_type
        return df

    @staticmethod
    def _parse_financial_value(value: str) -> Union[float, str]:
        """Parse a financial value string to float or keep as string.

        Empty markers (dashes, N/A, etc.) become 0.0; parenthesized and
        trailing-minus forms become negative; non-numeric text is returned
        unchanged.
        """
        if not isinstance(value, str):
            return value
        # Clean the value
        clean_value = value.strip()
        # Check for special markers and empty values
        # NOTE(review): several markers below appear to have lost their
        # characters in transit (likely em/en dashes) — verify against VCS.
        empty_markers = ['', '-', '', '', '', 'N/A', 'n/a', 'NA', 'nm', 'NM', '*', '**']
        if clean_value in empty_markers or not clean_value:
            return 0.0
        # Remove currency symbols, whitespace, and other common symbols
        # Keep negative sign and decimal points
        clean_value = re.sub(r'[£€¥₹$,\s]', '', clean_value)
        # Handle various negative formats
        if clean_value.startswith('(') and clean_value.endswith(')'):
            clean_value = '-' + clean_value[1:-1]
        elif clean_value.endswith('-'):  # Some companies put negative sign at end
            clean_value = '-' + clean_value[:-1]
        # Handle percentage values (remove % but keep the number)
        clean_value = clean_value.replace('%', '')
        # Try to convert to float
        try:
            return float(clean_value)
        except ValueError:
            # If it contains any digits, try harder to extract them
            if re.search(r'\d', clean_value):
                # Extract just the numeric part
                numeric_match = re.search(r'-?\d+\.?\d*', clean_value)
                if numeric_match:
                    try:
                        return float(numeric_match.group(0))
                    except ValueError:
                        pass
            # Return original if not numeric
            return value
def extract_statement_dataframe(report_content: str) -> pd.DataFrame:
    """
    Extract the first plausible financial table from report HTML content.

    Args:
        report_content: HTML content from a report
    Returns:
        pd.DataFrame containing the financial data (empty if none found)
    """
    document = Document.parse(report_content)
    tables = document.tables
    if not tables:
        return pd.DataFrame()
    for candidate in tables:
        # Tiny tables are usually headers or metadata, not statements
        if candidate.row_count < 3:
            continue
        if not _table_has_financial_data(candidate):
            continue
        frame = FinancialTableExtractor.extract_table_to_dataframe(candidate)
        if not frame.empty:
            return frame
    # No candidate qualified: fall back to the first table
    return FinancialTableExtractor.extract_table_to_dataframe(tables[0])
def _table_has_financial_data(table_node: TableNode) -> bool:
    """Heuristic check: does this table appear to contain financial numbers?"""
    rows = table_node.content
    if not rows:
        return False
    numeric_cells = 0
    cell_total = 0
    number_pattern = re.compile(r'\$?\s*\d+[,.]?\d*')
    # Sample only the first 10 rows; that is enough to classify the table
    for row in rows[:10]:
        for cell in row.cells:
            cell_total += 1
            if isinstance(cell.content, str) and number_pattern.search(cell.content):
                numeric_cells += 1
    # More than 20% numeric-looking cells => treat as a financial table
    return cell_total > 0 and (numeric_cells / cell_total) > 0.2

View File

@@ -0,0 +1,82 @@
import base64
import binascii
import re
__all__ = ['extract_text_between_tags', 'get_content_between_tags', 'strip_tags', 'is_xml', 'decode_uu']
def extract_text_between_tags(content: str, tag: str) -> str:
    """
    Extract the text between the first <tag> ... </tag> pair, line by line.

    :param content: The text content to search through
    :param tag: The tag to extract the content from
    :return: The extracted text between the tags (stripped)
    """
    open_marker = f'<{tag}>'
    close_marker = f'</{tag}>'
    inside = False
    collected = []
    for line in content.splitlines():
        if line.startswith(open_marker):
            inside = True  # start collecting after this line
        elif line.startswith(close_marker):
            break  # end tag found: stop reading
        elif inside:
            collected.append(line)
    return '\n'.join(collected).strip()
def get_content_between_tags(content: str, outer_tag: str = None) -> str:
    """
    Extract content between specified tags, starting from most nested tags.

    Args:
        content: Raw content containing tagged sections
        outer_tag: Optional specific tag to extract from (e.g. 'XBRL', 'TEXT')
    Returns:
        str: Content between the specified tags, or innermost content if no
        tag specified; '' when nothing matches
    """
    if outer_tag:
        candidate_tags = [outer_tag]
    else:
        # Ordered from most nested to least nested
        candidate_tags = ["PDF", "XBRL", "XML", "TEXT"]
    for tag in candidate_tags:
        match = re.search(f'<{tag}>(.*?)</{tag}>', content, re.DOTALL)
        if match:
            return match.group(1).strip()
    return ''
def strip_tags(text: str, start_tag: str, end_tag: str) -> str:
    """Strip a surrounding XML/HTML tag pair from text, if both are present."""
    wrapped = text.startswith(start_tag) and text.endswith(end_tag)
    if not wrapped:
        return text
    inner = text[len(start_tag):-len(end_tag)]
    return inner.strip()
def is_xml(filename: str) -> bool:
    """Check if a file is XML based on the file extension.
    Recognized extensions: .xsd, .xml, .xbrl (case-insensitive)
    """
    xml_extensions = ('.xsd', '.xml', '.xbrl')
    return filename.lower().endswith(xml_extensions)
def decode_uu(uu_content):
    """Decode a uuencoded payload to bytes.

    Replaces a previous hand-rolled decode that remapped UU characters and
    then fed them to base64.b64decode; uuencode and base64 use different
    alphabets and framing, so that round-trip produced incorrect bytes.
    binascii.a2b_uu implements the real per-line UU decode (leading length
    character followed by 4-characters-per-3-bytes groups).

    :param uu_content: Full uuencoded text, including the "begin" line
    :return: The decoded bytes
    :raises binascii.Error: if a data line is malformed
    """
    decoded = bytearray()
    for line in uu_content.split('\n')[1:]:  # Skip "begin" line
        # A blank line, backtick, or "end" marks the end of the data section
        if not line or line.startswith('`') or line.startswith('end'):
            break
        decoded.extend(binascii.a2b_uu(line))
    return bytes(decoded)