Initial commit

This commit is contained in:
kdusek
2025-12-09 12:13:01 +01:00
commit 8e654ed209
13332 changed files with 2695056 additions and 0 deletions

View File

@@ -0,0 +1,4 @@
from edgar.sgml.filing_summary import FilingSummary, Report, Reports, Statements
from edgar.sgml.sgml_common import FilingSGML, iter_documents, list_documents
from edgar.sgml.sgml_header import Filer, FilingHeader, FilingMetadata, Issuer, ReportingOwner
from edgar.sgml.sgml_parser import SGMLDocument

View File

@@ -0,0 +1,666 @@
import re
from dataclasses import dataclass
from enum import Enum
from functools import lru_cache
from typing import Dict, List, Optional, Set, Tuple, Union
import pyarrow as pa
import pyarrow.compute as pc
from bs4 import BeautifulSoup
from rich import box
from rich.console import Group
from rich.panel import Panel
from rich.table import Table
from rich.text import Text
from edgar.core import DataPager, PagingState, log, strtobool
from edgar.files.html import Document
from edgar.richtools import print_rich, repr_rich, rich_to_text
from edgar.xmltools import child_text
__all__ = ['Report', 'Reports', 'File', 'FilingSummary']
class Reports:
    """
    A pageable collection of the reports listed in a filing summary.

    Wraps a pyarrow Table (one row per report) and provides iteration,
    lookup by name/category/position, paging via DataPager, and rich
    console rendering.
    """

    def __init__(self,
                 data: pa.Table,
                 filing_summary: Optional['FilingSummary'] = None,
                 original_state: Optional[PagingState] = None,
                 title: Optional[str] = "Reports"):
        self.data: pa.Table = data
        self.data_pager = DataPager(data)
        self._original_state = original_state or PagingState(0, len(self.data))
        self.n = 0  # cursor for the iterator protocol
        self._filing_summary = filing_summary  # back-reference set by FilingSummary
        self.title = title

    def _report_from_row(self, table: pa.Table, row: int) -> 'Report':
        """Build a Report from a single row of a reports table."""
        return Report(
            instance=table['instance'][row].as_py(),
            is_default=table['IsDefault'][row].as_py(),
            has_embedded_reports=table['HasEmbeddedReports'][row].as_py(),
            html_file_name=table['HtmlFileName'][row].as_py(),
            long_name=table['LongName'][row].as_py(),
            report_type=table['ReportType'][row].as_py(),
            role=table['Role'][row].as_py(),
            parent_role=table['ParentRole'][row].as_py(),
            short_name=table['ShortName'][row].as_py(),
            menu_category=table['MenuCategory'][row].as_py(),
            position=table['Position'][row].as_py(),
            reports=self
        )

    def __len__(self):
        return len(self.data)

    def __iter__(self):
        self.n = 0
        return self

    def __next__(self):
        if self.n >= len(self.data):
            raise StopIteration
        report = self._report_from_row(self.data, self.n)
        self.n += 1
        return report

    def current(self):
        """Display the current page ... which is the default for this filings object"""
        return self

    def next(self):
        """Show the next page, or None when already at the end of the data."""
        data_page = self.data_pager.next()
        if data_page is None:
            log.warning("End of data .. use prev() \u2190 ")
            return None
        start_index, _ = self.data_pager._current_range
        paging_state = PagingState(page_start=start_index, num_records=len(self))
        # Fix: carry the filing summary link and the title over to the new page
        # so reports from a paged collection can still resolve their content.
        return Reports(data_page,
                       filing_summary=self._filing_summary,
                       original_state=paging_state,
                       title=self.title)

    def previous(self):
        """Show the previous page, or None when already at the start."""
        data_page = self.data_pager.previous()
        if data_page is None:
            log.warning(" No previous data .. use next() \u2192 ")
            return None
        start_index, _ = self.data_pager._current_range
        paging_state = PagingState(page_start=start_index, num_records=len(self))
        # Fix: carry the filing summary link and the title over to the new page
        return Reports(data_page,
                       filing_summary=self._filing_summary,
                       original_state=paging_state,
                       title=self.title)

    def to_pandas(self):
        """Convert the underlying pyarrow table to a pandas DataFrame."""
        return self.data.to_pandas()

    def __getitem__(self, item):
        # Lookup is by the report's Position value, not by row index.
        # Returns None when no report has that position.
        record = self.filter("Position", str(item))
        if record:
            return record

    def create_from_record(self, data: pa.Table):
        """Build a Report from the first row of the given table."""
        return self._report_from_row(data, 0)

    @property
    def long_names(self) -> List[str]:
        """All LongName values in this collection."""
        return self.data['LongName'].to_pylist()

    @property
    def short_names(self) -> List[str]:
        """All ShortName values in this collection."""
        return self.data['ShortName'].to_pylist()

    def get_by_category(self, category: str):
        """
        Get the reports whose MenuCategory equals `category`.
        """
        data = self.data.filter(pc.equal(self.data['MenuCategory'], category))
        return Reports(data, filing_summary=self._filing_summary, title=category)

    @property
    def statements(self) -> Optional['Statements']:
        """
        Get all reports in the Statements category, or None when there are none.
        """
        reports = self.get_by_category('Statements')
        if reports:
            return Statements(reports)

    def get_by_filename(self, file_name: str):
        """
        Get a single report by file name. Returns None unless exactly one matches.
        """
        data = self.data.filter(pc.equal(self.data['HtmlFileName'], file_name))
        if len(data) == 1:
            return self.create_from_record(data)

    def get_by_short_name(self, short_name: str):
        """
        Get a single report by short name. Returns None unless exactly one matches.
        """
        data = self.data.filter(pc.equal(self.data['ShortName'], short_name))
        if len(data) == 1:
            return self.create_from_record(data)

    def filter(self, column: Union[str, List[str]], value: Union[str, List[str]]):
        """
        Filter reports where each named column's value is in the value set.

        Returns a single Report when exactly one row matches, otherwise
        a new Reports collection.
        """
        columns = [column] if isinstance(column, str) else column
        values = [value] if isinstance(value, str) else value
        # Convert the value list to a pyarrow array for proper comparison
        value_set = pa.array(values)
        # AND together an is_in mask per requested column
        mask = pc.is_in(self.data[columns[0]], value_set)
        for col in columns[1:]:
            mask = pc.and_(mask, pc.is_in(self.data[col], value_set))
        data = self.data.filter(mask)
        if len(data) == 1:
            return self.create_from_record(data)
        # Fix: keep the filing summary link so reports resolved from the
        # filtered collection can still load their content.
        return Reports(data, filing_summary=self._filing_summary)

    def __rich__(self):
        table = Table(
            show_header=True,
            header_style="dim",
            show_lines=True,
            box=box.SIMPLE,
            border_style="bold grey54",
            row_styles=["", "bold"]
        )
        table.add_column("#", style="dim", justify="left")
        table.add_column("Report", style="bold", width=60)
        table.add_column("Category", width=12)
        table.add_column("File", justify="left")
        # Iterate through rows in the current page
        for i in range(len(self)):
            position = self.data['Position'][i].as_py()
            table.add_row(
                str(position) if position else "-",
                self.data['ShortName'][i].as_py(),
                self.data['MenuCategory'][i].as_py() or "",
                self.data['HtmlFileName'][i].as_py() or ""
            )
        return Panel(table, title=self.title, expand=False)

    def __repr__(self):
        return repr_rich(self.__rich__())
class Report:
    """
    A single report (statement, note, or detail page) from a filing summary.

    A Report is usually attached to its owning Reports collection, which in
    turn links back to the FilingSummary and its FilingSGML so the report's
    HTML content can be resolved on demand.
    """

    def __init__(self,
                 instance: Optional[str],
                 is_default: Optional[bool],
                 has_embedded_reports: Optional[bool],
                 long_name: Optional[str],
                 short_name: Optional[str],
                 menu_category: Optional[str],
                 position: Optional[int],
                 html_file_name: Optional[str],
                 report_type: Optional[str],
                 role: Optional[str],
                 parent_role: Optional[str] = None,
                 reports=None):
        self.instance = instance
        self.is_default = is_default
        self.has_embedded_reports = has_embedded_reports
        self.long_name = long_name
        self.short_name = short_name
        self.menu_category = menu_category
        self.position = position
        self.html_file_name = html_file_name
        self.report_type = report_type
        self.role = role
        self.parent_role = parent_role
        self._reports = reports  # back-reference to the owning Reports collection
        # Per-instance cache for _get_report_table(); a separate flag lets
        # "parsed, but no table found" be cached too.
        self._table_cache = None
        self._table_cached = False

    @property
    def content(self):
        """
        The raw content of the report document, or None when this report is
        not attached to a filing's SGML.
        """
        # Guard the back-reference chain instead of raising AttributeError
        # for detached Report instances.
        if self._reports is None or self._reports._filing_summary is None:
            return None
        sgml = self._reports._filing_summary._filing_sgml
        if sgml:
            return sgml.get_content(self.html_file_name)

    def text(self):
        """
        Get the text content of the report's first table, or None.
        """
        table = self._get_report_table()
        if table:
            return rich_to_text(table.render(500))

    def _get_report_table(self):
        """
        Get the first table in the document, caching the parse per instance.

        Note: the previous @lru_cache keyed on `self` and therefore kept every
        Report alive for the lifetime of the cache (flake8-bugbear B019); a
        plain instance attribute avoids that.
        """
        if not self._table_cached:
            self._table_cached = True
            document = Document.parse(self.content)
            if len(document.tables) == 0:
                log.warning(f"No tables found in {self.html_file_name}")
            else:
                self._table_cache = document.tables[0]
        return self._table_cache

    def view(self):
        """Print the report's first table to the console."""
        table = self._get_report_table()
        if table:
            print_rich(table.render(500))

    def to_dataframe(self):
        """
        Extract the report's financial table as a pandas DataFrame.

        Returns:
            pd.DataFrame: Financial data with periods as columns and line items as index.
                Returns empty DataFrame if no tables found.

        The DataFrame includes metadata attributes:
            - currency: The currency used (e.g., 'USD')
            - units: The units description (e.g., 'thousands')
            - scaling_factor: Numeric scaling factor (e.g., 1000 for thousands)
            - period_type: 'instant' or 'duration' for the time periods
        """
        from edgar.sgml.table_to_dataframe import extract_statement_dataframe
        content = self.content
        if content:
            return extract_statement_dataframe(content)
        import pandas as pd
        return pd.DataFrame()

    def __str__(self):
        return f"Report(short_name={self.short_name}, category={self.menu_category}, file_name={self.html_file_name})"

    def __rich__(self):
        # Guard None fields: rich Text rejects non-str values.
        return Panel(
            Text.assemble(("Report ", "dim"), (self.long_name or "", "bold")),
            subtitle=Text(self.menu_category or "", style='dim italic'),
            expand=False,
            width=400,
            height=4
        )

    def __repr__(self):
        return repr_rich(self.__rich__())
@dataclass
class File:
    """A single input or supplemental file listed in FilingSummary.xml."""
    # Name of the file (text of the <File> element)
    file_name: str
    # Declared document type from the "doctype" attribute, if present
    doc_type: Optional[str]
    # Parsed from the "isDefinitelyFs" attribute (financial statement flag)
    is_definitely_fs: Optional[bool]
    # Parsed from the "isUsgaap" attribute
    is_usgaap: Optional[bool]
    # Value of the "original" attribute, if present
    original: Optional[str]
class FilingSummary:
    """
    Parsed representation of EDGAR's FilingSummary.xml: the filing's reports,
    input/supplemental files, and summary statistics.
    """

    def __init__(self,
                 reports: Reports,
                 short_name_map: Dict[str, Report],
                 category_map: Dict[str, List[Report]],
                 input_files: List[File],
                 supplemental_files: List[File],
                 report_format: Optional[str] = None,
                 context_count: Optional[int] = None,
                 element_count: Optional[int] = None,
                 entity_count: Optional[int] = None,
                 footnotes_reported: Optional[bool] = None,
                 segment_count: Optional[int] = None,
                 scenario_count: Optional[int] = None,
                 tuples_reported: Optional[bool] = None,
                 has_presentation_linkbase: Optional[bool] = None,
                 has_calculation_linkbase: Optional[bool] = None):
        self.reports: Reports = reports
        # Link the reports collection back to this summary
        self.reports._filing_summary = self
        self._short_name_map = short_name_map
        self._category_map = category_map
        self.input_files = input_files
        self.supplemental_files = supplemental_files
        self.report_format = report_format
        self.context_count = context_count
        self.element_count = element_count
        self.entity_count = entity_count
        self.footnotes_reported = footnotes_reported
        self.segment_count = segment_count
        self.scenario_count = scenario_count
        self.tuples_reported = tuples_reported
        self.has_presentation_linkbase = has_presentation_linkbase
        self.has_calculation_linkbase = has_calculation_linkbase
        # Set by FilingSGML after parsing so reports can resolve their content
        self._filing_sgml = None

    @staticmethod
    def _parse_files(container_tag) -> List[File]:
        """Parse the <File> children of an <InputFiles>/<SupplementalFiles> tag."""
        files: List[File] = []
        if container_tag:
            for file_tag in container_tag.find_all('File'):
                files.append(File(
                    file_name=file_tag.text,
                    doc_type=file_tag.get('doctype'),
                    is_definitely_fs=strtobool(file_tag.get('isDefinitelyFs')),
                    is_usgaap=strtobool(file_tag.get('isUsgaap')),
                    original=file_tag.get('original')
                ))
        return files

    @classmethod
    def parse(cls, xml_text: str):
        """Parse the text of FilingSummary.xml into a FilingSummary instance."""
        soup = BeautifulSoup(xml_text, 'xml')
        root = soup.find('FilingSummary')
        # Main fields
        report_format = child_text(root, 'ReportFormat')
        context_count = child_text(root, 'ContextCount')
        element_count = child_text(root, 'ElementCount')
        entity_count = child_text(root, 'EntityCount')
        footnotes_reported = strtobool(child_text(root, 'FootnotesReported'))
        segment_count = child_text(root, 'SegmentCount')
        scenario_count = child_text(root, 'ScenarioCount')
        tuples_reported = strtobool(child_text(root, 'TuplesReported'))
        has_presentation_linkbase = strtobool(child_text(root, 'HasPresentationLinkbase'))
        has_calculation_linkbase = strtobool(child_text(root, 'HasCalculationLinkbase'))
        # Reports: extract each field from the XML once and reuse the record
        # for both the pyarrow table and the Report object (previously every
        # field was parsed from the tag twice).
        reports: List[Report] = []
        short_name_map: Dict[str, Report] = {}
        category_map: Dict[str, List[Report]] = {}
        report_records = []
        for report_tag in root.find_all("Report"):
            record = {
                'instance': report_tag.get('instance'),
                'IsDefault': strtobool(child_text(report_tag, 'IsDefault')),
                'HasEmbeddedReports': strtobool(child_text(report_tag, 'HasEmbeddedReports')),
                'HtmlFileName': child_text(report_tag, 'HtmlFileName'),
                'LongName': child_text(report_tag, 'LongName'),
                'ReportType': child_text(report_tag, 'ReportType'),
                'Role': child_text(report_tag, 'Role'),
                'ParentRole': child_text(report_tag, 'ParentRole'),
                'ShortName': child_text(report_tag, 'ShortName'),
                'MenuCategory': child_text(report_tag, 'MenuCategory'),
                'Position': child_text(report_tag, 'Position')
            }
            report = Report(
                instance=record['instance'],
                is_default=record['IsDefault'],
                has_embedded_reports=record['HasEmbeddedReports'],
                html_file_name=record['HtmlFileName'],
                long_name=record['LongName'],
                report_type=record['ReportType'],
                role=record['Role'],
                parent_role=record['ParentRole'],
                short_name=record['ShortName'],
                menu_category=record['MenuCategory'],
                position=record['Position']
            )
            reports.append(report)
            report_records.append(record)
            short_name_map[report.short_name] = report
            category_map.setdefault(report.menu_category, []).append(report)
        # Reports Data
        reports_obj = Reports(data=pa.Table.from_pylist(report_records))
        # Input and supplemental files share the same <File> structure
        input_files = cls._parse_files(root.find('InputFiles'))
        supplemental_files = cls._parse_files(root.find('SupplementalFiles'))
        return cls(report_format=report_format,
                   short_name_map=short_name_map,
                   category_map=category_map,
                   context_count=context_count,
                   element_count=element_count,
                   entity_count=entity_count,
                   footnotes_reported=footnotes_reported,
                   segment_count=segment_count,
                   scenario_count=scenario_count,
                   tuples_reported=tuples_reported,
                   has_presentation_linkbase=has_presentation_linkbase,
                   has_calculation_linkbase=has_calculation_linkbase,
                   reports=reports_obj,
                   input_files=input_files,
                   supplemental_files=supplemental_files)

    def get_report_by_short_name(self, short_name: str) -> Optional[Report]:
        """Get a single report by its short name."""
        return self.reports.get_by_short_name(short_name)

    def get_reports_by_category(self, category: str) -> Reports:
        """Get the reports in the given menu category."""
        return self.reports.get_by_category(category)

    def get_reports_by_filename(self, file_name: str) -> Optional[Report]:
        """Get a single report by its HTML file name."""
        return self.reports.get_by_filename(file_name)

    @property
    def statements(self):
        """The detected financial statements (reports in the 'Statements' category)."""
        reports = self.get_reports_by_category('Statements')
        return Statements(reports)

    @property
    def tables(self):
        """The reports in the 'Tables' category."""
        return self.get_reports_by_category('Tables')

    def __str__(self):
        return f"FilingSummary(report_format={self.report_format})"

    def __rich__(self):
        renderables = [self.reports]
        return Panel(
            Group(*renderables),
            box=box.ROUNDED,
            title="Filing Summary"
        )

    def __repr__(self):
        return repr_rich(self.__rich__())
class StatementType(Enum):
    """Canonical financial-statement categories recognized by StatementMapper."""
    INCOME = "income"
    BALANCE = "balance"
    CASH_FLOW = "cash_flow"
    COMPREHENSIVE_INCOME = "comprehensive_income"
    EQUITY = "equity"
class StatementMapper:
    """
    Maps free-form statement titles to StatementType values using weighted
    regex patterns (weight 3 = strong direct match, 1 = weak hint).
    """

    def __init__(self):
        # Weighted pattern matchers for each statement type
        self.patterns = {
            StatementType.INCOME: [
                # Strong direct match, excluding combined "and comprehensive" titles
                (r'(?i)statement.*of.*(?:operation|income|earning)s?(?!\s+and\s+comprehensive)', 3),
                (r'(?i)(?:operation|income|earning)s?\s+statement', 2),  # alternative format
                (r'(?i)profit.*loss', 1),  # P&L reference
            ],
            StatementType.BALANCE: [
                (r'(?i)balance\s*sheet', 3),  # very consistent naming
                (r'(?i)statement.*of.*financial\s+position', 2),  # alternative format
            ],
            StatementType.CASH_FLOW: [
                (r'(?i)statement.*of.*cash\s*flows?', 3),  # primary pattern
                (r'(?i)cash\s*flows?\s*statement', 2),  # alternative format
            ],
            StatementType.COMPREHENSIVE_INCOME: [
                (r'(?i)statement.*of.*comprehensive\s*(?:income|loss)', 3),  # primary pattern
                (r'(?i)comprehensive\s*(?:income|loss)\s*statement', 2),  # alternative format
            ],
            StatementType.EQUITY: [
                (r'(?i)statement.*of.*(?:stockholders|shareholders|owners)[\'\s]*equity', 3),
                (r'(?i)statement.*of.*changes\s+in\s+(?:stockholders|shareholders|owners)[\'\s]*equity', 3),
                (r'(?i)statement.*of.*equity', 2),  # generic equity
            ]
        }
        # Titles that denote two statement types at once
        self.combined_patterns = [
            (r'(?i)statement.*of.*operations?\s+and\s+comprehensive\s*(?:income|loss)',
             {StatementType.INCOME, StatementType.COMPREHENSIVE_INCOME}),
        ]

    def normalize_statement(self, statement: str) -> str:
        """Normalize a title: trim, uppercase and drop CONSOLIDATED/COMBINED prefixes."""
        normalized = statement.strip().upper()
        for prefix in ('CONSOLIDATED', 'COMBINED'):
            normalized = normalized.removeprefix(prefix).strip()
        return normalized

    def match_statement(self, statement: str) -> Dict[StatementType, float]:
        """
        Score a statement title against every statement type.

        Returns {StatementType: confidence in 0..1}. Combined-statement titles
        short-circuit with a score of 1.0 for each constituent type.
        """
        normalized = self.normalize_statement(statement)
        # Combined statements take precedence
        for pattern, types in self.combined_patterns:
            if re.search(pattern, normalized):
                return {stmt_type: 1.0 for stmt_type in types}
        scores: Dict[StatementType, float] = {}
        for stmt_type, weighted_patterns in self.patterns.items():
            hits = [weight for pattern, weight in weighted_patterns
                    if re.search(pattern, normalized)]
            if hits:
                # Normalize the best pattern weight (max 3) into the 0-1 range
                scores[stmt_type] = max(hits) / 3.0
        return scores

    def classify_statement(self, statement: str, threshold: float = 0.5) -> Set[StatementType]:
        """Return the statement types whose score reaches the threshold."""
        return {stmt_type
                for stmt_type, score in self.match_statement(statement).items()
                if score >= threshold}

    def get_best_matches(self, statements: List[str]) -> Dict[StatementType, str]:
        """
        For each statement type, the best-scoring title among `statements`.
        """
        best: Dict[StatementType, Tuple[str, float]] = {}
        for title in statements:
            for stmt_type, score in self.match_statement(title).items():
                current = best.get(stmt_type)
                if current is None or score > current[1]:
                    best[stmt_type] = (title, score)
        return {stmt_type: title for stmt_type, (title, _) in best.items()}
class Statements:
    """
    The financial statements detected among a filing summary's
    'Statements' reports.

    Each report title is scored with StatementMapper, and the best-scoring
    title per statement type is remembered for later lookup.
    """

    def __init__(self, statement_reports: Reports):
        self._reports = statement_reports
        self.statements = [report.short_name for report in self._reports]
        self.mapper = StatementMapper()
        self._matches: Dict[StatementType, Tuple[str, float]] = {}
        self._initialize_matches()

    def _initialize_matches(self) -> None:
        """Remember, for every statement type, the best-scoring title."""
        for title in self.statements:
            for stmt_type, score in self.mapper.match_statement(title).items():
                best = self._matches.get(stmt_type)
                if best is None or score > best[1]:
                    self._matches[stmt_type] = (title, score)

    def _get_statement(self, stmt_type: StatementType, threshold: float = 0.5) -> Optional[Report]:
        """Resolve a statement type to its Report when the match clears the threshold."""
        match = self._matches.get(stmt_type)
        if match is None:
            return None
        title, score = match
        if score < threshold:
            return None
        return self._reports.get_by_short_name(title)

    def __getitem__(self, item):
        return self._reports[item]

    @property
    def balance_sheet(self) -> Optional[Report]:
        """Returns the detected balance sheet statement."""
        return self._get_statement(StatementType.BALANCE)

    @property
    def income_statement(self) -> Optional[Report]:
        """Returns the detected income statement."""
        return self._get_statement(StatementType.INCOME)

    @property
    def cash_flow_statement(self) -> Optional[Report]:
        """Returns the detected cash flow statement."""
        return self._get_statement(StatementType.CASH_FLOW)

    @property
    def comprehensive_income_statement(self) -> Optional[Report]:
        """Returns the detected comprehensive income statement."""
        return self._get_statement(StatementType.COMPREHENSIVE_INCOME)

    @property
    def equity_statement(self) -> Optional[Report]:
        """Returns the detected equity statement."""
        return self._get_statement(StatementType.EQUITY)

    @property
    def detected_statements(self) -> Dict[StatementType, str]:
        """Statement titles whose best match scored at or above 0.5."""
        detected: Dict[StatementType, str] = {}
        for stmt_type, (title, score) in self._matches.items():
            if score >= 0.5:
                detected[stmt_type] = title
        return detected

    def __rich__(self):
        return self._reports

    def __repr__(self):
        return repr_rich(self.__rich__())

View File

@@ -0,0 +1,482 @@
import re
import zipfile
from collections import defaultdict
from functools import cached_property
from pathlib import Path
from typing import TYPE_CHECKING, DefaultDict, Dict, Iterator, List, Optional, Tuple, Union
if TYPE_CHECKING:
from edgar._filings import Filing
from edgar.attachments import Attachment, Attachments, get_document_type
from edgar.httprequests import stream_with_retry
from edgar.sgml.filing_summary import FilingSummary
from edgar.sgml.sgml_header import FilingHeader
from edgar.sgml.sgml_parser import SGMLDocument, SGMLFormatType, SGMLParser
from edgar.sgml.tools import is_xml
__all__ = ['iter_documents', 'list_documents', 'FilingSGML', 'FilingHeader']
def parse_document(document_str: str) -> SGMLDocument:
    """
    Parse a single SGML document section, keeping the raw content intact.

    The TYPE/SEQUENCE/FILENAME/DESCRIPTION fields are each extracted with the
    same one-line pattern; missing fields become empty strings.
    """
    def field(tag: str) -> str:
        # Value runs to the next '<' or end of line; absent tags yield ""
        match = re.search(rf'<{tag}>([^<\n]+)', document_str)
        return match.group(1).strip() if match else ""

    return SGMLDocument(
        type=field('TYPE'),
        sequence=field('SEQUENCE'),
        filename=field('FILENAME'),
        description=field('DESCRIPTION'),
        raw_content=document_str
    )
def read_content(source: Union[str, Path]) -> Iterator[str]:
    """
    Read content from either a URL or file path, yielding lines as strings.
    Automatically handles gzip-compressed files with .gz extension.

    Args:
        source: Either a URL string or a file path

    Yields:
        str: Lines of content from the source

    Raises:
        TooManyRequestsError: If the server returns a 429 response
        FileNotFoundError: If the file path doesn't exist
        gzip.BadGzipFile: If the file is not a valid gzip file
    """
    if isinstance(source, str) and (source.startswith('http://') or source.startswith('https://')):
        # Handle URL using stream_with_retry
        for response in stream_with_retry(source):
            # NOTE(review): iter_lines() may yield bytes depending on
            # stream_with_retry's response type; `line + "\n"` assumes str,
            # while read_content_as_string downstream decodes bytes — confirm
            # which actually applies here.
            for line in response.iter_lines():
                if line is not None:
                    yield line + "\n"
    else:
        # Handle file path
        path = Path(source)
        # Check if the file is gzip-compressed (by extension)
        if str(path).endswith('.gz'):
            import gzip
            with gzip.open(path, 'rt', encoding='utf-8', errors='replace') as file:
                yield from file
        else:
            # Regular file handling
            with path.open('r', encoding='utf-8', errors='replace') as file:
                yield from file
def read_content_as_string(source: Union[str, Path]) -> str:
    """
    Read the full content of a URL or file path into a single string.

    Delegates to read_content() and decodes any bytes chunks as UTF-8
    (invalid sequences replaced).

    Args:
        source: Either a URL string or a file path

    Returns:
        str: Full content as string

    Raises:
        TooManyRequestsError: If the server returns a 429 response
        FileNotFoundError: If file path doesn't exist
    """
    decoded = (
        chunk.decode('utf-8', errors='replace') if isinstance(chunk, bytes) else chunk
        for chunk in read_content(source)
    )
    return ''.join(decoded)
def iter_documents(source: Union[str, Path]) -> Iterator[SGMLDocument]:
    """
    Stream SGML documents from either a URL or file path, yielding parsed documents.

    Args:
        source: Either a URL string or a file path (string or Path object)

    Yields:
        SGMLDocument objects containing the parsed content

    Raises:
        ValueError: If the source is invalid
        ConnectionError: If URL retrieval fails after retries
        FileNotFoundError: If the file path doesn't exist
    """
    document_re = re.compile(r'<DOCUMENT>([\s\S]*?)</DOCUMENT>')
    try:
        full_text = ''.join(read_content(source))
        for match in document_re.finditer(full_text):
            parsed = parse_document(match.group(1))
            if parsed:
                yield parsed
    except (ValueError, ConnectionError, FileNotFoundError) as e:
        # Re-raise the same exception type with the source in the message
        raise type(e)(f"Error processing source {source}: {str(e)}") from e
def list_documents(source: Union[str, Path]) -> list[SGMLDocument]:
    """
    Parse every SGML document from a source and return them as a list.

    Args:
        source: Either a URL string or a file path

    Returns:
        List of SGMLDocument objects
    """
    return [*iter_documents(source)]
def parse_file(source: Union[str, Path]) -> list[SGMLDocument]:
    """
    Parse all documents from a source into a list.

    Kept as a backward-compatible alias of list_documents(); the two
    functions previously carried identical duplicate implementations.

    Args:
        source: Either a URL string or a file path

    Returns:
        List of SGMLDocument objects
    """
    return list_documents(source)
def parse_submission_text(content: str) -> Tuple[FilingHeader, DefaultDict[str, List[SGMLDocument]]]:
    """
    Parses the raw submission text and returns the filing header along with
    a dictionary mapping document sequence numbers to lists of SGMLDocument objects.

    Args:
        content (str): The raw text content of the submission.

    Returns:
        Tuple[FilingHeader, DefaultDict[str, List[SGMLDocument]]]:
            A tuple where the first element is the FilingHeader object representing
            the parsed header information, and the second element is a defaultdict
            mapping document sequence identifiers to their corresponding list of SGMLDocument objects.

    Details:
        - For submissions with the SGMLFormatType.SUBMISSION format, the function uses
          the pre-parsed filer data to create the FilingHeader.
        - For SEC-DOCUMENT formatted content, the header is initially parsed from the SGML text;
          if this fails, the header is parsed again with preprocessing enabled.
        - The function creates an SGMLDocument for each parsed document and groups them by
          their sequence identifier.

    Raises:
        Exception: Any exceptions raised during header parsing (handled internally
            by attempting to preprocess the header in case of failure).
    """
    # Create parser and get structure including header and documents
    parser = SGMLParser()
    parsed_data = parser.parse(content)
    # Create FilingHeader using already parsed data
    if parsed_data['format'] == SGMLFormatType.SUBMISSION:
        # For submission format, we already have parsed filer data
        header = FilingHeader.parse_submission_format_header(parsed_data=parsed_data)
    else:
        # For SEC-DOCUMENT format, pass the header text to the
        # specialized header parser since we need additional processing
        try:
            header = FilingHeader.parse_from_sgml_text(parsed_data['header'])
        except Exception:
            # Fallback: retry with preprocessing enabled for malformed headers
            header = FilingHeader.parse_from_sgml_text(parsed_data['header'], preprocess=True)
    # Group documents by their sequence identifier
    documents = defaultdict(list)
    for doc_data in parsed_data['documents']:
        doc = SGMLDocument.from_parsed_data(doc_data)
        documents[doc.sequence].append(doc)
    return header, documents
class FilingSGML:
"""
Main class that parses and provides access to both the header and documents
from an SGML filing.
"""
__slots__ = ('header', '_documents_by_sequence', '__dict__') # Use slots for memory efficiency
def __init__(self, header: FilingHeader, documents: defaultdict[str, List[SGMLDocument]]):
    """
    Initialize FilingSGML with parsed header and documents.

    Args:
        header (FilingHeader): Parsed header information
        documents (defaultdict[str, List[SGMLDocument]]): Parsed documents keyed by sequence
    """
    self.header: FilingHeader = header
    self._documents_by_sequence: defaultdict[str, List[SGMLDocument]] = documents
    # Secondary index by filename; if two documents share a filename the
    # later one wins.
    self._documents_by_name: Dict[str, SGMLDocument] = {
        doc.filename: doc for doc_lst in documents.values() for doc in doc_lst
    }
@property
def accession_number(self):
    # EDGAR accession number taken from the parsed header
    return self.header.accession_number

@property
def cik(self):
    # Central Index Key taken from the parsed header
    return self.header.cik

@cached_property
def entity(self):
    # Resolve the CIK to an Entity; import is local to avoid a circular import.
    # Returns None when the header has no CIK.
    from edgar.entity import Entity
    cik = self.cik
    if cik:
        return Entity(cik)

@property
def form(self):
    # Form type declared in the header
    return self.header.form

@property
def filing_date(self):
    # Filing date declared in the header
    return self.header.filing_date

@property
def date_as_of_change(self):
    # "Date as of change" value from the header
    return self.header.date_as_of_change

@property
def period_of_report(self):
    # Reporting period declared in the header
    return self.header.period_of_report

@property
def effective_date(self):
    # Read from the header's metadata mapping; None when absent
    return self.header.filing_metadata.get('EFFECTIVE DATE')
@property
def path(self):
    """
    The root EDGAR archive path of the filing, derived from the CIK and
    the accession number (dashes removed). Falls back to a placeholder
    when no accession number is available.
    """
    accession = self.accession_number
    if not accession:
        return "/<SGML FILE>"
    return f"/Archives/edgar/data/{self.header.cik}/{accession.replace('-', '')}"
def _primary_document_text(self, attachment) -> Optional[str]:
    """
    Decode a primary attachment's content to str, or None when the
    attachment is missing, binary, or empty. Shared by html() and xml(),
    which previously duplicated this logic.
    """
    if attachment and not attachment.is_binary() and not attachment.empty:
        text = self.get_content(attachment.document)
        if isinstance(text, bytes):
            text = text.decode('utf-8')
        return text
    return None

def html(self):
    """Return the text of the primary HTML document, or None if unavailable."""
    return self._primary_document_text(self.attachments.primary_html_document)

def xml(self):
    """Return the text of the primary XML document, or None if unavailable."""
    return self._primary_document_text(self.attachments.primary_xml_document)
def get_content(self, filename: str) -> Optional[str]:
    """
    Return the content of the named document, or None when no document
    with that filename exists in the filing.
    """
    document = self._documents_by_name.get(filename)
    return document.content if document else None
@cached_property
def attachments(self) -> Attachments:
    """
    Get all attachments from the filing.

    Documents in sequence "1" are recorded as primary documents (and also
    listed among document files). Later documents are split between
    document files and data files.
    """
    # NOTE(review): is_datafile is sticky — once an XML document has been
    # seen, every subsequent non-sequence-1 document is routed to the data
    # files bucket. This appears to rely on document ordering within the
    # SGML (data files trailing the documents); confirm that assumption.
    is_datafile = False
    documents, datafiles, primary_files = [], [], []
    # Get the filing summary (may be None when FilingSummary.xml is absent)
    filing_summary = self.filing_summary
    for sequence, document_lst in self._documents_by_sequence.items():
        for document in document_lst:
            attachment = Attachment(
                sequence_number=sequence,
                ixbrl=False,
                path=f"{self.path}/{document.filename}",
                document=document.filename,
                document_type=get_document_type(filename=document.filename, declared_document_type=document.type),
                description=document.description,
                size=None,
                sgml_document=document,
                filing_sgml=self
            )
            # Enrich with the report's short name from the filing summary if available
            if filing_summary:
                report = filing_summary.get_reports_by_filename(document.filename)
                if report:
                    attachment.purpose = report.short_name
            # Sequence "1" marks the primary document
            if sequence == "1":
                primary_files.append(attachment)
                documents.append(attachment)
            else:
                if not is_datafile:
                    is_datafile = is_xml(filename=document.filename)
                if is_datafile:
                    datafiles.append(attachment)
                else:
                    documents.append(attachment)
    return Attachments(document_files=documents, data_files=datafiles, primary_documents=primary_files, sgml=self)
@cached_property
def filing_summary(self):
    """
    Parse and cache FilingSummary.xml when present in this filing;
    returns None when the filing has no FilingSummary.xml document.
    """
    summary_document = self._documents_by_name.get("FilingSummary.xml")
    if summary_document is None:
        return None
    summary = FilingSummary.parse(summary_document.content)
    # Wire up the back-references so reports can resolve their content
    summary.reports._filing_summary = summary
    summary._filing_sgml = self
    return summary
def download(self, path: Union[str, Path], archive: bool = False):
    """
    Download all the attachments to a specified path.

    If the path is a directory, each document is saved with its original
    name inside that directory. If archive is True, the path must name a
    zip file and all documents are written into that archive.

    Args:
        path: str or Path - The destination directory (or zip file name
            when archive=True)
        archive: bool (default False) - If True, save the attachments in a zip file

    Raises:
        ValueError: If path is a directory when archive=True, or is not a
            directory when archive=False
    """
    # Fix: the docstring promises str or Path, but `is_dir()` would raise
    # AttributeError on a plain string — coerce up front.
    path = Path(path)
    if archive:
        if path.is_dir():
            raise ValueError("Path must be a zip file name to create zipfile")
        with zipfile.ZipFile(path, 'w') as zipf:
            # writestr accepts both str and bytes content
            for document in self._documents_by_name.values():
                zipf.writestr(document.filename, document.content)
    else:
        if not path.is_dir():
            raise ValueError("Path must be a directory")
        for document in self._documents_by_name.values():
            file_path = path / document.filename
            content = document.content
            if isinstance(content, bytes):
                file_path.write_bytes(content)
            else:
                file_path.write_text(content, encoding='utf-8')
@property
def primary_documents(self):
    """
    Get the primary documents from the filing.

    Delegates to the attachments collection (documents with sequence "1").
    """
    return self.attachments.primary_documents
@classmethod
def from_source(cls, source: Union[str, Path]) -> "FilingSGML":
    """
    Create FilingSGML instance from either a URL or file path.
    Parses both header and documents.

    Args:
        source: Either a URL string or a file path

    Returns:
        FilingSGML: New instance with parsed header and documents

    Raises:
        ValueError: If header section cannot be found
        IOError: If file cannot be read
    """
    # Read content once so header and documents are parsed from the same text
    content = read_content_as_string(source)
    # Parse header and documents
    header, documents = parse_submission_text(content)
    # Create FilingSGML instance
    return cls(header=header, documents=documents)
@classmethod
def from_text(cls, full_text_submission: str) -> "FilingSGML":
    """
    Build a FilingSGML from a full text submission string.

    Args:
        full_text_submission: String containing full text submission
    Returns:
        FilingSGML: New instance with parsed header and documents
    Raises:
        ValueError: If header section cannot be found
    """
    parsed_header, parsed_documents = parse_submission_text(full_text_submission)
    return cls(header=parsed_header, documents=parsed_documents)
def get_document_by_sequence(self, sequence: str) -> Optional[SGMLDocument]:
    """
    Look up a document by its sequence number (O(1) dict access).

    Returns the first document for that sequence, or None when the
    sequence is unknown or has no documents.
    """
    matches = self._documents_by_sequence.get(sequence)
    return matches[0] if matches else None
def get_document_by_name(self, filename: str) -> Optional[SGMLDocument]:
    """
    Look up a document by filename (O(1) dict access); None if absent.
    """
    by_name = self._documents_by_name
    return by_name.get(filename)
@classmethod
def from_filing(cls, filing: 'Filing') -> 'FilingSGML':
    """Create from a Filing object that provides text_url.

    After parsing, backfills header metadata (accession number,
    zero-padded CIK, form type) from the Filing object when the SGML
    itself lacks those values.
    """
    filing_sgml = cls.from_source(filing.text_url)
    if not filing_sgml.accession_number:
        filing_sgml.header.filing_metadata.update('ACCESSION NUMBER', filing.accession_no)
    if not filing_sgml.header.filing_metadata.get("CIK"):
        # CIKs are conventionally rendered as 10-digit zero-padded strings
        filing_sgml.header.filing_metadata.update('CIK', str(filing.cik).zfill(10))
    if not filing_sgml.header.form:
        filing_sgml.header.filing_metadata.update("CONFORMED SUBMISSION TYPE", filing.form)
    return filing_sgml
def __str__(self) -> str:
    """String representation with basic filing info."""
    return (
        f"FilingSGML(accession={self.header.accession_number}, "
        f"document_count={len(self._documents_by_name)})"
    )
def __repr__(self) -> str:
    # Reuse __str__ so repr and str stay in sync
    return str(self)
def get_document_sequences(self) -> List[str]:
    """
    Return every document sequence key.

    Order is the dict's insertion order; no sorting is applied.
    """
    return [*self._documents_by_sequence]
def get_all_document_types(self) -> List[str]:
    """
    Return the unique document types present in this filing.

    A set is used for deduplication; result order is unspecified.
    """
    unique_types = set()
    for doc in self._documents_by_sequence.values():
        unique_types.add(doc.type)
    return list(unique_types)
def get_document_count(self) -> int:
    """Get total number of documents."""
    # The sequence index holds one entry per document
    return len(self._documents_by_sequence)

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,593 @@
import re
import warnings
from dataclasses import dataclass
from enum import Enum
from io import BytesIO
from typing import Iterator, Optional
from edgar.core import has_html_content
from edgar.sgml.tools import get_content_between_tags
from edgar.vendored import uu
__all__ = ['SGMLParser', 'SGMLFormatType', 'SGMLDocument', 'SECIdentityError', 'SECFilingNotFoundError', 'SECHTMLResponseError']
class SECIdentityError(Exception):
    """Raised when SEC rejects request due to invalid or missing EDGAR_IDENTITY.

    Corresponds to the SEC's "Undeclared Automated Tool" response page.
    """
    pass
class SECFilingNotFoundError(Exception):
    """Raised when SEC returns error for non-existent filing.

    Covers both S3 NoSuchKey XML errors and generic 404/Not Found pages.
    """
    pass
class SECHTMLResponseError(Exception):
    """Raised when SEC returns HTML content instead of expected SGML.

    Fallback error when the HTML/XML response matches no more specific case.
    """
    pass
class SGMLFormatType(Enum):
    # The two layouts EDGAR full-text submissions come in
    SEC_DOCUMENT = "sec_document"  # <SEC-DOCUMENT>...<SEC-HEADER> style
    SUBMISSION = "submission"  # <SUBMISSION>...<FILER> style
@dataclass
class SGMLDocument:
    """A single <DOCUMENT> section of an SGML filing, with lazy content decoding."""
    type: str
    sequence: str
    filename: str
    description: str
    raw_content: str = ""

    @classmethod
    def from_parsed_data(cls, data: dict) -> 'SGMLDocument':
        """Create document from parser output"""
        return cls(
            type=data['type'],
            sequence=data['sequence'],
            filename=data['filename'],
            description=data['description'],
            raw_content=data['content']
        )

    @property
    def content(self):
        """Content between the outermost known tags.

        UU-encoded payloads (starting with "begin") are decoded to bytes;
        everything else is returned as the extracted string.
        """
        raw_content = get_content_between_tags(self.raw_content)
        if raw_content and raw_content.startswith("begin"):
            # Suppress the binascii warning only for this decode. The
            # previous warnings.filterwarnings('ignore') disabled ALL
            # warnings for the rest of the process as a side effect.
            with warnings.catch_warnings():
                warnings.simplefilter('ignore')
                input_stream = BytesIO(raw_content.encode("utf-8"))
                output_stream = BytesIO()
                # Decode the UU content
                uu.decode(input_stream, output_stream, quiet=True)
                return output_stream.getvalue()
        return raw_content

    def __str__(self):
        return f"Document(type={self.type}, sequence={self.sequence}, filename={self.filename}, description={self.description})"

    def text(self) -> str:
        """Extract content between <TEXT> tags."""
        match = re.search(r'<TEXT>([\s\S]*?)</TEXT>', self.raw_content, re.DOTALL | re.IGNORECASE)
        return match.group(1).strip() if match else ""

    def xml(self) -> Optional[str]:
        """Extract content between <XML> tags if present."""
        match = re.search(r'<XML>([\s\S]*?)</XML>', self.raw_content, re.DOTALL | re.IGNORECASE)
        return match.group(1).strip() if match else None

    def html(self) -> Optional[str]:
        """Extract content between <HTML> tags if present."""
        match = re.search(r'<HTML>([\s\S]*?)</HTML>', self.raw_content, re.DOTALL | re.IGNORECASE)
        return match.group(1).strip() if match else None

    def xbrl(self) -> Optional[str]:
        """Extract content between <XBRL> tags if present."""
        match = re.search(r'<XBRL>([\s\S]*?)</XBRL>', self.raw_content, re.DOTALL | re.IGNORECASE)
        return match.group(1).strip() if match else None

    def get_content_type(self) -> str:
        """
        Determine the primary content type of the document.
        Returns: 'xml', 'html', 'xbrl', or 'text'
        """
        if self.xml():
            return 'xml'
        elif self.html():
            return 'html'
        elif self.xbrl():
            return 'xbrl'
        return 'text'
def _raise_sec_html_error(content: str):
    """
    Inspect HTML/XML error content returned by the SEC and raise the most
    specific exception that matches it.

    Args:
        content: HTML or XML content received from SEC
    Raises:
        SECIdentityError: For identity-related errors
        SECFilingNotFoundError: For missing filing errors
        SECHTMLResponseError: For other HTML/XML responses
    """
    # Identity / user-agent rejection page
    if "Your Request Originates from an Undeclared Automated Tool" in content:
        raise SECIdentityError(
            "SEC rejected request due to invalid or missing EDGAR_IDENTITY. "
            "Please set a valid identity using set_identity('Your Name your.email@domain.com'). "
            "See https://www.sec.gov/os/accessing-edgar-data"
        )
    # AWS S3 NoSuchKey error (XML format)
    s3_missing_key = (
        "<Code>NoSuchKey</Code>" in content
        and "<Message>The specified key does not exist.</Message>" in content
    )
    if s3_missing_key:
        raise SECFilingNotFoundError(
            "SEC filing not found - the specified key does not exist in EDGAR archives. "
            "Check that the accession number and filing date are correct."
        )
    # General not-found responses
    if any(marker in content for marker in ("Not Found", "404")):
        raise SECFilingNotFoundError(
            "SEC filing not found. Check that the accession number and filing date are correct."
        )
    # Anything else: generic HTML/XML response error
    raise SECHTMLResponseError(
        "SEC returned HTML or XML content instead of expected SGML filing data. "
        "This may indicate an invalid request or temporary SEC server issue."
    )
class SGMLParser:
    """Facade that detects the SGML layout and dispatches to the matching parser."""

    @staticmethod
    def detect_format(content: str) -> SGMLFormatType:
        """Detect SGML format based on root element"""
        stripped = content.lstrip()
        # Recognize valid SGML layouts before any error-content checks, to
        # avoid false positives when SGML embeds HTML inside <TEXT> sections.
        if stripped.startswith('<SUBMISSION>'):
            return SGMLFormatType.SUBMISSION
        # <IMS-DOCUMENT> covers old filings from the 1990's
        if any(marker in content for marker in ('<SEC-DOCUMENT>', '<IMS-DOCUMENT>')):
            return SGMLFormatType.SEC_DOCUMENT
        if '<DOCUMENT>' in content[:1000]:
            # Very old (1990's) filings can start directly with a document section
            return SGMLFormatType.SEC_DOCUMENT
        # Not valid SGML: classify the error content
        if has_html_content(content):
            _raise_sec_html_error(content)
        if stripped.startswith('<?xml') and '<Error>' in content:
            # XML error payloads such as AWS S3 NoSuchKey responses
            _raise_sec_html_error(content)
        raise ValueError("Unknown SGML format")

    def parse(self, content) -> dict:
        """Main entry point for parsing"""
        if self.detect_format(content) == SGMLFormatType.SUBMISSION:
            return self._parse_submission_format(content)
        return self._parse_sec_document_format(content)

    def _parse_submission_format(self, content):
        return SubmissionFormatParser().parse(content)

    def _parse_sec_document_format(self, content):
        return SecDocumentFormatParser().parse(content)
class SubmissionFormatParser:
    """Stateful parser for <SUBMISSION>-style SGML.

    Builds a nested dict mirroring the tag hierarchy of the header, then
    collects each <DOCUMENT> section verbatim for metadata extraction.
    """

    def __init__(self):
        # Initialize main data structure
        self.data = {
            'format': SGMLFormatType.SUBMISSION,
            'header': '',
            'documents': [],
        }
        # Parser state
        self.current_path = []  # Stack to track current position in hierarchy
        self.header_lines = []  # Collect header lines
        self.in_documents = False
        # Known section tags that can contain nested content
        self.SECTION_TAGS = {
            'FILER',
            'OWNER-DATA',
            'COMPANY-DATA',
            'REPORTING-OWNER',
            'ISSUER',
            'DEPOSITOR',
            'SECURITIZER',
            'UNDERWRITER',
            'ISSUING_ENTITY',
            'FORMER-COMPANY',
            'SUBJECT-COMPANY',
            'FILED-BY',
            'FORMER-NAME',
            'FILING-VALUES',
            'BUSINESS-ADDRESS',
            'MAIL-ADDRESS',
            'CLASS-CONTRACT',
            'SERIES',
            'NEW-SERIES',
            'NEW-CLASSES-CONTRACTS',
            'ACQUIRING-DATA',
            'TARGET-DATA',
            'SERIAL-COMPANY',
            'MERGER',
            'SERIES-AND-CLASSES-CONTRACTS-DATA',
            'NEW-SERIES-AND-CLASSES-CONTRACTS',
            'MERGER-SERIES-AND-CLASSES-CONTRACTS',
            'EXISTING-SERIES-AND-CLASSES-CONTRACTS',
            'RULE',
            'ITEM'
        }
        # Tags that can appear multiple times and should be stored as lists
        self.REPEATABLE_TAGS = {
            'FILER',
            'REPORTING-OWNER',
            'UNDERWRITER',
            'SERIES',
            'CLASS-CONTRACT',
            'FORMER-COMPANY',
            'SUBJECT-COMPANY',
            'ITEM'
        }

    def _get_current_context(self) -> dict:
        """Navigate to current position in data hierarchy."""
        context = self.data
        for path_element in self.current_path:
            tag, index = path_element
            # index is None for singleton sections; an int for repeatable ones
            if index is not None:
                context = context[tag][index]
            else:
                context = context[tag]
        return context

    def _is_unclosed_tag(self, line: str) -> bool:
        """Check if line is an unclosed tag with value (e.g. <ITEMS>value)."""
        line = line.strip()
        if not (line.startswith('<') and '>' in line and not line.startswith('</')):
            return False
        tag_end = line.index('>')
        content_after = line[tag_end + 1:].strip()
        return bool(content_after)

    def _is_section_end(self, line: str) -> bool:
        """Check if line ends a section (a closing </...> tag)."""
        return line.strip().startswith('</')

    def _is_section_start(self, line: str) -> bool:
        """Identifies if a line starts a new nested section.

        Only tags in SECTION_TAGS count; anything else is data or empty.
        """
        line = line.strip()
        if not line.startswith('<') or not line.endswith('>'):
            return False
        tag = line[1:-1]  # Remove < and >
        return tag in self.SECTION_TAGS

    def _is_data_tag(self, line: str) -> bool:
        """Identifies if a line contains a tag with a value."""
        line = line.strip()
        if not line.startswith('<'):
            return False
        parts = line.split('>')
        # Exactly one '>' and non-empty trailing text => <TAG>value
        return len(parts) == 2 and bool(parts[1].strip())

    def _is_empty_tag(self, line: str) -> bool:
        """Identifies if a line is an empty tag (no value, not a known section)."""
        line = line.strip()
        return (line.startswith('<') and
                line.endswith('>') and
                not line.startswith('</') and
                not self._is_section_start(line) and
                not self._is_data_tag(line))

    def _handle_section_start(self, line: str) -> None:
        """Handle start of nested section: push it onto the path stack."""
        tag = line.strip()[1:-1]  # Remove < and >
        current_context = self._get_current_context()
        # Initialize tag in current context if needed
        if tag not in current_context:
            if tag in self.REPEATABLE_TAGS:
                current_context[tag] = []
            else:
                current_context[tag] = {}
        # For repeatable tags, append new dict and track index
        if tag in self.REPEATABLE_TAGS:
            current_context[tag].append({})
            self.current_path.append((tag, len(current_context[tag]) - 1))
        else:
            self.current_path.append((tag, None))

    def _handle_section_end(self, line: str) -> None:
        """Handle end of nested section: pop the path stack.

        Raises:
            ValueError: if the closing tag does not match the open section.
        """
        tag = line.strip()[2:-1]  # Remove </ and >
        # Verify we're closing the correct tag
        current_tag, _ = self.current_path[-1]
        if tag != current_tag:
            raise ValueError(f"Mismatched tags: expected </{current_tag}>, got </{tag}>")
        # Pop the current section from the path
        self.current_path.pop()

    def _handle_data_tag(self, line: str) -> None:
        """Handle tags with values, storing them in the current context."""
        line = line.strip()
        tag_end = line.index('>')
        tag = line[1:tag_end]
        value = line[tag_end + 1:].strip()
        current_context = self._get_current_context()
        # Handle repeated tags: promote scalar to list on second occurrence
        if tag in current_context:
            if not isinstance(current_context[tag], list):
                current_context[tag] = [current_context[tag]]
            current_context[tag].append(value)
        else:
            current_context[tag] = value

    def _handle_empty_tag(self, line: str) -> None:
        """Handle empty tags by storing an empty-string value."""
        tag = line.strip()[1:-1]  # Remove < and >
        current_context = self._get_current_context()
        current_context[tag] = ""

    def _handle_unclosed_tag(self, line: str) -> None:
        """Handle tags like <ITEMS>value (no closing tag on the line)."""
        line = line.strip()
        tag_end = line.index('>')
        tag = line[1:tag_end]
        value = line[tag_end + 1:].strip()
        current_context = self._get_current_context()
        # Same scalar-to-list promotion as _handle_data_tag
        if tag in current_context:
            if not isinstance(current_context[tag], list):
                current_context[tag] = [current_context[tag]]
            current_context[tag].append(value)
        else:
            current_context[tag] = value

    def parse(self, content: str) -> dict:
        """Parse SGML content in SUBMISSION format.

        Returns the accumulated self.data dict: format, raw 'header' text,
        structured header fields, and a 'documents' list.
        """
        document_buffer = None
        for line in content.splitlines():
            # Check for document section
            if '<DOCUMENT>' in line:
                # NOTE(review): the header snapshot is (re)taken at each
                # <DOCUMENT>; a submission with no documents leaves
                # data['header'] empty — confirm this is acceptable.
                self.data['header'] = '\n'.join(self.header_lines)
                self.in_documents = True
                document_buffer = [line]
                continue
            if self.in_documents:
                if '</DOCUMENT>' in line:
                    document_buffer.append(line)
                    doc_content = '\n'.join(document_buffer)
                    doc_data = self._parse_document_section(doc_content)
                    if doc_data:
                        self.data['documents'].append(doc_data)
                    document_buffer = None
                elif document_buffer is not None:
                    document_buffer.append(line)
            else:
                # Header section parsing: keep the raw line, then classify
                # the stripped form through the handlers below.
                self.header_lines.append(line)
                line = line.strip()
                if not line:
                    continue
                if self._is_section_start(line):
                    self._handle_section_start(line)
                elif self._is_section_end(line):
                    self._handle_section_end(line)
                elif self._is_data_tag(line):
                    self._handle_data_tag(line)
                elif self._is_empty_tag(line):
                    self._handle_empty_tag(line)
                elif self._is_unclosed_tag(line):
                    self._handle_unclosed_tag(line)
        return self.data

    def _parse_document_section(self, content: str) -> dict:
        """Parse a single document section into metadata plus raw content."""
        doc_data = {
            'type': '',
            'sequence': '',
            'filename': '',
            'description': '',
            'content': content
        }
        # Extract document metadata (first occurrence of each tag)
        type_match = re.search(r'<TYPE>([^<\n]+)', content)
        if type_match:
            doc_data['type'] = type_match.group(1).strip()
        sequence_match = re.search(r'<SEQUENCE>([^<\n]+)', content)
        if sequence_match:
            doc_data['sequence'] = sequence_match.group(1).strip()
        filename_match = re.search(r'<FILENAME>([^<\n]+)', content)
        if filename_match:
            doc_data['filename'] = filename_match.group(1).strip()
        description_match = re.search(r'<DESCRIPTION>([^<\n]+)', content)
        if description_match:
            doc_data['description'] = description_match.group(1).strip()
        return doc_data
class SecDocumentFormatParser:
    """Parser for <SEC-DOCUMENT> style SGML"""

    def __init__(self):
        self.in_header = False  # True while between <SEC-HEADER>/<IMS-HEADER> tags
        self.data = {
            'format': SGMLFormatType.SEC_DOCUMENT,
            'header': '',
            'documents': [],
            'filer': {}
        }
        # NOTE(review): current_document is never written in this class —
        # possibly vestigial; confirm before removing.
        self.current_document = {}
        self.header_text = []

    def parse(self, content: str) -> dict:
        """Parse SGML content in SEC-DOCUMENT format
        Args:
            content: The full SGML content as string
        Returns:
            dict containing parsed header and documents
        """
        document_buffer = []
        for line in content.splitlines():
            if '<SEC-HEADER>' in line or '<IMS-HEADER>' in line:
                self.in_header = True
                continue
            elif '</SEC-HEADER>' in line or '</IMS-HEADER>' in line:
                self.in_header = False
                self.data['header'] = '\n'.join(self.header_text)
                continue
            if self.in_header:
                # Collect header text
                # NOTE(review): there is no `continue` here, so header lines
                # also fall through to the document buffering below; they are
                # discarded when the first <DOCUMENT> resets the buffer, but
                # confirm that was intended.
                self.header_text.append(line)
            # Handle document sections
            if '<DOCUMENT>' in line:
                document_buffer = []  # Start new document
            elif '</DOCUMENT>' in line and document_buffer:
                # Parse completed document
                doc_content = '\n'.join(document_buffer)
                doc_data = self._parse_document_section(doc_content)
                if doc_data:
                    self.data['documents'].append(doc_data)
                document_buffer = []
            elif document_buffer is not None:  # Currently collecting document content
                document_buffer.append(line)
        return self.data

    def _parse_document_section(self, content: str) -> dict:
        """Parse a single document section
        Args:
            content: Content between <DOCUMENT> tags
        Returns:
            dict with document metadata and content
        """
        doc_data = {
            'type': '',
            'sequence': '',
            'filename': '',
            'description': '',
            'content': content
        }
        # Extract document metadata using regex (first occurrence of each tag)
        type_match = re.search(r'<TYPE>([^<\n]+)', content)
        if type_match:
            doc_data['type'] = type_match.group(1).strip()
        sequence_match = re.search(r'<SEQUENCE>([^<\n]+)', content)
        if sequence_match:
            doc_data['sequence'] = sequence_match.group(1).strip()
        filename_match = re.search(r'<FILENAME>([^<\n]+)', content)
        if filename_match:
            doc_data['filename'] = filename_match.group(1).strip()
        description_match = re.search(r'<DESCRIPTION>([^<\n]+)', content)
        if description_match:
            doc_data['description'] = description_match.group(1).strip()
        return doc_data
def list_documents(content: str) -> list[SGMLDocument]:
    """
    Parse every SGML document section in *content* into a list.

    Args:
        content: The content string to parse
    Returns:
        List of SGMLDocument objects
    """
    documents = list(iter_documents(content))
    return documents
def iter_documents(content: str) -> Iterator[SGMLDocument]:
    """
    Yield an SGMLDocument for each <DOCUMENT>...</DOCUMENT> section found.

    Args:
        content: The content string to parse
    Yields:
        SGMLDocument objects containing the parsed content
    Raises:
        ValueError: If the source is invalid
        ConnectionError: If URL retrieval fails after retries
        FileNotFoundError: If the file path doesn't exist
    """
    for section in re.finditer(r'<DOCUMENT>([\s\S]*?)</DOCUMENT>', content):
        parsed = parse_document(section.group(1))
        if parsed:
            yield parsed
def parse_document(document_str: str) -> SGMLDocument:
    """
    Parse a single SGML document section, maintaining raw content.
    """
    def _field(tag: str) -> str:
        # First occurrence of <TAG>value on a line, stripped; "" if absent
        found = re.search(rf'<{tag}>([^<\n]+)', document_str)
        return found.group(1).strip() if found else ""

    return SGMLDocument(
        type=_field('TYPE'),
        sequence=_field('SEQUENCE'),
        filename=_field('FILENAME'),
        description=_field('DESCRIPTION'),
        raw_content=document_str,
    )

View File

@@ -0,0 +1,349 @@
"""
Module for converting HTML tables from filing reports to pandas DataFrames.
This provides an alternative to XBRL parsing by extracting data directly from
company-formatted HTML tables.
"""
import re
from dataclasses import dataclass
from typing import Optional, Union
import pandas as pd
from edgar.files.html import Document, TableNode
from edgar.files.tables import ProcessedTable
@dataclass
class TableMetadata:
    """Metadata extracted from table headers and content"""
    currency: Optional[str] = None  # matched currency token, e.g. '$' or 'USD'
    units: Optional[str] = None  # 'thousands' | 'millions' | 'billions'
    scaling_factor: Optional[int] = None  # multiplier implied by the units
    period_type: Optional[str] = None  # 'instant' or 'duration'
class FinancialTableExtractor:
    """Extract financial tables from HTML reports as pandas DataFrames"""
    # Common patterns for financial data
    # More comprehensive currency patterns
    CURRENCY_PATTERN = re.compile(
        r'\$|USD|EUR|GBP|JPY|CNY|CAD|AUD|CHF|'
        r'£|€|¥|₹|'  # Currency symbols
        r'\bDollars?\b|\bPounds?\b|\bEuros?\b|\bYen\b',
        re.IGNORECASE
    )
    # More flexible units pattern
    UNITS_PATTERN = re.compile(
        r'(?:in\s+)?(?:thousands?|millions?|billions?|000s?|000,000s?|mln|mil|bn)',
        re.IGNORECASE
    )
    # e.g. "1,000 = $1" style scaling statements
    SCALING_PATTERN = re.compile(r'(\d+(?:,\d{3})*)\s*=\s*\$?1')
    # More flexible date patterns to handle various formats
    PERIOD_PATTERN = re.compile(
        r'(\d{1,2}[\s/\-]\w{3,}[\s/\-]\d{2,4}|'  # 31-Dec-2024, 31/December/24
        r'\w{3,}\.?\s+\d{1,2},?\s+\d{4}|'  # December 31, 2024
        r'\d{4}[\s/\-]\d{1,2}[\s/\-]\d{1,2}|'  # 2024-12-31
        r'\d{1,2}[\s/\-]\d{1,2}[\s/\-]\d{2,4}|'  # 12/31/2024, 31-12-24
        r'Q[1-4]\s*\d{2,4}|'  # Q1 2024, Q12024
        r'\d{1}Q\s*\d{2,4}|'  # 1Q 2024, 1Q24
        r'FY\s*\d{2,4}|'  # FY 2024, FY24
        r'Fiscal\s+\d{4}|'  # Fiscal 2024
        r'Year\s+Ended)',  # Year Ended
        re.IGNORECASE
    )

    @classmethod
    def extract_table_to_dataframe(cls, table_node: TableNode) -> pd.DataFrame:
        """
        Convert a TableNode to a pandas DataFrame with appropriate data types.
        Args:
            table_node: The TableNode containing financial data
        Returns:
            pd.DataFrame with financial data, periods as columns, line items as index
        """
        try:
            # Get processed table
            processed_table = table_node._processed
            if not processed_table:
                return pd.DataFrame()
            # Extract metadata from headers
            metadata = cls._extract_metadata(table_node, processed_table)
            # Build DataFrame
            df = cls._build_dataframe(processed_table, metadata)
            # Apply data transformations
            df = cls._apply_transformations(df, metadata)
            return df
        except Exception:
            # Return empty DataFrame to allow processing to continue.
            # NOTE(review): comment below says "log" but nothing is logged —
            # consider logging the swallowed exception.
            # Log error but return empty DataFrame to allow processing to continue
            return pd.DataFrame()

    @classmethod
    def _extract_metadata(cls, table_node: TableNode, processed_table: ProcessedTable) -> TableMetadata:
        """Extract metadata from table headers and first few rows"""
        metadata = TableMetadata()
        # Check headers for currency and units
        if processed_table.headers:
            header_text = ' '.join(processed_table.headers)
            # Extract currency
            currency_match = cls.CURRENCY_PATTERN.search(header_text)
            if currency_match:
                metadata.currency = currency_match.group(0)
            # Extract units
            units_match = cls.UNITS_PATTERN.search(header_text)
            if units_match:
                unit_text = units_match.group(0).lower()
                if any(x in unit_text for x in ['thousand', '000s', '000,']):
                    metadata.scaling_factor = 1000
                    metadata.units = 'thousands'
                elif any(x in unit_text for x in ['million', 'mln', 'mil', '000,000']):
                    metadata.scaling_factor = 1000000
                    metadata.units = 'millions'
                elif any(x in unit_text for x in ['billion', 'bn']):
                    metadata.scaling_factor = 1000000000
                    metadata.units = 'billions'
        # Check if periods are durations or instants
        if processed_table.headers:
            period_headers = [h for h in processed_table.headers if cls.PERIOD_PATTERN.search(h)]
            if period_headers:
                # If headers contain "ended" it's likely duration periods
                if any('ended' in h.lower() for h in period_headers):
                    metadata.period_type = 'duration'
                else:
                    metadata.period_type = 'instant'
        return metadata

    @classmethod
    def _build_dataframe(cls, processed_table: ProcessedTable, metadata: TableMetadata) -> pd.DataFrame:
        """Build initial DataFrame from processed table.

        Detects "vertical" tables (label column + data columns, e.g. a Cover
        Page) heuristically and otherwise treats period-looking headers as
        data columns.
        """
        if not processed_table.data_rows:
            return pd.DataFrame()
        # Identify period columns and line item column
        headers = processed_table.headers or []
        period_cols = []
        line_item_col = 0
        # Check if this is a "vertical" table (like Cover Page)
        # where first column is labels and all others are data
        is_vertical_table = False
        if len(headers) >= 2:
            # Check if first column has label-like patterns
            first_header_lower = headers[0].lower() if headers[0] else ''
            first_is_label = any(pattern in first_header_lower for pattern in
                                 ['entity', 'line item', 'information', 'abstract', 'cover page',
                                  'detail', 'description', 'item'])
            # Check if this looks like a cover page or entity info table
            # by examining the first few data rows
            looks_like_entity_info = False
            if processed_table.data_rows and len(processed_table.data_rows) > 2:
                # Check if first column has entity/document field names
                first_col_values = []
                for row in processed_table.data_rows[:10]:  # Check more rows
                    if len(row) > 0 and isinstance(row[0], str):
                        first_col_values.append(row[0].lower())
                # More comprehensive patterns for vertical tables
                entity_patterns = ['entity', 'document', 'registrant', 'address',
                                   'file number', 'incorporation', 'fiscal', 'telephone',
                                   'securities', 'trading', 'exchange', 'ticker']
                # Count how many rows match entity patterns
                pattern_matches = sum(
                    any(pattern in val for pattern in entity_patterns)
                    for val in first_col_values
                )
                # If more than 30% of rows have entity-like labels, it's probably vertical
                looks_like_entity_info = pattern_matches >= len(first_col_values) * 0.3
            is_vertical_table = first_is_label or looks_like_entity_info
        if is_vertical_table:
            # For vertical tables, first column is index, rest are data
            line_item_col = 0
            period_cols = list(range(1, len(headers)))
            # Ensure we don't include the line item column
            if line_item_col in period_cols:
                period_cols.remove(line_item_col)
        else:
            # For standard tables, identify period columns
            for i, header in enumerate(headers):
                if cls.PERIOD_PATTERN.search(header):
                    period_cols.append(i)
                elif i == 0:  # First column is usually line items
                    line_item_col = i
        # Extract data
        data = []
        index = []
        for row in processed_table.data_rows:
            if len(row) > line_item_col:
                line_item = row[line_item_col].strip()
                if line_item and not line_item.isspace():
                    index.append(line_item)
                    row_data = []
                    for col_idx in period_cols:
                        if col_idx < len(row):
                            row_data.append(row[col_idx])
                        else:
                            row_data.append('')
                    data.append(row_data)
        # Create DataFrame
        if data:
            column_names = []
            for i, col_idx in enumerate(period_cols):
                if col_idx < len(headers):
                    # Clean up column name and make unique if needed
                    col_name = headers[col_idx].strip()
                    # If duplicate, append index
                    if col_name in column_names:
                        col_name = f"{col_name}_{i}"
                    column_names.append(col_name)
                else:
                    column_names.append(f'Col_{i}')
            df = pd.DataFrame(data, index=index, columns=column_names)
        else:
            df = pd.DataFrame()
        return df

    @classmethod
    def _apply_transformations(cls, df: pd.DataFrame, metadata: TableMetadata) -> pd.DataFrame:
        """Apply data type conversions and scaling"""
        if df.empty:
            return df
        # Convert numeric columns
        for col in df.columns:
            df[col] = df[col].apply(cls._parse_financial_value)
        # Apply scaling if specified
        if metadata.scaling_factor:
            numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
            df[numeric_cols] = df[numeric_cols] * metadata.scaling_factor
        # Add metadata as attributes
        df.attrs['currency'] = metadata.currency
        df.attrs['units'] = metadata.units
        df.attrs['scaling_factor'] = metadata.scaling_factor
        df.attrs['period_type'] = metadata.period_type
        return df

    @staticmethod
    def _parse_financial_value(value: str) -> Union[float, str]:
        """Parse a financial value string to float or keep as string.

        Empty markers (dashes, N/A, etc.) become 0.0; parenthesized and
        trailing-minus forms become negative; non-numeric text is returned
        unchanged.
        """
        if not isinstance(value, str):
            return value
        # Clean the value
        clean_value = value.strip()
        # Check for special markers and empty values
        # NOTE(review): several markers below appear to have lost their
        # characters in transit (likely em/en dashes) — verify against VCS.
        empty_markers = ['', '-', '', '', '', 'N/A', 'n/a', 'NA', 'nm', 'NM', '*', '**']
        if clean_value in empty_markers or not clean_value:
            return 0.0
        # Remove currency symbols, whitespace, and other common symbols
        # Keep negative sign and decimal points
        clean_value = re.sub(r'[£€¥₹$,\s]', '', clean_value)
        # Handle various negative formats
        if clean_value.startswith('(') and clean_value.endswith(')'):
            clean_value = '-' + clean_value[1:-1]
        elif clean_value.endswith('-'):  # Some companies put negative sign at end
            clean_value = '-' + clean_value[:-1]
        # Handle percentage values (remove % but keep the number)
        clean_value = clean_value.replace('%', '')
        # Try to convert to float
        try:
            return float(clean_value)
        except ValueError:
            # If it contains any digits, try harder to extract them
            if re.search(r'\d', clean_value):
                # Extract just the numeric part
                numeric_match = re.search(r'-?\d+\.?\d*', clean_value)
                if numeric_match:
                    try:
                        return float(numeric_match.group(0))
                    except ValueError:
                        pass
            # Return original if not numeric
            return value
def extract_statement_dataframe(report_content: str) -> pd.DataFrame:
    """
    Extract the first plausible financial table from report HTML content.

    Args:
        report_content: HTML content from a report
    Returns:
        pd.DataFrame containing the financial data (empty if none found)
    """
    document = Document.parse(report_content)
    tables = document.tables
    if not tables:
        return pd.DataFrame()
    for candidate in tables:
        # Tiny tables are usually headers or metadata, not statements
        if candidate.row_count < 3:
            continue
        if not _table_has_financial_data(candidate):
            continue
        frame = FinancialTableExtractor.extract_table_to_dataframe(candidate)
        if not frame.empty:
            return frame
    # No candidate qualified: fall back to the first table
    return FinancialTableExtractor.extract_table_to_dataframe(tables[0])
def _table_has_financial_data(table_node: TableNode) -> bool:
    """Heuristic check: does this table appear to contain financial numbers?"""
    rows = table_node.content
    if not rows:
        return False
    numeric_cells = 0
    cell_total = 0
    number_pattern = re.compile(r'\$?\s*\d+[,.]?\d*')
    # Sample only the first 10 rows; that is enough to classify the table
    for row in rows[:10]:
        for cell in row.cells:
            cell_total += 1
            if isinstance(cell.content, str) and number_pattern.search(cell.content):
                numeric_cells += 1
    # More than 20% numeric-looking cells => treat as a financial table
    return cell_total > 0 and (numeric_cells / cell_total) > 0.2

View File

@@ -0,0 +1,82 @@
import base64
import binascii
import re
__all__ = ['extract_text_between_tags', 'get_content_between_tags', 'strip_tags', 'is_xml', 'decode_uu']
def extract_text_between_tags(content: str, tag: str) -> str:
    """
    Extract the text between the first <tag> ... </tag> pair, line by line.

    :param content: The text content to search through
    :param tag: The tag to extract the content from
    :return: The extracted text between the tags (stripped)
    """
    open_marker = f'<{tag}>'
    close_marker = f'</{tag}>'
    inside = False
    collected = []
    for line in content.splitlines():
        if line.startswith(open_marker):
            inside = True  # start collecting after this line
        elif line.startswith(close_marker):
            break  # end tag found: stop reading
        elif inside:
            collected.append(line)
    return '\n'.join(collected).strip()
def get_content_between_tags(content: str, outer_tag: str = None) -> str:
    """
    Extract content between specified tags, starting from most nested tags.

    Args:
        content: Raw content containing tagged sections
        outer_tag: Optional specific tag to extract from (e.g. 'XBRL', 'TEXT')
    Returns:
        str: Content between the specified tags, or innermost content if no
        tag specified; '' when nothing matches
    """
    if outer_tag:
        candidate_tags = [outer_tag]
    else:
        # Ordered from most nested to least nested
        candidate_tags = ["PDF", "XBRL", "XML", "TEXT"]
    for tag in candidate_tags:
        match = re.search(f'<{tag}>(.*?)</{tag}>', content, re.DOTALL)
        if match:
            return match.group(1).strip()
    return ''
def strip_tags(text: str, start_tag: str, end_tag: str) -> str:
    """Strip a surrounding XML/HTML tag pair from text, if both are present."""
    wrapped = text.startswith(start_tag) and text.endswith(end_tag)
    if not wrapped:
        return text
    inner = text[len(start_tag):-len(end_tag)]
    return inner.strip()
def is_xml(filename: str) -> bool:
    """Check if a file is XML based on the file extension.
    Recognized extensions: .xsd, .xml, .xbrl (case-insensitive)
    """
    xml_extensions = ('.xsd', '.xml', '.xbrl')
    return filename.lower().endswith(xml_extensions)
def decode_uu(uu_content):
    """Decode a uuencoded payload to bytes.

    Replaces a previous hand-rolled decode that remapped UU characters and
    then fed them to base64.b64decode; uuencode and base64 use different
    alphabets and framing, so that round-trip produced incorrect bytes.
    binascii.a2b_uu implements the real per-line UU decode (leading length
    character followed by 4-characters-per-3-bytes groups).

    :param uu_content: Full uuencoded text, including the "begin" line
    :return: The decoded bytes
    :raises binascii.Error: if a data line is malformed
    """
    decoded = bytearray()
    for line in uu_content.split('\n')[1:]:  # Skip "begin" line
        # A blank line, backtick, or "end" marks the end of the data section
        if not line or line.startswith('`') or line.startswith('end'):
            break
        decoded.extend(binascii.a2b_uu(line))
    return bytes(decoded)