Initial commit
This commit is contained in:
4
venv/lib/python3.10/site-packages/edgar/sgml/__init__.py
Normal file
4
venv/lib/python3.10/site-packages/edgar/sgml/__init__.py
Normal file
@@ -0,0 +1,4 @@
|
||||
from edgar.sgml.filing_summary import FilingSummary, Report, Reports, Statements
|
||||
from edgar.sgml.sgml_common import FilingSGML, iter_documents, list_documents
|
||||
from edgar.sgml.sgml_header import Filer, FilingHeader, FilingMetadata, Issuer, ReportingOwner
|
||||
from edgar.sgml.sgml_parser import SGMLDocument
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
666
venv/lib/python3.10/site-packages/edgar/sgml/filing_summary.py
Normal file
666
venv/lib/python3.10/site-packages/edgar/sgml/filing_summary.py
Normal file
@@ -0,0 +1,666 @@
|
||||
import re
|
||||
from dataclasses import dataclass
|
||||
from enum import Enum
|
||||
from functools import lru_cache
|
||||
from typing import Dict, List, Optional, Set, Tuple, Union
|
||||
|
||||
import pyarrow as pa
|
||||
import pyarrow.compute as pc
|
||||
from bs4 import BeautifulSoup
|
||||
from rich import box
|
||||
from rich.console import Group
|
||||
from rich.panel import Panel
|
||||
from rich.table import Table
|
||||
from rich.text import Text
|
||||
|
||||
from edgar.core import DataPager, PagingState, log, strtobool
|
||||
from edgar.files.html import Document
|
||||
from edgar.richtools import print_rich, repr_rich, rich_to_text
|
||||
from edgar.xmltools import child_text
|
||||
|
||||
__all__ = ['Report', 'Reports', 'File', 'FilingSummary']
|
||||
|
||||
class Reports:
    """
    A collection of reports in a filing summary.

    Wraps a pyarrow Table of report metadata and supports iteration,
    paging (``next``/``previous``), filtering, and lookup by menu category,
    HTML file name, short name, or position.
    """

    def __init__(self,
                 data: pa.Table,
                 filing_summary: Optional['FilingSummary'] = None,
                 original_state: Optional[PagingState] = None,
                 title: Optional[str] = "Reports"):
        self.data: pa.Table = data
        self.data_pager = DataPager(data)
        self._original_state = original_state or PagingState(0, len(self.data))
        self.n = 0  # iteration cursor used by __iter__/__next__
        self._filing_summary = filing_summary
        self.title = title

    def __len__(self):
        return len(self.data)

    def __iter__(self):
        self.n = 0
        return self

    def __next__(self):
        if self.n < len(self.data):
            report = self._report_at(self.data, self.n)
            self.n += 1
            return report
        else:
            raise StopIteration

    def _report_at(self, data: pa.Table, index: int) -> 'Report':
        """Build a Report from row ``index`` of ``data`` (shared by __next__ and create_from_record)."""
        return Report(
            instance=data['instance'][index].as_py(),
            is_default=data['IsDefault'][index].as_py(),
            has_embedded_reports=data['HasEmbeddedReports'][index].as_py(),
            html_file_name=data['HtmlFileName'][index].as_py(),
            long_name=data['LongName'][index].as_py(),
            report_type=data['ReportType'][index].as_py(),
            role=data['Role'][index].as_py(),
            parent_role=data['ParentRole'][index].as_py(),
            short_name=data['ShortName'][index].as_py(),
            menu_category=data['MenuCategory'][index].as_py(),
            position=data['Position'][index].as_py(),
            reports=self
        )

    def current(self):
        """Display the current page ... which is the default for this filings object"""
        return self

    def next(self):
        """Show the next page"""
        data_page = self.data_pager.next()
        if data_page is None:
            log.warning("End of data .. use prev() \u2190 ")
            return None
        start_index, _ = self.data_pager._current_range
        paging_state = PagingState(page_start=start_index, num_records=len(self))
        # Propagate the filing summary link and title so Report.content
        # keeps working on paged collections (previously dropped here).
        return Reports(data_page,
                       filing_summary=self._filing_summary,
                       original_state=paging_state,
                       title=self.title)

    def previous(self):
        """
        Show the previous page of the data

        :return:
        """
        data_page = self.data_pager.previous()
        if data_page is None:
            log.warning(" No previous data .. use next() \u2192 ")
            return None
        start_index, _ = self.data_pager._current_range
        paging_state = PagingState(page_start=start_index, num_records=len(self))
        return Reports(data_page,
                       filing_summary=self._filing_summary,
                       original_state=paging_state,
                       title=self.title)

    def to_pandas(self):
        """Return the underlying report metadata as a pandas DataFrame."""
        return self.data.to_pandas()

    def __getitem__(self, item):
        # Position values are stored as strings in the table.
        record = self.filter("Position", str(item))
        if record:
            return record

    def create_from_record(self, data: pa.Table):
        """Build a Report from the first row of ``data``."""
        return self._report_at(data, 0)

    @property
    def long_names(self) -> List[str]:
        return self.data['LongName'].to_pylist()

    @property
    def short_names(self) -> List[str]:
        return self.data['ShortName'].to_pylist()

    def get_by_category(self, category: str):
        """
        Get a single report by category
        """
        data = self.data.filter(pc.equal(self.data['MenuCategory'], category))
        return Reports(data, filing_summary=self._filing_summary, title=category)

    @property
    def statements(self) -> Optional['Statements']:
        """
        Get all reports in the Statements category
        """
        reports = self.get_by_category('Statements')
        if reports:
            return Statements(reports)

    def get_by_filename(self, file_name: str):
        """
        Get a single report by file name
        """
        data = self.data.filter(pc.equal(self.data['HtmlFileName'], file_name))
        if len(data) == 1:
            return self.create_from_record(data)

    def get_by_short_name(self, short_name: str):
        """
        Get a single report by short name
        """
        data = self.data.filter(pc.equal(self.data['ShortName'], short_name))
        if len(data) == 1:
            return self.create_from_record(data)

    def filter(self, column: Union[str, List[str]], value: Union[str, List[str]]):
        """
        Filter the reports where each ``column`` contains one of ``value``.

        Returns a single Report when exactly one row matches, otherwise a
        new Reports collection.
        """
        if isinstance(column, str):
            column = [column]
        if isinstance(value, str):
            value = [value]
        # Convert value list to a pyarrow array for proper comparison
        value_set = pa.array(value)
        # Initialize mask using the first column
        mask = pc.is_in(self.data[column[0]], value_set)
        # Combine with subsequent columns using logical AND
        for col in column[1:]:
            mask = pc.and_(mask, pc.is_in(self.data[col], value_set))
        # Apply the mask to filter the data
        data = self.data.filter(mask)
        # Return a single Report or new Reports instance
        if len(data) == 1:
            return self.create_from_record(data)
        # Keep the filing summary link on filtered collections too
        # (previously dropped, which broke Report.content downstream).
        return Reports(data, filing_summary=self._filing_summary, title=self.title)

    def __rich__(self):
        table = Table(
            show_header=True,
            header_style="dim",
            show_lines=True,
            box=box.SIMPLE,
            border_style="bold grey54",
            row_styles=["", "bold"]
        )
        table.add_column("#", style="dim", justify="left")
        table.add_column("Report", style="bold", width=60)
        table.add_column("Category", width=12)
        table.add_column("File", justify="left")

        # Iterate through rows in current page
        for i in range(len(self)):
            position = self.data['Position'][i].as_py()
            row = [
                str(position) if position else "-",
                self.data['ShortName'][i].as_py(),
                self.data['MenuCategory'][i].as_py() or "",
                self.data['HtmlFileName'][i].as_py() or ""
            ]
            table.add_row(*row)

        panel = Panel(table, title=self.title, expand=False)
        return panel

    def __repr__(self):
        return repr_rich(self.__rich__())
|
||||
|
||||
|
||||
class Report:
    """
    A single rendered report from a filing summary (one "R file"), e.g. a
    financial statement or a note, identified by its HTML file name.
    """

    def __init__(self,
                 instance: Optional[str],
                 is_default: Optional[bool],
                 has_embedded_reports: Optional[bool],
                 long_name: Optional[str],
                 short_name: Optional[str],
                 menu_category: Optional[str],
                 position: Optional[int],
                 html_file_name: Optional[str],
                 report_type: Optional[str],
                 role: Optional[str],
                 parent_role: Optional[str] = None,
                 reports=None):
        self.instance = instance
        self.is_default = is_default
        self.has_embedded_reports = has_embedded_reports
        self.long_name = long_name
        self.short_name = short_name
        self.menu_category = menu_category
        self.position = position
        self.html_file_name = html_file_name
        self.report_type = report_type
        self.role = role
        self.parent_role = parent_role
        self._reports = reports  # back-link to the owning Reports collection
        # Per-instance cache for _get_report_table (see note there).
        self._table = None
        self._table_loaded = False

    @property
    def content(self):
        """
        Get the content of the report
        """
        sgml = self._reports._filing_summary._filing_sgml
        if sgml:
            return sgml.get_content(self.html_file_name)

    def text(self):
        """
        Get the text content of the report
        """
        table = self._get_report_table()
        if table:
            return rich_to_text(table.render(500))

    def _get_report_table(self):
        """
        Get the first table in the document.

        Cached on the instance with a plain attribute rather than
        ``@lru_cache``: an lru_cache on an instance method keys on ``self``
        and keeps every Report alive for the cache's lifetime (ruff B019).
        """
        if not self._table_loaded:
            document = Document.parse(self.content)
            if len(document.tables) == 0:
                log.warning(f"No tables found in {self.html_file_name}")
                self._table = None
            else:
                self._table = document.tables[0]
            self._table_loaded = True
        return self._table

    def view(self):
        """Render the report's first table to the console."""
        table = self._get_report_table()
        if table:
            print_rich(table.render(500))

    def to_dataframe(self):
        """
        Extract the report's financial table as a pandas DataFrame.

        Returns:
            pd.DataFrame: Financial data with periods as columns and line items as index.
            Returns empty DataFrame if no tables found.

        The DataFrame includes metadata attributes:
            - currency: The currency used (e.g., 'USD')
            - units: The units description (e.g., 'thousands')
            - scaling_factor: Numeric scaling factor (e.g., 1000 for thousands)
            - period_type: 'instant' or 'duration' for the time periods
        """
        from edgar.sgml.table_to_dataframe import extract_statement_dataframe

        content = self.content
        if content:
            return extract_statement_dataframe(content)

        import pandas as pd
        return pd.DataFrame()

    def __str__(self):
        return f"Report(short_name={self.short_name}, category={self.menu_category}, file_name={self.html_file_name})"

    def __rich__(self):
        return Panel(
            Text.assemble(("Report ", "dim"), (self.long_name, "bold")),
            subtitle=Text(self.menu_category, style='dim italic'),
            expand=False,
            width=400,
            height=4
        )

    def __repr__(self):
        return repr_rich(self.__rich__())
|
||||
|
||||
@dataclass
class File:
    """One <File> entry (input or supplemental) from a filing summary."""
    file_name: str                    # tag text, e.g. the source document name
    doc_type: Optional[str]           # 'doctype' attribute of the tag
    is_definitely_fs: Optional[bool]  # 'isDefinitelyFs' attribute, parsed as bool
    is_usgaap: Optional[bool]         # 'isUsgaap' attribute, parsed as bool
    original: Optional[str]           # 'original' attribute of the tag
|
||||
|
||||
class FilingSummary:
    """
    Parsed representation of an EDGAR ``FilingSummary.xml`` document:
    the collection of rendered reports, the input/supplemental files,
    and filing-level statistics.
    """

    def __init__(self,
                 reports: Reports,
                 short_name_map: Dict[str, Report],
                 category_map: Dict[str, List[Report]],
                 input_files: List[File],
                 supplemental_files: List[File],
                 report_format: Optional[str] = None,
                 context_count: Optional[int] = None,
                 element_count: Optional[int] = None,
                 entity_count: Optional[int] = None,
                 footnotes_reported: Optional[bool] = None,
                 segment_count: Optional[int] = None,
                 scenario_count: Optional[int] = None,
                 tuples_reported: Optional[bool] = None,
                 has_presentation_linkbase: Optional[bool] = None,
                 has_calculation_linkbase: Optional[bool] = None):
        self.reports: Reports = reports
        # Back-link so each Report can reach the filing via its Reports.
        self.reports._filing_summary = self
        self._short_name_map = short_name_map
        self._category_map = category_map
        self.input_files = input_files
        self.supplemental_files = supplemental_files
        self.report_format = report_format
        self.context_count = context_count
        self.element_count = element_count
        self.entity_count = entity_count
        self.footnotes_reported = footnotes_reported
        self.segment_count = segment_count
        self.scenario_count = scenario_count
        self.tuples_reported = tuples_reported
        self.has_presentation_linkbase = has_presentation_linkbase
        self.has_calculation_linkbase = has_calculation_linkbase
        self._filing_sgml = None  # set externally when attached to a FilingSGML

    @staticmethod
    def _parse_file_tag(file_tag) -> File:
        """Build a File from a <File> tag (shared by input and supplemental files)."""
        return File(
            file_name=file_tag.text,
            doc_type=file_tag.get('doctype'),
            is_definitely_fs=strtobool(file_tag.get('isDefinitelyFs')),
            is_usgaap=strtobool(file_tag.get('isUsgaap')),
            original=file_tag.get('original')
        )

    @classmethod
    def parse(cls, xml_text: str):
        """Parse the text of a FilingSummary.xml document into a FilingSummary."""
        soup = BeautifulSoup(xml_text, 'xml')
        root = soup.find('FilingSummary')

        # Main fields
        report_format = child_text(root, 'ReportFormat')
        context_count = child_text(root, 'ContextCount')
        element_count = child_text(root, 'ElementCount')
        entity_count = child_text(root, 'EntityCount')
        footnotes_reported = strtobool(child_text(root, 'FootnotesReported'))
        segment_count = child_text(root, 'SegmentCount')
        scenario_count = child_text(root, 'ScenarioCount')
        tuples_reported = strtobool(child_text(root, 'TuplesReported'))
        has_presentation_linkbase = strtobool(child_text(root, 'HasPresentationLinkbase'))
        has_calculation_linkbase = strtobool(child_text(root, 'HasCalculationLinkbase'))

        # Reports: extract each field once per tag and reuse the record for
        # the Report object (previously every field was parsed twice).
        short_name_map: Dict[str, Report] = {}
        category_map: Dict[str, List[Report]] = {}
        report_records = []
        for report_tag in root.find_all("Report"):
            record = {
                'instance': report_tag.get('instance'),
                'IsDefault': strtobool(child_text(report_tag, 'IsDefault')),
                'HasEmbeddedReports': strtobool(child_text(report_tag, 'HasEmbeddedReports')),
                'HtmlFileName': child_text(report_tag, 'HtmlFileName'),
                'LongName': child_text(report_tag, 'LongName'),
                'ReportType': child_text(report_tag, 'ReportType'),
                'Role': child_text(report_tag, 'Role'),
                'ParentRole': child_text(report_tag, 'ParentRole'),
                'ShortName': child_text(report_tag, 'ShortName'),
                'MenuCategory': child_text(report_tag, 'MenuCategory'),
                'Position': child_text(report_tag, 'Position')
            }
            report = Report(
                instance=record['instance'],
                is_default=record['IsDefault'],
                has_embedded_reports=record['HasEmbeddedReports'],
                html_file_name=record['HtmlFileName'],
                long_name=record['LongName'],
                report_type=record['ReportType'],
                role=record['Role'],
                parent_role=record['ParentRole'],
                short_name=record['ShortName'],
                menu_category=record['MenuCategory'],
                position=record['Position']
            )
            report_records.append(record)
            short_name_map[report.short_name] = report
            category_map.setdefault(report.menu_category, []).append(report)

        # Reports Data
        reports_obj = Reports(data=pa.Table.from_pylist(report_records))

        # Input Files
        input_files_tag = root.find('InputFiles')
        input_files = ([cls._parse_file_tag(tag) for tag in input_files_tag.find_all('File')]
                       if input_files_tag else [])

        # Supplemental Files
        supplemental_files_tag = root.find('SupplementalFiles')
        supplemental_files = ([cls._parse_file_tag(tag) for tag in supplemental_files_tag.find_all('File')]
                              if supplemental_files_tag else [])

        return cls(report_format=report_format,
                   short_name_map=short_name_map,
                   category_map=category_map,
                   context_count=context_count,
                   element_count=element_count,
                   entity_count=entity_count,
                   footnotes_reported=footnotes_reported,
                   segment_count=segment_count,
                   scenario_count=scenario_count,
                   tuples_reported=tuples_reported,
                   has_presentation_linkbase=has_presentation_linkbase,
                   has_calculation_linkbase=has_calculation_linkbase,
                   reports=reports_obj,
                   input_files=input_files,
                   supplemental_files=supplemental_files)

    def get_report_by_short_name(self, short_name: str) -> Optional[Report]:
        """Return the single report with this short name, if any."""
        return self.reports.get_by_short_name(short_name)

    def get_reports_by_category(self, category: str) -> Reports:
        """Return all reports in the given menu category."""
        return self.reports.get_by_category(category)

    def get_reports_by_filename(self, file_name: str) -> Optional[Report]:
        """Return the single report with this HTML file name, if any."""
        return self.reports.get_by_filename(file_name)

    @property
    def statements(self):
        """The reports in the 'Statements' category wrapped as Statements."""
        reports = self.get_reports_by_category('Statements')
        return Statements(reports)

    @property
    def tables(self):
        """The reports in the 'Tables' category."""
        return self.get_reports_by_category('Tables')

    def __str__(self):
        return f"FilingSummary(report_format={self.report_format})"

    def __rich__(self):
        renderables = [self.reports]
        return Panel(
            Group(*renderables),
            box=box.ROUNDED,
            title="Filing Summary"
        )

    def __repr__(self):
        return repr_rich(self.__rich__())
|
||||
|
||||
|
||||
class StatementType(Enum):
    """The kinds of financial statements detected in a filing summary."""
    INCOME = "income"
    BALANCE = "balance"
    CASH_FLOW = "cash_flow"
    COMPREHENSIVE_INCOME = "comprehensive_income"
    EQUITY = "equity"


class StatementMapper:
    """
    Maps free-form statement names (e.g. "CONSOLIDATED BALANCE SHEETS") to
    StatementType values using weighted regex patterns.
    """

    def __init__(self):
        # Pattern matchers per statement type: (regex, weight) where weight 3
        # is a high-confidence match and lower weights are alternatives.
        self.patterns = {
            StatementType.INCOME: [
                (r'(?i)statement.*of.*(?:operation|income|earning)s?(?!\s+and\s+comprehensive)', 3),
                # High confidence direct match
                (r'(?i)(?:operation|income|earning)s?\s+statement', 2),  # Alternative format
                (r'(?i)profit.*loss', 1),  # P&L reference
            ],
            StatementType.BALANCE: [
                (r'(?i)balance\s*sheet', 3),  # Very consistent naming
                (r'(?i)statement.*of.*financial\s+position', 2),  # Alternative format
            ],
            StatementType.CASH_FLOW: [
                (r'(?i)statement.*of.*cash\s*flows?', 3),  # Primary pattern
                (r'(?i)cash\s*flows?\s*statement', 2),  # Alternative format
            ],
            StatementType.COMPREHENSIVE_INCOME: [
                (r'(?i)statement.*of.*comprehensive\s*(?:income|loss)', 3),  # Primary pattern
                (r'(?i)comprehensive\s*(?:income|loss)\s*statement', 2),  # Alternative format
            ],
            StatementType.EQUITY: [
                (r'(?i)statement.*of.*(?:stockholders|shareholders|owners)[\'\s]*equity', 3),  # Primary pattern
                (r'(?i)statement.*of.*changes\s+in\s+(?:stockholders|shareholders|owners)[\'\s]*equity', 3),
                # With "changes in"
                (r'(?i)statement.*of.*equity', 2),  # Generic equity
            ]
        }

        # Patterns naming two statements at once (combined statements).
        self.combined_patterns = [
            (r'(?i)statement.*of.*operations?\s+and\s+comprehensive\s*(?:income|loss)',
             {StatementType.INCOME, StatementType.COMPREHENSIVE_INCOME}),
        ]

    def normalize_statement(self, statement: str) -> str:
        """Normalize statement name by removing common variations."""
        cleaned = statement.strip().upper()
        # Strip well-known leading qualifiers, one after another.
        for prefix in ('CONSOLIDATED', 'COMBINED'):
            if cleaned.startswith(prefix):
                cleaned = cleaned[len(prefix):].strip()
        return cleaned

    def match_statement(self, statement: str) -> Dict[StatementType, float]:
        """
        Match a statement name to possible statement types with confidence scores.
        Returns a dictionary of {StatementType: confidence_score}
        """
        normalized = self.normalize_statement(statement)

        # Combined statements win outright: both constituent types score 1.0.
        for pattern, types in self.combined_patterns:
            if re.search(pattern, normalized):
                return {stmt_type: 1.0 for stmt_type in types}

        # Otherwise take, per type, the best normalized weight (0-1 range).
        scores: Dict[StatementType, float] = {}
        for stmt_type, patterns in self.patterns.items():
            best = max(
                (weight / 3.0 for pattern, weight in patterns if re.search(pattern, normalized)),
                default=0
            )
            if best > 0:
                scores[stmt_type] = best
        return scores

    def classify_statement(self, statement: str, threshold: float = 0.5) -> Set[StatementType]:
        """
        Classify a statement into one or more statement types.
        Returns a set of StatementType enums.
        """
        matched = self.match_statement(statement)
        return {stmt_type for stmt_type, score in matched.items() if score >= threshold}

    def get_best_matches(self, statements: List[str]) -> Dict[StatementType, str]:
        """
        Given a list of statement names, returns the best matching statement
        for each statement type.
        """
        best: Dict[StatementType, Tuple[str, float]] = {}
        for statement in statements:
            for stmt_type, score in self.match_statement(statement).items():
                current = best.get(stmt_type)
                if current is None or score > current[1]:
                    best[stmt_type] = (statement, score)
        return {stmt_type: name for stmt_type, (name, _) in best.items()}
|
||||
|
||||
|
||||
class Statements:
    """
    A wrapper class for detected financial statements in a filing summary.
    """

    def __init__(self, statement_reports: Reports):
        self._reports = statement_reports
        self.statements = [report.short_name for report in self._reports]
        self.mapper = StatementMapper()
        # Per statement type: (best-matching statement name, confidence score).
        self._matches: Dict[StatementType, Tuple[str, float]] = {}
        self._initialize_matches()

    def _initialize_matches(self) -> None:
        """Record the best-scoring statement name for each statement type."""
        for name in self.statements:
            for stmt_type, score in self.mapper.match_statement(name).items():
                current = self._matches.get(stmt_type)
                if current is None or score > current[1]:
                    self._matches[stmt_type] = (name, score)

    def _get_statement(self, stmt_type: StatementType, threshold: float = 0.5) -> Optional[Report]:
        """Return the report for ``stmt_type`` when its score clears ``threshold``."""
        match = self._matches.get(stmt_type)
        if match is not None:
            name, score = match
            if score >= threshold:
                return self._reports.get_by_short_name(name)
        return None

    def __getitem__(self, item):
        return self._reports[item]

    @property
    def balance_sheet(self) -> Optional[Report]:
        """Returns the detected balance sheet statement."""
        return self._get_statement(StatementType.BALANCE)

    @property
    def income_statement(self) -> Optional[Report]:
        """Returns the detected income statement."""
        return self._get_statement(StatementType.INCOME)

    @property
    def cash_flow_statement(self) -> Optional[Report]:
        """Returns the detected cash flow statement."""
        return self._get_statement(StatementType.CASH_FLOW)

    @property
    def comprehensive_income_statement(self) -> Optional[Report]:
        """Returns the detected comprehensive income statement."""
        return self._get_statement(StatementType.COMPREHENSIVE_INCOME)

    @property
    def equity_statement(self) -> Optional[Report]:
        """Returns the detected equity statement."""
        return self._get_statement(StatementType.EQUITY)

    @property
    def detected_statements(self) -> Dict[StatementType, str]:
        """Returns all detected statements with scores above threshold."""
        return {
            stmt_type: name
            for stmt_type, (name, score) in self._matches.items()
            if score >= 0.5
        }

    def __rich__(self):
        return self._reports

    def __repr__(self):
        return repr_rich(self.__rich__())
||||
482
venv/lib/python3.10/site-packages/edgar/sgml/sgml_common.py
Normal file
482
venv/lib/python3.10/site-packages/edgar/sgml/sgml_common.py
Normal file
@@ -0,0 +1,482 @@
|
||||
import re
|
||||
import zipfile
|
||||
from collections import defaultdict
|
||||
from functools import cached_property
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING, DefaultDict, Dict, Iterator, List, Optional, Tuple, Union
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from edgar._filings import Filing
|
||||
|
||||
from edgar.attachments import Attachment, Attachments, get_document_type
|
||||
from edgar.httprequests import stream_with_retry
|
||||
from edgar.sgml.filing_summary import FilingSummary
|
||||
from edgar.sgml.sgml_header import FilingHeader
|
||||
from edgar.sgml.sgml_parser import SGMLDocument, SGMLFormatType, SGMLParser
|
||||
from edgar.sgml.tools import is_xml
|
||||
|
||||
__all__ = ['iter_documents', 'list_documents', 'FilingSGML', 'FilingHeader']
|
||||
|
||||
|
||||
def parse_document(document_str: str) -> SGMLDocument:
|
||||
"""
|
||||
Parse a single SGML document section, maintaining raw content.
|
||||
"""
|
||||
# Extract individual fields with separate patterns
|
||||
type_match = re.search(r'<TYPE>([^<\n]+)', document_str)
|
||||
sequence_match = re.search(r'<SEQUENCE>([^<\n]+)', document_str)
|
||||
filename_match = re.search(r'<FILENAME>([^<\n]+)', document_str)
|
||||
description_match = re.search(r'<DESCRIPTION>([^<\n]+)', document_str)
|
||||
|
||||
return SGMLDocument(
|
||||
type=type_match.group(1).strip() if type_match else "",
|
||||
sequence=sequence_match.group(1).strip() if sequence_match else "",
|
||||
filename=filename_match.group(1).strip() if filename_match else "",
|
||||
description=description_match.group(1).strip() if description_match else "",
|
||||
raw_content=document_str
|
||||
)
|
||||
|
||||
|
||||
def read_content(source: Union[str, Path]) -> Iterator[str]:
    """
    Read content from either a URL or file path, yielding lines as strings.
    Automatically handles gzip-compressed files with .gz extension.

    Args:
        source: Either a URL string or a file path

    Yields:
        str: Lines of content from the source

    Raises:
        TooManyRequestsError: If the server returns a 429 response
        FileNotFoundError: If the file path doesn't exist
        gzip.BadGzipFile: If the file is not a valid gzip file
    """
    is_url = isinstance(source, str) and source.startswith(('http://', 'https://'))
    if is_url:
        # Stream the URL with retry support, line by line.
        for response in stream_with_retry(source):
            for line in response.iter_lines():
                if line is not None:
                    yield line + "\n"
        return

    path = Path(source)
    if str(path).endswith('.gz'):
        # Transparently decompress gzip files.
        import gzip
        with gzip.open(path, 'rt', encoding='utf-8', errors='replace') as file:
            yield from file
    else:
        with path.open('r', encoding='utf-8', errors='replace') as file:
            yield from file


def read_content_as_string(source: Union[str, Path]) -> str:
    """
    Read content from either a URL or file path into a string.
    Uses existing read_content generator function.

    Args:
        source: Either a URL string or a file path

    Returns:
        str: Full content as string

    Raises:
        TooManyRequestsError: If the server returns a 429 response
        FileNotFoundError: If file path doesn't exist
    """
    # Decode any bytes chunks from a response; pass strings through.
    parts = [
        chunk.decode('utf-8', errors='replace') if isinstance(chunk, bytes) else chunk
        for chunk in read_content(source)
    ]
    return ''.join(parts)
|
||||
|
||||
|
||||
def iter_documents(source: Union[str, Path]) -> Iterator["SGMLDocument"]:
    """
    Stream SGML documents from either a URL or file path, yielding parsed documents.

    Args:
        source: Either a URL string or a file path (string or Path object)

    Yields:
        SGMLDocument objects containing the parsed content

    Raises:
        ValueError: If the source is invalid
        ConnectionError: If URL retrieval fails after retries
        FileNotFoundError: If the file path doesn't exist
    """
    try:
        full_text = ''.join(read_content(source))
        # Capture each <DOCUMENT>...</DOCUMENT> section (non-greedy).
        for section in re.findall(r'<DOCUMENT>([\s\S]*?)</DOCUMENT>', full_text):
            parsed = parse_document(section)
            if parsed:
                yield parsed
    except (ValueError, ConnectionError, FileNotFoundError) as e:
        # Re-raise the same exception type with the source in the message.
        raise type(e)(f"Error processing source {source}: {str(e)}") from e
|
||||
|
||||
def list_documents(source: Union[str, Path]) -> list["SGMLDocument"]:
    """
    Convenience method to parse all documents from a source into a list.

    Args:
        source: Either a URL string or a file path

    Returns:
        List of SGMLDocument objects
    """
    documents = iter_documents(source)
    return list(documents)
|
||||
|
||||
|
||||
def parse_file(source: Union[str, Path]) -> list["SGMLDocument"]:
    """
    Convenience method to parse all documents from a source into a list.

    Alias of :func:`list_documents`; delegates to it instead of duplicating
    the same ``list(iter_documents(source))`` body.

    Args:
        source: Either a URL string or a file path

    Returns:
        List of SGMLDocument objects
    """
    return list_documents(source)
|
||||
|
||||
def parse_submission_text(content: str) -> Tuple[FilingHeader, DefaultDict[str, List[SGMLDocument]]]:
    """
    Parse raw submission text into a filing header and its documents.

    Args:
        content (str): The raw text content of the submission.

    Returns:
        Tuple[FilingHeader, DefaultDict[str, List[SGMLDocument]]]:
            The parsed FilingHeader and a defaultdict mapping each document
            sequence identifier to its list of SGMLDocument objects.

    Details:
        - SUBMISSION-format content already carries parsed filer data, which
          is used directly to build the FilingHeader.
        - SEC-DOCUMENT-format content has its header parsed from SGML text;
          if that fails, parsing is retried with preprocessing enabled.
        - Each parsed document becomes an SGMLDocument, grouped by sequence.
    """
    # Parse the full structure (header + documents) in a single pass.
    structure = SGMLParser().parse(content)

    if structure['format'] == SGMLFormatType.SUBMISSION:
        # SUBMISSION format: filer data was already parsed into a dict.
        header = FilingHeader.parse_submission_format_header(parsed_data=structure)
    else:
        # SEC-DOCUMENT format: the raw header text needs additional parsing;
        # retry with preprocessing when the first attempt fails.
        try:
            header = FilingHeader.parse_from_sgml_text(structure['header'])
        except Exception:
            header = FilingHeader.parse_from_sgml_text(structure['header'], preprocess=True)

    # Group documents by their sequence identifier.
    grouped = defaultdict(list)
    for raw_document in structure['documents']:
        parsed_document = SGMLDocument.from_parsed_data(raw_document)
        grouped[parsed_document.sequence].append(parsed_document)
    return header, grouped
|
||||
|
||||
|
||||
|
||||
class FilingSGML:
    """
    Main class that parses and provides access to both the header and documents
    from an SGML filing.
    """
    # '__dict__' is kept alongside the named slots so that @cached_property
    # (which stores its result in the instance dict) continues to work.
    __slots__ = ('header', '_documents_by_sequence', '__dict__')

    def __init__(self, header: "FilingHeader", documents: "defaultdict[str, List[SGMLDocument]]"):
        """
        Initialize FilingSGML with parsed header and documents.

        Args:
            header (FilingHeader): Parsed header information
            documents: Parsed documents grouped by sequence number
        """
        self.header = header
        self._documents_by_sequence = documents
        # Secondary index: filename -> document, for O(1) lookups by name.
        self._documents_by_name: "Dict[str, SGMLDocument]" = {
            doc.filename: doc for doc_lst in documents.values() for doc in doc_lst
        }

    @property
    def accession_number(self):
        """The accession number from the filing header."""
        return self.header.accession_number

    @property
    def cik(self):
        """The CIK from the filing header."""
        return self.header.cik

    @cached_property
    def entity(self):
        """The Entity for this filing's CIK, or None when no CIK is available."""
        from edgar.entity import Entity
        cik = self.cik
        if cik:
            return Entity(cik)

    @property
    def form(self):
        """The submission form type from the header."""
        return self.header.form

    @property
    def filing_date(self):
        """The filing date from the header."""
        return self.header.filing_date

    @property
    def date_as_of_change(self):
        """The date-as-of-change value from the header."""
        return self.header.date_as_of_change

    @property
    def period_of_report(self):
        """The period of report from the header."""
        return self.header.period_of_report

    @property
    def effective_date(self):
        """The EFFECTIVE DATE metadata value, if present."""
        return self.header.filing_metadata.get('EFFECTIVE DATE')

    @property
    def path(self):
        """
        Get the root path of the filing.
        """
        if self.accession_number:
            return f"/Archives/edgar/data/{self.header.cik}/{self.accession_number.replace('-', '')}"
        else:
            return "/<SGML FILE>"

    def html(self):
        """Return the text of the primary HTML document, or None if absent/binary/empty."""
        html_document = self.attachments.primary_html_document
        if html_document and not html_document.is_binary() and not html_document.empty:
            html_text = self.get_content(html_document.document)
            if isinstance(html_text, bytes):
                html_text = html_text.decode('utf-8')
            return html_text

    def xml(self):
        """Return the text of the primary XML document, or None if absent/binary/empty."""
        xml_document = self.attachments.primary_xml_document
        if xml_document and not xml_document.is_binary() and not xml_document.empty:
            xml_text = self.get_content(xml_document.document)
            if isinstance(xml_text, bytes):
                xml_text = xml_text.decode('utf-8')
            return xml_text

    def get_content(self, filename: str) -> Optional[str]:
        """
        Get the content of a document by its filename, or None if not found.
        """
        document = self._documents_by_name.get(filename)
        if document:
            return document.content

    @cached_property
    def attachments(self) -> "Attachments":
        """
        Get all attachments from the filing, split into document files,
        data files, and primary documents.
        """
        is_datafile = False
        documents, datafiles, primary_files = [], [], []

        # Get the filing summary (may be None)
        filing_summary = self.filing_summary

        for sequence, document_lst in self._documents_by_sequence.items():
            for document in document_lst:
                attachment = Attachment(
                    sequence_number=sequence,
                    ixbrl=False,
                    path=f"{self.path}/{document.filename}",
                    document=document.filename,
                    document_type=get_document_type(filename=document.filename, declared_document_type=document.type),
                    description=document.description,
                    size=None,
                    sgml_document=document,
                    filing_sgml=self
                )
                # Enrich with the report purpose from the filing summary if available
                if filing_summary:
                    report = filing_summary.get_reports_by_filename(document.filename)
                    if report:
                        attachment.purpose = report.short_name
                # Sequence "1" is the primary document
                if sequence == "1":
                    primary_files.append(attachment)
                    documents.append(attachment)
                else:
                    # Latches to True at the first XML file; everything after
                    # that point is treated as a data file.
                    if not is_datafile:
                        is_datafile = is_xml(filename=document.filename)
                    if is_datafile:
                        datafiles.append(attachment)
                    else:
                        documents.append(attachment)

        return Attachments(document_files=documents, data_files=datafiles, primary_documents=primary_files, sgml=self)

    @cached_property
    def filing_summary(self):
        """Parse and return FilingSummary.xml if present in the filing, else None."""
        summary_attachment = self._documents_by_name.get("FilingSummary.xml")
        if summary_attachment:
            filing_summary = FilingSummary.parse(summary_attachment.content)
            # Wire back-references so reports can reach the summary and SGML.
            filing_summary.reports._filing_summary = filing_summary
            filing_summary._filing_sgml = self
            return filing_summary

    def download(self, path: Union[str, Path], archive: bool = False):
        """
        Download all the attachments to a specified path.
        If the path is a directory, each file is saved with its original name there.
        If archive is True, the attachments are saved in a zip file at the given path.

        Args:
            path: str or Path - The path to save the attachments
            archive: bool (default False) - If True, save the attachments in a zip file
        """
        # Coerce to Path so string arguments work (the signature accepts both).
        path = Path(path)
        if archive:
            if path.is_dir():
                raise ValueError("Path must be a zip file name to create zipfile")
            with zipfile.ZipFile(path, 'w') as zipf:
                for document in self._documents_by_name.values():
                    zipf.writestr(document.filename, document.content)
        else:
            if not path.is_dir():
                raise ValueError("Path must be a directory")
            for document in self._documents_by_name.values():
                file_path = path / document.filename
                content = document.content
                # Content may be bytes (e.g. uudecoded binaries) or str.
                if isinstance(content, bytes):
                    file_path.write_bytes(content)
                else:
                    file_path.write_text(content, encoding='utf-8')

    @property
    def primary_documents(self):
        """
        Get the primary documents from the filing.
        """
        return self.attachments.primary_documents

    @classmethod
    def from_source(cls, source: Union[str, Path]) -> "FilingSGML":
        """
        Create FilingSGML instance from either a URL or file path.
        Parses both header and documents.

        Args:
            source: Either a URL string or a file path

        Returns:
            FilingSGML: New instance with parsed header and documents

        Raises:
            ValueError: If header section cannot be found
            IOError: If file cannot be read
        """
        # Read content once, then parse header and documents from it.
        content = read_content_as_string(source)
        header, documents = parse_submission_text(content)
        return cls(header=header, documents=documents)

    @classmethod
    def from_text(cls, full_text_submission: str) -> "FilingSGML":
        """
        Create FilingSGML instance from a full text submission.
        Parses both header and documents.

        Args:
            full_text_submission: String containing full text submission

        Returns:
            FilingSGML: New instance with parsed header and documents

        Raises:
            ValueError: If header section cannot be found
        """
        header, documents = parse_submission_text(full_text_submission)
        return cls(header=header, documents=documents)

    def get_document_by_sequence(self, sequence: str) -> "Optional[SGMLDocument]":
        """
        Get the first document registered under a sequence number (O(1) lookup).
        """
        results = self._documents_by_sequence.get(sequence)
        if results and len(results) > 0:
            return results[0]

    def get_document_by_name(self, filename: str) -> "Optional[SGMLDocument]":
        """
        Get a document by its filename (O(1) lookup).
        """
        return self._documents_by_name.get(filename)

    @classmethod
    def from_filing(cls, filing: 'Filing') -> 'FilingSGML':
        """Create from a Filing object that provides text_url, backfilling header metadata."""
        filing_sgml = cls.from_source(filing.text_url)
        # Very old filings can lack these header fields; fill them from the Filing.
        if not filing_sgml.accession_number:
            filing_sgml.header.filing_metadata.update('ACCESSION NUMBER', filing.accession_no)
        if not filing_sgml.header.filing_metadata.get("CIK"):
            filing_sgml.header.filing_metadata.update('CIK', str(filing.cik).zfill(10))
        if not filing_sgml.header.form:
            filing_sgml.header.filing_metadata.update("CONFORMED SUBMISSION TYPE", filing.form)
        return filing_sgml

    def __str__(self) -> str:
        """String representation with basic filing info."""
        doc_count = len(self._documents_by_name)
        return f"FilingSGML(accession={self.header.accession_number}, document_count={doc_count})"

    def __repr__(self) -> str:
        return str(self)

    def get_document_sequences(self) -> List[str]:
        """
        Get all document sequence numbers (unordered).
        """
        return list(self._documents_by_sequence.keys())

    def get_all_document_types(self) -> List[str]:
        """
        Get unique document types in the filing.
        """
        # Fix: _documents_by_sequence.values() holds *lists* of documents, so the
        # original set comprehension read .type off a list and raised
        # AttributeError. _documents_by_name.values() yields documents directly.
        return list({doc.type for doc in self._documents_by_name.values()})

    def get_document_count(self) -> int:
        """Get total number of documents (counted by unique sequence)."""
        return len(self._documents_by_sequence)
|
||||
1030
venv/lib/python3.10/site-packages/edgar/sgml/sgml_header.py
Normal file
1030
venv/lib/python3.10/site-packages/edgar/sgml/sgml_header.py
Normal file
File diff suppressed because it is too large
Load Diff
593
venv/lib/python3.10/site-packages/edgar/sgml/sgml_parser.py
Normal file
593
venv/lib/python3.10/site-packages/edgar/sgml/sgml_parser.py
Normal file
@@ -0,0 +1,593 @@
|
||||
import re
|
||||
import warnings
|
||||
from dataclasses import dataclass
|
||||
from enum import Enum
|
||||
from io import BytesIO
|
||||
from typing import Iterator, Optional
|
||||
|
||||
from edgar.core import has_html_content
|
||||
from edgar.sgml.tools import get_content_between_tags
|
||||
from edgar.vendored import uu
|
||||
|
||||
__all__ = ['SGMLParser', 'SGMLFormatType', 'SGMLDocument', 'SECIdentityError', 'SECFilingNotFoundError', 'SECHTMLResponseError']
|
||||
|
||||
|
||||
class SECIdentityError(Exception):
    """Raised when SEC rejects request due to invalid or missing EDGAR_IDENTITY"""
|
||||
|
||||
|
||||
class SECFilingNotFoundError(Exception):
    """Raised when SEC returns error for non-existent filing"""
|
||||
|
||||
|
||||
class SECHTMLResponseError(Exception):
    """Raised when SEC returns HTML content instead of expected SGML"""
|
||||
|
||||
class SGMLFormatType(Enum):
    """The two SGML layouts produced by EDGAR."""
    SEC_DOCUMENT = "sec_document"   # <SEC-DOCUMENT>...<SEC-HEADER> style
    SUBMISSION = "submission"       # <SUBMISSION>...<FILER> style
|
||||
|
||||
|
||||
@dataclass
class SGMLDocument:
    """
    A single <DOCUMENT> section from an SGML filing: its declared metadata
    plus the raw section content.
    """
    type: str
    sequence: str
    filename: str
    description: str
    raw_content: str = ""

    @classmethod
    def from_parsed_data(cls, data: dict) -> 'SGMLDocument':
        """Create document from parser output (keys: type/sequence/filename/description/content)."""
        return cls(
            type=data['type'],
            sequence=data['sequence'],
            filename=data['filename'],
            description=data['description'],
            raw_content=data['content']
        )

    @property
    def content(self):
        """
        The document payload: the text between the outer content tags, or the
        decoded bytes when the payload is uuencoded (starts with 'begin').
        """
        raw_content = get_content_between_tags(self.raw_content)
        if raw_content:
            if raw_content.startswith("begin"):
                # Suppress the binascii warning emitted during uudecoding.
                # Fix: scope the suppression with catch_warnings() instead of
                # calling warnings.filterwarnings('ignore'), which permanently
                # mutated the process-global warning filters.
                with warnings.catch_warnings():
                    warnings.simplefilter('ignore')

                    input_stream = BytesIO(raw_content.encode("utf-8"))
                    output_stream = BytesIO()

                    # Decode the UU content
                    uu.decode(input_stream, output_stream, quiet=True)

                    # Get the decoded bytes
                    return output_stream.getvalue()
            return raw_content

    def __str__(self):
        return f"Document(type={self.type}, sequence={self.sequence}, filename={self.filename}, description={self.description})"

    def text(self) -> str:
        """Extract content between <TEXT> tags."""
        match = re.search(r'<TEXT>([\s\S]*?)</TEXT>', self.raw_content, re.DOTALL | re.IGNORECASE)
        return match.group(1).strip() if match else ""

    def xml(self) -> Optional[str]:
        """Extract content between <XML> tags if present."""
        match = re.search(r'<XML>([\s\S]*?)</XML>', self.raw_content, re.DOTALL | re.IGNORECASE)
        return match.group(1).strip() if match else None

    def html(self) -> Optional[str]:
        """Extract content between <HTML> tags if present."""
        match = re.search(r'<HTML>([\s\S]*?)</HTML>', self.raw_content, re.DOTALL | re.IGNORECASE)
        return match.group(1).strip() if match else None

    def xbrl(self) -> Optional[str]:
        """Extract content between <XBRL> tags if present."""
        match = re.search(r'<XBRL>([\s\S]*?)</XBRL>', self.raw_content, re.DOTALL | re.IGNORECASE)
        return match.group(1).strip() if match else None

    def get_content_type(self) -> str:
        """
        Determine the primary content type of the document.
        Returns: 'xml', 'html', 'xbrl', or 'text'
        """
        if self.xml():
            return 'xml'
        elif self.html():
            return 'html'
        elif self.xbrl():
            return 'xbrl'
        return 'text'
|
||||
|
||||
def _raise_sec_html_error(content: str):
    """
    Analyze HTML/XML error content from SEC and raise appropriate specific exception.

    Never returns normally: one of the three exceptions below is always raised.

    Args:
        content: HTML or XML content received from SEC

    Raises:
        SECIdentityError: For identity-related errors
        SECFilingNotFoundError: For missing filing errors
        SECHTMLResponseError: For other HTML/XML responses
    """
    # Check for identity error (SEC's "undeclared automated tool" page)
    if "Your Request Originates from an Undeclared Automated Tool" in content:
        raise SECIdentityError(
            "SEC rejected request due to invalid or missing EDGAR_IDENTITY. "
            "Please set a valid identity using set_identity('Your Name your.email@domain.com'). "
            "See https://www.sec.gov/os/accessing-edgar-data"
        )

    # Check for AWS S3 NoSuchKey error (XML format) — both markers required
    if "<Code>NoSuchKey</Code>" in content and "<Message>The specified key does not exist.</Message>" in content:
        raise SECFilingNotFoundError(
            "SEC filing not found - the specified key does not exist in EDGAR archives. "
            "Check that the accession number and filing date are correct."
        )

    # Check for general not found errors
    # NOTE(review): "404" anywhere in the body triggers this — could match
    # unrelated text; confirm this is intentional.
    if "Not Found" in content or "404" in content:
        raise SECFilingNotFoundError(
            "SEC filing not found. Check that the accession number and filing date are correct."
        )

    # Generic HTML/XML response error (fallback for anything unrecognized)
    raise SECHTMLResponseError(
        "SEC returned HTML or XML content instead of expected SGML filing data. "
        "This may indicate an invalid request or temporary SEC server issue."
    )
|
||||
|
||||
|
||||
class SGMLParser:
    """Detects the SGML layout of EDGAR content and dispatches to the matching parser."""

    @staticmethod
    def detect_format(content: str) -> SGMLFormatType:
        """Detect SGML format based on root element.

        Valid SGML markers are checked before HTML detection, so HTML embedded
        inside <TEXT> sections does not produce a false positive.
        """
        stripped = content.lstrip()

        if stripped.startswith('<SUBMISSION>'):
            return SGMLFormatType.SUBMISSION
        # <IMS-DOCUMENT> and a near-leading <DOCUMENT> occur in 1990s-era
        # filings; all three markers map to the SEC-DOCUMENT layout.
        if ('<SEC-DOCUMENT>' in content
                or '<IMS-DOCUMENT>' in content
                or '<DOCUMENT>' in content[:1000]):
            return SGMLFormatType.SEC_DOCUMENT

        # Not recognizable SGML — classify the error response instead.
        if has_html_content(content):
            _raise_sec_html_error(content)

        # XML error payloads (e.g. AWS S3 NoSuchKey errors)
        if stripped.startswith('<?xml') and '<Error>' in content:
            _raise_sec_html_error(content)

        raise ValueError("Unknown SGML format")

    def parse(self, content) -> dict:
        """Main entry point for parsing: detect the format, then delegate."""
        if self.detect_format(content) == SGMLFormatType.SUBMISSION:
            return self._parse_submission_format(content)
        return self._parse_sec_document_format(content)

    def _parse_submission_format(self, content):
        """Delegate to the <SUBMISSION>-style parser."""
        return SubmissionFormatParser().parse(content)

    def _parse_sec_document_format(self, content):
        """Delegate to the <SEC-DOCUMENT>-style parser."""
        return SecDocumentFormatParser().parse(content)
|
||||
|
||||
|
||||
class SubmissionFormatParser:
    """
    Line-oriented parser for <SUBMISSION>-style SGML.

    Header tags are accumulated into a nested dict keyed by tag name
    (tags in REPEATABLE_TAGS become lists of dicts/values), while each
    <DOCUMENT> section is buffered verbatim and parsed separately.
    """

    def __init__(self):
        # Initialize main data structure
        self.data = {
            'format': SGMLFormatType.SUBMISSION,
            'header': '',
            'documents': [],
        }

        # Parser state
        self.current_path = []  # Stack to track current position in hierarchy
        self.header_lines = []  # Collect header lines
        self.in_documents = False  # True once the first <DOCUMENT> is seen

        # Known section tags that can contain nested content
        self.SECTION_TAGS = {
            'FILER',
            'OWNER-DATA',
            'COMPANY-DATA',
            'REPORTING-OWNER',
            'ISSUER',
            'DEPOSITOR',
            'SECURITIZER',
            'UNDERWRITER',
            'ISSUING_ENTITY',
            'FORMER-COMPANY',
            'SUBJECT-COMPANY',
            'FILED-BY',
            'FORMER-NAME',
            'FILING-VALUES',
            'BUSINESS-ADDRESS',
            'MAIL-ADDRESS',
            'CLASS-CONTRACT',
            'SERIES',
            'NEW-SERIES',
            'NEW-CLASSES-CONTRACTS',
            'ACQUIRING-DATA',
            'TARGET-DATA',
            'SERIAL-COMPANY',
            'MERGER',
            'SERIES-AND-CLASSES-CONTRACTS-DATA',
            'NEW-SERIES-AND-CLASSES-CONTRACTS',
            'MERGER-SERIES-AND-CLASSES-CONTRACTS',
            'EXISTING-SERIES-AND-CLASSES-CONTRACTS',
            'RULE',
            'ITEM'
        }

        # Tags that can appear multiple times and should be stored as lists
        self.REPEATABLE_TAGS = {
            'FILER',
            'REPORTING-OWNER',
            'UNDERWRITER',
            'SERIES',
            'CLASS-CONTRACT',
            'FORMER-COMPANY',
            'SUBJECT-COMPANY',
            'ITEM'
        }

    def _get_current_context(self) -> dict:
        """Navigate to current position in data hierarchy."""
        context = self.data
        for path_element in self.current_path:
            tag, index = path_element
            if index is not None:
                # Repeatable section: descend into the indexed occurrence
                context = context[tag][index]
            else:
                context = context[tag]
        return context

    def _is_unclosed_tag(self, line: str) -> bool:
        """Check if line is an unclosed tag with value (e.g. <ITEMS>value)."""
        line = line.strip()
        if not (line.startswith('<') and '>' in line and not line.startswith('</')):
            return False

        tag_end = line.index('>')
        content_after = line[tag_end + 1:].strip()
        return bool(content_after)

    def _is_section_end(self, line: str) -> bool:
        """Check if line ends a section (closing tag)."""
        return line.strip().startswith('</')

    def _is_section_start(self, line: str) -> bool:
        """Identifies if a line starts a new nested section (a bare known SECTION_TAG)."""
        line = line.strip()
        if not line.startswith('<') or not line.endswith('>'):
            return False

        tag = line[1:-1]  # Remove < and >
        return tag in self.SECTION_TAGS

    def _is_data_tag(self, line: str) -> bool:
        """Identifies if a line contains a tag with a value (exactly one '>' then text)."""
        line = line.strip()
        if not line.startswith('<'):
            return False

        parts = line.split('>')
        return len(parts) == 2 and bool(parts[1].strip())

    def _is_empty_tag(self, line: str) -> bool:
        """Identifies if a line is an empty tag (no value, not a known section)."""
        line = line.strip()
        return (line.startswith('<') and
                line.endswith('>') and
                not line.startswith('</') and
                not self._is_section_start(line) and
                not self._is_data_tag(line))

    def _handle_section_start(self, line: str) -> None:
        """Handle start of nested section: push it onto the path stack."""
        tag = line.strip()[1:-1]  # Remove < and >

        current_context = self._get_current_context()

        # Initialize tag in current context if needed
        if tag not in current_context:
            if tag in self.REPEATABLE_TAGS:
                current_context[tag] = []
            else:
                current_context[tag] = {}

        # For repeatable tags, append new dict and track index
        if tag in self.REPEATABLE_TAGS:
            current_context[tag].append({})
            self.current_path.append((tag, len(current_context[tag]) - 1))
        else:
            self.current_path.append((tag, None))

    def _handle_section_end(self, line: str) -> None:
        """Handle end of nested section: pop the path stack, checking tag balance."""
        tag = line.strip()[2:-1]  # Remove </ and >

        # Verify we're closing the correct tag
        current_tag, _ = self.current_path[-1]
        if tag != current_tag:
            raise ValueError(f"Mismatched tags: expected </{current_tag}>, got </{tag}>")

        # Pop the current section from the path
        self.current_path.pop()

    def _handle_data_tag(self, line: str) -> None:
        """Handle tags with values, promoting repeated tags to lists."""
        line = line.strip()
        tag_end = line.index('>')
        tag = line[1:tag_end]
        value = line[tag_end + 1:].strip()

        current_context = self._get_current_context()

        # Handle repeated tags: convert the existing scalar to a list first
        if tag in current_context:
            if not isinstance(current_context[tag], list):
                current_context[tag] = [current_context[tag]]
            current_context[tag].append(value)
        else:
            current_context[tag] = value

    def _handle_empty_tag(self, line: str) -> None:
        """Handle empty tags by recording an empty-string value."""
        tag = line.strip()[1:-1]  # Remove < and >
        current_context = self._get_current_context()
        current_context[tag] = ""

    def _handle_unclosed_tag(self, line: str) -> None:
        """Handle tags like <ITEMS>value (same list-promotion as data tags)."""
        line = line.strip()
        tag_end = line.index('>')
        tag = line[1:tag_end]
        value = line[tag_end + 1:].strip()

        current_context = self._get_current_context()

        if tag in current_context:
            if not isinstance(current_context[tag], list):
                current_context[tag] = [current_context[tag]]
            current_context[tag].append(value)
        else:
            current_context[tag] = value

    def parse(self, content: str) -> dict:
        """Parse SGML content in SUBMISSION format.

        Returns the accumulated ``self.data`` dict: format, raw header text,
        parsed document dicts, plus any nested header tags recorded at the top
        level.
        """
        document_buffer = None  # None means "not inside a <DOCUMENT> section"

        for line in content.splitlines():
            # Check for document section; the first one also finalizes the header
            if '<DOCUMENT>' in line:
                self.data['header'] = '\n'.join(self.header_lines)
                self.in_documents = True
                document_buffer = [line]
                continue

            if self.in_documents:
                if '</DOCUMENT>' in line:
                    document_buffer.append(line)
                    doc_content = '\n'.join(document_buffer)
                    doc_data = self._parse_document_section(doc_content)
                    if doc_data:
                        self.data['documents'].append(doc_data)
                    document_buffer = None
                elif document_buffer is not None:
                    document_buffer.append(line)
            else:
                # Header section parsing: keep raw text, then classify the line
                self.header_lines.append(line)
                line = line.strip()

                if not line:
                    continue

                # Order matters: section start/end before data/empty/unclosed
                if self._is_section_start(line):
                    self._handle_section_start(line)
                elif self._is_section_end(line):
                    self._handle_section_end(line)
                elif self._is_data_tag(line):
                    self._handle_data_tag(line)
                elif self._is_empty_tag(line):
                    self._handle_empty_tag(line)
                elif self._is_unclosed_tag(line):
                    self._handle_unclosed_tag(line)

        return self.data

    def _parse_document_section(self, content: str) -> dict:
        """Parse a single document section into a metadata dict plus raw content."""
        doc_data = {
            'type': '',
            'sequence': '',
            'filename': '',
            'description': '',
            'content': content
        }

        # Extract document metadata; each tag is unclosed, so the value runs
        # to the next '<' or end of line.
        type_match = re.search(r'<TYPE>([^<\n]+)', content)
        if type_match:
            doc_data['type'] = type_match.group(1).strip()

        sequence_match = re.search(r'<SEQUENCE>([^<\n]+)', content)
        if sequence_match:
            doc_data['sequence'] = sequence_match.group(1).strip()

        filename_match = re.search(r'<FILENAME>([^<\n]+)', content)
        if filename_match:
            doc_data['filename'] = filename_match.group(1).strip()

        description_match = re.search(r'<DESCRIPTION>([^<\n]+)', content)
        if description_match:
            doc_data['description'] = description_match.group(1).strip()

        return doc_data
|
||||
|
||||
class SecDocumentFormatParser:
    """Parser for <SEC-DOCUMENT> style SGML"""

    def __init__(self):
        # True while scanning lines between <SEC-HEADER> and </SEC-HEADER>
        self.in_header = False
        self.data = {
            'format': SGMLFormatType.SEC_DOCUMENT,
            'header': '',
            'documents': [],
            'filer': {}
        }
        self.current_document = {}
        self.header_text = []

    def parse(self, content: str) -> dict:
        """Parse SGML content in SEC-DOCUMENT format

        Args:
            content: The full SGML content as string

        Returns:
            dict containing parsed header and documents
        """
        # Fix: use None as the "not inside a <DOCUMENT>" sentinel. The original
        # initialized this to [] (which is not None but falsy), so header and
        # pre-document lines were buffered needlessly, and a stray </DOCUMENT>
        # with an empty buffer fell through and appended the closing tag.
        document_buffer = None

        for line in content.splitlines():
            if '<SEC-HEADER>' in line or '<IMS-HEADER>' in line:
                self.in_header = True
                continue
            elif '</SEC-HEADER>' in line or '</IMS-HEADER>' in line:
                self.in_header = False
                self.data['header'] = '\n'.join(self.header_text)
                continue

            if self.in_header:
                # Collect header text; header lines are never document content
                self.header_text.append(line)
                continue

            # Handle document sections
            if '<DOCUMENT>' in line:
                document_buffer = []  # Start new document
            elif '</DOCUMENT>' in line and document_buffer is not None:
                # Parse completed document
                doc_content = '\n'.join(document_buffer)
                doc_data = self._parse_document_section(doc_content)
                if doc_data:
                    self.data['documents'].append(doc_data)
                document_buffer = None
            elif document_buffer is not None:
                # Currently collecting document content
                document_buffer.append(line)

        return self.data

    def _parse_document_section(self, content: str) -> dict:
        """Parse a single document section

        Args:
            content: Content between <DOCUMENT> tags

        Returns:
            dict with document metadata and content
        """
        doc_data = {
            'type': '',
            'sequence': '',
            'filename': '',
            'description': '',
            'content': content
        }

        # Extract document metadata using regex; each tag is unclosed, so the
        # value runs to the next '<' or end of line.
        for key, tag in (('type', 'TYPE'),
                         ('sequence', 'SEQUENCE'),
                         ('filename', 'FILENAME'),
                         ('description', 'DESCRIPTION')):
            match = re.search(rf'<{tag}>([^<\n]+)', content)
            if match:
                doc_data[key] = match.group(1).strip()

        return doc_data
|
||||
|
||||
def list_documents(content: str) -> list[SGMLDocument]:
    """
    Parse every <DOCUMENT> section in a content string.

    Args:
        content: The content string to parse

    Returns:
        List of SGMLDocument objects
    """
    parsed = iter_documents(content)
    return list(parsed)
|
||||
|
||||
def iter_documents(content: str) -> Iterator[SGMLDocument]:
    """
    Yield an SGMLDocument for each <DOCUMENT> section found in `content`.

    Note: the previous docstring was copied from the URL/file-based variant in
    sgml_common and wrongly documented ConnectionError/FileNotFoundError; this
    function only parses an in-memory string and raises neither.

    Args:
        content: The content string to parse

    Yields:
        SGMLDocument objects containing the parsed content
    """
    document_pattern = re.compile(r'<DOCUMENT>([\s\S]*?)</DOCUMENT>')

    for match in document_pattern.finditer(content):
        document = parse_document(match.group(1))
        if document:
            yield document
|
||||
|
||||
|
||||
def parse_document(document_str: str) -> SGMLDocument:
    """
    Parse a single SGML document section, maintaining raw content.

    Header fields are single-line values of the form ``<TAG>value``;
    a missing tag yields an empty string for that field.
    """
    def _tag_value(tag: str) -> str:
        # Capture text after <TAG> up to the next '<' or end of line.
        found = re.search(rf'<{tag}>([^<\n]+)', document_str)
        return found.group(1).strip() if found else ""

    return SGMLDocument(
        type=_tag_value('TYPE'),
        sequence=_tag_value('SEQUENCE'),
        filename=_tag_value('FILENAME'),
        description=_tag_value('DESCRIPTION'),
        raw_content=document_str
    )
|
||||
@@ -0,0 +1,349 @@
|
||||
"""
|
||||
Module for converting HTML tables from filing reports to pandas DataFrames.
|
||||
This provides an alternative to XBRL parsing by extracting data directly from
|
||||
company-formatted HTML tables.
|
||||
"""
|
||||
|
||||
import re
|
||||
from dataclasses import dataclass
|
||||
from typing import Optional, Union
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from edgar.files.html import Document, TableNode
|
||||
from edgar.files.tables import ProcessedTable
|
||||
|
||||
|
||||
@dataclass
class TableMetadata:
    """Metadata extracted from table headers and content.

    Populated by FinancialTableExtractor._extract_metadata; every field
    defaults to None when the corresponding hint is absent from the headers.
    """
    # Currency symbol or code found in the headers (e.g. '$', 'USD').
    currency: Optional[str] = None
    # Human-readable scale label: 'thousands', 'millions' or 'billions'.
    units: Optional[str] = None
    # Multiplier applied to numeric cells (1000, 1000000, 1000000000).
    scaling_factor: Optional[int] = None
    period_type: Optional[str] = None  # 'instant' or 'duration'
|
||||
|
||||
|
||||
class FinancialTableExtractor:
    """Extract financial tables from HTML reports as pandas DataFrames.

    Works directly on company-formatted HTML tables (via TableNode /
    ProcessedTable) as an alternative to XBRL parsing. All methods are
    class/static methods; the class carries only shared regex patterns.
    """

    # Common patterns for financial data
    # More comprehensive currency patterns
    CURRENCY_PATTERN = re.compile(
        r'\$|USD|EUR|GBP|JPY|CNY|CAD|AUD|CHF|'
        r'£|€|¥|₹|'  # Currency symbols
        r'\bDollars?\b|\bPounds?\b|\bEuros?\b|\bYen\b',
        re.IGNORECASE
    )
    # More flexible units pattern
    UNITS_PATTERN = re.compile(
        r'(?:in\s+)?(?:thousands?|millions?|billions?|000s?|000,000s?|mln|mil|bn)',
        re.IGNORECASE
    )
    # Matches legends like "1,000 = $1".
    # NOTE(review): not referenced anywhere in this class — possibly kept
    # for external callers; confirm before removing.
    SCALING_PATTERN = re.compile(r'(\d+(?:,\d{3})*)\s*=\s*\$?1')
    # More flexible date patterns to handle various formats
    PERIOD_PATTERN = re.compile(
        r'(\d{1,2}[\s/\-]\w{3,}[\s/\-]\d{2,4}|'  # 31-Dec-2024, 31/December/24
        r'\w{3,}\.?\s+\d{1,2},?\s+\d{4}|'  # December 31, 2024
        r'\d{4}[\s/\-]\d{1,2}[\s/\-]\d{1,2}|'  # 2024-12-31
        r'\d{1,2}[\s/\-]\d{1,2}[\s/\-]\d{2,4}|'  # 12/31/2024, 31-12-24
        r'Q[1-4]\s*\d{2,4}|'  # Q1 2024, Q12024
        r'\d{1}Q\s*\d{2,4}|'  # 1Q 2024, 1Q24
        r'FY\s*\d{2,4}|'  # FY 2024, FY24
        r'Fiscal\s+\d{4}|'  # Fiscal 2024
        r'Year\s+Ended)',  # Year Ended
        re.IGNORECASE
    )

    @classmethod
    def extract_table_to_dataframe(cls, table_node: TableNode) -> pd.DataFrame:
        """
        Convert a TableNode to a pandas DataFrame with appropriate data types.

        Args:
            table_node: The TableNode containing financial data

        Returns:
            pd.DataFrame with financial data, periods as columns, line items as
            index. An empty DataFrame is returned on any failure so callers can
            continue processing other tables.
        """
        try:
            # Get processed table.
            # NOTE(review): reaches into the private ``_processed`` attribute of
            # TableNode — relies on TableNode's internals staying stable.
            processed_table = table_node._processed
            if not processed_table:
                return pd.DataFrame()

            # Extract metadata from headers
            metadata = cls._extract_metadata(table_node, processed_table)

            # Build DataFrame
            df = cls._build_dataframe(processed_table, metadata)

            # Apply data transformations
            df = cls._apply_transformations(df, metadata)

            return df

        except Exception:
            # Swallow any parsing error and return an empty DataFrame so
            # processing can continue. NOTE(review): despite the original
            # comment, nothing is actually logged here.
            return pd.DataFrame()

    @classmethod
    def _extract_metadata(cls, table_node: TableNode, processed_table: ProcessedTable) -> TableMetadata:
        """Extract currency/units/scaling/period-type hints from table headers.

        Only the header row is inspected; data rows are not examined here
        (despite the original one-liner mentioning "first few rows").
        """
        metadata = TableMetadata()

        # Check headers for currency and units
        if processed_table.headers:
            header_text = ' '.join(processed_table.headers)

            # Extract currency
            currency_match = cls.CURRENCY_PATTERN.search(header_text)
            if currency_match:
                metadata.currency = currency_match.group(0)

            # Extract units — the matched phrase decides the scaling factor.
            units_match = cls.UNITS_PATTERN.search(header_text)
            if units_match:
                unit_text = units_match.group(0).lower()
                if any(x in unit_text for x in ['thousand', '000s', '000,']):
                    metadata.scaling_factor = 1000
                    metadata.units = 'thousands'
                elif any(x in unit_text for x in ['million', 'mln', 'mil', '000,000']):
                    metadata.scaling_factor = 1000000
                    metadata.units = 'millions'
                elif any(x in unit_text for x in ['billion', 'bn']):
                    metadata.scaling_factor = 1000000000
                    metadata.units = 'billions'

        # Check if periods are durations or instants
        if processed_table.headers:
            period_headers = [h for h in processed_table.headers if cls.PERIOD_PATTERN.search(h)]
            if period_headers:
                # If headers contain "ended" it's likely duration periods
                if any('ended' in h.lower() for h in period_headers):
                    metadata.period_type = 'duration'
                else:
                    metadata.period_type = 'instant'

        return metadata

    @classmethod
    def _build_dataframe(cls, processed_table: ProcessedTable, metadata: TableMetadata) -> pd.DataFrame:
        """Build the initial DataFrame from the processed table.

        Two layouts are handled:
        - "vertical" tables (e.g. cover pages) where column 0 holds labels and
          every other column is data;
        - standard statement tables where period columns are detected by
          PERIOD_PATTERN and column 0 holds line items.
        The line-item column becomes the DataFrame index.
        """
        if not processed_table.data_rows:
            return pd.DataFrame()

        # Identify period columns and line item column
        headers = processed_table.headers or []
        period_cols = []
        line_item_col = 0

        # Check if this is a "vertical" table (like Cover Page)
        # where first column is labels and all others are data
        is_vertical_table = False
        if len(headers) >= 2:
            # Check if first column has label-like patterns
            first_header_lower = headers[0].lower() if headers[0] else ''
            first_is_label = any(pattern in first_header_lower for pattern in
                                 ['entity', 'line item', 'information', 'abstract', 'cover page',
                                  'detail', 'description', 'item'])

            # Check if this looks like a cover page or entity info table
            # by examining the first few data rows
            looks_like_entity_info = False
            if processed_table.data_rows and len(processed_table.data_rows) > 2:
                # Check if first column has entity/document field names
                first_col_values = []
                for row in processed_table.data_rows[:10]:  # Check more rows
                    if len(row) > 0 and isinstance(row[0], str):
                        first_col_values.append(row[0].lower())

                # More comprehensive patterns for vertical tables
                entity_patterns = ['entity', 'document', 'registrant', 'address',
                                   'file number', 'incorporation', 'fiscal', 'telephone',
                                   'securities', 'trading', 'exchange', 'ticker']

                # Count how many rows match entity patterns
                pattern_matches = sum(
                    any(pattern in val for pattern in entity_patterns)
                    for val in first_col_values
                )

                # If more than 30% of rows have entity-like labels, it's probably vertical
                looks_like_entity_info = pattern_matches >= len(first_col_values) * 0.3

            is_vertical_table = first_is_label or looks_like_entity_info

        if is_vertical_table:
            # For vertical tables, first column is index, rest are data
            line_item_col = 0
            period_cols = list(range(1, len(headers)))
            # Ensure we don't include the line item column
            if line_item_col in period_cols:
                period_cols.remove(line_item_col)
        else:
            # For standard tables, identify period columns
            for i, header in enumerate(headers):
                if cls.PERIOD_PATTERN.search(header):
                    period_cols.append(i)
                elif i == 0:  # First column is usually line items
                    line_item_col = i

        # Extract data: index comes from the line-item column; each data row
        # is padded with '' for period columns the row does not reach.
        data = []
        index = []

        for row in processed_table.data_rows:
            if len(row) > line_item_col:
                line_item = row[line_item_col].strip()
                if line_item and not line_item.isspace():
                    index.append(line_item)
                    row_data = []
                    for col_idx in period_cols:
                        if col_idx < len(row):
                            row_data.append(row[col_idx])
                        else:
                            row_data.append('')
                    data.append(row_data)

        # Create DataFrame
        if data:
            column_names = []
            for i, col_idx in enumerate(period_cols):
                if col_idx < len(headers):
                    # Clean up column name and make unique if needed
                    col_name = headers[col_idx].strip()
                    # If duplicate, append index
                    if col_name in column_names:
                        col_name = f"{col_name}_{i}"
                    column_names.append(col_name)
                else:
                    # Header missing for this column — synthesize a name.
                    column_names.append(f'Col_{i}')

            df = pd.DataFrame(data, index=index, columns=column_names)
        else:
            df = pd.DataFrame()

        return df

    @classmethod
    def _apply_transformations(cls, df: pd.DataFrame, metadata: TableMetadata) -> pd.DataFrame:
        """Convert cell strings to numbers, apply scaling, attach metadata.

        Mutates and returns *df*: every column is run through
        _parse_financial_value, numeric columns are multiplied by the scaling
        factor (if any), and the TableMetadata fields are stored in df.attrs.
        """
        if df.empty:
            return df

        # Convert numeric columns
        for col in df.columns:
            df[col] = df[col].apply(cls._parse_financial_value)

        # Apply scaling if specified — only to columns pandas inferred as numeric.
        if metadata.scaling_factor:
            numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
            df[numeric_cols] = df[numeric_cols] * metadata.scaling_factor

        # Add metadata as attributes (pandas DataFrame.attrs dict).
        df.attrs['currency'] = metadata.currency
        df.attrs['units'] = metadata.units
        df.attrs['scaling_factor'] = metadata.scaling_factor
        df.attrs['period_type'] = metadata.period_type

        return df

    @staticmethod
    def _parse_financial_value(value: str) -> Union[float, str]:
        """Parse a financial value string to float, or return it unchanged.

        Handles currency symbols, thousands separators, parenthesized and
        trailing-minus negatives, percent signs, and dash/N/A placeholders
        (which map to 0.0). Non-string inputs pass through untouched.
        """
        if not isinstance(value, str):
            return value

        # Clean the value
        clean_value = value.strip()

        # Check for special markers and empty values.
        # NOTE(review): the em dash '—' appears twice in this list — a
        # harmless duplicate.
        empty_markers = ['—', '-', '–', '—', '‒', 'N/A', 'n/a', 'NA', 'nm', 'NM', '*', '**']
        if clean_value in empty_markers or not clean_value:
            return 0.0

        # Remove currency symbols, whitespace, and other common symbols
        # Keep negative sign and decimal points
        clean_value = re.sub(r'[£€¥₹$,\s]', '', clean_value)

        # Handle various negative formats
        if clean_value.startswith('(') and clean_value.endswith(')'):
            clean_value = '-' + clean_value[1:-1]
        elif clean_value.endswith('-'):  # Some companies put negative sign at end
            clean_value = '-' + clean_value[:-1]

        # Handle percentage values (remove % but keep the number)
        clean_value = clean_value.replace('%', '')

        # Try to convert to float
        try:
            return float(clean_value)
        except ValueError:
            # If it contains any digits, try harder to extract them
            if re.search(r'\d', clean_value):
                # Extract just the numeric part
                numeric_match = re.search(r'-?\d+\.?\d*', clean_value)
                if numeric_match:
                    try:
                        return float(numeric_match.group(0))
                    except ValueError:
                        pass

        # Return original if not numeric
        return value
|
||||
|
||||
|
||||
def extract_statement_dataframe(report_content: str) -> pd.DataFrame:
    """
    Convenience function to extract a DataFrame from report HTML content.

    Args:
        report_content: HTML content from a report

    Returns:
        pd.DataFrame containing the financial data (empty if none found)
    """
    document = Document.parse(report_content)
    tables = document.tables

    if not tables:
        return pd.DataFrame()

    # Prefer the first sufficiently large table that looks financial.
    for candidate in tables:
        # Tiny tables are usually headers or metadata, not statements.
        if candidate.row_count < 3:
            continue
        if not _table_has_financial_data(candidate):
            continue
        frame = FinancialTableExtractor.extract_table_to_dataframe(candidate)
        if not frame.empty:
            return frame

    # No candidate qualified — fall back to the first table anyway.
    return FinancialTableExtractor.extract_table_to_dataframe(tables[0])
|
||||
|
||||
|
||||
def _table_has_financial_data(table_node: TableNode) -> bool:
|
||||
"""Check if a table contains financial data by looking for numeric patterns"""
|
||||
if not table_node.content:
|
||||
return False
|
||||
|
||||
# Check first few rows for numeric data
|
||||
numeric_count = 0
|
||||
total_cells = 0
|
||||
|
||||
for _i, row in enumerate(table_node.content[:10]): # Check first 10 rows
|
||||
for cell in row.cells:
|
||||
total_cells += 1
|
||||
if isinstance(cell.content, str):
|
||||
# Check for financial number patterns
|
||||
if re.search(r'\$?\s*\d+[,.]?\d*', cell.content):
|
||||
numeric_count += 1
|
||||
|
||||
# If more than 20% of cells have numbers, likely a financial table
|
||||
return total_cells > 0 and (numeric_count / total_cells) > 0.2
|
||||
82
venv/lib/python3.10/site-packages/edgar/sgml/tools.py
Normal file
82
venv/lib/python3.10/site-packages/edgar/sgml/tools.py
Normal file
@@ -0,0 +1,82 @@
|
||||
import base64
import binascii
import re
|
||||
|
||||
__all__ = ['extract_text_between_tags', 'get_content_between_tags', 'strip_tags', 'is_xml', 'decode_uu']
|
||||
|
||||
def extract_text_between_tags(content: str, tag: str) -> str:
    """
    Extracts text from provided content between the specified HTML/XML tags.

    Only lines *between* a line starting with ``<tag>`` and a line starting
    with ``</tag>`` are captured; scanning stops at the first closing tag.

    :param content: The text content to search through
    :param tag: The tag to extract the content from
    :return: The extracted text between the tags, stripped of outer whitespace
    """
    tag_start = f'<{tag}>'
    tag_end = f'</{tag}>'
    inside = False
    captured = []

    for line in content.splitlines():
        if line.startswith(tag_start):
            inside = True  # start capturing from the next line
        elif line.startswith(tag_end):
            break  # stop reading once the end tag is found
        elif inside:
            captured.append(line)

    # join() instead of repeated '+=' — linear rather than quadratic
    # for large tagged sections.
    return '\n'.join(captured).strip()
|
||||
|
||||
|
||||
def get_content_between_tags(content: str, outer_tag: str = None) -> str:
    """
    Extract content between specified tags, starting from most nested tags.

    Args:
        content: Raw content containing tagged sections
        outer_tag: Optional specific tag to extract from (e.g. 'XBRL', 'TEXT')

    Returns:
        str: Content between the specified tags, or innermost content if no
        tag specified; '' when nothing matches
    """
    # When no tag is given, probe the known tags from most to least nested.
    candidate_tags = [outer_tag] if outer_tag else ["PDF", "XBRL", "XML", "TEXT"]

    for tag in candidate_tags:
        found = re.search(f'<{tag}>(.*?)</{tag}>', content, re.DOTALL)
        if found:
            return found.group(1).strip()

    return ''
|
||||
|
||||
|
||||
def strip_tags(text: str, start_tag: str, end_tag: str) -> str:
    """Strip a matching XML/HTML tag pair from text if present.

    :param text: The text possibly wrapped in the tag pair
    :param start_tag: The opening tag to remove from the front
    :param end_tag: The closing tag to remove from the back
    :return: The inner text (stripped) if both tags match, else *text* unchanged
    """
    if text.startswith(start_tag) and text.endswith(end_tag):
        # Use an explicit end index: the previous ``-len(end_tag)`` slice
        # became ``-0 == 0`` for an empty end_tag, wrongly returning ''.
        return text[len(start_tag):len(text) - len(end_tag)].strip()
    return text
|
||||
|
||||
def is_xml(filename: str) -> bool:
    """Return True when the filename carries an XML-family extension.

    Recognized (case-insensitive): .xsd, .xml, .xbrl
    """
    xml_extensions = ('.xsd', '.xml', '.xbrl')
    lowered = filename.lower()
    return any(lowered.endswith(ext) for ext in xml_extensions)
|
||||
|
||||
|
||||
def decode_uu(uu_content):
    """
    Decode a uuencoded text payload to raw bytes.

    Args:
        uu_content: Full uuencoded text, including the leading "begin" line.

    Returns:
        bytes: The decoded binary content.

    Raises:
        binascii.Error: If a data line is malformed.
    """
    # The previous implementation remapped UU characters and passed them to
    # base64.b64decode, which is not the uuencode scheme (each UU line starts
    # with a length byte and uses a different alphabet) and produced garbage.
    # binascii.a2b_uu decodes one UU line correctly, honoring the length byte.
    decoded = bytearray()
    for line in uu_content.split('\n')[1:]:  # Skip the "begin" header line
        # A '`' line (zero-length data) or "end" terminates the payload.
        if line.startswith('`') or line.startswith('end'):
            break
        if not line.strip():
            continue  # tolerate stray blank lines
        # rstrip() drops CR/trailing pad spaces; a2b_uu zero-pads short lines.
        decoded.extend(binascii.a2b_uu(line.rstrip()))
    return bytes(decoded)
|
||||
Reference in New Issue
Block a user