# edgar/files/htmltools.py
import re
import warnings
from dataclasses import dataclass
from functools import lru_cache, partial
from io import StringIO
from typing import Any, Callable, Dict, List, Optional
import numpy as np
import pandas as pd
from rich import box
from rich.panel import Panel
from rich.table import Table
from edgar.core import pandas_version
from edgar.datatools import compress_dataframe
from edgar.files.html_documents import Block, HtmlDocument, LinkBlock, TableBlock, table_to_markdown
from edgar.richtools import repr_rich
__all__ = [
    "Element",
    "extract_tables",
    "chunks2df",
    "html_to_text",
    "html_sections",
    "decimal_chunk_fn",
    "ChunkedDocument",
    "remove_bold_tags",
    "detect_decimal_items",
    "adjust_for_empty_items",
    "get_text_elements",
]
@dataclass
class Element:
id: str
type: str
element: Any
summary: Optional[str] = None
table: Optional[pd.DataFrame] = None
def extract_tables(html_str: str,
                   table_filters: Optional[List[Callable]] = None) -> List[pd.DataFrame]:
    table_filters = table_filters or [filter_tiny_table]
    tables = pd.read_html(StringIO(html_str))
    # Compress the tables, keeping only those that pass every filter
    tables = [
        compress_dataframe(table)
        for table in tables
        if all(tf(table) for tf in table_filters)
    ]
# Filter out empty tables
tables = [table for table in tables if len(table) > 0]
return tables
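# Illustrative example, assuming a minimal HTML string containing one table:
#
#   html = "<table><tr><td>Revenue</td><td>100</td></tr></table>"
#   tables = extract_tables(html)
#   # tables[0] is a DataFrame holding the table's contents (after compress_dataframe)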
def html_sections(html_str: str,
ignore_tables: bool = False) -> List[str]:
"""split the html into sections"""
document = HtmlDocument.from_html(html_str)
return list(document.generate_text_chunks(ignore_tables=ignore_tables))
def html_to_text(html_str: str,
ignore_tables: bool = True,
sep: str = '\n'
) -> str:
document = HtmlDocument.from_html(html_str)
if not ignore_tables:
return document.text
return sep.join([chunk for chunk in document.generate_text_chunks(ignore_tables=True)])
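# Illustrative usage, assuming `filing_html` holds the HTML of a filing:
#
#   text = html_to_text(filing_html)                       # plain text, tables omitted
#   text = html_to_text(filing_html, ignore_tables=False)  # includes table text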
def is_inline_xbrl(html: str) -> bool:
return "xmlns:ix=" in html[:2000]
def filter_tiny_table(table: pd.DataFrame, min_rows: int = 1, min_cols: int = 1) -> bool:
    """Return True if the table meets the minimum row and column counts."""
    return len(table) >= min_rows and len(table.columns) >= min_cols
def remove_bold_tags(html_content):
    # Replace <b>...</b> and <strong>...</strong> tags (including any attributes) with their content
    html_content = re.sub(r'<b\b[^>]*>(.*?)</b>', r'\1', html_content, flags=re.IGNORECASE | re.DOTALL)
    html_content = re.sub(r'<strong\b[^>]*>(.*?)</strong>', r'\1', html_content, flags=re.IGNORECASE | re.DOTALL)
return html_content
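# Illustrative example:
#
#   remove_bold_tags('A <b>bold</b> and <strong>strong</strong> word')
#   # -> 'A bold and strong word'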
def get_text_elements(elements: List[Element]):
return [e for e in elements if e.type == "text"]
@lru_cache(maxsize=8)
def chunk(html: str):
document = HtmlDocument.from_html(html)
return list(document.generate_chunks())
int_item_pattern = r"^(Item\s{1,3}[0-9]{1,2}[A-Z]?)\.?"
decimal_item_pattern = r"^(Item\s{1,3}[0-9]{1,2}\.[0-9]{2})\.?"
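# Illustrative matches for the two item patterns:
#
#   re.match(int_item_pattern, "Item 1A. Risk Factors").group(1)                  # 'Item 1A'
#   re.match(decimal_item_pattern, "Item 5.02. Departure of Directors").group(1)  # 'Item 5.02'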
def detect_table_of_contents(text: str):
"""Find the table of contents in the text"""
return text.lower().count('item') > 10
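# Illustrative example: a chunk mentioning 'Item' more than 10 times is treated
# as the table of contents:
#
#   detect_table_of_contents("Item 1 Item 1A Item 2 " * 6)  # True (18 occurrences)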
def detect_signature(text: str) -> bool:
    """Find the signature block in the text"""
    # Use re.search (not re.match) so '^SIGNATURE' can match at the start of any line
    matched = re.search(pattern='^SIGNATURE', string=text, flags=re.IGNORECASE | re.MULTILINE) is not None
    # If that pattern does not match, try another one
    if not matched:
        matched = 'to be signed on its behalf by the undersigned' in text
    return matched
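# Illustrative example:
#
#   detect_signature("SIGNATURES\nPursuant to the requirements of the Securities Act...")  # True
#   detect_signature("Item 8. Financial Statements")                                       # False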
def detect_int_items(text: pd.Series):
return text.str.extract(int_item_pattern, expand=False, flags=re.IGNORECASE | re.MULTILINE)
def detect_part(text: pd.Series) -> pd.Series:
"""
Detect and extract 'Part' sections such as 'PART I', 'Part II', etc., from the given text Series.
    Handles various heading formats found in SEC filings, where the 'Part'
    heading starts a line, including:
    - 'PART I. Financial Information'
    - 'Part II'
    - 'PART III — Executive Overview'
Returns:
pd.Series: A series containing the extracted 'Part X' values (uppercase), or NaN if not found.
"""
# Match patterns like 'PART I', 'Part II', 'PART III.', etc.
part_pattern = r'^\b(PART\s+[IVXLC]+)\b'
# Extract using case-insensitive matching and convert result to uppercase
extracted = text.str.extract(part_pattern, flags=re.IGNORECASE | re.MULTILINE, expand=False)
# Normalize to uppercase for consistency (e.g., 'Part I' → 'PART I')
return extracted.str.upper().str.replace(r'\s+', ' ', regex=True)
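# Illustrative example:
#
#   detect_part(pd.Series(["PART I. Financial Information", "no part here"]))
#   # -> 0    PART I
#   #    1       NaN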
def detect_decimal_items(text: pd.Series):
return text.str.extract(decimal_item_pattern, expand=False, flags=re.IGNORECASE | re.MULTILINE)
def find_next_item(index, normalized_items):
"""Find the next available item in the DataFrame starting from a given index."""
for i in range(index + 1, len(normalized_items)):
if normalized_items[i]:
return normalized_items[i]
return None
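# Illustrative example:
#
#   find_next_item(0, [None, None, "Item 2", "Item 3"])  # -> 'Item 2'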
def normalize_item(item):
"""Normalize item string to a comparable format."""
    if not pd.isna(item):
        return re.sub(r"[^0-9A-Za-z ]", "", item)  # Keep only letters, digits, and spaces
return item
def extract_numeric_alpha_parts(item):
"""Extract numeric and alphabetic parts from an item."""
numeric_part = int(re.search(r"[0-9]+", item).group()) if item else 0
alpha_part = re.search(r"[A-Z]$", item)
alpha_part = alpha_part.group() if alpha_part else ''
return numeric_part, alpha_part
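# Illustrative example:
#
#   extract_numeric_alpha_parts("Item 1A")  # -> (1, 'A')
#   extract_numeric_alpha_parts("Item 7")   # -> (7, '')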
def is_valid_sequence(current_item, last_valid_item, next_available_item):
"""
Determine if the current item is valid considering the last and next available items.
"""
if not current_item or pd.isna(current_item) or not next_available_item or pd.isna(next_available_item):
return False
current_item_num, current_item_alpha = extract_numeric_alpha_parts(current_item)
last_item_num, last_item_alpha = extract_numeric_alpha_parts(last_valid_item)
next_item_num, next_item_alpha = extract_numeric_alpha_parts(next_available_item)
# Check if the current item is greater than the last valid item and less than or equal to the next available item
if current_item_num == last_item_num:
return current_item_alpha > last_item_alpha
elif current_item_num == next_item_num:
return current_item_alpha < next_item_alpha or next_item_alpha == ''
else:
return last_item_num < current_item_num <= next_item_num
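# Illustrative examples:
#
#   is_valid_sequence("Item 2", "Item 1", "Item 3")    # True  (1 < 2 <= 3)
#   is_valid_sequence("Item 5", "Item 1", "Item 3")    # False (skips past the next available item)
#   is_valid_sequence("Item 1B", "Item 1A", "Item 2")  # True  (same number, later letter)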
def adjust_detected_items(chunk_df: pd.DataFrame, **kwargs) -> pd.DataFrame:
"""
Ensure that the items are in sequence and filter out any out of sequence items.
"""
chunk_df['NormalizedItem'] = chunk_df['DetectedItem'].apply(normalize_item)
normalized_items = chunk_df['NormalizedItem'].replace([np.nan], [None]).tolist()
last_valid_item = ""
valid_items = pd.Series(index=chunk_df.index, dtype=object) # Create a series to store valid items
    # First find the index of the table of contents (Toc) row
toc_index_rows = chunk_df[chunk_df.Toc.notnull() & chunk_df.Toc]
# If not found set to 0
toc_index = toc_index_rows.index[0] if len(toc_index_rows) > 0 else 0
    # Iterate through the rows, skipping everything up to and including the Toc row
for index, row in chunk_df.iterrows():
if index < toc_index + 1:
continue
current_item = row['NormalizedItem']
next_available_item = find_next_item(index, normalized_items)
if is_valid_sequence(current_item, last_valid_item, next_available_item):
valid_items[index] = current_item
last_valid_item = current_item # Update the last valid item
else:
valid_items[index] = pd.NA # Mark as invalid/out of sequence
chunk_df['Item'] = valid_items
return chunk_df
def adjust_for_empty_items(chunk_df: pd.DataFrame,
**kwargs) -> pd.DataFrame:
chunk_df['Item'] = chunk_df.DetectedItem
for index, row in chunk_df[chunk_df.DetectedItem.notnull()].iterrows():
item = row.Item
# Get item_structure from kwargs
item_structure = kwargs.get('item_structure')
        structure = item_structure.get_item(item)
        if not structure:
            # No structure entry for this item; skip it and check the next one
            continue
title = structure.get('Title')
text = row.Text
# Look for Item NUM Description Item in the text
pattern = rf"^({item}.? {title}\W+)"
match = re.search(pattern + "Item [1-9]", text, flags=re.IGNORECASE | re.MULTILINE)
if match:
text = re.sub(pattern, "", text, flags=re.IGNORECASE | re.MULTILINE)
# extract the item from text using decimal_item_pattern
match = re.search(decimal_item_pattern, text, flags=re.IGNORECASE | re.MULTILINE)
if match:
new_item = match.group(1)
chunk_df.loc[index, 'Item'] = new_item
return chunk_df
def _render_blocks_using_old_markdown_tables(blocks: List[Block]):
    """
    Render tables as old-style markdown tables,
    because the item chunking relies on that table format.
    This is kept separate from the newer table rendering logic.
    """
return "".join([
table_to_markdown(block.table_element) if isinstance(block, TableBlock) else block.get_text()
for block in blocks
]).strip()
def chunks2df(chunks: List[List[Block]],
item_detector: Callable[[pd.Series], pd.Series] = detect_int_items,
item_adjuster: Callable[[pd.DataFrame, Dict[str, Any]], pd.DataFrame] = adjust_detected_items,
item_structure=None,
) -> pd.DataFrame:
"""Convert the chunks to a dataframe
: item_detector: A function that detects the item in the text column
: item_adjuster: A function that finds issues like out of sequence items and adjusts the item column
: item_structure: A dictionary of items specific to each filing e.g. 8-K, 10-K, 10-Q
"""
# Create a dataframe from the chunks. Add columns as necessary
chunk_df = pd.DataFrame([{'Text': _render_blocks_using_old_markdown_tables(blocks),
                              'Table': any(isinstance(block, TableBlock) for block in blocks)}
for blocks in chunks]
).assign(Chars=lambda df: df.Text.apply(len),
Signature=lambda df: df.Text.apply(detect_signature).fillna(""),
TocLink=lambda df: df.Text.str.match('^Table of Contents$',
flags=re.IGNORECASE | re.MULTILINE),
Toc=lambda df: df.Text.head(100).apply(detect_table_of_contents),
Empty=lambda df: df.Text.str.contains('^$', na=True),
Part=lambda df: detect_part(df.Text),
Item=lambda df: item_detector(df.Text)
)
    # If the row is the table of contents, clear its Item
chunk_df.loc[chunk_df.Toc.notnull() & chunk_df.Toc, 'Item'] = ""
# if item_adjuster:
# chunk_df = item_adjuster(chunk_df, **{'item_structure': item_structure, 'item_detector': item_detector})
    # Forward-fill Item and Part
# Handle deprecation warning in fillna(method='ffill')
if pandas_version >= (2, 1, 0):
# Opt-in to pandas future behavior to avoid silent downcasting warnings
with pd.option_context('future.no_silent_downcasting', True):
chunk_df['Item'] = chunk_df['Item'].ffill()
chunk_df['Part'] = chunk_df['Part'].ffill()
else:
chunk_df.Item = chunk_df.Item.fillna(method='ffill')
chunk_df.Part = chunk_df.Part.fillna(method='ffill')
# After forward fill handle the signature at the bottom
signature_rows = chunk_df[chunk_df.Signature]
if len(signature_rows) > 0:
signature_loc = signature_rows.index[0]
chunk_df.loc[signature_loc:, 'Item'] = pd.NA
chunk_df.Signature = chunk_df.Signature.fillna("")
# Fill the Item column with "" then set to title case
chunk_df.Item = chunk_df.Item.fillna("").str.title()
chunk_df.Part = chunk_df.Part.fillna("").str.title()
# Normalize spaces in item
chunk_df.Item = chunk_df.Item.apply(lambda item: re.sub(r'\s+', ' ', item))
chunk_df.Part = chunk_df.Part.apply(lambda part: re.sub(r'\s+', ' ', part).strip())
    # Finalize the columns
chunk_df = chunk_df[['Text', 'Table', 'Chars', 'Signature', 'TocLink', 'Toc', 'Empty', 'Part', 'Item']]
return chunk_df
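# Illustrative result: chunks2df returns one row per chunk with the columns
# Text, Table, Chars, Signature, TocLink, Toc, Empty, Part, Item.
# For example, a chunk whose text starts with "Item 1A. Risk Factors" inside
# 'PART I' ends up with Part='Part I' and Item='Item 1A' after forward-filling.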
# This function is used by 8-K and other filings whose items have decimal numbers, e.g. 'Item 1.02'
decimal_chunk_fn = partial(chunks2df,
item_detector=detect_decimal_items,
item_adjuster=adjust_for_empty_items)
class ChunkedDocument:
"""
Contains the html as broken into chunks
"""
def __init__(self,
html: str,
chunk_fn: Callable[[List], pd.DataFrame] = chunks2df,
prefix_src: str = ""):
"""
:param html: The filing html
:param chunk_fn: A function that converts the chunks to a dataframe
:param file_path: The path to the filing
"""
self.chunks = chunk(html)
self._chunked_data = chunk_fn(self.chunks)
self.chunk_fn = chunk_fn
self.prefix_src = prefix_src
        self.document_id_parse: Dict = {}
@lru_cache(maxsize=4)
def as_dataframe(self):
return self.chunk_fn(self.chunks)
def show_items(self, df_query: str, *columns):
result = self._chunked_data.query(df_query)
if len(columns) > 0:
columns = ["Text"] + list(columns)
result = result.filter(columns)
return result
def list_items(self):
return [item for item in self._chunked_data.Item.drop_duplicates().tolist() if item]
def _chunks_for(self, item_or_part: str, col: str = 'Item'):
chunk_df = self._chunked_data
# Handle cases where the item has the decimal point e.g. 5.02
item_or_part = item_or_part.replace('.', r'\.')
pattern = re.compile(rf'^{item_or_part}$', flags=re.IGNORECASE)
col_mask = chunk_df[col].str.match(pattern)
        # Exclude table-of-contents rows
        toc_mask = ~(chunk_df.Toc.notnull() & chunk_df.Toc)
empty_mask = ~chunk_df.Empty
mask = col_mask & toc_mask & empty_mask
for i in mask[mask].index:
yield self.chunks[i]
def _chunks_mul_for(self, part: str, item: str):
chunk_df = self._chunked_data
# Handle cases where the item has the decimal point e.g. 5.02
part = part.replace('.', r'\.')
item = item.replace('.', r'\.')
pattern_part = re.compile(rf'^{part}$', flags=re.IGNORECASE)
pattern_item = re.compile(rf'^{item}$', flags=re.IGNORECASE)
item_mask = chunk_df["Item"].str.match(pattern_item)
part_mask = chunk_df["Part"].str.match(pattern_part)
        # Exclude table-of-contents rows
        toc_mask = ~(chunk_df.Toc.notnull() & chunk_df.Toc)
empty_mask = ~chunk_df.Empty
mask = part_mask & item_mask & toc_mask & empty_mask
        # Keep only the longest run of near-consecutive indices (gap <= 5);
        # discard the rest with a warning
index_list = mask[mask].index.to_list()
if not index_list:
return
continuous_segments = []
current_segment = [index_list[0]]
for i in range(1, len(index_list)):
if index_list[i] <= current_segment[-1] + 5:
current_segment.append(index_list[i])
else:
continuous_segments.append(current_segment)
current_segment = [index_list[i]]
continuous_segments.append(current_segment)
# retain only the longest continuous segment
longest_segment = max(continuous_segments, key=len)
        # Warn about the discarded content
if len(continuous_segments) > 1:
discarded_indices = []
for segment in continuous_segments:
if segment != longest_segment:
discarded_indices.extend(segment)
warnings.warn(
f"Discarded non-continuous indices: {discarded_indices}. "
f"""content: {''.join([
''.join(block.get_text() for block in self.chunks[idx])
for idx in discarded_indices
])}"""
)
for i in longest_segment:
yield self.chunks[i]
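    # Illustrative segmentation for _chunks_mul_for, assuming matching row
    # indices [3, 4, 5, 40]: indices within 5 of the previous match are grouped,
    # giving segments [3, 4, 5] and [40]; only the longest segment [3, 4, 5] is
    # yielded, and a warning reports the discarded index 40.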
def chunks_for_item(self, item: str):
"""
Returns chunks of text for a given item from the document.
Args:
item (str): The item name to retrieve chunks for.
Returns:
List[str]: List of text chunks corresponding to the specified item.
"""
return self._chunks_for(item, col='Item')
def chunks_for_part(self, part: str):
return self._chunks_for(part, col='Part')
def average_chunk_size(self):
return int(self._chunked_data.Chars.mean())
def tables(self):
for chunk in self.chunks:
for block in chunk:
if isinstance(block, TableBlock):
yield block
    def assemble_block_text(self, chunks: List[List[Block]]):
if self.prefix_src:
for chunk in chunks:
for block in chunk:
if isinstance(block, LinkBlock):
yield block.to_markdown(prefix_src=self.prefix_src)
else:
yield block.get_text()
else:
for chunk in chunks:
yield "".join([block.get_text() for block in chunk])
    def assemble_block_markdown(self, chunks: List[List[Block]]):
if self.prefix_src:
for chunk in chunks:
for block in chunk:
if isinstance(block, LinkBlock):
yield block.to_markdown(prefix_src=self.prefix_src)
else:
yield block.to_markdown()
else:
for chunk in chunks:
yield "".join([block.to_markdown() for block in chunk])
def get_item_with_part(self, part: str, item: str, markdown:bool=False):
if isinstance(part, str):
chunks = list(self._chunks_mul_for(part, item))
if markdown:
return self.clean_part_line("".join([text for text in self.assemble_block_markdown(chunks)]))
else:
return self.clean_part_line("".join([text for text in self.assemble_block_text(chunks)]))
    @staticmethod
    def clean_part_line(text: str):
        res = text.rstrip("\n")
        last_line = res.split("\n")[-1]
        if re.match(r'^\b(PART\s+[IVXLC]+)\b', last_line):
            # Remove the trailing 'PART ...' line as a suffix
            # (str.rstrip(last_line) would strip characters, not the suffix)
            res = res.removesuffix(last_line).rstrip()
        return res
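    # Illustrative example:
    #
    #   ChunkedDocument.clean_part_line("Item 7A text\nPART III")
    #   # -> 'Item 7A text' (the trailing 'PART III' heading line is removed)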
def get_signature(self, markdown:bool=False):
sig_index = self._chunked_data[self._chunked_data.Signature].index
if markdown:
res = "".join(
[text for text in
self.assemble_block_markdown(
[self.chunks[idx] for idx in sig_index]
)])
else:
res = "".join(
[text for text in
self.assemble_block_text(
[self.chunks[idx] for idx in sig_index]
)])
return self.clean_part_line(res)
def get_introduction(self, markdown:bool=False):
"""
Extract and return the introduction section of the filing document.
The introduction is defined as all content before the first valid Part or Item.
Returns:
str: The extracted introduction text, or an empty string if none found.
"""
# Find the first index where Part or Item appears
part_indices = self._chunked_data[self._chunked_data.Part != ""].index
item_indices = self._chunked_data[self._chunked_data.Item != ""].index
if len(part_indices) == 0 and len(item_indices) == 0:
return ""
        # Use the later of the two first occurrences
intro_index = max(
part_indices[0] if len(part_indices) else 0,
item_indices[0] if len(item_indices) else 0
)
if intro_index == 0:
return ""
        # Assemble the chunks before intro_index
if markdown:
res = "".join(
[text for text in
self.assemble_block_markdown(
[self.chunks[idx] for idx in range(intro_index)]
)])
else:
res = "".join(
[text for text in
self.assemble_block_text(
[self.chunks[idx] for idx in range(intro_index)]
)])
return self.clean_part_line(res)
def __len__(self):
return len(self.chunks)
    def __getitem__(self, item, markdown: bool = False):
if isinstance(item, int):
chunks = [self.chunks[item]]
elif isinstance(item, str):
chunks = list(self.chunks_for_item(item))
else:
return None
if len(chunks) == 0:
return None
        # Render the nested lists of blocks into a single string
if markdown:
return "".join([text for text in self.assemble_block_markdown(chunks)])
else:
return "".join([text for text in self.assemble_block_text(chunks)])
def __iter__(self):
return iter(self.chunks)
def __rich__(self):
table = Table("Chunks",
"Items",
"Avg Size", box=box.SIMPLE)
table.add_row(str(len(self.chunks)),
",".join(self.list_items()),
str(self.average_chunk_size()),
)
return Panel(table, box=box.ROUNDED, title="HTML Document")
def __repr__(self):
return repr_rich(self.__rich__())
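# Illustrative end-to-end usage, assuming `filing_html` holds the HTML of a
# 10-K filing and `filing_html_8k` that of an 8-K:
#
#   doc = ChunkedDocument(filing_html)   # default chunks2df / detect_int_items
#   doc.list_items()                     # e.g. ['Item 1', 'Item 1A', ...]
#   risk_factors = doc["Item 1A"]        # text of the Risk Factors item
#
#   # 8-K items use decimal numbers (e.g. 'Item 5.02'), so pass decimal_chunk_fn:
#   doc_8k = ChunkedDocument(filing_html_8k, chunk_fn=decimal_chunk_fn)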