import re
import warnings
from functools import lru_cache
from typing import Any, Dict, Iterator, List, Optional, Tuple, Union
import pandas as pd
from bs4 import BeautifulSoup, Comment, NavigableString, Tag, XMLParsedAsHTMLWarning
from rich import box
from rich.table import Table
from edgar.datatools import clean_column_text, table_html_to_dataframe
from edgar.richtools import repr_rich
warnings.filterwarnings("ignore", category=XMLParsedAsHTMLWarning)
__all__ = ['DocumentData',
'HtmlDocument',
'Block',
'TextBlock',
'TableBlock',
'TextAnalysis',
'SECLine',
'table_to_text',
'table_to_markdown',
'html_to_text',
'get_clean_html', ]
NAMESPACES = {
"xbrli": 'http://www.xbrl.org/2003/instance',
"i": 'http://www.xbrl.org/2003/instance',
"ix": "http://www.xbrl.org/2013/inlineXBRL",
"xbrldi": 'http://xbrl.org/2006/xbrldi',
"xbrll": "http://www.xbrl.org/2003/linkbase",
"link": 'http://www.xbrl.org/2003/linkbase',
"xlink": "http://www.w3.org/1999/xlink",
"dei": "http://xbrl.sec.gov/dei/2023",
"country": "http://xbrl.sec.gov/country/2023",
"currency": "http://xbrl.sec.gov/currency/2023",
"exch": "http://xbrl.sec.gov/exch/2023",
"naics": "http://xbrl.sec.gov/naics/2023",
"sic": "http://xbrl.sec.gov/sic/2023",
"utr": "http://www.xbrl.org/2009/utr",
"cef": "http://xbrl.sec.gov/cef/2023",
"srt": "http://fasb.org/srt/2023",
"ixt": "http://www.xbrl.org/inlineXBRL/transformation/2022-02-16",
"ixt-sec": "http://www.sec.gov/inlineXBRL/transformation/2015-08-31"
# Add other namespaces as needed
}
def ns_tag(tag):
    """Build a regex matching `tag` under any of the known namespace prefixes.

    `tag` may itself be an alternation (e.g. 'nonfraction|nonnumeric'), so it is
    wrapped in a group to keep the namespace prefix mandatory for every alternative.
    """
    return re.compile(r'(?:' + '|'.join(NAMESPACES.keys()) + r'):(?:' + tag + r')')
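# A minimal usage sketch: BeautifulSoup applies a compiled regex tag filter with
# re.search, so ns_tag('context') matches <xbrli:context>, <ix:context>, etc.
#
#   >>> ns_tag('context').pattern
#   '(?:xbrli|i|ix|...):(?:context)'        # abbreviated; one alternative per prefix above
#   >>> soup.find_all(ns_tag('context'))    # soup: any BeautifulSoup-parsed iXBRL document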
class DocumentData:
"""
Represents the header of an ixbrl document
Contains the hidden properties, schema references, context and units
"""
def __init__(self,
data: pd.DataFrame,
schema_refs: Optional[List[str]] = None,
                 context: Optional[Dict[str, Dict[str, Union[str, None]]]] = None,
                 units: Optional[Dict[str, str]] = None):
self.data = data
self.context = context or {}
self.units = units or {}
        self.schema_refs = schema_refs or []
def __getitem__(self, item):
result = self.data[self.data.name == item]
if not result.empty:
# Return a dict
return result.to_dict(orient='records')[0]
def __contains__(self, item):
if self.data is None or self.data.empty:
return False
return item in self.data.name.to_list()
def __str__(self):
return "Inline Xbrl Header"
def __rich__(self):
table = Table("", "name", "value",
title="Inline Xbrl Document",
box=box.SIMPLE)
for row in self.data.itertuples():
table.add_row(row.namespace, row.name, row.value)
return table
def __repr__(self):
return repr_rich(self.__rich__())
@classmethod
def parse_headers(cls, ix_header_tags: List[Tag]):
ix_header = cls.parse_header(ix_header_tags[0])
if len(ix_header_tags) == 1:
return ix_header
for header_tag in ix_header_tags[1:]:
next_header = cls.parse_header(header_tag)
dfs = [df for df in [ix_header.data, next_header.data] if df is not None]
            ix_header.data = pd.concat(dfs) if len(dfs) > 0 else None
ix_header.schema_refs.extend(next_header.schema_refs)
ix_header.context.update(next_header.context)
ix_header.units.update(next_header.units)
return ix_header
@classmethod
def parse_header(cls, ix_header_element: Tag):
hidden_props, schema_refs, context_map, unit_map = None, [], {}, {}
resource_tag = ix_header_element.find(ns_tag('resources'))
if resource_tag:
# Parse contexts
context_tags = resource_tag.find_all(ns_tag('context'))
for ctx in context_tags:
context_id = ctx.get('id')
entity_tag = ctx.find(ns_tag('entity'))
identifier = entity_tag.find(ns_tag('identifier')).text if entity_tag else None
period_tag = ctx.find(ns_tag('period'))
instant = period_tag.find(ns_tag('instant'))
if instant:
start = end = instant.text
else:
start = period_tag.find(ns_tag('startdate')).text if period_tag.find(ns_tag('startdate')) else None
end = period_tag.find(ns_tag('enddate')).text if period_tag.find(ns_tag('enddate')) else None
context_map[context_id] = {'identifier': identifier, 'start': start, 'end': end}
segment = ctx.find(ns_tag('segment'))
if segment:
                    context_map[context_id]['dimensions'] = str({m.get('dimension'): m.text
                                                                 for m in segment.find_all(ns_tag('explicitmember'))})
# Parse units
unit_tags = resource_tag.find_all(ns_tag('unit'))
for unit in unit_tags:
unit_id = unit.get('id')
divide = unit.find(ns_tag('divide'))
if divide:
numerator = divide.find(ns_tag('unitnumerator')).find(ns_tag('measure')).text
denominator = divide.find(ns_tag('unitdenominator')).find(ns_tag('measure')).text
unit_map[unit_id] = f"{numerator.split(':')[-1]} per {denominator.split(':')[-1]}"
else:
unit_map[unit_id] = unit.find(ns_tag('measure')).text.split(':')[-1]
# Parse hidden elements
hidden_elements = ix_header_element.find(ns_tag('hidden'))
if hidden_elements:
props = []
for el in hidden_elements.find_all():
name_parts = el.get('name', '').partition(':')
prop = {
'name': name_parts[2],
'namespace': name_parts[0],
'value': el.text.strip(),
'tag': el.name
}
ctx_ref = el.get('contextref')
if ctx_ref:
ctx = context_map.get(ctx_ref, {})
prop.update({
'start': ctx.get('start'),
'end': ctx.get('end'),
'identifier': ctx.get('identifier')
})
props.append(prop)
hidden_props = pd.DataFrame(props)
# Parse references
references = ix_header_element.find(ns_tag('references'))
if references:
schema_refs = [s.get('xlink:href') for s in references.find_all() if s.get('xlink:href')]
ix_header_element.decompose()
return cls(data=hidden_props, schema_refs=schema_refs, context=context_map, units=unit_map)
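    # Usage sketch (hypothetical minimal header; real filings carry many more
    # contexts, units and hidden facts):
    #
    #   >>> html = ('<html><body><ix:header><ix:resources>'
    #   ...         '<xbrli:context id="c1"><xbrli:entity>'
    #   ...         '<xbrli:identifier>0000320193</xbrli:identifier></xbrli:entity>'
    #   ...         '<xbrli:period><xbrli:instant>2023-09-30</xbrli:instant></xbrli:period>'
    #   ...         '</xbrli:context></ix:resources></ix:header></body></html>')
    #   >>> header = DocumentData.parse_header(BeautifulSoup(html, 'lxml').find('ix:header'))
    #   >>> header.context['c1']
    #   {'identifier': '0000320193', 'start': '2023-09-30', 'end': '2023-09-30'}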
def parse_inline_data(self, start_element: Tag):
records = []
inline_tags = ns_tag('nonfraction|nonnumeric|fraction')
for ix_tag in start_element.find_all(inline_tags):
if ix_tag.name is None:
continue
record = dict(ix_tag.attrs)
record['tag'] = ix_tag.name
context_ref = record.get('contextref')
if context_ref:
record.update(self.context.get(context_ref, {}))
record.pop('contextref', None)
record['value'] = ix_tag.text.strip()
name_parts = record.get('name', '').partition(':')
record['namespace'], record['name'] = name_parts[0], name_parts[2]
unit_ref = record.get('unitref')
if unit_ref:
record['unit'] = self.units.get(unit_ref)
record.pop('unitref', None)
records.append(record)
records_df = pd.DataFrame(records)
self.data = pd.concat([self.data, records_df], ignore_index=True)
INLINE_IXBRL_TAGS = ['ix:nonfraction', 'ix:nonnumeric', 'ix:fraction']
class Block:
def __init__(self, text: Optional[str], **tags):
self.text: Optional[str] = text
self.inline: bool = False
self.metadata: Dict[str, Any] = tags
def __contains__(self, item):
        return self.text is not None and item in self.text
def to_markdown(self) -> str:
return self.text
    def get_text(self) -> str:
return self.text
def is_empty(self):
return not self.is_linebreak() and not self.text.strip()
def is_linebreak(self) -> bool:
# This block is a line break if it only has '\n'
return self.text != '' and self.text.strip('\n') == ''
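    # For illustration: Block('\n').is_linebreak() is True, while Block('   ').is_empty()
    # and Block('').is_empty() are True but neither counts as a line break.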
def __str__(self):
return "Block"
def __repr__(self):
return self.text
class LinkBlock(Block):
    def __init__(self, text: str, tag: str, alt: str, src: str, **tags):
super().__init__(text, **tags)
self.tag = tag
self.alt = alt
self.src = src
self.inline: bool = True
def get_text(self) -> str:
        return f'<{self.tag} alt="{self.alt}" src="{self.src}">'
    def to_markdown(self, prefix_src: str = "") -> str:
        return f"![alt {self.alt}]({prefix_src}/{self.src})\n"
    def get_complete_text(self, prefix_src: str) -> str:
        return f'<{self.tag} alt="{self.alt}" src="{prefix_src}/{self.src}">\n'
def __str__(self):
return "LinkBlock"
def __repr__(self):
return self.text
class TextBlock(Block):
def __init__(self, text: str, inline: bool = False, **tags):
super().__init__(text, **tags)
self.inline: bool = inline
@property
@lru_cache(maxsize=1)
    def num_words(self):
        """Return the number of words in this text block"""
        if self.is_linebreak() or self.is_empty():
            return 0
        return len(self.text.split())
@property
@lru_cache(maxsize=1)
def is_header(self):
return is_header(self.text)
@lru_cache(maxsize=1)
def analyze(self):
return TextAnalysis(self.text)
def __str__(self):
return "TextBlock"
def __repr__(self):
return self.text
class TableBlock(Block):
"""
Represents an HTML table in the document
"""
def __init__(self, table_element: Tag, **tag):
super().__init__(text=None, **tag)
self.table_element = table_element
@lru_cache()
def get_text(self):
_text = table_to_text(self.table_element)
_text = "\n" + _text + "\n"
return _text
def to_dataframe(self) -> pd.DataFrame:
table_df = table_html_to_dataframe(str(self.table_element))
return table_df
def to_markdown(self) -> str:
return self.to_dataframe().to_markdown() + "\n"
def __str__(self):
return "TableBlock"
def __repr__(self):
return str(self)
item_pattern = re.compile(r"(?:ITEM|Item)\s+(?:[0-9]{1,2}[A-Z]?\.?|[0-9]{1,2}\.[0-9]{2})")
part_pattern = re.compile(r"^\b(PART\s+[IVXLC]+)\b", re.IGNORECASE)
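# For illustration, these patterns recognize 10-K/10-Q section markers:
#
#   >>> bool(item_pattern.match("Item 1A. Risk Factors"))
#   True
#   >>> bool(part_pattern.match("PART II"))
#   True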
class HtmlDocument:
def __init__(self,
blocks: List[Block],
data: Optional[DocumentData] = None,
):
assert isinstance(blocks, list), "blocks must be a list of Block objects"
self.blocks: List[Block] = blocks # The text blocks
self.data: Optional[DocumentData] = data # Any data in the document
    @property
    def text(self) -> str:
        return "".join(block.get_text() for block in self.blocks)
@property
def markdown(self) -> str:
"""Convert the document to markdown"""
md = ""
for block in self.blocks:
line = block.to_markdown()
if is_header(line):
md += "\n" + line + "\n"
else:
md += line
return md
def get_table_blocks(self) -> List[TableBlock]:
"""Get a list of all the table blocks in the document"""
return [block for block in self.blocks if isinstance(block, TableBlock)]
@staticmethod
def _compress_blocks(blocks: List[Block]):
"""
Create a new block structure with blocks that are only whitespace appended to previous blocks
For example ... if there are consecutive blocks like so
'THIS is a block'
' '
the result should be
'THIS is a block '
Copy to a new block structure
"""
compressed_blocks = []
current_block = None
for _i, block in enumerate(blocks):
if isinstance(block, TableBlock):
if current_block:
compressed_blocks.append(current_block)
current_block = None # Reset the current block
compressed_blocks.append(block)
else:
if block.text.endswith("\n"):
if current_block:
if current_block.inline and block.inline:
current_block.text += block.text
compressed_blocks.append(current_block)
current_block = None # Reset the current block
else:
compressed_blocks.append(current_block)
compressed_blocks.append(block)
current_block = None # Reset the current block
else:
compressed_blocks.append(block)
elif block.is_empty(): # Empty blocks get appended to the previous block
if not current_block:
current_block = block
else:
current_block.text += block.text
else:
if current_block:
# If current is empty assume the inline status of the block
if current_block.is_empty():
current_block.inline = block.inline
current_block.text += block.text
else:
current_block = block
# Remember to add the last block
if current_block and not current_block.is_empty():
compressed_blocks.append(current_block)
# Strip the first block
if compressed_blocks:
compressed_blocks[0].text = compressed_blocks[0].get_text().lstrip()
return compressed_blocks
@classmethod
def extract_text(cls, start_element: Tag):
# Remove page numbers
decompose_page_numbers(start_element)
# Now find the full text
blocks: List[Block] = extract_and_format_content(start_element)
# Compress the blocks
blocks: List[Block] = HtmlDocument._compress_blocks(blocks)
return blocks
@classmethod
def extract_data(cls, start_element: Tag) -> Optional[DocumentData]:
header_elements = start_element.find_all('ix:header')
if len(header_elements) == 0:
return None
ixbrl_document: DocumentData = DocumentData.parse_headers(header_elements)
for header_element in header_elements:
header_element.decompose()
ixbrl_document.parse_inline_data(start_element.body)
return ixbrl_document
@classmethod
def get_root(cls, html: str) -> Tag:
# First check if the html is inside a <DOCUMENT><TEXT> block
if "<TEXT>" in html[:500]:
html = get_text_between_tags(html, 'TEXT')
soup = BeautifulSoup(html, features='lxml')
# Cleanup the soup before extracting text (including removing comments)
fixup_soup(soup)
return soup.find('html')
@classmethod
def from_html(cls, html: str, extract_data: bool = False):
"""Create from an html string"""
# Get the root element
root: Tag = cls.get_root(html)
# If the root cannot be located it's not valid HTML
if not root:
return None
# Extract any inline data inside the html
data = cls.extract_data(root) if extract_data else None
# Clean the root element .. strip out the header tags, script and style tags, and table of content links
root = clean_html_root(root)
# Now extract the text into blocks
blocks: List[Block] = cls.extract_text(root)
return cls(blocks=blocks, data=data)
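    # Usage sketch:
    #
    #   >>> document = HtmlDocument.from_html("<html><body><p>Hello <b>world</b></p></body></html>")
    #   >>> document.text.strip()
    #   'Hello world'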
@staticmethod
def _render_blocks(blocks: List[Block]) -> str:
text_ = "".join([block.get_text() for block in blocks])
return text_.strip()
    def generate_text_chunks(self, ignore_tables: bool = False) -> Iterator[str]:
for chunk in self.generate_chunks(ignore_tables=ignore_tables):
yield HtmlDocument._render_blocks(chunk)
    def generate_chunks(self, ignore_tables: bool = False) -> Iterator[List[Block]]:
current_chunk = []
accumulating_regular_text = False
header_detected = False
item_header_detected = False
for i, block in enumerate(self.blocks):
if isinstance(block, TableBlock) or block.metadata.get('element') in ['ol', 'ul']:
if isinstance(block, TableBlock) and ignore_tables:
continue
if current_chunk:
if any(block.text.strip() for block in current_chunk): # Avoid emitting empty chunks
yield current_chunk
current_chunk = []
yield [block] # Yield TableBlock as its own chunk
accumulating_regular_text = False
header_detected = False
item_header_detected = False
elif isinstance(block, TextBlock):
analysis = block.analyze()
is_regular_text = analysis.is_regular_text
# Check if the block is an "Item" header
is_item_header = bool(re.match(item_pattern, block.text))
is_part_header = bool(part_pattern.match(block.text))
                if is_part_header:
                    # Yield the current chunk, then emit the "Part" header as its own chunk
                    if current_chunk and any(b.text.strip() for b in current_chunk):  # Avoid emitting empty chunks
                        yield current_chunk
                    yield [block]
                    current_chunk = []
# Update flags accordingly
item_header_detected = True
header_detected = True # "Item" headers are considered regular headers for flag purposes
accumulating_regular_text = False # Reset since we're starting a new section
elif is_item_header:
# Yield the current chunk before starting a new one with the "Item" header
if current_chunk:
if any(block.text.strip() for block in current_chunk): # Avoid emitting empty chunks
yield current_chunk
# Initialize the new chunk with the "Item" header
current_chunk = [block]
# Update flags accordingly
item_header_detected = True
header_detected = True # "Item" headers are considered regular headers for flag purposes
accumulating_regular_text = False # Reset since we're starting a new section
elif analysis.is_header:
if current_chunk and not accumulating_regular_text and not item_header_detected:
if any(block.text.strip() for block in current_chunk): # Avoid emitting empty chunks
yield current_chunk
current_chunk = []
header_detected = True
accumulating_regular_text = False # Reset this flag since we found a new header
current_chunk.append(block) # Start accumulating from this header
item_header_detected = False # Reset this as we found a different type of header
elif is_regular_text and (header_detected or accumulating_regular_text):
current_chunk.append(block)
accumulating_regular_text = True
item_header_detected = False # Regular text resets the "Item" header detection
else:
if accumulating_regular_text or item_header_detected:
if any(block.text.strip() for block in current_chunk): # Avoid emitting empty chunks
yield current_chunk
current_chunk = []
accumulating_regular_text = False
header_detected = False
item_header_detected = False
current_chunk.append(block)
            elif isinstance(block, LinkBlock):
                yield [block]
        # Yield whatever remains once all blocks have been consumed
        if current_chunk and any(b.text.strip() for b in current_chunk):  # Avoid emitting empty chunks
            yield current_chunk
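    # Usage sketch: chunks group a "Part"/"Item" header or regular header with the
    # text that follows it; tables are emitted as standalone chunks.
    #
    #   >>> document = HtmlDocument.from_html(filing_html)   # filing_html: any filing html string
    #   >>> for chunk in document.generate_text_chunks(ignore_tables=True):
    #   ...     handle(chunk)                                # handle() is a placeholder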
def extract_and_format_content(element) -> List[Block]:
"""
Recursively extract and format content from an element,
applying special formatting to tables and concatenating text for other elements.
"""
if element.name == 'table':
table_block = TableBlock(table_element=element, rows=len(element.find_all("tr")))
return [table_block]
elif element.name in ['ul', 'ol']:
return [TextBlock(text=fixup(element.text), element=element.name, text_type='list')]
elif element.name in ["img", ]:
return [
LinkBlock(text=str(element),
tag=element.name,
element=element.name,
alt=element.get('alt'),
src=element.get('src'),
text_type='string')
]
elif isinstance(element, NavigableString):
return [TextBlock(text=fixup(element.text), element=element.name, text_type='string')]
else:
inline = is_inline(element)
blocks: List[Block] = []
len_children = len(element.contents)
for index, child in enumerate(element.children):
if child.name:
blocks.extend(extract_and_format_content(child))
if not inline and len(blocks) > 0 and not isinstance(blocks[-1], TableBlock):
# are we at the end of the children?
if not blocks[-1].inline or index == len_children - 1:
if blocks[-1].text.strip():
blocks[-1].text += '\n'
else:
blocks[-1].text = '\n'
else:
stripped_string = replace_inline_newlines(child.string)
stripped_string = fixup(stripped_string)
if not stripped_string.strip() and len(blocks) > 0 and not blocks[-1].get_text().strip():
if not blocks[-1].get_text().endswith('\n'): # Don't add a space after a new line
blocks[-1].text += stripped_string
else:
blocks.append(TextBlock(stripped_string, inline=inline, element=element.name, text_type='string'))
return blocks
def table_to_markdown(table_tag):
rows = table_tag.find_all('tr')
col_widths = []
col_has_content = []
# Determine the maximum width for each column and identify empty columns
for row in rows:
cols = row.find_all(['td', 'th'])
for i, col in enumerate(cols):
width = len(col.get_text().strip())
if len(col_widths) <= i:
col_widths.append(width)
col_has_content.append(width > 0)
else:
col_widths[i] = max(col_widths[i], width)
if width > 0:
col_has_content[i] = True
# Create a list of indices for columns that have content
content_col_indices = [i for i, has_content in enumerate(col_has_content) if has_content]
# Adjust col_widths to only include columns with content
col_widths = [col_widths[i] for i in content_col_indices]
formatted_table = ""
for index, row in enumerate(rows):
cols = row.find_all(['td', 'th'])
# Map cols to their new indices based on content_col_indices, then format
row_text = []
for i, col in enumerate(cols):
if i in content_col_indices: # Check if column should be included
new_index = content_col_indices.index(i) # Get new index for col_widths
row_text.append(clean_column_text(col.get_text()).ljust(col_widths[new_index]))
if any([text.strip() for text in row_text]): # Skip entirely empty rows
formatted_row = ' | '.join(row_text)
formatted_table += formatted_row + '\n'
if index == 0:
formatted_table += '-+-'.join(['-' * len(text) for text in row_text]) + '\n'
return formatted_table
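# For illustration (assuming clean_column_text leaves simple cell text unchanged;
# cells are space-padded to the column width):
#
#   >>> table = BeautifulSoup('<table><tr><th>Year</th><th>Revenue</th></tr>'
#   ...                       '<tr><td>2023</td><td>100</td></tr></table>', 'lxml').table
#   >>> print(table_to_markdown(table))
#   Year | Revenue
#   -----+--------
#   2023 | 100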
def html_to_text(html: str) -> str:
"""Converts HTML to plain text"""
return HtmlDocument.from_html(html, extract_data=False).text
def html_to_markdown(html: str) -> str:
"""Converts HTML to markdown"""
return HtmlDocument.from_html(html, extract_data=False).markdown
def decompose_toc_links(start_element: Tag):
regex = re.compile('Table [Oo]f [cC]ontents')
toc_tags = start_element.find_all('a', string=regex)
for toc_tag in toc_tags:
toc_tag.decompose()
def decompose_page_numbers(start_element: Tag):
span_tags_with_numbers = start_element.find_all('span', string=re.compile(r'^\d{1,3}$'))
sequences = [] # To store the sequences of tags for potential review
current_sequence = []
previous_number = None
for tag in span_tags_with_numbers:
        # Some page-number links need to be kept, e.g.:
        # <span style="color:#000000;font-family:'Helvetica',sans-serif;font-size:9pt;font-weight:400;line-height:100%">
        # <a href="#i7bfbfbe54b9647b1b4ba4ff4e0aba09d_73" style="color:#000000;font-family:'Helvetica',sans-serif;font-size:9pt;font-weight:400;line-height:100%;text-decoration:none">
        # 17</a></span>
        if tag.find("a"):
            continue
if not tag.text:
continue
number = int(tag.text)
# Check if the number is sequentially next
if previous_number is None or number == previous_number + 1:
current_sequence.append(tag)
else:
# If a sequence is broken and the current sequence has more than one element, it's considered valid
if len(current_sequence) > 1:
sequences.append(current_sequence)
# Decompose all tags in the current valid sequence
for seq_tag in current_sequence:
seq_tag.decompose()
# Start a new sequence
current_sequence = [tag]
previous_number = number
# Check the last sequence
if len(current_sequence) > 1:
sequences.append(current_sequence)
for seq_tag in current_sequence:
seq_tag.decompose()
return sequences
def get_text_between_tags(html: str, tag: str):
    tag_start = f'<{tag}>'
    tag_end = f'</{tag}>'
    capturing = False
    content = ""
    for line in html.splitlines():
        if line:
            # Start capturing after the opening tag
            if line.startswith(tag_start):
                capturing = True
                continue  # Skip the current line as it's the opening tag
            # Stop capturing at the closing tag
            elif line.startswith(tag_end):
                break
            # While inside the tag, accumulate the content
            elif capturing:
                content += line + '\n'  # Add a newline to preserve original line breaks
    return content
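# For illustration: SEC full-text submission files wrap each document in
# <DOCUMENT><TEXT>...</TEXT> markers, and get_root() uses this helper to slice
# out the payload:
#
#   >>> get_text_between_tags('<DOCUMENT>\n<TEXT>\n<html></html>\n</TEXT>', 'TEXT')
#   '<html></html>\n'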
def is_inline(tag):
    # NavigableStrings have no tag name and are not inline elements
    if not tag.name:
        return False
# Common inline elements
inline_elements = {'a', 'span', 'strong', 'em', 'b', 'i', 'u', 'small', 'font', 'big', 'sub', 'sup', 'img', 'label',
'input', 'button'}
# Check if the tag's name is in the list of inline elements
if tag.name in inline_elements:
return True
    # Inline XBRL (ix:) tags are inline
if tag.name.startswith("ix:"):
return True
# Check for inline styling
if tag.has_attr('style'):
styles = tag['style'].split(';')
for style in styles:
if style.strip().lower().startswith('display'):
property_value = style.split(':')
if len(property_value) > 1 and property_value[1].strip().lower() == 'inline':
return True
return False
def fixup(text: str):
    # Collapse non-breaking spaces and runs of horizontal whitespace (but not newlines) into a single space
    text = re.sub(r'\xa0|[^\S\n]+', ' ', text)
    return text
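# For illustration: non-breaking spaces and runs of spaces/tabs collapse to a
# single space while newlines survive:
#
#   >>> fixup('Total\xa0revenue:\t $100\nNext line')
#   'Total revenue: $100\nNext line'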
def get_clean_html(html: str) -> Optional[str]:
"""Get a clean version of the html without the header tags, script and style tags, and table of content links.
"""
root = HtmlDocument.get_root(html)
# If the root cannot be located it's not valid HTML
if not root:
return None
# Clean the root element
root = clean_html_root(root)
return str(root)
def clean_html_root(root: Tag) -> Tag:
"""Clean the root element by removing header tags, script and style tags, and table of content links."""
# Remove the header tags
for tag in root.find_all('ix:header'):
tag.decompose()
# Remove table of content links
decompose_toc_links(root)
# Remove script and style tags
for tag in root.find_all(['script', 'style']):
tag.decompose()
# Remove comments
for comment in root.find_all(string=lambda text: isinstance(text, Comment)):
comment.extract()
return root
def replace_inline_newlines(text: str):
"""Replace newlines inside the text container"""
text = text.replace('\n', ' ')
return text
def fixup_soup(soup):
# Find and remove all comments
comments = soup.find_all(string=lambda text: isinstance(text, Comment))
for comment in comments:
comment.extract()
# Find all pre tags
for pre in soup.find_all('pre'):
# Check if there's a single div with all content
divs = pre.find_all('div', recursive=False)
if len(divs) == 1 and len(pre.contents) == 1:
# If there's a single div, use it directly
pre.replace_with(divs[0])
continue
# Otherwise create a new div and preserve all content
raw_content = str(pre)
content = raw_content.replace('<pre>', '').replace('</pre>', '')
new_soup = BeautifulSoup(f'<div>{content}</div>', 'html.parser')
pre.replace_with(new_soup.div)
# List of words that are commonly not capitalized in titles
common_words = {'and', 'or', 'but', 'the', 'a', 'an', 'in', 'with', 'for', 'on', 'at', 'to', 'of', 'by', 'as'}
class SECLine:
def __init__(self, text):
self.text = text.strip() # Remove leading and trailing whitespace
self.is_header = False
self.is_empty = False
self.features = {}
self.analyze()
def analyze(self):
self.set_empty()
self.set_header()
self.set_features()
def set_empty(self):
if not self.text:
self.is_empty = True
def set_header(self):
if self.is_empty: # Skip empty lines for header detection
return
self.is_header = is_header(self.text)
def set_features(self):
# Additional features can be added here
self.features['word_count'] = len(self.text.split())
self.features['upper_case'] = self.text.isupper()
self.features['title_case'] = self.text.istitle()
def is_header(text: str):
# Remove numerical prefix for enumeration, e.g., "1. ", "I. ", "(1) "
trimmed_text = re.sub(r'^(\d+\.|\w\.\s|\(\d+\)\s)', '', text)
if not trimmed_text:
return False
# Split the line into words, considering special cases for common words
words = [word for word in trimmed_text.split() if word.isalpha()]
# Check if the line is mostly title case, ignoring common words and numerical prefixes
if words:
title_case_words = [word for word in words if (word.istitle() or word.lower() in common_words)]
upper_case_words = [word for word in words if word.isupper()]
mostly_title_case = len(title_case_words) / len(words) > 0.6 # Threshold for mostly title case
mostly_upper_case = len(upper_case_words) / len(words) > 0.6
if mostly_title_case or mostly_upper_case:
return True
return False
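# For illustration:
#
#   >>> is_header("Risk Factors")
#   True
#   >>> is_header("DISCUSSION AND ANALYSIS")
#   True
#   >>> is_header("We recorded revenue of $394 billion during the fiscal year.")
#   False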
class TextAnalysis:
def __init__(self, text):
# Pre-compute and store these properties to avoid recalculating them for each method call
words = TextAnalysis._get_alpha_words(text)
self.num_words = len(words)
self.num_upper_case_words = len([word for word in words if word.isupper()])
self.num_title_case_words = len([word for word in words if word.istitle()])
# Show a preview of the text i.e. first 6 characters followed by ... if longer
self._text = text[:6] + "..." if len(text) > 6 else text
@staticmethod
def _get_alpha_words(text):
"""Removes numerical prefixes and splits the text into alphabetic words."""
trimmed_text = re.sub(r'[^a-zA-Z0-9\s]+', '', text)
return [word for word in trimmed_text.split() if word.isalpha()]
@property
def is_header(self):
"""Determines if the text is a header based on title or upper case predominance."""
mostly_title_case = (self.num_title_case_words / self.num_words > 0.6) if self.num_words > 0 else False
mostly_upper_case = (self.num_upper_case_words / self.num_words > 0.6) if self.num_words > 0 else False
return mostly_title_case or mostly_upper_case
    @property
    def is_mostly_upper(self):
        """Checks if the majority of the words in the text are in uppercase."""
        return self.num_words > 0 and self.num_upper_case_words / self.num_words > 0.6
    @property
    def is_mostly_title_case(self):
        """Checks if the majority of the words in the text are in title case."""
        return self.num_words > 0 and self.num_title_case_words / self.num_words > 0.6
@property
@lru_cache(maxsize=1)
def is_regular_text(self):
return self.num_words > 25
def __str__(self):
        # Show a short preview of the text
return f"Text Analysis: {self._text}"
def clean_cell_text(col) -> str:
text = re.sub(r'<br\s*/?>', '\n', str(col))
text = re.sub(r'<[^>]+>', '', text)
lines = [' '.join(line.strip().split()) for line in text.split('\n') if line.strip()]
return '\n'.join(lines)
def process_row(row) -> List[Tuple[str, int, int]]:
processed_cells = []
cells = row.find_all(['td', 'th'])
i = 0
while i < len(cells):
content = clean_cell_text(cells[i])
colspan = int(cells[i].get('colspan', 1))
rowspan = int(cells[i].get('rowspan', 1))
# Check if this cell is just a $ sign and the next cell exists
if content == '$' and i + 1 < len(cells):
next_content = clean_cell_text(cells[i + 1])
content = f'${next_content}'
colspan += int(cells[i + 1].get('colspan', 1))
i += 1 # Skip the next cell as we've combined it
# Check if this cell is empty and the next cell is numeric
elif not content.strip() and i + 1 < len(cells):
next_content = clean_cell_text(cells[i + 1])
if next_content.replace('.', '', 1).isdigit(): # Check if next cell is numeric
content = next_content
colspan += int(cells[i + 1].get('colspan', 1))
i += 1 # Skip the next cell as we've combined it
# Always add the cell, even if it's empty
processed_cells.append((content, colspan, rowspan))
i += 1
return processed_cells
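# For illustration, the "$" merge: a cell containing only "$" is combined with the
# cell that follows, producing one cell whose colspan covers both:
#
#   >>> row = BeautifulSoup('<table><tr><td>$</td><td>1,234</td></tr></table>', 'lxml').tr
#   >>> process_row(row)
#   [('$1,234', 2, 1)]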
def detect_header_rows(rows):
header_rows = []
for row in rows:
if row.find('th'):
header_rows.append(row)
elif not header_rows:
header_rows.append(row) # Use first row as header if no <th> found
else:
break
return header_rows
def merge_header_rows(header_rows):
processed_headers = []
for row in header_rows:
processed_row = []
for content, colspan, _ in row:
lines = content.split('\n')
processed_row.append((lines, colspan))
processed_headers.append(processed_row)
max_lines = max(len(lines) for row in processed_headers for lines, _ in row)
merged_header = []
for i in range(max_lines):
line = []
for row in processed_headers:
for lines, colspan in row:
if i < len(lines):
line.append((lines[i], colspan))
else:
line.append(('', colspan))
merged_header.append(line)
# Ensure the first cell is not empty across all lines
if all(not line[0][0] for line in merged_header):
for line in merged_header:
line[0] = (' ', line[0][1])
return merged_header
def is_numeric_or_financial(value):
pattern = r'^[\$€£(-]?\s{0,2}\d'
return bool(re.match(pattern, value.strip()))
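# For illustration:
#
#   >>> [is_numeric_or_financial(v) for v in ['$1,234', '(123)', '€ 50', 'Revenue']]
#   [True, True, True, False]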
def determine_column_justification(all_processed_rows):
max_cols = max(sum(colspan for _, colspan, _ in row) for row in all_processed_rows)
justifications = ['left'] * max_cols
for col in range(max_cols):
numeric_count = 0
total_count = 0
for row in all_processed_rows:
col_index = 0
for content, colspan, _ in row:
if col_index <= col < col_index + colspan:
if content.strip():
total_count += 1
if is_numeric_or_financial(content):
numeric_count += 1
break
col_index += colspan
        if numeric_count > 1 and numeric_count / total_count > 0.5:
            # The outer loop visits every column, so mark only this one
            justifications[col] = 'right'
return justifications
def table_to_text(table_tag):
try:
rows = table_tag.find_all('tr')
if not rows:
return ""
header_rows = detect_header_rows(rows)
all_processed_rows = [process_row(row) for row in rows]
header_processed = all_processed_rows[:len(header_rows)]
data_processed = all_processed_rows[len(header_rows):]
merged_header = merge_header_rows(header_processed)
# Check if the header is entirely empty
header_is_empty = all(not content.strip() for header_line in merged_header for content, _ in header_line)
# Determine the maximum number of columns
max_cols = max((sum(colspan for _, colspan, _ in row) for row in all_processed_rows), default=0)
# Initialize column widths and track non-empty columns
col_widths = [0] * max_cols
non_empty_cols = set()
# Calculate header widths if header is not empty
if not header_is_empty:
for header_line in merged_header:
col_index = 0
for content, colspan in header_line:
if content.strip():
content_width = max((len(line) for line in content.split('\n')), default=0)
for i in range(colspan):
if col_index + i < max_cols:
col_widths[col_index + i] = max(col_widths[col_index + i], content_width // max(colspan, 1))
non_empty_cols.add(col_index + i)
col_index += colspan
# Update column widths based on data content
for processed_row in data_processed:
col_index = 0
for content, colspan, _ in processed_row:
if content.strip():
content_width = max((len(line) for line in content.split('\n')), default=0)
for i in range(colspan):
if col_index + i < max_cols:
col_widths[col_index + i] = max(col_widths[col_index + i], content_width // max(colspan, 1))
non_empty_cols.add(col_index + i)
col_index += colspan
# Filter out empty columns
col_widths = [width for i, width in enumerate(col_widths) if i in non_empty_cols]
# If all columns are empty, return an empty string
if not col_widths:
return ""
# Determine column justifications
justifications = determine_column_justification(all_processed_rows)
justifications = [just for i, just in enumerate(justifications) if i in non_empty_cols]
# Render the table
rendered_table = []
# Render header if it's not empty
if not header_is_empty:
for header_line in merged_header:
row_content = []
col_index = 0
non_empty_cell_count = 0
for content, colspan in header_line:
if any(col_index + i in non_empty_cols for i in range(colspan)):
width = sum(col_widths[non_empty_cell_count:non_empty_cell_count + colspan]) + 3 * (colspan - 1)
row_content.append(content.center(width))
non_empty_cell_count += colspan
col_index += colspan
rendered_table.append(' '.join(row_content))
# Add separator line only if header is not empty
rendered_table.append('-' * (sum(col_widths) + 3 * (len(col_widths) - 1)))
# Render data rows
for processed_row in data_processed:
non_empty_contents = [content for content, _, _ in processed_row if content.strip()]
if not non_empty_contents:
continue # Skip empty rows
row_lines = [''] * max((len(content.split('\n')) for content in non_empty_contents), default=1)
for i in range(len(row_lines)):
col_index = 0
cell_contents = []
non_empty_cell_count = 0
for content, colspan, _ in processed_row:
if any(col_index + i in non_empty_cols for i in range(colspan)):
width = sum(col_widths[non_empty_cell_count:non_empty_cell_count + colspan]) + 3 * (colspan - 1)
lines = content.split('\n')
if i < len(lines):
if justifications and non_empty_cell_count < len(justifications) and justifications[non_empty_cell_count] == 'right':
cell_contents.append(lines[i].rjust(width))
else:
cell_contents.append(lines[i].ljust(width))
else:
cell_contents.append(' ' * width)
non_empty_cell_count += colspan
col_index += colspan
row_lines[i] = ' '.join(cell_contents)
rendered_table.extend(row_lines)
return '\n'.join(rendered_table)
    except Exception:
        # A malformed table should not be fatal; fall back to an empty string
        return ""