# edgar/datatools.py
from dataclasses import dataclass
from typing import Union

import numpy as np
import pandas as pd
import pyarrow as pa
from lxml import html as lxml_html

__all__ = [
    'compress_dataframe',
    'table_html_to_dataframe',
    'table_tag_to_dataframe',
    'markdown_to_dataframe',
    'dataframe_to_text',
    'clean_column_text',
    'convert_to_numeric',
    'describe_dataframe',
    'na_value',
    'replace_all_na_with_empty',
    'convert_to_pyarrow_backend',
    'drop_duplicates_pyarrow',
    'repr_df',
    'DataPager',
    'PagingState',
]

def clean_column_text(text: str) -> str:
    """Remove newlines and extra spaces from column text.

    ' Per Share ' -> 'Per Share'
    'Per\nShare'  -> 'Per Share'
    'Per  Share'  -> 'Per Share'
    """
    # str.split() with no argument splits on any whitespace run, so joining
    # with single spaces both trims the ends and collapses internal whitespace
    return ' '.join(text.strip().split())
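
# Usage sketch (illustrative values, not from the original module):
#
#   clean_column_text('  Per\nShare  ')   # -> 'Per Share'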
def compress_dataframe(df: pd.DataFrame) -> pd.DataFrame:
"""Remove empty rows and columns from a DataFrame.
Args:
df: DataFrame to compress
Returns:
Compressed DataFrame with empty rows and columns removed
"""
# Remove empty rows and columns
df = (df.replace('', pd.NA)
.dropna(axis=1, how="all")
.dropna(axis=0, how="all"))
# Fill na
df = df.fillna('')
return df

def repr_df(df: pd.DataFrame, hide_index: bool = True) -> str:
    """Return a string representation of a DataFrame.

    Args:
        df: DataFrame to represent as string
        hide_index: Whether to hide the index in the output

    Returns:
        String representation of the DataFrame
    """
    if hide_index:
        return df.to_string(index=False)
    return df.to_string()

@dataclass
class PagingState:
    """State for paginating through data."""
    page: int = 1
    page_size: int = 50
    total_items: int = 0

    @property
    def start_idx(self) -> int:
        """Get the start index for the current page."""
        return (self.page - 1) * self.page_size

    @property
    def end_idx(self) -> int:
        """Get the end index for the current page."""
        return min(self.start_idx + self.page_size, self.total_items)

    @property
    def has_more(self) -> bool:
        """Check if there are more pages."""
        return self.end_idx < self.total_items


class DataPager:
    """Class for paginating through data."""

    def __init__(self, data: Union[pd.DataFrame, pa.Table], page_size: int = 50):
        """Initialize the pager.

        Args:
            data: Data to paginate through
            page_size: Number of items per page
        """
        self.data = data
        self.state = PagingState(page_size=page_size, total_items=len(data))

    def get_page(self, page: int = 1) -> Union[pd.DataFrame, pa.Table]:
        """Get a specific page of data.

        Args:
            page: Page number to get (1-based)

        Returns:
            Slice of data for the requested page
        """
        self.state.page = page
        return self.data[self.state.start_idx:self.state.end_idx]
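
# Usage sketch (hypothetical 120-row frame): page 3 covers rows 100..119,
# after which has_more reports False.
#
#   pager = DataPager(pd.DataFrame({"x": range(120)}), page_size=50)
#   page3 = pager.get_page(3)    # 20 rows (positions 100..119)
#   pager.state.has_more         # False: end_idx == total_items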

def adjust_column_headers(df: pd.DataFrame) -> pd.DataFrame:
    """Replace numeric column headers with blank strings."""
    # Integer column names are the defaults pandas assigns when no header row exists
    if all(isinstance(col, int) for col in df.columns):
        # Replace them with blank strings
        df.columns = ['' for _ in df.columns]
    return df

def should_promote_to_header(df: pd.DataFrame) -> bool:
    """Heuristically decide whether the first row of `df` looks like a header row."""
    if df.shape[0] > 1:
        first_row = df.iloc[0]
        # Check for uniformity and non-numeric nature
        if all(isinstance(item, str) for item in first_row):
            # Pattern matching for typical header keywords
            header_keywords = {'title', 'name', 'number', 'description', 'date', 'total', 'id'}
            if any(any(keyword in str(cell).lower() for keyword in header_keywords) for cell in first_row):
                return True
            # Check distinctiveness compared to the second row (simple heuristic)
            second_row = df.iloc[1]
            difference_count = sum(1 for f, s in zip(first_row, second_row, strict=False) if f != s)
            if difference_count > len(first_row) / 2:  # Threshold: more than half the cells differ
                return True
    return False
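
# Usage sketch (illustrative): keyword matches in the first row trigger promotion.
#
#   df = pd.DataFrame([["Name", "Date"], ["Ann", "2024-01-01"]])
#   should_promote_to_header(df)   # -> True ('name' and 'date' are header keywords)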

def table_html_to_dataframe(html_str: str) -> pd.DataFrame:
    """Parse the first <table> in an HTML string into a DataFrame."""
    tree = lxml_html.fromstring(html_str)
    table_element = tree.xpath("//table")[0]
    rows = table_element.xpath(".//tr")

    data = []
    for row in rows:
        cols = row.xpath(".//td | .//th")  # Handle both 'td' and 'th' if present
        cols = [clean_column_text(lxml_html.tostring(c, method='text', encoding='unicode').strip()) for c in cols]
        data.append(cols)

    df = pd.DataFrame(data)
    df = adjust_column_headers(df)  # Blank out the default integer column labels
    df = compress_dataframe(df)
    return df
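
# Usage sketch (illustrative): header cells are not promoted, so the <th> row
# lands in data row 0 and the integer column labels are blanked out.
#
#   html = "<table><tr><th>Name</th><th>Age</th></tr><tr><td>Ann</td><td>34</td></tr></table>"
#   table_html_to_dataframe(html)   # row 0 = ['Name', 'Age'], row 1 = ['Ann', '34']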

def table_tag_to_dataframe(table_tag):
    """Convert a BeautifulSoup table Tag to a DataFrame."""
    rows = table_tag.find_all('tr')

    data = []
    for row in rows:
        # Find all 'td' tags within each 'tr' tag
        cols = row.find_all('td')
        # Get the text from each 'td' tag, handling nested tags automatically
        cols = [clean_column_text(col.get_text(strip=True)) for col in cols]
        data.append(cols)

    df = pd.DataFrame(data)
    return df
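
# Usage sketch (assumes beautifulsoup4 is installed; it is not imported by
# this module). Note that only <td> cells are read, so a <th>-only header
# row comes through empty.
#
#   from bs4 import BeautifulSoup
#   soup = BeautifulSoup("<table><tr><td>a</td><td>b</td></tr></table>", "html.parser")
#   table_tag_to_dataframe(soup.find("table"))   # one row: ['a', 'b']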

def markdown_to_dataframe(markdown_table: str) -> pd.DataFrame:
    """Convert a pipe-delimited markdown table into a DataFrame."""
    # Split the markdown table into rows
    rows = markdown_table.split('\n')

    # Extract the header row
    header = [col.strip() for col in rows[0].split('|')]

    # Extract the data rows, skipping the '---' separator row
    data_rows = []
    for row in rows[2:]:
        if not row.strip():
            continue
        data_rows.append([col.strip() for col in row.split('|')])

    # Create a pandas DataFrame
    if len(data_rows) == 0:
        df = pd.DataFrame([header], columns=["" for _ in header])
    else:
        df = pd.DataFrame(data_rows, columns=header)

    df = compress_dataframe(df)
    return df
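
# Usage sketch (illustrative): leading/trailing pipes produce empty edge
# columns, which compress_dataframe() then drops.
#
#   md = "| Name | Age |\n| --- | --- |\n| Ann | 34 |"
#   markdown_to_dataframe(md)   # columns ['Name', 'Age'], one data row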

def dataframe_to_text(df, include_index=False, include_headers=False):
    """
    Convert a Pandas DataFrame to a plain text string, with formatting options for including
    the index and column headers.

    Parameters:
    - df (pd.DataFrame): The dataframe to convert
    - include_index (bool): Whether to include the index in the text output. Defaults to False.
    - include_headers (bool): Whether to include column headers in the text output. Defaults to False.

    Returns:
    str: The dataframe converted to a text string.
    """
    # Get the maximum width for each column
    column_widths = df.apply(lambda col: col.astype(str).str.len().max())

    # If including the index, get the maximum width of the index
    index_label = ''
    if include_index:
        index_label = "Index"
        index_width = max(df.index.astype(str).map(len).max(), len(index_label))
    else:
        index_width = 0

    # Initialize an empty string to store the text
    text_output = ""

    # Include column headers if specified
    if include_headers:
        # Add index label if specified
        if include_index:
            text_output += f"{index_label:<{index_width}}\t"
        # Create and add the header row
        headers = [f"{col:<{width}}" for col, width in zip(df.columns, column_widths, strict=False)]
        text_output += '\t'.join(headers) + '\n'

    # Loop through each row of the dataframe
    for index, row in df.iterrows():
        # Include index if specified
        if include_index:
            text_output += f"{index:<{index_width}}\t"
        # Format each value according to the column width and concatenate
        row_values = [f"{val:<{width}}" for val, width in zip(row.astype(str), column_widths, strict=False)]
        text_output += '\t'.join(row_values) + '\n'

    return text_output
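
# Usage sketch (illustrative): values are left-padded to each column's
# maximum string width and joined with tabs.
#
#   df = pd.DataFrame({"Name": ["Ann", "Bo"], "Age": ["34", "9"]})
#   dataframe_to_text(df, include_headers=True)
#   # -> 'Name\tAge\nAnn\t34\nBo \t9 \n'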

def convert_to_numeric(series):
    """Convert a pandas Series to numeric if possible, otherwise return the original series."""
    try:
        return pd.to_numeric(series)
    except (ValueError, TypeError):
        # pd.to_numeric raises ValueError for unparseable strings and TypeError
        # for unsupported object types; fall back to the original either way
        return series
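
# Usage sketch (illustrative):
#
#   convert_to_numeric(pd.Series(["1", "2"]))   # -> numeric Series [1, 2]
#   convert_to_numeric(pd.Series(["a", "b"]))   # -> original Series, unchanged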

def describe_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """Summarize the dtypes and per-column memory usage of a DataFrame."""
    # Get data types of columns
    dtypes = df.dtypes

    # Create a Series for the index dtype
    index_dtype = pd.Series(df.index.dtype, index=['Index'])

    # Concatenate the dtypes and index_dtype
    all_dtypes = pd.concat([index_dtype, dtypes])

    # Get memory usage of each column including the index, in kilobytes
    memory_usage = df.memory_usage(deep=True) / 1024
    memory_usage.index = memory_usage.index.astype(str)  # Ensure index labels are string type
    memory_usage = memory_usage.round(2)  # Round memory usage to 2 decimal places

    # Calculate total memory usage
    total_memory_usage = memory_usage.sum()

    # Create a DataFrame with the information
    description_df = pd.DataFrame({
        'Data type': all_dtypes.to_numpy(),
        'Memory Usage (KB)': memory_usage.to_numpy()
    }, index=all_dtypes.index)

    # Append the total memory usage as the last row
    total_row = pd.DataFrame({
        'Data type': [''],
        'Memory Usage (KB)': [total_memory_usage]
    }, index=['Total'])
    description_df = pd.concat([description_df, total_row])
    return description_df
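
# Usage sketch (illustrative): one row per column plus 'Index' and 'Total'.
#
#   describe_dataframe(pd.DataFrame({"a": [1, 2]}))
#   # index: ['Index', 'a', 'Total']; columns: ['Data type', 'Memory Usage (KB)']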

def convert_to_pyarrow_backend(data: pd.DataFrame) -> pd.DataFrame:
    """Convert a DataFrame to the PyArrow dtype backend."""
    # Work on a copy so the caller's frame is not mutated as a side effect
    data = data.copy()
    # Convert dtypes carefully
    for col in data.columns:
        if data[col].dtype == 'object':
            # Normalize object columns to strings before conversion
            data[col] = data[col].astype(str)
        elif data[col].dtype == 'float64':
            # Downcast float64 to float32 to reduce memory before conversion
            data[col] = data[col].astype('float32')
    # Now convert to PyArrow-backed dtypes
    return data.convert_dtypes(dtype_backend="pyarrow")
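
# Usage sketch (illustrative): object and float columns come back with
# pyarrow-backed dtypes.
#
#   df = pd.DataFrame({"s": ["x", "y"], "f": [1.0, 2.0]})
#   convert_to_pyarrow_backend(df).dtypes   # pyarrow-backed string and float dtypes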

def replace_all_na_with_empty(df_or_series):
    """Replace all-NA columns (or an all-NA Series) with empty strings."""
    if isinstance(df_or_series, pd.DataFrame):
        for column in df_or_series.columns:
            # Check if the column is all NA or None
            if df_or_series[column].isna().all():
                # Replace the column with empty strings, keeping the original
                # index so the assignment aligns correctly
                df_or_series[column] = pd.Series('', index=df_or_series.index, name=column)
        return df_or_series
    elif isinstance(df_or_series, pd.Series):
        # Check if the series is all NA or None
        if df_or_series.isna().all():
            # Create a new Series of empty strings with the same index and name
            return pd.Series('', index=df_or_series.index, name=df_or_series.name)
        else:
            # If not all NA, return the original series
            return df_or_series
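
# Usage sketch (illustrative): an all-NA column becomes empty strings even
# when the frame has a non-default index.
#
#   df = pd.DataFrame({"a": [None, None], "b": [1, 2]}, index=[10, 20])
#   replace_all_na_with_empty(df)   # column "a" -> ['', ''], "b" untouched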

def na_value(value, default_value: object = ''):
    """Return `default_value` if `value` is NA, otherwise return `value`."""
    if pd.isna(value):
        return default_value
    return value

def drop_duplicates_pyarrow(table, column_name, keep='first'):
    """
    Drop duplicates from a PyArrow Table based on a specified column.

    Parameters:
    - table (pa.Table): The input PyArrow Table
    - column_name (str): The column to check for duplicates
    - keep (str): 'first' to keep first occurrence, 'last' to keep last occurrence

    Returns:
    - pa.Table: A new table with duplicates removed
    """
    if column_name not in table.column_names:
        raise ValueError(f"Column '{column_name}' not found in table")
    if keep not in ['first', 'last']:
        raise ValueError("Parameter 'keep' must be 'first' or 'last'")

    # Extract the column and convert it to a NumPy array
    np_array = table[column_name].to_numpy()

    if keep == 'first':
        # np.unique returns the index of the first occurrence of each value;
        # sort to preserve the original row order
        _, unique_indices = np.unique(np_array, return_index=True)
        sorted_indices = np.sort(unique_indices)
    else:  # keep == 'last'
        # Run np.unique on the reversed array, then map those positions back
        # to indices in the original array to get last occurrences
        reverse_indices = len(np_array) - 1 - np.unique(np_array[::-1], return_index=True)[1]
        sorted_indices = np.sort(reverse_indices)

    # Create a boolean mask to filter the table
    mask = np.zeros(len(table), dtype=bool)
    mask[sorted_indices] = True

    # Filter the table using the mask
    deduplicated_table = table.filter(pa.array(mask))
    return deduplicated_table
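
# Usage sketch (illustrative):
#
#   t = pa.table({"id": [1, 2, 1], "v": ["a", "b", "c"]})
#   drop_duplicates_pyarrow(t, "id")                # keeps rows 0 and 1
#   drop_duplicates_pyarrow(t, "id", keep="last")   # keeps rows 1 and 2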