Initial commit

This commit is contained in:
kdusek
2025-12-09 12:13:01 +01:00
commit 8e654ed209
13332 changed files with 2695056 additions and 0 deletions

View File

@@ -0,0 +1,11 @@
"""
Document processors for preprocessing and postprocessing.
"""
from edgar.documents.processors.preprocessor import HTMLPreprocessor
from edgar.documents.processors.postprocessor import DocumentPostprocessor
__all__ = [
'HTMLPreprocessor',
'DocumentPostprocessor'
]

View File

@@ -0,0 +1,283 @@
"""
Document postprocessor for final processing after parsing.
"""
from typing import List, Set
from edgar.documents.config import ParserConfig
from edgar.documents.document import Document
from edgar.documents.nodes import Node, TextNode, ParagraphNode, HeadingNode
from edgar.documents.types import NodeType
class DocumentPostprocessor:
    """
    Postprocesses parsed documents to improve quality.

    Handles:
    - Adjacent node merging
    - Empty node removal
    - Heading level normalization
    - Section detection enhancement
    - Metadata enrichment
    """

    def __init__(self, config: ParserConfig):
        """Store the parser configuration that switches optional steps on/off."""
        self.config = config

    def process(self, document: Document) -> Document:
        """
        Postprocess a parsed document in place.

        Steps run in a fixed order: empty-node removal, optional merging of
        adjacent text nodes, heading-level normalization, optional section
        enhancement, statistics collection and structure validation.

        Args:
            document: Parsed document

        Returns:
            The same document object, after in-place processing
        """
        # Remove empty nodes
        self._remove_empty_nodes(document.root)

        # Merge adjacent text nodes if configured
        if self.config.merge_adjacent_nodes:
            self._merge_adjacent_nodes(document.root)

        # Normalize heading levels
        self._normalize_heading_levels(document.root)

        # Enhance section detection if configured
        if self.config.detect_sections:
            self._enhance_sections(document)

        # Add document statistics
        self._add_statistics(document)

        # Validate document structure
        self._validate_structure(document)

        return document

    def _remove_empty_nodes(self, node: Node):
        """Remove empty descendants of ``node``, deepest nodes first."""
        children_to_remove = []
        for child in node.children:
            # Bottom-up: a container may only become empty once its own
            # empty children have been pruned.
            self._remove_empty_nodes(child)
            if self._is_empty_node(child):
                children_to_remove.append(child)

        # Removal is deferred so the list being iterated is never mutated.
        for child in children_to_remove:
            node.remove_child(child)

    def _is_empty_node(self, node: Node) -> bool:
        """Return True when ``node`` carries no content and may be removed."""
        # Never remove table nodes
        if node.type == NodeType.TABLE:
            return False

        # Never remove nodes with metadata
        if node.metadata:
            return False

        # Text nodes are empty when their text is blank
        if isinstance(node, TextNode):
            return not node.text().strip()

        # Other nodes that expose a string ``content`` attribute
        if hasattr(node, 'content') and isinstance(node.content, str):
            return not node.content.strip()

        # Remaining (container) nodes are empty when they have no children
        return not node.children

    def _merge_adjacent_nodes(self, node: Node):
        """Merge runs of adjacent, style-compatible text nodes under ``node``."""
        if not node.children:
            return

        # Process children first
        for child in node.children:
            self._merge_adjacent_nodes(child)

        children = node.children
        merged_children = []
        i = 0
        while i < len(children):
            current = children[i]
            j = i + 1
            if self._can_merge(current):
                # Extend the run while the next sibling is compatible with
                # the first node of the run.
                while j < len(children) and self._can_merge_with(current, children[j]):
                    j += 1
            group = children[i:j]
            merged_children.append(
                self._merge_nodes(group) if len(group) > 1 else current
            )
            i = j

        # Update children and re-point their parent references
        node.children = merged_children
        for child in node.children:
            child.parent = node

    def _can_merge(self, node: Node) -> bool:
        """Return True for metadata-free ``TextNode`` instances only."""
        # Only merge TextNodes, not ParagraphNodes
        return isinstance(node, TextNode) and not node.metadata

    def _can_merge_with(self, node1: Node, node2: Node) -> bool:
        """Return True when the two nodes may be merged into one."""
        # Must be exactly the same class (subclasses are not interchangeable)
        if type(node1) is not type(node2):
            return False

        # Must have compatible styles
        if not self._compatible_styles(node1.style, node2.style):
            return False

        # Must not have metadata
        if node1.metadata or node2.metadata:
            return False

        return True

    def _compatible_styles(self, style1, style2) -> bool:
        """Return True when font size, font weight and alignment all match."""
        # For now, just check key properties
        return (
            style1.font_size == style2.font_size and
            style1.font_weight == style2.font_weight and
            style1.text_align == style2.text_align
        )

    def _merge_nodes(self, nodes: List[Node]) -> Node:
        """
        Merge ``nodes`` into the first node of the list and return it.

        Text nodes get their texts joined with newlines; paragraph nodes
        adopt the children of the following paragraphs. Returns ``None``
        when ``nodes`` is empty.
        """
        if not nodes:
            return None

        # Use first node as base
        merged = nodes[0]

        if isinstance(merged, TextNode):
            merged.content = '\n'.join(n.text() for n in nodes)
        elif isinstance(merged, ParagraphNode):
            for node in nodes[1:]:
                # Adopted children must point at their new parent, otherwise
                # they keep referencing the paragraph that is being dropped.
                for child in node.children:
                    child.parent = merged
                merged.children.extend(node.children)

        return merged

    def _normalize_heading_levels(self, node: Node):
        """Shift heading levels so the shallowest heading in use is level 1."""
        headings = []
        self._collect_headings(node, headings)
        if not headings:
            return

        levels_used = {h.level for h in headings}

        # If level 1 is missing, promote every heading by the same amount
        if 1 not in levels_used:
            adjustment = min(levels_used) - 1
            for heading in headings:
                heading.level = max(1, heading.level - adjustment)

    def _collect_headings(self, node: Node, headings: List[HeadingNode]):
        """Append every ``HeadingNode`` in the subtree of ``node`` to ``headings``."""
        if isinstance(node, HeadingNode):
            headings.append(node)
        for child in node.children:
            self._collect_headings(child, headings)

    def _enhance_sections(self, document: Document):
        """Tag every node of each detected section with the section name."""
        # Only extract sections eagerly if configured to do so
        if not self.config.eager_section_extraction:
            return

        # Force section extraction to populate cache
        _ = document.sections

        # Add section name to all nodes in each section
        for section_name, section in document.sections.items():
            for node in section.node.walk():
                node.set_metadata('section', section_name)

    def _add_statistics(self, document: Document):
        """Store node, text, table and heading counts on the document metadata."""
        stats = {
            'node_count': sum(1 for _ in document.root.walk()),
            'text_length': len(document.text()),
            'table_count': len(document.tables),
            'heading_count': len(document.headings),
        }

        # Only add section count if sections were extracted
        if self.config.eager_section_extraction:
            stats['section_count'] = len(document.sections)

        document.metadata.statistics = stats

    def _validate_structure(self, document: Document):
        """
        Validate the node tree and repair what can be repaired.

        Detects nodes whose ``parent`` link is missing (the link is restored
        to the node that actually contains them) and circular references in
        the ``children`` graph. Findings are stored as a list of strings in
        ``document.metadata.validation_issues``; the attribute is only set
        when at least one issue was found.
        """
        issues = []
        root = document.root

        # Orphans: a node reachable from the root is attached by definition,
        # so a missing parent link is repaired in place. Re-adding the node
        # to the root would make it appear twice in the tree. The walk is
        # materialized first so the traversal never runs over a tree that is
        # being modified.
        for parent in list(root.walk()):
            for child in parent.children:
                if child is not root and child.parent is None:
                    issues.append(f"Orphaned node: {child.type}")
                    child.parent = parent

        # Cycles: iterative depth-first search. ``on_path`` holds the ids of
        # the nodes between the root and the node being expanded; meeting one
        # of them again is a back edge, i.e. a cycle. The path check must come
        # before the ``visited`` check because every node on the path has
        # been visited as well.
        done = object()
        visited = {root.id}
        on_path = {root.id}
        stack = [(root, iter(root.children))]
        while stack:
            node, pending = stack[-1]
            child = next(pending, done)
            if child is done:
                on_path.discard(node.id)
                stack.pop()
            elif child.id in on_path:
                issues.append(f"Circular reference detected: {child.type}")
            elif child.id not in visited:
                visited.add(child.id)
                on_path.add(child.id)
                stack.append((child, iter(child.children)))

        # Store validation results
        if issues:
            document.metadata.validation_issues = issues

View File

@@ -0,0 +1,242 @@
"""
HTML preprocessor for cleaning and normalizing HTML before parsing.
"""
import re
from edgar.documents.config import ParserConfig
from edgar.documents.utils.html_utils import remove_xml_declaration
class HTMLPreprocessor:
    """
    Preprocesses HTML to fix common issues and normalize content.

    Handles:
    - Character encoding issues
    - Malformed HTML
    - Excessive whitespace
    - Script/style removal
    - Entity normalization
    """

    # Windows-1252 characters that show up as C1 code points (for example
    # when the bytes were decoded as Latin-1), mapped to what they were meant
    # to be. Quotes are normalized to ASCII; bullet and dashes keep their
    # Unicode characters instead of being dropped.
    _CP1252_TRANSLATION = str.maketrans({
        '\x91': "'",       # Left single quote
        '\x92': "'",       # Right single quote
        '\x93': '"',       # Left double quote
        '\x94': '"',       # Right double quote
        '\x95': '\u2022',  # Bullet
        '\x96': '\u2013',  # En dash
        '\x97': '\u2014',  # Em dash
        '\xa0': ' ',       # Non-breaking space
    })

    def __init__(self, config: ParserConfig):
        """Store the configuration and compile the regex patterns once."""
        self.config = config
        # Pre-compile regex patterns for performance
        self._compiled_patterns = self._compile_patterns()

    def _compile_patterns(self):
        """Return a dict of the pre-compiled regex patterns, keyed by purpose."""
        return {
            # Encoding and cleanup
            'control_chars': re.compile(r'[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]'),

            # Script/style removal
            'script_tags': re.compile(r'<script[^>]*>.*?</script>', re.IGNORECASE | re.DOTALL),
            'style_tags': re.compile(r'<style[^>]*>.*?</style>', re.IGNORECASE | re.DOTALL),
            'link_tags': re.compile(r'<link[^>]*>', re.IGNORECASE),
            'comments': re.compile(r'<!--.*?-->', re.DOTALL),
            'ix_hidden': re.compile(r'<ix:hidden[^>]*>.*?</ix:hidden>', re.IGNORECASE | re.DOTALL),
            'ix_header': re.compile(r'<ix:header[^>]*>.*?</ix:header>', re.IGNORECASE | re.DOTALL),

            # Malformed tags
            'br_tags': re.compile(r'<br(?![^>]*/)>', re.IGNORECASE),
            'img_tags': re.compile(r'<img([^>]+)(?<!/)>', re.IGNORECASE),
            'input_tags': re.compile(r'<input([^>]+)(?<!/)>', re.IGNORECASE),
            'hr_tags': re.compile(r'<hr(?![^>]*/)>', re.IGNORECASE),
            'nested_p_open': re.compile(r'<p>\s*<p>', re.IGNORECASE),
            'nested_p_close': re.compile(r'</p>\s*</p>', re.IGNORECASE),

            # Whitespace normalization
            'multiple_spaces': re.compile(r'[ \t]+'),
            'multiple_newlines': re.compile(r'\n{3,}'),
            # NOTE(review): this strips whitespace next to *every* tag,
            # inline ones included, so "a <b>c</b>" becomes "a<b>c</b>".
            # Confirm the parser does not rely on that whitespace.
            'spaces_around_tags': re.compile(r'\s*(<[^>]+>)\s*'),

            # Block element newlines - combined pattern for opening tags
            'block_open_tags': re.compile(
                r'(<(?:div|p|h[1-6]|table|tr|ul|ol|li|blockquote)[^>]*>)',
                re.IGNORECASE
            ),
            # Block element newlines - combined pattern for closing tags
            'block_close_tags': re.compile(
                r'(</(?:div|p|h[1-6]|table|tr|ul|ol|li|blockquote)>)',
                re.IGNORECASE
            ),

            # Empty tags removal. The backreference makes sure the closing
            # tag belongs to the opening tag; without it "<b></p>" would be
            # deleted and take a real paragraph end with it.
            'empty_tags': re.compile(
                r'<(span|div|p|font|b|i|u|strong|em)\b[^>]*>\s*</\1>',
                re.IGNORECASE
            ),
            'empty_self_closing': re.compile(
                r'<(?:span|div|p|font|b|i|u|strong|em)\b[^>]*/>\s*',
                re.IGNORECASE
            ),

            # Common issues
            'multiple_br': re.compile(r'(<br\s*/?>[\s\n]*){3,}', re.IGNORECASE),
            'space_before_punct': re.compile(r'\s+([.,;!?])'),
            # Group 1 swallows whole tags and entities so punctuation inside
            # them is left alone; groups 2/3 are the punctuation mark and the
            # capital letter that follows it in text.
            'missing_space_after_punct': re.compile(
                r'(<[^>]*>|&#?\w+;)|([.,;!?])([A-Z])'
            ),
        }

    def process(self, html: str) -> str:
        """
        Preprocess HTML content.

        Args:
            html: Raw HTML content

        Returns:
            Cleaned HTML ready for parsing
        """
        # Remove BOM if present
        if html.startswith('\ufeff'):
            html = html[1:]

        # Remove XML declaration if present
        html = remove_xml_declaration(html)

        # Fix common character encoding issues
        html = self._fix_encoding_issues(html)

        # Remove script and style tags
        html = self._remove_script_style(html)

        # Normalize entities
        html = self._normalize_entities(html)

        # Fix malformed tags
        html = self._fix_malformed_tags(html)

        # Normalize whitespace if not preserving
        if not self.config.preserve_whitespace:
            html = self._normalize_whitespace(html)

        # Remove empty tags
        html = self._remove_empty_tags(html)

        # Fix common HTML issues
        html = self._fix_common_issues(html)

        return html

    def _fix_encoding_issues(self, html: str) -> str:
        """Replace stray Windows-1252 characters and drop control characters."""
        # One pass over the text instead of one ``replace`` per character
        html = html.translate(self._CP1252_TRANSLATION)
        # Remove other control characters (tab, LF and CR are kept)
        return self._compiled_patterns['control_chars'].sub('', html)

    def _remove_script_style(self, html: str) -> str:
        """Remove script, style, link, comment and hidden inline-XBRL blocks."""
        patterns = self._compiled_patterns
        for key in ('script_tags', 'style_tags', 'link_tags',
                    'comments', 'ix_hidden', 'ix_header'):
            html = patterns[key].sub('', html)
        return html

    def _normalize_entities(self, html: str) -> str:
        """Replace space-like entities with spaces and undo double encoding."""
        # Common entity replacements
        entities = {
            '&nbsp;': ' ',
            '&ensp;': ' ',
            '&emsp;': ' ',
            '&thinsp;': ' ',
            '&#160;': ' ',
            '&#32;': ' ',
            '&zwj;': '',    # Zero-width joiner
            '&zwnj;': '',   # Zero-width non-joiner
            '&#8203;': '',  # Zero-width space
        }
        for entity, replacement in entities.items():
            html = html.replace(entity, replacement)

        # Fix double-encoded entities (order matters: '&amp;amp;' first)
        html = html.replace('&amp;amp;', '&amp;')
        html = html.replace('&amp;nbsp;', ' ')
        html = html.replace('&amp;lt;', '&lt;')
        html = html.replace('&amp;gt;', '&gt;')
        return html

    def _fix_malformed_tags(self, html: str) -> str:
        """Self-close void tags and collapse directly nested ``<p>`` tags."""
        patterns = self._compiled_patterns
        html = patterns['br_tags'].sub('<br/>', html)
        html = patterns['img_tags'].sub(r'<img\1/>', html)
        html = patterns['input_tags'].sub(r'<input\1/>', html)
        html = patterns['hr_tags'].sub('<hr/>', html)
        html = patterns['nested_p_open'].sub('<p>', html)
        html = patterns['nested_p_close'].sub('</p>', html)
        return html

    def _normalize_whitespace(self, html: str) -> str:
        """Collapse whitespace and put block-level tags on their own lines."""
        patterns = self._compiled_patterns

        # Replace multiple spaces with single space
        html = patterns['multiple_spaces'].sub(' ', html)

        # Replace multiple newlines with double newline
        html = patterns['multiple_newlines'].sub('\n\n', html)

        # Remove spaces around tags
        html = patterns['spaces_around_tags'].sub(r'\1', html)

        # Add newlines around block elements for readability
        html = patterns['block_open_tags'].sub(r'\n\1', html)
        html = patterns['block_close_tags'].sub(r'\1\n', html)

        # Clean up excessive newlines (apply again after adding newlines)
        html = patterns['multiple_newlines'].sub('\n\n', html)

        return html.strip()

    def _remove_empty_tags(self, html: str) -> str:
        """Remove empty inline/block tags that don't contribute content."""
        html = self._compiled_patterns['empty_tags'].sub('', html)
        html = self._compiled_patterns['empty_self_closing'].sub('', html)
        return html

    @staticmethod
    def _space_after_punct(match):
        """Insert a space after punctuation; return tags/entities unchanged."""
        protected = match.group(1)
        if protected is not None:
            return protected
        return f"{match.group(2)} {match.group(3)}"

    def _fix_common_issues(self, html: str) -> str:
        """Fix other common HTML issues (excess breaks, punctuation, typos)."""
        patterns = self._compiled_patterns
        html = patterns['multiple_br'].sub('<br/><br/>', html)
        html = patterns['space_before_punct'].sub(r'\1', html)
        # Function replacement: tags and entities are matched but left as-is,
        # so "AT&amp;T" and attribute values like href="A.Htm" stay intact.
        html = patterns['missing_space_after_punct'].sub(self._space_after_punct, html)

        # Remove zero-width spaces (simple string replace is faster than regex)
        html = html.replace('\u200b', '')
        html = html.replace('\ufeff', '')

        # Fix common typos in tags (simple string replace is faster than regex)
        html = html.replace('<tabel', '<table')
        html = html.replace('</tabel>', '</table>')
        return html