Initial commit
@@ -0,0 +1,11 @@
"""
Document processors for preprocessing and postprocessing.
"""

from edgar.documents.processors.preprocessor import HTMLPreprocessor
from edgar.documents.processors.postprocessor import DocumentPostprocessor

__all__ = [
    'HTMLPreprocessor',
    'DocumentPostprocessor'
]
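For reference, a minimal sketch of how these re-exports are meant to be consumed (the package path is inferred from the imports above):

    from edgar.documents.processors import HTMLPreprocessor, DocumentPostprocessor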
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,283 @@
"""
Document postprocessor for final processing after parsing.
"""

from typing import List, Set

from edgar.documents.config import ParserConfig
from edgar.documents.document import Document
from edgar.documents.nodes import Node, TextNode, ParagraphNode, HeadingNode
from edgar.documents.types import NodeType


class DocumentPostprocessor:
    """
    Postprocesses parsed documents to improve quality.

    Handles:
    - Adjacent node merging
    - Empty node removal
    - Heading level normalization
    - Section detection enhancement
    - Metadata enrichment
    """

    def __init__(self, config: ParserConfig):
        """Initialize postprocessor with configuration."""
        self.config = config

    def process(self, document: Document) -> Document:
        """
        Postprocess document.

        Args:
            document: Parsed document

        Returns:
            Processed document
        """
        # Remove empty nodes
        self._remove_empty_nodes(document.root)

        # Merge adjacent text nodes if configured
        if self.config.merge_adjacent_nodes:
            self._merge_adjacent_nodes(document.root)

        # Normalize heading levels
        self._normalize_heading_levels(document.root)

        # Enhance section detection if configured
        if self.config.detect_sections:
            self._enhance_sections(document)

        # Add document statistics
        self._add_statistics(document)

        # Validate document structure
        self._validate_structure(document)

        return document
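    # A minimal invocation sketch (hypothetical; the keyword-argument
    # construction of ParserConfig is assumed from the flags checked above):
    #
    #   processor = DocumentPostprocessor(ParserConfig(merge_adjacent_nodes=True))
    #   document = processor.process(document)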
    def _remove_empty_nodes(self, node: Node):
        """Remove empty nodes from tree."""
        # Process children first (bottom-up)
        children_to_remove = []

        for child in node.children:
            self._remove_empty_nodes(child)

            # Check if child is empty
            if self._is_empty_node(child):
                children_to_remove.append(child)

        # Remove empty children
        for child in children_to_remove:
            node.remove_child(child)

    def _is_empty_node(self, node: Node) -> bool:
        """Check if node is empty and can be removed."""
        # Never remove table nodes
        if node.type == NodeType.TABLE:
            return False

        # Never remove nodes with metadata
        if node.metadata:
            return False

        # Check text nodes
        if isinstance(node, TextNode):
            return not node.text().strip()

        # Check other nodes with text content
        if hasattr(node, 'content') and isinstance(node.content, str):
            return not node.content.strip()

        # Check container nodes
        if not node.children:
            # Empty container with no children
            return True

        return False
    def _merge_adjacent_nodes(self, node: Node):
        """Merge adjacent text nodes with similar properties."""
        if not node.children:
            return

        # Process children first
        for child in node.children:
            self._merge_adjacent_nodes(child)

        # Merge adjacent text nodes
        merged_children = []
        i = 0

        while i < len(node.children):
            current = node.children[i]

            # Look for mergeable nodes
            if self._can_merge(current):
                # Collect all adjacent mergeable nodes
                merge_group = [current]
                j = i + 1

                while j < len(node.children) and self._can_merge_with(current, node.children[j]):
                    merge_group.append(node.children[j])
                    j += 1

                # Merge if we have multiple nodes
                if len(merge_group) > 1:
                    merged = self._merge_nodes(merge_group)
                    merged_children.append(merged)
                    i = j
                else:
                    merged_children.append(current)
                    i += 1
            else:
                merged_children.append(current)
                i += 1

        # Update children
        node.children = merged_children

        # Update parent references
        for child in node.children:
            child.parent = node

    def _can_merge(self, node: Node) -> bool:
        """Check if node can be merged."""
        # Only merge TextNodes, not ParagraphNodes
        return isinstance(node, TextNode) and not node.metadata

    def _can_merge_with(self, node1: Node, node2: Node) -> bool:
        """Check if two nodes can be merged."""
        # Must be same type
        if type(node1) != type(node2):
            return False

        # Must have compatible styles
        if not self._compatible_styles(node1.style, node2.style):
            return False

        # Must not have metadata
        if node1.metadata or node2.metadata:
            return False

        return True

    def _compatible_styles(self, style1, style2) -> bool:
        """Check if two styles are compatible for merging."""
        # For now, just check key properties
        return (
            style1.font_size == style2.font_size and
            style1.font_weight == style2.font_weight and
            style1.text_align == style2.text_align
        )

    def _merge_nodes(self, nodes: List[Node]) -> Node:
        """Merge multiple nodes into one."""
        if not nodes:
            return None

        # Use first node as base
        merged = nodes[0]

        # Merge content
        if isinstance(merged, TextNode):
            texts = [n.text() for n in nodes]
            merged.content = '\n'.join(texts)
        elif isinstance(merged, ParagraphNode):
            # Merge all children
            for node in nodes[1:]:
                merged.children.extend(node.children)

        return merged
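    # Illustration of the grouping loop above: given children
    # [Text('a'), Text('b'), Paragraph(...), Text('c')] with compatible
    # styles, the first two TextNodes collapse into Text('a\nb'), while the
    # ParagraphNode boundary keeps Text('c') separate.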
    def _normalize_heading_levels(self, node: Node):
        """Normalize heading levels to ensure proper hierarchy."""
        # Collect all headings
        headings = []
        self._collect_headings(node, headings)

        if not headings:
            return

        # Analyze heading structure
        levels_used = set(h.level for h in headings)

        # If we're missing level 1, promote headings
        if 1 not in levels_used and levels_used:
            min_level = min(levels_used)
            adjustment = min_level - 1

            for heading in headings:
                heading.level = max(1, heading.level - adjustment)
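    # Worked example of the promotion above: headings only at levels {2, 4}
    # give min_level = 2 and adjustment = 1, so they end up at levels {1, 3};
    # max(1, ...) keeps any heading from being promoted past level 1.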
    def _collect_headings(self, node: Node, headings: List[HeadingNode]):
        """Collect all heading nodes."""
        if isinstance(node, HeadingNode):
            headings.append(node)

        for child in node.children:
            self._collect_headings(child, headings)

    def _enhance_sections(self, document: Document):
        """Enhance section detection and metadata."""
        # Only extract sections eagerly if configured to do so
        if not self.config.eager_section_extraction:
            return

        # Force section extraction to populate cache
        _ = document.sections

        # Add section metadata to nodes
        for section_name, section in document.sections.items():
            # Add section name to all nodes in section
            for node in section.node.walk():
                node.set_metadata('section', section_name)

    def _add_statistics(self, document: Document):
        """Add document statistics to metadata."""
        stats = {
            'node_count': sum(1 for _ in document.root.walk()),
            'text_length': len(document.text()),
            'table_count': len(document.tables),
            'heading_count': len(document.headings),
        }

        # Only add section count if sections were extracted
        if self.config.eager_section_extraction:
            stats['section_count'] = len(document.sections)

        document.metadata.statistics = stats

    def _validate_structure(self, document: Document):
        """Validate document structure and fix issues."""
        issues = []

        # Check for orphaned nodes
        for node in document.root.walk():
            if node != document.root and node.parent is None:
                issues.append(f"Orphaned node: {node.type}")
                # Fix by adding to root
                document.root.add_child(node)

        # Check for circular references
        visited = set()

        def check_cycles(node: Node, path: Set[str]):
            if node.id in path:
                issues.append(f"Circular reference detected: {node.type}")
                return

            path.add(node.id)
            visited.add(node.id)

            for child in node.children:
                if child.id not in visited:
                    check_cycles(child, path.copy())

        check_cycles(document.root, set())

        # Store validation results
        if issues:
            document.metadata.validation_issues = issues
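A hedged usage sketch for the postprocessor above; the parsed document and the ParserConfig construction are assumed, not shown in this commit:

    from edgar.documents.config import ParserConfig
    from edgar.documents.processors.postprocessor import DocumentPostprocessor

    config = ParserConfig()  # default construction assumed
    document = DocumentPostprocessor(config).process(document)  # 'document' comes from the parser
    print(document.metadata.statistics['node_count'])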
@@ -0,0 +1,242 @@
"""
HTML preprocessor for cleaning and normalizing HTML before parsing.
"""

import re

from edgar.documents.config import ParserConfig
from edgar.documents.utils.html_utils import remove_xml_declaration


class HTMLPreprocessor:
    """
    Preprocesses HTML to fix common issues and normalize content.

    Handles:
    - Character encoding issues
    - Malformed HTML
    - Excessive whitespace
    - Script/style removal
    - Entity normalization
    """

    def __init__(self, config: ParserConfig):
        """Initialize preprocessor with configuration."""
        self.config = config

        # Pre-compile regex patterns for performance
        self._compiled_patterns = self._compile_patterns()

    def _compile_patterns(self):
        """Pre-compile frequently used regex patterns."""
        return {
            # Encoding and cleanup
            'control_chars': re.compile(r'[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]'),

            # Script/style removal
            'script_tags': re.compile(r'<script[^>]*>.*?</script>', re.IGNORECASE | re.DOTALL),
            'style_tags': re.compile(r'<style[^>]*>.*?</style>', re.IGNORECASE | re.DOTALL),
            'link_tags': re.compile(r'<link[^>]*>', re.IGNORECASE),
            'comments': re.compile(r'<!--.*?-->', re.DOTALL),
            'ix_hidden': re.compile(r'<ix:hidden[^>]*>.*?</ix:hidden>', re.IGNORECASE | re.DOTALL),
            'ix_header': re.compile(r'<ix:header[^>]*>.*?</ix:header>', re.IGNORECASE | re.DOTALL),

            # Malformed tags
            'br_tags': re.compile(r'<br(?![^>]*/)>', re.IGNORECASE),
            'img_tags': re.compile(r'<img([^>]+)(?<!/)>', re.IGNORECASE),
            'input_tags': re.compile(r'<input([^>]+)(?<!/)>', re.IGNORECASE),
            'hr_tags': re.compile(r'<hr(?![^>]*/)>', re.IGNORECASE),
            'nested_p_open': re.compile(r'<p>\s*<p>', re.IGNORECASE),
            'nested_p_close': re.compile(r'</p>\s*</p>', re.IGNORECASE),

            # Whitespace normalization
            'multiple_spaces': re.compile(r'[ \t]+'),
            'multiple_newlines': re.compile(r'\n{3,}'),
            'spaces_around_tags': re.compile(r'\s*(<[^>]+>)\s*'),

            # Block element newlines - combined pattern for opening tags
            'block_open_tags': re.compile(
                r'(<(?:div|p|h[1-6]|table|tr|ul|ol|li|blockquote)[^>]*>)',
                re.IGNORECASE
            ),
            # Block element newlines - combined pattern for closing tags
            'block_close_tags': re.compile(
                r'(</(?:div|p|h[1-6]|table|tr|ul|ol|li|blockquote)>)',
                re.IGNORECASE
            ),

            # Empty tags removal - combined pattern for all removable tags
            'empty_tags': re.compile(
                r'<(?:span|div|p|font|b|i|u|strong|em)\b[^>]*>\s*</(?:span|div|p|font|b|i|u|strong|em)>',
                re.IGNORECASE
            ),
            'empty_self_closing': re.compile(
                r'<(?:span|div|p|font|b|i|u|strong|em)\b[^>]*/>\s*',
                re.IGNORECASE
            ),

            # Common issues
            'multiple_br': re.compile(r'(<br\s*/?>[\s\n]*){3,}', re.IGNORECASE),
            'space_before_punct': re.compile(r'\s+([.,;!?])'),
            'missing_space_after_punct': re.compile(r'([.,;!?])([A-Z])'),
        }
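    # Illustrative pattern behavior: 'br_tags' rewrites '<br>' to '<br/>' but
    # skips '<br/>'; 'empty_tags' drops '<span></span>' pairs; 'multiple_br'
    # collapses runs of three or more consecutive <br> tags.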
    def process(self, html: str) -> str:
        """
        Preprocess HTML content.

        Args:
            html: Raw HTML content

        Returns:
            Cleaned HTML ready for parsing
        """
        # Remove BOM if present
        if html.startswith('\ufeff'):
            html = html[1:]

        # Remove XML declaration if present
        html = remove_xml_declaration(html)

        # Fix common character encoding issues
        html = self._fix_encoding_issues(html)

        # Remove script and style tags
        html = self._remove_script_style(html)

        # Normalize entities
        html = self._normalize_entities(html)

        # Fix malformed tags
        html = self._fix_malformed_tags(html)

        # Normalize whitespace if not preserving
        if not self.config.preserve_whitespace:
            html = self._normalize_whitespace(html)

        # Remove empty tags
        html = self._remove_empty_tags(html)

        # Fix common HTML issues
        html = self._fix_common_issues(html)

        return html
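    # Illustrative before/after (hypothetical input):
    #   process('<p>Hello ,World<br></p>')
    # normalizes punctuation spacing and the bare <br>, yielding roughly
    #   '<p>Hello, World<br/></p>' (plus block-level newline handling).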
    def _fix_encoding_issues(self, html: str) -> str:
        """Fix common character encoding issues."""
        # Replace Windows-1252 characters with Unicode equivalents
        replacements = {
            '\x91': "'",  # Left single quote
            '\x92': "'",  # Right single quote
            '\x93': '"',  # Left double quote
            '\x94': '"',  # Right double quote
            '\x95': '•',  # Bullet
            '\x96': '–',  # En dash
            '\x97': '—',  # Em dash
            '\xa0': ' ',  # Non-breaking space
        }

        for old, new in replacements.items():
            html = html.replace(old, new)

        # Remove other control characters
        html = self._compiled_patterns['control_chars'].sub('', html)

        return html

    def _remove_script_style(self, html: str) -> str:
        """Remove script and style tags with content."""
        # Use pre-compiled patterns for better performance
        html = self._compiled_patterns['script_tags'].sub('', html)
        html = self._compiled_patterns['style_tags'].sub('', html)
        html = self._compiled_patterns['link_tags'].sub('', html)
        html = self._compiled_patterns['comments'].sub('', html)
        html = self._compiled_patterns['ix_hidden'].sub('', html)
        html = self._compiled_patterns['ix_header'].sub('', html)

        return html

    def _normalize_entities(self, html: str) -> str:
        """Normalize HTML entities."""
        # Common entity replacements (whitespace and zero-width entities; the
        # exact entity names here are a reconstruction: the rendered diff had
        # already decoded them to raw characters)
        entities = {
            '&nbsp;': ' ',
            '&#160;': ' ',
            '&#xA0;': ' ',
            '&ensp;': ' ',
            '&emsp;': ' ',
            '&thinsp;': ' ',
            '&zwj;': '',    # Zero-width joiner
            '&zwnj;': '',   # Zero-width non-joiner
            '&#8203;': '',  # Zero-width space
        }

        for entity, replacement in entities.items():
            html = html.replace(entity, replacement)

        # Fix double-encoded entities by collapsing one level of encoding
        html = html.replace('&amp;amp;', '&amp;')
        html = html.replace('&amp;nbsp;', '&nbsp;')
        html = html.replace('&amp;lt;', '&lt;')
        html = html.replace('&amp;gt;', '&gt;')

        return html

    def _fix_malformed_tags(self, html: str) -> str:
        """Fix common malformed tag issues."""
        # Use pre-compiled patterns for better performance
        html = self._compiled_patterns['br_tags'].sub('<br/>', html)
        html = self._compiled_patterns['img_tags'].sub(r'<img\1/>', html)
        html = self._compiled_patterns['input_tags'].sub(r'<input\1/>', html)
        html = self._compiled_patterns['hr_tags'].sub('<hr/>', html)
        html = self._compiled_patterns['nested_p_open'].sub('<p>', html)
        html = self._compiled_patterns['nested_p_close'].sub('</p>', html)

        return html

    def _normalize_whitespace(self, html: str) -> str:
        """Normalize whitespace in HTML."""
        # Replace multiple spaces with single space
        html = self._compiled_patterns['multiple_spaces'].sub(' ', html)

        # Replace multiple newlines with double newline
        html = self._compiled_patterns['multiple_newlines'].sub('\n\n', html)

        # Remove spaces around tags
        html = self._compiled_patterns['spaces_around_tags'].sub(r'\1', html)

        # Add newlines around block elements for readability,
        # using combined patterns instead of looping over individual tags
        html = self._compiled_patterns['block_open_tags'].sub(r'\n\1', html)
        html = self._compiled_patterns['block_close_tags'].sub(r'\1\n', html)

        # Clean up excessive newlines (apply again after adding newlines)
        html = self._compiled_patterns['multiple_newlines'].sub('\n\n', html)

        return html.strip()

    def _remove_empty_tags(self, html: str) -> str:
        """Remove empty tags that don't contribute content."""
        # Use pre-compiled combined patterns instead of looping
        html = self._compiled_patterns['empty_tags'].sub('', html)
        html = self._compiled_patterns['empty_self_closing'].sub('', html)

        return html

    def _fix_common_issues(self, html: str) -> str:
        """Fix other common HTML issues."""
        # Use pre-compiled patterns for better performance
        html = self._compiled_patterns['multiple_br'].sub('<br/><br/>', html)
        html = self._compiled_patterns['space_before_punct'].sub(r'\1', html)
        html = self._compiled_patterns['missing_space_after_punct'].sub(r'\1 \2', html)

        # Remove zero-width spaces (simple string replace is faster than regex)
        html = html.replace('\u200b', '')
        html = html.replace('\ufeff', '')

        # Fix common typos in tags (simple string replace is faster than regex)
        html = html.replace('<tabel', '<table')
        html = html.replace('</tabel>', '</table>')

        return html
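Taken together, a hedged end-to-end sketch; the ParserConfig construction and the parsing step between the two processors are assumed, as neither appears in this commit:

    from edgar.documents.config import ParserConfig
    from edgar.documents.processors import HTMLPreprocessor, DocumentPostprocessor

    config = ParserConfig()  # default construction assumed
    clean_html = HTMLPreprocessor(config).process(raw_html)
    # ... clean_html is parsed into a Document elsewhere in edgar.documents ...
    document = DocumentPostprocessor(config).process(document)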