Files
edgartools/venv/lib/python3.10/site-packages/edgar/files/page_breaks.py
2025-12-09 12:13:01 +01:00

245 lines
8.2 KiB
Python

"""Page break detection utilities for SEC documents.
This module provides shared page break detection functionality that can be used
by both the edgar library and external projects that need to detect page breaks
in SEC HTML documents.
"""
import re
from typing import Any, Dict, List
from bs4 import Tag
class PageBreakDetector:
    """Detects page breaks in SEC HTML documents.

    The class holds no instance state; it is a namespace for the selector
    lists, the compiled patterns and the static helpers that use them.
    """

    # Class-based page break selectors
    CLASS_BASED_SELECTORS = [
        'div.BRPFPageBreak',
        'div.pagebreak',
        'div.page-break'
    ]

    # HR elements with specific styling
    HR_PAGE_BREAK_SELECTORS = [
        'hr[style*="height:3px"]',
        'hr[style*="height: 3px"]'
    ]

    # Inline-style declarations that force a page break. Compiled once at
    # class creation instead of on every call to _mark_css_page_breaks().
    _CSS_PAGE_BREAK_PATTERNS = [
        re.compile(r'page-break-before\s*:\s*always', re.IGNORECASE),
        re.compile(r'page-break-after\s*:\s*always', re.IGNORECASE),
    ]

    # Typical page dimensions, matched against lower-cased CSS values.
    # Heights: 842.4pt (A4), 792pt (Letter), 1008pt (Legal).
    # Widths: 597.6pt (A4), 612pt (Letter and Legal).
    # The negative lookbehind rejects values where the number is only the
    # tail of a larger one (e.g. "1792pt" or "10.792pt"), which a plain
    # substring test would wrongly accept.
    _PAGE_HEIGHT_PATTERN = re.compile(r'(?<![\d.])(?:842\.4|792|1008)pt')
    _PAGE_WIDTH_PATTERN = re.compile(r'(?<![\d.])(?:597\.6|612)pt')

    @staticmethod
    def _find_page_like_divs(element: Tag) -> List[Dict[str, Any]]:
        """Find div elements with page-like dimensions.

        Args:
            element: BeautifulSoup element whose descendant divs are scanned

        Returns:
            One dictionary per matching div with the keys 'element',
            'selector', 'style', 'classes' and 'is_page_div' (always True).
        """
        page_divs = []
        for div in element.find_all('div'):
            style = div.get('style', '')
            # Divs without an inline style cannot carry page dimensions.
            if not style:
                continue
            if PageBreakDetector._is_page_like_div(style):
                page_divs.append({
                    'element': div.name,
                    'selector': 'page-like-div',
                    'style': style,
                    'classes': div.get('class', []),
                    'is_page_div': True
                })
        return page_divs

    @staticmethod
    def _is_page_like_div(style: str) -> bool:
        """Check if a div has page-like dimensions based on its style.

        Args:
            style: CSS style string to analyze

        Returns:
            True if the style declares both a known page height and a known
            page width, and additionally either ``position`` equal to
            ``relative``/``absolute`` or an ``overflow`` containing ``hidden``.
        """
        # Parse the style string into {property: value}, both lower-cased.
        # A property declared twice keeps its last value.
        style_props = {}
        for declaration in style.split(';'):
            if ':' in declaration:
                key, value = declaration.split(':', 1)
                style_props[key.strip().lower()] = value.strip().lower()

        height = style_props.get('height', '')
        width = style_props.get('width', '')
        position = style_props.get('position', '')
        overflow = style_props.get('overflow', '')

        has_page_height = PageBreakDetector._PAGE_HEIGHT_PATTERN.search(height) is not None
        has_page_width = PageBreakDetector._PAGE_WIDTH_PATTERN.search(width) is not None
        has_position = position in ('relative', 'absolute')
        has_overflow = 'hidden' in overflow

        # Consider it a page div if it has both page-like dimensions
        # and typical page styling properties
        return has_page_height and has_page_width and (has_position or has_overflow)

    @staticmethod
    def mark_page_breaks(element: Tag) -> None:
        """Mark page break elements with a special attribute for detection.

        Sets the attribute ``_is_page_break="true"`` on every descendant of
        ``element`` that is recognised as a page break. The tree is modified
        in place.

        Args:
            element: BeautifulSoup Tag element to mark
        """
        # Mark CSS page break elements using case-insensitive detection
        PageBreakDetector._mark_css_page_breaks(element)

        # Mark class-based page breaks
        for selector in PageBreakDetector.CLASS_BASED_SELECTORS:
            for pb in element.select(selector):
                pb['_is_page_break'] = 'true'

                # Also mark parent containers that contain page breaks
                parent = pb.parent
                if parent is not None and parent.name == 'div':
                    parent_classes = parent.get('class', [])
                    # Some parser configurations return "class" as a single
                    # string rather than a list; split it so the check below
                    # tests class names, not individual characters.
                    if isinstance(parent_classes, str):
                        parent_classes = parent_classes.split()
                    if any('pagebreak' in cls.lower() for cls in parent_classes):
                        parent['_is_page_break'] = 'true'

        # Mark HR page breaks
        for selector in PageBreakDetector.HR_PAGE_BREAK_SELECTORS:
            for pb in element.select(selector):
                pb['_is_page_break'] = 'true'

        # Mark page-like divs
        for div in element.find_all('div'):
            style = div.get('style', '')
            if style and PageBreakDetector._is_page_like_div(style):
                div['_is_page_break'] = 'true'

    @staticmethod
    def _mark_css_page_breaks(element: Tag) -> None:
        """Mark CSS page break elements using case-insensitive detection.

        Looks at the inline ``style`` of every ``p``, ``div`` and ``hr``
        below ``element`` and sets ``_is_page_break="true"`` when it contains
        ``page-break-before: always`` or ``page-break-after: always``.

        Args:
            element: BeautifulSoup Tag element to mark
        """
        patterns = PageBreakDetector._CSS_PAGE_BREAK_PATTERNS
        # Find all elements that could have page break styles
        for tag_name in ('p', 'div', 'hr'):
            for el in element.find_all(tag_name):
                style = el.get('style', '')
                if not style:
                    continue
                # One matching pattern is enough to mark the element.
                if any(pattern.search(style) for pattern in patterns):
                    el['_is_page_break'] = 'true'
def detect_page_breaks(html_content: str) -> List[Dict[str, Any]]:
    """Collect a description of every page break found in an HTML string.

    This is the main public function for external use. The HTML is parsed
    with BeautifulSoup's ``html.parser`` and checked in this order:
    inline page-break styles, class-based selectors, HR selectors, and
    finally divs with page-like dimensions. An element that satisfies
    several checks is reported once per check.

    Args:
        html_content: HTML string to analyze

    Returns:
        List of dictionaries, one per detection, each with the keys
        'element', 'selector', 'style', 'classes' and 'is_page_div'.
    """
    from bs4 import BeautifulSoup

    def describe(node, selector):
        # Record layout shared by every non-page-div detection.
        return {
            'element': node.name,
            'selector': selector,
            'style': node.get('style', ''),
            'classes': node.get('class', []),
            'is_page_div': False
        }

    document = BeautifulSoup(html_content, 'html.parser')

    # Inline styles that force a page break, matched case-insensitively.
    break_regexes = [
        re.compile(expr, re.IGNORECASE)
        for expr in (
            r'page-break-before\s*:\s*always',
            r'page-break-after\s*:\s*always',
        )
    ]

    found = []

    # 1. Elements whose inline style requests a page break.
    for tag_name in ('p', 'div', 'hr'):
        css_selector = f'{tag_name}[style*="page-break"]'
        for node in document.find_all(tag_name):
            inline_style = node.get('style', '')
            if inline_style and any(rx.search(inline_style) for rx in break_regexes):
                found.append(describe(node, css_selector))

    # 2. Class-based selectors, then 3. HR selectors - same record shape,
    # so both groups are handled by one loop in their original order.
    selector_groups = (
        PageBreakDetector.CLASS_BASED_SELECTORS,
        PageBreakDetector.HR_PAGE_BREAK_SELECTORS,
    )
    for group in selector_groups:
        for selector in group:
            found.extend(describe(node, selector) for node in document.select(selector))

    # 4. Divs sized like a printed page.
    found.extend(PageBreakDetector._find_page_like_divs(document))
    return found
def mark_page_breaks(html_content: str) -> str:
    """Return the given HTML with its page break elements flagged.

    The input is parsed with BeautifulSoup's ``html.parser``, handed to
    ``PageBreakDetector.mark_page_breaks`` and serialized back to a string.

    Args:
        html_content: HTML string to process

    Returns:
        Modified HTML string with page break markers added
    """
    from bs4 import BeautifulSoup

    parsed = BeautifulSoup(html_content, 'html.parser')
    # The detector annotates the parsed tree in place and returns nothing.
    PageBreakDetector.mark_page_breaks(parsed)
    return str(parsed)