"""Page break detection utilities for SEC documents. This module provides shared page break detection functionality that can be used by both the edgar library and external projects that need to detect page breaks in SEC HTML documents. """ import re from typing import Any, Dict, List from bs4 import Tag class PageBreakDetector: """Detects page breaks in SEC HTML documents.""" # Class-based page break selectors CLASS_BASED_SELECTORS = [ 'div.BRPFPageBreak', 'div.pagebreak', 'div.page-break' ] # HR elements with specific styling HR_PAGE_BREAK_SELECTORS = [ 'hr[style*="height:3px"]', 'hr[style*="height: 3px"]' ] @staticmethod def _find_page_like_divs(element: Tag) -> List[Dict[str, Any]]: """Find div elements with page-like dimensions.""" page_divs = [] divs = element.find_all('div') for div in divs: style = div.get('style', '') if not style: continue if PageBreakDetector._is_page_like_div(style): page_divs.append({ 'element': div.name, 'selector': 'page-like-div', 'style': style, 'classes': div.get('class', []), 'is_page_div': True }) return page_divs @staticmethod def _is_page_like_div(style: str) -> bool: """Check if a div has page-like dimensions based on its style. Args: style: CSS style string to analyze Returns: True if the div has page-like dimensions and styling """ # Parse the style string to extract key properties style_props = {} for prop in style.split(';'): if ':' in prop: key, value = prop.split(':', 1) style_props[key.strip().lower()] = value.strip().lower() # Check for page-like dimensions height = style_props.get('height', '') width = style_props.get('width', '') position = style_props.get('position', '') overflow = style_props.get('overflow', '') # Look for typical page dimensions # Common page heights: 842.4pt (A4), 792pt (Letter), 1008pt (Legal) # Common page widths: 597.6pt (A4), 612pt (Letter), 612pt (Legal) page_heights = ['842.4pt', '792pt', '1008pt'] page_widths = ['597.6pt', '612pt'] has_page_height = any(ph in height for ph in page_heights) has_page_width = any(pw in width for pw in page_widths) has_position = position in ['relative', 'absolute'] has_overflow = 'hidden' in overflow # Consider it a page div if it has both page-like dimensions # and typical page styling properties return has_page_height and has_page_width and (has_position or has_overflow) @staticmethod def mark_page_breaks(element: Tag) -> None: """Mark page break elements with a special attribute for detection. This method adds '_is_page_break' attributes to elements that represent page breaks, which can be used by other parts of the system. Args: element: BeautifulSoup Tag element to mark """ # Mark CSS page break elements using case-insensitive detection PageBreakDetector._mark_css_page_breaks(element) # Mark class-based page breaks for selector in PageBreakDetector.CLASS_BASED_SELECTORS: page_breaks = element.select(selector) for pb in page_breaks: pb['_is_page_break'] = 'true' # Also mark parent containers that contain page breaks if pb.parent and pb.parent.name == 'div': parent_classes = pb.parent.get('class', []) if any('pagebreak' in cls.lower() for cls in parent_classes): pb.parent['_is_page_break'] = 'true' # Mark HR page breaks for selector in PageBreakDetector.HR_PAGE_BREAK_SELECTORS: page_breaks = element.select(selector) for pb in page_breaks: pb['_is_page_break'] = 'true' # Mark page-like divs divs = element.find_all('div') for div in divs: style = div.get('style', '') if style and PageBreakDetector._is_page_like_div(style): div['_is_page_break'] = 'true' @staticmethod def _mark_css_page_breaks(element: Tag) -> None: """Mark CSS page break elements using case-insensitive detection.""" # Define the page break patterns we're looking for (case insensitive) page_break_patterns = [ r'page-break-before\s*:\s*always', r'page-break-after\s*:\s*always' ] # Compile case-insensitive regex patterns compiled_patterns = [re.compile(pattern, re.IGNORECASE) for pattern in page_break_patterns] # Find all elements that could have page break styles for tag_name in ['p', 'div', 'hr']: elements = element.find_all(tag_name) for el in elements: style = el.get('style', '') if not style: continue # Check if any page break pattern matches for pattern in compiled_patterns: if pattern.search(style): el['_is_page_break'] = 'true' break # Only mark each element once def detect_page_breaks(html_content: str) -> List[Dict[str, Any]]: """Detect page breaks in HTML content. This is the main public function for external use. Args: html_content: HTML string to analyze Returns: List of dictionaries containing page break information """ from bs4 import BeautifulSoup soup = BeautifulSoup(html_content, 'html.parser') # For the public API, we need to collect info about page breaks # This is mainly used for testing and external analysis page_breaks = [] # Find CSS page break elements using case-insensitive detection page_break_patterns = [ r'page-break-before\s*:\s*always', r'page-break-after\s*:\s*always' ] compiled_patterns = [re.compile(pattern, re.IGNORECASE) for pattern in page_break_patterns] for tag_name in ['p', 'div', 'hr']: elements = soup.find_all(tag_name) for el in elements: style = el.get('style', '') if not style: continue for pattern in compiled_patterns: if pattern.search(style): page_breaks.append({ 'element': el.name, 'selector': f'{tag_name}[style*="page-break"]', 'style': style, 'classes': el.get('class', []), 'is_page_div': False }) break # Find class-based page breaks for selector in PageBreakDetector.CLASS_BASED_SELECTORS: elements = soup.select(selector) for el in elements: page_breaks.append({ 'element': el.name, 'selector': selector, 'style': el.get('style', ''), 'classes': el.get('class', []), 'is_page_div': False }) # Find HR page breaks for selector in PageBreakDetector.HR_PAGE_BREAK_SELECTORS: elements = soup.select(selector) for el in elements: page_breaks.append({ 'element': el.name, 'selector': selector, 'style': el.get('style', ''), 'classes': el.get('class', []), 'is_page_div': False }) # Find page-like divs page_divs = PageBreakDetector._find_page_like_divs(soup) page_breaks.extend(page_divs) return page_breaks def mark_page_breaks(html_content: str) -> str: """Mark page breaks in HTML content and return the modified HTML. Args: html_content: HTML string to process Returns: Modified HTML string with page break markers added """ from bs4 import BeautifulSoup soup = BeautifulSoup(html_content, 'html.parser') PageBreakDetector.mark_page_breaks(soup) return str(soup)