Initial commit
This commit is contained in:
@@ -0,0 +1,9 @@
|
||||
"""File processing utilities for SEC documents."""
|
||||
|
||||
from .page_breaks import PageBreakDetector, detect_page_breaks, mark_page_breaks
|
||||
|
||||
__all__ = [
|
||||
'detect_page_breaks',
|
||||
'mark_page_breaks',
|
||||
'PageBreakDetector'
|
||||
]
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,261 @@
|
||||
# ChunkedDocument Item Extraction Process
|
||||
|
||||
This document explains how the `ChunkedDocument` class in `edgar.files.htmltools` is used to extract items from SEC filings, particularly 10-K documents, and how the new `Document` implementation could replace this functionality.
|
||||
|
||||
## Overview
|
||||
|
||||
The `ChunkedDocument` class provides functionality to parse HTML from SEC filings and extract specific sections (items) based on their item numbers (e.g., "Item 1", "Item 1A", etc.). It works by:
|
||||
|
||||
1. Breaking the HTML into chunks
|
||||
2. Identifying item headings
|
||||
3. Creating a mapping of chunks to item numbers
|
||||
4. Providing access to specific items through indexing
|
||||
|
||||
This functionality is essential for extracting specific sections from 10-K, 10-Q, and other structured SEC filings.
|
||||
|
||||
## Key Components of the Extraction Process
|
||||
|
||||
### 1. ChunkedDocument Class
|
||||
|
||||
The `ChunkedDocument` class is initialized with HTML content and a chunking function:
|
||||
|
||||
```python
|
||||
def __init__(self, html: str, chunk_fn: Callable[[List], pd.DataFrame] = chunks2df):
|
||||
self.chunks = chunk(html)
|
||||
self._chunked_data = chunk_fn(self.chunks)
|
||||
self.chunk_fn = chunk_fn
|
||||
```
|
||||
|
||||
- `html`: The HTML content of the SEC filing
|
||||
- `chunk_fn`: A function that converts chunks to a DataFrame (defaults to `chunks2df`)
|
||||
|
||||
### 2. Chunking Process
|
||||
|
||||
The HTML is first broken into chunks using the `chunk` function:
|
||||
|
||||
```python
|
||||
@lru_cache(maxsize=8)
|
||||
def chunk(html: str):
|
||||
document = HtmlDocument.from_html(html)
|
||||
return list(document.generate_chunks())
|
||||
```
|
||||
|
||||
This leverages `HtmlDocument.from_html()` and its `generate_chunks()` method to divide the HTML into semantic chunks. The `HtmlDocument` class is part of the older implementation that the new `Document` class aims to replace.
|
||||
|
||||
### 3. Chunks to DataFrame Conversion
|
||||
|
||||
The chunks are then processed into a DataFrame using the `chunks2df` function:
|
||||
|
||||
```python
|
||||
def chunks2df(chunks: List[List[Block]],
|
||||
item_detector: Callable[[pd.Series], pd.Series] = detect_int_items,
|
||||
item_adjuster: Callable[[pd.DataFrame, Dict[str, Any]], pd.DataFrame] = adjust_detected_items,
|
||||
item_structure=None) -> pd.DataFrame:
|
||||
```
|
||||
|
||||
This function:
|
||||
- Takes the chunks and creates a DataFrame with columns for text, table flags, etc.
|
||||
- Detects item headings using the specified `item_detector` (default: `detect_int_items`)
|
||||
- Applies adjustments via `item_adjuster` (default: `adjust_detected_items`)
|
||||
- Adds metadata like character count, signature detection, etc.
|
||||
- Forward-fills item numbers so each chunk is associated with an item
|
||||
|
||||
### 4. Item Detection
|
||||
|
||||
The item detection process uses regular expressions to identify item headings:
|
||||
|
||||
```python
|
||||
int_item_pattern = r"^(Item\s{1,3}[0-9]{1,2}[A-Z]?)\.?"
|
||||
decimal_item_pattern = r"^(Item\s{1,3}[0-9]{1,2}\.[0-9]{2})\.?"
|
||||
|
||||
def detect_int_items(text: pd.Series):
|
||||
return text.str.extract(int_item_pattern, expand=False, flags=re.IGNORECASE | re.MULTILINE)
|
||||
|
||||
def detect_decimal_items(text: pd.Series):
|
||||
return text.str.extract(decimal_item_pattern, expand=False, flags=re.IGNORECASE | re.MULTILINE)
|
||||
```
|
||||
|
||||
These patterns match standard item headings like "Item 1" or "Item 1.01" and extract them from the text.
|
||||
|
||||
### 5. Item Adjustment
|
||||
|
||||
After initial detection, the `adjust_detected_items` function ensures the items are in the correct sequence and filters out invalid or out-of-sequence items:
|
||||
|
||||
```python
|
||||
def adjust_detected_items(chunk_df: pd.DataFrame, **kwargs) -> pd.DataFrame:
|
||||
# Normalize items
|
||||
# Find table of contents
|
||||
# Process each item in sequence
|
||||
# Validate items against expected sequence
|
||||
```
|
||||
|
||||
This function:
|
||||
- Normalizes item strings to a comparable format
|
||||
- Locates the table of contents section
|
||||
- Validates each detected item against the previous and next valid items
|
||||
- Creates a sequence of valid items
|
||||
|
||||
### 6. Item Access
|
||||
|
||||
The `ChunkedDocument` class provides access to items through indexing:
|
||||
|
||||
```python
|
||||
def __getitem__(self, item):
|
||||
if isinstance(item, int):
|
||||
chunks = [self.chunks[item]]
|
||||
elif isinstance(item, str):
|
||||
chunks = list(self.chunks_for_item(item))
|
||||
else:
|
||||
return None
|
||||
# Convert chunks to text
|
||||
# ...
|
||||
```
|
||||
|
||||
This allows direct access to items by their number (e.g., `document["Item 1"]`) and returns the consolidated text for that item.
|
||||
|
||||
### 7. Integration with Company Reports
|
||||
|
||||
The `ChunkedDocument` is used by the `CompanyReport` and its subclasses (like `TenK`, `TenQ`, etc.) to provide structured access to filing sections:
|
||||
|
||||
```python
|
||||
class CompanyReport:
|
||||
@property
|
||||
@lru_cache(maxsize=1)
|
||||
def chunked_document(self):
|
||||
return ChunkedDocument(self._filing.html())
|
||||
|
||||
def __getitem__(self, item_or_part: str):
|
||||
item_text = self.chunked_document[item_or_part]
|
||||
return item_text
|
||||
```
|
||||
|
||||
This enables usage patterns like:
|
||||
|
||||
```python
|
||||
tenk = TenK(filing)
|
||||
business_description = tenk["Item 1"] # Gets the business description section
|
||||
risk_factors = tenk["Item 1A"] # Gets the risk factors section
|
||||
```
|
||||
|
||||
## Technical Details
|
||||
|
||||
### Chunk Creation and Rendering
|
||||
|
||||
Chunks are created from the HTML using `HtmlDocument.from_html(html).generate_chunks()`, which:
|
||||
|
||||
1. Parses the HTML using BeautifulSoup
|
||||
2. Extracts blocks of content (text, tables, etc.)
|
||||
3. Compresses blocks to avoid unnecessary whitespace
|
||||
4. Groups related blocks into logical chunks
|
||||
|
||||
When rendering a chunk, the original HTML structure of tables is preserved through the `_render_blocks_using_old_markdown_tables` function:
|
||||
|
||||
```python
|
||||
def _render_blocks_using_old_markdown_tables(blocks:List[Block]):
|
||||
return "".join([
|
||||
table_to_markdown(block.table_element) if isinstance(block, TableBlock) else block.get_text()
|
||||
for block in blocks
|
||||
]).strip()
|
||||
```
|
||||
|
||||
### Special Cases
|
||||
|
||||
The system handles several special cases:
|
||||
|
||||
1. **Table of Contents**: Items in the table of contents are identified and excluded from being treated as section headers.
|
||||
2. **Signatures**: Signature blocks at the end of filings are identified to prevent them from being treated as regular content.
|
||||
3. **Empty Items**: Logic in `adjust_for_empty_items` handles cases where an item has no content but is followed by another item.
|
||||
4. **Decimal Items**: The `decimal_chunk_fn` provides specialized handling for filings like 8-K that use decimal item numbers (e.g., "Item 1.01").
|
||||
|
||||
### Data Structure
|
||||
|
||||
The key data structure is the DataFrame created by `chunks2df`, which contains columns:
|
||||
|
||||
- `Text`: The text content of the chunk
|
||||
- `Table`: Boolean indicating if the chunk is a table
|
||||
- `Chars`: Character count of the chunk
|
||||
- `Signature`: Boolean indicating if the chunk is part of a signature block
|
||||
- `TocLink`: Boolean indicating if the chunk is a table of contents link
|
||||
- `Toc`: Boolean indicating if the chunk is part of the table of contents
|
||||
- `Empty`: Boolean indicating if the chunk is empty
|
||||
- `Item`: The item number associated with the chunk (forward-filled)
|
||||
|
||||
## Replacing with the New Document Implementation
|
||||
|
||||
The new `Document` class implementation could replace the `ChunkedDocument` functionality by:
|
||||
|
||||
1. **Preserving Document Structure**: The new `Document` class already has a node-based structure that preserves document semantics, including headings, text blocks, and tables.
|
||||
|
||||
2. **Item Identification**: Implementing an item detection system that leverages the existing heading detection, perhaps with a specialized function that identifies item headings from `HeadingNode` instances.
|
||||
|
||||
3. **Item Association**: Creating a system to associate all nodes following an item heading with that item, similar to the forward-filling approach used in `chunks2df`.
|
||||
|
||||
4. **Item Access API**: Implementing an indexing system that allows access to items by their number, similar to `ChunkedDocument.__getitem__`.
|
||||
|
||||
### Specific Implementation Steps
|
||||
|
||||
1. **Create Item Detector**: Create a function that identifies item headings from `HeadingNode` instances based on their content and level:
|
||||
|
||||
```python
|
||||
def identify_item_headings(document: Document) -> Dict[str, int]:
|
||||
"""Identify item headings in the document and return a mapping of item names to node indices."""
|
||||
item_headings = {}
|
||||
for i, node in enumerate(document.nodes):
|
||||
if node.type == 'heading':
|
||||
match = re.match(r'^(Item\s+[0-9]+[A-Z]?)', node.content, re.IGNORECASE)
|
||||
if match:
|
||||
item_headings[match.group(1).strip()] = i
|
||||
return item_headings
|
||||
```
|
||||
|
||||
2. **Create Item Association**: Create a function that associates nodes with their respective items:
|
||||
|
||||
```python
|
||||
def associate_nodes_with_items(document: Document, item_headings: Dict[str, int]) -> Dict[str, List[BaseNode]]:
|
||||
"""Associate document nodes with their respective items."""
|
||||
item_nodes = {}
|
||||
item_indices = sorted(item_headings.values())
|
||||
|
||||
for i, idx in enumerate(item_indices):
|
||||
item_name = next(k for k, v in item_headings.items() if v == idx)
|
||||
next_idx = item_indices[i+1] if i+1 < len(item_indices) else len(document.nodes)
|
||||
item_nodes[item_name] = document.nodes[idx:next_idx]
|
||||
|
||||
return item_nodes
|
||||
```
|
||||
|
||||
3. **Implement Item Access**: Add an indexing method to `Document` that allows access to items:
|
||||
|
||||
```python
|
||||
def get_item(self, item_name: str) -> Optional[str]:
|
||||
"""Get a specific item from the document by name."""
|
||||
item_headings = identify_item_headings(self)
|
||||
if item_name not in item_headings:
|
||||
return None
|
||||
|
||||
item_nodes = associate_nodes_with_items(self, item_headings)
|
||||
|
||||
# Convert nodes to text
|
||||
return "\n".join(node.content for node in item_nodes[item_name])
|
||||
```
|
||||
|
||||
4. **Integration with Company Reports**: Update the `CompanyReport` class to use the new `Document` implementation:
|
||||
|
||||
```python
|
||||
@property
|
||||
@lru_cache(maxsize=1)
|
||||
def document(self):
|
||||
html = self._filing.html()
|
||||
return Document.parse(html)
|
||||
|
||||
def __getitem__(self, item_or_part: str):
|
||||
return self.document.get_item(item_or_part)
|
||||
```
|
||||
|
||||
## Conclusion
|
||||
|
||||
The `ChunkedDocument` class provides a robust system for extracting items from SEC filings. While the implementation is complex, it handles many edge cases and provides a clean API for accessing specific sections of filings.
|
||||
|
||||
Replacing this functionality with the new `Document` implementation would require preserving the ability to identify item headings, associate content with items, and provide an item access API. However, the new implementation could benefit from the more structured node-based approach, potentially leading to more accurate item extraction and better handling of complex document structures.
|
||||
|
||||
The key challenge will be correctly identifying item boundaries, especially in cases where item headings might be nested or where the document structure is complex. Careful testing against a variety of filings will be essential to ensure the new implementation matches or exceeds the capabilities of the current system.
|
||||
@@ -0,0 +1,350 @@
|
||||
# Document Class Architecture Review
|
||||
|
||||
## Overview
|
||||
|
||||
The `Document` class in `edgar.files.html` provides a structured representation of HTML content extracted from SEC filings. It implements a node-based architecture that preserves document structure while supporting rich text formatting and tabular data extraction.
|
||||
|
||||
### Key Components
|
||||
|
||||
- **Document**: Top-level container for parsed document nodes
|
||||
- **BaseNode**: Abstract base class for all document node types
|
||||
- **HeadingNode**: Represents section and subsection headings
|
||||
- **TextBlockNode**: Represents paragraphs and text content
|
||||
- **TableNode**: Represents tabular data with advanced processing
|
||||
- **SECHTMLParser**: HTML parser that creates the node structure
|
||||
- **IXTagTracker**: Tracks inline XBRL tags during parsing
|
||||
|
||||
### Primary Functionality
|
||||
|
||||
The `Document.parse()` method serves as the entry point, converting HTML text into a structured node tree that preserves document semantics, formatting, and inline XBRL metadata.
|
||||
|
||||
## Implementation Analysis
|
||||
|
||||
### Architectural Patterns
|
||||
|
||||
1. **Composite Pattern**: Implemented through `BaseNode` with specialized node types, allowing for a heterogeneous tree of document elements.
|
||||
2. **Factory Method**: The `create_node()` function acts as a factory method for creating appropriate node instances based on content and type.
|
||||
3. **Decorator Pattern**: The `StyleInfo` class applies layers of styling information, merging styles from parent elements with child elements.
|
||||
4. **Strategy Pattern**: `TableProcessor` implements a strategy for processing tables, with specialized algorithms for different table structures.
|
||||
|
||||
### Code Quality
|
||||
|
||||
#### Strengths
|
||||
- Strong typing with appropriate use of Union and Optional types
|
||||
- Consistent use of dataclasses for node representations
|
||||
- Clear separation of parsing logic from rendering logic
|
||||
- Detailed handling of text formatting and whitespace normalization
|
||||
- Comprehensive table processing with column alignment detection
|
||||
|
||||
#### Areas for Improvement
|
||||
- High cyclomatic complexity in `_process_element` method
|
||||
- Duplicate style parsing logic between html.py and styles.py
|
||||
- Limited documentation for some private methods
|
||||
- Heavy use of instance checking rather than polymorphism
|
||||
- Some recursive methods lack depth limits for safety
|
||||
|
||||
## Parsing Workflow
|
||||
|
||||
The parsing process follows these key stages:
|
||||
|
||||
1. **HTML Parsing**: Uses BeautifulSoup to parse HTML into a DOM tree, handling malformed HTML and extracting the document root. (Implemented in `HtmlDocument.get_root()`)
|
||||
|
||||
2. **Node Creation**: Traverses the DOM tree, creating appropriate node objects based on element type, text content, and styling. (Implemented in `SECHTMLParser._process_element()` and helper methods)
|
||||
|
||||
3. **Inline XBRL Processing**: Tracks and processes inline XBRL tags, preserving metadata for fact extraction and financial data processing. (Implemented in `IXTagTracker` class methods)
|
||||
|
||||
4. **Style Analysis**: Analyzes CSS styles and element semantics to determine document structure, headings, and text formatting. (Implemented in `parse_style()` and `get_heading_level()`)
|
||||
|
||||
5. **Table Processing**: Processes HTML tables into structured TableNode objects with proper cell span handling and column alignment. (Implemented in `SECHTMLParser._process_table()`)
|
||||
|
||||
6. **Node Merging**: Merges adjacent text nodes with compatible styling to create a more concise document structure. (Implemented in `SECHTMLParser._merge_adjacent_nodes()`)
|
||||
|
||||
## Document.parse() Method Analysis
|
||||
|
||||
```python
|
||||
@classmethod
|
||||
def parse(cls, html: str) -> Optional['Document']:
|
||||
root = HtmlDocument.get_root(html)
|
||||
if root:
|
||||
parser = SECHTMLParser(root)
|
||||
return parser.parse()
|
||||
```
|
||||
|
||||
### Method Characteristics
|
||||
- **Cyclomatic Complexity**: Low (2)
|
||||
- **Lines of Code**: 5
|
||||
- **Dependencies**: `HtmlDocument`, `SECHTMLParser`
|
||||
|
||||
### Method Flow
|
||||
1. Get document root using `HtmlDocument.get_root()`
|
||||
2. Create `SECHTMLParser` instance with root
|
||||
3. Call `parser.parse()` to create node structure
|
||||
4. Return `Document` instance with parsed nodes
|
||||
|
||||
### Edge Cases Handled
|
||||
- Returns None if document root cannot be found
|
||||
- Properly handles malformed HTML through BeautifulSoup
|
||||
|
||||
### Suggestions
|
||||
- Add error handling for parser.parse() failures
|
||||
- Consider adding optional caching for parsed documents
|
||||
- Add metadata extraction to the parse method signature
|
||||
|
||||
## Node Hierarchy Analysis
|
||||
|
||||
### BaseNode
|
||||
- Abstract base class for all document nodes
|
||||
- Key methods: `render()`, `type` property, metadata management
|
||||
- Good extensibility through ABC pattern
|
||||
|
||||
### HeadingNode
|
||||
- Represents section headings with level-based styling
|
||||
- Strengths:
|
||||
- Level-aware rendering with appropriate visual hierarchy
|
||||
- Comprehensive styling based on heading importance
|
||||
- Good metadata support for semantic information
|
||||
|
||||
### TextBlockNode
|
||||
- Represents paragraphs and formatted text content
|
||||
- Strengths:
|
||||
- Sophisticated text wrapping algorithm
|
||||
- Alignment and style preservation
|
||||
- Efficient handling of long text blocks
|
||||
- Improvements:
|
||||
- Could benefit from more advanced text styling capabilities
|
||||
- Limited support for lists and nested formatting
|
||||
|
||||
### TableNode
|
||||
- Represents tabular data with advanced processing
|
||||
- Strengths:
|
||||
- Sophisticated table processing with TableProcessor
|
||||
- Support for complex cell structures with colspan/rowspan
|
||||
- Intelligent column alignment detection
|
||||
- Efficient caching of processed tables
|
||||
- Improvements:
|
||||
- Limited support for nested tables
|
||||
- No handling for table captions or footer rows
|
||||
|
||||
## Style Processing Analysis
|
||||
|
||||
Style processing is a crucial component that determines document structure and formatting. It handles inheritance, merging, and semantic interpretation.
|
||||
|
||||
### Key Components
|
||||
|
||||
1. **StyleInfo**
|
||||
- Dataclass representing CSS properties with proper unit handling
|
||||
- Style inheritance through the merge method
|
||||
|
||||
2. **parse_style**
|
||||
- Parses inline CSS styles into StyleInfo objects
|
||||
- Handles units, validation, and fallback to standard values
|
||||
|
||||
3. **get_heading_level**
|
||||
- Uses sophisticated heuristics to determine heading levels
|
||||
- Based on style, content, and document context
|
||||
|
||||
### Strengths
|
||||
- Unit-aware style processing with proper conversions
|
||||
- Sophisticated heading detection with multi-factor analysis
|
||||
- Context-sensitive style inheritance model
|
||||
|
||||
### Improvements
|
||||
- Duplicate style logic between files could be consolidated
|
||||
- Limited support for advanced CSS features like flexbox
|
||||
- No caching for repeated style parsing of identical styles
|
||||
|
||||
## Inline XBRL Handling
|
||||
|
||||
The `IXTagTracker` provides tracking and processing of inline XBRL tags, preserving metadata for financial data extraction.
|
||||
|
||||
### Key Features
|
||||
- Tracks nested ix: tags and their attributes
|
||||
- Handles continuation tags for fragmented XBRL facts
|
||||
- Preserves context references for financial data analysis
|
||||
|
||||
### Integration Points
|
||||
- Called during element processing in SECHTMLParser
|
||||
- Metadata stored in node.metadata for downstream processing
|
||||
|
||||
### Improvements
|
||||
- Limited documentation of XBRL namespaces and tag semantics
|
||||
- No validation of XBRL context references
|
||||
- Could benefit from performance optimization for large documents
|
||||
|
||||
## Technical Debt
|
||||
|
||||
### Code Complexity
|
||||
1. **SECHTMLParser._process_element**
|
||||
- High cyclomatic complexity with nested conditions
|
||||
- Suggestion: Refactor into smaller, focused methods with clear single responsibilities
|
||||
|
||||
2. **SECHTMLParser._process_table**
|
||||
- Complex table cell processing with tight coupling
|
||||
- Suggestion: Extract cell processing to a dedicated class with clear interface
|
||||
|
||||
### Duplication
|
||||
1. **Style parsing logic**
|
||||
- Similar parsing logic in multiple files
|
||||
- Suggestion: Consolidate style parsing into a unified module
|
||||
|
||||
2. **Text normalization**
|
||||
- Multiple text normalization methods with similar functionality
|
||||
- Suggestion: Create a TextNormalizer utility class
|
||||
|
||||
### Performance
|
||||
1. **Deep recursion**
|
||||
- Recursive element processing without depth limits
|
||||
- Suggestion: Add depth tracking and limits to prevent stack overflows
|
||||
|
||||
2. **Repeated style parsing**
|
||||
- No caching for repeated style parsing
|
||||
- Suggestion: Implement LRU cache for parsed styles by element ID
|
||||
|
||||
## Recommendations
|
||||
|
||||
### Architecture
|
||||
1. Formalize node visitor pattern for operations on document structure
|
||||
2. Create dedicated NodeFactory class to encapsulate node creation logic
|
||||
3. Consider splitting large parser class into specialized parsers by content type
|
||||
|
||||
### Code Quality
|
||||
1. Refactor complex methods into smaller, focused functions
|
||||
2. Add comprehensive docstrings to all public methods
|
||||
3. Add type guards for complex type unions
|
||||
|
||||
### Performance
|
||||
1. Implement strategic caching for style parsing and heading detection
|
||||
2. Add depth limits to recursive methods
|
||||
3. Consider lazy parsing for large sections like tables
|
||||
|
||||
### Testing
|
||||
1. Add property-based testing for style inheritance
|
||||
2. Create test fixtures for complex document structures
|
||||
3. Add performance benchmarks for parsing large documents
|
||||
|
||||
## Usage Examples
|
||||
|
||||
### Basic Document Parsing
|
||||
```python
|
||||
import requests
|
||||
from edgar.files.html import Document
|
||||
|
||||
# Get HTML content from a filing
|
||||
html_content = requests.get("https://www.sec.gov/filing/example", headers={"User-Agent": "Your Name you@example.com"}).text  # SEC EDGAR rejects requests without a declared User-Agent
|
||||
|
||||
# Parse into document structure
|
||||
document = Document.parse(html_content)
|
||||
|
||||
# Access document nodes
|
||||
for node in document.nodes:
|
||||
print(f"Node type: {node.type}")
|
||||
if node.type == 'heading':
|
||||
print(f"Heading: {node.content}")
|
||||
```
|
||||
|
||||
### Extracting Tables from a Document
|
||||
```python
|
||||
from edgar.files.html import Document
|
||||
import pandas as pd
|
||||
|
||||
document = Document.parse(html_content)
|
||||
|
||||
# Extract all tables
|
||||
tables = document.tables
|
||||
|
||||
# Convert to pandas DataFrames for analysis
|
||||
dataframes = []
|
||||
for table_node in tables:
|
||||
# Access the processed table
|
||||
processed = table_node._processed
|
||||
if processed:
|
||||
# Create DataFrame with headers and data
|
||||
df = pd.DataFrame(processed.data_rows, columns=processed.headers)
|
||||
dataframes.append(df)
|
||||
```
|
||||
|
||||
### Converting Document to Markdown
|
||||
```python
|
||||
from edgar.files.html import Document
|
||||
|
||||
document = Document.parse(html_content)
|
||||
|
||||
# Convert to markdown
|
||||
markdown_text = document.to_markdown()
|
||||
|
||||
# Save to file
|
||||
with open("filing.md", "w") as f:
|
||||
f.write(markdown_text)
|
||||
```
|
||||
|
||||
### Accessing XBRL Data in Document Nodes
|
||||
```python
|
||||
from edgar.files.html import Document
|
||||
|
||||
document = Document.parse(html_content)
|
||||
|
||||
# Find nodes with XBRL facts
|
||||
xbrl_facts = []
|
||||
for node in document.nodes:
|
||||
if 'ix_tag' in node.metadata and 'ix_context' in node.metadata:
|
||||
xbrl_facts.append({
|
||||
'concept': node.metadata['ix_tag'],
|
||||
'context': node.metadata['ix_context'],
|
||||
'value': node.content,
|
||||
})
|
||||
|
||||
# Process extracted facts
|
||||
for fact in xbrl_facts:
|
||||
print(f"{fact['concept']}: {fact['value']}")
|
||||
```
|
||||
|
||||
## Known Issues and Limitations
|
||||
|
||||
### Heading Detection Issues
|
||||
|
||||
During testing, we discovered that headings in some filings (such as Oracle 10-K) are not properly detected by the underlying Document class, which prevents proper item identification. This is a critical issue that needs addressing in the implementation.
|
||||
|
||||
Potential causes:
|
||||
- Heading detection in the Document class may be too strict
|
||||
- Some filings use non-standard formatting for headings
|
||||
- Style inheritance might not be working correctly
|
||||
- Heading level determination may not account for all possible cases
|
||||
|
||||
Possible solutions:
|
||||
1. Add a fallback mechanism that uses regex-based item detection when structural detection fails
|
||||
2. Implement a hybrid approach that combines structural and textual analysis
|
||||
3. Create specialized detectors for specific filing types that account for their unique structures
|
||||
4. Add more signals to the heading detection (e.g., positional info, surrounding context)
|
||||
|
||||
**Priority:** High - This issue directly impacts the core functionality of extracting items from filings.
|
||||
|
||||
## Performance Considerations
|
||||
|
||||
### Parsing Performance
|
||||
#### Bottlenecks
|
||||
- BeautifulSoup HTML parsing for large documents
|
||||
- Recursive DOM traversal with style inheritance computation
|
||||
- Complex table processing with layout analysis
|
||||
- Text normalization and whitespace handling
|
||||
|
||||
#### Optimization Opportunities
|
||||
- Add caching for parsed styles and computed node properties
|
||||
- Implement lazy parsing for complex structures like tables
|
||||
- Add document sectioning for parallel processing
|
||||
- Optimize text handling for large text blocks
|
||||
|
||||
#### Memory Considerations
|
||||
- Document representation can be memory-intensive for large filings
|
||||
- Caching parsed tables can increase memory usage
|
||||
- Consider streaming processing for very large documents
|
||||
|
||||
### Rendering Performance
|
||||
#### Considerations
|
||||
- Rich rendering is computation-intensive for large documents
|
||||
- Table rendering with column optimization is particularly expensive
|
||||
- Consider incremental or paginated rendering for large documents
|
||||
|
||||
#### Optimizations
|
||||
- Implement view windowing for large documents
|
||||
- Add caching for rendered nodes
|
||||
- Consider asynchronous rendering for complex structures
|
||||
@@ -0,0 +1,923 @@
|
||||
"""
|
||||
Enhanced SEC filing document representation with structured item extraction.
|
||||
|
||||
This module provides a high-level document class specialized for SEC filings, with
|
||||
rich support for extracting items, tables, and table of contents.
|
||||
"""
|
||||
|
||||
import re
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, Dict, Iterator, List, Optional, Pattern
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from edgar.files.html import BaseNode, Document, HeadingNode, TableNode
|
||||
|
||||
|
||||
class Table:
|
||||
"""Rich representation of a table in a document."""
|
||||
|
||||
def __init__(self, table_node: TableNode):
|
||||
self._node = table_node
|
||||
self._processed = None # Lazy-loaded processed table
|
||||
|
||||
@property
|
||||
def rows(self) -> int:
|
||||
"""Get the number of rows in the table."""
|
||||
processed = self._get_processed()
|
||||
if processed is None:
|
||||
return 0
|
||||
|
||||
# Count header row if present plus data rows
|
||||
has_header = processed.headers is not None and len(processed.headers) > 0
|
||||
return len(processed.data_rows) + (1 if has_header else 0)
|
||||
|
||||
@property
|
||||
def columns(self) -> int:
|
||||
"""Get the number of columns in the table."""
|
||||
processed = self._get_processed()
|
||||
if processed is None:
|
||||
return 0
|
||||
|
||||
# Use headers if available, otherwise first data row
|
||||
if processed.headers and len(processed.headers) > 0:
|
||||
return len(processed.headers)
|
||||
elif processed.data_rows and len(processed.data_rows) > 0:
|
||||
return len(processed.data_rows[0])
|
||||
return 0
|
||||
|
||||
def _get_processed(self):
|
||||
"""Get or create the processed table."""
|
||||
if self._processed is None:
|
||||
if hasattr(self._node, '_processed'):
|
||||
self._processed = self._node._processed
|
||||
# Handle case where node doesn't have processed table yet
|
||||
if self._processed is None and hasattr(self._node, '_get_processed'):
|
||||
# Call node's processing method if available
|
||||
self._processed = self._node._get_processed()
|
||||
return self._processed
|
||||
|
||||
def to_dataframe(self) -> pd.DataFrame:
|
||||
"""Convert this table to a pandas DataFrame."""
|
||||
processed = self._get_processed()
|
||||
if processed and processed.headers and processed.data_rows:
|
||||
# Create DataFrame with proper headers and data
|
||||
return pd.DataFrame(processed.data_rows, columns=processed.headers)
|
||||
elif processed and processed.data_rows:
|
||||
# No headers, use numeric column names
|
||||
return pd.DataFrame(processed.data_rows)
|
||||
return pd.DataFrame()
|
||||
|
||||
def to_markdown(self) -> str:
|
||||
"""Convert this table to markdown format."""
|
||||
df = self.to_dataframe()
|
||||
if not df.empty:
|
||||
return df.to_markdown()
|
||||
return ""
|
||||
|
||||
def get_cell(self, row: int, col: int) -> str:
|
||||
"""Get the content of a specific cell."""
|
||||
processed = self._get_processed()
|
||||
if processed is None:
|
||||
return ""
|
||||
|
||||
# Handle header row (row 0)
|
||||
if row == 0 and processed.headers and col < len(processed.headers):
|
||||
return processed.headers[col]
|
||||
|
||||
# Adjust row index if we have headers (data rows start at index 1)
|
||||
data_row_idx = row if processed.headers is None else row - 1
|
||||
|
||||
# Get data from data rows
|
||||
if processed.data_rows and 0 <= data_row_idx < len(processed.data_rows):
|
||||
data_row = processed.data_rows[data_row_idx]
|
||||
if 0 <= col < len(data_row):
|
||||
return data_row[col]
|
||||
|
||||
return ""
|
||||
|
||||
def contains(self, text: str) -> bool:
|
||||
"""Check if the table contains the specified text."""
|
||||
processed = self._get_processed()
|
||||
if not processed:
|
||||
return False
|
||||
|
||||
# Check headers
|
||||
if processed.headers and any(text.lower() in str(header).lower() for header in processed.headers):
|
||||
return True
|
||||
|
||||
# Check data rows
|
||||
for row in processed.data_rows:
|
||||
if any(text.lower() in str(cell).lower() for cell in row):
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
def __str__(self) -> str:
    """The markdown rendering doubles as the table's string form."""
    markdown = self.to_markdown()
    return markdown
|
||||
|
||||
def __repr__(self) -> str:
    """Concise debug form, e.g. ``Table(4×3)``."""
    return "Table({}×{})".format(self.rows, self.columns)
|
||||
|
||||
|
||||
@dataclass
class TocEntry:
    """A single line of a filing's table of contents."""

    # Display text of the entry.
    text: str
    # Nesting depth; 1 is top level.
    level: int
    # Printed page number, when one could be parsed.
    page: Optional[int] = None
    # Item reference (e.g. "Item 1A"), if the entry names one.
    reference: Optional[str] = None

    def __repr__(self) -> str:
        return "TocEntry('{}', level={}, page={})".format(self.text, self.level, self.page)
|
||||
|
||||
|
||||
class TableOfContents:
    """Table of contents extracted from a document.

    Holds an ordered list of :class:`TocEntry` objects parsed from the
    document's TOC section.
    """

    def __init__(self, entries: List[TocEntry]):
        self.entries = entries

    @classmethod
    def extract(cls, document: Document) -> 'TableOfContents':
        """Extract the table of contents from `document`.

        Finds the TOC heading, gathers the nodes belonging to the TOC
        section, parses them into entries, and links entries to the item
        headings that actually appear in the document. Returns an empty
        TableOfContents when no TOC section can be located.
        """
        # Find TOC section (usually at the beginning)
        toc_node_index = cls._find_toc_section(document)
        if toc_node_index is None:
            return cls([])

        # Get nodes after the TOC heading until the next major heading
        toc_nodes = cls._get_toc_nodes(document, toc_node_index)

        # Process nodes to extract entries
        entries = cls._process_toc_nodes(toc_nodes)

        # Match entries to actual items
        cls._match_entries_to_items(entries, document)

        return cls(entries)

    @staticmethod
    def _find_toc_section(document: Document) -> Optional[int]:
        """Return the node index of the TOC heading, or None if absent."""
        # Look for "Table of Contents" first; bare "Contents" is a looser
        # fallback tried second.
        toc_patterns = [
            re.compile(r'table\s+of\s+contents', re.IGNORECASE),
            re.compile(r'contents', re.IGNORECASE)
        ]

        for i, node in enumerate(document.nodes):
            if node.type == 'heading':
                for pattern in toc_patterns:
                    if pattern.search(node.content):
                        return i
        return None

    @staticmethod
    def _get_toc_nodes(document: Document, start_index: int) -> List[BaseNode]:
        """Return the nodes belonging to the TOC section.

        Collects every node after the TOC heading up to (but excluding) the
        next heading of the same or higher level.
        """
        nodes = []
        toc_heading = document.nodes[start_index]
        # Headings without an explicit level are treated as level 1.
        heading_level = toc_heading.level if hasattr(toc_heading, 'level') else 1

        for i in range(start_index + 1, len(document.nodes)):
            node = document.nodes[i]
            if node.type == 'heading' and hasattr(node, 'level') and node.level <= heading_level:
                break
            nodes.append(node)

        return nodes

    @staticmethod
    def _process_toc_nodes(nodes: List[BaseNode]) -> List[TocEntry]:
        """Parse TOC nodes into TocEntry objects.

        Handles two layouts: plain text blocks (one entry per line) and
        tables (one entry per row; column 0 is the text, column 1 may hold
        the page number).
        """
        entries = []

        # Patterns for detecting TOC entries
        item_pattern = re.compile(r'(item\s+\d+[A-Za-z]?)', re.IGNORECASE)
        page_pattern = re.compile(r'(\d+)$')

        for node in nodes:
            if node.type == 'text_block':
                # Process each line in the text block
                for raw_line in node.content.splitlines():
                    # Measure indentation BEFORE stripping. The previous
                    # implementation stripped the line first, which zeroed
                    # the leading-space count and flattened every entry to
                    # level 1.
                    leading_spaces = len(raw_line) - len(raw_line.lstrip())
                    level = leading_spaces // 2 + 1  # Rough estimate of level

                    line = raw_line.strip()
                    if not line:
                        continue

                    # Extract page number if present
                    page_match = page_pattern.search(line)
                    page = int(page_match.group(1)) if page_match else None

                    # Clean the text: drop the trailing page number
                    text = line
                    if page_match:
                        text = line[:page_match.start()].strip()

                    # Check for Item reference
                    item_match = item_pattern.search(text)
                    reference = item_match.group(1) if item_match else None

                    entries.append(TocEntry(text, level, page, reference))

            elif node.type == 'table':
                # Process table rows as TOC entries
                table = Table(node)
                df = table.to_dataframe()

                if not df.empty:
                    for _, row in df.iterrows():
                        if len(row) >= 2:  # Assume col 0 is text, col 1 might be page
                            # Keep the raw cell text so indentation can be
                            # measured before stripping (same fix as above).
                            raw_text = str(row[0])
                            text = raw_text.strip()
                            if not text:
                                continue

                            # Try to extract page number
                            page = None
                            if len(row) > 1:
                                try:
                                    page = int(row[1])
                                except (ValueError, TypeError):
                                    pass

                            # Extract level from the raw cell's indentation
                            level = 1  # Default level
                            leading_spaces = len(raw_text) - len(raw_text.lstrip())
                            if leading_spaces > 0:
                                level = leading_spaces // 2 + 1

                            # Check for Item reference
                            item_match = item_pattern.search(text)
                            reference = item_match.group(1) if item_match else None

                            entries.append(TocEntry(text, level, page, reference))

        return entries

    @staticmethod
    def _match_entries_to_items(entries: List[TocEntry], document: Document) -> None:
        """Normalize each entry's reference against real item headings.

        Mutates `entries` in place: an entry whose reference matches an
        actual item heading in the document has its reference upper-cased
        to the canonical key.
        """
        # Create a dictionary of potential item headings in the document
        item_headings = {}
        item_pattern = re.compile(r'(item\s+\d+[A-Za-z]?)', re.IGNORECASE)

        for i, node in enumerate(document.nodes):
            if node.type == 'heading':
                match = item_pattern.search(node.content)
                if match:
                    item_key = match.group(1).upper()
                    item_headings[item_key] = i

        # Match entries to items
        for entry in entries:
            if entry.reference:
                item_key = entry.reference.upper()
                if item_key in item_headings:
                    entry.reference = item_key

    def find(self, text: str) -> Optional[TocEntry]:
        """Return the first entry whose text contains `text` (case-insensitive)."""
        text = text.lower()
        for entry in self.entries:
            if text in entry.text.lower():
                return entry
        return None

    def __iter__(self) -> Iterator[TocEntry]:
        return iter(self.entries)

    def __len__(self) -> int:
        return len(self.entries)
|
||||
|
||||
|
||||
class Item:
    """Represents a logical item in an SEC filing.

    An item is a heading node (e.g. "Item 1A. Risk Factors") plus the run
    of content nodes that follow it, up to the next item heading.
    """

    def __init__(self,
                 name: str,
                 heading_node: Optional[HeadingNode],
                 content_nodes: List[BaseNode],
                 metadata: Optional[Dict[str, Any]] = None):
        """Create an item.

        Args:
            name: Canonical item key, e.g. "ITEM 1A".
            heading_node: The heading that introduced the item (may be None
                for synthesized items).
            content_nodes: Nodes belonging to the item's body.
            metadata: Optional extra attributes; defaults to an empty dict.
        """
        self.name = name
        self.heading_node = heading_node
        self.content_nodes = content_nodes
        # `metadata or {}` avoids sharing a mutable default across instances.
        self.metadata = metadata or {}

    @property
    def title(self) -> str:
        """Get the title of this item (heading text minus the item number)."""
        if self.heading_node:
            # Extract title by removing the item number
            item_pattern = re.compile(r'^item\s+\d+[A-Za-z]?\.?\s*', re.IGNORECASE)
            return item_pattern.sub('', self.heading_node.content).strip()
        return ""

    @property
    def text(self) -> str:
        """Get the text content of this item.

        String contents are used verbatim; list contents (tables) and
        content-less nodes fall back to their str() rendering.
        """
        parts = []
        for node in self.content_nodes:
            if hasattr(node, 'content'):
                if isinstance(node.content, str):
                    parts.append(node.content)
                elif isinstance(node.content, list):
                    # Handle list content (likely a table) via the node's
                    # own string rendering
                    parts.append(str(node))
                else:
                    parts.append(str(node.content))
            else:
                parts.append(str(node))
        return "\n".join(parts)

    @property
    def tables(self) -> List[Table]:
        """Get all tables within this item (wrapped in Table accessors)."""
        return [
            Table(node) for node in self.content_nodes
            if node.type == 'table'
        ]

    def get_table(self, index: int) -> Optional[Table]:
        """Get a specific table by index, or None when out of range."""
        tables = self.tables
        return tables[index] if 0 <= index < len(tables) else None

    def find_tables(self, pattern: str) -> List[Table]:
        """Find tables containing the specified text pattern (substring match)."""
        tables = []
        for table in self.tables:
            if table.contains(pattern):
                tables.append(table)
        return tables

    def get_subsections(self) -> List['Item']:
        """Extract nested subsections within this item.

        A subsection is any heading inside the content whose level is deeper
        than this item's heading; its body runs until the next such heading.
        """
        subsections = []

        # Find heading nodes with higher level than the main item heading.
        # NOTE(review): assumes every 'heading' content node has a `level`
        # attribute — confirm against the node model.
        item_level = self.heading_node.level if self.heading_node else 0

        # Find all subsection headings
        subsection_indices = []
        for i, node in enumerate(self.content_nodes):
            if node.type == 'heading' and node.level > item_level:
                subsection_indices.append((i, node))

        # Create subsections: each runs from after its heading to the next
        # subsection heading (or the end of this item's content).
        for i, (idx, heading) in enumerate(subsection_indices):
            next_idx = subsection_indices[i+1][0] if i+1 < len(subsection_indices) else len(self.content_nodes)
            subsection_content = self.content_nodes[idx+1:next_idx]

            # Create an item for this subsection
            subsection = Item(
                name=heading.content,
                heading_node=heading,
                content_nodes=subsection_content
            )
            subsections.append(subsection)

        return subsections

    def to_markdown(self) -> str:
        """Convert this item to markdown format."""
        parts = []

        # Add heading
        if self.heading_node:
            parts.append(f"# {self.heading_node.content}\n")

        # Process content nodes
        for node in self.content_nodes:
            if node.type == 'heading':
                # Add appropriate heading level, offset by one because the
                # item heading itself is rendered as level 1
                level = min(node.level + 1, 6)  # Ensure we don't exceed markdown's 6 levels
                parts.append(f"{'#' * level} {node.content}\n")

            elif node.type == 'text_block':
                parts.append(f"{node.content}\n\n")

            elif node.type == 'table':
                table = Table(node)
                parts.append(f"{table.to_markdown()}\n\n")

        return "\n".join(parts)

    def to_html(self) -> str:
        """Convert this item to HTML format.

        NOTE(review): node content is interpolated without HTML escaping —
        presumably the content is already HTML-safe; verify upstream.
        """
        parts = []

        # Add heading
        if self.heading_node:
            parts.append(f"<h1>{self.heading_node.content}</h1>")

        # Process content nodes
        for node in self.content_nodes:
            if node.type == 'heading':
                # Add appropriate heading level
                level = min(node.level + 1, 6)  # Ensure we don't exceed HTML's 6 levels
                parts.append(f"<h{level}>{node.content}</h{level}>")

            elif node.type == 'text_block':
                # Each non-blank line becomes its own paragraph
                lines = node.content.split('\n')
                paragraphs = [f"<p>{line}</p>" for line in lines if line.strip()]
                parts.append("\n".join(paragraphs))

            elif node.type == 'table':
                # Convert the table to HTML via its DataFrame form
                table = Table(node)
                df = table.to_dataframe()
                parts.append(df.to_html(index=False))

        return "\n".join(parts)

    def to_dict(self) -> Dict[str, Any]:
        """Convert this item to a dictionary (name, title, text, metadata)."""
        return {
            'name': self.name,
            'title': self.title,
            'text': self.text,
            'metadata': self.metadata
        }

    def __str__(self) -> str:
        return self.text

    def __repr__(self) -> str:
        return f"Item('{self.name}', title='{self.title}')"
|
||||
|
||||
|
||||
class ItemCollection:
    """Collection of items in a document with convenient access methods.

    Supports flexible, case-insensitive lookup ("Item 1", "ITEM 1.", "1")
    and iteration in natural item order.
    """

    # Captures the numeric part and optional letter suffix of an item name,
    # e.g. "ITEM 1A" -> ("1", "A"); used to order items naturally.
    _ITEM_NUM_RE = re.compile(r'(\d+)([A-Z]*)')

    def __init__(self, items: Dict[str, Item]):
        self._items = items

    def __getitem__(self, key: str) -> Item:
        """Get an item by name, with flexible matching.

        Matching is tried in order of strictness: exact (case-insensitive),
        trailing-period-insensitive, whitespace/period-insensitive, and
        finally partial numeric match (e.g. "1" matches "ITEM 1").

        Raises:
            KeyError: If no stored item matches the key.
        """
        # Case-insensitive lookup
        key = key.strip().upper()

        # Direct lookup
        if key in self._items:
            return self._items[key]

        # Remove any trailing periods for matching
        clean_key = key.rstrip('.')
        if clean_key in self._items:
            return self._items[clean_key]

        # Normalize for comparison (remove spaces and periods)
        normalized_key = re.sub(r'[.\s]', '', key)

        # Try to match normalized keys
        for item_key in self._items:
            normalized_item_key = re.sub(r'[.\s]', '', item_key)
            if normalized_key == normalized_item_key:
                return self._items[item_key]

        # Partial match (e.g. "1" matches "ITEM 1") — only attempted for
        # keys that start with a digit, to avoid spurious substring hits
        if normalized_key.isdigit() or (len(normalized_key) > 1 and normalized_key[0].isdigit()):
            for item_key in self._items:
                normalized_item_key = re.sub(r'[.\s]', '', item_key)
                if normalized_key in normalized_item_key:
                    return self._items[item_key]

        raise KeyError(f"Item '{key}' not found")

    def __contains__(self, key: str) -> bool:
        """Check if an item exists (same flexible matching as indexing)."""
        try:
            self[key]
            return True
        except KeyError:
            return False

    @classmethod
    def _sort_key(cls, name: str):
        """Natural ordering key so ITEM 2 sorts before ITEM 10, and 1 before 1A."""
        match = cls._ITEM_NUM_RE.search(name.upper())
        if match:
            return (0, int(match.group(1)), match.group(2), name)
        # Names without a number (e.g. "SIGNATURES") sort after numbered items.
        return (1, 0, '', name)

    def __iter__(self) -> Iterator[Item]:
        """Iterate through items in natural item order.

        Bug fix: plain lexicographic sorting put "ITEM 10" before "ITEM 2";
        sorting on the parsed item number yields the filing's real sequence.
        """
        for key in sorted(self._items.keys(), key=self._sort_key):
            yield self._items[key]

    def __len__(self) -> int:
        """Get the number of items."""
        return len(self._items)

    def list(self) -> List[str]:
        """Get a list of item names, in the same natural order as iteration."""
        return sorted(self._items.keys(), key=self._sort_key)
|
||||
|
||||
|
||||
class DocumentIndex:
    """Index of document structure for efficient lookups.

    Built once per document: records the node position of every heading and
    materializes an Item object for every detected item section.
    """

    def __init__(self):
        self._headings = {}  # Map of heading text to node index
        self._items = {}  # Map of item name to Item object

    @classmethod
    def build(cls, document: Document, filing_type: Optional[str] = None) -> 'DocumentIndex':
        """Build an index from a document.

        Args:
            document: Parsed document whose nodes are scanned.
            filing_type: SEC form type (e.g. "10-K") used to select the item
                regex; None falls back to the most permissive pattern.
        """
        index = cls()
        index._build_heading_index(document)
        index._build_item_index(document, filing_type)
        return index

    def _build_heading_index(self, document: Document) -> None:
        """Build an index of all headings in the document.

        Later duplicates overwrite earlier ones, so each heading text maps
        to its last occurrence.
        """
        for i, node in enumerate(document.nodes):
            if node.type == 'heading':
                self._headings[node.content] = i

    def _build_item_index(self, document: Document, filing_type: Optional[str] = None) -> None:
        """Build an index of items in the document.

        Primary strategy: match the filing-type item regex against heading
        nodes. If no heading matches, fall back to heuristic detection.
        Each item's content spans from just after its heading to the next
        item heading (or end of document).
        """
        # Get appropriate item pattern based on filing type
        item_pattern = self._get_item_pattern(filing_type)

        # Find all item headings as (item_name, node_index, heading_node)
        item_headings = []
        for i, node in enumerate(document.nodes):
            if node.type == 'heading':
                match = item_pattern.search(node.content)
                if match:
                    item_name = match.group(1).strip().upper()
                    item_headings.append((item_name, i, node))

        # If no heading-based items found, use fallback text-based detection
        if not item_headings:
            item_headings = self._fallback_item_detection(document, item_pattern)

        # Sort by position in document
        item_headings.sort(key=lambda x: x[1])

        # Create items
        for i, (item_name, node_idx, heading_node) in enumerate(item_headings):
            # Content runs from after this heading up to the next item heading
            start_idx = node_idx + 1
            end_idx = (item_headings[i+1][1]
                       if i+1 < len(item_headings) else len(document.nodes))
            content_nodes = document.nodes[start_idx:end_idx]

            # Create item (duplicate names overwrite earlier occurrences)
            self._items[item_name] = Item(item_name, heading_node, content_nodes)

    def _fallback_item_detection(self, document: Document, item_pattern: re.Pattern) -> list:
        """
        Fallback item detection when heading-based detection fails.

        Uses text content and positional analysis to identify items.
        Returns a list of (item_name, node_index, heading_node) tuples.
        Strategies are tried in order; the first that yields results wins:
        1. TOC-table detection (Oracle-style 10-K filings)
        2. isolated item cells inside tables
        3. text blocks whose first line looks like an item heading
        4. last resort: any short line mentioning an item pattern
        """
        from edgar.files.html import HeadingNode
        from edgar.files.styles import StyleInfo

        # Create reusable heading nodes
        def create_heading_node(content, level=2):
            return HeadingNode(
                content=content,
                style=StyleInfo(font_weight='bold'),  # minimal required style
                level=level,
                metadata={}
            )

        item_headings = []

        # Step 1: Oracle-specific table-based TOC detection (handles Oracle 10-K format)
        # First check for a table that contains item patterns and looks like a TOC
        table_nodes = [node for node in document.nodes if node.type == 'table']

        # Create a map of item references to detect in content
        item_references = {}
        toc_table_idx = None

        # First pass: find the table of contents and extract item references
        for table_idx, node in enumerate(table_nodes):
            toc_candidate = False
            item_to_content_map = {}

            # Check if this looks like a TOC table
            if hasattr(node, 'content') and isinstance(node.content, list):
                rows = node.content

                # Process each row to find item patterns
                for row_idx, row in enumerate(rows):
                    if not hasattr(row, 'cells'):
                        continue

                    # Check if this row contains an item pattern
                    for cell_idx, cell in enumerate(row.cells):
                        cell_content = cell.content if hasattr(cell, 'content') else ""
                        if not isinstance(cell_content, str):
                            continue

                        # Look for item pattern in this cell
                        match = item_pattern.search(cell_content)
                        if match:
                            toc_candidate = True
                            item_name = match.group(1).strip().upper()

                            # Extract title - could be in same cell after item name or in next cell
                            title = ""
                            # First look in the same cell after the item name
                            remaining_content = cell_content[match.end():].strip()
                            if remaining_content:
                                title = remaining_content
                            # If no title found in same cell, check next cell
                            elif cell_idx + 1 < len(row.cells):
                                next_cell = row.cells[cell_idx + 1]
                                next_content = next_cell.content if hasattr(next_cell, 'content') else ""
                                if isinstance(next_content, str):
                                    title = next_content.strip()

                            # Look for page number or anchor reference in later cells
                            ref = None
                            if cell_idx + 2 < len(row.cells):
                                ref_cell = row.cells[cell_idx + 2]
                                ref_content = ref_cell.content if hasattr(ref_cell, 'content') else ""
                                if isinstance(ref_content, str) and ref_content.strip():
                                    ref = ref_content.strip()

                            # Store item details with full context
                            item_to_content_map[item_name] = {
                                'title': title,
                                'reference': ref,
                                'row_idx': row_idx
                            }

                            # Add to global item references
                            item_references[item_name] = {
                                'title': title,
                                'reference': ref,
                                'found': False  # Will be set to True when we find the content
                            }

            # If this table is a TOC candidate with multiple items, remember it
            if toc_candidate and len(item_to_content_map) >= 2:
                toc_table_idx = table_idx

        # Second pass: if we found a TOC table, look for items in the document
        if item_references:
            # Look for anchor IDs that match item references
            # NOTE(review): anchor_nodes is collected but never read below —
            # dead code kept for a planned anchor-based lookup?
            anchor_nodes = {}
            for i, node in enumerate(document.nodes):
                # Check for id attribute that might be a target for TOC links
                if hasattr(node, 'attrs') and node.attrs.get('id'):
                    anchor_id = node.attrs.get('id')
                    anchor_nodes[anchor_id] = i

            # Look for nodes that might contain items
            for i, node in enumerate(document.nodes):
                # Skip nodes before the TOC table if we found one
                if toc_table_idx is not None and i <= toc_table_idx:
                    continue

                # Get node content
                if not hasattr(node, 'content'):
                    continue

                node_content = node.content
                if not isinstance(node_content, str):
                    continue

                # Check for each item in our reference map
                for item_name, item_info in item_references.items():
                    if item_info['found']:
                        continue  # Already found this item

                    # Method 1: Look for exact item pattern at start of text
                    if node_content.strip().upper().startswith(item_name):
                        # Found item directly
                        content = f"{item_name} {item_info['title']}".strip()
                        heading_node = create_heading_node(content)
                        item_headings.append((item_name, i, heading_node))
                        item_references[item_name]['found'] = True
                        break

                    # Method 2: Look for the title text if we have it
                    if item_info['title'] and item_info['title'].strip():
                        # This can have false positives, so make sure it's a good match
                        title = item_info['title'].strip()
                        # Check if the title appears together with the item name
                        # (note: `and` binds tighter than `or` in this test)
                        if (f"{item_name} {title}".upper() in node_content.upper() or
                            title.upper() in node_content.upper() and
                            "ITEM" in node_content.upper()):

                            content = f"{item_name} {title}".strip()
                            heading_node = create_heading_node(content)
                            item_headings.append((item_name, i, heading_node))
                            item_references[item_name]['found'] = True
                            break

        # If we found items from the TOC references, return them
        if any(info['found'] for info in item_references.values()):
            # Sort by position in document
            item_headings.sort(key=lambda x: x[1])
            return item_headings

        # Step 2: Oracle table cell detection
        # This specifically targets Oracle 10-K's format where items are in table cells
        # but not marked as headings and not part of a formal TOC
        item_section_map = {}

        for table_idx, node in enumerate(table_nodes):
            if hasattr(node, 'content') and isinstance(node.content, list):
                rows = node.content

                for row_idx, row in enumerate(rows):
                    if not hasattr(row, 'cells'):
                        continue

                    for cell_idx, cell in enumerate(row.cells):
                        cell_content = cell.content if hasattr(cell, 'content') else ""
                        if not isinstance(cell_content, str):
                            continue

                        # Check if this cell contains an item pattern as an isolated entry
                        # This is common in Oracle 10-K where items are in cells by themselves
                        match = item_pattern.search(cell_content)
                        if match and len(cell_content.strip()) < 50:  # Short isolated item cell
                            item_name = match.group(1).strip().upper()

                            # Look for the title in adjacent cells
                            title = ""
                            if cell_idx + 1 < len(row.cells):
                                next_cell = row.cells[cell_idx + 1]
                                next_content = next_cell.content if hasattr(next_cell, 'content') else ""
                                if isinstance(next_content, str):
                                    title = next_content.strip()

                            # Check for bold text or other emphasis indicators
                            is_emphasized = False
                            if hasattr(cell, 'style'):
                                if hasattr(cell.style, 'font_weight') and cell.style.font_weight in ['bold', '700', '800', '900']:
                                    is_emphasized = True
                                elif hasattr(cell.style, 'font_style') and cell.style.font_style == 'italic':
                                    is_emphasized = True

                            # Store the item with its table position for later extraction
                            item_section_map[item_name] = {
                                'table_idx': table_idx,
                                'row_idx': row_idx,
                                'title': title,
                                'emphasized': is_emphasized
                            }

        # If we found items in tables, try to map them to content sections
        if item_section_map:
            # Create a mapping of items to their positions in the document.
            # NOTE(review): uses node objects as dict keys — assumes nodes
            # are hashable; confirm against the node model.
            table_positions = {}
            for i, node in enumerate(document.nodes):
                if node.type == 'table':
                    table_positions[node] = i

            for item_name, info in item_section_map.items():
                # Create a heading node with the item and title
                content = f"{item_name} {info['title']}".strip()
                heading_node = create_heading_node(content)

                # Find this table's position in the document
                target_table = table_nodes[info['table_idx']]
                if target_table in table_positions:
                    table_pos = table_positions[target_table]
                    # Add this item, prioritizing emphasized ones
                    if info['emphasized']:
                        item_headings.insert(0, (item_name, table_pos, heading_node))
                    else:
                        item_headings.append((item_name, table_pos, heading_node))

        # Sort item headings by position and check if we found enough
        if item_headings:
            item_headings.sort(key=lambda x: x[1])
            # If we found multiple items from tables, return them
            if len(item_headings) >= 2:
                return item_headings

        # Step 3: Iterate through all nodes looking for text blocks that might be item headings
        for i, node in enumerate(document.nodes):
            # Check text blocks that might be mis-classified headings
            if node.type == 'text_block':
                # Use only the first line to avoid matching within paragraphs
                first_line = node.content.split('\n')[0] if hasattr(node, 'content') else ''
                match = item_pattern.search(first_line)

                if match:
                    item_name = match.group(1).strip().upper()

                    # Additional validation to reduce false positives
                    # Check if this looks like a real item heading:
                    # 1. Should be relatively short
                    # 2. Should start with the matched pattern
                    # 3. Should not be part of a longer paragraph
                    if (len(first_line) < 100 and
                        first_line.lower().startswith(match.group(1).lower()) and
                        len(first_line.split()) < 15):

                        # Check for bold font-weight in the node's style if available
                        is_bold = False
                        if hasattr(node, 'style') and hasattr(node.style, 'font_weight'):
                            fw = node.style.font_weight
                            is_bold = fw in ['bold', '700', '800', '900']

                        # Prioritize bold text that matches item patterns
                        if is_bold:
                            item_headings.insert(0, (item_name, i, node))
                        else:
                            item_headings.append((item_name, i, node))

        # If we found items, return them
        if item_headings:
            return item_headings

        # Step 4: Last resort - check all nodes for ANY mention of items
        # This is a last resort to find something when other methods fail
        for i, node in enumerate(document.nodes):
            if hasattr(node, 'content') and isinstance(node.content, str):
                lines = node.content.split('\n')
                for _line_idx, line in enumerate(lines):
                    match = item_pattern.search(line)
                    if match and len(line.strip()) < 100:  # Avoid matching in long paragraphs
                        item_name = match.group(1).strip().upper()

                        # Create a heading node with just the matching line
                        heading_node = create_heading_node(line)

                        # We'll use the position of the node containing the pattern
                        item_headings.append((item_name, i, heading_node))

        return item_headings

    @staticmethod
    def _get_item_pattern(filing_type: Optional[str]) -> Pattern:
        """Get the regex pattern for identifying items in this filing type."""
        # Default to standard 10-K/10-Q item pattern
        if filing_type in ('10-K', '10-K/A', '10-Q', '10-Q/A', '20-F', '20-F/A'):
            # Enhanced pattern to better handle different formats:
            # - Normal format: "Item 1." or "ITEM 1"
            # - Oracle format: "ITEM 1." or "Item 1"
            # - With periods: "Item 1." or without "Item 1"
            # - With trailing spaces: "Item 1 "
            # - With different spacing: "Item1" or "ITEM 1"
            return re.compile(r'(item\s*\d+[A-Za-z]?)\.?\s*', re.IGNORECASE)
        elif filing_type in ('8-K', '8-K/A', '6-K', '6-K/A'):
            # 8-K uses decimal format like "Item 1.01"
            return re.compile(r'(item\s*\d+\.\d+)\.?\s*', re.IGNORECASE)
        else:
            # Default pattern for other filings - most flexible
            return re.compile(r'(item\s*\d+(?:\.\d+)?[A-Za-z]?)\.?\s*', re.IGNORECASE)

    @property
    def items(self) -> ItemCollection:
        """Get the collection of items in this document."""
        return ItemCollection(self._items)
|
||||
|
||||
|
||||
class FilingDocument:
    """High-level document class specialized for SEC filings.

    Wraps a parsed Document and exposes filing-oriented views of it:
    items, tables, and the table of contents.
    """

    def __init__(self, html: str, filing_type: str = None):
        self._document = Document.parse(html)
        self._filing_type = filing_type
        # Both the structural index and the TOC are built on first access
        # and cached for subsequent calls.
        self._index = None
        self._toc = None

    @property
    def document(self) -> Document:
        """Access the underlying Document instance."""
        return self._document

    @property
    def index(self) -> DocumentIndex:
        """Get the document index, building it lazily on first access."""
        if self._index is None:
            self._index = DocumentIndex.build(self._document, self._filing_type)
        return self._index

    @property
    def items(self) -> ItemCollection:
        """Access items in the document."""
        return self.index.items

    @property
    def table_of_contents(self) -> TableOfContents:
        """Get the table of contents, extracting it lazily on first access."""
        if self._toc is None:
            self._toc = TableOfContents.extract(self._document)
        return self._toc

    @property
    def tables(self) -> List[Table]:
        """Get all tables in the document."""
        return [Table(node) for node in self._document.nodes if node.type == 'table']

    def __getitem__(self, key: str) -> Item:
        """Dictionary-style access to items."""
        return self.items[key]
|
||||
@@ -0,0 +1,678 @@
|
||||
# SEC Filing Item Extraction - New Design
|
||||
|
||||
## Analysis of Current Implementation
|
||||
|
||||
### Strengths
|
||||
1. Simple item access via dictionary-style indexing (`doc["Item 1"]`)
|
||||
2. Caching mechanisms for performance optimization
|
||||
3. Robust detection of item headings with regex patterns
|
||||
4. Sequence validation to ensure correct item ordering
|
||||
5. Special handling for edge cases (table of contents, signatures)
|
||||
6. Strong integration with company report classes
|
||||
|
||||
### Weaknesses
|
||||
1. Overreliance on DataFrame as intermediate representation
|
||||
2. Complex chunking process that operates on strings rather than document structure
|
||||
3. Text-based pattern matching instead of leveraging semantic document structure
|
||||
4. Forward-filling item associations rather than using hierarchical structure
|
||||
5. Limited metadata about items (just text)
|
||||
6. Mixing of responsibilities (parsing, chunking, indexing, item detection)
|
||||
7. Tight coupling between chunking and item detection
|
||||
8. Limited extensibility for new filing types
|
||||
|
||||
## Design Principles
|
||||
|
||||
For our new implementation, we'll follow these principles from successful software projects:
|
||||
|
||||
1. **Single Responsibility Principle**: Each component should have one clearly defined responsibility
|
||||
2. **Separation of Concerns**: Parsing, structure analysis, and item extraction should be separate
|
||||
3. **Fluent, Intuitive API**: Provide a clean, discoverable interface
|
||||
4. **Progressive Disclosure**: Simple operations should be simple, complex operations possible
|
||||
5. **Rich Models**: Return structured objects with useful methods, not just strings
|
||||
6. **Immutability**: Operations produce new objects rather than modifying existing ones
|
||||
7. **Extensibility**: Design for future enhancements and filing types
|
||||
8. **Performance**: Optimize for common operations with appropriate caching
|
||||
|
||||
## New Design
|
||||
|
||||
### Core Components
|
||||
|
||||
#### 1. `FilingDocument` Class
|
||||
|
||||
A high-level wrapper around `Document` that specializes in SEC filing structure:
|
||||
|
||||
```python
|
||||
class FilingDocument:
|
||||
"""High-level document class specialized for SEC filings."""
|
||||
|
||||
def __init__(self, html: str, filing_type: str = None):
|
||||
self._document = Document.parse(html)
|
||||
self._filing_type = filing_type
|
||||
self._index = None # Lazy-loaded
|
||||
self._toc = None # Lazy-loaded
|
||||
|
||||
@property
|
||||
def document(self) -> Document:
|
||||
"""Access the underlying Document instance."""
|
||||
return self._document
|
||||
|
||||
@property
|
||||
def index(self) -> 'DocumentIndex':
|
||||
"""Get or create the document index."""
|
||||
if self._index is None:
|
||||
self._index = DocumentIndex.build(self._document, self._filing_type)
|
||||
return self._index
|
||||
|
||||
@property
|
||||
def items(self) -> 'ItemCollection':
|
||||
"""Access items in the document."""
|
||||
return self.index.items
|
||||
|
||||
@property
|
||||
def table_of_contents(self) -> 'TableOfContents':
|
||||
"""Get the table of contents for this document."""
|
||||
if self._toc is None:
|
||||
self._toc = TableOfContents.extract(self._document)
|
||||
return self._toc
|
||||
|
||||
@property
|
||||
def tables(self) -> List['Table']:
|
||||
"""Get all tables in the document."""
|
||||
return [
|
||||
Table(node) for node in self._document.nodes
|
||||
if node.type == 'table'
|
||||
]
|
||||
|
||||
def __getitem__(self, key: str) -> 'Item':
|
||||
"""Dictionary-style access to items."""
|
||||
return self.items[key]
|
||||
```
|
||||
|
||||
#### 2. `DocumentIndex` Class
|
||||
|
||||
Analyzes document structure and builds indices for fast lookup:
|
||||
|
||||
```python
|
||||
class DocumentIndex:
|
||||
"""Index of document structure for efficient lookups."""
|
||||
|
||||
@classmethod
|
||||
def build(cls, document: Document, filing_type: str = None) -> 'DocumentIndex':
|
||||
"""Build an index from a document."""
|
||||
index = cls()
|
||||
index._build_heading_index(document)
|
||||
index._build_item_index(document, filing_type)
|
||||
return index
|
||||
|
||||
def _build_heading_index(self, document: Document) -> None:
|
||||
"""Build an index of all headings in the document."""
|
||||
# Implementation details...
|
||||
|
||||
def _build_item_index(self, document: Document, filing_type: str = None) -> None:
|
||||
"""Build an index of items in the document."""
|
||||
# Implementation details...
|
||||
|
||||
@property
|
||||
def items(self) -> 'ItemCollection':
|
||||
"""Get the collection of items in this document."""
|
||||
return ItemCollection(self._items)
|
||||
```
|
||||
|
||||
#### 3. `Item` Class
|
||||
|
||||
Represents a logical item in a filing with rich functionality:
|
||||
|
||||
```python
|
||||
class Item:
|
||||
"""Represents a logical item in an SEC filing."""
|
||||
|
||||
def __init__(self,
|
||||
name: str,
|
||||
heading_node: Optional[HeadingNode],
|
||||
content_nodes: List[BaseNode],
|
||||
metadata: Dict[str, Any] = None):
|
||||
self.name = name
|
||||
self.heading_node = heading_node
|
||||
self.content_nodes = content_nodes
|
||||
self.metadata = metadata or {}
|
||||
|
||||
@property
|
||||
def title(self) -> str:
|
||||
"""Get the title of this item."""
|
||||
if self.heading_node:
|
||||
# Extract title from heading
|
||||
return self._extract_title(self.heading_node.content)
|
||||
return ""
|
||||
|
||||
@property
|
||||
def text(self) -> str:
|
||||
"""Get the text content of this item."""
|
||||
return "\n".join(
|
||||
node.content if hasattr(node, 'content') else str(node)
|
||||
for node in self.content_nodes
|
||||
)
|
||||
|
||||
@property
|
||||
def tables(self) -> List['Table']:
|
||||
"""Get all tables within this item."""
|
||||
return [
|
||||
Table(node) for node in self.content_nodes
|
||||
if node.type == 'table'
|
||||
]
|
||||
|
||||
def get_table(self, index: int) -> Optional['Table']:
|
||||
"""Get a specific table by index."""
|
||||
tables = self.tables
|
||||
return tables[index] if 0 <= index < len(tables) else None
|
||||
|
||||
def find_tables(self, pattern: str) -> List['Table']:
|
||||
"""Find tables containing the specified text pattern."""
|
||||
tables = []
|
||||
for table in self.tables:
|
||||
if table.contains(pattern):
|
||||
tables.append(table)
|
||||
return tables
|
||||
|
||||
def to_markdown(self) -> str:
|
||||
"""Convert this item to markdown format."""
|
||||
# Implementation details...
|
||||
|
||||
def to_html(self) -> str:
|
||||
"""Convert this item to HTML format."""
|
||||
# Implementation details...
|
||||
|
||||
def to_dict(self) -> Dict[str, Any]:
|
||||
"""Convert this item to a dictionary."""
|
||||
return {
|
||||
'name': self.name,
|
||||
'title': self.title,
|
||||
'text': self.text,
|
||||
'metadata': self.metadata
|
||||
}
|
||||
|
||||
def __str__(self) -> str:
|
||||
return self.text
|
||||
|
||||
def __repr__(self) -> str:
|
||||
return f"Item('{self.name}', title='{self.title}')"
|
||||
```
|
||||
|
||||
#### 4. `ItemCollection` Class
|
||||
|
||||
Provides a collection interface for working with items:
|
||||
|
||||
```python
|
||||
class ItemCollection:
|
||||
"""Collection of items in a document with convenient access methods."""
|
||||
|
||||
def __init__(self, items: Dict[str, Item]):
|
||||
self._items = items
|
||||
|
||||
def __getitem__(self, key: str) -> Item:
|
||||
"""Get an item by name, with flexible matching."""
|
||||
# Case-insensitive lookup
|
||||
key = key.strip().upper()
|
||||
|
||||
# Direct lookup
|
||||
if key in self._items:
|
||||
return self._items[key]
|
||||
|
||||
# Partial match (e.g., "1" matches "ITEM 1")
|
||||
if key.isdigit() or (len(key) > 1 and key[0].isdigit()):
|
||||
for item_key in self._items:
|
||||
if key in item_key:
|
||||
return self._items[item_key]
|
||||
|
||||
raise KeyError(f"Item '{key}' not found")
|
||||
|
||||
def __contains__(self, key: str) -> bool:
|
||||
"""Check if an item exists."""
|
||||
try:
|
||||
self[key]
|
||||
return True
|
||||
except KeyError:
|
||||
return False
|
||||
|
||||
def __iter__(self) -> Iterator[Item]:
|
||||
"""Iterate through items in order."""
|
||||
return iter(self._items.values())
|
||||
|
||||
def __len__(self) -> int:
|
||||
"""Get the number of items."""
|
||||
return len(self._items)
|
||||
|
||||
def list(self) -> List[str]:
|
||||
"""Get a list of item names."""
|
||||
return list(self._items.keys())
|
||||
```
|
||||
|
||||
#### 5. `FilingRegistry` Class
|
||||
|
||||
Registry of known filing types and their structures:
|
||||
|
||||
```python
|
||||
class FilingRegistry:
|
||||
"""Registry of known filing types and their structures."""
|
||||
|
||||
_registry = {}
|
||||
|
||||
@classmethod
|
||||
def register(cls, filing_type: str, structure: Dict[str, Any]) -> None:
|
||||
"""Register a filing type structure."""
|
||||
cls._registry[filing_type.upper()] = structure
|
||||
|
||||
@classmethod
|
||||
def get_structure(cls, filing_type: str) -> Optional[Dict[str, Any]]:
|
||||
"""Get structure for a filing type."""
|
||||
return cls._registry.get(filing_type.upper())
|
||||
|
||||
@classmethod
|
||||
def get_item_pattern(cls, filing_type: str) -> Optional[str]:
|
||||
"""Get the regex pattern for identifying items in this filing type."""
|
||||
structure = cls.get_structure(filing_type)
|
||||
return structure.get('item_pattern') if structure else None
|
||||
```
|
||||
|
||||
### Algorithm for Item Extraction
|
||||
|
||||
The core algorithm for extracting items will:
|
||||
|
||||
1. Identify all heading nodes in the document
|
||||
2. Filter for headings that match item patterns
|
||||
3. For each item heading:
|
||||
- Determine the item name and normalize it
|
||||
- Find all nodes between this item heading and the next one
|
||||
- Create an Item object with the heading and content nodes
|
||||
4. Build a mapping of item names to Item objects
|
||||
|
||||
```python
|
||||
def extract_items(document: Document, filing_type: str = None) -> Dict[str, Item]:
|
||||
"""Extract items from a document."""
|
||||
# Get all heading nodes
|
||||
heading_nodes = [node for node in document.nodes if node.type == 'heading']
|
||||
|
||||
# Get item pattern for this filing type
|
||||
item_pattern = get_item_pattern(filing_type)
|
||||
|
||||
# Filter for item headings
|
||||
item_headings = []
|
||||
for node in heading_nodes:
|
||||
match = re.search(item_pattern, node.content, re.IGNORECASE)
|
||||
if match:
|
||||
item_name = match.group(1).strip().upper()
|
||||
item_headings.append((item_name, node))
|
||||
|
||||
# Sort by position in document
|
||||
item_headings.sort(key=lambda x: document.nodes.index(x[1]))
|
||||
|
||||
# Create items
|
||||
items = {}
|
||||
for i, (item_name, heading_node) in enumerate(item_headings):
|
||||
# Find content nodes
|
||||
start_idx = document.nodes.index(heading_node) + 1
|
||||
end_idx = (document.nodes.index(item_headings[i+1][1])
|
||||
if i+1 < len(item_headings) else len(document.nodes))
|
||||
content_nodes = document.nodes[start_idx:end_idx]
|
||||
|
||||
# Create item
|
||||
items[item_name] = Item(item_name, heading_node, content_nodes)
|
||||
|
||||
return items
|
||||
```
|
||||
|
||||
### Integration with Company Reports
|
||||
|
||||
Update the CompanyReport class to use the new FilingDocument:
|
||||
|
||||
```python
|
||||
class CompanyReport:
|
||||
def __init__(self, filing):
|
||||
self._filing = filing
|
||||
self._document = None
|
||||
|
||||
@property
|
||||
def document(self) -> FilingDocument:
|
||||
"""Get the filing document."""
|
||||
if self._document is None:
|
||||
html = self._filing.html()
|
||||
self._document = FilingDocument(html, self._filing.form)
|
||||
return self._document
|
||||
|
||||
@property
|
||||
def items(self) -> ItemCollection:
|
||||
"""Get all items in this filing."""
|
||||
return self.document.items
|
||||
|
||||
def __getitem__(self, key: str) -> Item:
|
||||
"""Get an item by name."""
|
||||
return self.items[key]
|
||||
```
|
||||
|
||||
Specialized classes like TenK would add property accessors for common items:
|
||||
|
||||
```python
|
||||
class TenK(CompanyReport):
|
||||
@property
|
||||
def business(self) -> Item:
|
||||
"""Get Item 1: Business."""
|
||||
return self.items["Item 1"]
|
||||
|
||||
@property
|
||||
def risk_factors(self) -> Item:
|
||||
"""Get Item 1A: Risk Factors."""
|
||||
return self.items["Item 1A"]
|
||||
|
||||
@property
|
||||
def management_discussion(self) -> Item:
|
||||
"""Get Item 7: Management's Discussion and Analysis."""
|
||||
return self.items["Item 7"]
|
||||
```
|
||||
|
||||
## Implementation Strategy
|
||||
|
||||
To implement this design, we'll follow these steps:
|
||||
|
||||
1. Implement the `Item` and `ItemCollection` classes first
|
||||
2. Create the `DocumentIndex` class
|
||||
3. Implement the `FilingDocument` class
|
||||
4. Set up the `FilingRegistry` with known filing structures
|
||||
5. Update the `CompanyReport` hierarchy to use the new classes
|
||||
6. Write comprehensive tests
|
||||
7. Deprecate the old implementation with appropriate warnings
|
||||
|
||||
## Optimizations
|
||||
|
||||
Performance is critical for this component. Key optimizations include:
|
||||
|
||||
1. **Lazy Loading**: Only build indices when needed
|
||||
2. **Caching**: Cache document and index objects
|
||||
3. **Efficient Node Traversal**: Use direct node references instead of searching by content
|
||||
4. **Smart Item Matching**: Support flexible item lookup patterns
|
||||
5. **Document Structure Awareness**: Leverage heading levels and hierarchy
|
||||
|
||||
## Comparison with Old Implementation
|
||||
|
||||
| Feature | Old Implementation | New Implementation |
|
||||
|---------|-------------------|-------------------|
|
||||
| Primary structure | DataFrame of chunks | Tree of nodes |
|
||||
| Item detection | Regex on plaintext | Pattern matching on heading nodes |
|
||||
| Item boundaries | Forward-fill in DataFrame | Node ranges in document |
|
||||
| Return value | Text string | Rich Item object |
|
||||
| Extensibility | Limited | Registry-based design |
|
||||
| Performance | Good with caching | Better with structural analysis |
|
||||
| API clarity | Medium (mixed responsibilities) | High (clear separation) |
|
||||
| Edge case handling | Good, but complex | Simpler with structure awareness |
|
||||
|
||||
## Usage Examples
|
||||
|
||||
### Basic Usage
|
||||
|
||||
```python
|
||||
# Get a filing
|
||||
filing = edgartools.get_filing("AAPL", "10-K", latest=True)
|
||||
|
||||
# Create a 10-K report
|
||||
tenk = TenK(filing)
|
||||
|
||||
# Access an item
|
||||
business = tenk.business
|
||||
print(f"Business description: {business.title}")
|
||||
print(business.text[:100] + "...")
|
||||
|
||||
# Access using dictionary style
|
||||
risk_factors = tenk["Item 1A"]
|
||||
print(f"Risk factors ({len(risk_factors.text)} chars)")
|
||||
```
|
||||
|
||||
### Working with Tables
|
||||
|
||||
```python
|
||||
# Get the financial statements item
|
||||
financial_statements = tenk["Item 8"]
|
||||
|
||||
# Get all tables in the item
|
||||
tables = financial_statements.tables
|
||||
print(f"Found {len(tables)} tables in financial statements")
|
||||
|
||||
# Get a specific table (e.g., income statement)
|
||||
income_statement = financial_statements.get_table(0)
|
||||
if income_statement:
|
||||
# Convert to pandas DataFrame
|
||||
df = income_statement.to_dataframe()
|
||||
print(df.head())
|
||||
|
||||
# Get table metadata
|
||||
print(f"Table dimensions: {income_statement.rows} rows × {income_statement.columns} columns")
|
||||
|
||||
# Access specific cell
|
||||
revenue = income_statement.get_cell(1, 1)
|
||||
print(f"Revenue: {revenue}")
|
||||
|
||||
# Find tables containing specific text
|
||||
revenue_tables = financial_statements.find_tables("revenue")
|
||||
for table in revenue_tables:
|
||||
print(f"Found table with {table.rows} rows about revenue")
|
||||
```
|
||||
|
||||
### Table of Contents
|
||||
|
||||
```python
|
||||
# Get the table of contents
|
||||
toc = tenk.document.table_of_contents
|
||||
|
||||
# Print TOC structure
|
||||
for entry in toc.entries:
|
||||
print(f"{entry.level * ' '}{entry.text} (page {entry.page})")
|
||||
|
||||
# Navigate directly to a TOC entry
|
||||
item7 = toc.find("Management's Discussion")
|
||||
if item7:
|
||||
print(f"Found MD&A at level {item7.level}")
|
||||
# Jump to that section
|
||||
mda = tenk[item7.reference]
|
||||
print(mda.title)
|
||||
```
|
||||
|
||||
### Advanced Usage
|
||||
|
||||
```python
|
||||
# Get all items
|
||||
for item in tenk.items:
|
||||
print(f"{item.name}: {item.title}")
|
||||
|
||||
# Convert to markdown
|
||||
md_text = tenk.business.to_markdown()
|
||||
|
||||
# Get as JSON
|
||||
import json
|
||||
items_json = json.dumps({
|
||||
name: item.to_dict()
|
||||
for name, item in tenk.items.items()
|
||||
})
|
||||
|
||||
# Search within items
|
||||
for item in tenk.items:
|
||||
if "revenue" in item.text.lower():
|
||||
print(f"Found revenue discussion in {item.name}")
|
||||
|
||||
# Extract nested sections within an item
|
||||
mda = tenk.management_discussion
|
||||
subsections = mda.get_subsections()
|
||||
for section in subsections:
|
||||
print(f"Subsection: {section.title}")
|
||||
```
|
||||
|
||||
## Table Components
|
||||
|
||||
To complete our design, we'll implement these additional classes for handling tables and table of contents:
|
||||
|
||||
### Table Class
|
||||
|
||||
```python
|
||||
class Table:
|
||||
"""Rich representation of a table in a document."""
|
||||
|
||||
def __init__(self, table_node: 'TableNode'):
|
||||
self._node = table_node
|
||||
self._processed = None # Lazy-loaded processed table
|
||||
|
||||
@property
|
||||
def rows(self) -> int:
|
||||
"""Get the number of rows in the table."""
|
||||
return self._get_processed().processed_row_count
|
||||
|
||||
@property
|
||||
def columns(self) -> int:
|
||||
"""Get the number of columns in the table."""
|
||||
return self._get_processed().processed_column_count
|
||||
|
||||
def _get_processed(self) -> 'ProcessedTable':
|
||||
"""Get or create the processed table."""
|
||||
if self._processed is None:
|
||||
self._processed = self._node._processed
|
||||
return self._processed
|
||||
|
||||
def to_dataframe(self) -> 'pd.DataFrame':
|
||||
"""Convert this table to a pandas DataFrame."""
|
||||
processed = self._get_processed()
|
||||
if processed and processed.headers and processed.data_rows:
|
||||
return pd.DataFrame(processed.data_rows, columns=processed.headers)
|
||||
return pd.DataFrame()
|
||||
|
||||
def to_markdown(self) -> str:
|
||||
"""Convert this table to markdown format."""
|
||||
# Implementation details...
|
||||
|
||||
def get_cell(self, row: int, col: int) -> str:
|
||||
"""Get the content of a specific cell."""
|
||||
processed = self._get_processed()
|
||||
if processed and 0 <= row < len(processed.data_rows):
|
||||
data_row = processed.data_rows[row]
|
||||
if 0 <= col < len(data_row):
|
||||
return data_row[col]
|
||||
return ""
|
||||
|
||||
def contains(self, text: str) -> bool:
|
||||
"""Check if the table contains the specified text."""
|
||||
processed = self._get_processed()
|
||||
if not processed:
|
||||
return False
|
||||
|
||||
# Check headers
|
||||
if processed.headers and any(text.lower() in header.lower() for header in processed.headers):
|
||||
return True
|
||||
|
||||
# Check data rows
|
||||
for row in processed.data_rows:
|
||||
if any(text.lower() in str(cell).lower() for cell in row):
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
def __str__(self) -> str:
|
||||
return self.to_markdown()
|
||||
|
||||
def __repr__(self) -> str:
|
||||
return f"Table({self.rows}×{self.columns})"
|
||||
```
|
||||
|
||||
### TableOfContents Class
|
||||
|
||||
```python
|
||||
class TocEntry:
|
||||
"""Entry in a table of contents."""
|
||||
|
||||
def __init__(self, text: str, level: int, page: Optional[int] = None, reference: Optional[str] = None):
|
||||
self.text = text
|
||||
self.level = level
|
||||
self.page = page
|
||||
self.reference = reference # Item reference, if applicable
|
||||
|
||||
def __repr__(self) -> str:
|
||||
return f"TocEntry('{self.text}', level={self.level}, page={self.page})"
|
||||
|
||||
|
||||
class TableOfContents:
|
||||
"""Table of contents extracted from a document."""
|
||||
|
||||
def __init__(self, entries: List[TocEntry]):
|
||||
self.entries = entries
|
||||
|
||||
@classmethod
|
||||
def extract(cls, document: Document) -> 'TableOfContents':
|
||||
"""Extract table of contents from document."""
|
||||
entries = []
|
||||
|
||||
# Find TOC section (usually at the beginning)
|
||||
toc_node_index = cls._find_toc_section(document)
|
||||
if toc_node_index is None:
|
||||
return cls([])
|
||||
|
||||
# Get nodes after TOC heading until the next major heading
|
||||
toc_nodes = cls._get_toc_nodes(document, toc_node_index)
|
||||
|
||||
# Process nodes to extract entries
|
||||
entries = cls._process_toc_nodes(toc_nodes)
|
||||
|
||||
# Match entries to actual items
|
||||
cls._match_entries_to_items(entries, document)
|
||||
|
||||
return cls(entries)
|
||||
|
||||
@staticmethod
|
||||
def _find_toc_section(document: Document) -> Optional[int]:
|
||||
"""Find the TOC section in the document."""
|
||||
# Look for "Table of Contents" heading
|
||||
toc_patterns = [
|
||||
re.compile(r'table\s+of\s+contents', re.IGNORECASE),
|
||||
re.compile(r'contents', re.IGNORECASE)
|
||||
]
|
||||
|
||||
for i, node in enumerate(document.nodes):
|
||||
if node.type == 'heading':
|
||||
for pattern in toc_patterns:
|
||||
if pattern.search(node.content):
|
||||
return i
|
||||
return None
|
||||
|
||||
@staticmethod
|
||||
def _get_toc_nodes(document: Document, start_index: int) -> List['BaseNode']:
|
||||
"""Get nodes belonging to the TOC section."""
|
||||
# Implementation details...
|
||||
|
||||
@staticmethod
|
||||
def _process_toc_nodes(nodes: List['BaseNode']) -> List[TocEntry]:
|
||||
"""Process TOC nodes to extract entries."""
|
||||
# Implementation details...
|
||||
|
||||
@staticmethod
|
||||
def _match_entries_to_items(entries: List[TocEntry], document: Document) -> None:
|
||||
"""Match TOC entries to actual items in the document."""
|
||||
# Implementation details...
|
||||
|
||||
def find(self, text: str) -> Optional[TocEntry]:
|
||||
"""Find a TOC entry by text."""
|
||||
text = text.lower()
|
||||
for entry in self.entries:
|
||||
if text in entry.text.lower():
|
||||
return entry
|
||||
return None
|
||||
|
||||
def __iter__(self) -> Iterator[TocEntry]:
|
||||
return iter(self.entries)
|
||||
|
||||
def __len__(self) -> int:
|
||||
return len(self.entries)
|
||||
```
|
||||
|
||||
## Challenges and Mitigations
|
||||
|
||||
1. **Accurate Item Detection**: Use a combination of patterns and structural analysis
|
||||
2. **Handling Malformed Documents**: Fall back to text-based detection when structure is unclear
|
||||
3. **Performance with Large Documents**: Use lazy evaluation and partial parsing
|
||||
4. **Backward Compatibility**: Provide adapters for old API patterns
|
||||
5. **Content Transformation**: Preserve tables and formatting during item extraction
|
||||
6. **TOC Detection**: Use multiple heuristics to find and parse the table of contents
|
||||
7. **Table Extraction**: Handle complex tables with rowspan/colspan and formatting
|
||||
|
||||
By following this design, we'll create a cleaner, more robust API for extracting items from SEC filings that leverages the structural advantages of the new Document class while improving on the functionality of the current implementation.
|
||||
1758
venv/lib/python3.10/site-packages/edgar/files/html.py
Normal file
1758
venv/lib/python3.10/site-packages/edgar/files/html.py
Normal file
File diff suppressed because it is too large
Load Diff
1156
venv/lib/python3.10/site-packages/edgar/files/html_documents.py
Normal file
1156
venv/lib/python3.10/site-packages/edgar/files/html_documents.py
Normal file
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,687 @@
|
||||
import re
|
||||
from typing import Dict, List
|
||||
|
||||
from bs4 import BeautifulSoup, NavigableString, Tag
|
||||
|
||||
from edgar.files.html_documents import (
|
||||
Block,
|
||||
HtmlDocument,
|
||||
LinkBlock,
|
||||
clean_html_root,
|
||||
decompose_page_numbers,
|
||||
extract_and_format_content,
|
||||
)
|
||||
from edgar.files.htmltools import ChunkedDocument
|
||||
|
||||
|
||||
class AssembleText:
    """Assemble plain text or markdown from the HTML of an SEC filing.

    A pure namespace of static helpers used to split a filing into its
    items based on the anchor links collected from its table of contents.
    """

    @staticmethod
    def assemble_block_text(chunks: List[Block], prefix_src: str = None):
        """Yield the plain text of each block.

        When ``prefix_src`` is given, LinkBlocks are emitted as markdown so
        the (resolved) link target survives in the output.
        """
        if prefix_src:
            for block in chunks:
                if isinstance(block, LinkBlock):
                    yield block.to_markdown(prefix_src=prefix_src)
                else:
                    yield block.get_text()
        else:
            for block in chunks:
                yield block.get_text()

    @staticmethod
    def assemble_block_markdown(chunks: List[Block], prefix_src: str = None):
        """Yield the markdown of each block.

        ``prefix_src``, when given, is forwarded to LinkBlocks so relative
        link targets can be resolved.
        """
        if prefix_src:
            for block in chunks:
                if isinstance(block, LinkBlock):
                    yield block.to_markdown(prefix_src=prefix_src)
                else:
                    yield block.to_markdown()
        else:
            for block in chunks:
                yield block.to_markdown()

    @staticmethod
    def clean_and_assemble_text(
        start_element: Tag, markdown: bool = False
    ) -> str:
        """Extract, compress and join the text (or markdown) under a tag."""
        # Now find the full text
        blocks: List[Block] = extract_and_format_content(start_element)
        # Compress the blocks
        blocks: List[Block] = HtmlDocument._compress_blocks(blocks)
        if markdown:
            return "".join(
                [text for text in AssembleText.assemble_block_markdown(blocks)]
            )
        else:
            return "".join(
                [text for text in AssembleText.assemble_block_text(blocks)]
            )

    @staticmethod
    def assemble_html_document(tags: List[Tag], markdown: bool = False) -> str:
        """Assemble several top-level tags into one cleaned text/markdown string."""
        return ChunkedDocument.clean_part_line(
            "".join(
                [
                    AssembleText.clean_and_assemble_text(
                        tag, markdown=markdown
                    )
                    for tag in tags
                ]
            )
        )

    @staticmethod
    def find_block_level_parent(tag, all_link_tag: list):
        """Walk up from ``tag`` towards the document root.

        Returns the last ancestor level that still contains at most one of
        the known item anchors in ``all_link_tag`` — i.e. the block-level
        container holding a single item. Stops (and returns the current
        level) as soon as the parent would span two or more anchors.
        """
        ori_tag = tag
        while tag and tag.parent is not None:
            parent = tag.parent
            link_count = 0
            for link in all_link_tag:
                matched = parent.find(id=link) or parent.find(
                    "a", attrs={"name": link}
                )
                if matched:
                    link_count += 1
            if link_count > 1:
                # Parent spans several items; this level is the container.
                return tag
            tag = parent
        return tag if tag else ori_tag

    @staticmethod
    def assemble_items(
        html_content: str, item_links: List, markdown: bool = False
    ) -> dict:
        """Split filing HTML into item texts keyed by item name.

        Args:
            html_content: raw filing HTML.
            item_links: ordered list of ``(item_name, item_id)`` pairs,
                where ``item_id`` is the in-document anchor (id or
                ``<a name=...>``) of each item.
            markdown: emit markdown instead of plain text.

        Returns:
            Mapping of item name -> assembled content, including an
            "Item 0" intro section and a "Signature" section when found.
            Returns ``{}`` on any failure (best-effort by design).
        """
        try:
            root: Tag = HtmlDocument.get_root(html_content)
            start_element = clean_html_root(root)
            decompose_page_numbers(start_element)
            soup = start_element

            link_ids = [item_id for item_name, item_id in item_links]
            items = {}

            # Helper method to extract content up to a specific element
            def get_intro_content(first_item_id: str) -> List[Tag]:
                intro_content = []
                current = soup.find(id=first_item_id) or soup.find(
                    "a", attrs={"name": first_item_id}
                )
                if current:
                    container = AssembleText.find_block_level_parent(current, link_ids)

                    if container:
                        for sibling in container.previous_siblings:
                            if isinstance(sibling, Tag):
                                intro_content.append(sibling)
                    # Also pick up siblings preceding the anchor inside its
                    # own container.
                    # NOTE(review): after reverse() these end up *before*
                    # the container-level siblings — confirm the intended
                    # document order.
                    sibling = current.previous_sibling
                    while sibling:
                        if isinstance(sibling, Tag):
                            intro_content.append(sibling)
                        sibling = sibling.previous_sibling
                intro_content.reverse()
                return intro_content

            # Step 1: Extract intro (from start of document to first item)
            first_item_id = item_links[0][1] if item_links else None
            if first_item_id:
                intro_content = get_intro_content(first_item_id)
                items["Item 0"] = AssembleText.assemble_html_document(
                    intro_content, markdown=markdown
                )

            # Step 2: Extract items
            id_to_content = {}
            for idx, (item_name, item_id) in enumerate(item_links):

                if idx < len(item_links)-1:
                    n_item_id = item_links[idx+1][1]
                else:
                    n_item_id = None
                # Try both id and name attributes
                target = soup.find(id=item_id) or soup.find(
                    "a", attrs={"name": item_id}
                )
                if not target:
                    raise Exception(f"link id error. item_name:{item_name}, item_id:{item_id}")

                target = AssembleText.find_block_level_parent(target, link_ids)
                if target:
                    content = []
                    current = target
                    while current:
                        # Stop when the NEXT item's anchor appears inside
                        # the current sibling: everything from here on
                        # belongs to the next item.
                        # BUGFIX: the original mixed `and`/`or` without
                        # parentheses and searched the whole document
                        # (`soup.find`) for the name-anchor, so the loop
                        # could break on the very first sibling whenever the
                        # next anchor existed anywhere in the document —
                        # and even when n_item_id was None.
                        if (
                            current
                            and not isinstance(current, (str, NavigableString))
                            and n_item_id
                            and (
                                current.find(id=n_item_id)
                                or current.find("a", attrs={"name": n_item_id})
                            )
                        ):
                            break
                        if current.name is not None or (
                            current.string and current.string.strip()
                        ):
                            content.append(current)
                        current = current.next_sibling
                    id_to_content[item_id] = AssembleText.assemble_html_document(
                        content, markdown=markdown
                    )

                if item_id in id_to_content:
                    items[item_name] = id_to_content[item_id]

            # Step 3: Handle Signatures
            if "Signature" not in items and item_links:
                last_item_name, last_item_id = item_links[-1]
                last_content = items.get(last_item_name, "")
                sig_key = ["SIGNATURES", "SIGNATURE"]
                content_lines = last_content.split("\n")
                signature_line_index = None
                for i, line in enumerate(content_lines):
                    if line.strip().upper() in sig_key:
                        signature_line_index = i
                        break
                if signature_line_index is not None:
                    before_sig = "\n".join(content_lines[:signature_line_index])
                    sig_start_pos = len(before_sig) + 1
                    items["Signature"] = last_content[sig_start_pos:].strip()
                    items[last_item_name] = before_sig.strip()
                else:
                    items["Signature"] = ""

            return items
        except Exception:
            # Best-effort by design: any parsing failure yields an empty
            # mapping rather than propagating to the caller.
            return {}
|
||||
|
||||
class ParsedHtml10K:
|
||||
|
||||
@staticmethod
|
||||
def extract_element_id(href: str) -> str:
|
||||
"""
|
||||
Extract element ID from an XLink href.
|
||||
|
||||
Args:
|
||||
href: XLink href attribute value
|
||||
|
||||
Returns:
|
||||
Element ID
|
||||
"""
|
||||
return href.split("#")[-1]
|
||||
|
||||
def extract_html_link_info(self, html_content: str) -> List:
    """
    Find rows in tables that:
    1. Contain links
    2. Have a separate cell storing page numbers

    Returns a list of per-table lists of ``{"text": [...], "link": id}``
    dicts, where ``link`` is the in-document anchor id of the row's first
    fragment link.
    """

    # Normalize non-breaking spaces (U+00A0) to regular spaces so the
    # text comparisons below behave predictably.
    # NOTE(review): the original literal renders as an ordinary space in
    # some viewers; U+00A0 is assumed — confirm against file history.
    html_content = html_content.replace("\xa0", " ")
    soup = BeautifulSoup(html_content, "html.parser")

    # Remove script and style tags
    for tag in soup(["script", "style", "noscript"]):
        tag.decompose()

    link_info: List = []
    tables = soup.find_all("table")
    for table in tables:
        table_links: List[Dict] = []
        for row in table.find_all("tr"):
            cells = row.find_all("td", recursive=False)
            exist_page_num = False
            if cells:
                # Collapse each cell's whitespace into single spaces.
                text = [
                    " ".join(
                        cell.get_text(separator="", strip=True).split()
                    )
                    for cell in cells
                ]
                # A cell holding "12" or a range like "12-14" marks the
                # row as carrying a page number.
                for cell in cells:
                    cell_text = cell.text.strip()
                    if cell_text.isdigit() or (
                        "-" in cell_text
                        and all(
                            p.strip().isdigit()
                            for p in cell_text.split("-")
                        )
                    ):
                        exist_page_num = True

                # Collect in-document fragment links (<a href="#...">).
                # Each cell's anchor is looked up once; the original
                # called cell.find("a") three times per cell.
                links = []
                for cell in cells:
                    anchor = cell.find("a")
                    if anchor:
                        href = anchor.attrs.get("href")
                        if href and href.startswith("#"):
                            links.append(anchor)

                if links and exist_page_num:
                    link = links[0].attrs.get("href").split("#")[-1]
                    table_links.append({"text": text, "link": link})
        if table_links:
            link_info.append(table_links)

    return link_info
|
||||
|
||||
@staticmethod
|
||||
def extract_item_and_split(link_info: List):
|
||||
"""
|
||||
Defines matching patterns and functions for extracting and splitting SEC filing items.
|
||||
|
||||
The code provides:
|
||||
1. Four dictionaries (items_match_1 to items_match_4) containing different formats of SEC item identifiers
|
||||
2. A match_function_map tuple that pairs each dictionary with its corresponding matching function
|
||||
3. Matching functions that handle case-insensitive comparisons (startswith, equals, contains)
|
||||
|
||||
The matching strategies cover:
|
||||
- Standard item formats (e.g., "Item 1.")
|
||||
- Part/item combinations (e.g., "Part I, Item 1")
|
||||
- Full item descriptions (e.g., "Business")
|
||||
- Combined items (e.g., "Items 1 and 2.")
|
||||
|
||||
Note: The actual item processing loop is not implemented in the selected code.
|
||||
"""
|
||||
if not link_info:
|
||||
return []
|
||||
|
||||
link_info = [item for sublist in link_info for item in sublist]
|
||||
items_match_1 = { # Match items starting with these patterns
|
||||
"Item 1": "Item 1.",
|
||||
"Item 1A": "Item 1A.",
|
||||
"Item 1B": "Item 1B.",
|
||||
"Item 1C": "Item 1C.",
|
||||
"Item 2": "Item 2.",
|
||||
"Item 3": "Item 3.",
|
||||
"Item 4": "Item 4.",
|
||||
"Item 5": "Item 5.",
|
||||
"Item 6": "Item 6.",
|
||||
"Item 7": "Item 7.",
|
||||
"Item 7A": "Item 7A.",
|
||||
"Item 8": "Item 8.",
|
||||
"Item 9": "Item 9.",
|
||||
"Item 9A": "Item 9A.",
|
||||
"Item 9B": "Item 9B.",
|
||||
"Item 9C": "Item 9C.",
|
||||
"Item 10": "Item 10.",
|
||||
"Item 11": "Item 11.",
|
||||
"Item 12": "Item 12.",
|
||||
"Item 13": "Item 13.",
|
||||
"Item 14": "Item 14.",
|
||||
"Item 15": "Item 15.",
|
||||
"Item 16": "Item 16.",
|
||||
"Signatures": "Signature",
|
||||
}
|
||||
items_match_0 = {key: key for key in items_match_1}
|
||||
|
||||
items_match_2 = { # Exact match after stripping whitespace
|
||||
"Item 1": "Part I, Item 1",
|
||||
"Item 1A": "Part I, Item 1A",
|
||||
"Item 1B": "Part I, Item 1B",
|
||||
"Item 1C": "Part I, Item 1C",
|
||||
"Item 2": "Part I, Item 2",
|
||||
"Item 3": "Part I, Item 3",
|
||||
"Item 4": "Part I, Item 4",
|
||||
"Item 5": "Part II, Item 5",
|
||||
"Item 6": "Part II, Item 6",
|
||||
"Item 7": "Part II, Item 7",
|
||||
"Item 7A": "Part II, Item 7A",
|
||||
"Item 8": "Part II, Item 8",
|
||||
"Item 9": "Part II, Item 9",
|
||||
"Item 9A": "Part II, Item 9A",
|
||||
"Item 9B": "Part II, Item 9B",
|
||||
"Item 9C": "Part II, Item 9C",
|
||||
"Item 10": "Part III, Item 10",
|
||||
"Item 11": "Part III, Item 11",
|
||||
"Item 12": "Part III, Item 12",
|
||||
"Item 13": "Part III, Item 13",
|
||||
"Item 14": "Part III, Item 14",
|
||||
"Item 15": "Part IV, Item 15",
|
||||
"Item 16": "Part IV, Item 16",
|
||||
"Signatures": "Signature",
|
||||
}
|
||||
items_match_2_1 = {
|
||||
"Item 1": "Item No. 1",
|
||||
"Item 1A": "Item No. 1A",
|
||||
"Item 1B": "Item No. 1B",
|
||||
"Item 1C": "Item No. 1C",
|
||||
"Item 2": "Item No. 2",
|
||||
"Item 3": "Item No. 3",
|
||||
"Item 4": "Item No. 4",
|
||||
"Item 5": "Item No. 5",
|
||||
"Item 6": "Item No. 6",
|
||||
"Item 7": "Item No. 7",
|
||||
"Item 7A": "Item No. 7A",
|
||||
"Item 8": "Item No. 8",
|
||||
"Item 9": "Item No. 9",
|
||||
"Item 9A": "Item No. 9A",
|
||||
"Item 9B": "Item No. 9B",
|
||||
"Item 9C": "Item No. 9C",
|
||||
"Item 10": "Item No. 10",
|
||||
"Item 11": "Item No. 11",
|
||||
"Item 12": "Item No. 12",
|
||||
"Item 13": "Item No. 13",
|
||||
"Item 14": "Item No. 14",
|
||||
"Item 15": "Item No. 15",
|
||||
"Item 16": "Item No. 16",
|
||||
}
|
||||
|
||||
items_match_3 = { # Match item names (startswith comparison)
|
||||
"Item 1": "Business",
|
||||
"Item 1A": "Risk Factors",
|
||||
"Item 1B": "Unresolved Staff Comments",
|
||||
"Item 1C": "Cybersecurity",
|
||||
"Item 2": "Properties",
|
||||
"Item 3": "Legal Proceedings",
|
||||
"Item 4": "Mine Safety Disclosures",
|
||||
"Item 5": "Market for Registrant’s Common Equity, Related Stockholder Matters and Issuer Purchases of Equity Securities",
|
||||
"Item 6": "[Reserved]",
|
||||
"Item 7": "Management’s Discussion and Analysis of Financial Condition and Results of Operations",
|
||||
"Item 7A": "Quantitative and Qualitative Disclosures About Market Risk",
|
||||
"Item 8": "Financial Statements and Supplementary Data",
|
||||
"Item 9": "Changes in and Disagreements with Accountants on Accounting and Financial Disclosure",
|
||||
"Item 9A": "Controls and Procedures",
|
||||
"Item 9B": "Other Information",
|
||||
"Item 9C": "Disclosure Regarding Foreign Jurisdictions that Prevent Inspections",
|
||||
"Item 10": "Directors, Executive Officers and Corporate Governance",
|
||||
"Item 11": "Executive Compensation",
|
||||
"Item 12": "Security Ownership of Certain Beneficial Owners and Management and Related Stockholder Matters",
|
||||
"Item 13": "Certain Relationships and Related Transactions, and Director Independence",
|
||||
"Item 14": "Principal Accountant Fees and Services",
|
||||
"Item 15": "Exhibit and Financial Statement Schedules",
|
||||
"Item 16": "Form 10-K Summary",
|
||||
}
|
||||
|
||||
items_match_4 = { # Match combined items (startswith comparison)
|
||||
"Item 1": "Items 1 and 2.",
|
||||
"Item 2": "Items 1 and 2.",
|
||||
}
|
||||
|
||||
items_match_5 = {
|
||||
"Item 1": "1. Business",
|
||||
"Item 1A": "1A. Risk Factors",
|
||||
"Item 1B": "1B. Unresolved Staff Comments",
|
||||
"Item 1C": "1C. Cybersecurity",
|
||||
"Item 2": "2. Properties",
|
||||
"Item 3": "3. Legal Proceedings",
|
||||
"Item 4": "4. Mine Safety Disclosures",
|
||||
"Item 5": "5. Market for Registrant's Common Equity, Related Stockholder Matters and Issuer Purchases of Equity Securities",
|
||||
"Item 6": "6. [Reserved]",
|
||||
"Item 7": "7. Management's Discussion and Analysis of Financial Condition and Results of Operations",
|
||||
"Item 7A": "7A. Quantitative and Qualitative Disclosures about Market Risk",
|
||||
"Item 8": "8. Financial Statements and Supplementary Data",
|
||||
"Item 9": "9. Changes in and Disagreements with Accountants on Accounting and Financial Disclosure",
|
||||
"Item 9A": "9A. Controls and Procedures",
|
||||
"Item 9B": "9B. Other Information",
|
||||
"Item 9C": "9C. Disclosure Regarding Foreign Jurisdictions that Prevent Inspections",
|
||||
"Item 10": "10. Directors, Executive Officers and Corporate Governance",
|
||||
"Item 11": "11. Executive Compensation",
|
||||
"Item 12": "12. Security Ownership of Certain Beneficial Owners and Management and Related Stockholder Matters",
|
||||
"Item 13": "13. Certain Relationships and Related Transactions, and Director Independence",
|
||||
"Item 14": "14. Principal Accountant Fees and Services",
|
||||
"Item 15": "15. Exhibit and Financial Statement Schedules",
|
||||
"Item 16": "16. Form 10-K Summary",
|
||||
}
|
||||
|
||||
items_match_6 = {
|
||||
"Item 1": "1 and 2. Business and Properties",
|
||||
"Item 2": "1 and 2. Business and Properties",
|
||||
}
|
||||
|
||||
# Matching function types:
|
||||
# 1. equal
|
||||
# 2. startswith
|
||||
# 3. contains
|
||||
# 4. regex
|
||||
match_function_map = [ # The current page has an order
|
||||
(
|
||||
items_match_4,
|
||||
lambda x, y: x.strip().lower().startswith(y.lower()),
|
||||
),
|
||||
(
|
||||
items_match_6,
|
||||
lambda x, y: x.strip().lower().startswith(y.lower()),
|
||||
),
|
||||
(items_match_0, lambda x, y: x.strip().lower() == y.lower()),
|
||||
(
|
||||
items_match_1,
|
||||
lambda x, y: x.strip().lower().startswith(y.lower()),
|
||||
),
|
||||
(items_match_2, lambda x, y: x.strip().lower() == y.lower()),
|
||||
(items_match_2_1, lambda x, y: x.strip().lower() == y.lower()),
|
||||
(items_match_3, lambda x, y: y.lower() in x.lower()),
|
||||
(items_match_5, lambda x, y: y.lower() in x.lower()),
|
||||
]
|
||||
|
||||
# Process matches and ensure unique items with ascending page numbers
|
||||
item_dict = {}
|
||||
|
||||
for one_link in link_info:
|
||||
for match_map, match_function in match_function_map:
|
||||
for item_name, match_text in match_map.items():
|
||||
for cell in one_link["text"]:
|
||||
if match_function(cell, match_text):
|
||||
link = one_link["link"]
|
||||
|
||||
# Only keep the first matching link for each item
|
||||
if item_name not in item_dict:
|
||||
item_dict[item_name] = link
|
||||
|
||||
# Convert to list format without sorting
|
||||
item_links = [(name, link) for name, link in item_dict.items()]
|
||||
return item_links
|
||||
|
||||
def extract_html(
|
||||
self, html_content: str, structure, markdown: bool = False
|
||||
) -> dict:
|
||||
"""
|
||||
Find rows in tables that:
|
||||
1. Contain links
|
||||
2. Have a separate cell storing page numbers
|
||||
"""
|
||||
index_table = self.extract_html_link_info(html_content)
|
||||
item_links = self.extract_item_and_split(index_table)
|
||||
|
||||
item_result = AssembleText.assemble_items(
|
||||
html_content, item_links, markdown=markdown
|
||||
)
|
||||
|
||||
item_to_part = {}
|
||||
for part_name in structure.structure:
|
||||
part_items = structure.get_part(part_name)
|
||||
for item_name in part_items:
|
||||
item_to_part[item_name.lower()] = part_name.lower()
|
||||
|
||||
# Step 4: Group items by part
|
||||
result = {part_name.lower(): {} for part_name in structure.structure}
|
||||
result["extracted"] = {}
|
||||
|
||||
for item_name, content in item_result.items():
|
||||
item_name = item_name.lower()
|
||||
part_name = item_to_part.get(item_name)
|
||||
if part_name:
|
||||
result[part_name][item_name] = content
|
||||
else:
|
||||
result["extracted"][item_name] = content
|
||||
|
||||
return result
|
||||
|
||||
|
||||
class ParsedHtml10Q:
    """Parser for 10-Q HTML documents that handles same item numbers in different parts."""

    @staticmethod
    def extract_element_id(href: str) -> str:
        """Extract element ID from an XLink href (the fragment after '#')."""
        return href.split("#")[-1]

    def extract_html_link_info(self, html_content: str) -> List:
        """Find rows in tables that contain links and page numbers.

        Tracks the current "Part X" heading while scanning rows so that each
        returned entry carries its part:
        ``{"part": ..., "text": [cell texts...], "link": element_id}``.
        """
        # NOTE(review): first argument is likely a non-breaking space (U+00A0)
        # being normalized to a regular space — confirm the literal's encoding.
        html_content = html_content.replace(" ", " ")
        soup = BeautifulSoup(html_content, "html.parser")

        # Remove script and style tags
        for tag in soup(["script", "style", "noscript"]):
            tag.decompose()

        link_info: List = []
        tables = soup.find_all("table")
        part_regex = re.compile(r"^\s*(Part\s+[IVXLC]+)\s*", re.IGNORECASE)
        part = None
        for table in tables:
            table_links: List[Dict] = []
            for row in table.find_all("tr"):
                row_text = row.get_text().strip()
                # Update the current part whenever a "Part X" heading row is seen
                part_match = part_regex.match(row_text)
                if part_match:
                    part = re.sub(r'\s+', ' ', part_match.group(1).lower())
                # Only direct <td> children of the row are considered
                cells = row.find_all("td", recursive=False)
                exist_page_num = False
                if cells:
                    text = [
                        cell.get_text(separator=" ", strip=True)
                        for cell in cells
                    ]
                    # A cell whose text is "12" or a "12-14" range marks a page number
                    for cell in cells:
                        cell_text = cell.text.strip()
                        if cell_text.isdigit() or (
                            "-" in cell_text
                            and all(
                                p.strip().isdigit()
                                for p in cell_text.split("-")
                            )
                        ):
                            exist_page_num = True

                    # Keep only in-document anchors (href starting with '#')
                    links = [
                        cell.find("a")
                        for cell in cells
                        if cell.find("a")
                        and cell.find("a").attrs.get("href")
                        and cell.find("a").attrs.get("href").startswith("#")
                    ]

                    if part and links and exist_page_num:
                        link = links[0].attrs.get("href").split("#")[-1]
                        table_links.append(
                            {"part": part, "text": text, "link": link}
                        )
            if table_links:
                link_info.append(table_links)

        return link_info

    @staticmethod
    def extract_item_and_split(link_info: List):
        """Extract and match 10-Q specific items, handling same item numbers in different parts.

        Args:
            link_info: Output of ``extract_html_link_info``.

        Returns:
            A list of ``((part, item_name), element_id)`` tuples; for each
            (part, item) pair only the first matching link is kept. Returns an
            empty list when nothing matched or the result looks implausible
            (more than 11 entries, i.e. the index parse is unreliable).
        """
        if not link_info:
            return []

        # Flatten the per-table row lists into one list of rows
        link_info = [item for sublist in link_info for item in sublist]

        # 10-Q specific item patterns
        items_match_1 = {  # Standard 10-Q item formats (startswith comparison)
            "part i": {
                "Item 1": "Item 1.",
                "Item 2": "Item 2.",
                "Item 3": "Item 3.",
                "Item 4": "Item 4.",
            },
            "part ii": {
                "Item 1": "Item 1.",
                "Item 1A": "Item 1A.",
                "Item 2": "Item 2.",
                "Item 3": "Item 3.",
                "Item 4": "Item 4.",
                "Item 5": "Item 5.",
                "Item 6": "Item 6.",
            },
            # NOTE(review): "Extarect" (sic) never equals a scanned part value,
            # so the Signatures mapping is effectively disabled — confirm intent
            # before renaming, since the key participates in part comparison.
            "Extarect": {"Signatures": "Signature"},
        }

        items_match_2 = {  # Part-prefixed items (exact match)
            "part i": {
                "Item 1": "part i, Item 1",
                "Item 2": "part i, Item 2",
                "Item 3": "part i, Item 3",
                "Item 4": "part i, Item 4",
            },
            "part ii": {
                "Item 1": "part ii, Item 1",
                "Item 1A": "part ii, Item 1A",
                "Item 2": "part ii, Item 2",
                "Item 3": "part ii, Item 3",
                "Item 4": "part ii, Item 4",
                "Item 5": "part ii, Item 5",
                "Item 6": "part ii, Item 6",
            },
        }

        items_match_3 = {  # Item descriptions (contains comparison)
            "part i": {
                "Item 1": "Financial Statements",
                "Item 2": "Management’s Discussion and Analysis of Financial Condition and Results of Operations",
                "Item 3": "Quantitative and Qualitative Disclosures About Market Risk",
                "Item 4": "Controls and Procedures",
            },
            "part ii": {
                "Item 1": "Legal Proceedings",
                "Item 1A": "Risk Factors",
                "Item 2": "Unregistered Sales of Equity Securities and Use of Proceeds",
                "Item 3": "Defaults Upon Senior Securities",
                "Item 4": "Mine Safety Disclosures",
                "Item 5": "Other Information",
                "Item 6": "Exhibits",
            },
        }

        # Order encodes priority: explicit headings before description matching
        match_function_map = [
            (
                items_match_1,
                lambda x, y: x.strip().lower().startswith(y.lower()),
            ),
            (items_match_2, lambda x, y: x.strip().lower() == y.lower()),
            (items_match_3, lambda x, y: y.lower() in x.lower()),
        ]

        # Process matches and ensure unique (part, item) pairs
        item_dict = {}

        for one_link in link_info:
            for match_map, match_function in match_function_map:
                for part in match_map:
                    for item_name, match_text in match_map[part].items():
                        for cell in one_link["text"]:
                            if match_function(cell, match_text):
                                link = one_link["link"]
                                # Fix: deduplicate on the (part, item_name) tuple
                                # actually used as the dict key. The previous
                                # check (`item_name not in item_dict`) could
                                # never match the tuple keys, so later links
                                # silently overwrote earlier ones.
                                if (
                                    one_link["part"] == part
                                    and (part, item_name) not in item_dict
                                ):
                                    item_dict[(part, item_name)] = link

        # Convert to list format without sorting
        item_links = [(name, link) for name, link in item_dict.items()]
        if len(item_links) > 11:
            # Fix: return [] (previously `{}`) so the return type is consistent
            # with every other path out of this function.
            return []
        return item_links

    def extract_html(self, html_content: str, structure, markdown: bool = True) -> dict:
        """Extract 10-Q items from HTML content, handling same item numbers in different parts.

        Returns:
            ``{"part i": {...}, "part ii": {...}, "extracted": {...}}`` where
            each inner dict maps lowercased item names to their content.
        """
        index_table = self.extract_html_link_info(html_content)
        item_links = self.extract_item_and_split(index_table)

        # Assemble items with part information preserved
        item_result = AssembleText.assemble_items(html_content, item_links, markdown=markdown)
        res = {
            "part i": {},
            "part ii": {},
            "extracted": {},
        }
        for one in item_result:
            if isinstance(one, str):
                # Keys without a part (plain item names) go to "extracted"
                res["extracted"][one.lower()] = item_result[one]
            elif isinstance(one, tuple):
                # (part, item) keys are grouped under their part
                res[one[0].lower()][one[1].lower()] = item_result[one]
        return res
|
||||
582
venv/lib/python3.10/site-packages/edgar/files/htmltools.py
Normal file
582
venv/lib/python3.10/site-packages/edgar/files/htmltools.py
Normal file
@@ -0,0 +1,582 @@
|
||||
import re
|
||||
import warnings
|
||||
from dataclasses import dataclass
|
||||
from functools import lru_cache, partial
|
||||
from io import StringIO
|
||||
from typing import Any, Callable, Dict, List, Optional
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from rich import box
|
||||
from rich.panel import Panel
|
||||
from rich.table import Table
|
||||
|
||||
from edgar.core import pandas_version
|
||||
from edgar.datatools import compress_dataframe
|
||||
from edgar.files.html_documents import Block, HtmlDocument, LinkBlock, TableBlock, table_to_markdown
|
||||
from edgar.richtools import repr_rich
|
||||
|
||||
# Public API of this module.
__all__ = [
    "Element",
    "extract_tables",
    "chunks2df",
    "html_to_text",
    "html_sections",
    "decimal_chunk_fn",
    "ChunkedDocument",
    "remove_bold_tags",
    "detect_decimal_items",
    "adjust_for_empty_items",
    "get_text_elements",
]
|
||||
|
||||
|
||||
@dataclass
class Element:
    """A parsed document element with optional summary and table data."""
    # Unique identifier of the element within the document
    id: str
    # Element kind; "text" is filtered on by get_text_elements — other values
    # (presumably e.g. "table") are not visible here, confirm against producers
    type: str
    # The underlying parsed element object (type not constrained here)
    element: Any
    # Optional short description of the element's content
    summary: Optional[str] = None
    # Parsed tabular content, when the element is a table
    table: Optional[pd.DataFrame] = None
|
||||
|
||||
|
||||
def extract_tables(html_str: str,
                   table_filters: List = None) -> List[pd.DataFrame]:
    """Parse every HTML table in *html_str* into a DataFrame.

    Each parsed table is compressed via ``compress_dataframe`` and empty
    tables are dropped. ``table_filters`` defaults to ``[filter_tiny_table]``
    but the per-table filtering step is currently disabled.
    """
    if not table_filters:
        table_filters = [filter_tiny_table]
    parsed = pd.read_html(StringIO(html_str))
    # Compress each table; the filter step is intentionally not applied
    compressed = []
    for frame in parsed:
        compressed.append(compress_dataframe(frame))
    # Drop tables that ended up with no rows
    return [frame for frame in compressed if len(frame) > 0]
|
||||
|
||||
|
||||
def html_sections(html_str: str,
                  ignore_tables: bool = False) -> List[str]:
    """Split the html into a list of text sections."""
    doc = HtmlDocument.from_html(html_str)
    return [section for section in doc.generate_text_chunks(ignore_tables=ignore_tables)]
|
||||
|
||||
|
||||
def html_to_text(html_str: str,
                 ignore_tables: bool = True,
                 sep: str = '\n'
                 ) -> str:
    """Convert filing html to plain text.

    When *ignore_tables* is True, table content is skipped and the remaining
    text chunks are joined with *sep*; otherwise the full document text is
    returned.
    """
    doc = HtmlDocument.from_html(html_str)
    if ignore_tables:
        return sep.join(doc.generate_text_chunks(ignore_tables=True))
    return doc.text
|
||||
|
||||
|
||||
def is_inline_xbrl(html: str) -> bool:
    """True when the document declares the inline-XBRL namespace near the top."""
    # Only the first 2000 characters are inspected — the declaration lives
    # in the document head.
    head = html[:2000]
    return "xmlns:ix=" in head
|
||||
|
||||
|
||||
def filter_tiny_table(table: pd.DataFrame, min_rows: int = 1, min_cols: int = 1):
    """Return True when *table* has at least *min_rows* rows and *min_cols* columns."""
    enough_rows = len(table) >= min_rows
    enough_cols = len(table.columns) >= min_cols
    return enough_rows and enough_cols
|
||||
|
||||
|
||||
def remove_bold_tags(html_content):
    """Unwrap <b> and <strong> tags, keeping their inner content.

    Fixes over the previous version: tags carrying attributes
    (``<b class="x">``), content spanning newlines, and upper/mixed-case
    tag names are now handled.
    """
    flags = re.IGNORECASE | re.DOTALL
    # \b after the tag name prevents matching e.g. <body>
    html_content = re.sub(r'<b\b[^>]*>(.*?)</b>', r'\1', html_content, flags=flags)
    html_content = re.sub(r'<strong\b[^>]*>(.*?)</strong>', r'\1', html_content, flags=flags)
    return html_content
|
||||
|
||||
|
||||
def get_text_elements(elements: List[Element]):
    """Filter *elements* down to those whose type is "text"."""
    return list(filter(lambda element: element.type == "text", elements))
|
||||
|
||||
|
||||
@lru_cache(maxsize=8)
def chunk(html: str):
    """Break filing html into a list of chunks, cached per html string."""
    parsed = HtmlDocument.from_html(html)
    return [c for c in parsed.generate_chunks()]
|
||||
|
||||
|
||||
# Matches integer-style item headings at line start, e.g. "Item 1", "Item 1A",
# with an optional trailing period (used by 10-K/10-Q detection).
int_item_pattern = r"^(Item\s{1,3}[0-9]{1,2}[A-Z]?)\.?"
# Matches decimal-style item headings at line start, e.g. "Item 5.02"
# (the 8-K item numbering form).
decimal_item_pattern = r"^(Item\s{1,3}[0-9]{1,2}\.[0-9]{2})\.?"
|
||||
|
||||
|
||||
def detect_table_of_contents(text: str):
    """Heuristic: text mentioning 'item' more than 10 times is treated as a TOC."""
    mentions = text.lower().count('item')
    return mentions > 10
|
||||
|
||||
|
||||
def detect_signature(text: str) -> bool:
    """Detect whether *text* is a signature block."""
    # Primary check: the text begins with "SIGNATURE" (case-insensitive)
    if re.match(pattern='^SIGNATURE', string=text, flags=re.IGNORECASE | re.MULTILINE):
        return True
    # Fallback: the standard signature boilerplate sentence
    return 'to be signed on its behalf by the undersigned' in text
|
||||
|
||||
|
||||
def detect_int_items(text: pd.Series):
    """Extract integer-style item headings (e.g. "Item 1A") from each row of *text*."""
    flags = re.IGNORECASE | re.MULTILINE
    return text.str.extract(int_item_pattern, expand=False, flags=flags)
|
||||
|
||||
def detect_part(text: pd.Series) -> pd.Series:
    """
    Detect and extract 'Part' sections such as 'PART I', 'Part II', etc.

    Matching is case-insensitive and anchored to the start of a line, so it
    handles formats like 'PART I. Financial Information', 'Part II', and
    'PART III — Executive Overview'.

    Returns:
        pd.Series: The extracted 'PART X' values, uppercased with collapsed
        whitespace, or NaN where no heading is found.
    """
    heading_pattern = r'^\b(PART\s+[IVXLC]+)\b'
    found = text.str.extract(heading_pattern, flags=re.IGNORECASE | re.MULTILINE, expand=False)
    # Normalize: uppercase and collapse internal whitespace ('Part  i' -> 'PART I')
    normalized = found.str.upper()
    return normalized.str.replace(r'\s+', ' ', regex=True)
|
||||
|
||||
def detect_decimal_items(text: pd.Series):
    """Extract decimal-style item headings (e.g. "Item 5.02") from each row of *text*."""
    flags = re.IGNORECASE | re.MULTILINE
    return text.str.extract(decimal_item_pattern, expand=False, flags=flags)
|
||||
|
||||
|
||||
def find_next_item(index, normalized_items):
    """Return the first truthy entry after *index* in *normalized_items*, or None."""
    remaining = normalized_items[index + 1:]
    return next((entry for entry in remaining if entry), None)
|
||||
|
||||
|
||||
def normalize_item(item):
    """Strip everything but letters, digits and spaces from *item*; NaN passes through."""
    if pd.isna(item):
        return item
    # Remove all but numbers and letters (and spaces)
    return re.sub(r"[^0-9A-Za-z ]", "", item)
|
||||
|
||||
|
||||
def extract_numeric_alpha_parts(item):
    """Split an item label like "Item 9A" into its numeric and trailing-letter parts.

    Args:
        item: An item label (e.g. "Item 9A"); may be falsy.

    Returns:
        (numeric_part, alpha_part): e.g. "Item 9A" -> (9, "A").
        Falsy or digit-free input yields numeric_part 0; missing trailing
        capital letter yields alpha_part "".
    """
    # Fix: previously a truthy item with no digits raised AttributeError
    # (re.search(...) is None); guard both searches the same way.
    numeric_match = re.search(r"[0-9]+", item) if item else None
    numeric_part = int(numeric_match.group()) if numeric_match else 0
    alpha_match = re.search(r"[A-Z]$", item) if item else None
    alpha_part = alpha_match.group() if alpha_match else ''
    return numeric_part, alpha_part
|
||||
|
||||
|
||||
def is_valid_sequence(current_item, last_valid_item, next_available_item):
    """
    Determine if the current item is valid considering the last and next available items.

    An item is accepted when it advances the sequence: a later letter on the
    same number as the last accepted item (1A -> 1B), a letter not past the
    next item's letter when the numbers match, or a number strictly between
    the last and next items' numbers.
    """
    # No decision is possible without both a current and an upcoming item
    if not current_item or pd.isna(current_item) or not next_available_item or pd.isna(next_available_item):
        return False

    current_item_num, current_item_alpha = extract_numeric_alpha_parts(current_item)
    last_item_num, last_item_alpha = extract_numeric_alpha_parts(last_valid_item)
    next_item_num, next_item_alpha = extract_numeric_alpha_parts(next_available_item)

    # Check if the current item is greater than the last valid item and less than or equal to the next available item
    if current_item_num == last_item_num:
        # Same number as last accepted item: letter must advance (1A -> 1B)
        return current_item_alpha > last_item_alpha
    elif current_item_num == next_item_num:
        # Same number as the upcoming item: must not overshoot its letter
        # (a bare-numbered next item accepts any letter)
        return current_item_alpha < next_item_alpha or next_item_alpha == ''
    else:
        return last_item_num < current_item_num <= next_item_num
|
||||
|
||||
|
||||
def adjust_detected_items(chunk_df: pd.DataFrame, **kwargs) -> pd.DataFrame:
    """
    Ensure that the items are in sequence and filter out any out of sequence items.

    Rows up to and including the table-of-contents chunk are skipped; every
    detected item after that is kept only if it fits between the last accepted
    item and the next detected one (see ``is_valid_sequence``). The result is
    written to a new 'Item' column; out-of-sequence rows get pd.NA.
    """
    # Strip punctuation so "Item 1A." and "Item 1A" compare equal
    chunk_df['NormalizedItem'] = chunk_df['DetectedItem'].apply(normalize_item)
    normalized_items = chunk_df['NormalizedItem'].replace([np.nan], [None]).tolist()

    last_valid_item = ""
    valid_items = pd.Series(index=chunk_df.index, dtype=object)  # Create a series to store valid items

    # First find the index of the table of contents toc.
    toc_index_rows = chunk_df[chunk_df.Toc.notnull() & chunk_df.Toc]
    # If not found set to 0
    toc_index = toc_index_rows.index[0] if len(toc_index_rows) > 0 else 0

    # Iterate only through rows with non-null 'Item' starting at toc_index + 1

    for index, row in chunk_df.iterrows():
        if index < toc_index + 1:
            # Skip everything up to and including the TOC chunk
            continue
        current_item = row['NormalizedItem']
        next_available_item = find_next_item(index, normalized_items)

        if is_valid_sequence(current_item, last_valid_item, next_available_item):
            valid_items[index] = current_item
            last_valid_item = current_item  # Update the last valid item
        else:
            valid_items[index] = pd.NA  # Mark as invalid/out of sequence

    chunk_df['Item'] = valid_items
    return chunk_df
|
||||
|
||||
|
||||
def adjust_for_empty_items(chunk_df: pd.DataFrame,
                           **kwargs) -> pd.DataFrame:
    """
    Repair rows where an item heading and the following item share one chunk.

    For each detected item: if the chunk text starts with "<item> <title>"
    immediately followed by another "Item N" heading, the leading heading is
    stripped and the row is re-labelled with the decimal item (e.g. "Item
    5.02") found in the remaining text.

    Keyword Args:
        item_structure: Object exposing ``get_item(item)`` returning a dict
            with a 'Title' entry for the item.
    """
    chunk_df['Item'] = chunk_df.DetectedItem
    for index, row in chunk_df[chunk_df.DetectedItem.notnull()].iterrows():
        item = row.Item
        # Get item_structure from kwargs
        item_structure = kwargs.get('item_structure')
        structure = item_structure.get_item(item)
        if not structure:
            # NOTE(review): `break` abandons ALL remaining rows when a single
            # item is unknown to the structure — confirm `continue` was not
            # intended here.
            break
        title = structure.get('Title')
        text = row.Text
        # Look for Item NUM Description Item in the text
        # NOTE(review): `item` and `title` are interpolated unescaped into the
        # regex; titles containing regex metacharacters could misbehave.
        pattern = rf"^({item}.? {title}\W+)"
        match = re.search(pattern + "Item [1-9]", text, flags=re.IGNORECASE | re.MULTILINE)
        if match:
            # Strip the leading heading so the real item heading is exposed
            text = re.sub(pattern, "", text, flags=re.IGNORECASE | re.MULTILINE)

        # extract the item from text using decimal_item_pattern
        match = re.search(decimal_item_pattern, text, flags=re.IGNORECASE | re.MULTILINE)
        if match:
            new_item = match.group(1)
            chunk_df.loc[index, 'Item'] = new_item

    return chunk_df
|
||||
|
||||
|
||||
def _render_blocks_using_old_markdown_tables(blocks: List[Block]):
    """
    Render blocks to text, emitting tables in the old markdown style.

    The item chunking logic depends on this legacy table rendering, so it is
    kept separate from the newer table rendering code path.
    """
    rendered = []
    for block in blocks:
        if isinstance(block, TableBlock):
            rendered.append(table_to_markdown(block.table_element))
        else:
            rendered.append(block.get_text())
    return "".join(rendered).strip()
|
||||
|
||||
def chunks2df(chunks: List[List[Block]],
              item_detector: Callable[[pd.Series], pd.Series] = detect_int_items,
              item_adjuster: Callable[[pd.DataFrame, Dict[str, Any]], pd.DataFrame] = adjust_detected_items,
              item_structure=None,
              ) -> pd.DataFrame:
    """Convert the chunks to a dataframe

    :param item_detector: A function that detects the item in the text column
    :param item_adjuster: A function that finds issues like out of sequence items and adjusts the item column
    :param item_structure: A dictionary of items specific to each filing e.g. 8-K, 10-K, 10-Q

    Returns a DataFrame with columns Text, Table, Chars, Signature, TocLink,
    Toc, Empty, Part, Item — one row per chunk, with Item/Part forward-filled.
    """
    # Create a dataframe from the chunks. Add columns as necessary
    chunk_df = pd.DataFrame(
        [{'Text': _render_blocks_using_old_markdown_tables(blocks),
          # Fix: `blocks` is a list of Block, so `isinstance(blocks, TableBlock)`
          # was always False; flag the chunk when ANY of its blocks is a table.
          'Table': any(isinstance(block, TableBlock) for block in blocks)}
         for blocks in chunks]
    ).assign(Chars=lambda df: df.Text.apply(len),
             Signature=lambda df: df.Text.apply(detect_signature).fillna(""),
             TocLink=lambda df: df.Text.str.match('^Table of Contents$',
                                                  flags=re.IGNORECASE | re.MULTILINE),
             # Only the first 100 chunks are scanned for a table of contents
             Toc=lambda df: df.Text.head(100).apply(detect_table_of_contents),
             Empty=lambda df: df.Text.str.contains('^$', na=True),
             Part=lambda df: detect_part(df.Text),
             Item=lambda df: item_detector(df.Text)
             )
    # If the row is 'toc' then set the item and part to empty
    chunk_df.loc[chunk_df.Toc.notnull() & chunk_df.Toc, 'Item'] = ""
    # if item_adjuster:
    #     chunk_df = item_adjuster(chunk_df, **{'item_structure': item_structure, 'item_detector': item_detector})
    # Forward fill item and parts
    # Handle deprecation warning in fillna(method='ffill')
    if pandas_version >= (2, 1, 0):
        # Opt-in to pandas future behavior to avoid silent downcasting warnings
        with pd.option_context('future.no_silent_downcasting', True):
            chunk_df['Item'] = chunk_df['Item'].ffill()
            chunk_df['Part'] = chunk_df['Part'].ffill()
    else:
        chunk_df.Item = chunk_df.Item.fillna(method='ffill')
        chunk_df.Part = chunk_df.Part.fillna(method='ffill')

    # After forward fill handle the signature at the bottom
    signature_rows = chunk_df[chunk_df.Signature]
    if len(signature_rows) > 0:
        signature_loc = signature_rows.index[0]
        # Nothing at or after the signature block belongs to an item
        chunk_df.loc[signature_loc:, 'Item'] = pd.NA
        chunk_df.Signature = chunk_df.Signature.fillna("")

    # Fill the Item column with "" then set to title case
    chunk_df.Item = chunk_df.Item.fillna("").str.title()
    chunk_df.Part = chunk_df.Part.fillna("").str.title()

    # Normalize spaces in item
    chunk_df.Item = chunk_df.Item.apply(lambda item: re.sub(r'\s+', ' ', item))
    chunk_df.Part = chunk_df.Part.apply(lambda part: re.sub(r'\s+', ' ', part).strip())

    # Finalize the columns
    chunk_df = chunk_df[['Text', 'Table', 'Chars', 'Signature', 'TocLink', 'Toc', 'Empty', 'Part', 'Item']]

    return chunk_df
|
||||
|
||||
|
||||
# This function is used by 8-K and other filings that have the item form 1.02 for example.
# Pre-configured chunker: detects decimal item headings (e.g. "Item 5.02") and
# repairs rows whose heading and body were merged into one chunk.
decimal_chunk_fn = partial(chunks2df,
                           item_detector=detect_decimal_items,
                           item_adjuster=adjust_for_empty_items)
|
||||
|
||||
|
||||
class ChunkedDocument:
|
||||
"""
|
||||
Contains the html as broken into chunks
|
||||
"""
|
||||
|
||||
    def __init__(self,
                 html: str,
                 chunk_fn: Callable[[List], pd.DataFrame] = chunks2df,
                 prefix_src: str = ""):
        """
        :param html: The filing html
        :param chunk_fn: A function that converts the chunks to a dataframe
        :param prefix_src: Prefix applied to link targets when rendering
            LinkBlocks as markdown (see assemble_block_text/markdown)
        """
        self.chunks = chunk(html)
        self._chunked_data = chunk_fn(self.chunks)
        self.chunk_fn = chunk_fn
        self.prefix_src = prefix_src
        # Per-document parse cache; populated outside this class — purpose
        # not visible here, confirm against callers
        self.document_id_parse: Dict = {}
|
||||
|
||||
@lru_cache(maxsize=4)
|
||||
def as_dataframe(self):
|
||||
return self.chunk_fn(self.chunks)
|
||||
|
||||
def show_items(self, df_query: str, *columns):
|
||||
result = self._chunked_data.query(df_query)
|
||||
if len(columns) > 0:
|
||||
columns = ["Text"] + list(columns)
|
||||
result = result.filter(columns)
|
||||
|
||||
return result
|
||||
|
||||
def list_items(self):
|
||||
return [item for item in self._chunked_data.Item.drop_duplicates().tolist() if item]
|
||||
|
||||
    def _chunks_for(self, item_or_part: str, col: str = 'Item'):
        """Yield the chunks whose *col* value equals *item_or_part* (case-insensitive)."""
        chunk_df = self._chunked_data

        # Handle cases where the item has the decimal point e.g. 5.02
        item_or_part = item_or_part.replace('.', r'\.')
        pattern = re.compile(rf'^{item_or_part}$', flags=re.IGNORECASE)

        col_mask = chunk_df[col].str.match(pattern)
        # NOTE(review): for a boolean, non-null Toc column this expression
        # simplifies to all-True, so TOC rows are NOT actually excluded here —
        # `~(chunk_df.Toc.notnull() & chunk_df.Toc)` was probably intended.
        toc_mask = ~(~chunk_df.Toc.notnull() & chunk_df.Toc)
        empty_mask = ~chunk_df.Empty

        mask = col_mask & toc_mask & empty_mask

        for i in mask[mask].index:
            yield self.chunks[i]
|
||||
|
||||
    def _chunks_mul_for(self, part: str, item: str):
        """Yield the chunks matching both *part* and *item* (case-insensitive),
        keeping only the longest run of near-consecutive row indices.

        Matches separated by more than 5 rows are treated as distinct segments;
        only the longest segment is yielded and the rest are reported via a
        warning, since scattered matches usually come from stray references
        rather than the item body.
        """
        chunk_df = self._chunked_data

        # Handle cases where the item has the decimal point e.g. 5.02
        part = part.replace('.', r'\.')
        item = item.replace('.', r'\.')
        pattern_part = re.compile(rf'^{part}$', flags=re.IGNORECASE)
        pattern_item = re.compile(rf'^{item}$', flags=re.IGNORECASE)

        item_mask = chunk_df["Item"].str.match(pattern_item)
        part_mask = chunk_df["Part"].str.match(pattern_part)
        # NOTE(review): simplifies to all-True for a boolean, non-null Toc
        # column — see the matching note in _chunks_for.
        toc_mask = ~(~chunk_df.Toc.notnull() & chunk_df.Toc)
        empty_mask = ~chunk_df.Empty
        mask = part_mask & item_mask & toc_mask & empty_mask

        # Process to keep only consecutive indices, discard non-consecutive head/tail indices with warning
        index_list = mask[mask].index.to_list()
        if not index_list:
            return

        continuous_segments = []
        current_segment = [index_list[0]]

        for i in range(1, len(index_list)):
            # Indices within 5 rows of the previous one extend the segment
            if index_list[i] <= current_segment[-1] + 5:
                current_segment.append(index_list[i])
            else:
                continuous_segments.append(current_segment)
                current_segment = [index_list[i]]

        continuous_segments.append(current_segment)

        # retain only the longest continuous segment
        longest_segment = max(continuous_segments, key=len)

        # Warn about the discarded (dirty) content so it can be inspected
        if len(continuous_segments) > 1:
            discarded_indices = []
            for segment in continuous_segments:
                if segment != longest_segment:
                    discarded_indices.extend(segment)
            warnings.warn(
                f"Discarded non-continuous indices: {discarded_indices}. "
                f"""content: {''.join([
                    ''.join(block.get_text() for block in self.chunks[idx])
                    for idx in discarded_indices
                ])}"""
            )
        for i in longest_segment:
            yield self.chunks[i]
|
||||
|
||||
def chunks_for_item(self, item: str):
    """
    Return the chunks of the document belonging to a given item.

    Args:
        item (str): The item heading to retrieve chunks for, e.g. "Item 1A".

    Returns:
        A generator over the matching chunks (each chunk is a list of blocks),
        as produced by `_chunks_for` filtering on the 'Item' column.
    """
    return self._chunks_for(item, col='Item')
|
||||
|
||||
def chunks_for_part(self, part: str):
    """Return the chunks belonging to a given part heading, e.g. "Part I"."""
    return self._chunks_for(part, col='Part')
|
||||
|
||||
def average_chunk_size(self):
    """Mean number of characters per chunk, truncated to an int."""
    mean_chars = self._chunked_data["Chars"].mean()
    return int(mean_chars)
|
||||
|
||||
def tables(self):
    """Yield every TableBlock found across all chunks, in document order."""
    for blocks in self.chunks:
        yield from (blk for blk in blocks if isinstance(blk, TableBlock))
|
||||
|
||||
def assemble_block_text(self, chunks: List[Block]):
    """Yield plain text for the given chunks.

    Without a `prefix_src`, each chunk is emitted as one joined string; with
    a `prefix_src`, blocks are emitted individually so that LinkBlocks can
    render as markdown carrying the source prefix.
    """
    if not self.prefix_src:
        for blocks in chunks:
            yield "".join(blk.get_text() for blk in blocks)
        return

    for blocks in chunks:
        for blk in blocks:
            if isinstance(blk, LinkBlock):
                yield blk.to_markdown(prefix_src=self.prefix_src)
            else:
                yield blk.get_text()
|
||||
|
||||
def assemble_block_markdown(self, chunks: List[Block]):
    """Yield markdown for the given chunks (mirrors assemble_block_text).

    With a `prefix_src`, blocks are emitted one by one so LinkBlocks can
    carry the source prefix; otherwise each chunk becomes one joined string.
    """
    if not self.prefix_src:
        for blocks in chunks:
            yield "".join(blk.to_markdown() for blk in blocks)
        return

    for blocks in chunks:
        for blk in blocks:
            if isinstance(blk, LinkBlock):
                yield blk.to_markdown(prefix_src=self.prefix_src)
            else:
                yield blk.to_markdown()
|
||||
|
||||
def get_item_with_part(self, part: str, item: str, markdown: bool = False):
    """Return the text (or markdown) of `item` within `part`.

    Returns None when `part` is not a string; any trailing "PART ..." line
    is stripped from the assembled result.
    """
    if not isinstance(part, str):
        return None
    section_chunks = list(self._chunks_mul_for(part, item))
    assembler = self.assemble_block_markdown if markdown else self.assemble_block_text
    assembled = "".join(assembler(section_chunks))
    return self.clean_part_line(assembled)
|
||||
|
||||
@staticmethod
|
||||
def clean_part_line(text:str):
|
||||
res = text.rstrip("\n")
|
||||
last_line = res.split("\n")[-1]
|
||||
if re.match(r'^\b(PART\s+[IVXLC]+)\b', last_line):
|
||||
res = res.rstrip(last_line).rstrip()
|
||||
return res
|
||||
|
||||
def get_signature(self, markdown: bool = False):
    """Assemble the signature section of the filing as text or markdown."""
    sig_index = self._chunked_data[self._chunked_data.Signature].index
    sig_chunks = [self.chunks[idx] for idx in sig_index]
    assembler = self.assemble_block_markdown if markdown else self.assemble_block_text
    assembled = "".join(assembler(sig_chunks))
    return self.clean_part_line(assembled)
|
||||
|
||||
|
||||
def get_introduction(self, markdown: bool = False):
    """
    Extract and return the introduction section of the filing document.

    The introduction is the content preceding the Part/Item boundary.
    NOTE(review): the original docstring said "before the *first* valid Part
    or Item", but the code takes the LATER of the two first occurrences
    (max of first Part index and first Item index) — confirm intent before
    changing.

    Returns:
        str: The extracted introduction text, or an empty string if none found.
    """
    part_indices = self._chunked_data[self._chunked_data.Part != ""].index
    item_indices = self._chunked_data[self._chunked_data.Item != ""].index

    if len(part_indices) == 0 and len(item_indices) == 0:
        return ""

    first_part = part_indices[0] if len(part_indices) else 0
    first_item = item_indices[0] if len(item_indices) else 0
    intro_index = max(first_part, first_item)

    if intro_index == 0:
        return ""

    intro_chunks = [self.chunks[idx] for idx in range(intro_index)]
    assembler = self.assemble_block_markdown if markdown else self.assemble_block_text
    return self.clean_part_line("".join(assembler(intro_chunks)))
|
||||
|
||||
def __len__(self):
    """Return the number of chunks in the document."""
    return len(self.chunks)
|
||||
|
||||
def __getitem__(self, item, markdown: bool = False):
    """Index by chunk position (int) or item heading (str).

    Returns the assembled text (or markdown) for the selection, or None when
    the key type is unsupported or no chunks match. Note that the `markdown`
    flag cannot be passed through `doc[...]` syntax; call the method directly
    to use it.
    """
    if isinstance(item, int):
        selected = [self.chunks[item]]
    elif isinstance(item, str):
        selected = list(self.chunks_for_item(item))
    else:
        return None
    if not selected:
        return None
    # Render the nested list-of-blocks selection into one string.
    assembler = self.assemble_block_markdown if markdown else self.assemble_block_text
    return "".join(assembler(selected))
|
||||
|
||||
def __iter__(self):
    """Iterate over the document's chunks in order."""
    return iter(self.chunks)
|
||||
|
||||
def __rich__(self):
    """Render a one-row summary (chunk count, items, average chunk size) as a rich panel."""
    table = Table("Chunks",
                  "Items",
                  "Avg Size", box=box.SIMPLE)
    table.add_row(str(len(self.chunks)),
                  ",".join(self.list_items()),
                  str(self.average_chunk_size()),
                  )
    return Panel(table, box=box.ROUNDED, title="HTML Document")
|
||||
|
||||
def __repr__(self):
    """Plain-text repr rendered from the rich summary panel."""
    return repr_rich(self.__rich__())
|
||||
|
||||
|
||||
198
venv/lib/python3.10/site-packages/edgar/files/markdown.py
Normal file
198
venv/lib/python3.10/site-packages/edgar/files/markdown.py
Normal file
@@ -0,0 +1,198 @@
|
||||
import re
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
|
||||
from edgar.files.html import BaseNode, Document
|
||||
from edgar.files.tables import ProcessedTable, TableProcessor
|
||||
|
||||
__all__ = ['to_markdown', 'MarkdownRenderer']
|
||||
|
||||
|
||||
class MarkdownRenderer:
    """Render a parsed `Document` to a Markdown string.

    Walks `document.nodes` in order, converts each node type (text block,
    table, heading, page break) to Markdown, then normalises the spacing of
    the joined output.
    """

    def __init__(self, document: Document, start_page_number: int = 0):
        self.document = document
        # Offset added to every rendered page-break number.
        self.start_page_number = start_page_number
        self.toc_entries: List[Tuple[int, str, str]] = []  # level, text, anchor
        self.reference_links: Dict[str, str] = {}
        self.current_section = ""

    def render(self) -> str:
        """Render complete document"""
        rendered_parts = []

        for node in self.document.nodes:
            rendered = ""
            if node.type == 'text_block':  # Changed from 'paragraph'
                rendered = self._render_text_block(node)
            elif node.type == 'table':
                # Tables are normalised first; an unprocessable table renders as "".
                processed_table = TableProcessor.process_table(node)
                rendered = self._render_table(processed_table) if processed_table else ""
            elif node.type == 'heading':
                rendered = self._render_heading(node)
            elif node.type == 'page_break':
                rendered = self._render_page_break(node)

            if rendered:
                rendered_parts.append(rendered.rstrip())  # Remove trailing whitespace

        # Join with single newline and clean up multiple newlines
        return self._clean_spacing('\n\n'.join(filter(None, rendered_parts)))

    def _clean_spacing(self, text: str) -> str:
        """Clean up spacing while maintaining valid markdown"""
        # Replace 3 or more newlines with 2 newlines
        text = re.sub(r'\n{3,}', '\n\n', text)

        # Fix header spacing by treating the header line as a complete unit
        text = re.sub(r'\n*(#{1,6} [^\n]*[A-Za-z0-9][^\n]*)\n*', r'\n\n\1\n', text)

        # Clean up spacing around paragraphs
        text = re.sub(r'\n{2,}(?=\S)', '\n\n', text)

        # Replace non-breaking spaces with ordinary spaces.
        text = re.sub("\xa0", " ", text)

        # Trim leading/trailing whitespace
        return text.strip()

    def _render_header(self) -> str:
        """Render SEC filing header with metadata"""
        header_parts = []

        # Try to find filing type and registration number
        for node in self.document.nodes[:5]:  # Check first few nodes
            if node.type == 'text_block':  # Changed from 'paragraph'
                text = node.content
                if 'registration no.' in text.lower():
                    header_parts.append(f"**Registration No.:** {text.split('.')[-1].strip()}")
                if 'filed pursuant to' in text.lower():
                    header_parts.append(f"**Filing Type:** {text.strip()}")

        return "\n".join(header_parts) if header_parts else ""

    def _render_heading(self, node: BaseNode) -> str:
        """Render heading with metadata support"""
        if node.type != 'heading':
            raise ValueError(f"Expected heading node, got {node.type}")

        prefix = '#' * node.level
        text = node.content

        # Check metadata for any special rendering instructions
        if node.get_metadata('render_style') == 'centered':
            return f"{prefix} <div align='center'>{text}</div>"

        return f"{prefix} {text}"

    def _render_text_block(self, node: BaseNode) -> str:
        """Render text block (formerly paragraph) with metadata support"""
        if node.type != 'text_block':
            raise ValueError(f"Expected text_block node, got {node.type}")

        text = node.content

        # Apply styling
        if node.style:
            if node.style.font_weight == 'bold':
                text = f"**{text}**"
            if node.style.text_align == 'center':
                text = f"<div align='center'>{text}</div>"

        # Check metadata for special handling
        if node.get_metadata('is_note', False):
            text = f"> Note: {text}"
        elif node.get_metadata('is_quote', False):
            text = f"> {text}"

        return text

    def _render_table(self, processed: ProcessedTable) -> str:
        """Render processed table as Markdown"""
        if not processed.data_rows:
            return ""

        # Calculate column widths
        col_widths = []
        for col_idx in range(len(processed.data_rows[0])):
            # Consider headers in width calculation
            col_content = []
            if processed.headers:
                col_content.append(processed.headers[col_idx])
            col_content.extend(row[col_idx] for row in processed.data_rows)

            # Calculate max width, considering multiline content and handling empty columns
            widths = []
            for cell in col_content:
                if cell.strip():  # Only consider non-empty cells
                    widths.extend(len(line) for line in cell.split('\n'))

            # Default to minimum width of 3 if column is empty
            max_width = max(widths) if widths else 3
            col_widths.append(max_width)

        # Build table lines
        lines = []

        # Add headers if present
        if processed.headers:
            header_lines = []
            for col_idx, header in enumerate(processed.headers):
                header_lines.append(self._format_markdown_cell(
                    header, col_widths[col_idx], processed.column_alignments[col_idx]))
            lines.append('|' + '|'.join(header_lines) + '|')

            # Add separator line
            # NOTE(review): only "left" and right separators are produced;
            # confirm column_alignments never contains "center".
            separators = []
            for idx, width in enumerate(col_widths):
                align = processed.column_alignments[idx]
                if align == "left":
                    sep = ':' + '-' * (width + 1)
                else:  # right
                    sep = '-' * (width + 1) + ':'
                separators.append(sep)
            lines.append('|' + '|'.join(separators) + '|')

        # Add data rows
        for row in processed.data_rows:
            row_cells = []
            for col_idx, cell in enumerate(row):
                row_cells.append(self._format_markdown_cell(
                    cell, col_widths[col_idx], processed.column_alignments[col_idx]))
            lines.append('|' + '|'.join(row_cells) + '|')

        return '\n'.join(lines)

    def _format_markdown_cell(self, content: str, width: int, alignment: str) -> str:
        """Format cell content for markdown table"""
        if not content.strip():
            return ' ' * (width + 2)  # Add padding

        lines = content.split('\n')
        formatted_lines = []
        for line in lines:
            if alignment == "left":
                formatted_lines.append(f" {line:<{width}} ")
            else:  # right
                formatted_lines.append(f" {line:>{width}} ")

        return '\n'.join(formatted_lines)

    def _render_page_break(self, node: BaseNode) -> str:
        """Render page break as delimiter"""
        adjusted_page_number = node.page_number + self.start_page_number
        return f"{{{adjusted_page_number}}}------------------------------------------------"
|
||||
|
||||
|
||||
def to_markdown(html_content: str, include_page_breaks: bool = False, start_page_number: int = 0) -> Optional[str]:
    """Convert HTML content to markdown with optional page breaks

    Args:
        html_content: HTML string to convert
        include_page_breaks: Whether to include page break markers
        start_page_number: Starting page number for page break markers (default: 0)

    Returns:
        Markdown string or None if parsing failed
    """
    parsed_document = Document.parse(html_content, include_page_breaks=include_page_breaks)
    return parsed_document.to_markdown(start_page_number=start_page_number) if parsed_document else None
|
||||
244
venv/lib/python3.10/site-packages/edgar/files/page_breaks.py
Normal file
244
venv/lib/python3.10/site-packages/edgar/files/page_breaks.py
Normal file
@@ -0,0 +1,244 @@
|
||||
"""Page break detection utilities for SEC documents.
|
||||
|
||||
This module provides shared page break detection functionality that can be used
|
||||
by both the edgar library and external projects that need to detect page breaks
|
||||
in SEC HTML documents.
|
||||
"""
|
||||
|
||||
import re
|
||||
from typing import Any, Dict, List
|
||||
|
||||
from bs4 import Tag
|
||||
|
||||
|
||||
class PageBreakDetector:
    """Detects page breaks in SEC HTML documents.

    Detection covers four signal types: inline CSS ``page-break-*: always``
    styles, well-known page-break class names, styled ``<hr>`` separators,
    and ``<div>`` containers whose fixed dimensions match a printed page.
    """

    # Class-based page break selectors
    CLASS_BASED_SELECTORS = [
        'div.BRPFPageBreak',
        'div.pagebreak',
        'div.page-break'
    ]

    # HR elements with specific styling
    HR_PAGE_BREAK_SELECTORS = [
        'hr[style*="height:3px"]',
        'hr[style*="height: 3px"]'
    ]

    @staticmethod
    def _find_page_like_divs(element: Tag) -> List[Dict[str, Any]]:
        """Find div elements with page-like dimensions.

        Returns one descriptor dict per matching div; ``is_page_div`` is True
        to distinguish these records from explicit page-break markers.
        """
        page_divs = []
        divs = element.find_all('div')

        for div in divs:
            style = div.get('style', '')
            if not style:
                continue

            if PageBreakDetector._is_page_like_div(style):
                page_divs.append({
                    'element': div.name,
                    'selector': 'page-like-div',
                    'style': style,
                    'classes': div.get('class', []),
                    'is_page_div': True
                })

        return page_divs

    @staticmethod
    def _is_page_like_div(style: str) -> bool:
        """Check if a div has page-like dimensions based on its style.

        Args:
            style: CSS style string to analyze

        Returns:
            True if the div has page-like dimensions and styling
        """
        # Parse the style string to extract key properties
        style_props = {}
        for prop in style.split(';'):
            if ':' in prop:
                key, value = prop.split(':', 1)
                style_props[key.strip().lower()] = value.strip().lower()

        # Check for page-like dimensions
        height = style_props.get('height', '')
        width = style_props.get('width', '')
        position = style_props.get('position', '')
        overflow = style_props.get('overflow', '')

        # Look for typical page dimensions
        # Common page heights: 842.4pt (A4), 792pt (Letter), 1008pt (Legal)
        # Common page widths: 597.6pt (A4), 612pt (Letter), 612pt (Legal)
        page_heights = ['842.4pt', '792pt', '1008pt']
        page_widths = ['597.6pt', '612pt']

        has_page_height = any(ph in height for ph in page_heights)
        has_page_width = any(pw in width for pw in page_widths)
        has_position = position in ['relative', 'absolute']
        has_overflow = 'hidden' in overflow

        # Consider it a page div if it has both page-like dimensions
        # and typical page styling properties
        return has_page_height and has_page_width and (has_position or has_overflow)

    @staticmethod
    def mark_page_breaks(element: Tag) -> None:
        """Mark page break elements with a special attribute for detection.

        This method adds '_is_page_break' attributes to elements that represent
        page breaks, which can be used by other parts of the system.

        Args:
            element: BeautifulSoup Tag element to mark
        """
        # Mark CSS page break elements using case-insensitive detection
        PageBreakDetector._mark_css_page_breaks(element)

        # Mark class-based page breaks
        for selector in PageBreakDetector.CLASS_BASED_SELECTORS:
            page_breaks = element.select(selector)
            for pb in page_breaks:
                pb['_is_page_break'] = 'true'
                # Also mark parent containers that contain page breaks
                if pb.parent and pb.parent.name == 'div':
                    parent_classes = pb.parent.get('class', [])
                    if any('pagebreak' in cls.lower() for cls in parent_classes):
                        pb.parent['_is_page_break'] = 'true'

        # Mark HR page breaks
        for selector in PageBreakDetector.HR_PAGE_BREAK_SELECTORS:
            page_breaks = element.select(selector)
            for pb in page_breaks:
                pb['_is_page_break'] = 'true'

        # Mark page-like divs
        divs = element.find_all('div')
        for div in divs:
            style = div.get('style', '')
            if style and PageBreakDetector._is_page_like_div(style):
                div['_is_page_break'] = 'true'

    @staticmethod
    def _mark_css_page_breaks(element: Tag) -> None:
        """Mark CSS page break elements using case-insensitive detection."""
        # Define the page break patterns we're looking for (case insensitive)
        page_break_patterns = [
            r'page-break-before\s*:\s*always',
            r'page-break-after\s*:\s*always'
        ]

        # Compile case-insensitive regex patterns
        compiled_patterns = [re.compile(pattern, re.IGNORECASE) for pattern in page_break_patterns]

        # Find all elements that could have page break styles
        for tag_name in ['p', 'div', 'hr']:
            elements = element.find_all(tag_name)
            for el in elements:
                style = el.get('style', '')
                if not style:
                    continue

                # Check if any page break pattern matches
                for pattern in compiled_patterns:
                    if pattern.search(style):
                        el['_is_page_break'] = 'true'
                        break  # Only mark each element once
|
||||
|
||||
|
||||
def detect_page_breaks(html_content: str) -> List[Dict[str, Any]]:
    """Detect page breaks in HTML content.

    This is the main public function for external use.

    Args:
        html_content: HTML string to analyze

    Returns:
        List of dictionaries containing page break information
    """
    from bs4 import BeautifulSoup

    soup = BeautifulSoup(html_content, 'html.parser')

    # For the public API we collect one descriptive record per detected
    # page break; mainly used for testing and external analysis.
    found = []

    # 1. Elements whose inline style requests a CSS page break (any case).
    css_break_patterns = [
        re.compile(r'page-break-before\s*:\s*always', re.IGNORECASE),
        re.compile(r'page-break-after\s*:\s*always', re.IGNORECASE),
    ]

    for tag_name in ['p', 'div', 'hr']:
        for candidate in soup.find_all(tag_name):
            style = candidate.get('style', '')
            if style and any(rx.search(style) for rx in css_break_patterns):
                found.append({
                    'element': candidate.name,
                    'selector': f'{tag_name}[style*="page-break"]',
                    'style': style,
                    'classes': candidate.get('class', []),
                    'is_page_div': False
                })

    # 2. Class-based markers, then styled HR separators (same record shape).
    for selector in (PageBreakDetector.CLASS_BASED_SELECTORS
                     + PageBreakDetector.HR_PAGE_BREAK_SELECTORS):
        for candidate in soup.select(selector):
            found.append({
                'element': candidate.name,
                'selector': selector,
                'style': candidate.get('style', ''),
                'classes': candidate.get('class', []),
                'is_page_div': False
            })

    # 3. Divs whose fixed dimensions look like a whole rendered page.
    found.extend(PageBreakDetector._find_page_like_divs(soup))

    return found
|
||||
|
||||
|
||||
def mark_page_breaks(html_content: str) -> str:
    """Mark page breaks in HTML content and return the modified HTML.

    Args:
        html_content: HTML string to process

    Returns:
        Modified HTML string with page break markers added
    """
    from bs4 import BeautifulSoup

    document_soup = BeautifulSoup(html_content, 'html.parser')
    PageBreakDetector.mark_page_breaks(document_soup)
    return str(document_soup)
|
||||
|
||||
|
||||
724
venv/lib/python3.10/site-packages/edgar/files/styles.py
Normal file
724
venv/lib/python3.10/site-packages/edgar/files/styles.py
Normal file
@@ -0,0 +1,724 @@
|
||||
import re
|
||||
from dataclasses import dataclass
|
||||
from enum import Enum
|
||||
from typing import Any, Dict, Optional, Tuple, Union
|
||||
|
||||
from bs4 import Tag
|
||||
|
||||
from edgar.core import log as logger
|
||||
|
||||
__all__ = ['StyleInfo', 'UnitType', 'StyleUnit', 'parse_style', 'is_heading', 'get_heading_level']
|
||||
|
||||
base_font_size = 10.0
|
||||
|
||||
# First define the patterns at module level for reliability
|
||||
# Regex patterns used to classify heading text into levels 1-3.
# NOTE(review): on the 'l1' pattern the inline '(?i)' flag duplicates the
# re.IGNORECASE argument; the duplication is harmless.
HEADING_PATTERNS = {
    # Level 1 patterns (Parts)
    'l1': re.compile(r'(?i)^part\s+[IVX0-9]+(?:\s.*)?$', re.IGNORECASE),

    # Level 2 patterns (Items, Articles, Major Sections)
    'l2': [
        re.compile(r'(?i)^item\s+[0-9]+[A-Z]?\.?(?:\s.*)?$'),
        re.compile(r'(?i)^article\s+[IVX0-9]+(?:[\s\.].*)?$'),
        re.compile(r'(?i)^section\s+[0-9]+(?:\.[0-9]+)*(?:\s.*)?$')
    ],

    # Level 3 patterns (Major subsections)
    'l3': [
        re.compile(r'^[A-Z][A-Z\s\-\&]{5,}$'),
        re.compile(r'(?i)^(?:consolidated|combined)\s+[A-Z\s]+$'),
        re.compile(r'(?i)^management[A-Z\s]+(?:discussion|analysis)$'),
        re.compile(r'(?i)^notes?\s+to\s+[A-Z\s]+$'),
        re.compile(r'(?i)^selected\s+financial\s+data$'),
        re.compile(r'(?i)^supplementary\s+information$'),
        re.compile(r'(?i)^signatures?$'),
        re.compile(r'(?i)^exhibits?\s+and\s+financial\s+statement\s+schedules$')
    ]
}
|
||||
|
||||
|
||||
class UnitType(Enum):
    """CSS measurement units supported by the style parser."""
    POINT = 'pt'
    PIXEL = 'px'
    INCH = 'in'
    CM = 'cm'
    MM = 'mm'
    PERCENT = '%'
    EM = 'em'
    REM = 'rem'


@dataclass
class StyleUnit:
    """Represents a CSS measurement with original and normalized values

    The original value is what was parsed from the CSS string, while the normalized
    value is converted to a standard unit characters for display in the terminal.
    """
    value: float
    unit: UnitType

    def __init__(self, value: float, unit: Union[str, UnitType]):
        self.value = value
        self.unit = UnitType(unit) if isinstance(unit, str) else unit

    def to_chars(self, console_width: int) -> int:
        """Convert width to character count based on console width"""
        # Base conversion rates (at standard 80-char width)
        BASE_CONSOLE_WIDTH = 80  # standard width
        CHARS_PER_INCH = 12.3  # at standard width

        # Scale factor based on actual console width
        scale = console_width / BASE_CONSOLE_WIDTH

        # Handle percentage specifically
        if self.unit == UnitType.PERCENT:
            return round(console_width * (self.value / 100))

        # Convert to inches first
        inches = self._to_inches()

        # Convert to characters, scaling based on console width
        chars = round(inches * CHARS_PER_INCH * scale)

        return chars

    def _to_inches(self) -> float:
        """Convert any unit to inches"""
        conversions = {
            UnitType.INCH: 1.0,
            UnitType.POINT: 1 / 72,  # 72 points per inch
            UnitType.PIXEL: 1 / 96,  # 96 pixels per inch
            UnitType.CM: 0.393701,  # 1 cm = 0.393701 inches
            UnitType.MM: 0.0393701,  # 1 mm = 0.0393701 inches
            UnitType.EM: 1 / 6,  # Approximate, assumes 1em = 1/6 inch
            UnitType.REM: 1 / 6,  # Same as EM
            UnitType.PERCENT: 1.0  # Handled separately in to_chars
        }
        return self.value * conversions[self.unit]

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, StyleUnit):
            return NotImplemented
        if self.unit == other.unit:
            return self.value == other.value
        # Compare by converting both to inches
        return self._to_inches() == other._to_inches()

    def __gt__(self, other: Union['StyleUnit', float]) -> bool:
        # FIX: accept plain ints as well as floats (previously `unit > 5`
        # raised while `unit > 5.0` worked); bare numbers mean points.
        if isinstance(other, (int, float)):
            other = StyleUnit(other, UnitType.POINT)
        return self._to_inches() > other._to_inches()

    def __ge__(self, other: Union['StyleUnit', float]) -> bool:
        if isinstance(other, (int, float)):
            other = StyleUnit(other, UnitType.POINT)
        return self._to_inches() >= other._to_inches()

    def __str__(self) -> str:
        return f"{self.value}{self.unit.value}"


@dataclass
class Width:
    """Represents a width value with its unit.

    The unit may be supplied either as a UnitType member or as its raw CSS
    string (e.g. 'pt'); both forms are normalised internally.
    """
    value: float
    unit: UnitType

    def _unit_key(self) -> str:
        """Normalise the unit to its CSS string form ('pt', 'px', ...)."""
        return self.unit.value if isinstance(self.unit, UnitType) else self.unit

    def to_chars(self, console_width: int) -> int:
        """Convert width to character count based on console width"""
        # Base conversion rates (at standard 80-char width)
        BASE_CONSOLE_WIDTH = 80  # standard width
        CHARS_PER_INCH = 12.3  # at standard width

        # FIX: resolve percentages against the console width *before* unit
        # conversion; the old code converted first and compared a UnitType
        # against the string '%', so percent values crashed in _to_inches.
        if self._unit_key() == '%':
            return round(console_width * (self.value / 100))

        # Scale factor based on actual console width
        scale = console_width / BASE_CONSOLE_WIDTH

        # Convert to inches first, then to characters
        inches = self._to_inches()
        chars = round(inches * CHARS_PER_INCH * scale)

        return min(chars, console_width)

    def _to_inches(self) -> float:
        """Convert any unit to inches"""
        # FIX: the old dict was keyed by unit strings but indexed with a
        # UnitType, raising KeyError; keys now match the normalised unit.
        conversions = {
            'in': 1.0,
            'pt': 1 / 72,  # 72 points per inch
            'px': 1 / 96,  # 96 pixels per inch
            'cm': 0.393701,  # 1 cm = 0.393701 inches
            'mm': 0.0393701,  # 1 mm = 0.0393701 inches
            'em': 1 / 6,  # Approximate, matches StyleUnit
            'rem': 1 / 6,  # Same as EM
            '%': 1.0  # percentage handled separately in to_chars
        }
        return self.value * conversions[self._unit_key()]
|
||||
|
||||
|
||||
@dataclass
class StyleInfo:
    """Style information with proper unit handling.

    All fields default to None, meaning "not specified"; `merge` fills in
    unspecified fields from a parent style.
    """
    display: Optional[str] = None
    margin_top: Optional[StyleUnit] = None
    margin_bottom: Optional[StyleUnit] = None
    font_size: Optional[StyleUnit] = None
    font_weight: Optional[str] = None
    text_align: Optional[str] = None
    line_height: Optional[StyleUnit] = None
    width: Optional[StyleUnit] = None
    text_decoration: Optional[str] = None

    def merge(self, parent_style: Optional['StyleInfo']) -> 'StyleInfo':
        """Merge with parent style, child properties take precedence"""
        if not parent_style:
            return self

        def inherit(child_value, parent_value):
            # Same semantics as `child or parent`: a falsy (unset) child
            # value falls back to the parent's value.
            return child_value if child_value else parent_value

        return StyleInfo(
            display=inherit(self.display, parent_style.display),
            margin_top=inherit(self.margin_top, parent_style.margin_top),
            margin_bottom=inherit(self.margin_bottom, parent_style.margin_bottom),
            font_size=inherit(self.font_size, parent_style.font_size),
            font_weight=inherit(self.font_weight, parent_style.font_weight),
            text_align=inherit(self.text_align, parent_style.text_align),
            line_height=inherit(self.line_height, parent_style.line_height),
            width=inherit(self.width, parent_style.width),
            text_decoration=inherit(self.text_decoration, parent_style.text_decoration)
        )
|
||||
|
||||
|
||||
def parse_style(style_str: str) -> StyleInfo:
    """Parse inline CSS style string into StyleInfo object with robust unit validation"""
    parsed = StyleInfo()
    if not style_str:
        return parsed

    # Use UnitType enum for valid units
    valid_units = {unit.value for unit in UnitType}

    # Keyword-valued CSS properties mapped to their StyleInfo fields.
    keyword_fields = {
        'font-weight': 'font_weight',
        'text-align': 'text_align',
        'display': 'display',
        'text-decoration': 'text_decoration',
    }
    # Numeric (value + unit) CSS properties mapped to their StyleInfo fields.
    unit_fields = {
        'margin-top': 'margin_top',
        'margin-bottom': 'margin_bottom',
        'font-size': 'font_size',
        'line-height': 'line_height',
        'width': 'width',
    }

    for declaration in style_str.split(';'):
        declaration = declaration.strip()
        if not declaration or ':' not in declaration:
            continue

        key, value = declaration.split(':', 1)
        key = key.strip().lower()
        value = value.strip().lower()

        # Handle non-numeric properties
        if key in keyword_fields:
            setattr(parsed, keyword_fields[key], value)
            continue

        # For properties that expect numeric values with units
        match = re.match(r'(-?\d*\.?\d+)([a-z%]*)', value)
        if not match:
            continue
        try:
            num_val = float(match.group(1))
            unit = match.group(2) or 'px'  # Default to pixels

            # Validate the unit is supported
            if unit not in valid_units:
                continue  # Skip this property if unit is invalid

            # Scientific notation check
            if 'e' in str(num_val).lower():
                continue  # Skip scientific notation values

            if key in unit_fields:
                setattr(parsed, unit_fields[key], StyleUnit(num_val, unit))
        except (ValueError, TypeError):
            continue  # Skip this property if number parsing fails

    return parsed
|
||||
|
||||
def is_heading(element: Tag, style: StyleInfo) -> bool:
    """
    Detect if an element is likely a heading based on multiple weighted factors.

    Scores the element's text against SEC-document heading patterns and its
    computed style (bold weight, font size vs. base, top margins, parent
    margins). Returns True when the accumulated score reaches ``max_score``.

    Args:
        element: The BeautifulSoup tag whose text is being classified.
        style: Parsed inline style for the element (falsy style -> not a heading).

    Returns:
        bool: True if enough heading indicators are present.
    """
    if not style:
        return False

    # Initialize score and evidence
    score = 0
    max_score = 6  # threshold required at the end for a positive verdict

    # Get text content
    text = element.get_text(strip=True)
    if not text:
        return False

    # Collected for diagnostics only; never emitted in this function
    debug_evidence = []

    # 1. Length checks - fail fast for long text (headings are short)
    if len(text) > 100:
        debug_evidence.append("-5 excessive length")
        score -= 5
        return False
    elif len(text) > 50:
        score -= 2
        debug_evidence.append("-2 for medium length")

    # Primary document structure patterns (PART/SECTION/ARTICLE/ITEM) score highest
    primary_patterns = [
        (r'(?i)^part\s+[IVX0-9]+(?:\s.*)?$', "PART pattern", 4),
        (r'(?i)^section\s+[0-9]+(?:\.[0-9]+)*(?:\s.*)?$', "SECTION pattern", 4),
        (r'(?i)^article\s+[IVX0-9]+(?:[\s\.].*)?$', "ARTICLE pattern", 4),
        (r'(?i)^item\s+[0-9]+[A-Z]?\.?(?:\s.*)?$', "ITEM pattern", 4),
    ]

    # Common SEC heading patterns (financial statements, MD&A, notes)
    sec_heading_patterns = [
        (r'(?i)^(?:consolidated|combined)\s+[A-Z\s]+$', "Financial statement heading", 3),
        (r'(?i)^management[A-Z\s]+(?:discussion|analysis)$', "MD&A heading", 3),
        (r'(?i)^notes?\s+to\s+[A-Z\s]+$', "Notes heading", 3),
        (r'(?i)^[A-Z][A-Z\s]{2,}\s+(?:and|of|to|for|from)\s+[A-Z\s]+$', "Complex heading", 3),
    ]

    # Secondary patterns: numbered headings and all-caps runs
    secondary_patterns = [
        (r'^\d+\.\s*[A-Z].*$', "Numbered pattern", 3),
        (r'^[A-Z][A-Z\s\-\&]+$', "All caps text", 3),
    ]

    # Check patterns in order; only the FIRST match contributes points
    all_patterns = primary_patterns + sec_heading_patterns + secondary_patterns
    for pattern, desc, points in all_patterns:
        if re.match(pattern, text):
            score += points
            debug_evidence.append(f"+{points} for {desc}")
            break

    # 3. All caps bonus for short text (no digits allowed)
    if text.isupper() and len(text) <= 30 and not any(char.isdigit() for char in text):
        score += 1
        debug_evidence.append("+1 for short all-caps text")

    # 4. Style properties: bold weight counts double for short text
    if style.font_weight in ['bold', '700', '800', '900']:
        points = 2 if len(text) < 30 else 1
        score += points
        debug_evidence.append(f"+{points} for bold weight")

    if style.font_size:
        # NOTE(review): ``base_font_size`` is not defined in this function —
        # presumably a module-level constant; confirm it exists at import time.
        base_size = StyleUnit(base_font_size, 'pt')
        size_ratio = style.font_size._to_inches() / base_size._to_inches()

        if size_ratio >= 1.2:
            score += 2
            debug_evidence.append(f"+2 for large font ({size_ratio:.1f}x base)")
        elif size_ratio >= 1.1:
            score += 1
            debug_evidence.append(f"+1 for medium font ({size_ratio:.1f}x base)")

    # Margin handling
    if style.margin_top:
        large_margin = StyleUnit(18, 'pt')
        medium_margin = StyleUnit(12, 'pt')

        # NOTE(review): both branches add +2, so large vs. medium margin is
        # currently scored identically — possibly intended to be +2 / +1.
        if style.margin_top >= large_margin:
            score += 2
            debug_evidence.append(f"+2 for large margin ({style.margin_top.value}{style.margin_top.unit.value})")
        elif style.margin_top >= medium_margin:
            score += 2
            debug_evidence.append(f"+2 for medium margin ({style.margin_top.value}{style.margin_top.unit.value})")

    # Parent margin: a spaced-out container also suggests a heading
    parent = element.parent
    if parent and isinstance(parent, Tag):
        parent_style = parse_style(parent.get('style', ''))
        if parent_style.margin_top:
            if parent_style.margin_top >= StyleUnit(18, 'pt'):
                score += 2
                debug_evidence.append("+2 for large parent margin")
            elif parent_style.margin_top >= StyleUnit(12, 'pt'):
                score += 1
                debug_evidence.append("+1 for medium parent margin")

    # Debug output

    return score >= max_score
def _get_effective_style(element: Tag, base_style: StyleInfo, debug: bool = False) -> StyleInfo:
    """Get combined styles with parent-first approach and semantic tag handling.

    Resolution order (later merges win per the merge semantics of
    ``StyleInfo.merge``): base style, nearest ``div`` ancestor's style,
    nearest ``span`` ancestor's style, the element's own inline style, and
    finally a forced bold weight when the element is (or is inside) a
    ``<strong>``/``<b>`` tag.

    Args:
        element: The tag whose effective style is being computed.
        base_style: Starting style (may be None; replaced by an empty StyleInfo).
        debug: Accepted for API compatibility; all debug branches are no-ops.

    Returns:
        StyleInfo: The merged effective style.
    """
    if debug:
        pass

    # Start with base style
    effective_style = base_style or StyleInfo()

    # Get parent styles working up the tree.
    # Only the FIRST div ancestor is consulted: the second check below breaks
    # out of the loop at that same div, so higher divs are never reached.
    for parent in element.parents:
        if parent.name == 'div':
            parent_style = parse_style(parent.get('style', ''))
            if debug:
                pass
            if parent_style:
                effective_style = effective_style.merge(parent_style)
        # Stop at first div to avoid going too far up
        if parent.name == 'div':
            break

    # Get styles from span parents for font-size
    span_parent = element.find_parent('span')
    if span_parent:
        span_style = parse_style(span_parent.get('style', ''))
        if debug:
            pass
        if span_style:
            effective_style = effective_style.merge(span_style)

    # Apply element's own style (highest precedence among inline styles)
    element_style = parse_style(element.get('style', ''))
    if element_style:
        effective_style = effective_style.merge(element_style)

    # Handle semantic bold tags: rebuild the style with font-weight forced
    # to '700' while preserving every other resolved property.
    if element.name in ['strong', 'b'] or element.find_parent(['strong', 'b']):
        effective_style = StyleInfo(
            font_weight='700',
            margin_top=effective_style.margin_top,
            margin_bottom=effective_style.margin_bottom,
            font_size=effective_style.font_size,
            text_align=effective_style.text_align,
            line_height=effective_style.line_height,
            width=effective_style.width,
            text_decoration=effective_style.text_decoration,
            display=effective_style.display
        )

    if debug:
        pass

    return effective_style
def _merge_styles(parent_style: StyleInfo, child_style: StyleInfo, debug: bool = False) -> StyleInfo:
    """
    Merge a parent and a child style, preferring the child's value for
    every property and falling back to the parent's when the child's is
    falsy. A missing parent or child short-circuits to the other.
    """
    if not parent_style:
        return child_style
    if not child_style:
        return parent_style

    def prefer_child(attr: str):
        # Child wins whenever it has a truthy value for the property
        return getattr(child_style, attr) or getattr(parent_style, attr)

    combined = StyleInfo(
        display=prefer_child('display'),
        margin_top=prefer_child('margin_top'),
        margin_bottom=prefer_child('margin_bottom'),
        font_size=prefer_child('font_size'),
        font_weight=prefer_child('font_weight'),
        text_align=prefer_child('text_align'),
        line_height=prefer_child('line_height'),
        width=prefer_child('width'),
        text_decoration=prefer_child('text_decoration')
    )

    if debug:
        logger.debug("Merged style: %s", _format_style_debug(combined))

    return combined
def get_heading_level(element: Tag, style: StyleInfo, text: str, debug: bool = False) -> Optional[int]:
    """Get heading level with comprehensive debugging.

    Classifies the element's text into heading levels 1-4 (or None):
      1 - PART headers, 2 - Items/Articles, 3 - prominent section headings,
      4 - minor bold subsections.

    Headings split across multiple spans inside the same div are first
    recombined and re-classified recursively on the parent div.

    Args:
        element: Tag being classified.
        style: Parsed inline style for the element.
        text: The element's text content.
        debug: When True, decisions are accumulated in ``debug_info``
            (most other debug branches are currently no-ops).

    Returns:
        Optional[int]: Heading level 1-4, or None if not a heading.
    """
    debug_info: Dict[str, Any] = {'text': text, 'decisions': []}

    def log_decision(stage: str, result: bool, reason: str):
        # Record each classification step when debugging is enabled
        if debug:
            debug_info['decisions'].append({
                'stage': stage,
                'result': result,
                'reason': reason
            })

    # Early return for empty or whitespace-only text
    if not text.strip():
        if debug:
            pass
        return None

    # Special handling for elements inside a div: headings are often split
    # across sibling spans, so recombine and classify the whole div once.
    parent_div = element.find_parent('div')
    if parent_div:
        # Get all spans in the div
        spans = parent_div.find_all('span')
        if len(spans) > 1:  # Only process as split heading if multiple spans
            # Combine text from all spans
            combined_text = ' '.join(span.get_text(strip=True) for span in spans)
            if combined_text.strip():
                # Get div's style
                div_style = parse_style(parent_div.get('style', ''))
                # Check for bold styling in any span
                has_bold = any(
                    'font-weight' in span.get('style', '').lower() and
                    any(weight in span.get('style', '').lower()
                        for weight in ['bold', '700', '800', '900'])
                    for span in spans
                )
                if has_bold:
                    # Promote the div's style to bold, preserving other props
                    div_style = StyleInfo(
                        font_weight='700',
                        margin_top=div_style.margin_top,
                        font_size=div_style.font_size,
                        text_align=div_style.text_align,
                        display=div_style.display
                    )

                if debug:
                    pass

                # Process the combined heading (recursion terminates because
                # the div is its own nearest div ancestor only once)
                return get_heading_level(parent_div, div_style, combined_text, debug)

    # Get complete style for the element (base + ancestors + own + semantic bold)
    complete_style = _get_effective_style(element, style, debug)
    if debug:
        pass

    # Check minimum heading traits (bold / large font / margin combinations)
    has_min_traits, trait_details = _has_minimum_heading_traits(complete_style, text, return_details=True)
    if debug:
        for _trait, _value in trait_details.items():
            pass

    if not has_min_traits:
        log_decision("Style Check", False, "Does not meet minimum heading traits")
        return None

    log_decision("Style Check", True, "Meets minimum heading traits")
    text_to_check = text.strip()

    # First check prominence since it affects L3 pattern matching
    is_prominent = _is_prominently_styled(complete_style, debug=debug)

    # Level 1 check (PART headers)
    if debug:
        pass

    if HEADING_PATTERNS['l1'].match(text_to_check):
        log_decision("Pattern Check", True, "Matches Level 1 (PART) pattern")
        return 1

    # Level 2 check (Items, Articles)
    if debug:
        pass
    for pattern in HEADING_PATTERNS['l2']:
        if debug:
            pass
        if pattern.match(text_to_check):
            log_decision("Pattern Check", True, f"Matches Level 2 pattern: {pattern.pattern}")
            return 2

    # Level 3 check (requires prominence)
    if is_prominent:
        if debug:
            pass
        for pattern in HEADING_PATTERNS['l3']:
            if debug:
                pass
            if pattern.match(text_to_check):
                log_decision("Pattern Check", True, f"Matches Level 3 pattern: {pattern.pattern}")
                return 3

    # Check if it's a likely section heading even if it doesn't match exact patterns
    if _is_likely_section_heading(text_to_check, complete_style):
        log_decision("Pattern Check", True, "Matches section heading criteria")
        return 3
    elif debug:
        pass

    # Level 4 check (minor subsections)
    # Check for basic heading traits that didn't match higher level patterns
    if (text_to_check and  # Ensure there is non-empty text
            complete_style.font_weight in ['bold', '700', '800', '900'] and
            len(text_to_check) < 50 and
            not text_to_check.startswith(('Note:', '*', '(', '$')) and
            not text_to_check.endswith(':')):
        log_decision("Pattern Check", True, "Matches Level 4 (minor heading) criteria")
        return 4

    log_decision("Pattern Check", False, "No heading patterns matched")
    return None
def _format_style_debug(style: StyleInfo) -> Dict[str, Optional[str]]:
    """Format style information for debugging.

    Returns a status placeholder when no style is available; otherwise a
    summary of the key style attributes. Values are None when the
    corresponding attribute is unset (hence Optional in the return type).
    """
    if not style:
        return {"status": "no style"}

    return {
        "font_weight": str(style.font_weight),
        "font_size": str(style.font_size) if style.font_size else None,
        "margin_top": str(style.margin_top) if style.margin_top else None,
        "text_align": style.text_align,
        "display": style.display
    }
def _has_minimum_heading_traits(style: StyleInfo, text: str, return_details: bool = False) -> Union[
    bool, Tuple[bool, Dict[str, bool]]]:
    """
    Check for minimum heading characteristics with improved font-weight handling.

    A block qualifies when it is bold, has a large font (> 11pt), or has a
    significant top margin (>= 12pt) combined with either boldness or
    centered all-caps text.

    Args:
        style: Parsed style to inspect (falsy -> immediately fails).
        text: The element's text (used only for the center-caps check).
        return_details: When True, also return the per-trait breakdown.

    Returns:
        bool, or (bool, dict) when ``return_details`` is True.
    """
    if not style:
        return (False, {"reason": "no style"}) if return_details else False

    # Improved font-weight checking: named and numeric weights >= 700 count
    has_bold = False
    if style.font_weight:
        has_bold = (
            style.font_weight == 'bold' or
            style.font_weight == '700' or
            style.font_weight == '800' or
            style.font_weight == '900' or
            # Also handle possible numeric values
            (style.font_weight.isdigit() and int(style.font_weight) >= 700)
        )

    details = {
        "has_bold": has_bold,
        "has_large_font": bool(style.font_size and style.font_size > StyleUnit(11, 'pt')),
        "has_margin": bool(style.margin_top and style.margin_top >= StyleUnit(12, 'pt')),
        "has_center_caps": bool(style.text_align == 'center' and text.isupper() and len(text) > 4)
    }

    # Consider any combination of significant styling as valid
    result = details["has_bold"] or details["has_large_font"] or \
             (details["has_margin"] and (details["has_bold"] or details["has_center_caps"]))

    if return_details:
        return result, details
    return result
def _is_prominently_styled(style: StyleInfo, debug: bool = False) -> bool:
    """Return True when the style is visually prominent.

    Prominence means any of: font larger than 12pt, top margin of at
    least 18pt, centered text, or a bold weight combined with any top
    margin. ``debug`` is kept for API compatibility; the original debug
    branches were no-ops.
    """
    if not style:
        return False

    large_font = bool(style.font_size and style.font_size > StyleUnit(12, 'pt'))
    large_margin = bool(style.margin_top and style.margin_top >= StyleUnit(18, 'pt'))
    centered = style.text_align == 'center'
    bold_with_margin = bool(
        style.font_weight in ('700', '800', '900', 'bold') and style.margin_top
    )

    return large_font or large_margin or centered or bold_with_margin
def _get_prominence_detail(style: StyleInfo, check: str) -> str:
|
||||
"""Get detailed information about why a prominence check passed"""
|
||||
if check == "large_font" and style.font_size:
|
||||
return f"Font size: {style.font_size}"
|
||||
elif check == "large_margin" and style.margin_top:
|
||||
return f"Margin top: {style.margin_top}"
|
||||
elif check == "centered":
|
||||
return f"Text align: {style.text_align}"
|
||||
elif check == "bold_with_margin":
|
||||
return f"Font weight: {style.font_weight}, Margin top: {style.margin_top}"
|
||||
return ""
|
||||
|
||||
|
||||
|
||||
|
||||
def _is_likely_minor_heading(text: str, style: StyleInfo, return_details: bool = False) -> Union[
|
||||
bool, Tuple[bool, Dict[str, Any]]]:
|
||||
"""Detect minor headings with detailed output"""
|
||||
details = {
|
||||
"length_ok": len(text) < 40,
|
||||
"has_bold": bool(style and style.font_weight in ('bold', '700')),
|
||||
"no_exclusions": not text.startswith(('Note:', '*', '(', '$')) and not text.endswith(':'),
|
||||
"text_sample": text[:30] + ('...' if len(text) > 30 else '')
|
||||
}
|
||||
|
||||
result = all([details["length_ok"], details["has_bold"], details["no_exclusions"]])
|
||||
|
||||
if return_details:
|
||||
return result, details
|
||||
return result
|
||||
|
||||
|
||||
def _print_debug_info(debug_info: Dict[str, Any], debug: bool):
    """Emit a formatted heading-detection analysis via the module logger.

    No-op unless ``debug`` is truthy. Expects ``debug_info`` to carry at
    least 'text' and 'decisions'; 'effective_style' and 'style_traits'
    are optional.
    """
    if not debug:
        return

    rule = "-" * 50
    logger.debug("\nHeading Detection Analysis:")
    logger.debug(rule)
    logger.debug(f"Text: '{debug_info['text']}'")
    logger.debug("\nStyle Information:")
    logger.debug(f"  {debug_info.get('effective_style', 'No style info')}")

    if 'style_traits' in debug_info:
        logger.debug("\nStyle Traits:")
        for trait_name, trait_value in debug_info['style_traits'].items():
            logger.debug(f"  {trait_name}: {trait_value}")

    logger.debug("\nDecision Process:")
    for decision in debug_info['decisions']:
        mark = "✓" if decision['result'] else "✗"
        logger.debug(f"  {mark} {decision['stage']}: {decision['reason']}")

    logger.debug(rule)
def _is_likely_section_heading(text: str, style: StyleInfo) -> bool:
|
||||
"""
|
||||
Check if text matches common SEC section heading patterns
|
||||
Uses heuristics based on common SEC document structure
|
||||
"""
|
||||
# Skip common false positives
|
||||
if len(text) < 8 or len(text) > 60:
|
||||
return False
|
||||
|
||||
text_lower = text.lower()
|
||||
|
||||
# Common SEC section keywords
|
||||
section_keywords = {
|
||||
'overview', 'background', 'business', 'operations',
|
||||
'risk factors', 'management', 'financial', 'discussion',
|
||||
'analysis', 'results', 'liquidity', 'capital resources',
|
||||
'critical accounting', 'controls', 'procedures'
|
||||
}
|
||||
|
||||
# Check for keyword matches
|
||||
words = set(text_lower.split())
|
||||
if len(words & section_keywords) >= 1:
|
||||
return True
|
||||
|
||||
return False
|
||||
657
venv/lib/python3.10/site-packages/edgar/files/tables.py
Normal file
657
venv/lib/python3.10/site-packages/edgar/files/tables.py
Normal file
@@ -0,0 +1,657 @@
|
||||
from dataclasses import dataclass
|
||||
from typing import TYPE_CHECKING, Optional, Union
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from edgar.files.html import BaseNode
|
||||
import re
|
||||
from functools import lru_cache
|
||||
|
||||
from edgar.richtools import rich_to_text
|
||||
|
||||
|
||||
@dataclass
class ProcessedTable:
    """Represents a processed table ready for rendering"""
    # Merged per-column header labels, or None when no header row was detected
    headers: Optional[list[str]]
    # Table body: one list of cell strings per row
    data_rows: list[list[str]]
    column_alignments: list[str]  # "left" or "right" for each column
# Looks for actual numeric data values, currency, or calculations
data_indicators = [
    r'\$\s*\d',  # Currency with numbers
    r'\d+(?:,\d{3})+',  # Numbers with thousands separators
    r'\d+\s*[+\-*/]\s*\d+',  # Basic calculations
    r'\(\s*\d+(?:,\d{3})*\s*\)',  # Parenthesized numbers
]

# Alternation of the indicators above; used with re.search to decide
# whether a row/cell holds data values rather than header text
data_pattern = '|'.join(data_indicators)
def is_number(s: str) -> bool:
    """
    Check if a string represents a number in common financial formats.

    Handles:
    - Regular numbers (123, -123, 123.45)
    - Currency ($123, $123.45)
    - Parenthetical negatives ((123), (123.45))
    - Thousands separators (1,234, 1,234.56)
    - Mixed formats ($1,234.56)
    - Various whitespace
    - En/Em dashes for negatives
    - Multiple decimal formats (123.45, 123,45)

    Args:
        s: String to check

    Returns:
        bool: True if string represents a valid number
    """
    if not s or s.isspace():
        return False

    # Normalize unicode minus / en dash / em dash to an ASCII minus sign
    normalized = s.replace('−', '-').replace('–', '-').replace('—', '-').strip()

    # Accounting convention: (123) means -123
    if normalized.startswith('(') and normalized.endswith(')'):
        normalized = '-' + normalized[1:-1]

    # Strip currency symbols and embedded spaces
    normalized = normalized.replace('$', '').replace(' ', '')

    # A lone comma followed by exactly two digits is treated as a European
    # decimal separator (123,45 -> 123.45); otherwise commas are assumed
    # to be thousands separators and are removed.
    if ',' in normalized and '.' not in normalized and len(normalized.split(',')[1]) == 2:
        normalized = normalized.replace(',', '.')
    else:
        normalized = normalized.replace(',', '')

    try:
        float(normalized)
    except ValueError:
        return False
    return True
class TableProcessor:
|
||||
    @staticmethod
    def process_table(node) -> Optional[ProcessedTable]:
        """Process table node into a format ready for rendering.

        Pipeline: expand cells (honoring colspans) into fixed-width virtual
        rows, drop empty columns/rows, detect header rows, repair
        header/data misalignment, merge stacked headers, compute per-column
        alignment, and format the data rows.

        Args:
            node: A table node whose ``content`` is a list of rows; each row
                exposes ``cells`` with ``content``, ``colspan`` and
                ``is_currency`` attributes.

        Returns:
            Optional[ProcessedTable]: None when the node has no usable rows.
        """
        if not isinstance(node.content, list) or not node.content:
            return None

        def process_cell_content(content: Union[str, 'BaseNode']) -> str:
            """Process cell content to handle HTML breaks and cleanup"""
            if isinstance(content, str):
                # Normalize <br> variants to newlines, strip and drop blank lines
                content = content.replace('<br/>', '\n').replace('<br>', '\n')
                lines = [line.strip() for line in content.split('\n')]
                return '\n'.join(line for line in lines if line)
            else:
                # Recursively process nested nodes
                processed_table = content.render(500)
                return rich_to_text(processed_table)

        # Process all rows into virtual columns
        virtual_rows = []
        max_cols = max(sum(cell.colspan for cell in row.cells) for row in node.content)

        # Convert all rows to virtual columns first
        for row in node.content:
            virtual_row = [""] * max_cols
            current_col = 0

            for cell in row.cells:
                content = process_cell_content(cell.content)

                # Format bare currency digits as $X,XXX.XX
                if '\n' not in content and cell.is_currency and content.replace(',', '').replace('.', '').isdigit():
                    content = f"${float(content.replace(',', '')):,.2f}"

                # NOTE(review): for colspan > 1 the content lands in the
                # SECOND virtual column of the span — presumably to align
                # values under spanned headers; confirm this is intended.
                if cell.colspan > 1:
                    virtual_row[current_col + 1] = content
                else:
                    virtual_row[current_col] = content

                current_col += cell.colspan

            virtual_rows.append(virtual_row)

        # Analyze and remove empty columns
        empty_cols = []
        for col in range(max_cols):
            if all(row[col].strip() == "" for row in virtual_rows):
                empty_cols.append(col)

        # Process empty columns
        cols_to_remove = TableProcessor._get_columns_to_remove(empty_cols, max_cols)

        # Create optimized rows, filtering out empty ones
        optimized_rows = []
        for virtual_row in virtual_rows:
            has_content = any(col.strip() for col in virtual_row)
            if not has_content:
                continue
            optimized_row = [col for idx, col in enumerate(virtual_row) if idx not in cols_to_remove]
            optimized_rows.append(optimized_row)

        if not optimized_rows:
            return None

        # Detect headers
        header_rows, data_start_idx = TableProcessor._analyze_table_structure(optimized_rows)

        # Detect and fix misalignment in all rows
        fixed_rows = TableProcessor._detect_and_fix_misalignment(optimized_rows, data_start_idx)

        # Use the fixed header portion for processing headers
        headers = None
        if header_rows:
            fixed_headers = fixed_rows[:data_start_idx]  # Take header portion from fixed rows
            headers = TableProcessor._merge_header_rows(fixed_headers)

        # Determine column alignments
        col_count = len(optimized_rows[0])
        alignments = TableProcessor._determine_column_alignments(
            optimized_rows, data_start_idx, col_count)

        # Format data rows
        formatted_rows = TableProcessor._format_data_rows(
            optimized_rows[data_start_idx:])

        return ProcessedTable(
            headers=headers,
            data_rows=formatted_rows,
            column_alignments=alignments
        )
@staticmethod
|
||||
def _is_date_header(text: str) -> bool:
|
||||
"""Detect if text looks like a date header (year, quarter, month)"""
|
||||
text = text.lower().strip()
|
||||
|
||||
# Year patterns
|
||||
if text.isdigit() and len(text) == 4:
|
||||
return True
|
||||
|
||||
# Quarter patterns
|
||||
quarter_patterns = ['q1', 'q2', 'q3', 'q4', 'first quarter', 'second quarter',
|
||||
'third quarter', 'fourth quarter']
|
||||
if any(pattern in text for pattern in quarter_patterns):
|
||||
return True
|
||||
|
||||
# Month patterns
|
||||
months = ['january', 'february', 'march', 'april', 'may', 'june',
|
||||
'july', 'august', 'september', 'october', 'november', 'december',
|
||||
'jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec']
|
||||
return any(month in text for month in months)
|
||||
|
||||
|
||||
    @staticmethod
    def _analyze_row(row: list) -> dict:
        """Analyze characteristics of a row.

        Returns a dict of counters used by the table-structure heuristics:
        whether the first cell is empty, how many cells look like date
        headers / financial values / financial metric labels, how many are
        empty or standalone '$' markers, and the total cell count.
        """
        return {
            'empty_first': not bool(row[0].strip()),
            'date_headers': sum(1 for cell in row if TableProcessor._is_date_header(cell)),
            'financial_values': sum(1 for i, cell in enumerate(row)
                                    if TableProcessor._is_financial_value(cell, row, i)),
            'financial_metrics': sum(1 for cell in row if TableProcessor._is_financial_metric(cell)),
            'empty_cells': sum(1 for cell in row if not cell.strip()),
            'dollar_signs': sum(1 for cell in row if cell.strip() == '$'),
            'total_cells': len(row)
        }
    @staticmethod
    @lru_cache(maxsize=None)
    def _get_period_header_pattern() -> re.Pattern:
        """Create regex pattern for common financial period headers.

        Built once and cached (the function takes no arguments). Matches
        period headers like "Three Months Ended June 30, 2023",
        balance-sheet dates ("As of December 31, 2022"), sequences of
        dates, and single dates, all case-insensitively.
        """
        # Base components
        periods = r'(?:three|six|nine|twelve|[1-4]|first|second|third|fourth)'
        timeframes = r'(?:month|quarter|year|week)'
        ended_variants = r'(?:ended|ending|end|period)'
        as_of_variants = r'(?:as\s+of|at|as\s+at)'

        # Enhanced date pattern: "<month> <day>, <year>" with flexible punctuation
        months = r'(?:january|february|march|april|may|june|july|august|september|october|november|december|jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)'
        day = r'\d{1,2}'
        year = r'(?:19|20)\d{2}'
        date = fr'{months}\s*\.?\s*{day}\s*,?\s*{year}'

        # Combine into patterns
        patterns = [
            # Standard period headers
            fr'{periods}\s+{timeframes}\s+{ended_variants}(?:\s+{date})?',
            fr'(?:fiscal\s+)?{timeframes}\s+{ended_variants}',
            fr'{timeframes}\s+{ended_variants}(?:\s+{date})?',

            # Balance sheet date headers
            fr'{as_of_variants}\s+{date}',

            # Multiple dates in sequence (common in headers)
            fr'{date}(?:\s*(?:and|,)\s*{date})*',

            # Single date with optional period specification
            fr'(?:{ended_variants}\s+)?{date}'
        ]

        # Combine all patterns into a single alternation
        combined_pattern = '|'.join(f'(?:{p})' for p in patterns)
        return re.compile(combined_pattern, re.IGNORECASE)
@staticmethod
|
||||
def _contains_data(self, text):
|
||||
# Check if the string contains data indicators
|
||||
return bool(re.search(data_pattern, text))
|
||||
|
||||
    @staticmethod
    def _analyze_table_structure(rows: list) -> tuple[list, int]:
        """
        Analyze table structure to determine headers and data rows.
        Returns (header_rows, data_start_index)

        Tries three heuristics in order:
          1. A period/date header row (optionally followed by a year/quarter row).
          2. A row containing standalone '$' cells — rows above it are headers.
          3. A transition from a row with no financial values to one with
             financial values plus '$' markers.
        Falls back to "no headers" (``[], 0``) when nothing matches.
        """
        if not rows:
            return [], 0

        # Per-row counters; only consulted by Pattern 2 below
        row_analyses = [TableProcessor._analyze_row(row) for row in rows[:4]]
        period_pattern = TableProcessor._get_period_header_pattern()

        # Pattern 1: Look for period headers (only the first three rows are scanned)
        for i, row in enumerate(rows[:3]):
            header_text = ' '.join(cell.strip() for cell in row).lower()
            has_period_header = period_pattern.search(header_text)
            contains_data = bool(re.search(data_pattern, header_text))
            if has_period_header and not contains_data:
                # Found a period header, check if next row has years or is part of header
                if i + 1 < len(rows):
                    next_row = rows[i + 1]
                    next_text = ' '.join(cell.strip() for cell in next_row)
                    # Check if next row has years or quarter references
                    if (any(str(year) in next_text for year in range(2010, 2030)) or
                            any(q in next_text.lower() for q in ['q1', 'q2', 'q3', 'q4'])):
                        return rows[:i + 2], i + 2
                return rows[:i + 1], i + 1

        # Pattern 2: $ symbols in their own columns
        for i, analysis in enumerate(row_analyses):
            if analysis['dollar_signs'] > 0:
                # If we see $ symbols, previous row might be header
                if i > 0:
                    return rows[:i], i
                return [], i

        # Pattern 3: Look for transition from text to numbers with $ alignment
        for i in range(len(rows) - 1):
            curr_analysis = TableProcessor._analyze_row(rows[i])
            next_analysis = TableProcessor._analyze_row(rows[i + 1])

            if (curr_analysis['financial_values'] == 0 and
                    next_analysis['financial_values'] > 0 and
                    next_analysis['dollar_signs'] > 0):
                return rows[:i + 1], i + 1

        # Default to no headers if no clear pattern found
        return [], 0
# In TableProcessor class
|
||||
    @staticmethod
    def _detect_and_fix_misalignment(virtual_rows: list[list[str]], data_start_idx: int) -> list[list[str]]:
        """
        Detect and fix misalignment between date headers and numeric data columns.
        Returns corrected virtual rows.

        When every date header sits exactly one column LEFT of a numeric
        data column, the single header row containing the dates is shifted
        one column to the right so headers line up over their values.
        Rows are otherwise returned unchanged.
        """
        if not virtual_rows or data_start_idx >= len(virtual_rows):
            return virtual_rows

        # Get header row (assumes dates are in the last header row)
        header_idx = data_start_idx - 1
        if header_idx < 0:
            return virtual_rows

        header_row = virtual_rows[data_start_idx - 1]

        # Find date columns in header
        date_columns = []
        for i, cell in enumerate(header_row):
            if TableProcessor._is_date_header(cell):
                date_columns.append(i)

        if not date_columns:
            return virtual_rows  # No date headers found

        # Find numeric columns in first few data rows
        numeric_columns = set()
        for row in virtual_rows[data_start_idx:data_start_idx + 3]:  # Check first 3 data rows
            for i, cell in enumerate(row):
                if TableProcessor._is_financial_value(cell, row, i):
                    numeric_columns.add(i)

        # Detect misalignment
        if date_columns and numeric_columns:
            # Check if dates are shifted right compared to numeric columns
            # (i.e. every date column is immediately left of a numeric column)
            dates_shifted = all(
                (i + 1) in numeric_columns
                for i in date_columns
            )
            if dates_shifted:
                # Fix alignment by shifting only the row containing dates.
                # Shallow copy: untouched rows are shared with the input.
                fixed_rows = virtual_rows.copy()
                # Find and fix only the row containing the dates
                for row_idx, row in enumerate(virtual_rows):
                    if row_idx < data_start_idx:  # Only check header rows
                        # Check if this row contains the dates by counting date headers
                        date_count = sum(1 for cell in row if TableProcessor._is_date_header(cell))
                        if date_count >= 2:  # If multiple dates found, this is our target row
                            new_row = [""] * len(row)  # Start with empty row
                            for i in range(len(row) - 1):
                                new_row[i + 1] = row[i]  # Copy each value one position right
                            fixed_rows[row_idx] = new_row
                            break  # Only fix one row
                return fixed_rows

        return virtual_rows
@staticmethod
|
||||
def _get_columns_to_remove(empty_cols: list[int], max_cols: int) -> set[int]:
|
||||
cols_to_remove = set()
|
||||
|
||||
# Handle leading empty columns
|
||||
for col in range(max_cols):
|
||||
if col in empty_cols:
|
||||
cols_to_remove.add(col)
|
||||
else:
|
||||
break
|
||||
|
||||
# Handle trailing empty columns
|
||||
for col in reversed(range(max_cols)):
|
||||
if col in empty_cols:
|
||||
cols_to_remove.add(col)
|
||||
else:
|
||||
break
|
||||
|
||||
# Handle consecutive empty columns in the middle
|
||||
i = 0
|
||||
while i < max_cols - 1:
|
||||
if i in empty_cols and (i + 1) in empty_cols:
|
||||
consecutive_empty = 0
|
||||
j = i
|
||||
while j < max_cols and j in empty_cols:
|
||||
consecutive_empty += 1
|
||||
j += 1
|
||||
cols_to_remove.update(range(i + 1, i + consecutive_empty))
|
||||
i = j
|
||||
else:
|
||||
i += 1
|
||||
|
||||
return cols_to_remove
|
||||
|
||||
@staticmethod
|
||||
def _merge_header_rows(header_rows: list[list[str]]) -> list[str]:
|
||||
"""Merge multiple header rows into one"""
|
||||
if not header_rows:
|
||||
return []
|
||||
|
||||
merged = []
|
||||
for col_idx in range(len(header_rows[0])):
|
||||
parts = []
|
||||
for row in header_rows:
|
||||
text = row[col_idx].strip()
|
||||
if text and text != '$': # Skip empty cells and lone $ symbols
|
||||
parts.append(text)
|
||||
merged.append('\n'.join(parts))
|
||||
return merged
|
||||
|
||||
    @staticmethod
    def _determine_column_alignments(rows: list[list[str]],
                                     data_start_idx: int,
                                     col_count: int) -> list[str]:
        """Determine alignment for each column.

        The first column is always left-aligned; any other column is
        right-aligned as soon as one of its data cells (rows from
        ``data_start_idx`` on) looks like a financial value, otherwise
        left-aligned.

        Returns a list of "left"/"right" strings, one per column.
        """
        alignments = []
        for col_idx in range(col_count):
            # First column always left-aligned
            if col_idx == 0:
                alignments.append("left")
                continue

            # Check if column contains numbers (stop at the first hit)
            is_numeric = False
            for row in rows[data_start_idx:]:
                cell = row[col_idx].strip()
                if cell and cell != '$':
                    if TableProcessor._is_financial_value(cell, row, col_idx):
                        is_numeric = True
                        break
            alignments.append("right" if is_numeric else "left")

        return alignments
@staticmethod
def _is_financial_value(text: str, row: list, col_idx: int) -> bool:
    """Check whether *text* represents a financial value in its row context.

    The full row and column index are provided so the cell to the left can
    be scanned for an adjacent '$' symbol (skipping empty spacer cells).
    A lone '$' is never a value; any other numeric cell is treated as a
    financial value whether or not a '$' is found.
    """
    value = text.strip()

    # A '$' on its own is a currency marker, not a value
    if value == '$':
        return False

    # Non-numeric text is never a financial value
    if not is_number(value):
        return False

    # Scan leftwards across empty spacer cells looking for a '$'
    for neighbor in range(col_idx - 1, -1, -1):
        cell = row[neighbor].strip()
        if cell == '$':
            return True
        if cell:
            # Any other non-empty cell ends the scan
            break

    # Numeric but no adjacent '$': still treated as a financial value
    return True
|
||||
|
||||
@staticmethod
|
||||
def _is_financial_metric(text: str) -> bool:
|
||||
"""Check if text represents a common financial metric"""
|
||||
text = text.lower().strip()
|
||||
metrics = [
|
||||
'revenue', 'sales', 'income', 'earnings', 'profit', 'loss',
|
||||
'assets', 'liabilities', 'equity', 'cash', 'expenses',
|
||||
'cost', 'margin', 'ebitda', 'eps', 'shares', 'tax',
|
||||
'operating', 'net', 'gross', 'total', 'capital',
|
||||
'depreciation', 'amortization', 'interest', 'debt'
|
||||
]
|
||||
return any(metric in text for metric in metrics)
|
||||
|
||||
@staticmethod
|
||||
def _format_data_rows(rows: list[list[str]]) -> list[list[str]]:
|
||||
"""Format data rows for display"""
|
||||
formatted_rows = []
|
||||
for row in rows:
|
||||
formatted_row = []
|
||||
for col_idx, cell in enumerate(row):
|
||||
content = cell.strip()
|
||||
if col_idx > 0: # Don't format first column
|
||||
# Handle parenthesized numbers
|
||||
if content.startswith('(') and content.endswith(')'):
|
||||
content = f"-{content[1:-1]}"
|
||||
formatted_row.append(content)
|
||||
formatted_rows.append(formatted_row)
|
||||
return formatted_rows
|
||||
|
||||
|
||||
class ColumnOptimizer:
    """Optimizes column widths for table rendering.

    Fits a table into a fixed character budget (``total_width``) by giving
    data columns their measured content width (at least
    ``min_data_col_width``) and letting the left label column take what is
    left, bounded by the target/max ratios. Text that no longer fits is
    wrapped via :meth:`_wrap_text`.
    """

    def __init__(self, total_width: int = 100, min_data_col_width: int = 15,
                 max_left_col_ratio: float = 0.5, target_left_col_ratio: float = 0.4):
        # total_width: overall character budget for the rendered table
        self.total_width = total_width
        # min_data_col_width: floor applied to every data (non-label) column
        self.min_data_col_width = min_data_col_width
        self.max_left_col_ratio = max_left_col_ratio  # Maximum portion of total width for left column
        self.target_left_col_ratio = target_left_col_ratio  # Target portion for left column

    def _measure_content_width(self, content: str) -> int:
        """Measure the display width of content, handling multiline text.

        Returns the length of the longest line (0 for empty content).
        """
        if not content:
            return 0
        lines = content.split('\n')
        return max(len(line) for line in lines)

    def _wrap_text(self, text: str, max_width: int) -> str:
        """
        Wrap text to specified width, preserving existing line breaks and word boundaries.
        If text already contains line breaks, preserve the original formatting.
        """
        # Fast path: empty text or already within budget
        if not text or len(text) <= max_width:
            return text

        # If text already contains line breaks, preserve them
        if '\n' in text:
            return text

        # Special handling for financial statement line items
        # (e.g. "Total, net: details...") — wrap description and details separately
        if ',' in text and ':' in text:
            # Split into main description and details
            parts = text.split(':', 1)
            if len(parts) == 2:
                desc, details = parts
                wrapped_desc = self._wrap_text(desc.strip(), max_width)
                wrapped_details = self._wrap_text(details.strip(), max_width)
                return f"{wrapped_desc}:\n{wrapped_details}"

        # Greedy word-wrap: accumulate words until the line budget is hit
        words = text.split()
        lines = []
        current_line = []
        current_length = 0

        for word in words:
            word_length = len(word)

            # Handle very long words
            if word_length > max_width:
                # If we have a current line, add it first
                if current_line:
                    lines.append(' '.join(current_line))
                    current_line = []
                    current_length = 0

                # Split long word across lines
                # NOTE(review): with max_width <= 1 the slice removes zero
                # characters and this loop never terminates — confirm callers
                # always pass widths > 1
                while word_length > max_width:
                    lines.append(word[:max_width - 1] + '-')
                    word = word[max_width - 1:]
                    word_length = len(word)
                if word:
                    current_line = [word]
                    current_length = word_length
                continue

            # +1 accounts for the joining space when the line is non-empty
            if current_length + word_length + (1 if current_line else 0) <= max_width:
                current_line.append(word)
                current_length += word_length + (1 if current_length else 0)
            else:
                if current_line:
                    lines.append(' '.join(current_line))
                current_line = [word]
                current_length = word_length

        # Flush the final partial line
        if current_line:
            lines.append(' '.join(current_line))

        return '\n'.join(lines)

    def optimize_columns(self, table: ProcessedTable) -> tuple[list[int], ProcessedTable]:
        """
        Optimize column widths and wrap text as needed.
        Returns (column_widths, modified_table)
        """
        # Column count is taken from the first data row; empty table passes through
        col_count = len(table.data_rows[0]) if table.data_rows else 0
        if not col_count:
            return [], table

        # Calculate maximum left column width based on total width
        max_left_col_width = int(self.total_width * self.max_left_col_ratio)
        target_left_col_width = int(self.total_width * self.target_left_col_ratio)

        # Initialize widths array
        widths = [0] * col_count

        # First pass: calculate minimum required widths for data columns
        for col in range(1, col_count):
            col_content_width = self.min_data_col_width
            # NOTE(review): headers are indexed with the data-row column count —
            # assumes len(table.headers) >= col_count; confirm upstream guarantees this
            if table.headers:
                col_content_width = max(col_content_width,
                                        self._measure_content_width(table.headers[col]))

            # Check numeric data width
            for row in table.data_rows:
                if col < len(row):
                    col_content_width = max(col_content_width,
                                            self._measure_content_width(row[col]))

            widths[col] = col_content_width

        # Calculate available space for left column
        data_cols_width = sum(widths[1:])
        available_left_width = self.total_width - data_cols_width

        # Determine left column width
        left_col_max_content = 0
        if table.headers and table.headers[0]:
            left_col_max_content = self._measure_content_width(table.headers[0])
        for row in table.data_rows:
            if row:
                left_col_max_content = max(left_col_max_content,
                                           self._measure_content_width(row[0]))

        # Set left column width based on constraints:
        # use the content width if it fits the target, otherwise clamp
        # between the target and the hard maximum
        if left_col_max_content <= target_left_col_width:
            widths[0] = left_col_max_content
        else:
            widths[0] = min(max_left_col_width,
                            max(target_left_col_width, available_left_width))

        # If we still exceed total width, redistribute data column space
        total_width = sum(widths)
        if total_width > self.total_width:
            excess = total_width - self.total_width
            data_cols = len(widths) - 1
            reduction_per_col = excess // data_cols

            # Reduce data columns while ensuring minimum width
            # NOTE(review): columns already at/near the minimum are skipped, so
            # the table can still exceed total_width after this pass — best-effort
            for i in range(1, len(widths)):
                if widths[i] - reduction_per_col >= self.min_data_col_width:
                    widths[i] -= reduction_per_col

        # Apply width constraints and wrap text
        modified_table = self._apply_column_constraints(table, widths)

        return widths, modified_table

    def _apply_column_constraints(self, table: ProcessedTable, widths: list[int]) -> ProcessedTable:
        """Apply width constraints to table content, wrapping text as needed.

        Headers are wrapped to their column width; for data rows only the
        first (label) column is wrapped. Returns a new ProcessedTable; the
        input table is not mutated.
        """
        # Wrap headers if present
        wrapped_headers = None
        if table.headers:
            wrapped_headers = [
                self._wrap_text(header, widths[i])
                for i, header in enumerate(table.headers)
            ]

        # Wrap data in first column only
        wrapped_rows = []
        for row in table.data_rows:
            wrapped_row = list(row)  # Make a copy
            wrapped_row[0] = self._wrap_text(row[0], widths[0])
            wrapped_rows.append(wrapped_row)

        return ProcessedTable(
            headers=wrapped_headers,
            data_rows=wrapped_rows,
            column_alignments=table.column_alignments
        )
|
||||
85
venv/lib/python3.10/site-packages/edgar/files/text.py
Normal file
85
venv/lib/python3.10/site-packages/edgar/files/text.py
Normal file
@@ -0,0 +1,85 @@
|
||||
from rich.console import Console
|
||||
from rich.highlighter import RegexHighlighter
|
||||
from rich.text import Text
|
||||
from rich.theme import Theme
|
||||
|
||||
from edgar.richtools import repr_rich
|
||||
|
||||
__all__ = ['PlainDocument', 'XmlDocument', 'JsonDocument', 'print_xml']
|
||||
|
||||
|
||||
|
||||
class PlainDocument:
    """Wraps plain-text content for display in a rich console."""

    def __init__(self, content: str):
        self.content = content

    def __str__(self):
        return self.content

    def __repr__(self):
        return repr_rich(Text(self.content))
|
||||
|
||||
|
||||
class XMLHighlighter(RegexHighlighter):
    """Apply style to XML syntax elements.

    Each named regex group below maps to a theme style via ``base_style``,
    e.g. group ``namespace`` resolves to style key ``xml.namespace``
    (see ``xml_theme``).
    """

    # Prefix joined with each group name to form the theme style key
    base_style = "xml."
    # Patterns are applied independently over the text; order here is the
    # order RegexHighlighter runs them in
    highlights = [
        # XML tags with namespaces
        r'(?P<namespace>[a-zA-Z0-9_-]+)(?=:)',  # matches the namespace prefix
        r'(?P<colon>:)',  # matches the colon separator
        r'(?P<tagname>[a-zA-Z0-9_-]+)(?:\s|>|/>)',  # matches the tag name after namespace
        # Attribute names and values
        r'(?P<attribute>\s[a-zA-Z0-9_-]+)(?==)',
        r'(?P<value>"[^"]*")',
        # Comments
        r'(?P<comment><!--[\s\S]*?-->)',
        # URLs in xmlns attributes
        r'(?P<url>http://[^\s<>"]+)',
    ]
|
||||
|
||||
# Define theme colors for different XML elements.
# Keys must match XMLHighlighter's group names under its "xml." base_style.
xml_theme = Theme({
    "xml.namespace": "magenta",  # pink/magenta for namespaces like 'us-gaap'
    "xml.colon": "magenta",  # keeping the colon the same color as namespace
    "xml.tagname": "bold orange1",  # tag names after the namespace
    "xml.attribute": "grey70",  # gray for attributes like 'contextRef'
    "xml.value": "green",  # green for attribute values and URLs
    "xml.comment": "grey58",  # gray for comments
    "xml.url": "green",  # green for URLs in xmlns
})
|
||||
|
||||
def print_xml(xml: str):
    """Print an XML string to the terminal with syntax highlighting.

    Uses XMLHighlighter and xml_theme to colorize namespaces, tags,
    attributes, values, comments and URLs.
    """
    highlighted_console = Console(highlighter=XMLHighlighter(), theme=xml_theme)
    highlighted_console.print(xml)
|
||||
|
||||
class XmlDocument:
    """Wraps XML content, rendered with syntax highlighting in rich consoles."""

    def __init__(self, content: str):
        self.content = content

    def __rich__(self):
        return Text(self.content)

    def __repr__(self):
        # NOTE(review): assumes repr_rich accepts highlighter/theme keyword
        # arguments — confirm against edgar.richtools.repr_rich's signature
        return repr_rich(self.__rich__(), highlighter=XMLHighlighter(), theme=xml_theme)

    def __str__(self):
        # NOTE(review): returns the highlighted repr rather than raw content,
        # unlike PlainDocument/JsonDocument — confirm this asymmetry is intended
        return repr(self)
|
||||
|
||||
|
||||
|
||||
class JsonDocument:
    """Wraps JSON content for display in a rich console."""

    def __init__(self, content: str):
        self.content = content

    def __str__(self):
        return self.content

    def __repr__(self):
        return repr_rich(Text(self.content))
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user