# edgartools/venv/lib/python3.10/site-packages/edgar/richtools.py

import itertools
import re
from io import StringIO
from typing import Optional, Union

import pandas as pd
import pyarrow as pa
from rich import box
from rich.console import Console
from rich.highlighter import RegexHighlighter
from rich.table import Table
from rich.text import Text
from rich.theme import Theme

# Names exported by ``from edgar.richtools import *``.
# NOTE(review): rich_to_svg and rich_to_png are defined in this module but are
# not listed here - confirm whether that omission is intentional.
__all__ = [
    'repr_rich',
    'rich_to_text',
    'strip_ansi_text',
    'df_to_rich_table',
    'colorize_words',
    'print_xml',
    'print_rich',
    'Docs'
]

# Maps a column (or index) name to the rich style name used for it.
# df_to_rich_table looks names up here and applies the result as both the
# column style and the header style; names that are absent get no style.
table_styles = {
    'form': 'dark_sea_green4',
    'filingDate': 'deep_sky_blue1',
    'filing_date': 'deep_sky_blue1',
    'filed': 'deep_sky_blue1',
    'Shares': 'deep_sky_blue1',
    'Reporting Owner': 'deep_sky_blue1',
    'issuer': 'deep_sky_blue1',
    'fact': 'deep_sky_blue1',
    'industry': 'deep_sky_blue1',
    'document': 'deep_sky_blue1'
}


def df_to_rich_table(
        df: Union[pd.DataFrame, pa.Table],
        index_name: Optional[str] = None,
        title: str = "",
        title_style: str = "",
        max_rows: int = 20,
        table_box:box.Box = box.SIMPLE) -> Table:
    """
    Convert a dataframe to a rich table

    The first column of the table holds the dataframe index. When the
    dataframe has more than ``max_rows`` rows, only the first and last
    ``max_rows // 2`` rows are shown, separated by a row of ``...``.

    :param df: The dataframe (or pyarrow Table) to convert to a rich Table
    :param index_name: The header shown above the index column
    :param title: The title of the Table
    :param title_style: The style of the Table title (defaults to "bold")
    :param max_rows: The row count above which the table is abbreviated
    :param table_box: The rich box style e.g. box.SIMPLE
    :return: a rich Table
    """
    if isinstance(df, pa.Table):
        # For speed, learn to sample the head and tail of the pyarrow table
        df = df.to_pandas()

    rich_table = Table(box=table_box, row_styles=["bold", ""], title=title, title_style=title_style or "bold")

    # The index is rendered as the leading column.
    index_name = str(index_name) if index_name else ""
    index_style = table_styles.get(index_name)
    rich_table.add_column(index_name, style=index_style, header_style=index_style)

    for column in df.columns:
        style_name = table_styles.get(column)
        # Column labels are not always strings (ints, tuples, timestamps);
        # rich can only render a str header, so stringify the label.
        rich_table.add_column(str(column), style=style_name, header_style=style_name)

    if len(df) > max_rows:
        half = max_rows // 2
        # A single '...' row marks the rows omitted between head and tail.
        ellipsis_row = pd.DataFrame([{col: '...' for col in df.columns}], index=['...'])
        data_for_display = pd.concat([df.head(half), ellipsis_row, df.tail(half)])
    else:
        data_for_display = df

    # reset_index moves the index into the first data column so that it lines
    # up with the index column added above.
    data_for_display = data_for_display.reset_index()

    for value_list in data_for_display.to_numpy().tolist():
        rich_table.add_row(*(str(value) for value in value_list))

    return rich_table


# Matches, in order of preference:
#   1. a complete OSC sequence: ESC ] payload terminated by BEL or ESC \
#      (for example the OSC 8 hyperlinks emitted by terminals/rich),
#   2. a two-character escape: ESC followed by one of @-Z, \, ], ^, _,
#   3. a CSI sequence: ESC [ parameters intermediates final-byte.
# The OSC alternative must come first; otherwise alternative 2 consumes only
# "ESC ]" and leaves the OSC payload behind in the text.
_ANSI_ESCAPE_RE = re.compile(
    r'\x1B(?:\][^\x07\x1B]*(?:\x07|\x1B\\)|[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])'
)


def strip_ansi_text(text: str) -> str:
    """
    Remove ANSI escape sequences from text

    Handles CSI sequences (colors, cursor movement), two-character escapes and
    terminated OSC sequences (window titles, hyperlinks). An OSC introducer
    with no terminator only has its two introducer characters removed.

    :param text: Text containing ANSI escape sequences
    :return: Clean text without ANSI formatting
    """
    return _ANSI_ESCAPE_RE.sub('', text)

def repr_rich(renderable, strip_ansi:bool=False, **console_args) -> str:
    """
    Render a rich object to a string.

    A Console is created from ``console_args`` and the renderable is printed
    to it inside a capture context, following the "capturing output" approach
    described in the rich documentation:

    https://rich.readthedocs.io/en/stable/console.html#capturing-output

    :param renderable: A rich renderable object
    :param strip_ansi: Whether to strip ANSI escape sequences from the output
    :param console_args: Keyword arguments forwarded to the rich Console
    :return: A string representation of the renderable object
    """
    from rich.console import Console

    capture_console = Console(**console_args)
    with capture_console.capture() as captured:
        capture_console.print(renderable)

    rendered = captured.get()
    return strip_ansi_text(rendered) if strip_ansi else rendered


def rich_to_text(rich_object, width:int=None) -> str:
    """
    Render a Rich object as plain text, keeping its layout.

    Args:
        rich_object: Any Rich renderable object (Panel, Table, Tree, etc.)
        width: The width of the output in characters. A falsy value
               (None or 0) leaves the width up to the Console default.

    Returns:
        str: Plain text representation with ANSI escape sequences removed
    """
    console_options = {"force_terminal": False}
    if width:
        console_options["width"] = width
    # strip_ansi=True makes repr_rich pass its output through strip_ansi_text.
    return repr_rich(rich_object, strip_ansi=True, **console_options)


def rich_to_svg(rich_object, width: int = 120) -> str:
    """
    Render a Rich object to an SVG document string.

    The object is printed to a recording Console backed by an in-memory
    buffer, and the recording is then exported with Rich's built-in SVG
    exporter.

    Args:
        rich_object: Any Rich renderable object (Panel, Table, Tree, etc.)
        width: The width of the output in characters (default: 120)

    Returns:
        str: SVG representation of the Rich object

    Example:
        >>> from rich.table import Table
        >>> table = Table(title="Example")
        >>> table.add_column("Name")
        >>> table.add_row("Alice")
        >>> svg_output = rich_to_svg(table)
    """
    from rich.console import Console

    recorder = Console(
        file=StringIO(),          # keep the rendering off the real stdout
        force_terminal=True,      # ensure styling is applied
        record=True,              # recording is what export_svg() reads back
        width=width,
        color_system="standard",  # use standard colors for better SVG compatibility
    )
    recorder.print(rich_object)
    return recorder.export_svg()


def rich_to_png(rich_object, width: int = 120, output_path: str = None) -> Optional[bytes]:
    """
    Render a Rich object to a PNG image.

    The object is first rendered to SVG with rich_to_svg, and the SVG is then
    rasterized with CairoSVG.

    Args:
        rich_object: Any Rich renderable object (Panel, Table, Tree, etc.)
        width: The width of the output in characters (default: 120)
        output_path: Optional path to save the PNG file. If not provided,
                    returns the PNG as bytes.

    Returns:
        bytes: PNG image data if output_path is None,
              None if output_path is provided (file is saved instead)

    Raises:
        ImportError: If CairoSVG is not installed

    Example:
        >>> from rich.table import Table
        >>> table = Table(title="Example")
        >>> table.add_column("Name")
        >>> table.add_row("Alice")
        >>> png_data = rich_to_png(table)
        >>> # Or save to file:
        >>> rich_to_png(table, output_path="output.png")
    """
    try:
        import cairosvg
    except ImportError:
        raise ImportError(
            "CairoSVG is required for PNG conversion. "
            "Install it with: pip install cairosvg"
        ) from None

    svg_bytes = rich_to_svg(rich_object, width=width).encode('utf-8')

    # No destination given: hand the PNG bytes back to the caller.
    if not output_path:
        return cairosvg.svg2png(bytestring=svg_bytes)

    # Destination given: write the file and return nothing.
    cairosvg.svg2png(bytestring=svg_bytes, write_to=output_path)
    return None


def colorize_words(words, colors=None) -> Text:
    """
    Build a rich Text in which each word is styled with the next color.

    Colors are applied in order and repeat from the start once exhausted.
    When ``colors`` is None or empty a default three-color palette is used.
    """
    palette = itertools.cycle(colors or ["deep_sky_blue3", "red3", "dark_sea_green4"])
    styled_parts = [(word, next(palette)) for word in words]
    return Text.assemble(*styled_parts)


class XMLHighlighter(RegexHighlighter):
    """Apply style to XML syntax elements.

    Each pattern below tags its match with a named group; the style names
    defined in ``xml_theme`` are ``base_style`` followed by that group name
    (for example ``xml.namespace``).
    """

    # Prefix shared by the xml.* style names in xml_theme
    base_style = "xml."
    highlights = [
        # XML tags with namespaces
        r'(?P<namespace>[a-zA-Z0-9_-]+)(?=:)',  # a word immediately followed by a colon (the namespace prefix)
        r'(?P<colon>:)',  # matches the colon separator
        # A word followed by whitespace, '>' or '/>'.
        # NOTE(review): not anchored to a preceding colon, so this can also
        # match words outside of tag names - confirm this is acceptable.
        r'(?P<tagname>[a-zA-Z0-9_-]+)(?:\s|>|/>)',
        # Attribute names (whitespace + word, immediately followed by '=') and values
        r'(?P<attribute>\s[a-zA-Z0-9_-]+)(?==)',
        r'(?P<value>"[^"]*")',  # a double-quoted string
        # Comments (may span multiple lines)
        r'(?P<comment><!--[\s\S]*?-->)',
        # URLs in xmlns attributes; only the http:// scheme is matched
        r'(?P<url>http://[^\s<>"]+)',
    ]

# Define theme colors for different XML elements.
# Each key is XMLHighlighter.base_style ("xml.") plus the name of a regex
# group declared in XMLHighlighter.highlights.
xml_theme = Theme({
    "xml.namespace": "magenta",  # pink/magenta for namespaces like 'us-gaap'
    "xml.colon": "magenta",     # keeping the colon the same color as namespace
    "xml.tagname": "light_goldenrod3",   # tag names after the namespace
    "xml.attribute": "grey70",  # gray for attributes like 'contextRef'
    "xml.value": "green",       # green for quoted attribute values
    "xml.comment": "grey58",  # gray for comments
    "xml.url": "green",         # green for URLs in xmlns
})

def print_xml(xml: str):
    """Print an XML string to the console using XMLHighlighter and xml_theme."""
    xml_console = Console(theme=xml_theme, highlighter=XMLHighlighter())
    xml_console.print(xml)

def print_rich(rich_object, **args):
    """
    Print a rich renderable to a newly created Console.

    :param rich_object: The object to print
    :param args: Keyword arguments forwarded to the rich Console constructor
    """
    Console(**args).print(rich_object)


class Docs:
    """
    A class that will show documentation for any class in edgartools

    Usage
    ```python
        filing = filings[0]

        filing.docs # Will create a Docs instance from the __docs__ attribute
    ```
    """

    def __init__(self, obj, docs_content: Optional[str] = None):
        """
        Initialize the Docs class with an object and optional documentation content.

        Args:
            obj: The object to document
            docs_content: Optional documentation content string. If not provided
                         (or empty), falls back to a markdown file, then the
                         object's __doc__, then a "No documentation" message.
        """
        self.obj = obj
        self.docs_content = docs_content

        # If no docs_content provided, try the sources in order of preference:
        # 1. a markdown file named after the class, 2. __doc__, 3. a placeholder.
        if not self.docs_content:
            self.docs_content = (
                self._find_markdown_docs()
                or getattr(obj, '__doc__', None)
                or f"No documentation available for {type(obj).__name__}"
            )

    def _find_markdown_docs(self) -> Optional[str]:
        """
        Look for a markdown file with the same name as the class in a docs directory
        next to the module that defines the object.

        This lookup is best-effort: any failure results in None rather than an
        exception, so that a Docs object can always be constructed.

        Returns:
            Optional[str]: The content of the markdown file if found, None otherwise
        """
        import inspect
        import os

        # Classes and functions carry __name__; for instances use the type's name
        class_name = getattr(self.obj, '__name__', None) or type(self.obj).__name__

        try:
            # Get the module where the object is defined
            if hasattr(self.obj, '__module__'):
                module = inspect.getmodule(self.obj)
            else:
                module = inspect.getmodule(type(self.obj))

            # Built-in modules have no __file__, so there is nowhere to look
            module_file = getattr(module, '__file__', None) if module else None
            if not module_file:
                return None

            # Look for <module dir>/docs/<ClassName>.md
            docs_dir = os.path.join(os.path.dirname(os.path.abspath(module_file)), 'docs')
            if not os.path.exists(docs_dir):
                return None

            markdown_file = os.path.join(docs_dir, f"{class_name}.md")
            if not os.path.exists(markdown_file):
                return None

            with open(markdown_file, 'r', encoding='utf-8') as f:
                return f.read()

        except Exception:
            # Deliberately broad: a missing or unreadable docs file must never
            # prevent documentation from falling back to __doc__.
            return None

    def __rich__(self):
        """
        Return a Rich renderable representation of the documentation.

        The content is rendered as Markdown when it contains a code fence,
        a '#' or a '*'; otherwise it is rendered as plain text. The result is
        wrapped in a Panel titled with the documented object's name.
        """
        from rich.markdown import Markdown
        from rich.panel import Panel
        from rich.text import Text

        # Get the object name for the title
        obj_name = getattr(self.obj, '__name__', None) or type(self.obj).__name__
        title = Text.assemble((obj_name, "bold white"))

        # Try to render as markdown if it looks like markdown, otherwise as plain text
        looks_like_markdown = bool(self.docs_content) and any(
            marker in self.docs_content for marker in ('```', '#', '*')
        )
        if looks_like_markdown:
            content = Markdown(self.docs_content)
        else:
            content = Text(self.docs_content or "No documentation available")

        # Create a panel with the documentation
        return Panel(
            content,
            title=title,
            border_style="blue",
            padding=(1, 2),
            expand=False
        )

    def __repr__(self):
        """
        Return a string representation of the Docs object.
        """
        return repr_rich(self.__rich__())

    def _split_into_sections(self):
        """
        Split markdown content into sections by ## headings.

        Any text before the first ## heading forms its own leading section.
        Deeper headings (###, ####) and top-level # headings do not start a
        new section; they stay inside the section that contains them.

        Returns:
            List[str]: List of document sections. Every section except a
            possible leading one starts with a ## heading.
        """
        if not self.docs_content:
            return []

        sections = []
        current_section = []

        for line in self.docs_content.split('\n'):
            # '## ' (with the trailing space) can only match a level-2 heading:
            # '### ...' has '#' where the space is required.
            if line.startswith('## '):
                # Save previous section if it exists
                if current_section:
                    sections.append('\n'.join(current_section))
                # Start new section with this heading
                current_section = [line]
            else:
                current_section.append(line)

        # Don't forget the last section
        if current_section:
            sections.append('\n'.join(current_section))

        return sections

    def search(self, query: str, use_bm25: bool = True):
        r"""
        Search documentation content for relevant sections.

        Uses BM25 ranking by default to find sections matching the query.
        Splits documentation by ## headings and searches those sections.

        Args:
            query: Search query (e.g., "extract revenue", "query by period")
            use_bm25: Use BM25 search (True) or regex pattern matching (False)

        Returns:
            SearchResults: Matching documentation sections with scores

        Example:
            >>> filing.docs.search("get attachments")
            # Returns sections about accessing filing attachments

            >>> xbrl.docs.search("extract revenue")
            # Returns sections about extracting revenue from statements

            >>> xbrl.docs.search("\.to_dataframe\(\)", use_bm25=False)
            # Regex search for exact pattern
        """
        from edgar.search.textsearch import BM25Search, RegexSearch

        # Split content into searchable sections
        sections = self._split_into_sections()

        if not sections:
            # Return empty results if no content
            from edgar.search.textsearch import SearchResults
            return SearchResults(query=query, sections=[], tables=False)

        # Use appropriate search method
        searcher = BM25Search(sections) if use_bm25 else RegexSearch(sections)
        return searcher.search(query, tables=False)