# SPDX-License-Identifier: MIT
# Copyright (C) 2022 Max Bachmann

from __future__ import annotations

from rapidfuzz._common_py import conv_sequences
from rapidfuzz._utils import is_none, setupPandas


def _jaro_calculate_similarity(pattern_len, text_len, common_chars, transpositions):
    transpositions //= 2
    sim = 0.0
    sim += common_chars / pattern_len
    sim += common_chars / text_len
    sim += (common_chars - transpositions) / common_chars
    return sim / 3.0

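# The helper above evaluates the Jaro formula
#     sim = (m / |s1| + m / |s2| + (m - t) / m) / 3
# where m is the number of matching characters and t is half the number of
# matched characters that are out of order (the helper halves its
# transpositions argument itself). A minimal worked sketch (illustrative only,
# not part of the library): "martha" vs "marhta" has m = 6 matches with 2 of
# them out of order, so
#
#     _jaro_calculate_similarity(6, 6, 6, 2)
#     # transpositions //= 2 -> 1
#     # (6/6 + 6/6 + (6 - 1)/6) / 3 ~= 0.944
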
def _jaro_length_filter(pattern_len, text_len, score_cutoff):
    """
    filter matches below score_cutoff based on string lengths
    """
    if not pattern_len or not text_len:
        return False

    sim = _jaro_calculate_similarity(pattern_len, text_len, min(pattern_len, text_len), 0)
    return sim >= score_cutoff

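# Sketch of the length-based short circuit above (illustrative only): even if
# every character of the shorter string matched, "abcdef" vs "abc" could not
# exceed (3/6 + 3/3 + 3/3) / 3 ~= 0.833, so a cutoff of 0.9 is unreachable:
#
#     _jaro_length_filter(6, 3, 0.9)  # False
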
def _jaro_common_char_filter(pattern_len, text_len, common_chars, score_cutoff):
    """
    filter matches below score_cutoff based on string lengths and common characters
    """
    if not common_chars:
        return False

    sim = _jaro_calculate_similarity(pattern_len, text_len, common_chars, 0)
    return sim >= score_cutoff

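# Sketch of the common-character short circuit above (illustrative only): with
# only 2 matches found between two strings of length 6, the similarity is at
# most (2/6 + 2/6 + 2/2) / 3 ~= 0.556, so a cutoff of 0.9 is unreachable:
#
#     _jaro_common_char_filter(6, 6, 2, 0.9)  # False
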
def _jaro_bounds(s1, s2):
    """
    find bounds and skip out of bound parts of the sequences
    """
    pattern_len = len(s1)
    text_len = len(s2)

    # since jaro uses a sliding window some parts of T/P might never be in
    # range and can be removed ahead of time
    bound = 0
    if text_len > pattern_len:
        bound = text_len // 2 - 1
        if text_len > pattern_len + bound:
            s2 = s2[: pattern_len + bound]
    else:
        bound = pattern_len // 2 - 1
        if pattern_len > text_len + bound:
            s1 = s1[: text_len + bound]
    return s1, s2, bound

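# Sketch of the bound trimming above (illustrative only): with a short pattern
# and a long text, characters past pattern_len + bound can never fall inside
# the Jaro search window, so they are dropped ahead of time:
#
#     _jaro_bounds("abc", "abcdefgh")
#     # bound = 8 // 2 - 1 = 3, so "abcdefgh" is trimmed to "abcdef"
#     # -> ("abc", "abcdef", 3)
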
def similarity(
    s1,
    s2,
    *,
    processor=None,
    score_cutoff=None,
):
    """
    Calculates the Jaro similarity

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    processor: callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_cutoff : float, optional
        Optional argument for a score threshold as a float between 0 and 1.0.
        For ratio < score_cutoff 0 is returned instead. Default is None,
        which deactivates this behaviour.

    Returns
    -------
    similarity : float
        similarity between s1 and s2 as a float between 0 and 1.0
    """
    setupPandas()
    if is_none(s1) or is_none(s2):
        return 0.0

    if processor is not None:
        s1 = processor(s1)
        s2 = processor(s2)

    if not s1 and not s2:
        return 1.0

    if score_cutoff is None:
        score_cutoff = 0

    s1, s2 = conv_sequences(s1, s2)
    pattern_len = len(s1)
    text_len = len(s2)

    # short circuit if score_cutoff can not be reached
    if not _jaro_length_filter(pattern_len, text_len, score_cutoff):
        return 0

    if pattern_len == 1 and text_len == 1:
        return float(s1[0] == s2[0])

    s1, s2, bound = _jaro_bounds(s1, s2)

    s1_flags = [False] * pattern_len
    s2_flags = [False] * text_len

    # todo use bitparallel implementation
    # looking only within search range, count & flag matched pairs
    common_chars = 0
    for i, s1_ch in enumerate(s1):
        low = max(0, i - bound)
        hi = min(i + bound, text_len - 1)
        for j in range(low, hi + 1):
            if not s2_flags[j] and s2[j] == s1_ch:
                s1_flags[i] = s2_flags[j] = True
                common_chars += 1
                break

    # short circuit if score_cutoff can not be reached
    if not _jaro_common_char_filter(pattern_len, text_len, common_chars, score_cutoff):
        return 0

    # todo use bitparallel implementation
    # count transpositions
    k = trans_count = 0
    for i, s1_f in enumerate(s1_flags):
        if s1_f:
            for j in range(k, text_len):
                if s2_flags[j]:
                    k = j + 1
                    break

            if s1[i] != s2[j]:
                trans_count += 1

    return _jaro_calculate_similarity(pattern_len, text_len, common_chars, trans_count)

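# A minimal usage sketch for similarity() above (the example strings are
# illustrative, not taken from the library's test suite):
#
#     similarity("martha", "marhta")                       # ~0.944
#     similarity("MARTHA", "marhta", processor=str.lower)  # ~0.944
#     similarity("abcdef", "abc", score_cutoff=0.9)        # 0 (cutoff unreachable)
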
def normalized_similarity(
    s1,
    s2,
    *,
    processor=None,
    score_cutoff=None,
):
    """
    Calculates the normalized Jaro similarity

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    processor: callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_cutoff : float, optional
        Optional argument for a score threshold as a float between 0 and 1.0.
        For ratio < score_cutoff 0 is returned instead. Default is None,
        which deactivates this behaviour.

    Returns
    -------
    normalized similarity : float
        normalized similarity between s1 and s2 as a float between 0 and 1.0
    """
    return similarity(s1, s2, processor=processor, score_cutoff=score_cutoff)

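# Note: the Jaro similarity is already a normalized score in [0, 1], which is
# why normalized_similarity() above simply forwards to similarity(). Sketch
# (illustrative only):
#
#     normalized_similarity("martha", "marhta")  # ~0.944, same as similarity()
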
def distance(
    s1,
    s2,
    *,
    processor=None,
    score_cutoff=None,
):
    """
    Calculates the Jaro distance

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    processor: callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_cutoff : float, optional
        Optional argument for a score threshold as a float between 0 and 1.0.
        For ratio > score_cutoff 1.0 is returned instead. Default is None,
        which deactivates this behaviour.

    Returns
    -------
    distance : float
        distance between s1 and s2 as a float between 0.0 and 1.0
    """
    setupPandas()
    if is_none(s1) or is_none(s2):
        return 1.0

    if processor is not None:
        s1 = processor(s1)
        s2 = processor(s2)

    cutoff_distance = None if (score_cutoff is None or score_cutoff > 1.0) else 1.0 - score_cutoff
    sim = similarity(s1, s2, score_cutoff=cutoff_distance)
    dist = 1.0 - sim
    return dist if (score_cutoff is None or dist <= score_cutoff) else 1.0

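# A minimal usage sketch for distance() above (illustrative only): the
# distance is 1 - similarity, and results above score_cutoff are reported
# as 1.0:
#
#     distance("martha", "marhta")                     # ~0.056
#     distance("martha", "marhta", score_cutoff=0.01)  # 1.0, since ~0.056 > 0.01
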
def normalized_distance(
    s1,
    s2,
    *,
    processor=None,
    score_cutoff=None,
):
    """
    Calculates the normalized Jaro distance

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    processor: callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_cutoff : float, optional
        Optional argument for a score threshold as a float between 0 and 1.0.
        For ratio > score_cutoff 1.0 is returned instead. Default is None,
        which deactivates this behaviour.

    Returns
    -------
    normalized distance : float
        normalized distance between s1 and s2 as a float between 0.0 and 1.0
    """
    return distance(s1, s2, processor=processor, score_cutoff=score_cutoff)
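
# Likewise, the Jaro distance is already normalized to [0, 1], so
# normalized_distance() above simply forwards to distance(). Sketch
# (illustrative only):
#
#     normalized_distance("martha", "marhta")  # ~0.056, same as distance()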