Initial commit
@@ -0,0 +1,426 @@
# SPDX-License-Identifier: MIT
# Copyright (C) 2022 Max Bachmann
from __future__ import annotations

from rapidfuzz._common_py import common_affix, conv_sequences
from rapidfuzz._utils import is_none, setupPandas
from rapidfuzz.distance._initialize_py import Editop, Editops


def similarity(
    s1,
    s2,
    *,
    processor=None,
    score_cutoff=None,
):
    """
    Calculates the length of the longest common subsequence.

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    processor : callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_cutoff : int, optional
        Score threshold as the minimal similarity that is still considered a
        result. If the similarity is smaller than score_cutoff, 0 is returned
        instead. Default is None, which deactivates this behaviour.

    Returns
    -------
    similarity : int
        similarity between s1 and s2
    """
    if processor is not None:
        s1 = processor(s1)
        s2 = processor(s2)

    if not s1:
        return 0

    s1, s2 = conv_sequences(s1, s2)
    S = (1 << len(s1)) - 1
    block = {}
    block_get = block.get
    x = 1
    for ch1 in s1:
        block[ch1] = block_get(ch1, 0) | x
        x <<= 1

    for ch2 in s2:
        Matches = block_get(ch2, 0)
        u = S & Matches
        S = (S + u) | (S - u)

    # calculate the equivalent of popcount(~S) in C. This breaks for len(s1) == 0,
    # which is handled by the early return above.
    res = bin(S)[-len(s1) :].count("0")
    return res if (score_cutoff is None or res >= score_cutoff) else 0


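# --- Illustrative sketch (hypothetical helper, not part of the rapidfuzz API) ---
# The loop above is Hyyrö's bit-parallel LCS computation: `block` maps every
# character of s1 to a bitmask of its positions, each `S = (S + u) | (S - u)`
# step consumes one character of s2, and the zero bits left in the low len(s1)
# bits of S count the matched positions. The classic O(len(s1) * len(s2))
# dynamic program below computes the same quantity and can serve as a
# cross-check; the helper name is made up for this sketch.
def _lcs_length_reference(s1, s2):
    prev = [0] * (len(s2) + 1)
    for ch1 in s1:
        cur = [0] * (len(s2) + 1)
        for j, ch2 in enumerate(s2, start=1):
            if ch1 == ch2:
                cur[j] = prev[j - 1] + 1
            else:
                cur[j] = max(prev[j], cur[j - 1])
        prev = cur
    return prev[-1]

# e.g. _lcs_length_reference("lewenstein", "levenshtein") == 9
#      == similarity("lewenstein", "levenshtein")

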
def _block_similarity(
    block,
    s1,
    s2,
    score_cutoff=None,
):
    if not s1:
        return 0

    S = (1 << len(s1)) - 1
    block_get = block.get

    for ch2 in s2:
        Matches = block_get(ch2, 0)
        u = S & Matches
        S = (S + u) | (S - u)

    # calculate the equivalent of popcount(~S) in C. This breaks for len(s1) == 0
    res = bin(S)[-len(s1) :].count("0")
    return res if (score_cutoff is None or res >= score_cutoff) else 0


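# --- Illustrative sketch (hypothetical usage, not part of the public API) ---
# _block_similarity expects `block` to be the character -> position-bitmask
# dictionary that similarity() builds internally for s1, so the s1 preprocessing
# can be paid once and reused when one query is compared against many choices.
# The helper and variable names below are made up for this sketch.
def _build_block(s1):
    block = {}
    x = 1
    for ch1 in s1:
        block[ch1] = block.get(ch1, 0) | x
        x <<= 1
    return block

# query = "lewenstein"
# block = _build_block(query)
# scores = [_block_similarity(block, query, choice) for choice in choices]

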
def distance(
    s1,
    s2,
    *,
    processor=None,
    score_cutoff=None,
):
    """
    Calculates the LCS distance in the range [0, max].

    This is calculated as ``max(len1, len2) - similarity``.

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    processor : callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_cutoff : int, optional
        Maximum distance between s1 and s2 that is still considered a result.
        If the distance is bigger than score_cutoff, score_cutoff + 1 is
        returned instead. Default is None, which deactivates this behaviour.

    Returns
    -------
    distance : int
        distance between s1 and s2

    Examples
    --------
    Find the LCS distance between two strings:

    >>> from rapidfuzz.distance import LCSseq
    >>> LCSseq.distance("lewenstein", "levenshtein")
    2

    Setting a maximum distance allows the implementation to choose a more
    efficient approach:

    >>> LCSseq.distance("lewenstein", "levenshtein", score_cutoff=1)
    2

    """
    if processor is not None:
        s1 = processor(s1)
        s2 = processor(s2)

    s1, s2 = conv_sequences(s1, s2)
    maximum = max(len(s1), len(s2))
    sim = similarity(s1, s2)
    dist = maximum - sim
    return dist if (score_cutoff is None or dist <= score_cutoff) else score_cutoff + 1


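# Worked example (values taken from the docstring above, shown for clarity):
#   similarity("lewenstein", "levenshtein") == 9
#   distance("lewenstein", "levenshtein") == max(10, 11) - 9 == 2

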
def normalized_distance(
    s1,
    s2,
    *,
    processor=None,
    score_cutoff=None,
):
    """
    Calculates a normalized LCS distance in the range [0, 1].

    This is calculated as ``distance / max(len1, len2)``.

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    processor : callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_cutoff : float, optional
        Optional argument for a score threshold as a float between 0 and 1.0.
        For norm_dist > score_cutoff 1.0 is returned instead. Default is None,
        which deactivates this behaviour.

    Returns
    -------
    norm_dist : float
        normalized distance between s1 and s2 as a float between 0 and 1.0
    """
    setupPandas()
    if is_none(s1) or is_none(s2):
        return 1.0

    if processor is not None:
        s1 = processor(s1)
        s2 = processor(s2)

    # two empty sequences are identical; this also avoids a division by zero below
    if not s1 and not s2:
        return 0

    s1, s2 = conv_sequences(s1, s2)
    maximum = max(len(s1), len(s2))
    norm_dist = distance(s1, s2) / maximum
    return norm_dist if (score_cutoff is None or norm_dist <= score_cutoff) else 1


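# Worked example (continuing the numbers above, shown for clarity):
#   distance("lewenstein", "levenshtein") == 2 and max(10, 11) == 11,
#   so normalized_distance("lewenstein", "levenshtein") == 2 / 11 (about 0.1818).

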
def normalized_similarity(
    s1,
    s2,
    *,
    processor=None,
    score_cutoff=None,
):
    """
    Calculates a normalized LCS similarity in the range [0, 1].

    This is calculated as ``1 - normalized_distance``.

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    processor : callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_cutoff : float, optional
        Optional argument for a score threshold as a float between 0 and 1.0.
        For norm_sim < score_cutoff 0 is returned instead. Default is None,
        which deactivates this behaviour.

    Returns
    -------
    norm_sim : float
        normalized similarity between s1 and s2 as a float between 0 and 1.0

    Examples
    --------
    Find the normalized LCS similarity between two strings:

    >>> from rapidfuzz.distance import LCSseq
    >>> LCSseq.normalized_similarity("lewenstein", "levenshtein")
    0.8181818181818181

    Setting a score_cutoff allows the implementation to choose a more
    efficient approach:

    >>> LCSseq.normalized_similarity("lewenstein", "levenshtein", score_cutoff=0.9)
    0.0

    When a different processor is used, s1 and s2 do not have to be strings:

    >>> LCSseq.normalized_similarity(["lewenstein"], ["levenshtein"], processor=lambda s: s[0])
    0.8181818181818181
    """
    setupPandas()
    if is_none(s1) or is_none(s2):
        return 0.0

    if processor is not None:
        s1 = processor(s1)
        s2 = processor(s2)

    norm_sim = 1.0 - normalized_distance(s1, s2)
    return norm_sim if (score_cutoff is None or norm_sim >= score_cutoff) else 0


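# Worked example (shown for clarity): normalized_distance == 2 / 11, so
# normalized_similarity == 1.0 - 2 / 11 (about 0.8182), matching the docstring
# above; with score_cutoff=0.9 this falls below the threshold and 0.0 is returned.

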
def _matrix(s1, s2):
    if not s1:
        return (0, [])

    S = (1 << len(s1)) - 1
    block = {}
    block_get = block.get
    x = 1
    for ch1 in s1:
        block[ch1] = block_get(ch1, 0) | x
        x <<= 1

    matrix = []
    for ch2 in s2:
        Matches = block_get(ch2, 0)
        u = S & Matches
        S = (S + u) | (S - u)
        matrix.append(S)

    # calculate the equivalent of popcount(~S) in C. This breaks for len(s1) == 0
    sim = bin(S)[-len(s1) :].count("0")
    return (sim, matrix)


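# --- Illustrative note (hypothetical helper, not part of the module) ---
# _matrix() keeps one bit-vector S per character of s2 so that editops() can
# walk the rows backwards and recover which positions of s1 were matched.
# The small sketch below, assuming plain string inputs, only restates that
# relationship.
def _matrix_consistency_sketch(s1, s2):
    sim, rows = _matrix(s1, s2)
    # same LCS length as the public scorer, and one saved row per s2 character
    return sim == similarity(s1, s2) and len(rows) == len(s2)

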
def editops(
    s1,
    s2,
    *,
    processor=None,
):
    """
    Return Editops describing how to turn s1 into s2.

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    processor : callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.

    Returns
    -------
    editops : Editops
        edit operations required to turn s1 into s2

    Notes
    -----
    The alignment is calculated using an algorithm by Heikki Hyyrö, which is
    described in [6]_. It has a time complexity and memory usage of ``O([N/64] * M)``.

    References
    ----------
    .. [6] Hyyrö, Heikki. "A Note on Bit-Parallel Alignment Computation."
           Stringology (2004).

    Examples
    --------
    >>> from rapidfuzz.distance import LCSseq
    >>> for tag, src_pos, dest_pos in LCSseq.editops("qabxcd", "abycdf"):
    ...     print(("%7s s1[%d] s2[%d]" % (tag, src_pos, dest_pos)))
     delete s1[0] s2[0]
     delete s1[3] s2[2]
     insert s1[4] s2[2]
     insert s1[6] s2[5]
    """
    if processor is not None:
        s1 = processor(s1)
        s2 = processor(s2)

    s1, s2 = conv_sequences(s1, s2)
    prefix_len, suffix_len = common_affix(s1, s2)
    s1 = s1[prefix_len : len(s1) - suffix_len]
    s2 = s2[prefix_len : len(s2) - suffix_len]
    sim, matrix = _matrix(s1, s2)

    editops = Editops([], 0, 0)
    editops._src_len = len(s1) + prefix_len + suffix_len
    editops._dest_len = len(s2) + prefix_len + suffix_len

    dist = len(s1) + len(s2) - 2 * sim
    if dist == 0:
        return editops

    editop_list = [None] * dist
    col = len(s1)
    row = len(s2)
    while row != 0 and col != 0:
        # deletion
        if matrix[row - 1] & (1 << (col - 1)):
            dist -= 1
            col -= 1
            editop_list[dist] = Editop("delete", col + prefix_len, row + prefix_len)
        else:
            row -= 1

            # insertion
            if row and not (matrix[row - 1] & (1 << (col - 1))):
                dist -= 1
                editop_list[dist] = Editop("insert", col + prefix_len, row + prefix_len)
            # match
            else:
                col -= 1

    while col != 0:
        dist -= 1
        col -= 1
        editop_list[dist] = Editop("delete", col + prefix_len, row + prefix_len)

    while row != 0:
        dist -= 1
        row -= 1
        editop_list[dist] = Editop("insert", col + prefix_len, row + prefix_len)

    editops._editops = editop_list
    return editops


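# --- Illustrative sketch (hypothetical helper, not part of the module) ---
# Applying the returned Editops to s1 reproduces s2, which is a convenient way
# to check the backtracking above. The helper assumes plain string inputs and
# is made up for this sketch; LCS editops only contain "insert" and "delete".
def _apply_editops_reference(ops, s1, s2):
    result = []
    src_pos = 0
    for tag, spos, dpos in ops:
        # copy the matched part of s1 preceding this operation
        result.append(s1[src_pos:spos])
        if tag == "insert":
            result.append(s2[dpos])
            src_pos = spos
        else:  # "delete"
            src_pos = spos + 1
    result.append(s1[src_pos:])
    return "".join(result)

# _apply_editops_reference(editops("qabxcd", "abycdf"), "qabxcd", "abycdf") == "abycdf"

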
def opcodes(
    s1,
    s2,
    *,
    processor=None,
):
    """
    Return Opcodes describing how to turn s1 into s2.

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    processor : callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.

    Returns
    -------
    opcodes : Opcodes
        edit operations required to turn s1 into s2

    Notes
    -----
    The alignment is calculated using an algorithm by Heikki Hyyrö, which is
    described in [7]_. It has a time complexity and memory usage of ``O([N/64] * M)``.

    References
    ----------
    .. [7] Hyyrö, Heikki. "A Note on Bit-Parallel Alignment Computation."
           Stringology (2004).

    Examples
    --------
    >>> from rapidfuzz.distance import LCSseq

    >>> a = "qabxcd"
    >>> b = "abycdf"
    >>> for tag, i1, i2, j1, j2 in LCSseq.opcodes(a, b):
    ...     print(("%7s a[%d:%d] (%s) b[%d:%d] (%s)" %
    ...            (tag, i1, i2, a[i1:i2], j1, j2, b[j1:j2])))
     delete a[0:1] (q) b[0:0] ()
      equal a[1:3] (ab) b[0:2] (ab)
     delete a[3:4] (x) b[2:2] ()
     insert a[4:4] () b[2:3] (y)
      equal a[4:6] (cd) b[3:5] (cd)
     insert a[6:6] () b[5:6] (f)
    """
    return editops(s1, s2, processor=processor).as_opcodes()


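# --- Illustrative sketch (hypothetical helper, not part of the module) ---
# Opcodes use the (tag, i1, i2, j1, j2) convention familiar from
# difflib.SequenceMatcher.get_opcodes(), so s2 can be rebuilt from the slices
# they describe. The helper below assumes plain string inputs and is made up
# for this sketch.
def _apply_opcodes_reference(ops, s1, s2):
    parts = []
    for tag, i1, i2, j1, j2 in ops:
        if tag == "equal":
            parts.append(s1[i1:i2])
        elif tag == "insert":
            parts.append(s2[j1:j2])
        # "delete" contributes nothing to the result
    return "".join(parts)

# _apply_opcodes_reference(opcodes("qabxcd", "abycdf"), "qabxcd", "abycdf") == "abycdf"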