Initial commit
This commit is contained in:
358
venv/lib/python3.10/site-packages/rapidfuzz/distance/Indel_py.py
Normal file
358
venv/lib/python3.10/site-packages/rapidfuzz/distance/Indel_py.py
Normal file
@@ -0,0 +1,358 @@
|
||||
# SPDX-License-Identifier: MIT
|
||||
# Copyright (C) 2022 Max Bachmann
|
||||
from __future__ import annotations
|
||||
|
||||
from rapidfuzz._common_py import conv_sequences
|
||||
from rapidfuzz._utils import is_none, setupPandas
|
||||
from rapidfuzz.distance.LCSseq_py import (
|
||||
_block_similarity as lcs_seq_block_similarity,
|
||||
editops as lcs_seq_editops,
|
||||
opcodes as lcs_seq_opcodes,
|
||||
similarity as lcs_seq_similarity,
|
||||
)
|
||||
|
||||
|
||||
def distance(
    s1,
    s2,
    *,
    processor=None,
    score_cutoff=None,
):
    """
    Compute the Indel distance between two sequences.

    The Indel distance is the minimum number of insertions and deletions
    required to change one sequence into the other. This is equivalent to
    the Levenshtein distance with a substitution weight of 2. It is derived
    from the LCSseq similarity as ``(len1 + len2) - 2 * lcs_similarity``.

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    processor: callable, optional
        Optional callable that is applied to both inputs before they are
        compared. Default is None, which deactivates this behaviour.
    score_cutoff : int, optional
        Maximum distance between s1 and s2 that is considered as a result.
        If the distance is bigger than score_cutoff, score_cutoff + 1 is
        returned instead. Default is None, which deactivates this behaviour.

    Returns
    -------
    distance : int
        distance between s1 and s2

    Examples
    --------
    Find the Indel distance between two strings:

    >>> from rapidfuzz.distance import Indel
    >>> Indel.distance("lewenstein", "levenshtein")
    3

    A distance above ``score_cutoff`` is reported as ``score_cutoff + 1``:

    >>> Indel.distance("lewenstein", "levenshtein", score_cutoff=1)
    2

    """
    if processor is not None:
        s1, s2 = processor(s1), processor(s2)

    s1, s2 = conv_sequences(s1, s2)
    combined_len = len(s1) + len(s2)
    common = lcs_seq_similarity(s1, s2)
    # Every element outside the common subsequence costs one edit
    # (a deletion from s1 or an insertion from s2).
    indel = combined_len - 2 * common

    if score_cutoff is None or indel <= score_cutoff:
        return indel
    return score_cutoff + 1
|
||||
|
||||
|
||||
def _block_distance(
    block,
    s1,
    s2,
    score_cutoff=None,
):
    """
    Compute the Indel distance using a precomputed ``block``.

    Same arithmetic as ``distance`` -- ``(len1 + len2) - 2 * lcs_similarity`` --
    but the similarity is obtained from the LCSseq block similarity, and no
    processor or sequence conversion is applied to ``s1`` / ``s2`` here.

    Parameters
    ----------
    block
        Forwarded unchanged to the LCSseq block similarity.
        NOTE(review): presumably a pattern lookup precomputed for s1 --
        confirm against LCSseq_py.
    s1 : Sequence[Hashable]
        First sequence to compare.
    s2 : Sequence[Hashable]
        Second sequence to compare.
    score_cutoff : int, optional
        Maximum distance that is considered as a result. If the distance is
        bigger than score_cutoff, score_cutoff + 1 is returned instead.
        Default is None, which deactivates this behaviour.

    Returns
    -------
    distance : int
        distance between s1 and s2
    """
    combined_len = len(s1) + len(s2)
    common = lcs_seq_block_similarity(block, s1, s2)
    indel = combined_len - 2 * common

    if score_cutoff is None or indel <= score_cutoff:
        return indel
    return score_cutoff + 1
|
||||
|
||||
|
||||
def similarity(
    s1,
    s2,
    *,
    processor=None,
    score_cutoff=None,
):
    """
    Compute the Indel similarity in the range [0, max].

    This is calculated as ``(len1 + len2) - distance``, so ``max`` is the
    combined length of both sequences.

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    processor: callable, optional
        Optional callable that is applied to both inputs before they are
        compared. Default is None, which deactivates this behaviour.
    score_cutoff : int, optional
        Minimum similarity between s1 and s2 that is considered as a
        result. If the similarity is smaller than score_cutoff, 0 is
        returned instead. Default is None, which deactivates this behaviour.

    Returns
    -------
    similarity : int
        similarity between s1 and s2
    """
    if processor is not None:
        s1, s2 = processor(s1), processor(s2)

    s1, s2 = conv_sequences(s1, s2)
    combined_len = len(s1) + len(s2)
    # The processor was already applied above, so it is not forwarded.
    score = combined_len - distance(s1, s2)

    if score_cutoff is None or score >= score_cutoff:
        return score
    return 0
|
||||
|
||||
|
||||
def normalized_distance(
    s1,
    s2,
    *,
    processor=None,
    score_cutoff=None,
):
    """
    Calculates a normalized Indel distance in the range [0, 1].

    This is calculated as ``distance / (len1 + len2)``.

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    processor: callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_cutoff : float, optional
        Optional argument for a score threshold as a float between 0 and 1.0.
        For norm_dist > score_cutoff 1.0 is returned instead. Default is
        None, which deactivates this behaviour.

    Returns
    -------
    norm_dist : float
        normalized distance between s1 and s2 as a float between 0 and 1.0.
        1.0 is returned when ``is_none`` reports either input as missing,
        and 0.0 is returned when both sequences are empty.
    """
    # NOTE(review): setupPandas() runs before the is_none checks; presumably
    # it enables recognising pandas missing values -- confirm in _utils.
    setupPandas()
    if is_none(s1) or is_none(s2):
        return 1.0

    if processor is not None:
        s1 = processor(s1)
        s2 = processor(s2)

    s1, s2 = conv_sequences(s1, s2)
    maximum = len(s1) + len(s2)
    # The processor was already applied above, so it is not forwarded.
    dist = distance(s1, s2)
    # Guard against division by zero when both sequences are empty. Use a
    # float literal so every return path yields a float, as documented.
    norm_dist = dist / maximum if maximum else 0.0
    return norm_dist if (score_cutoff is None or norm_dist <= score_cutoff) else 1.0
|
||||
|
||||
|
||||
def _block_normalized_distance(
    block,
    s1,
    s2,
    score_cutoff=None,
):
    """
    Compute the normalized Indel distance using a precomputed ``block``.

    This is calculated as ``_block_distance / (len1 + len2)``. No processor,
    missing-value check or sequence conversion is applied here.

    Parameters
    ----------
    block
        Forwarded unchanged to ``_block_distance``.
        NOTE(review): presumably a pattern lookup precomputed for s1 --
        confirm against LCSseq_py.
    s1 : Sequence[Hashable]
        First sequence to compare.
    s2 : Sequence[Hashable]
        Second sequence to compare.
    score_cutoff : float, optional
        Score threshold. For norm_dist > score_cutoff 1.0 is returned
        instead. Default is None, which deactivates this behaviour.

    Returns
    -------
    norm_dist : float
        normalized distance between s1 and s2; 0.0 when both are empty.
    """
    maximum = len(s1) + len(s2)
    dist = _block_distance(block, s1, s2)
    # Guard against division by zero for two empty sequences. Float literals
    # keep the return type consistent across all paths.
    norm_dist = dist / maximum if maximum else 0.0
    return norm_dist if (score_cutoff is None or norm_dist <= score_cutoff) else 1.0
|
||||
|
||||
|
||||
def normalized_similarity(
    s1,
    s2,
    *,
    processor=None,
    score_cutoff=None,
):
    """
    Calculates a normalized indel similarity in the range [0, 1].

    This is calculated as ``1 - normalized_distance``

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    processor: callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_cutoff : float, optional
        Optional argument for a score threshold as a float between 0 and 1.0.
        For norm_sim < score_cutoff 0.0 is returned instead. Default is
        None, which deactivates this behaviour.

    Returns
    -------
    norm_sim : float
        normalized similarity between s1 and s2 as a float between 0 and 1.0.
        0.0 is returned when ``is_none`` reports either input as missing.

    Examples
    --------
    Find the normalized Indel similarity between two strings:

    >>> from rapidfuzz.distance import Indel
    >>> Indel.normalized_similarity("lewenstein", "levenshtein")
    0.8571428571428572

    A similarity below ``score_cutoff`` is reported as 0.0:

    >>> Indel.normalized_similarity("lewenstein", "levenshtein", score_cutoff=0.9)
    0.0

    When a different processor is used s1 and s2 do not have to be strings

    >>> Indel.normalized_similarity(["lewenstein"], ["levenshtein"], processor=lambda s: s[0])
    0.8571428571428572
    """
    # NOTE(review): setupPandas() runs before the is_none checks; presumably
    # it enables recognising pandas missing values -- confirm in _utils.
    setupPandas()
    if is_none(s1) or is_none(s2):
        return 0.0

    if processor is not None:
        s1 = processor(s1)
        s2 = processor(s2)

    s1, s2 = conv_sequences(s1, s2)
    # The processor was already applied above, so it is not forwarded.
    norm_dist = normalized_distance(s1, s2)
    norm_sim = 1.0 - norm_dist
    # Return a float on the cutoff path too, matching the documented return
    # type and the doctest output above.
    return norm_sim if (score_cutoff is None or norm_sim >= score_cutoff) else 0.0
|
||||
|
||||
|
||||
def _block_normalized_similarity(
    block,
    s1,
    s2,
    score_cutoff=None,
):
    """
    Compute the normalized Indel similarity using a precomputed ``block``.

    This is calculated as ``1 - _block_normalized_distance``.

    Parameters
    ----------
    block
        Forwarded unchanged to ``_block_normalized_distance``.
        NOTE(review): presumably a pattern lookup precomputed for s1 --
        confirm against LCSseq_py.
    s1 : Sequence[Hashable]
        First sequence to compare.
    s2 : Sequence[Hashable]
        Second sequence to compare.
    score_cutoff : float, optional
        Score threshold. For norm_sim < score_cutoff 0.0 is returned
        instead. Default is None, which deactivates this behaviour.

    Returns
    -------
    norm_sim : float
        normalized similarity between s1 and s2
    """
    norm_dist = _block_normalized_distance(block, s1, s2)
    norm_sim = 1.0 - norm_dist
    # Float literal keeps the return type consistent with the non-cutoff path.
    return norm_sim if (score_cutoff is None or norm_sim >= score_cutoff) else 0.0
|
||||
|
||||
|
||||
def editops(
    s1,
    s2,
    *,
    processor=None,
):
    """
    Return Editops describing how to turn s1 into s2.

    The computation is delegated to the LCSseq ``editops`` implementation.

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    processor: callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.

    Returns
    -------
    editops : Editops
        edit operations required to turn s1 into s2

    Notes
    -----
    The alignment is calculated using an algorithm of Heikki Hyyrö, which is
    described [6]_. It has a time complexity and memory usage of ``O([N/64] * M)``.

    References
    ----------
    .. [6] Hyyrö, Heikki. "A Note on Bit-Parallel Alignment Computation."
           Stringology (2004).

    Examples
    --------
    >>> from rapidfuzz.distance import Indel
    >>> for tag, src_pos, dest_pos in Indel.editops("qabxcd", "abycdf"):
    ...    print(("%7s s1[%d] s2[%d]" % (tag, src_pos, dest_pos)))
     delete s1[0] s2[0]
     delete s1[3] s2[2]
     insert s1[4] s2[2]
     insert s1[6] s2[5]
    """
    operations = lcs_seq_editops(s1, s2, processor=processor)
    return operations
|
||||
|
||||
|
||||
def opcodes(
    s1,
    s2,
    *,
    processor=None,
):
    """
    Return Opcodes describing how to turn s1 into s2.

    The computation is delegated to the LCSseq ``opcodes`` implementation.

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    processor: callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.

    Returns
    -------
    opcodes : Opcodes
        edit operations required to turn s1 into s2

    Notes
    -----
    The alignment is calculated using an algorithm of Heikki Hyyrö, which is
    described [7]_. It has a time complexity and memory usage of ``O([N/64] * M)``.

    References
    ----------
    .. [7] Hyyrö, Heikki. "A Note on Bit-Parallel Alignment Computation."
           Stringology (2004).

    Examples
    --------
    >>> from rapidfuzz.distance import Indel

    >>> a = "qabxcd"
    >>> b = "abycdf"
    >>> for tag, i1, i2, j1, j2 in Indel.opcodes(a, b):
    ...    print(("%7s a[%d:%d] (%s) b[%d:%d] (%s)" %
    ...           (tag, i1, i2, a[i1:i2], j1, j2, b[j1:j2])))
     delete a[0:1] (q) b[0:0] ()
      equal a[1:3] (ab) b[0:2] (ab)
     delete a[3:4] (x) b[2:2] ()
     insert a[4:4] () b[2:3] (y)
      equal a[4:6] (cd) b[3:5] (cd)
     insert a[6:6] () b[5:6] (f)
    """
    operations = lcs_seq_opcodes(s1, s2, processor=processor)
    return operations
|
||||
Reference in New Issue
Block a user