Initial commit

venv/lib/python3.10/site-packages/rapidfuzz/distance/Jaro_py.py (new file, 255 lines added)
@@ -0,0 +1,255 @@
# SPDX-License-Identifier: MIT
# Copyright (C) 2022 Max Bachmann
from __future__ import annotations

from rapidfuzz._common_py import conv_sequences
from rapidfuzz._utils import is_none, setupPandas
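
# Editor's note (not part of the upstream module): judging by the _py suffix,
# this appears to be rapidfuzz's pure-Python implementation of the Jaro
# similarity, presumably used as a fallback when the compiled backend is not
# available.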


def _jaro_calculate_similarity(pattern_len, text_len, common_chars, transpositions):
    transpositions //= 2
    sim = 0.0
    sim += common_chars / pattern_len
    sim += common_chars / text_len
    sim += (common_chars - transpositions) / common_chars
    return sim / 3.0
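
# Editor's note (not part of the upstream module): the helper above evaluates the
# standard Jaro formula (m/len(P) + m/len(T) + (m - t)/m) / 3, where m is the
# number of matching characters and t is half the count of matched characters
# that appear out of order (the function receives the raw count and halves it).
# For "MARTHA"/"MARHTA" this is m = 6, t = 1, giving (6/6 + 6/6 + 5/6) / 3
# = 17/18 ≈ 0.944.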


def _jaro_length_filter(pattern_len, text_len, score_cutoff):
    """
    Return True if the best possible Jaro similarity for these string lengths
    can still reach score_cutoff.
    """
    if not pattern_len or not text_len:
        return False

    sim = _jaro_calculate_similarity(pattern_len, text_len, min(pattern_len, text_len), 0)
    return sim >= score_cutoff
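
# Editor's note (not part of the upstream module): the filter above assumes the
# best case of min(pattern_len, text_len) matching characters and no
# transpositions. For lengths 2 and 10 that bound is (2/2 + 2/10 + 2/2) / 3
# ≈ 0.733, so any higher score_cutoff lets the caller skip the matching loop.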


def _jaro_common_char_filter(pattern_len, text_len, common_chars, score_cutoff):
    """
    Return True if the Jaro similarity can still reach score_cutoff given the
    string lengths and the number of common characters.
    """
    if not common_chars:
        return False

    sim = _jaro_calculate_similarity(pattern_len, text_len, common_chars, 0)
    return sim >= score_cutoff
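
# Editor's note (not part of the upstream module): once the matches are counted,
# assuming zero transpositions gives an upper bound on the final score. With 3
# common characters between two strings of length 5 the score is capped at
# (3/5 + 3/5 + 3/3) / 3 ≈ 0.733.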


def _jaro_bounds(s1, s2):
    """
    Find the match window bound and trim the parts of the sequences that can
    never fall inside it.
    """
    pattern_len = len(s1)
    text_len = len(s2)

    # since Jaro uses a sliding window some parts of T/P might never be in
    # range and can be removed ahead of time
    bound = 0
    if text_len > pattern_len:
        bound = text_len // 2 - 1
        if text_len > pattern_len + bound:
            s2 = s2[: pattern_len + bound]
    else:
        bound = pattern_len // 2 - 1
        if pattern_len > text_len + bound:
            s1 = s1[: text_len + bound]
    return s1, s2, bound
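
# Editor's note (not part of the upstream module): for example, with
# pattern_len = 4 and text_len = 10 the bound is 10 // 2 - 1 = 4, and s2 is
# trimmed to its first 4 + 4 = 8 characters because positions beyond i + bound
# can never be matched.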


def similarity(
    s1,
    s2,
    *,
    processor=None,
    score_cutoff=None,
):
    """
    Calculates the Jaro similarity

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    processor : callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_cutoff : float, optional
        Optional argument for a score threshold as a float between 0 and 1.0.
        If the similarity is below score_cutoff, 0 is returned instead.
        Default is None, which deactivates this behaviour.

    Returns
    -------
    similarity : float
        similarity between s1 and s2 as a float between 0 and 1.0
    """
    setupPandas()
    if is_none(s1) or is_none(s2):
        return 0.0

    if processor is not None:
        s1 = processor(s1)
        s2 = processor(s2)

    if not s1 and not s2:
        return 1.0

    if score_cutoff is None:
        score_cutoff = 0

    s1, s2 = conv_sequences(s1, s2)
    pattern_len = len(s1)
    text_len = len(s2)

    # short circuit if score_cutoff can not be reached
    if not _jaro_length_filter(pattern_len, text_len, score_cutoff):
        return 0

    if pattern_len == 1 and text_len == 1:
        return float(s1[0] == s2[0])

    s1, s2, bound = _jaro_bounds(s1, s2)

    s1_flags = [False] * pattern_len
    s2_flags = [False] * text_len

    # todo use bitparallel implementation
    # looking only within search range, count & flag matched pairs
    common_chars = 0
    for i, s1_ch in enumerate(s1):
        low = max(0, i - bound)
        hi = min(i + bound, text_len - 1)
        for j in range(low, hi + 1):
            if not s2_flags[j] and s2[j] == s1_ch:
                s1_flags[i] = s2_flags[j] = True
                common_chars += 1
                break
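
    # Editor's note (not part of the upstream module): each character of s1 is
    # matched greedily against the first not-yet-flagged equal character of s2
    # inside the window [i - bound, i + bound]; the flag lists ensure a character
    # of s2 is used for at most one match.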

    # short circuit if score_cutoff can not be reached
    if not _jaro_common_char_filter(pattern_len, text_len, common_chars, score_cutoff):
        return 0

    # todo use bitparallel implementation
    # count transpositions
    k = trans_count = 0
    for i, s1_f in enumerate(s1_flags):
        if s1_f:
            for j in range(k, text_len):
                if s2_flags[j]:
                    k = j + 1
                    break
            if s1[i] != s2[j]:
                trans_count += 1

    return _jaro_calculate_similarity(pattern_len, text_len, common_chars, trans_count)
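
# Editor's note (not part of the upstream module): a minimal usage sketch,
# assuming the module is importable as rapidfuzz.distance.Jaro_py:
#
#     from rapidfuzz.distance import Jaro_py
#     Jaro_py.similarity("MARTHA", "MARHTA")                        # 17/18 ≈ 0.944
#     Jaro_py.similarity("MARTHA", "MARHTA", processor=str.lower)   # same result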


def normalized_similarity(
    s1,
    s2,
    *,
    processor=None,
    score_cutoff=None,
):
    """
    Calculates the normalized Jaro similarity

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    processor : callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_cutoff : float, optional
        Optional argument for a score threshold as a float between 0 and 1.0.
        If the similarity is below score_cutoff, 0 is returned instead.
        Default is None, which deactivates this behaviour.

    Returns
    -------
    normalized similarity : float
        normalized similarity between s1 and s2 as a float between 0 and 1.0
    """
    return similarity(s1, s2, processor=processor, score_cutoff=score_cutoff)
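
# Editor's note (not part of the upstream module): the Jaro similarity is already
# normalized to the range [0, 1], so normalized_similarity simply forwards to
# similarity.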


def distance(
    s1,
    s2,
    *,
    processor=None,
    score_cutoff=None,
):
    """
    Calculates the Jaro distance

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    processor : callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_cutoff : float, optional
        Optional argument for a score threshold as a float between 0 and 1.0.
        If the distance is larger than score_cutoff, 1.0 is returned instead.
        Default is None, which deactivates this behaviour.

    Returns
    -------
    distance : float
        distance between s1 and s2 as a float between 0.0 and 1.0
    """
    setupPandas()
    if is_none(s1) or is_none(s2):
        return 1.0

    if processor is not None:
        s1 = processor(s1)
        s2 = processor(s2)

    cutoff_distance = None if (score_cutoff is None or score_cutoff > 1.0) else 1.0 - score_cutoff
    sim = similarity(s1, s2, score_cutoff=cutoff_distance)
    dist = 1.0 - sim
    return dist if (score_cutoff is None or dist <= score_cutoff) else 1.0
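
# Editor's note (not part of the upstream module): distance is 1 - similarity,
# e.g. roughly 1/18 ≈ 0.056 for "MARTHA"/"MARHTA". With score_cutoff=0.05 that
# value exceeds the cutoff, so the final line above returns 1.0 instead.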


def normalized_distance(
    s1,
    s2,
    *,
    processor=None,
    score_cutoff=None,
):
    """
    Calculates the normalized Jaro distance

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    processor : callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_cutoff : float, optional
        Optional argument for a score threshold as a float between 0 and 1.0.
        If the distance is larger than score_cutoff, 1.0 is returned instead.
        Default is None, which deactivates this behaviour.

    Returns
    -------
    normalized distance : float
        normalized distance between s1 and s2 as a float between 0.0 and 1.0
    """
    return distance(s1, s2, processor=processor, score_cutoff=score_cutoff)
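
# Editor's note (not part of the upstream module): as with normalized_similarity,
# the Jaro distance already lies in [0, 1], so normalized_distance simply
# forwards to distance.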