Initial commit
This commit is contained in:
@@ -0,0 +1,235 @@
|
||||
# SPDX-License-Identifier: MIT
|
||||
# Copyright (C) 2022 Max Bachmann
|
||||
from __future__ import annotations
|
||||
|
||||
from rapidfuzz._common_py import conv_sequences
|
||||
from rapidfuzz._utils import is_none, setupPandas
|
||||
from rapidfuzz.distance import Jaro_py as Jaro
|
||||
|
||||
|
||||
def similarity(
    s1,
    s2,
    *,
    prefix_weight=0.1,
    processor=None,
    score_cutoff=None,
):
    """
    Calculates the jaro winkler similarity

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    prefix_weight : float, optional
        Weight used for the common prefix of the two strings.
        Has to be between 0.0 and 1.0. Because up to four prefix elements
        are counted, weights above 0.25 can push the boosted score past
        1.0; the result is clamped to 1.0 in that case. Default is 0.1.
    processor: callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_cutoff : float, optional
        Optional argument for a score threshold as a float between 0 and 1.0.
        For similarity < score_cutoff 0.0 is returned instead. Default is
        None, which deactivates this behaviour.

    Returns
    -------
    similarity : float
        similarity between s1 and s2 as a float between 0 and 1.0.
        0.0 is returned when ``is_none`` reports s1 or s2 as missing.

    Raises
    ------
    ValueError
        If prefix_weight is outside the range 0.0 - 1.0
    """
    setupPandas()
    if is_none(s1) or is_none(s2):
        return 0.0

    if processor is not None:
        s1 = processor(s1)
        s2 = processor(s2)

    if score_cutoff is None:
        score_cutoff = 0.0

    if prefix_weight > 1.0 or prefix_weight < 0.0:
        msg = "prefix_weight has to be in the range 0.0 - 1.0"
        raise ValueError(msg)

    s1, s2 = conv_sequences(s1, s2)

    # Length of the common prefix, capped at four elements.
    max_prefix = min(len(s1), len(s2), 4)
    prefix = 0
    for idx in range(max_prefix):
        if s1[idx] != s2[idx]:
            break
        prefix = idx + 1

    prefix_sim = prefix * prefix_weight

    # Translate the Jaro-Winkler cutoff into the weakest Jaro score that can
    # still reach it:  j + p * (1 - j) >= c  <=>  j >= (p - c) / (p - 1).
    # The prefix boost is only applied for Jaro scores above 0.7, so a cutoff
    # above 0.7 can never be met by a Jaro score of 0.7 or less.
    jaro_score_cutoff = score_cutoff
    if jaro_score_cutoff > 0.7:
        if prefix_sim >= 1.0:
            jaro_score_cutoff = 0.7
        else:
            jaro_score_cutoff = max(0.7, (prefix_sim - jaro_score_cutoff) / (prefix_sim - 1.0))

    sim = Jaro.similarity(s1, s2, score_cutoff=jaro_score_cutoff)
    if sim > 0.7:
        sim += prefix_sim * (1.0 - sim)
        # prefix_sim may exceed 1.0 for large weights, so clamp the result.
        sim = min(sim, 1.0)

    return sim if sim >= score_cutoff else 0.0
|
||||
|
||||
|
||||
def normalized_similarity(
    s1,
    s2,
    *,
    prefix_weight=0.1,
    processor=None,
    score_cutoff=None,
):
    """
    Calculates the normalized jaro winkler similarity

    This forwards all arguments unchanged to :func:`similarity` and returns
    its result.

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    prefix_weight : float, optional
        Weight used for the common prefix of the two strings.
        Has to be between 0.0 and 1.0 (validated by ``similarity``).
        Default is 0.1.
    processor: callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_cutoff : float, optional
        Optional argument for a score threshold as a float between 0 and 1.0.
        For similarity < score_cutoff 0 is returned instead. Default is None,
        which deactivates this behaviour.

    Returns
    -------
    normalized similarity : float
        normalized similarity between s1 and s2 as a float between 0 and 1.0

    Raises
    ------
    ValueError
        If prefix_weight is outside the range 0.0 - 1.0
    """
    return similarity(
        s1,
        s2,
        prefix_weight=prefix_weight,
        processor=processor,
        score_cutoff=score_cutoff,
    )
|
||||
|
||||
|
||||
def distance(
    s1,
    s2,
    *,
    prefix_weight=0.1,
    processor=None,
    score_cutoff=None,
):
    """
    Calculates the jaro winkler distance

    The distance is computed as ``1.0 - similarity``.

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    prefix_weight : float, optional
        Weight used for the common prefix of the two strings.
        Has to be between 0.0 and 1.0 (validated by ``similarity``).
        Default is 0.1.
    processor: callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_cutoff : float, optional
        Optional argument for a score threshold as a float between 0 and 1.0.
        For distance > score_cutoff 1.0 is returned instead. Default is None,
        which deactivates this behaviour.

    Returns
    -------
    distance : float
        distance between s1 and s2 as a float between 1.0 and 0.0.
        1.0 is returned when ``is_none`` reports s1 or s2 as missing.

    Raises
    ------
    ValueError
        If prefix_weight is outside the range 0.0 - 1.0
    """
    setupPandas()
    if is_none(s1) or is_none(s2):
        return 1.0

    # The processor is applied here, so it is not forwarded to similarity().
    if processor is not None:
        s1 = processor(s1)
        s2 = processor(s2)

    # A distance of at most score_cutoff corresponds to a similarity of at
    # least 1.0 - score_cutoff. Cutoffs above 1.0 can never exclude a result,
    # so no similarity cutoff is passed on for them.
    if score_cutoff is None or score_cutoff > 1.0:
        sim_cutoff = None
    else:
        sim_cutoff = 1.0 - score_cutoff

    sim = similarity(s1, s2, prefix_weight=prefix_weight, score_cutoff=sim_cutoff)
    dist = 1.0 - sim

    if score_cutoff is None or dist <= score_cutoff:
        return dist
    return 1.0
|
||||
|
||||
|
||||
def normalized_distance(
    s1,
    s2,
    *,
    prefix_weight=0.1,
    processor=None,
    score_cutoff=None,
):
    """
    Calculates the normalized jaro winkler distance

    This forwards all arguments unchanged to :func:`distance` and returns
    its result.

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    prefix_weight : float, optional
        Weight used for the common prefix of the two strings.
        Has to be between 0.0 and 1.0. Default is 0.1.
    processor: callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_cutoff : float, optional
        Optional argument for a score threshold as a float between 0 and 1.0.
        For distance > score_cutoff 1.0 is returned instead. Default is None,
        which deactivates this behaviour.

    Returns
    -------
    normalized distance : float
        normalized distance between s1 and s2 as a float between 1.0 and 0.0

    Raises
    ------
    ValueError
        If prefix_weight is outside the range 0.0 - 1.0
    """
    return distance(
        s1,
        s2,
        prefix_weight=prefix_weight,
        processor=processor,
        score_cutoff=score_cutoff,
    )
|
||||
Reference in New Issue
Block a user