Initial commit

2025-12-09 12:13:01 +01:00
commit 8e654ed209
13332 changed files with 2695056 additions and 0 deletions
--- a/venv/lib/python3.10/site-packages/rapidfuzz/distance/JaroWinkler_py.py
+++ b/venv/lib/python3.10/site-packages/rapidfuzz/distance/JaroWinkler_py.py
@@ -0,0 +1,235 @@
+# SPDX-License-Identifier: MIT
+# Copyright (C) 2022 Max Bachmann
+from __future__ import annotations
+
+from rapidfuzz._common_py import conv_sequences
+from rapidfuzz._utils import is_none, setupPandas
+from rapidfuzz.distance import Jaro_py as Jaro
+
+
+def similarity(
+    s1,
+    s2,
+    *,
+    prefix_weight=0.1,
+    processor=None,
+    score_cutoff=None,
+):
+    """
+    Calculates the jaro winkler similarity
+
+    The Jaro similarity is raised by the common prefix (up to 4 elements)
+    when it is above 0.7.
+
+    Parameters
+    ----------
+    s1 : Sequence[Hashable]
+        First string to compare.
+    s2 : Sequence[Hashable]
+        Second string to compare.
+    prefix_weight : float, optional
+        Weight used for the common prefix of the two strings.
+        Has to be between 0 and 1.0. Default is 0.1.
+    processor: callable, optional
+        Optional callable that is used to preprocess the strings before
+        comparing them. Default is None, which deactivates this behaviour.
+    score_cutoff : float, optional
+        Optional argument for a score threshold as a float between 0 and 1.0.
+        For similarity < score_cutoff 0.0 is returned instead. Default is None,
+        which deactivates this behaviour.
+
+    Returns
+    -------
+    similarity : float
+        similarity between s1 and s2 as a float between 0 and 1.0
+
+    Raises
+    ------
+    ValueError
+        If prefix_weight is outside the range 0.0 - 1.0
+    """
+    setupPandas()
+    if is_none(s1) or is_none(s2):
+        return 0.0
+
+    if processor is not None:
+        s1 = processor(s1)
+        s2 = processor(s2)
+
+    if score_cutoff is None:
+        score_cutoff = 0
+
+    if prefix_weight > 1.0 or prefix_weight < 0.0:
+        msg = "prefix_weight has to be in the range 0.0 - 1.0"
+        raise ValueError(msg)
+
+    s1, s2 = conv_sequences(s1, s2)
+    # Length of the common prefix; at most 4 elements are rewarded.
+    max_prefix = min(len(s1), len(s2), 4)
+    prefix = 0
+    while prefix < max_prefix and s1[prefix] == s2[prefix]:
+        prefix += 1
+
+    # Lowest Jaro score that can still reach score_cutoff after the prefix
+    # bonus; never below 0.7 because the bonus only applies above 0.7.
+    jaro_score_cutoff = score_cutoff
+    if jaro_score_cutoff > 0.7:
+        prefix_sim = prefix * prefix_weight
+        if prefix_sim >= 1.0:
+            jaro_score_cutoff = 0.7
+        else:
+            jaro_score_cutoff = max(0.7, (prefix_sim - jaro_score_cutoff) / (prefix_sim - 1.0))
+
+    Sim = Jaro.similarity(s1, s2, score_cutoff=jaro_score_cutoff)
+    if Sim > 0.7:
+        Sim += prefix * prefix_weight * (1.0 - Sim)
+        Sim = min(Sim, 1.0)
+
+    # Cutoff miss: return the float 0.0 (not the int 0) as documented.
+    return Sim if Sim >= score_cutoff else 0.0
+
+
+def normalized_similarity(
+    s1,
+    s2,
+    *,
+    prefix_weight=0.1,
+    processor=None,
+    score_cutoff=None,
+):
+    """
+    Normalized Jaro-Winkler similarity; delegates directly to similarity().
+
+    Parameters
+    ----------
+    s1 : Sequence[Hashable]
+        First of the two sequences to compare.
+    s2 : Sequence[Hashable]
+        Second of the two sequences to compare.
+    prefix_weight : float, optional
+        Weight given to the common prefix of both sequences.
+        Accepted range is 0 to 1.0. Default is 0.1.
+    processor: callable, optional
+        Callable applied to both inputs before the comparison.
+        Default is None, which skips preprocessing.
+    score_cutoff : float, optional
+        Score threshold as a float between 0 and 1.0. A similarity below
+        score_cutoff is reported as 0 instead. Default is None, which
+        disables the threshold.
+
+    Returns
+    -------
+    normalized similarity : float
+        normalized similarity between s1 and s2, in the range 0 to 1.0
+
+    Raises
+    ------
+    ValueError
+        If prefix_weight lies outside the range 0.0 - 1.0
+    """
+    forwarded = {
+        "prefix_weight": prefix_weight,
+        "processor": processor,
+        "score_cutoff": score_cutoff,
+    }
+    # No rescaling is applied: the result of similarity() is returned as is.
+    return similarity(s1, s2, **forwarded)
+
+
+def distance(
+    s1,
+    s2,
+    *,
+    prefix_weight=0.1,
+    processor=None,
+    score_cutoff=None,
+):
+    """
+    Jaro-Winkler distance, computed as 1.0 minus the similarity.
+
+    Parameters
+    ----------
+    s1 : Sequence[Hashable]
+        First of the two sequences to compare.
+    s2 : Sequence[Hashable]
+        Second of the two sequences to compare.
+    prefix_weight : float, optional
+        Weight given to the common prefix of both sequences.
+        Accepted range is 0 to 1.0. Default is 0.1.
+    processor: callable, optional
+        Callable applied to both inputs before the comparison.
+        Default is None, which skips preprocessing.
+    score_cutoff : float, optional
+        Distance threshold as a float between 0 and 1.0. A distance above
+        score_cutoff is reported as 1.0 instead. Default is None, which
+        disables the threshold.
+
+    Returns
+    -------
+    distance : float
+        distance between s1 and s2, in the range 0.0 to 1.0
+
+    Raises
+    ------
+    ValueError
+        If prefix_weight lies outside the range 0.0 - 1.0
+    """
+    setupPandas()
+    if is_none(s1) or is_none(s2):
+        return 1.0
+    if processor is not None:
+        s1, s2 = processor(s1), processor(s2)
+
+    # The distance cutoff is handed to similarity() as 1.0 - score_cutoff.
+    no_sim_cutoff = score_cutoff is None or score_cutoff > 1.0
+    min_sim = None if no_sim_cutoff else 1.0 - score_cutoff
+    result = 1.0 - similarity(s1, s2, prefix_weight=prefix_weight, score_cutoff=min_sim)
+    within = score_cutoff is None or result <= score_cutoff
+    return result if within else 1.0
+
+
+def normalized_distance(
+    s1,
+    s2,
+    *,
+    prefix_weight=0.1,
+    processor=None,
+    score_cutoff=None,
+):
+    """
+    Normalized Jaro-Winkler distance; delegates directly to distance().
+
+    Parameters
+    ----------
+    s1 : Sequence[Hashable]
+        First of the two sequences to compare.
+    s2 : Sequence[Hashable]
+        Second of the two sequences to compare.
+    prefix_weight : float, optional
+        Weight given to the common prefix of both sequences.
+        Accepted range is 0 to 1.0. Default is 0.1.
+    processor: callable, optional
+        Callable applied to both inputs before the comparison.
+        Default is None, which skips preprocessing.
+    score_cutoff : float, optional
+        Distance threshold as a float between 0 and 1.0. A distance above
+        score_cutoff is reported as 1.0 instead. Default is None, which
+        disables the threshold.
+
+    Returns
+    -------
+    normalized distance : float
+        normalized distance between s1 and s2, in the range 0.0 to 1.0
+
+    Raises
+    ------
+    ValueError
+        If prefix_weight lies outside the range 0.0 - 1.0
+    """
+    forwarded = {
+        "prefix_weight": prefix_weight,
+        "processor": processor,
+        "score_cutoff": score_cutoff,
+    }
+    # No rescaling is applied: the result of distance() is returned as is.
+    return distance(s1, s2, **forwarded)