Initial commit

2025-12-09 12:13:01 +01:00
commit 8e654ed209
13332 changed files with 2695056 additions and 0 deletions
--- a/venv/lib/python3.10/site-packages/textdistance/algorithms/sequence_based.py
+++ b/venv/lib/python3.10/site-packages/textdistance/algorithms/sequence_based.py
@@ -0,0 +1,186 @@
+from __future__ import annotations
+
+# built-in
+from difflib import SequenceMatcher as _SequenceMatcher
+from typing import Any
+
+# app
+from ..utils import find_ngrams
+from .base import BaseSimilarity as _BaseSimilarity
+from .types import TestFunc
+
+
+try:
+    # external
+    import numpy
+except ImportError:
+    # built-in
+    from array import array
+    numpy = None  # type: ignore[assignment]
+
+
+__all__ = [
+    'lcsseq', 'lcsstr', 'ratcliff_obershelp',  # instances created at the bottom of this module
+    'LCSSeq', 'LCSStr', 'RatcliffObershelp',  # the classes those instances are built from
+]
+
+
+class LCSSeq(_BaseSimilarity):
+    """Longest common subsequence similarity.
+
+    Calling an instance returns the common subsequence itself;
+    ``similarity`` returns its length.
+
+    https://en.wikipedia.org/wiki/Longest_common_subsequence_problem
+    """
+
+    def __init__(
+        self,
+        qval: int = 1,
+        test_func: TestFunc = None,
+        external: bool = True,
+    ) -> None:
+        """Store the configuration on the instance.
+
+        qval and external are only stored here; nothing in this class reads
+        them directly (presumably the base class does -- confirm there).
+        test_func decides whether elements match in ``_recursive``; when it
+        is falsy, ``self._ident`` is used instead.
+        """
+        self.qval = qval
+        self.test_func = test_func or self._ident
+        self.external = external
+
+    def _dynamic(self, seq1: str, seq2: str) -> str:
+        """Return the longest common subsequence of exactly two sequences.
+
+        Fills a (len(seq1) + 1) x (len(seq2) + 1) table of subsequence
+        lengths, then walks it backwards to recover the subsequence.
+
+        NOTE(review): elements are compared with ``==`` here;
+        ``self.test_func`` is only consulted by ``_recursive``.
+
+        https://github.com/chrislit/abydos/blob/master/abydos/distance/_lcsseq.py
+        http://www.dis.uniroma1.it/~bonifaci/algo/LCSSEQ.py
+        http://rosettacode.org/wiki/Longest_common_subsequence#Dynamic_Programming_8
+        """
+        lengths: Any
+        if numpy:
+            lengths = numpy.zeros((len(seq1) + 1, len(seq2) + 1), dtype=int)
+        else:
+            # numpy is not installed: fall back to a list of unsigned-long arrays
+            lengths = [array('L', [0] * (len(seq2) + 1)) for _ in range(len(seq1) + 1)]
+
+        # lengths[i + 1][j + 1] holds the LCS length of seq1[:i + 1] and seq2[:j + 1];
+        # row 0 and column 0 are initialized to 0 already
+        for i, char1 in enumerate(seq1):
+            for j, char2 in enumerate(seq2):
+                if char1 == char2:
+                    lengths[i + 1][j + 1] = lengths[i][j] + 1
+                else:
+                    lengths[i + 1][j + 1] = max(lengths[i + 1][j], lengths[i][j + 1])
+
+        # read the subsequence out from the matrix, starting at the bottom-right
+        # corner; matched elements are met last-to-first, so they are collected
+        # in a list and reversed once instead of being prepended to a string
+        matched = []
+        i, j = len(seq1), len(seq2)
+        while i != 0 and j != 0:
+            if lengths[i][j] == lengths[i - 1][j]:
+                i -= 1
+            elif lengths[i][j] == lengths[i][j - 1]:
+                j -= 1
+            else:
+                # the length grew diagonally, so both elements must be equal
+                assert seq1[i - 1] == seq2[j - 1]
+                matched.append(seq1[i - 1])
+                i -= 1
+                j -= 1
+        matched.reverse()
+        return ''.join(matched)
+
+    def _recursive(self, *sequences: str) -> str:
+        """Return the longest common subsequence of any number of sequences.
+
+        If any sequence is empty, an empty instance of the first sequence's
+        type is returned. If the last elements of all sequences satisfy
+        ``self.test_func``, that element ends the result and the search
+        continues on every sequence without its last element. Otherwise each
+        sequence in turn is shortened by one element and the longest result
+        is kept.
+
+        The recursion goes through ``self(...)``, so sub-problems with exactly
+        two sequences are solved by ``_dynamic``.
+        """
+        if not all(sequences):
+            return type(sequences[0])()  # empty sequence
+        if self.test_func(*[s[-1] for s in sequences]):
+            c = sequences[0][-1]
+            sequences = tuple(s[:-1] for s in sequences)
+            return self(*sequences) + c
+        m = type(sequences[0])()  # empty sequence
+        for i, s in enumerate(sequences):
+            # drop the last element of the i-th sequence only
+            ss = sequences[:i] + (s[:-1], ) + sequences[i + 1:]
+            m = max([self(*ss), m], key=len)
+        return m
+
+    def __call__(self, *sequences: str) -> str:
+        """Return the longest common subsequence of the given sequences.
+
+        Returns '' when called without arguments. The sequences are passed
+        through ``self._get_sequences`` first; exactly two sequences use the
+        dynamic-programming implementation, any other count the recursive one.
+        """
+        if not sequences:
+            return ''
+        sequences = self._get_sequences(*sequences)
+        if len(sequences) == 2:
+            return self._dynamic(*sequences)
+        else:
+            return self._recursive(*sequences)
+
+    def similarity(self, *sequences: str) -> int:
+        """Return the length of the longest common subsequence."""
+        return len(self(*sequences))
+
+
+class LCSStr(_BaseSimilarity):
+    """Longest common substring similarity.
+
+    Calling an instance returns the shared substring itself;
+    ``similarity`` returns its length.
+    """
+
+    def _standart(self, s1: str, s2: str) -> str:
+        """Two-sequence case, delegated to difflib's ``SequenceMatcher``."""
+        found = _SequenceMatcher(a=s1, b=s2).find_longest_match(0, len(s1), 0, len(s2))
+        start = found.a
+        return s1[start: start + found.size]
+
+    def _custom(self, *sequences: str) -> str:
+        """Brute-force search that works for any number of sequences.
+
+        Tries every n-gram of the shortest sequence, longest n first, and
+        returns the first one that is contained in every sequence.
+        """
+        shortest = min(sequences, key=len)
+        for size in reversed(range(1, len(shortest) + 1)):
+            for ngram in find_ngrams(shortest, size):
+                candidate = ''.join(ngram)
+                if all(candidate in seq for seq in sequences):
+                    return candidate
+        return type(shortest)()  # empty sequence
+
+    def __call__(self, *sequences: str) -> str:
+        """Return the longest substring shared by all given sequences.
+
+        Returns '' when there are no sequences or any of them is empty, and
+        the sequence itself when only one is given.
+        """
+        count = len(sequences)
+        if count == 0 or not all(sequences):
+            return ''
+        if count == 1:
+            return sequences[0]
+
+        sequences = self._get_sequences(*sequences)
+        # NOTE(review): 200 is also the length at which difflib's autojunk
+        # heuristic becomes active -- presumably the reason for this cut-off.
+        use_difflib = count == 2 and max(len(seq) for seq in sequences) < 200
+        handler = self._standart if use_difflib else self._custom
+        return handler(*sequences)
+
+    def similarity(self, *sequences: str) -> int:
+        """Return the length of the longest common substring."""
+        common = self(*sequences)
+        return len(common)
+
+
+class RatcliffObershelp(_BaseSimilarity):
+    """Ratcliff-Obershelp similarity
+    This follows the Ratcliff-Obershelp algorithm to derive a similarity
+    measure:
+        1. Find the length of the longest common substring in sequences.
+        2. Recurse on the strings to the left & right of this substring
+           in sequences. The base case is a 0 length common substring, in which
+           case, return 0. Otherwise, return the sum of the current longest
+           common substring and the left & right recursed sums.
+        3. Multiply this length by the number of sequences (2 for a pair)
+           and divide by the sum of the lengths of sequences.
+
+    https://en.wikipedia.org/wiki/Gestalt_Pattern_Matching
+    https://github.com/Yomguithereal/talisman/blob/master/src/metrics/ratcliff-obershelp.js
+    https://xlinux.nist.gov/dads/HTML/ratcliffObershelp.html
+    """
+
+    def maximum(self, *sequences: str) -> int:
+        """Return the upper bound of the similarity, which is always 1."""
+        return 1
+
+    def _find(self, *sequences: str) -> int:
+        """Return the total length of the recursively matched substrings.
+
+        Takes the longest common substring of the sequences, then repeats the
+        search on the parts before and after its first occurrence in each
+        sequence. Returns 0 when there is no common substring.
+        """
+        subseq = LCSStr()(*sequences)
+        length = len(subseq)
+        if length == 0:
+            return 0
+        # locate the first occurrence once per sequence and reuse it for both cuts
+        starts = [s.find(subseq) for s in sequences]
+        before = [s[:start] for s, start in zip(sequences, starts)]
+        after = [s[start + length:] for s, start in zip(sequences, starts)]
+        return self._find(*before) + length + self._find(*after)
+
+    def __call__(self, *sequences: str) -> float:
+        """Return the Ratcliff-Obershelp similarity of the sequences.
+
+        A non-None result from ``self.quick_answer`` is returned as is.
+        Otherwise the matched length from ``_find`` is multiplied by the
+        number of sequences and divided by their combined length.
+        """
+        result = self.quick_answer(*sequences)
+        if result is not None:
+            return result
+        # both counts are taken from the arguments as passed in,
+        # before ``_get_sequences`` is applied
+        scount = len(sequences)  # sequences count
+        ecount = sum(map(len, sequences))  # elements count
+        sequences = self._get_sequences(*sequences)
+        return scount * self._find(*sequences) / ecount
+
+
+lcsseq = LCSSeq()  # module-level instances, built with default arguments and exported via __all__
+lcsstr = LCSStr()
+ratcliff_obershelp = RatcliffObershelp()