Initial commit
This commit is contained in:
@@ -0,0 +1,191 @@
|
||||
from __future__ import annotations
|
||||
|
||||
# built-in
|
||||
from collections import Counter
|
||||
from contextlib import suppress
|
||||
from typing import Sequence, TypeVar
|
||||
|
||||
# app
|
||||
from ..libraries import prototype
|
||||
from ..utils import find_ngrams
|
||||
|
||||
|
||||
# Registry handle that Base.external_answer queries (via get_libs) for
# external implementations; obtained by cloning the imported prototype.
libraries = prototype.clone()
|
||||
# Runs once at import time. optimize() is defined in ..libraries and is not
# visible here -- presumably it tunes the lookup order; TODO confirm.
libraries.optimize()
|
||||
# Generic element type used by the Counter helper signatures in Base.
T = TypeVar('T')
|
||||
|
||||
|
||||
class Base:
    """Foundation for distance-style sequence comparison algorithms.

    ``__call__`` only raises ``NotImplementedError`` here; it is the hook
    that yields the raw distance. The other public methods derive distance,
    similarity and their normalized forms from that result and ``maximum``.
    """

    def __init__(self, qval: int = 1, external: bool = True) -> None:
        """Store the comparison settings.

        qval: how ``_get_sequences`` splits the input (falsy: by words,
            1: no splitting, greater than 1: q-grams).
        external: whether ``external_answer`` may use external libraries.
        """
        self.qval = qval
        self.external = external

    def __call__(self, *sequences: Sequence[object]) -> float:
        """Compute the raw result; not implemented in the base class.
        """
        raise NotImplementedError

    @staticmethod
    def maximum(*sequences: Sequence[object]) -> float:
        """Get maximum possible value

        This is the length of the longest sequence, or 0 when no sequences
        are given.
        """
        # default=0 keeps the zero-argument call from raising ValueError,
        # so quick answers and normalized_* also work without any input.
        return max(map(len, sequences), default=0)

    def distance(self, *sequences: Sequence[object]) -> float:
        """Get distance between sequences
        """
        return self(*sequences)

    def similarity(self, *sequences: Sequence[object]) -> float:
        """Get sequences similarity.

        similarity = maximum - distance
        """
        return self.maximum(*sequences) - self.distance(*sequences)

    def normalized_distance(self, *sequences: Sequence[object]) -> float:
        """Get distance from 0 to 1

        Returns 0 when the maximum is 0, which avoids a division by zero.
        """
        maximum = self.maximum(*sequences)
        if maximum == 0:
            return 0
        return self.distance(*sequences) / maximum

    def normalized_similarity(self, *sequences: Sequence[object]) -> float:
        """Get similarity from 0 to 1

        normalized_similarity = 1 - normalized_distance
        """
        return 1 - self.normalized_distance(*sequences)

    def external_answer(self, *sequences: Sequence[object]) -> float | None:
        """Try to get answer from known external libraries.

        Returns None when the feature is disabled, when a custom
        ``test_func`` is set, or when no library produced a result.
        """
        # feature switched off (or the attribute was never set)
        if not getattr(self, 'external', False):
            return None
        # all external libs don't support test_func
        if getattr(self, 'test_func', self._ident) is not self._ident:
            return None
        # libraries are looked up by the algorithm's class name
        for lib in libraries.get_libs(self.__class__.__name__):
            # library conditions are not satisfied for this call
            if not lib.check_conditions(self, *sequences):
                continue
            func = lib.get_function()
            # no function available (library is not installed yet)
            if func is None:
                continue
            prepared_sequences = lib.prepare(*sequences)
            # fail side libraries silently and try next libs
            with suppress(Exception):
                return func(*prepared_sequences)
        return None

    def quick_answer(self, *sequences: Sequence[object]) -> float | None:
        """Try to get answer quick without main implementation calling.

        If no sequences, 1 sequence or all sequences are equal then return 0.
        If any sequence is empty then return maximum.
        Finally, try to get an external answer, which may be None.
        """
        # fewer than two sequences, or nothing differs: zero distance
        if len(sequences) < 2 or self._ident(*sequences):
            return 0
        if not all(sequences):
            return self.maximum(*sequences)
        # try get answer from external libs
        return self.external_answer(*sequences)

    @staticmethod
    def _ident(*elements: object) -> bool:
        """Return True if all sequences are equal.

        Returns False when called without elements.
        """
        try:
            # for hashable elements
            return len(set(elements)) == 1
        except TypeError:
            # for unhashable elements: compare each neighboring pair
            return not any(
                e1 != e2 for e1, e2 in zip(elements, elements[1:])
            )

    def _get_sequences(self, *sequences: Sequence[object]) -> list:
        """Prepare sequences.

        qval=None: split text by words
        qval=1: do not split sequences. For text this means comparing by letters.
        qval>1: split sequences by q-grams
        """
        # by words (any falsy qval takes this branch)
        if not self.qval:
            return [s.split() for s in sequences]  # type: ignore[attr-defined]
        # by chars
        if self.qval == 1:
            return list(sequences)
        # by n-grams
        return [find_ngrams(s, self.qval) for s in sequences]

    def _get_counters(self, *sequences: Sequence[object]) -> list[Counter]:
        """Prepare sequences and convert them to Counters.

        Inputs that are all Counters already are returned as they are.
        """
        # already Counters
        if all(isinstance(s, Counter) for s in sequences):
            return list(sequences)  # type: ignore[arg-type]
        return [Counter(s) for s in self._get_sequences(*sequences)]

    def _intersect_counters(self, *sequences: Counter[T]) -> Counter[T]:
        """Return the ``&`` combination of all counters.

        The first counter is copied, so the inputs are not modified.
        """
        intersection = sequences[0].copy()
        for s in sequences[1:]:
            intersection &= s
        return intersection

    def _union_counters(self, *sequences: Counter[T]) -> Counter[T]:
        """Return the ``|`` combination of all counters.

        The first counter is copied, so the inputs are not modified.
        """
        union = sequences[0].copy()
        for s in sequences[1:]:
            union |= s
        return union

    def _sum_counters(self, *sequences: Counter[T]) -> Counter[T]:
        """Return the ``+`` combination of all counters.

        The first counter is copied, so the inputs are not modified.
        """
        result = sequences[0].copy()
        for s in sequences[1:]:
            result += s
        return result

    def _count_counters(self, counter: Counter) -> int:
        """Return all elements count from Counter

        With a truthy ``as_set`` attribute only distinct keys are counted;
        otherwise the counts are summed.
        """
        if getattr(self, 'as_set', False):
            return len(set(counter))
        return sum(counter.values())

    def __repr__(self) -> str:
        """Show the class name followed by the instance attributes dict.
        """
        return f'{type(self).__name__}({self.__dict__})'
|
||||
|
||||
|
||||
class BaseSimilarity(Base):
    """Variant of ``Base`` whose ``__call__`` result is a similarity.

    The raw result is returned by ``similarity``; ``distance`` is derived
    from it and ``maximum``.
    """

    def distance(self, *sequences: Sequence[object]) -> float:
        """Get distance between sequences.

        distance = maximum - similarity
        """
        upper_bound = self.maximum(*sequences)
        return upper_bound - self.similarity(*sequences)

    def similarity(self, *sequences: Sequence[object]) -> float:
        """Get sequences similarity (the raw ``__call__`` result).
        """
        return self(*sequences)

    def quick_answer(self, *sequences: Sequence[object]) -> float | None:
        """Try to get answer quick without main implementation calling.

        If no sequences, 1 sequence or all sequences are equal then return
        maximum. If any sequence is empty then return 0. Otherwise try to
        get an external answer, which may be None.
        """
        # fewer than two sequences, or nothing differs: full similarity
        if len(sequences) < 2 or self._ident(*sequences):
            return self.maximum(*sequences)
        # an empty sequence shares nothing with the others
        if any(not s for s in sequences):
            return 0
        # try get answer from external libs
        return self.external_answer(*sequences)
|
||||
Reference in New Issue
Block a user