Initial commit
venv/lib/python3.10/site-packages/rank_bm25.py (new file, 257 lines)
@@ -0,0 +1,257 @@
#!/usr/bin/env python

import math
import numpy as np
from multiprocessing import Pool, cpu_count

"""
All of these algorithms have been taken from the paper:
Trotman et al., Improvements to BM25 and Language Models Examined

Here we implement all the BM25 variations mentioned.
"""

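# A minimal usage sketch (editor's illustration, not part of the upstream file).
# The classes below expect a pre-tokenized corpus, i.e. a list of token lists:
#
#     corpus = [["hello", "there"], ["how", "are", "you"], ["hello", "world"]]
#     bm25 = BM25Okapi(corpus)
#     scores = bm25.get_scores(["hello"])            # one score per document
#     top2 = bm25.get_top_n(["hello"], corpus, n=2)  # best-matching documents
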
class BM25:
    def __init__(self, corpus, tokenizer=None):
        self.corpus_size = 0
        self.avgdl = 0
        self.doc_freqs = []
        self.idf = {}
        self.doc_len = []
        self.tokenizer = tokenizer

        if tokenizer:
            corpus = self._tokenize_corpus(corpus)

        nd = self._initialize(corpus)
        self._calc_idf(nd)

    def _initialize(self, corpus):
        nd = {}  # word -> number of documents with word
        num_doc = 0
        for document in corpus:
            self.doc_len.append(len(document))
            num_doc += len(document)

            frequencies = {}
            for word in document:
                if word not in frequencies:
                    frequencies[word] = 0
                frequencies[word] += 1
            self.doc_freqs.append(frequencies)

            for word, freq in frequencies.items():
                try:
                    nd[word] += 1
                except KeyError:
                    nd[word] = 1

            self.corpus_size += 1

        self.avgdl = num_doc / self.corpus_size
        return nd

    def _tokenize_corpus(self, corpus):
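        # Note (editor's addition): tokenization is dispatched through a
        # multiprocessing.Pool, so the tokenizer must be picklable -- a
        # module-level function or a builtin such as str.split; a lambda
        # or locally defined closure will fail here.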
        pool = Pool(cpu_count())
        tokenized_corpus = pool.map(self.tokenizer, corpus)
        return tokenized_corpus

    def _calc_idf(self, nd):
        raise NotImplementedError()

    def get_scores(self, query):
        raise NotImplementedError()

    def get_batch_scores(self, query, doc_ids):
        raise NotImplementedError()

    def get_top_n(self, query, documents, n=5):
        assert self.corpus_size == len(documents), "The documents given don't match the index corpus!"

        scores = self.get_scores(query)
        top_n = np.argsort(scores)[::-1][:n]
        return [documents[i] for i in top_n]

class BM25Okapi(BM25):
    def __init__(self, corpus, tokenizer=None, k1=1.5, b=0.75, epsilon=0.25):
        self.k1 = k1
        self.b = b
        self.epsilon = epsilon
        super().__init__(corpus, tokenizer)

    def _calc_idf(self, nd):
        """
        Calculates the inverse document frequency (idf) of each term in the corpus.
        This algorithm sets a floor on the idf values at eps = epsilon * average_idf.
        """
        # collect the idf sum to calculate an average idf for the epsilon value
        idf_sum = 0
        # collect words with negative idf to assign them a special epsilon value.
        # idf can be negative if a word is contained in more than half of the documents
        negative_idfs = []
        for word, freq in nd.items():
            idf = math.log(self.corpus_size - freq + 0.5) - math.log(freq + 0.5)
            self.idf[word] = idf
            idf_sum += idf
            if idf < 0:
                negative_idfs.append(word)
        self.average_idf = idf_sum / len(self.idf)

        eps = self.epsilon * self.average_idf
        for word in negative_idfs:
            self.idf[word] = eps

    def get_scores(self, query):
        """
        The ATIRE BM25 variant uses an idf function based on a log(idf) score. To prevent
        negative idf scores, this algorithm also adds a floor to the idf value of epsilon.
        See [Trotman, A., X. Jia, M. Crane, Towards an Efficient and Effective Search Engine]
        for more info.
        :param query: list of query tokens
        :return: array of BM25 scores, one per document in the corpus
        """
        score = np.zeros(self.corpus_size)
        doc_len = np.array(self.doc_len)
        for q in query:
            q_freq = np.array([(doc.get(q) or 0) for doc in self.doc_freqs])
            score += (self.idf.get(q) or 0) * (q_freq * (self.k1 + 1) /
                                               (q_freq + self.k1 * (1 - self.b + self.b * doc_len / self.avgdl)))
        return score

    def get_batch_scores(self, query, doc_ids):
        """
        Calculate bm25 scores between the query and a subset of all docs
        """
        assert all(di < len(self.doc_freqs) for di in doc_ids)
        score = np.zeros(len(doc_ids))
        doc_len = np.array(self.doc_len)[doc_ids]
        for q in query:
            q_freq = np.array([(self.doc_freqs[di].get(q) or 0) for di in doc_ids])
            score += (self.idf.get(q) or 0) * (q_freq * (self.k1 + 1) /
                                               (q_freq + self.k1 * (1 - self.b + self.b * doc_len / self.avgdl)))
        return score.tolist()


class BM25L(BM25):
    def __init__(self, corpus, tokenizer=None, k1=1.5, b=0.75, delta=0.5):
        # Algorithm specific parameters
        self.k1 = k1
        self.b = b
        self.delta = delta
        super().__init__(corpus, tokenizer)

    def _calc_idf(self, nd):
        for word, freq in nd.items():
            idf = math.log(self.corpus_size + 1) - math.log(freq + 0.5)
            self.idf[word] = idf

    def get_scores(self, query):
        score = np.zeros(self.corpus_size)
        doc_len = np.array(self.doc_len)
        for q in query:
            q_freq = np.array([(doc.get(q) or 0) for doc in self.doc_freqs])
            # ctd: term frequency normalized by document length
            ctd = q_freq / (1 - self.b + self.b * doc_len / self.avgdl)
            score += (self.idf.get(q) or 0) * q_freq * (self.k1 + 1) * (ctd + self.delta) / \
                     (self.k1 + ctd + self.delta)
        return score

    def get_batch_scores(self, query, doc_ids):
        """
        Calculate bm25 scores between the query and a subset of all docs
        """
        assert all(di < len(self.doc_freqs) for di in doc_ids)
        score = np.zeros(len(doc_ids))
        doc_len = np.array(self.doc_len)[doc_ids]
        for q in query:
            q_freq = np.array([(self.doc_freqs[di].get(q) or 0) for di in doc_ids])
            ctd = q_freq / (1 - self.b + self.b * doc_len / self.avgdl)
            score += (self.idf.get(q) or 0) * q_freq * (self.k1 + 1) * (ctd + self.delta) / \
                     (self.k1 + ctd + self.delta)
        return score.tolist()


class BM25Plus(BM25):
    def __init__(self, corpus, tokenizer=None, k1=1.5, b=0.75, delta=1):
        # Algorithm specific parameters
        self.k1 = k1
        self.b = b
        self.delta = delta
        super().__init__(corpus, tokenizer)

    def _calc_idf(self, nd):
        for word, freq in nd.items():
            idf = math.log((self.corpus_size + 1) / freq)
            self.idf[word] = idf

    def get_scores(self, query):
        score = np.zeros(self.corpus_size)
        doc_len = np.array(self.doc_len)
        for q in query:
            q_freq = np.array([(doc.get(q) or 0) for doc in self.doc_freqs])
            score += (self.idf.get(q) or 0) * (self.delta + (q_freq * (self.k1 + 1)) /
                                               (self.k1 * (1 - self.b + self.b * doc_len / self.avgdl) + q_freq))
        return score

    def get_batch_scores(self, query, doc_ids):
        """
        Calculate bm25 scores between the query and a subset of all docs
        """
        assert all(di < len(self.doc_freqs) for di in doc_ids)
        score = np.zeros(len(doc_ids))
        doc_len = np.array(self.doc_len)[doc_ids]
        for q in query:
            q_freq = np.array([(self.doc_freqs[di].get(q) or 0) for di in doc_ids])
            score += (self.idf.get(q) or 0) * (self.delta + (q_freq * (self.k1 + 1)) /
                                               (self.k1 * (1 - self.b + self.b * doc_len / self.avgdl) + q_freq))
        return score.tolist()


# BM25Adpt and BM25T are a bit more complicated than the previous algorithms here:
# a term-specific k1 parameter is calculated before scoring is done.

# class BM25Adpt(BM25):
#     def __init__(self, corpus, k1=1.5, b=0.75, delta=1):
#         # Algorithm specific parameters
#         self.k1 = k1
#         self.b = b
#         self.delta = delta
#         super().__init__(corpus)
#
#     def _calc_idf(self, nd):
#         for word, freq in nd.items():
#             idf = math.log((self.corpus_size + 1) / freq)
#             self.idf[word] = idf
#
#     def get_scores(self, query):
#         score = np.zeros(self.corpus_size)
#         doc_len = np.array(self.doc_len)
#         for q in query:
#             q_freq = np.array([(doc.get(q) or 0) for doc in self.doc_freqs])
#             score += (self.idf.get(q) or 0) * (self.delta + (q_freq * (self.k1 + 1)) /
#                                                (self.k1 * (1 - self.b + self.b * doc_len / self.avgdl) + q_freq))
#         return score
#
#
# class BM25T(BM25):
#     def __init__(self, corpus, k1=1.5, b=0.75, delta=1):
#         # Algorithm specific parameters
#         self.k1 = k1
#         self.b = b
#         self.delta = delta
#         super().__init__(corpus)
#
#     def _calc_idf(self, nd):
#         for word, freq in nd.items():
#             idf = math.log((self.corpus_size + 1) / freq)
#             self.idf[word] = idf
#
#     def get_scores(self, query):
#         score = np.zeros(self.corpus_size)
#         doc_len = np.array(self.doc_len)
#         for q in query:
#             q_freq = np.array([(doc.get(q) or 0) for doc in self.doc_freqs])
#             score += (self.idf.get(q) or 0) * (self.delta + (q_freq * (self.k1 + 1)) /
#                                                (self.k1 * (1 - self.b + self.b * doc_len / self.avgdl) + q_freq))
#         return score
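

# Minimal smoke test (editor's addition, not part of the upstream library).
# It runs the three implemented variants on a toy corpus; the corpus and
# query below are illustrative only.
if __name__ == "__main__":
    corpus = [
        ["the", "cat", "sat", "on", "the", "mat"],
        ["the", "dog", "barked", "at", "the", "cat"],
        ["birds", "fly", "south", "in", "winter"],
    ]
    query = ["cat", "mat"]
    for cls in (BM25Okapi, BM25L, BM25Plus):
        model = cls(corpus)
        print(cls.__name__, model.get_scores(query))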