OiO.lk Community platform!

Oio.lk is an excellent forum for developers, providing a wide range of resources, discussions, and support for those in the developer community. Join oio.lk today to connect with like-minded professionals, share insights, and stay updated on the latest trends and technologies in the development field.
  You need to log in or register to access the solved answers to this problem.
  • You have reached the maximum number of guest views allowed
  • Please register below to remove this limitation

Implementation of Okapi BM25 in python

  • Thread starter Thread starter Ledian K.
  • Start date Start date
L

Ledian K.

Guest
I am trying to implement Okapi BM25 in Python. While I have seen some tutorials on how to do it, I seem to be stuck in the process.

So I have a collection of documents (with columns 'id' and 'text') and a collection of queries (also with columns 'id' and 'text'). I have done the pre-processing steps and I have my documents and queries as lists:

Code:
documents = list(train_docs['text'])        #put the documents text to list
queries = list(train_queries_all['text'])   #put the queries text to list

Then for BM25 I do this:

Code:
pip install rank_bm25

#calculate BM25

Code:
from rank_bm25 import BM25Okapi

bm25 = BM25Okapi(documents)

#compute the score

bm_score = BM25Okapi.get_scores(documents, query=queries)

But it wouldn't work.



Then I tried to do this:

Code:
import math
import numpy as np
from multiprocessing import Pool, cpu_count

nd = len(documents) # corpus_size = 3612 (I am not sure if this is necessary)

Code:
class BM25:
    """Base class that indexes a corpus for BM25-style scoring.

    The constructor gathers per-document term counts, document lengths,
    the average document length and per-term document frequencies, then
    hands the document frequencies to ``_calc_idf``.  Subclasses must
    implement ``_calc_idf`` and ``get_scores``.

    Args:
        documents: Sized iterable of documents.  Each document is iterated
            item by item and every item is counted as one term, so pass
            lists of tokens, or raw documents together with ``tokenizer``.
        tokenizer: Optional callable applied to every document before
            indexing.  It is sent to worker processes, so it must be
            picklable.
    """

    def __init__(self, documents, tokenizer=None):
        self.corpus_size = len(documents)
        self.avgdl = 0  # average document length, set by _initialize
        self.doc_freqs = []  # one {term: count} dict per document
        self.idf = {}  # term -> idf weight, filled in by _calc_idf
        self.doc_len = []  # number of terms in each document
        self.tokenizer = tokenizer

        if tokenizer:
            documents = self._tokenize_corpus(documents)

        nd = self._initialize(documents)
        self._calc_idf(nd)

    def _initialize(self, documents):
        """Collect corpus statistics and return document frequencies.

        Fills ``doc_len`` and ``doc_freqs`` and sets ``avgdl``.

        Returns:
            Dict mapping each term to the number of documents containing it.
        """
        nd = {}  # word -> number of documents with word
        num_doc = 0  # total number of terms across the whole corpus
        for document in documents:
            self.doc_len.append(len(document))
            num_doc += len(document)

            frequencies = {}
            for word in document:
                frequencies[word] = frequencies.get(word, 0) + 1
            self.doc_freqs.append(frequencies)

            # Each document contributes at most one to a term's document
            # frequency, however often the term occurs inside it.
            for word in frequencies:
                nd[word] = nd.get(word, 0) + 1

        # An empty corpus has no meaningful average length; keep 0 rather
        # than failing with ZeroDivisionError.
        self.avgdl = num_doc / self.corpus_size if self.corpus_size else 0
        return nd

    def _tokenize_corpus(self, documents):
        """Apply ``self.tokenizer`` to every document using a process pool."""
        # The context manager shuts the workers down once the blocking
        # map() call has returned, so no processes are left behind.
        with Pool(cpu_count()) as pool:
            tokenized_corpus = pool.map(self.tokenizer, documents)
        return tokenized_corpus

    def _calc_idf(self, nd):
        """Populate ``self.idf`` from document frequencies (subclass hook)."""
        raise NotImplementedError()

    def get_scores(self, queries):
        """Return one score per indexed document (subclass hook)."""
        raise NotImplementedError()

    def get_top_n(self, queries, documents, n=5):
        """Return the ``n`` entries of ``documents`` with the highest scores.

        Args:
            queries: Passed unchanged to ``get_scores``.
            documents: Sequence with exactly one entry per indexed document,
                in indexing order.
            n: Maximum number of documents to return.

        Returns:
            List of entries from ``documents`` ordered by descending score.

        Raises:
            AssertionError: If ``documents`` differs in length from the
                indexed corpus.
        """
        assert self.corpus_size == len(documents), "The documents given don't match the index corpus!"

        scores = self.get_scores(queries)
        # argsort is ascending: reverse it, then keep the first n positions.
        top_n = np.argsort(scores)[::-1][:n]
        return [documents[i] for i in top_n]

class BM25T(BM25):
    """BM25 variant that adds a constant ``delta`` to each term's contribution.

    Args:
        documents: Corpus forwarded to the base-class constructor (no
            tokenizer is passed along, so documents are indexed as given).
        k1: Term-frequency saturation parameter.
        b: Document-length normalisation parameter.
        delta: Constant added to the normalised term frequency of every
            query term before it is weighted by the term's idf.
    """

    def __init__(self, documents, k1=1.5, b=0.75, delta=1):
        # Store the algorithm parameters, then let the base class index
        # the corpus.
        self.k1, self.b, self.delta = k1, b, delta
        super().__init__(documents)

    def _calc_idf(self, nd):
        """Set ``idf[term] = log((corpus_size + 1) / document_frequency)``."""
        for term in nd:
            ratio = (self.corpus_size + 1) / nd[term]
            self.idf[term] = math.log(ratio)

    def get_scores(self, queries):
        """Score every indexed document against the given query terms.

        Args:
            queries: Iterable of terms.  Each item is looked up as a key in
                the per-document counts and in the idf table; items not
                found there count as frequency 0 / weight 0.

        Returns:
            NumPy array holding one accumulated score per indexed document.
        """
        totals = np.zeros(self.corpus_size)
        lengths = np.array(self.doc_len)
        for term in queries:
            # Occurrences of this term in every document (0 where absent).
            tf = np.array([(counts.get(term) or 0) for counts in self.doc_freqs])
            weight = self.idf.get(term) or 0
            # Length-normalised saturation denominator of the BM25 formula.
            denominator = self.k1 * (1 - self.b + self.b * lengths / self.avgdl) + tf
            totals += weight * (self.delta + (tf * (self.k1 + 1)) / denominator)
        return totals

and then I try to get the scores:

Code:
score = BM25.get_scores(self=documents, queries)

But I get this error message: score = BM25.get_scores(self=documents, queries)

SyntaxError: positional argument follows keyword argument



Does anyone have an idea why this error occurs? Thank you in advance.
<p>I am trying to implement Okapi BM25 in python. While I have seen some tutorials how to do it, it seems I am stuck in the process. </p>

<p>So I have collection of documents (and has as columns 'id' and 'text') and queries (and has as columns 'id' and 'text'). I have done the pre-processing steps and I have my documents and queries as a list:</p>

<pre><code>documents = list(train_docs['text']) #put the documents text to list
queries = list(train_queries_all['text']) #put the queries text to list
</code></pre>

<p>Then for BM25 I do this:</p>

<pre><code>pip install rank_bm25
</code></pre>

<p>#calculate BM25</p>

<pre><code>from rank_bm25 import BM25Okapi

bm25 = BM25Okapi(documents)
</code></pre>

<p>#compute the score</p>

<p><code>bm_score = BM25Okapi.get_scores(documents, query=queries</code>)</p>

<p>But it wouldn't work.</p>

<hr>

<p>Then I tried to do this:</p>

<pre><code>import math
import numpy as np
from multiprocessing import Pool, cpu_count
</code></pre>

<p><code>nd = len(documents) # corpus_size = 3612</code> (I am not sure if this is necessary)</p>

<pre><code>class BM25:
def __init__(self, documents, tokenizer=None):
self.corpus_size = len(documents)
self.avgdl = 0
self.doc_freqs = []
self.idf = {}
self.doc_len = []
self.tokenizer = tokenizer

if tokenizer:
documents = self._tokenize_corpus(documents)

nd = self._initialize(documents)
self._calc_idf(nd)

def _initialize(self, documents):
nd = {} # word -> number of documents with word
num_doc = 0
for document in documents:
self.doc_len.append(len(document))
num_doc += len(document)

frequencies = {}
for word in document:
if word not in frequencies:
frequencies[word] = 0
frequencies[word] += 1
self.doc_freqs.append(frequencies)

for word, freq in frequencies.items():
if word not in nd:
nd[word] = 0
nd[word] += 1

self.avgdl = num_doc / self.corpus_size
return nd

def _tokenize_corpus(self, documents):
pool = Pool(cpu_count())
tokenized_corpus = pool.map(self.tokenizer, documents)
return tokenized_corpus

def _calc_idf(self, nd):
raise NotImplementedError()

def get_scores(self, queries):
raise NotImplementedError()

def get_top_n(self, queries, documents, n=5):

assert self.corpus_size == len(documents), "The documents given don't match the index corpus!"

scores = self.get_scores(queries)
top_n = np.argsort(scores)[::-1][:n]
return [documents[i] for i in top_n]

class BM25T(BM25):
def __init__(self, documents, k1=1.5, b=0.75, delta=1):
# Algorithm specific parameters
self.k1 = k1
self.b = b
self.delta = delta
super().__init__(documents)

def _calc_idf(self, nd):
for word, freq in nd.items():
idf = math.log((self.corpus_size + 1) / freq)
self.idf[word] = idf

def get_scores(self, queries):
score = np.zeros(self.corpus_size)
doc_len = np.array(self.doc_len)
for q in queries:
q_freq = np.array([(doc.get(q) or 0) for doc in self.doc_freqs])
score += (self.idf.get(q) or 0) * (self.delta + (q_freq * (self.k1 + 1)) /
(self.k1 * (1 - self.b + self.b * doc_len / self.avgdl) + q_freq))
return score
</code></pre>

<p>and then I try to get the scores:</p>

<pre><code>score = BM25.get_scores(self=documents, queries)
</code></pre>

<p>But I get this error message:
score = BM25.get_scores(self=documents, queries)</p>

<p>SyntaxError: positional argument follows keyword argument</p>

<hr>

<p>Does anyone have an idea why this error occurs? Thank you in advance.</p>
 

Latest posts

A
Replies
0
Views
1
AgencyAnalytics
A
S
Replies
0
Views
1
Stacker Media
S
C
Replies
0
Views
1
CC.Talent
C
Top