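"""Retrievers for finding Bible chapters relevant to a free-text query.

SemanticRetriever ranks verses by embedding similarity (optionally reranked
with a cross-encoder) and groups the matches into chapters; TfIdfRetriever
ranks texts by TF-IDF cosine similarity.
"""
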
import abc
from typing import List

import pandas as pd
import streamlit as st
from sentence_transformers.cross_encoder import CrossEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
from sparse_dot_topn import awesome_cossim_topn

from src.models import Chapter


class Retriever:
    @abc.abstractmethod
    def retrieve(self, query, n=10) -> List[Chapter]:
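        """Return a list of chapters relevant to the query."""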
        raise NotImplementedError


class SemanticRetriever(Retriever):
    def __init__(
        self,
        bible_df,
        embeddings_manager,
        threshold=0.4,
        cross_encoder_model="cross-encoder/ms-marco-MiniLM-L-12-v2",
    ):
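        """
        Args:
            bible_df: DataFrame of verses with at least "book", "chapter" and "text" columns.
            embeddings_manager: object exposing get_embeddings(texts) for verse texts and queries.
            threshold: minimum cosine similarity for a verse to be kept as a candidate.
            cross_encoder_model: name of a sentence-transformers cross-encoder used for
                reranking, or None to skip reranking.
        """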
        self.bible_df = bible_df
        self.embeddings_manager = embeddings_manager
        self.threshold = threshold
        self.cross_encoder_model = (
            CrossEncoder(cross_encoder_model) if cross_encoder_model else None
        )

        # Alternative cross-encoder checkpoints:
        #   "cross-encoder/stsb-distilroberta-base"
        #   "cross-encoder/ms-marco-MiniLM-L-12-v2" (the default)

    def retrieve(self, query, n=10) -> List[Chapter]:
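        """Retrieve chapters relevant to the query.

        Runs a semantic search over all verse texts for 2 * n candidate verses,
        optionally reranks them with the cross-encoder, joins the verse metadata
        back in, and groups the matched verses into Chapter objects.
        """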

        verse_candidates_df = self.semantic_search(
            query=query,
            texts=self.bible_df["text"].tolist(),
            embeddings_manager=self.embeddings_manager,
            n=n * 2,
            threshold=self.threshold,
        )

        if len(verse_candidates_df) == 0:
            return []

        if self.cross_encoder_model is not None:
            verse_candidates_df = self.cross_encode(
                query, verse_candidates_df["text"].tolist()
            )

        # TODO: revisit this logic, as some verses can have exactly the same text;
        # for now, the workaround is to drop duplicates
        verse_candidates_df.drop_duplicates(subset="text", inplace=True)

        # Join back verse metadata
        verse_candidates_df = pd.merge(
            verse_candidates_df, self.bible_df, how="left", on="text"
        )
        # DEBUG
        # st.write(verse_candidates_df)

        chapter_candidates = self.extract_chapters_from_verses(
            self.bible_df, verse_candidates_df
        )
        return chapter_candidates

    def cross_encode(self, query, texts):
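        """Score each (query, text) pair with the cross-encoder and return a
        DataFrame of texts sorted by score (min-max scaled to [0, 1])."""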
        combinations = [[query, text] for text in texts]
        sim_scores = self.cross_encoder_model.predict(combinations)
        sim_scores = MinMaxScaler().fit_transform(sim_scores.reshape(-1, 1)).flatten()
        reranked_texts_scores = sorted(
            zip(texts, sim_scores), key=lambda x: x[1], reverse=True
        )
        df = pd.DataFrame(reranked_texts_scores, columns=["text", "score"])
        return df

    def semantic_search(self, query, texts, embeddings_manager, n=None, threshold=0):
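        """Rank texts by cosine similarity between the query embedding and the
        text embeddings, keeping only the top n results at or above the threshold."""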
        embeddings = embeddings_manager.get_embeddings(texts)
        query_embedding = embeddings_manager.get_embeddings([query])
        sim_scores = cosine_similarity(query_embedding, embeddings)[0]

        # results is a list of (text, score) tuples
        results = sorted(zip(texts, sim_scores), key=lambda x: x[1], reverse=True)

        # Take top n only if specified
        if n:
            results = results[:n]

        # Apply a threshold to filter irrelevant results
        if threshold:
            results = [x for x in results if x[1] >= threshold]

        df = pd.DataFrame(results, columns=["text", "score"])

        return df

    def extract_chapters_from_verses(self, bible_df, verse_results_df) -> List[Chapter]:
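        """Group the matched verses into their unique chapters, keeping each
        chapter's matched verses as its highlight verses."""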
        # The simple, naive assumption for now is to follow order of first appearance,
        # i.e. the per-verse scores dictate the chapter order.
        # TODO: revisit ranking

        # The goal here is to extract all the unique chapters based on the top verse results
        verse_results_df = verse_results_df.copy()
        verse_results_df["book_chapter"] = (
            verse_results_df["book"] + " " + verse_results_df["chapter"].astype(str)
        )
        unique_chapters = verse_results_df["book_chapter"].unique()

        bible_df = bible_df.copy()
        bible_df["book_chapter"] = (
            bible_df["book"] + " " + bible_df["chapter"].astype(str)
        )

        chapters = []
        for unique_chapter in unique_chapters:
            chapter_verses_df = bible_df[bible_df["book_chapter"] == unique_chapter]
            book = chapter_verses_df["book"].iloc[0]
            chapter_num = chapter_verses_df["chapter"].iloc[0]

            # Keep track of the matched verses as highlight verses
            highlight_verses_df = pd.merge(
                chapter_verses_df,
                verse_results_df[["text", "score", "book", "chapter"]],
                how="inner",
                on=["text", "book", "chapter"],
            )

            chapter = Chapter(
                book_name=book,
                chapter_num=chapter_num,
                verses_df=chapter_verses_df,
                highlight_verses_df=highlight_verses_df,
            )

            chapters.append(chapter)

        return chapters


class TfIdfRetriever(Retriever):
    def __init__(self, texts, preprocessors=None) -> None:
        self.vectorizer = TfidfVectorizer(analyzer="word", stop_words="english")
        self.preprocessors = preprocessors if preprocessors is not None else []
        # TODO: pre-process the texts
        self.tfidf_vectors = self.vectorizer.fit_transform(texts)
        # awesome_cossim_topn expects CSR matrices, so convert after transposing
        self.tfidf_vectors_transposed = self.tfidf_vectors.transpose().tocsr()

    def search(self, query, n=10):
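        """Return a sparse matrix holding the top-n TF-IDF cosine similarities
        (above 0.01) between the query and the indexed texts."""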
        query_tfidf_vector = self.vectorizer.transform([query])
        results = awesome_cossim_topn(
            query_tfidf_vector, self.tfidf_vectors_transposed, n, 0.01
        )
        return results
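

# Illustrative usage sketch (a minimal smoke test, not part of the module's API).
# It only exercises TfIdfRetriever; SemanticRetriever additionally needs a
# bible_df with "book"/"chapter"/"text" columns and an embeddings manager that
# exposes get_embeddings(). The sample verses below are just example inputs.
if __name__ == "__main__":
    sample_texts = [
        "In the beginning God created the heaven and the earth.",
        "The Lord is my shepherd; I shall not want.",
        "For God so loved the world, that he gave his only begotten Son.",
    ]
    tfidf_retriever = TfIdfRetriever(sample_texts)
    # One-row sparse matrix: the query's top-2 cosine similarities against the texts.
    matches = tfidf_retriever.search("God created the world", n=2)
    print(matches)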