# bible-search / src/retriever.py
# Author: alronlam — commit 613c93d ("Add app and data files")
import abc
from typing import List
import numpy as np
import pandas as pd
import sklearn
import streamlit as st
from sentence_transformers.cross_encoder import CrossEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
from sparse_dot_topn import awesome_cossim_topn
from src.models import Chapter
class Retriever:
    """Interface for query-to-chapters retrievers.

    NOTE(review): `@abc.abstractmethod` has no enforcement effect here
    because the class does not inherit `abc.ABC` (no ABCMeta metaclass).
    It is kept as documentation of intent; switching to `abc.ABC` would
    break `TfIdfRetriever`, which subclasses this but never implements
    `retrieve` — confirm before tightening.
    """

    @abc.abstractmethod
    def retrieve(self, query, n=10) -> List[Chapter]:
        """Return up to *n* Chapter objects relevant to *query*.

        Raises:
            NotImplementedError: always, unless a subclass overrides —
                previously this silently returned None.
        """
        raise NotImplementedError
class SemanticRetriever:
    """Retrieve Bible chapters semantically relevant to a free-text query.

    Pipeline:
      1. Bi-encoder semantic search over per-verse embeddings.
      2. Optional cross-encoder re-ranking of the verse candidates.
      3. Expansion of the matched verses into their containing chapters.
    """

    def __init__(
        self,
        bible_df,
        embeddings_manager,
        threshold=0.4,
        cross_encoder_model="cross-encoder/ms-marco-MiniLM-L-12-v2",
    ):
        """
        Args:
            bible_df: verse-level DataFrame; this class reads its
                "text", "book", and "chapter" columns.
            embeddings_manager: project object exposing
                get_embeddings(texts) -> embedding matrix.
            threshold: minimum cosine similarity for a verse candidate.
            cross_encoder_model: HF checkpoint name for re-ranking;
                any falsy value (None/"") disables the cross-encoder.
        """
        self.bible_df = bible_df
        self.embeddings_manager = embeddings_manager
        self.threshold = threshold
        self.cross_encoder_model = (
            CrossEncoder(cross_encoder_model) if cross_encoder_model else None
        )
        # Alternative checkpoints tried during development:
        # 'cross-encoder/stsb-distilroberta-base'
        # cross-encoder/ms-marco-MiniLM-L-12-v2

    def retrieve(self, query, n=10) -> List[Chapter]:
        """Return chapters whose verses best match *query*.

        Fetches n*2 verse candidates so deduplication / re-ranking still
        leaves enough material.
        NOTE(review): the returned chapter list is NOT truncated to n —
        confirm whether callers expect at most n chapters.
        """
        verse_candidates_df = self.semantic_search(
            query=query,
            texts=self.bible_df["text"].tolist(),
            embeddings_manager=self.embeddings_manager,
            n=n * 2,
            threshold=self.threshold,
        )

        # No verse cleared the similarity threshold.
        if len(verse_candidates_df) == 0:
            return []

        if self.cross_encoder_model is not None:
            # Re-score the candidates with the (more accurate) cross-encoder.
            verse_candidates_df = self.cross_encode(
                query, verse_candidates_df["text"].tolist()
            )

        # TODO: revisit this logic as some verses can have the same exact text
        # For now, workaround is to drop duplicates
        verse_candidates_df.drop_duplicates(subset="text", inplace=True)

        # Join back verse metadata (book/chapter) dropped during re-ranking.
        verse_candidates_df = pd.merge(
            verse_candidates_df, self.bible_df, how="left", on="text"
        )

        # DEBUG
        # st.write(verse_candidates_df)

        chapter_candidates = self.extract_chapters_from_verses(
            self.bible_df, verse_candidates_df
        )
        return chapter_candidates

    def cross_encode(self, query, texts):
        """Re-rank *texts* against *query* with the cross-encoder.

        Returns:
            DataFrame with columns ["text", "score"], sorted by score
            descending; scores are min-max scaled into [0, 1].
        """
        combinations = [[query, text] for text in texts]
        sim_scores = self.cross_encoder_model.predict(combinations)
        # Normalize raw cross-encoder scores to [0, 1] for comparability.
        sim_scores = MinMaxScaler().fit_transform(sim_scores.reshape(-1, 1)).flatten()
        reranked_texts_scores = sorted(
            zip(texts, sim_scores), key=lambda x: x[1], reverse=True
        )
        df = pd.DataFrame(reranked_texts_scores, columns=["text", "score"])
        return df

    def semantic_search(self, query, texts, embeddings_manager, n=None, threshold=0):
        """Bi-encoder search: cosine similarity of the query embedding
        against the embedding of every entry in *texts*.

        Returns:
            DataFrame with columns ["text", "score"], sorted by score
            descending, optionally truncated to the top *n* and filtered
            by *threshold*.
        """
        embeddings = embeddings_manager.get_embeddings(texts)
        query_embedding = embeddings_manager.get_embeddings([query])
        sim_scores = sklearn.metrics.pairwise.cosine_similarity(
            query_embedding, embeddings
        )[0]

        # Results is a list of tuples: [(text, score)]
        results = sorted(list(zip(texts, sim_scores)), key=lambda x: x[1], reverse=True)

        # Take top n only if specified
        if n:
            results = results[:n]

        # Apply a threshold to filter irrelevant results
        if threshold:
            results = [x for x in results if x[1] >= threshold]

        df = pd.DataFrame(results, columns=["text", "score"])
        return df

    def extract_chapters_from_verses(self, bible_df, verse_results_df) -> List[Chapter]:
        """Expand matched verses into a list of unique Chapter objects,
        tagging the matched verses as highlights within each chapter.
        """
        # Simple, naive assumption now is to just follow order of first appearance
        # I.e. The per-verse scores dictate the order
        # TODO: Revisit ranking

        # The goal here is to extract all the unique chapters based on the top verse results
        verse_results_df = verse_results_df.copy()
        verse_results_df["book_chapter"] = (
            verse_results_df["book"] + " " + verse_results_df["chapter"].astype(str)
        )
        unique_chapters = verse_results_df["book_chapter"].unique()

        # Work on a copy so the caller's DataFrame isn't mutated.
        bible_df = bible_df.copy()
        bible_df["book_chapter"] = (
            bible_df["book"] + " " + bible_df["chapter"].astype(str)
        )

        chapters = []
        for unique_chapter in unique_chapters:
            chapter_verses_df = bible_df[bible_df["book_chapter"] == unique_chapter]
            book = chapter_verses_df["book"].tolist()[0]
            chapter = chapter_verses_df["chapter"].tolist()[0]

            # Keep track of the matched verses as highlight verses
            highlight_verses_df = pd.merge(
                chapter_verses_df,
                verse_results_df[["text", "score", "book", "chapter"]],
                how="inner",
                on=["text", "book", "chapter"],
            )

            # `chapter` is rebound here: chapter number -> Chapter object.
            chapter = Chapter(
                book_name=book,
                chapter_num=chapter,
                verses_df=chapter_verses_df,
                highlight_verses_df=highlight_verses_df,
            )
            chapters.append(chapter)

        return chapters
class TfIdfRetriever(Retriever):
    """Sparse lexical retriever over TF-IDF vectors.

    NOTE(review): this class exposes `search()` rather than implementing
    the `Retriever.retrieve()` interface it inherits (and `search`
    returns a sparse similarity matrix, not `List[Chapter]`) — confirm
    which entry point callers actually use.
    """

    def __init__(self, texts, preprocessors=None) -> None:
        """Fit a TF-IDF vectorizer over *texts*.

        Args:
            texts: corpus of documents to index.
            preprocessors: optional list of callables to apply to each
                text before vectorization (currently unused — see TODO).
                Defaults to an empty list; a `None` sentinel is used to
                avoid the shared-mutable-default-argument pitfall.
        """
        self.vectorizer = TfidfVectorizer(analyzer="word", stop_words="english")
        # `preprocessors=[]` as a default would be shared across all
        # instances; normalize the sentinel to a fresh list instead.
        self.preprocessors = [] if preprocessors is None else preprocessors
        # TODO: pre-process the texts
        self.tfidf_vectors = self.vectorizer.fit_transform(texts)
        # Pre-transpose once so each query is a single sparse dot product.
        self.tfidf_vectors_transposed = self.tfidf_vectors.transpose()

    def search(self, query, n=10):
        """Return the top-*n* sparse cosine-similarity matches for *query*
        (minimum similarity 0.01), as produced by `awesome_cossim_topn`.
        """
        query_tfidf_vector = self.vectorizer.transform([query])
        results = awesome_cossim_topn(
            query_tfidf_vector, self.tfidf_vectors_transposed, n, 0.01
        )
        return results