bible-search / src /retriever.py
alronlam's picture
Add app and data files
613c93d
raw
history blame
No virus
5.67 kB
import abc
from typing import List
import numpy as np
import pandas as pd
import sklearn
import streamlit as st
from sentence_transformers.cross_encoder import CrossEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
from sparse_dot_topn import awesome_cossim_topn
from src.models import Chapter
class Retriever:
    """Interface for retrievers: objects exposing ``retrieve(query, n)``.

    NOTE(review): this class does not derive from ``abc.ABC``, so the
    ``abstractmethod`` marker on ``retrieve`` is not enforced; the class and
    its subclasses can be instantiated without overriding it.
    """

    @abc.abstractmethod
    def retrieve(self, query, n=10) -> List[Chapter]:
        """Look up chapters for ``query``; meant to be overridden.

        Args:
            query: The search input.
            n: Result-size hint (defaults to 10); its exact meaning is left
                to the overriding implementation.

        Returns:
            The base implementation does no work and returns ``None``.
        """
        return None
class SemanticRetriever:
    """Retrieve Bible chapters whose verses are semantically close to a query.

    The pipeline implemented by ``retrieve`` is:

    1. rank every verse text by cosine similarity between its embedding and
       the query embedding (``semantic_search``);
    2. optionally re-rank the surviving verses with a cross-encoder
       (``cross_encode``);
    3. join the verses back to their metadata and group them into
       ``Chapter`` objects (``extract_chapters_from_verses``).
    """

    def __init__(
        self,
        bible_df,
        embeddings_manager,
        threshold=0.4,
        cross_encoder_model="cross-encoder/ms-marco-MiniLM-L-12-v2",
    ):
        """Store the corpus and load the optional cross-encoder.

        Args:
            bible_df: DataFrame with one row per verse. The columns ``text``,
                ``book`` and ``chapter`` are read by this class.
            embeddings_manager: Object exposing ``get_embeddings(texts)``.
            threshold: Minimum cosine similarity a verse needs to be kept.
            cross_encoder_model: Model name handed to ``CrossEncoder``; a
                falsy value disables the re-ranking step.
        """
        self.bible_df = bible_df
        self.embeddings_manager = embeddings_manager
        self.threshold = threshold
        # Model names noted by the original author:
        #   'cross-encoder/stsb-distilroberta-base'
        #   cross-encoder/ms-marco-MiniLM-L-12-v2
        self.cross_encoder_model = (
            CrossEncoder(cross_encoder_model) if cross_encoder_model else None
        )

    def retrieve(self, query, n=10) -> List[Chapter]:
        """Return the chapters that contain the best-matching verses.

        Args:
            query: Free-text search query.
            n: Size hint. ``2 * n`` verse candidates are requested from the
                semantic search; the number of returned chapters is not
                otherwise capped.

        Returns:
            A list of ``Chapter`` objects ordered by first appearance of
            their verses in the ranked results; empty when no verse reaches
            the similarity threshold.
        """
        verse_candidates_df = self.semantic_search(
            query=query,
            texts=self.bible_df["text"].tolist(),
            embeddings_manager=self.embeddings_manager,
            n=n * 2,
            threshold=self.threshold,
        )
        if verse_candidates_df.empty:
            return []

        if self.cross_encoder_model is not None:
            verse_candidates_df = self.cross_encode(
                query, verse_candidates_df["text"].tolist()
            )

        # TODO: revisit this logic as some verses can have the same exact text
        # For now, workaround is to drop duplicates
        verse_candidates_df = verse_candidates_df.drop_duplicates(subset="text")

        # Join back verse metadata. A text that occurs in several verses
        # yields one output row per occurrence.
        verse_candidates_df = pd.merge(
            verse_candidates_df, self.bible_df, how="left", on="text"
        )

        return self.extract_chapters_from_verses(self.bible_df, verse_candidates_df)

    def cross_encode(self, query, texts):
        """Re-rank ``texts`` against ``query`` with the cross-encoder.

        Args:
            query: Free-text search query.
            texts: Candidate verse texts.

        Returns:
            DataFrame with columns ``text`` and ``score``, sorted by score in
            descending order. Scores are min-max scaled over this candidate
            set, so they are relative to the other candidates rather than
            absolute.
        """
        combinations = [[query, text] for text in texts]
        sim_scores = self.cross_encoder_model.predict(combinations)
        sim_scores = MinMaxScaler().fit_transform(sim_scores.reshape(-1, 1)).flatten()
        reranked_texts_scores = sorted(
            zip(texts, sim_scores), key=lambda pair: pair[1], reverse=True
        )
        return pd.DataFrame(reranked_texts_scores, columns=["text", "score"])

    def semantic_search(self, query, texts, embeddings_manager, n=None, threshold=0):
        """Rank ``texts`` by cosine similarity to ``query``.

        Args:
            query: Free-text search query.
            texts: Texts to rank.
            embeddings_manager: Object exposing ``get_embeddings(texts)``.
            n: Keep only the top ``n`` results; a falsy value keeps all.
            threshold: Drop results scoring below this value; a falsy value
                disables the filter.

        Returns:
            DataFrame with columns ``text`` and ``score``, best match first.
        """
        # Import the function explicitly: a bare ``import sklearn`` does not
        # guarantee that the ``sklearn.metrics`` submodule has been loaded.
        from sklearn.metrics.pairwise import cosine_similarity

        embeddings = embeddings_manager.get_embeddings(texts)
        query_embedding = embeddings_manager.get_embeddings([query])
        sim_scores = cosine_similarity(query_embedding, embeddings)[0]

        # Results is a list of tuples: [(text, score)]
        results = sorted(
            zip(texts, sim_scores), key=lambda pair: pair[1], reverse=True
        )

        # Take top n only if specified
        if n:
            results = results[:n]

        # Apply a threshold to filter irrelevant results
        if threshold:
            results = [pair for pair in results if pair[1] >= threshold]

        return pd.DataFrame(results, columns=["text", "score"])

    def extract_chapters_from_verses(self, bible_df, verse_results_df) -> List[Chapter]:
        """Group matched verses into ``Chapter`` objects.

        Chapters are emitted in order of first appearance in
        ``verse_results_df``, i.e. the per-verse scores dictate the order.
        TODO: Revisit ranking.

        Args:
            bible_df: Full corpus with ``book``, ``chapter`` and ``text``
                columns.
            verse_results_df: Matched verses with ``book``, ``chapter``,
                ``text`` and ``score`` columns.

        Returns:
            One ``Chapter`` per distinct (book, chapter) pair found in
            ``verse_results_df``. Each carries every verse of the chapter
            plus the matched verses as highlights.
        """
        # Work on copies so the helper column never leaks into caller frames.
        verse_results_df = verse_results_df.copy()
        verse_results_df["book_chapter"] = (
            verse_results_df["book"] + " " + verse_results_df["chapter"].astype(str)
        )
        unique_chapters = verse_results_df["book_chapter"].unique()

        bible_df = bible_df.copy()
        bible_df["book_chapter"] = (
            bible_df["book"] + " " + bible_df["chapter"].astype(str)
        )

        # A text repeated k times inside one chapter arrives here as k
        # identical (text, score, book, chapter) rows. Merging those against
        # the k chapter rows would yield k * k highlights, so collapse them
        # first and keep one highlight per actual verse.
        matched_verses_df = verse_results_df[
            ["text", "score", "book", "chapter"]
        ].drop_duplicates()

        chapters = []
        for unique_chapter in unique_chapters:
            chapter_verses_df = bible_df[bible_df["book_chapter"] == unique_chapter]
            if chapter_verses_df.empty:
                # No such chapter in the corpus (e.g. a row without
                # metadata); nothing to build.
                continue

            # ``tolist()`` yields plain Python scalars rather than numpy ones.
            book_name = chapter_verses_df["book"].tolist()[0]
            chapter_num = chapter_verses_df["chapter"].tolist()[0]

            # Keep track of the matched verses as highlight verses
            highlight_verses_df = pd.merge(
                chapter_verses_df,
                matched_verses_df,
                how="inner",
                on=["text", "book", "chapter"],
            )

            chapters.append(
                Chapter(
                    book_name=book_name,
                    chapter_num=chapter_num,
                    verses_df=chapter_verses_df,
                    highlight_verses_df=highlight_verses_df,
                )
            )

        return chapters
class TfIdfRetriever(Retriever):
    """Keyword-style search over a fixed set of texts using TF-IDF vectors.

    NOTE(review): this class exposes ``search`` and does not override
    ``Retriever.retrieve``.
    """

    def __init__(self, texts, preprocessors=None) -> None:
        """Fit the TF-IDF vocabulary on ``texts`` and cache the vectors.

        Args:
            texts: Iterable of documents to index.
            preprocessors: Optional list of preprocessors. It is stored but
                not applied yet (see the TODO below). Defaults to a fresh
                empty list per instance.
        """
        self.vectorizer = TfidfVectorizer(analyzer="word", stop_words="english")
        # A ``None`` sentinel avoids sharing one default list between all
        # instances of the class.
        self.preprocessors = [] if preprocessors is None else preprocessors

        # TODO: pre-process the texts
        self.tfidf_vectors = self.vectorizer.fit_transform(texts)
        # Transposed once up front so every query is a single sparse product.
        self.tfidf_vectors_transposed = self.tfidf_vectors.transpose()

    def search(self, query, n=10):
        """Return the top matches for ``query`` among the indexed texts.

        Args:
            query: Free-text search query.
            n: Maximum number of matches to keep.

        Returns:
            Whatever ``awesome_cossim_topn`` returns for the query vector
            against the indexed vectors, called with ``n`` and ``0.01`` as its
            third and fourth arguments (presumably top-n and a minimum
            similarity bound; confirm against the sparse_dot_topn docs).
        """
        query_tfidf_vector = self.vectorizer.transform([query])
        return awesome_cossim_topn(
            query_tfidf_vector, self.tfidf_vectors_transposed, n, 0.01
        )