# NOTE: the original file began with non-Python residue ("Spaces:" /
# "Sleeping"), apparently Hugging Face Spaces status text captured during
# extraction; commented out so the module remains valid, importable Python.
import abc | |
from typing import List | |
import numpy as np | |
import pandas as pd | |
import sklearn | |
import streamlit as st | |
from sentence_transformers.cross_encoder import CrossEncoder | |
from sklearn.feature_extraction.text import TfidfVectorizer | |
from sklearn.preprocessing import MinMaxScaler | |
from sparse_dot_topn import awesome_cossim_topn | |
from src.models import Chapter | |
class Retriever:
    """Interface for chapter retrievers.

    Concrete retrievers (e.g. SemanticRetriever, TfIdfRetriever) map a
    free-text query to a ranked list of Chapter results.
    """

    def retrieve(self, query, n=10) -> "List[Chapter]":
        """Return up to roughly ``n`` chapters relevant to ``query``.

        Args:
            query: free-text search query.
            n: desired number of results.

        Raises:
            NotImplementedError: the base class has no implementation;
                subclasses must override. (Previously this stub silently
                returned None despite the declared List[Chapter] return.)
        """
        raise NotImplementedError
class SemanticRetriever(Retriever):
    """Dense-embedding retriever over Bible verses.

    Pipeline: cosine-similarity search of the query embedding against
    per-verse embeddings, optional cross-encoder re-ranking, then
    expansion of the matched verses into their containing chapters.

    Note: now subclasses Retriever, consistent with TfIdfRetriever,
    since it implements the same ``retrieve`` interface.
    """

    def __init__(
        self,
        bible_df,
        embeddings_manager,
        threshold=0.4,
        cross_encoder_model="cross-encoder/ms-marco-MiniLM-L-12-v2",
    ):
        """
        Args:
            bible_df: DataFrame with one row per verse; must contain at
                least "book", "chapter" and "text" columns.
            embeddings_manager: object exposing get_embeddings(texts) -> matrix.
            threshold: minimum cosine similarity for a verse candidate to
                be kept.
            cross_encoder_model: cross-encoder model name used to re-rank
                candidates; pass None/"" to skip re-ranking.
                Previously tried alternative: 'cross-encoder/stsb-distilroberta-base'.
        """
        self.bible_df = bible_df
        self.embeddings_manager = embeddings_manager
        self.threshold = threshold
        self.cross_encoder_model = (
            CrossEncoder(cross_encoder_model) if cross_encoder_model else None
        )

    def retrieve(self, query, n=10) -> List[Chapter]:
        """Return chapters whose verses best match ``query``.

        Fetches ``n * 2`` verse candidates so the re-ranker has headroom,
        then maps the surviving verses to their (unique) chapters.
        """
        verse_candidates_df = self.semantic_search(
            query=query,
            texts=self.bible_df["text"].tolist(),
            embeddings_manager=self.embeddings_manager,
            n=n * 2,
            threshold=self.threshold,
        )
        if len(verse_candidates_df) == 0:
            return []
        if self.cross_encoder_model is not None:
            verse_candidates_df = self.cross_encode(
                query, verse_candidates_df["text"].tolist()
            )
        # TODO: revisit this logic as some verses can have the same exact text
        # For now, workaround is to drop duplicates
        verse_candidates_df.drop_duplicates(subset="text", inplace=True)
        # Join back verse metadata (book, chapter, ...) lost during scoring
        verse_candidates_df = pd.merge(
            verse_candidates_df, self.bible_df, how="left", on="text"
        )
        # DEBUG
        # st.write(verse_candidates_df)
        chapter_candidates = self.extract_chapters_from_verses(
            self.bible_df, verse_candidates_df
        )
        return chapter_candidates

    def cross_encode(self, query, texts):
        """Re-rank ``texts`` against ``query`` with the cross-encoder.

        Returns a DataFrame with "text" and "score" columns sorted by
        descending score; scores are min-max normalized to [0, 1].
        """
        combinations = [[query, text] for text in texts]
        sim_scores = self.cross_encoder_model.predict(combinations)
        sim_scores = MinMaxScaler().fit_transform(sim_scores.reshape(-1, 1)).flatten()
        reranked_texts_scores = sorted(
            zip(texts, sim_scores), key=lambda x: x[1], reverse=True
        )
        df = pd.DataFrame(reranked_texts_scores, columns=["text", "score"])
        return df

    def semantic_search(self, query, texts, embeddings_manager, n=None, threshold=0):
        """Rank ``texts`` by cosine similarity of embeddings to ``query``.

        Args:
            query: free-text query.
            texts: candidate strings to score.
            embeddings_manager: provides get_embeddings(texts) -> matrix.
            n: if truthy, keep only the top ``n`` results.
            threshold: if truthy, drop results scoring below it.

        Returns:
            DataFrame with "text" and "score" columns, best first.
        """
        embeddings = embeddings_manager.get_embeddings(texts)
        query_embedding = embeddings_manager.get_embeddings([query])
        sim_scores = sklearn.metrics.pairwise.cosine_similarity(
            query_embedding, embeddings
        )[0]
        # Results is a list of tuples: [(text, score)]
        results = sorted(list(zip(texts, sim_scores)), key=lambda x: x[1], reverse=True)
        # Take top n only if specified
        if n:
            results = results[:n]
        # Apply a threshold to filter irrelevant results
        if threshold:
            results = [x for x in results if x[1] >= threshold]
        df = pd.DataFrame(results, columns=["text", "score"])
        return df

    def extract_chapters_from_verses(self, bible_df, verse_results_df) -> List[Chapter]:
        """Expand matched verses into their unique containing chapters.

        Simple, naive assumption for now: chapter order follows the first
        appearance of a matching verse, i.e. the per-verse scores dictate
        the order. TODO: revisit ranking.
        """
        verse_results_df = verse_results_df.copy()
        verse_results_df["book_chapter"] = (
            verse_results_df["book"] + " " + verse_results_df["chapter"].astype(str)
        )
        unique_chapters = verse_results_df["book_chapter"].unique()
        bible_df = bible_df.copy()
        bible_df["book_chapter"] = (
            bible_df["book"] + " " + bible_df["chapter"].astype(str)
        )
        chapters = []
        for book_chapter in unique_chapters:
            chapter_verses_df = bible_df[bible_df["book_chapter"] == book_chapter]
            # Renamed from the original's reused `chapter` variable, which
            # shadowed the chapter number with the Chapter object below.
            book_name = chapter_verses_df["book"].iloc[0]
            chapter_num = chapter_verses_df["chapter"].iloc[0]
            # Keep track of the matched verses as highlight verses
            highlight_verses_df = pd.merge(
                chapter_verses_df,
                verse_results_df[["text", "score", "book", "chapter"]],
                how="inner",
                on=["text", "book", "chapter"],
            )
            chapters.append(
                Chapter(
                    book_name=book_name,
                    chapter_num=chapter_num,
                    verses_df=chapter_verses_df,
                    highlight_verses_df=highlight_verses_df,
                )
            )
        return chapters
class TfIdfRetriever(Retriever):
    """Sparse lexical retriever using TF-IDF vectors over ``texts``."""

    def __init__(self, texts, preprocessors=None) -> None:
        """
        Args:
            texts: corpus of documents to index.
            preprocessors: optional list of text preprocessors; defaults to
                an empty list. (Fixed: the original used a mutable default
                argument ``preprocessors=[]``, shared across instances.)
        """
        self.vectorizer = TfidfVectorizer(analyzer="word", stop_words="english")
        self.preprocessors = [] if preprocessors is None else preprocessors
        # TODO: pre-process the texts with `self.preprocessors`
        self.tfidf_vectors = self.vectorizer.fit_transform(texts)
        # Transpose once at build time so each query is a single sparse matmul.
        self.tfidf_vectors_transposed = self.tfidf_vectors.transpose()

    def search(self, query, n=10):
        """Return the top-``n`` sparse cosine-similarity matches for ``query``.

        Returns the sparse result matrix produced by ``awesome_cossim_topn``
        (scores below 0.01 are dropped).
        """
        query_tfidf_vector = self.vectorizer.transform([query])
        results = awesome_cossim_topn(
            query_tfidf_vector, self.tfidf_vectors_transposed, n, 0.01
        )
        return results