import abc
from typing import List

import pandas as pd
import streamlit as st
from sentence_transformers.cross_encoder import CrossEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
from sparse_dot_topn import awesome_cossim_topn

from src.models import Chapter


class Retriever:
    """Base interface: subclasses map a free-text query to a ranked list of Chapters."""

    @abc.abstractmethod
    def retrieve(self, query, n=10) -> List[Chapter]:
        pass


class SemanticRetriever(Retriever):
    """Ranks verses by embedding cosine similarity to the query, optionally
    re-ranks them with a cross-encoder, then rolls results up to chapters."""

    def __init__(
        self,
        bible_df,
        embeddings_manager,
        threshold=0.4,
        cross_encoder_model="cross-encoder/ms-marco-MiniLM-L-12-v2",
    ):
        self.bible_df = bible_df
        self.embeddings_manager = embeddings_manager
        self.threshold = threshold
        # Pass cross_encoder_model=None to skip re-ranking.
        # Alternatives that fit here:
        #   'cross-encoder/stsb-distilroberta-base'
        #   'cross-encoder/ms-marco-MiniLM-L-12-v2'
        self.cross_encoder_model = (
            CrossEncoder(cross_encoder_model) if cross_encoder_model else None
        )

    def retrieve(self, query, n=10) -> List[Chapter]:
        # Over-fetch (2n) verse candidates so the re-ranking step has headroom.
        verse_candidates_df = self.semantic_search(
            query=query,
            texts=self.bible_df["text"].tolist(),
            embeddings_manager=self.embeddings_manager,
            n=n * 2,
            threshold=self.threshold,
        )
        if len(verse_candidates_df) == 0:
            return []
        if self.cross_encoder_model is not None:
            verse_candidates_df = self.cross_encode(
                query, verse_candidates_df["text"].tolist()
            )
        # TODO: revisit this logic, as some verses can have the exact same text.
        # For now, the workaround is to drop duplicates.
        verse_candidates_df.drop_duplicates(subset="text", inplace=True)
        # Join the verse metadata back in
        verse_candidates_df = pd.merge(
            verse_candidates_df, self.bible_df, how="left", on="text"
        )
        # DEBUG
        # st.write(verse_candidates_df)
        chapter_candidates = self.extract_chapters_from_verses(
            self.bible_df, verse_candidates_df
        )
        return chapter_candidates

    def cross_encode(self, query, texts):
        # Score each (query, text) pair with the cross-encoder, then rescale
        # the raw scores to [0, 1] so they are comparable across queries.
        combinations = [[query, text] for text in texts]
        sim_scores = self.cross_encoder_model.predict(combinations)
        sim_scores = MinMaxScaler().fit_transform(sim_scores.reshape(-1, 1)).flatten()
        reranked_texts_scores = sorted(
            zip(texts, sim_scores), key=lambda x: x[1], reverse=True
        )
        df = pd.DataFrame(reranked_texts_scores, columns=["text", "score"])
        return df

    def semantic_search(self, query, texts, embeddings_manager, n=None, threshold=0):
        embeddings = embeddings_manager.get_embeddings(texts)
        query_embedding = embeddings_manager.get_embeddings([query])
        sim_scores = cosine_similarity(query_embedding, embeddings)[0]
        # results is a list of (text, score) tuples, best first
        results = sorted(zip(texts, sim_scores), key=lambda x: x[1], reverse=True)
        # Take the top n only if specified
        if n:
            results = results[:n]
        # Apply a threshold to filter out irrelevant results
        if threshold:
            results = [x for x in results if x[1] >= threshold]
        df = pd.DataFrame(results, columns=["text", "score"])
        return df

    def extract_chapters_from_verses(self, bible_df, verse_results_df) -> List[Chapter]:
        # Naive assumption for now: follow the order of first appearance,
        # i.e. the per-verse scores dictate the chapter order.
        # TODO: revisit ranking.
        # The goal is to extract all unique chapters covered by the top verse results.
        verse_results_df = verse_results_df.copy()
        verse_results_df["book_chapter"] = (
            verse_results_df["book"] + " " + verse_results_df["chapter"].astype(str)
        )
        unique_chapters = verse_results_df["book_chapter"].unique()
        bible_df = bible_df.copy()
        bible_df["book_chapter"] = (
            bible_df["book"] + " " + bible_df["chapter"].astype(str)
        )
        chapters = []
        for unique_chapter in unique_chapters:
            chapter_verses_df = bible_df[bible_df["book_chapter"] == unique_chapter]
            book = chapter_verses_df["book"].iloc[0]
            chapter_num = chapter_verses_df["chapter"].iloc[0]
            # Keep track of the matched verses as highlight verses
            highlight_verses_df = pd.merge(
                chapter_verses_df,
                verse_results_df[["text", "score", "book", "chapter"]],
                how="inner",
                on=["text", "book", "chapter"],
            )
            chapter = Chapter(
                book_name=book,
                chapter_num=chapter_num,
                verses_df=chapter_verses_df,
                highlight_verses_df=highlight_verses_df,
            )
            chapters.append(chapter)
        return chapters
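

# Illustrative only: SemanticRetriever assumes its embeddings_manager exposes
# get_embeddings(texts) -> 2D array of shape (n_texts, dim). The class below
# is a hypothetical sketch of that interface using sentence-transformers, not
# the app's actual implementation (which likely also caches embeddings).
class ExampleEmbeddingsManager:
    def __init__(self, model_name="all-MiniLM-L6-v2"):
        # Lazy import so the sketch doesn't affect normal module import time
        from sentence_transformers import SentenceTransformer

        self.model = SentenceTransformer(model_name)

    def get_embeddings(self, texts):
        # encode() returns an (n_texts, dim) numpy array, which is what
        # cosine_similarity in semantic_search expects.
        return self.model.encode(texts)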


class TfIdfRetriever(Retriever):
    def __init__(self, texts, preprocessors=None) -> None:
        self.vectorizer = TfidfVectorizer(analyzer="word", stop_words="english")
        # Avoid a mutable default argument; fall back to an empty list.
        self.preprocessors = preprocessors if preprocessors is not None else []
        # TODO: pre-process the texts
        self.tfidf_vectors = self.vectorizer.fit_transform(texts)
        self.tfidf_vectors_transposed = self.tfidf_vectors.transpose()

    def search(self, query, n=10):
        # Sparse top-n cosine similarity between the query vector and all
        # document vectors; returns a sparse (1 x n_texts) score matrix.
        query_tfidf_vector = self.vectorizer.transform([query])
        results = awesome_cossim_topn(
            query_tfidf_vector, self.tfidf_vectors_transposed, n, 0.01
        )
        return results
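

# Hypothetical usage sketch (sample texts are assumptions, not app data):
# exercises TfIdfRetriever end to end. SemanticRetriever is not run here
# because it needs the full bible_df and an embeddings manager.
if __name__ == "__main__":
    sample_texts = [
        "In the beginning God created the heaven and the earth.",
        "The Lord is my shepherd; I shall not want.",
        "And God said, Let there be light: and there was light.",
    ]
    retriever = TfIdfRetriever(sample_texts)
    # Sparse matrix holding the top-2 cosine similarities against sample_texts
    scores = retriever.search("God created light", n=2)
    print(scores)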