# Gradio app: semantic search over the Quran using multilingual E5 embeddings.
import gradio as gr
from sentence_transformers import SentenceTransformer
import pandas as pd
import pickle
from pathlib import Path
import time
from datetime import datetime

# Load the embedding model once at startup.
print("load model start")
print(datetime.fromtimestamp(time.time()))
model = SentenceTransformer('intfloat/multilingual-e5-large-instruct')
print("load model end")
print(datetime.fromtimestamp(time.time()))

# English translation of the Quran, one row per verse (sura, aya, text).
quran = pd.read_csv('quran-eng.csv', delimiter=",")
print("load quran eng")
print(datetime.fromtimestamp(time.time()))

# Precomputed verse embeddings (pickled tensor).
with open('encoded_quran_text_split_multilingual-e5-large-instructs.sav', 'rb') as file:
    document_embeddings = pickle.load(file)
print("load quran embedding")
print(datetime.fromtimestamp(time.time()))


def make_clickable_both(val):
    # Expects "name#url" and returns an HTML anchor; only used by the
    # commented-out Styler.format call inside find().
    name, url = val.split('#')
    print(name + "\n")
    print(url + "\n")
    return f'<a href="{url}">{name}</a>'


def find(query):
    print("start")
    print(datetime.fromtimestamp(time.time()))

    def get_detailed_instruct(task_description: str, query: str) -> str:
        return f'Instruct: {task_description}\nQuery: {query}'

    # Each query must come with a one-sentence instruction that describes the task
    task = 'Given a web search query, retrieve relevant passages that answer the query'
    queries = [
        get_detailed_instruct(task, query)
    ]

    # file = open('quran-splitted.sav','rb')
    # quran_splitted = pickle.load(file)
    # print("load quran\n")
    # print(datetime.fromtimestamp(time.time()))

    # documents = quran_splitted['text'].tolist()
    # document_embeddings = model.encode(documents, convert_to_tensor=True, normalize_embeddings=True)
    # filename = 'encoded_quran_text_split_multilingual-e5-large-instruct.sav'
    # pickle.dump(embeddings, open(filename, 'wb'))

    query_embeddings = model.encode(queries, convert_to_tensor=True, normalize_embeddings=True)
    print("embed query")
    print(datetime.fromtimestamp(time.time()))

    # Similarity of the query against every verse embedding (embeddings are
    # normalized, so the dot product is cosine similarity), scaled to 0-100.
    scores = (query_embeddings @ document_embeddings.T) * 100
    print("count similarities")
    print(datetime.fromtimestamp(time.time()))

    # Insert the similarity values into the dataframe and sort by them.
    with open('quran-splitted.sav', 'rb') as file:
        quran_splitted = pickle.load(file)
    print("load quran")
    print(datetime.fromtimestamp(time.time()))
    quran_splitted['similarity'] = scores.tolist()[0]
    sorted_quran = quran_splitted.sort_values(by='similarity', ascending=False)
    print("sort by similarity")
    print(datetime.fromtimestamp(time.time()))

    # Collect the full verses for the top 3 matches.
    # results = ""
    results = pd.DataFrame()
    i = 0
    while i < 3:
        result = sorted_quran.iloc[i]
        result_quran = quran.loc[(quran['sura'] == result['sura']) & (quran['aya'] == result['aya'])]
        results = pd.concat([results, result_quran])
        # results = results + result_quran['text'].item()+" (Q.S "+str(result['sura']).rstrip('.0')+":"+str(result['aya']).rstrip('.0')+")\n"
        i = i + 1
    print("collect results")
    print(datetime.fromtimestamp(time.time()))

    # Link each verse to its tafsir page on quran.com and append the
    # (QS. sura:aya) reference.
    url = 'https://quran.com/' + results['sura'].astype(str) + ':' + results['aya'].astype(str) + '/tafsirs/en-tafisr-ibn-kathir'
    results['text'] = '<a href="' + url + '">' + results['text'] + '</a>' + ' (QS. ' + results['sura'].astype(str) + ':' + results['aya'].astype(str) + ')'
    results = results.drop(columns=['sura', 'aya'])
    # results['text'] = results['text'] + '#' + 'https://quran.com/'+results['sura'].astype(str)+':'+results['aya'].astype(str)+'/tafsirs/en-tafisr-ibn-kathir'
    # results = results.style.format({'text': make_clickable_both})

    # return sorted_quran
    # filepath = Path(query+'.csv')
    # results.to_csv(filepath,index=False)
    # return results, filepath
    return results


demo = gr.Interface(
    fn=find,
    inputs="textbox",
    # outputs=[gr.Dataframe(headers=['text'], datatype=["markdown"], wrap=True), gr.DownloadButton()],
    outputs=[gr.Dataframe(headers=['text'], datatype=["markdown"], wrap=True)],
    cache_examples="lazy",
    examples=[
        ["law of inheritance in islam"],
        ["tunjukilah jalan yang lurus"],  # Indonesian: "show (us) the straight path"
        ["سليمان"],  # Arabic: "Solomon"
    ],
    title="Quran Finder")

# demo = gr.Interface(fn=find, inputs="textbox", outputs="textbox")

if __name__ == "__main__":
    demo.launch()