import gradio as gr
from sentence_transformers import SentenceTransformer
import pandas as pd
import pickle
from pathlib import Path
import time
from datetime import datetime
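
# Quran Finder: a small Gradio app for semantic search over Quran verses
# using the intfloat/multilingual-e5-large-instruct embedding model.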
print("load model start")
print(datetime.fromtimestamp(time.time()))
model = SentenceTransformer('intfloat/multilingual-e5-large-instruct')
print("load model end")
print(datetime.fromtimestamp(time.time()))
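
# Verse table with sura, aya and text columns, used to look up the full
# English verse for each matched chunk.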
quran = pd.read_csv('quran-eng.csv', delimiter=",")
print("load quran eng")
print(datetime.fromtimestamp(time.time()))
# Precomputed, normalized embeddings of the split verse texts (see the
# commented-out encoding snippet inside find() below).
with open('encoded_quran_text_split_multilingual-e5-large-instructs.sav', 'rb') as file:
    document_embeddings = pickle.load(file)
print("load quran embedding")
print(datetime.fromtimestamp(time.time()))
def make_clickable_both(val):
    # Helper (currently unused): expects a "name#url" string and returns the display name.
    name, url = val.split('#')
    print(name + "\n")
    print(url + "\n")
    return f'{name}'
def find(query):
    print("start")
    print(datetime.fromtimestamp(time.time()))

    def get_detailed_instruct(task_description: str, query: str) -> str:
        return f'Instruct: {task_description}\nQuery: {query}'

    # Each query must be prefixed with a one-sentence instruction that describes
    # the task (E5-instruct convention); document texts are embedded without it.
    task = 'Given a web search query, retrieve relevant passages that answer the query'
    queries = [
        get_detailed_instruct(task, query)
    ]
    # One-off snippet (kept for reference) used to precompute the document embeddings:
    #file = open('quran-splitted.sav','rb')
    #quran_splitted = pickle.load(file)
    #print("load quran\n")
    #print(datetime.fromtimestamp(time.time()))
    #documents = quran_splitted['text'].tolist()
    # document_embeddings = model.encode(documents, convert_to_tensor=True, normalize_embeddings=True)
    # filename = 'encoded_quran_text_split_multilingual-e5-large-instruct.sav'
    # pickle.dump(document_embeddings, open(filename, 'wb'))
    query_embeddings = model.encode(queries, convert_to_tensor=True, normalize_embeddings=True)
    print("embed query")
    print(datetime.fromtimestamp(time.time()))

    # Embeddings are normalized, so the dot product is the cosine similarity (scaled by 100).
    scores = (query_embeddings @ document_embeddings.T) * 100
    print("compute similarities")
    print(datetime.fromtimestamp(time.time()))
    # Insert the similarity scores into the dataframe and sort by them.
    with open('quran-splitted.sav', 'rb') as file:
        quran_splitted = pickle.load(file)
    print("load quran")
    print(datetime.fromtimestamp(time.time()))

    quran_splitted['similarity'] = scores.tolist()[0]
    sorted_quran = quran_splitted.sort_values(by='similarity', ascending=False)
    print("sort by similarity")
    print(datetime.fromtimestamp(time.time()))
#results = ""
results = pd.DataFrame()
i = 0
while i<3:
result = sorted_quran.iloc[i]
result_quran = quran.loc[(quran['sura']==result['sura']) & (quran['aya']==result['aya'])]
results = pd.concat([results, result_quran])
#results = results + result_quran['text'].item()+" (Q.S "+str(result['sura']).rstrip('.0')+":"+str(result['aya']).rstrip('.0')+")\n"
i=i+1
print("collect results")
print(datetime.fromtimestamp(time.time()))
    # Tafsir links on quran.com (currently unused).
    url = 'https://quran.com/' + results['sura'].astype(str) + ':' + results['aya'].astype(str) + '/tafsirs/en-tafisr-ibn-kathir'
    # Append the (QS. sura:aya) reference to each verse text.
    results['text'] = results['text'] + ' (QS. ' + results['sura'].astype(str) + ':' + results['aya'].astype(str) + ')'
    results = results.drop(columns=['sura', 'aya'])
    #results['text'] = results['text'] + '#' + 'https://quran.com/'+results['sura'].astype(str)+':'+results['aya'].astype(str)+'/tafsirs/en-tafisr-ibn-kathir'
    #results = results.style.format({'text': make_clickable_both})
    #return sorted_quran
    #filepath = Path(query+'.csv')
    #results.to_csv(filepath,index=False)
    #return results, filepath
    return results
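
# Example (assuming quran-eng.csv and the two .sav files are present):
#   find("law of inheritance in islam")
# returns a DataFrame whose 'text' column contains the three most similar
# verses, each suffixed with its "(QS. sura:aya)" reference.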
demo = gr.Interface(
    fn=find,
    inputs="textbox",
    #outputs=[gr.Dataframe(headers=['text'], datatype=["markdown"], wrap=True), gr.DownloadButton()],
    outputs=[gr.Dataframe(headers=['text'], datatype=["markdown"], wrap=True)],
    cache_examples="lazy",
    examples=[
        ["law of inheritance in islam"],
        ["tunjukilah jalan yang lurus"],  # Indonesian: "show us the straight path"
        ["سليمان"],  # Arabic: "Sulaiman" (Solomon)
    ],
    title="Quran Finder",
)
#demo = gr.Interface(fn=find, inputs="textbox", outputs="textbox")

if __name__ == "__main__":
    demo.launch()