sivan22's picture
Upload folder using huggingface_hub
618357a verified
import streamlit as st
from streamlit.logger import get_logger
import gematriapy
from timeit import default_timer as timer
import sqlite3
import pandas as pd
import ast
import pymongo
# Module-level logger obtained from Streamlit's logging helper.
LOGGER = get_logger(__name__)
@st.cache_resource
def get_dfs()->object:
    """Load the SQLite tables and build the DataFrames used for reference search.

    Reads the ``books``, ``texts`` and ``titles`` tables from ``test42.db``,
    inner-joins texts with books (``texts.bid == books._id``), rewrites
    ``level2`` into a page label for books whose second-to-last section name
    is the page marker, and adds a ``ref_text_long`` column holding a
    human-readable reference string for every text row.

    The function is decorated with ``st.cache_resource``, so Streamlit caches
    the returned object between calls.

    Returns:
        tuple: ``(titles_df, texts_df)`` where ``titles_df`` is the ``titles``
        table as read from the database and ``texts_df`` is the merged
        texts/books frame including ``ref_text_long``.
    """
    import pandas as pd
    from contextlib import closing

    def to_daf_long(i:int)->str:
        """Turn a page index into a '<page number> <side>' label.

        Values outside the open range (0, 999) are returned unchanged.
        """
        if i>0 and i<999:
            i+=1
            # After the shift, even values map to the first side label and
            # odd values to the second one; the page number is i // 2.
            if i%2 ==0:
                return gematriapy.to_hebrew(i//2)+' ืขืžื•ื“ ื '
            else:
                return gematriapy.to_hebrew(i//2)+' ืขืžื•ื“ ื‘'
        return i

    def gematria(i)->str:
        """Render an int in (0, 999) as Hebrew numerals followed by a space.

        Strings pass through unchanged (``level2`` may already hold page
        labels); anything else becomes an empty string.
        """
        # Exact type check on purpose: bools and non-int numerics fall
        # through to the '' branch, as before.
        if type(i) is int and i>0 and i<999:
            return gematriapy.to_hebrew(i) + ' '
        return i if type(i) is str else ''

    def read_table(conn, table_name):
        """Read a whole table into a DataFrame with the database column names."""
        # table_name only ever comes from the fixed literals below, never
        # from user input, so formatting it into the statement is safe.
        cursor = conn.execute(f"SELECT * FROM {table_name}")
        rows = cursor.fetchall()
        column_names = [column[0] for column in cursor.description]
        # Passing columns= up front also works for an empty table, where
        # assigning .columns afterwards would raise a length mismatch.
        return pd.DataFrame(rows, columns=column_names)

    print('hello from get_dfs..')

    # closing() guarantees the connection is released even if a query fails;
    # sqlite3's own context manager only handles transactions.
    with closing(sqlite3.connect('test42.db')) as conn:
        books = read_table(conn, 'books')
        texts = read_table(conn, 'texts')
        # NOTE(review): an older comment said the titles came from MongoDB;
        # in this code they are read from the same SQLite file.
        titles = read_table(conn, 'titles')

    # heSectionNames is stored as a string such as '["Section","Section"]';
    # parse it into a real list (missing values become ['']).
    books['heSectionNames']=books['heSectionNames'].apply(lambda x: ast.literal_eval(x) if x is not None else [''] )

    # Merge the texts with the books table (without the extra Hebrew titles).
    merged = pd.merge(texts,books,how='inner',left_on='bid',right_on='_id')

    # Convert the page indices (1,2,3...) into page/side labels for books
    # whose second-to-last section name is the page marker.
    is_daf_book = merged['heSectionNames'].apply(
        lambda names: len(names)>1 and names[-2] == 'ื“ืฃ'
    ).astype(bool)
    has_dafs = merged.loc[is_daf_book]
    if not has_dafs.empty:
        # level2 is about to hold strings next to numbers; cast explicitly
        # because newer pandas versions warn about (and may reject) writing
        # strings into a numeric column.
        merged['level2'] = merged['level2'].astype(object)
        merged.loc[has_dafs.index,'level2'] = has_dafs['level2'].map(to_daf_long)

    def section_name(depth):
        """Name of the section `depth` places from the end plus a space, or ''."""
        return merged['heSectionNames'].map(
            lambda names: names[-depth] + ' ' if len(names) >= depth else ""
        )

    # Build the reference text: book title followed by "<section name> <number>"
    # for levels 4, 3 and 2 (whichever the book has).
    merged['ref_text_long']= merged['heTitle'] + ' ' + \
        section_name(4) + merged['level4'].map(gematria) + \
        section_name(3) + merged['level3'].map(gematria) + \
        section_name(2) + merged['level2'].map(gematria)

    titles_df = titles
    texts_df = merged
    return titles_df, texts_df
def find_ref(titles_df,texts_df,input_text,top_k,num_of_results):
    """Fuzzy-match a free-text reference against the known reference strings.

    Args:
        titles_df: DataFrame with ``he_titles`` and ``title`` columns.
        texts_df: DataFrame with ``title`` and ``ref_text_long`` columns.
        input_text: The reference typed by the user.
        top_k: Number of best-matching books to search inside. ``0`` skips
            the book step and searches every reference directly.
        num_of_results: Maximum number of results to return.

    Returns:
        list[dict]: At most ``num_of_results`` dicts sorted by ``ref_score``
        (highest first). Each has ``ref`` and ``ref_score``; when ``top_k`` is
        non-zero it also has ``book`` and ``book_score``. An empty list is
        returned for empty input.
    """
    print('hello from find_ref..')
    if not input_text:
        # Return the same type as the normal path so callers can iterate.
        return []

    # Imported after the guard so the empty-input path needs no third-party code.
    from rapidfuzz import process as rapidfuzz_process

    # Expand the ':' and '.' shorthand into the page-side wording used in
    # the reference strings.
    query = input_text.replace(':','ืขืžื•ื“ ื‘').replace('.','ืขืžื•ื“ ื')

    if top_k == 0:
        # Search only the references database in case the user set top_k to 0.
        refs = texts_df['ref_text_long'].unique()
        results = [
            {'ref':ref,'ref_score':ref_score}
            for ref, ref_score, _ in rapidfuzz_process.extract(query, refs, limit=num_of_results)
        ]
    else:
        results = []
        he_titles = titles_df['he_titles']
        # Every book must contribute at least num_of_results candidates,
        # otherwise a fixed cap of 10 would truncate the final list whenever
        # more than 10 results are requested.
        per_book_limit = max(10, num_of_results)
        # Search first only in the books database (for top_k books).
        for book, book_score, _ in rapidfuzz_process.extract(query, he_titles, limit=top_k):
            # Get all the references of that book.
            book_title = titles_df.loc[he_titles==book, 'title'].iloc[0]
            refs = texts_df.loc[texts_df['title']==book_title, 'ref_text_long'].unique()
            # Then search these references and add them all to the results.
            results.extend(
                {'ref':ref,'ref_score':ref_score,'book':book,'book_score':book_score}
                for ref, ref_score, _ in rapidfuzz_process.extract(query, refs, limit=per_book_limit)
            )

    # Finally, sort all the references by their own score (not the book score).
    results.sort(key=lambda item: item['ref_score'],reverse=True)
    return results[:num_of_results]
def run():
    """Render the Streamlit page: search box, sidebar sliders and results.

    Configures the page, loads the DataFrames through ``get_dfs()``, reads
    the query and the two slider values, and, when a query was entered, runs
    ``find_ref`` and writes the elapsed time followed by each result.
    """
    st.set_page_config(
        page_title=" ื—ื™ืคื•ืฉ ืžืงื•ืจื•ืช",
        page_icon="๐Ÿ“š",
        layout="wide",
        initial_sidebar_state="expanded"
    )
    # Load once, before the title is written (the position of the original
    # first call); the former second call only repeated the same lookup.
    titles_df,texts_df = get_dfs()
    st.write("# ื—ื™ืคื•ืฉ ืžืงื•ืจื•ืช ื‘ืืžืฆืขื•ืช ืžืจื—ืง ืœื•ื™ื ืฉื˜ื™ื™ืŸ")

    user_input = st.text_input('ื›ืชื•ื‘ ืืช ื”ืžืงื•ืจ ื”ืžื‘ื•ืงืฉ', placeholder='ื‘ื‘ื ืงืžื ื“ืฃ ื‘ ืขืžื•ื“ ื‘')
    top_k = st.sidebar.slider('ื›ืžื” ืกืคืจื™ื ืœืกืจื•ืง top_k:',0,20,10)
    num_of_results = st.sidebar.slider('ืžืกืคืจ ื”ืชื•ืฆืื•ืช ืฉื‘ืจืฆื•ื ืš ืœื”ืฆื™ื’:',1,25,5)

    if not user_input:
        # Nothing typed yet: show only the input widgets.
        return

    started = timer()
    results = find_ref(titles_df,texts_df,user_input,top_k,num_of_results)
    elapsed_ms = 1e3*(timer()-started)
    st.write(f"finished in {elapsed_ms:.1f} ms")
    # find_ref may return nothing for an empty query; treat that as no results.
    for result in results or []:
        st.write(result)
# Start the app only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    run()