sivan22
/

sefaria-ref-finder

Model card Files Files and versions Community

sefaria-ref-finder / app.py

sivan22

Upload folder using huggingface_hub

618357a verified 10 months ago

raw

history blame contribute delete

5.59 kB

	import streamlit as st
	from streamlit.logger import get_logger
	import gematriapy

	from timeit import default_timer as timer
	import sqlite3
	import pandas as pd
	import ast
	import pymongo



	# Module-level logger created through Streamlit's logging helper and named
	# after this module.
	LOGGER = get_logger(__name__)

	def _read_table(conn, table):
	    """Load every row of one SQLite table into a DataFrame.

	    The column names are taken from the cursor description.  Passing them
	    to the DataFrame constructor (instead of assigning ``.columns``
	    afterwards) keeps an empty table from raising a length-mismatch error.

	    ``table`` is interpolated into the SQL text because identifiers cannot
	    be bound as parameters; only call this with trusted, hard-coded names.
	    """
	    cursor = conn.execute(f"SELECT * FROM {table}")
	    column_names = [description[0] for description in cursor.description]
	    return pd.DataFrame(cursor.fetchall(), columns=column_names)


	@st.cache_resource
	def get_dfs() -> tuple:
	    """Load the titles and texts tables and build the searchable references.

	    Reads the ``books``, ``texts`` and ``titles`` tables from the SQLite
	    file ``test42.db``, joins ``texts`` to ``books`` and adds a
	    ``ref_text_long`` column holding a human-readable Hebrew reference for
	    every text row.  The result is cached by ``st.cache_resource``.

	    Returns:
	        A ``(titles_df, texts_df)`` tuple: the raw ``titles`` table and the
	        merged texts/books table including ``ref_text_long``.
	    """
	    from contextlib import closing

	    def to_daf_long(i: int) -> str:
	        """Convert a numeric Talmud page index into 'daf + amud' text.

	        For 0 < i < 999 the index is shifted by one; an even shifted value
	        becomes amud alef and an odd one amud bet, both of daf
	        ``(i + 1) // 2``.  Any other value is returned unchanged.
	        """
	        if 0 < i < 999:
	            i += 1
	            if i % 2 == 0:
	                return gematriapy.to_hebrew(i // 2) + ' עמוד א '
	            # NOTE(review): unlike the amud-alef string this one has no
	            # trailing space; kept as-is because it is part of the
	            # generated reference text.
	            return gematriapy.to_hebrew(i // 2) + ' עמוד ב'
	        return i

	    def gematria(i) -> str:
	        """Render a section number as Hebrew letters followed by a space.

	        Plain ints in the open range (0, 999) are converted; strings (for
	        example already-converted dafs) pass through unchanged; anything
	        else becomes an empty string.
	        """
	        if isinstance(i, int) and not isinstance(i, bool) and 0 < i < 999:
	            return gematriapy.to_hebrew(i) + ' '
	        return i if isinstance(i, str) else ''

	    print('hello from get_dfs..')

	    # sqlite3's own context manager only commits or rolls back; closing()
	    # is what actually releases the connection once the tables are read.
	    with closing(sqlite3.connect('test42.db')) as conn:
	        books = _read_table(conn, 'books')
	        texts = _read_table(conn, 'texts')
	        # The alternative book titles are read from the same SQLite file.
	        titles = _read_table(conn, 'titles')

	    # The section names are stored as the string form of a list, e.g.
	    # '["Section","Section"]'; turn them into real lists.
	    books['heSectionNames'] = books['heSectionNames'].apply(
	        lambda raw: ast.literal_eval(raw) if raw is not None else ['']
	    )

	    # Merge the texts with the books table (which does not carry the extra
	    # Hebrew titles).
	    merged = pd.merge(texts, books, how='inner', left_on='bid', right_on='_id')

	    # Convert the Talmud marks (1,2,3...) into dafs (א עמוד א..) for every
	    # book whose second-to-last section name is 'דף'.
	    is_daf = merged['heSectionNames'].apply(
	        lambda names: len(names) > 1 and names[-2] == 'דף'
	    ).astype(bool)
	    # Strings are about to be stored next to numbers in this column, so
	    # make it an object column explicitly rather than relying on implicit
	    # upcasting during assignment.
	    merged['level2'] = merged['level2'].astype(object)
	    merged.loc[is_daf, 'level2'] = merged.loc[is_daf, 'level2'].map(to_daf_long)

	    # Create a reference text, for example: רש"י על בראשית פרק א פסוק א
	    # Each level contributes "<section name> <number in Hebrew letters> "
	    # when the book has a section name at that depth.
	    ref_text = merged['heTitle'] + ' '
	    for depth, level_column in ((4, 'level4'), (3, 'level3'), (2, 'level2')):
	        section_label = merged['heSectionNames'].map(
	            lambda names, depth=depth: names[-depth] + ' ' if len(names) >= depth else ""
	        )
	        ref_text = ref_text + section_label + merged[level_column].map(gematria)
	    merged['ref_text_long'] = ref_text

	    titles_df = titles
	    texts_df = merged
	    return titles_df, texts_df


	def find_ref(titles_df, texts_df, input_text, top_k, num_of_results):
	    """Fuzzy-search the reference texts for the user's query.

	    Args:
	        titles_df: DataFrame with ``he_titles`` and ``title`` columns.
	        texts_df: DataFrame with ``title`` and ``ref_text_long`` columns.
	        input_text: The query.  ``:`` and ``.`` are expanded to the amud
	            bet / amud alef wording before matching.
	        top_k: Number of best-matching Hebrew titles whose books are
	            searched; ``0`` searches all references directly.
	        num_of_results: Maximum number of results to return.

	    Returns:
	        A list of dicts sorted by ``ref_score`` (highest first), each with
	        ``ref`` and ``ref_score`` and, when ``top_k`` is non-zero, also
	        ``book`` and ``book_score``.  Empty input yields an empty list.
	    """
	    print('hello from find_ref..')
	    if not input_text:
	        # Always hand back a list so callers can iterate unconditionally.
	        return []

	    from rapidfuzz import process as rapidfuzz_process

	    input_text = input_text.replace(':', 'עמוד ב').replace('.', 'עמוד א')
	    results = []

	    if top_k == 0:
	        # Search only the references database in case the user set top_k to 0.
	        refs = texts_df['ref_text_long'].unique()
	        for ref, ref_score, _ in rapidfuzz_process.extract(input_text, refs, limit=num_of_results):
	            results.append({'ref': ref, 'ref_score': ref_score})
	    else:
	        he_titles = titles_df['he_titles']
	        # Several Hebrew titles can point at the same book; search each
	        # book once (under its best-scoring title, since extract() yields
	        # matches best-first) so references are not reported repeatedly.
	        searched_titles = set()
	        # Search first only in the books database (for top_k titles).
	        for book, book_score, _ in rapidfuzz_process.extract(input_text, he_titles, limit=top_k):
	            book_title = titles_df.loc[he_titles == book, 'title'].iloc[0]
	            if book_title in searched_titles:
	                continue
	            searched_titles.add(book_title)

	            # Get all the references of that book.
	            refs = texts_df.loc[texts_df['title'] == book_title, 'ref_text_long'].unique()
	            # Keep as many candidates per book as may be returned overall,
	            # so a request for more than ten results is not cut short.
	            for ref, ref_score, _ in rapidfuzz_process.extract(input_text, refs, limit=num_of_results):
	                results.append(
	                    {'ref': ref, 'ref_score': ref_score, 'book': book, 'book_score': book_score}
	                )

	    # Finally, sort all the references by their own score (not the book score).
	    results.sort(key=lambda item: item['ref_score'], reverse=True)

	    return results[:num_of_results]


	def run():
	    """Render the Streamlit page: configuration, inputs and search results.

	    Loads the cached DataFrames, reads the query and the two sidebar
	    sliders, and for a non-empty query writes the elapsed search time
	    followed by each result.
	    """
	    st.set_page_config(
	        page_title=" חיפוש מקורות",
	        page_icon="📚",
	        layout="wide",
	        initial_sidebar_state="expanded",
	    )
	    # Load the data before the heading is written, and only ask for it
	    # once instead of calling get_dfs() a second time for the same result.
	    titles_df, texts_df = get_dfs()
	    st.write("# חיפוש מקורות באמצעות מרחק לוינשטיין")

	    user_input = st.text_input('כתוב את המקור המבוקש', placeholder='בבא קמא דף ב עמוד ב')
	    top_k = st.sidebar.slider('כמה ספרים לסרוק top_k:', 0, 20, 10)
	    num_of_results = st.sidebar.slider('מספר התוצאות שברצונך להציג:', 1, 25, 5)

	    if user_input:
	        started = timer()
	        results = find_ref(titles_df, texts_df, user_input, top_k, num_of_results)
	        elapsed_ms = 1e3 * (timer() - started)
	        st.write(f"finished in {elapsed_ms:.1f} ms")
	        # Tolerate a missing result set rather than failing on iteration.
	        for result in results or []:
	            st.write(result)

	# Start the app only when this file is executed as a script, not when it
	# is imported.
	if __name__ == "__main__":
	    run()