"""Streamlit app that fuzzy-searches Hebrew source references.

The reference strings are built once from the ``books``, ``texts`` and
``titles`` tables of a SQLite database and then searched with rapidfuzz.
"""
import ast
import sqlite3
from contextlib import closing
from timeit import default_timer as timer
from typing import Any, Dict, List, Tuple, Union

import gematriapy
import pandas as pd
import pymongo  # not used by the code below; kept in case other tooling relies on it
import streamlit as st
from streamlit.logger import get_logger

LOGGER = get_logger(__name__)

# SQLite file that holds the ``books``, ``texts`` and ``titles`` tables.
DB_PATH = 'test42.db'

# Lower bound on how many references are pulled from each candidate book.
_MIN_REFS_PER_BOOK = 10


def _to_daf_long(i: int) -> Union[str, int]:
    """Convert a running page index into a Hebrew daf/amud label.

    Indices strictly between 0 and 999 are shifted by one and halved: an even
    shifted value yields the first side (amud alef) of the daf, an odd one the
    second side (amud bet). Any other value is returned unchanged.
    """
    if 0 < i < 999:
        i += 1
        # NOTE(review): the first-side label carries a trailing space while the
        # second-side label does not; kept byte-identical so the generated
        # reference strings do not change.
        if i % 2 == 0:
            return gematriapy.to_hebrew(i // 2) + ' עמוד א '
        return gematriapy.to_hebrew(i // 2) + ' עמוד ב'
    return i


def _gematria(i: Any) -> str:
    """Render a section number as Hebrew letters followed by a space.

    Integers strictly between 0 and 999 are converted with gematriapy; strings
    (for example labels already produced by ``_to_daf_long``) pass through
    unchanged; everything else becomes an empty string.
    """
    if isinstance(i, int) and 0 < i < 999:
        return gematriapy.to_hebrew(i) + ' '
    return i if isinstance(i, str) else ''


def _query_to_df(conn: sqlite3.Connection, query: str) -> pd.DataFrame:
    """Run *query* on *conn* and return all rows as a DataFrame.

    Column names are taken from the cursor description and passed to the
    constructor, so a query that returns no rows still yields a frame with the
    right columns instead of failing on a column-count mismatch.
    """
    cursor = conn.execute(query)
    rows = cursor.fetchall()
    columns = [col[0] for col in cursor.description]
    return pd.DataFrame(rows, columns=columns)


@st.cache_resource
def get_dfs(db_path: str = DB_PATH) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Load the lookup tables and build the searchable reference strings.

    Args:
        db_path: Path of the SQLite database to read. Defaults to ``DB_PATH``.

    Returns:
        A ``(titles_df, texts_df)`` tuple. ``titles_df`` is the ``titles``
        table as read from the database. ``texts_df`` is ``texts``
        inner-joined with ``books`` (``texts.bid == books._id``) plus a
        ``ref_text_long`` column holding the long Hebrew reference string.

    The result is cached by Streamlit, so the returned frames are shared
    between reruns and should be treated as read-only.
    """
    LOGGER.info('hello from get_dfs..')

    # ``closing`` guarantees the connection is released even if a query fails.
    with closing(sqlite3.connect(db_path)) as conn:
        books = _query_to_df(conn, "SELECT * FROM books")
        texts = _query_to_df(conn, "SELECT * FROM texts")
        # The extra Hebrew titles are read from the same SQLite file.
        # NOTE(review): an earlier comment said this table originated from a
        # MongoDB export - confirm where it is produced.
        titles = _query_to_df(conn, "SELECT * FROM titles")

    # The section names are stored as a list literal in text form; parse them
    # into real lists and fall back to [''] when the value is missing.
    books['heSectionNames'] = books['heSectionNames'].apply(
        lambda raw: ast.literal_eval(raw) if raw is not None else ['']
    )

    # Merge the texts with the books table (which lacks the extra titles).
    merged = pd.merge(texts, books, how='inner', left_on='bid', right_on='_id')

    # Rows whose second-to-last section name is "daf" use running page indices
    # in ``level2``; replace those indices with daf/amud labels.
    is_daf = merged['heSectionNames'].apply(
        lambda names: len(names) > 1 and names[-2] == 'דף'
    ).astype(bool)
    # Cast first: the column is about to hold both numbers and strings, and
    # writing strings into an integer column is deprecated in recent pandas.
    merged['level2'] = merged['level2'].astype(object)
    merged.loc[is_daf, 'level2'] = merged.loc[is_daf, 'level2'].map(_to_daf_long)

    # Build the reference text: title, then for each of the 4th-, 3rd- and
    # 2nd-from-last sections its name (when the book has that many sections)
    # followed by the Hebrew numeral of the matching level column.
    ref_text = merged['heTitle'] + ' '
    for depth in (4, 3, 2):
        section_names = merged['heSectionNames'].map(
            lambda names, depth=depth: names[-depth] + ' ' if len(names) >= depth else ""
        )
        ref_text = ref_text + section_names + merged[f'level{depth}'].map(_gematria)
    merged['ref_text_long'] = ref_text

    return titles, merged


def find_ref(
    titles_df: pd.DataFrame,
    texts_df: pd.DataFrame,
    input_text: str,
    top_k: int,
    num_of_results: int,
) -> List[Dict[str, Any]]:
    """Return the references that best match *input_text*.

    Args:
        titles_df: Frame with ``he_titles`` and ``title`` columns.
        texts_df: Frame with ``title`` and ``ref_text_long`` columns.
        input_text: The free-text reference typed by the user.
        top_k: Number of best-matching book titles to scan. ``0`` skips the
            book stage and searches every reference directly.
        num_of_results: Maximum number of results to return.

    Returns:
        A list of dicts sorted by ``ref_score`` (descending), each with
        ``ref`` and ``ref_score`` and, when ``top_k`` is non-zero, ``book``
        and ``book_score``. An empty list is returned for empty input.
    """
    from rapidfuzz import process as rapidfuzz_process

    LOGGER.info('hello from find_ref..')
    if not input_text:
        return []

    # Expand the ':' / '.' page-side shorthand into the words used by the
    # generated reference strings.
    # NOTE(review): the replacements insert no separating space before the
    # expanded words - confirm whether that is intended.
    query = input_text.replace(':', 'עמוד ב').replace('.', 'עמוד א')

    if top_k == 0:
        # Search the whole reference list directly.
        all_refs = texts_df['ref_text_long'].unique()
        results = [
            {'ref': ref, 'ref_score': ref_score}
            for ref, ref_score, _ in rapidfuzz_process.extract(
                query, all_refs, limit=num_of_results
            )
        ]
    else:
        results = []
        # Take at least ``num_of_results`` candidates per book; a smaller
        # fixed limit could drop true top matches when a single book holds
        # more than that many of the best references.
        per_book_limit = max(_MIN_REFS_PER_BOOK, num_of_results)
        # First narrow the search down to the ``top_k`` best-matching titles.
        for book, book_score, _ in rapidfuzz_process.extract(
            query, titles_df['he_titles'], limit=top_k
        ):
            # Map the matched Hebrew title to the book's primary title, then
            # collect that book's references.
            book_title = titles_df.loc[titles_df['he_titles'] == book, 'title'].iloc[0]
            book_refs = texts_df.loc[
                texts_df['title'] == book_title, 'ref_text_long'
            ].unique()
            for ref, ref_score, _ in rapidfuzz_process.extract(
                query, book_refs, limit=per_book_limit
            ):
                results.append({
                    'ref': ref,
                    'ref_score': ref_score,
                    'book': book,
                    'book_score': book_score,
                })

    # Rank by the reference's own score, not by the book score.
    results.sort(key=lambda item: item['ref_score'], reverse=True)
    return results[:num_of_results]


def run() -> None:
    """Render the Streamlit page: inputs, the search, timing and results."""
    st.set_page_config(
        page_title=" חיפוש מקורות",
        page_icon="📚",
        layout="wide",
        initial_sidebar_state="expanded"
    )

    # Load (or fetch from Streamlit's cache) before drawing the page body.
    titles_df, texts_df = get_dfs()

    st.write("# חיפוש מקורות באמצעות מרחק לוינשטיין")

    user_input = st.text_input('כתוב את המקור המבוקש', placeholder='בבא קמא דף ב עמוד ב')
    top_k = st.sidebar.slider('כמה ספרים לסרוק top_k:', 0, 20, 10)
    num_of_results = st.sidebar.slider('מספר התוצאות שברצונך להציג:', 1, 25, 5)

    if user_input:
        started = timer()
        results = find_ref(titles_df, texts_df, user_input, top_k, num_of_results)
        elapsed_ms = 1e3 * (timer() - started)
        st.write(f"finished in {elapsed_ms:.1f} ms")
        for result in results:
            st.write(result)


if __name__ == "__main__":
    run()